mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00

* added Azure Resource Manager (ARM) template * removed Azure doc (moved to separate PR) * nit * fixpaths * nit Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
93 lines
No EOL
1.8 KiB
Bash
Executable file
93 lines
No EOL
1.8 KiB
Bash
Executable file
#!/bin/sh
|
|
|
|
USERNAME=$1
|
|
CONDA_ENV=$2
|
|
WHEEL=$3
|
|
RAY_HEAD_IP=$4
|
|
TYPE=$5
|
|
|
|
echo "Installing wheel..."
|
|
sudo -u $USERNAME -i /bin/bash -l -c "conda init bash"
|
|
sudo -u $USERNAME -i /bin/bash -l -c "conda activate $CONDA_ENV; pip install $WHEEL"
|
|
|
|
echo "Setting up service scripts..."
|
|
cat > /home/$USERNAME/ray-head.sh << EOM
|
|
#!/bin/bash
|
|
conda activate $CONDA_ENV
|
|
|
|
NUM_GPUS=\`nvidia-smi -L | wc -l\`
|
|
|
|
ray stop
|
|
ulimit -n 65536
|
|
ray start --head --redis-port=6379 --object-manager-port=8076 --num-gpus=\$NUM_GPUS --block --webui-host 0.0.0.0
|
|
EOM
|
|
|
|
cat > /home/$USERNAME/ray-worker.sh << EOM
|
|
#!/bin/bash
|
|
conda activate $CONDA_ENV
|
|
|
|
NUM_GPUS=\`nvidia-smi -L | wc -l\`
|
|
|
|
ray stop
|
|
ulimit -n 65536
|
|
|
|
while true
|
|
do
|
|
ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --num-gpus=\$NUM_GPUS --block
|
|
echo Ray exited. Auto-restarting in 1 second...
|
|
sleep 1
|
|
done
|
|
EOM
|
|
|
|
cat > /home/$USERNAME/tensorboard.sh << EOM
|
|
#!/bin/bash
|
|
|
|
conda activate $CONDA_ENV
|
|
mkdir -p /home/$USERNAME/ray_results
|
|
|
|
tensorboard --bind_all --logdir=/home/$USERNAME/ray_results
|
|
EOM
|
|
|
|
chmod +x /home/$USERNAME/ray-head.sh
|
|
chmod +x /home/$USERNAME/ray-worker.sh
|
|
chmod +x /home/$USERNAME/tensorboard.sh
|
|
|
|
cat > /lib/systemd/system/ray.service << EOM
|
|
[Unit]
|
|
Description=Ray
|
|
|
|
[Service]
|
|
Type=simple
|
|
User=$USERNAME
|
|
ExecStart=/bin/bash -l /home/$USERNAME/ray-$TYPE.sh
|
|
|
|
[Install]
|
|
WantedBy=multi-user.target
|
|
EOM
|
|
|
|
cat > /lib/systemd/system/tensorboard.service << EOM
|
|
[Unit]
|
|
Description=TensorBoard
|
|
|
|
[Service]
|
|
Type=simple
|
|
User=$USERNAME
|
|
ExecStart=/bin/bash -l /home/$USERNAME/tensorboard.sh
|
|
|
|
[Install]
|
|
WantedBy=multi-user.target
|
|
EOM
|
|
|
|
echo "Configure ray to start at boot..."
|
|
systemctl enable ray
|
|
|
|
echo "Starting ray..."
|
|
systemctl start ray
|
|
|
|
if [ "$type" = "head" ]; then
|
|
echo "Configure TensorBoard to start at boot..."
|
|
systemctl enable tensorboard
|
|
|
|
echo "Starting TensorBoard..."
|
|
systemctl start tensorboard
|
|
fi |