.. _ray-slurm-deploy:

Deploying on Slurm
==================

Clusters managed by Slurm may require that Ray be initialized as part of the submitted job. This can be done by using ``srun`` within the submitted script. For example:

.. code-block:: bash

    #!/bin/bash
    #SBATCH --job-name=test
    #SBATCH --cpus-per-task=5
    #SBATCH --mem-per-cpu=1GB
    #SBATCH --nodes=3
    #SBATCH --ntasks-per-node=1

    worker_num=2 # Must be one less than the total number of nodes

    # module load Langs/Python/3.6.4 # This will vary depending on your environment
    # source venv/bin/activate

    nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST) # Getting the node names
    nodes_array=( $nodes )
    node1=${nodes_array[0]}

    ip_prefix=$(srun --nodes=1 --ntasks=1 -w $node1 hostname --ip-address) # Getting the head node IP address
    suffix=':6379'
    ip_head=$ip_prefix$suffix
    redis_password=$(uuidgen)

    export ip_head # Exporting for later access by trainer.py

    srun --nodes=1 --ntasks=1 -w $node1 ray start --block --head --redis-port=6379 --redis-password=$redis_password & # Starting the head
    sleep 5
    # Make sure the head successfully starts before any worker does; otherwise
    # the workers will not be able to connect to Redis. In case of a longer
    # startup delay, increase the sleep time above to ensure the proper order.

    for (( i=1; i<=$worker_num; i++ ))
    do
      node2=${nodes_array[$i]}
      srun --nodes=1 --ntasks=1 -w $node2 ray start --block --address=$ip_head --redis-password=$redis_password & # Starting the workers
      # The --block flag keeps the ray process alive on each compute node.
      sleep 5
    done

    python -u trainer.py $redis_password 15 # Pass the total number of allocated CPUs (3 nodes * 5 CPUs per task)
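
Assuming the batch script above is saved as, for example, ``ray_cluster.sh`` (the file name is only illustrative), it is submitted and monitored with the standard Slurm tools:

.. code-block:: bash

    # Submit the job; Slurm prints the assigned job ID on submission.
    sbatch ray_cluster.sh

    # Check the state of the job (pending, running, or completed).
    squeue -u $USER

The ``trainer.py`` script launched on the last line of the batch script is shown below.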

.. code-block:: python

    # trainer.py
    from collections import Counter
    import os
    import sys
    import time
    import ray

    redis_password = sys.argv[1]
    num_cpus = int(sys.argv[2])

    # Connect to the Ray cluster started by the batch script above.
    ray.init(address=os.environ["ip_head"], redis_password=redis_password)

    print("Nodes in the Ray cluster:")
    print(ray.nodes())

    @ray.remote
    def f():
        # Return the IP address of the node this task ran on.
        time.sleep(1)
        return ray.services.get_node_ip_address()

    # Each iteration of the loop below should take about one second, since the
    # num_cpus tasks run in parallel (assuming that Ray was able to access all
    # of the allocated nodes).
    for i in range(60):
        start = time.time()
        ip_addresses = ray.get([f.remote() for _ in range(num_cpus)])
        print(Counter(ip_addresses))
        end = time.time()
        print(end - start)
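
Before launching the actual workload, it can also be useful to verify that all of the allocated nodes have joined the cluster (for example, if the ``sleep`` intervals in the batch script turn out to be too short, a worker may still be starting). Below is a minimal sketch of such a check, assuming a hypothetical helper script ``check_cluster.py`` launched from the batch script in the same way as ``trainer.py``:

.. code-block:: python

    # check_cluster.py -- hypothetical helper, run as:
    #   python -u check_cluster.py $redis_password
    import os
    import sys
    import time
    import ray

    redis_password = sys.argv[1]

    # Connect to the running cluster started by the batch script.
    ray.init(address=os.environ["ip_head"], redis_password=redis_password)

    # Wait until all three allocated nodes (1 head + 2 workers) appear in
    # ray.nodes(); give up after roughly one minute.
    expected_nodes = 3
    for _ in range(60):
        if len(ray.nodes()) >= expected_nodes:
            break
        time.sleep(1)

    print("Connected nodes:", len(ray.nodes()))
    print("Cluster resources:", ray.cluster_resources())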