Mirror of https://github.com/vale981/ray (synced 2025-03-06 10:31:39 -05:00)
Set redis password in slurm deployment documentation. (#5747)
parent d17b35494d
commit c91a37f622
1 changed file with 27 additions and 17 deletions
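In short, the revised example generates a throwaway Redis password with uuidgen, hands it to every ray start invocation, and forwards it to trainer.py, which in turn passes it to ray.init(). A minimal sketch of that wiring on a single machine, outside Slurm (the CPU count of 2 and the standalone layout are illustrative, not part of the commit):

#!/bin/bash
# Sketch only: one shared secret per job, used by the head node and by the driver script.
redis_password=$(uuidgen)
ray start --head --redis-port=6379 --redis-password=$redis_password
python -u trainer.py $redis_password 2   # trainer.py reads the password from sys.argv[1]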
@@ -8,15 +8,15 @@ Clusters managed by Slurm may require that Ray is initialized as a part of the s
 #!/bin/bash
 
 #SBATCH --job-name=test
-#SBATCH --cpus-per-task=20
+#SBATCH --cpus-per-task=5
 #SBATCH --mem-per-cpu=1GB
-#SBATCH --nodes=5
+#SBATCH --nodes=3
 #SBATCH --tasks-per-node 1
 
-worker_num=4 # Must be one less that the total number of nodes
+worker_num=2 # Must be one less that the total number of nodes
 
-module load Langs/Python/3.6.4 # This will vary depending on your environment
-source venv/bin/activate
+# module load Langs/Python/3.6.4 # This will vary depending on your environment
+# source venv/bin/activate
 
 nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST) # Getting the node names
 nodes_array=( $nodes )
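One consequence of shrinking the allocation is that worker_num must track it by hand: it stays at the node count minus one because the first allocated node hosts the head. As a sketch, under the assumption that the script keeps using Slurm's standard job environment, the value could be derived instead of hard-coded (the documented example does not do this):

# Sketch: compute the worker count from the allocation rather than editing it by hand.
# SLURM_JOB_NUM_NODES is set by Slurm for every job; one node is reserved for the Ray head.
worker_num=$(($SLURM_JOB_NUM_NODES - 1))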
@@ -26,38 +26,48 @@ Clusters managed by Slurm may require that Ray is initialized as a part of the s
 ip_prefix=$(srun --nodes=1 --ntasks=1 -w $node1 hostname --ip-address) # Making address
 suffix=':6379'
 ip_head=$ip_prefix$suffix
+redis_password=$(uuidgen)
 
 export ip_head # Exporting for latter access by trainer.py
 
-srun --nodes=1 --ntasks=1 -w $node1 ray start --block --head --redis-port=6379 & # Starting the head
+srun --nodes=1 --ntasks=1 -w $node1 ray start --block --head --redis-port=6379 --redis-password=$redis_password & # Starting the head
 sleep 5
 
 for (( i=1; i<=$worker_num; i++ ))
 do
   node2=${nodes_array[$i]}
-  srun --nodes=1 --ntasks=1 -w $node2 ray start --block --address=$ip_head & # Starting the workers
+  srun --nodes=1 --ntasks=1 -w $node2 ray start --block --address=$ip_head --redis-password=$redis_password & # Starting the workers
   sleep 5
 done
 
-python trainer.py 100 # Pass the total number of allocated CPUs
+python -u trainer.py $redis_password 15 # Pass the total number of allocated CPUs
 
 .. code-block:: python
 
 # trainer.py
+from collections import Counter
 import os
 import sys
 import time
 import ray
 
-ray.init(address=os.environ["ip_head"])
+redis_password = sys.argv[1]
+num_cpus = int(sys.argv[2])
+
+ray.init(address=os.environ["ip_head"], redis_password=redis_password)
 
 print("Nodes in the Ray cluster:")
 print(ray.nodes())
 
 @ray.remote
 def f():
     time.sleep(1)
     return ray.services.get_node_ip_address()
 
 # The following takes one second (assuming that ray was able to access all of the allocated nodes).
 for i in range(60):
     start = time.time()
-    num_cpus = int(sys.argv[1])
-    ray.get([f.remote() for _ in range(num_cpus)])
+    ip_addresses = ray.get([f.remote() for _ in range(num_cpus)])
+    print(Counter(ip_addresses))
     end = time.time()
     print(end - start)
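For context on the new trainer.py invocation: the revised sbatch directives request 3 nodes with 5 CPUs per task, so the 15 passed as the second argument is simply 3 x 5, the total CPU count of the allocation. A hedged usage sketch, assuming the documented script is saved as ray_slurm.sh (the filename is illustrative):

# Submit the job; Slurm grants 3 nodes with 5 CPUs per task, as requested above.
sbatch ray_slurm.sh
# Check that the job is queued or running; output lands in Slurm's default output file.
squeue -u $USER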