#!/bin/bash
# Slurm batch script that boots a Ray cluster across the allocated nodes
# (one head node plus one worker per remaining node) and then runs the
# integration command against that cluster.

# Basics
#SBATCH --output=/beegfs/ws/0/s8896854-master_07/project/python/energy_flow_proper/07_one_bath_systematics/logs/out_slurm.txt
#SBATCH --error=/beegfs/ws/0/s8896854-master_07/project/python/energy_flow_proper/07_one_bath_systematics/logs/err_slurm.txt
#SBATCH --time=24:00:00
#SBATCH -D /beegfs/ws/0/s8896854-master_07/project/python/energy_flow_proper/07_one_bath_systematics
#SBATCH --mail-type=ALL
#SBATCH --mail-user=hiro@protagon.space
#SBATCH --job-name=master_07
#SBATCH --constraint=fs_beegfs

## Ask Slurm to send SIGHUP to the batch shell 60 s before the time limit.
## With the "B:" prefix only this shell is signalled, so the HUP trap near
## the end of this script forwards it to the integration command, which is
## what lets the integration shut down gracefully.
#SBATCH --signal=B:HUP@60

# Ray Workers
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=10
#SBATCH --mem-per-cpu=2500MB
#SBATCH --nodes=20

log_dir=/beegfs/ws/0/s8896854-master_07/project/python/energy_flow_proper/07_one_bath_systematics/logs
COMMAND='poetry run python scripts/integrate_slip.py'

# Print a message to stderr and abort the job script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

mkdir -p "$log_dir"

# Getting the node names (one hostname per line -> one array element each).
mapfile -t nodes_array < <(scontrol show hostnames "$SLURM_JOB_NODELIST")
((${#nodes_array[@]} > 0)) || die "could not determine the nodes of this job"

head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
[[ -n "$head_node_ip" ]] || die "could not determine the IP of $head_node"

# Start the head node
# Let the OS pick a free port on the head node by binding to port 0.
port=$(srun --nodes=1 --ntasks=1 -w "$head_node" poetry run python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')
[[ -n "$port" ]] || die "could not find a free port on $head_node"

ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"

echo "Starting HEAD at $head_node"
# The head gets one CPU less than the task's allocation.
srun --output "$log_dir/log_head.txt" --error "$log_dir/err_head.txt" \
  -J ray-head --nodes=1 --ntasks=1 -w "$head_node" \
  poetry run ray start --head --port="$port" \
  --num-cpus $((SLURM_CPUS_PER_TASK - 1)) --block &
sleep 10

# number of nodes other than the head node
worker_num=$((SLURM_JOB_NUM_NODES - 1))

# Loop-invariant, so exported once; it is inherited by the worker steps and
# by the final command (the head step above is started without it).
export TMPDIR="/tmp"

for ((i = 1; i <= worker_num; i++)); do
  node_i=${nodes_array[$i]}
  node_ip=$(srun --nodes=1 --ntasks=1 -w "$node_i" hostname --ip-address)
  echo "Starting WORKER $i at $node_i"
  srun --output "$log_dir/log$i.txt" --error "$log_dir/err$i.txt" \
    -J "r-$i" --nodes=1 --ntasks=1 -w "$node_i" \
    poetry run ray start --address "$ip_head" \
    --num-cpus "${SLURM_CPUS_PER_TASK}" --node-ip-address "$node_ip" --block &
  sleep 5
done

sleep 10
export RAY_ADDRESS="auto"

# Run the command in the background so that this shell can react to the
# SIGHUP requested via --signal=B:HUP@60 (bash only runs traps once a
# foreground command has finished, and without a trap HUP kills the shell).
# NOTE(review): the signal goes to the direct child only; this assumes the
# `bash -c` / `poetry run` chain execs into (or forwards signals to) the
# Python process -- confirm on the cluster.
command_pid=
forward_hup() {
  [[ -n "$command_pid" ]] && kill -HUP "$command_pid" 2>/dev/null
}
trap forward_hup HUP

bash -c "$COMMAND" &
command_pid=$!

# `wait` returns early with a status > 128 when a trapped signal arrives,
# so keep waiting until the command has really exited.
while :; do
  wait "$command_pid"
  command_status=$?
  kill -0 "$command_pid" 2>/dev/null || break
done

# Preserve the command's exit status as the job script's exit status.
exit "$command_status"