#!/bin/bash
# Mirror of https://github.com/vale981/master-thesis
# (synced 2025-03-06 18:41:38 -05:00; 68 lines, 2.2 KiB, Bash)

# Basics
#SBATCH --output=/beegfs/ws/0/s8896854-master_07/project/python/energy_flow_proper/07_one_bath_systematics/logs/out_slurm.txt
#SBATCH --error=/beegfs/ws/0/s8896854-master_07/project/python/energy_flow_proper/07_one_bath_systematics/logs/err_slurm.txt
#SBATCH --time=24:00:00
#SBATCH -D /beegfs/ws/0/s8896854-master_07/project/python/energy_flow_proper/07_one_bath_systematics
#SBATCH --mail-type=ALL
#SBATCH --mail-user=hiro@protagon.space
#SBATCH --job-name=master_07
#SBATCH --constraint=fs_beegfs

## Send SIGHUP to the batch shell 60 seconds before the time limit so that
## the integration can be shut down gracefully.
#SBATCH --signal=B:HUP@60

# Ray Workers
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=10
#SBATCH --mem-per-cpu=2500MB
#SBATCH --nodes=20

# Directory that receives the per-node Ray log files written by the srun
# steps of this script.
log_dir=/beegfs/ws/0/s8896854-master_07/project/python/energy_flow_proper/07_one_bath_systematics/logs

# Driver command; it is run through `bash -c` at the end of the script.
COMMAND='poetry run python scripts/integrate_slip.py'

# Quoted and terminated with `--` so the path is always taken as one operand.
mkdir -p -- "$log_dir"

# Getting the node names
# `scontrol show hostnames` expands the compact node list into one hostname
# per line; read them into an array without relying on word splitting.
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
mapfile -t nodes_array <<<"$nodes"

# The first node of the allocation hosts the Ray head.
head_node=${nodes_array[0]}
if [[ -z "$head_node" ]]; then
  echo "Could not determine the nodes of this job from SLURM_JOB_NODELIST" >&2
  exit 1
fi

# `hostname --ip-address` may print several space-separated addresses.
# The value is later joined with a port into "ip:port", so exactly one
# address is needed: prefer the first IPv4 entry, else take the first one.
head_node_addrs=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
read -ra head_node_addr_list <<<"$head_node_addrs"
head_node_ip=${head_node_addr_list[0]:-}
for addr in "${head_node_addr_list[@]}"; do
  if [[ "$addr" != *:* ]]; then
    head_node_ip=$addr
    break
  fi
done
if [[ -z "$head_node_ip" ]]; then
  echo "Could not determine the IP address of head node $head_node" >&2
  exit 1
fi

# Start the head node
# Ask the OS on the head node for a currently unused TCP port by binding a
# socket to port 0 and printing the port number that was assigned.
port=$(srun --nodes=1 --ntasks=1 -w "$head_node" poetry run python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')
if ! [[ "$port" =~ ^[0-9]+$ ]]; then
  # Without a valid port the workers could never reach the head; stop early
  # instead of idling until the time limit.
  echo "Could not obtain a free port on $head_node (got '$port')" >&2
  exit 1
fi

# Address of the head; exported so that child processes can read it.
ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"

echo "Starting HEAD at $head_node"
# `ray start` is run with --block, so the step is sent to the background.
# The head advertises one CPU fewer than the task was allocated.
srun --output "$log_dir/log_head.txt" --error "$log_dir/err_head.txt" \
  -J ray-head --nodes=1 --ntasks=1 -w "$head_node" \
  poetry run ray start --head --port="$port" \
  --num-cpus "$((SLURM_CPUS_PER_TASK - 1))" --block &

# Give the head time to come up before anything tries to connect to it.
sleep 10

# number of nodes other than the head node
worker_num=$((SLURM_JOB_NUM_NODES - 1))

# Start one Ray worker per remaining node; index 0 is the head node.
for ((i = 1; i <= worker_num; i++)); do
  node_i=${nodes_array[i]}
  if [[ -z "$node_i" ]]; then
    echo "Skipping WORKER $i: no hostname available for this index" >&2
    continue
  fi

  # NOTE(review): presumably keeps Ray's temporary paths short - confirm.
  export TMPDIR="/tmp"

  # `hostname --ip-address` may print several space-separated addresses;
  # prefer the first IPv4 entry, else take the first address reported.
  node_addrs=$(srun --nodes=1 --ntasks=1 -w "$node_i" hostname --ip-address)
  read -ra node_addr_list <<<"$node_addrs"
  node_ip=${node_addr_list[0]:-}
  for addr in "${node_addr_list[@]}"; do
    if [[ "$addr" != *:* ]]; then
      node_ip=$addr
      break
    fi
  done

  # Only pass --node-ip-address when an address was actually found, so an
  # empty string is never handed to `ray start`.
  node_ip_args=()
  if [[ -n "$node_ip" ]]; then
    node_ip_args=(--node-ip-address "$node_ip")
  else
    echo "Could not determine the IP address of $node_i; starting WORKER $i without --node-ip-address" >&2
  fi

  echo "Starting WORKER $i at $node_i"
  # --block is passed to `ray start`, so the step is sent to the background.
  srun --output "$log_dir/log$i.txt" --error "$log_dir/err$i.txt" \
    -J "r-$i" --nodes=1 --ntasks=1 -w "$node_i" \
    poetry run ray start --address "$ip_head" \
    --num-cpus "${SLURM_CPUS_PER_TASK}" "${node_ip_args[@]}" --block &

  # Stagger the worker start-ups.
  sleep 5
done

# Wait before starting the driver so the cluster has time to settle.
sleep 10

export RAY_ADDRESS="auto"

# The job is submitted with `--signal=B:HUP@60`, which delivers SIGHUP to
# this batch shell only. A shell that is blocked on a foreground command
# and has no trap simply exits on SIGHUP, and the command never sees the
# signal. Therefore the command runs in the background, SIGHUP is trapped
# and forwarded to it, and the shell keeps waiting until it has finished.
# NOTE(review): the signal reaches the Python process only if `bash -c` and
# `poetry run` exec their command instead of forking - confirm.
command_pid=
hup_received=0

forward_hup() {
  hup_received=1
  if [[ -n "$command_pid" ]]; then
    echo "Received SIGHUP, forwarding it to the command (pid $command_pid)" >&2
    kill -HUP "$command_pid" 2>/dev/null
  fi
}
trap forward_hup HUP

bash -c "$COMMAND" &
command_pid=$!

# `wait` returns early with a status above 128 when a trapped signal
# arrives; in that case wait again to collect the command's real status.
while :; do
  hup_received=0
  wait "$command_pid"
  status=$?
  if ((hup_received)) && ((status > 128)); then
    continue
  fi
  break
done

# Keep the script's exit status equal to that of the command.
exit "$status"