master-thesis/python/energy_flow_proper/07_one_bath_systematics/slurm.sh
2022-04-06 11:46:21 +02:00

68 lines
2.2 KiB
Bash

#!/bin/bash
# Basics
#SBATCH --output=/beegfs/ws/0/s8896854-master_07/project/python/energy_flow_proper/07_one_bath_systematics/logs/out_slurm.txt
#SBATCH --error=/beegfs/ws/0/s8896854-master_07/project/python/energy_flow_proper/07_one_bath_systematics/logs/err_slurm.txt
#SBATCH --time=24:00:00
#SBATCH -D /beegfs/ws/0/s8896854-master_07/project/python/energy_flow_proper/07_one_bath_systematics
#SBATCH --mail-type=ALL
#SBATCH --mail-user=hiro@protagon.space
#SBATCH --job-name=master_07
#SBATCH --constraint=fs_beegfs
## Ask Slurm to send SIGHUP to the batch shell 60 seconds before the time limit, so that the integration can shut down gracefully.
#SBATCH --signal=B:HUP@60
# Ray Workers
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=10
#SBATCH --mem-per-cpu=2500MB
#SBATCH --nodes=20
# ---------------------------------------------------------------------------
# Start a Ray cluster on the nodes of this Slurm allocation (a head on the
# first node, one worker on every other node) and then run COMMAND with
# RAY_ADDRESS=auto from the batch shell.
# ---------------------------------------------------------------------------
log_dir=/beegfs/ws/0/s8896854-master_07/project/python/energy_flow_proper/07_one_bath_systematics/logs
COMMAND='poetry run python scripts/integrate_slip.py'

# Print a message on stderr and abort the batch script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

mkdir -p -- "$log_dir" || die "cannot create log directory: $log_dir"

# Getting the node names (scontrol prints one host name per line).
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
[[ -n "$nodes" ]] || die "could not determine the nodes of this allocation"
mapfile -t nodes_array <<<"$nodes"

head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
[[ -n "$head_node_ip" ]] || die "could not determine the IP address of $head_node"

# Ask the head node for a currently unused TCP port (bind to port 0 and
# report the port the OS picked).
port=$(srun --nodes=1 --ntasks=1 -w "$head_node" poetry run python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')
[[ -n "$port" ]] || die "could not obtain a free port on $head_node"

ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"

# Start the head node. It is given one CPU less than the workers.
echo "Starting HEAD at $head_node"
srun --output "$log_dir/log_head.txt" --error "$log_dir/err_head.txt" \
  -J ray-head --nodes=1 --ntasks=1 -w "$head_node" \
  poetry run ray start --head --port="$port" \
  --num-cpus "$((SLURM_CPUS_PER_TASK - 1))" --block &
sleep 10

# number of nodes other than the head node
worker_num=$((SLURM_JOB_NUM_NODES - 1))
for ((i = 1; i <= worker_num; i++)); do
  node_i=${nodes_array[$i]}
  # Exported only after the head was launched, so it affects the workers and
  # the final command but not the head.
  # NOTE(review): the reason for forcing /tmp is not stated here - confirm.
  export TMPDIR="/tmp"
  node_ip=$(srun --nodes=1 --ntasks=1 -w "$node_i" hostname --ip-address)
  [[ -n "$node_ip" ]] || die "could not determine the IP address of $node_i"
  echo "Starting WORKER $i at $node_i"
  srun --output "$log_dir/log$i.txt" --error "$log_dir/err$i.txt" \
    -J "r-$i" --nodes=1 --ntasks=1 -w "$node_i" \
    poetry run ray start --address "$ip_head" \
    --num-cpus "${SLURM_CPUS_PER_TASK}" --node-ip-address "$node_ip" --block &
  sleep 5
done
sleep 10

export RAY_ADDRESS="auto"

# The job requests --signal=B:HUP@60, which delivers SIGHUP to this batch
# shell only. Bash does not run traps while a foreground command is active,
# and without a trap the shell is simply terminated by SIGHUP. The command is
# therefore started in the background and the signal is forwarded to it.
# Note: bash makes background commands of a non-interactive shell ignore
# SIGINT and SIGQUIT.
bash -c "$COMMAND" &
command_pid=$!
trap 'kill -HUP "$command_pid" 2>/dev/null' HUP

# `wait` returns early (status > 128) whenever the trap fires, so keep
# waiting until the command has really exited and report its status.
while :; do
  wait "$command_pid"
  status=$?
  kill -0 "$command_pid" 2>/dev/null || break
done
exit "$status"