mirror of
https://github.com/vale981/ray
synced 2025-03-10 05:16:49 -04:00

Running `./ci/long_running_tests/start_workloads.sh` will start several workloads running (each in its own EC2 instance).
- The workloads run forever.
- The workloads all simulate multiple nodes but use a single machine.
- You can get the tail of each workload by running `./ci/long_running_tests/check_workloads.sh`.
- You have to manually shut down the instances.

As discussed with @ericl @richardliaw, the idea here is to optimize for the debuggability of the tests. If one of them fails, you can ssh to the relevant instance and see all of the logs.
63 lines
1.7 KiB
Bash
Executable file
63 lines
1.7 KiB
Bash
Executable file
#!/usr/bin/env bash

# Bring up one Ray cluster per file in workloads/ (next to this script) and
# start that workload on its cluster.

set -e

# Absolute directory containing this script.
# CDPATH is cleared so `cd` cannot echo a resolved path into the substitution,
# and `&&` makes a failed `cd` fail the assignment (and, through `set -e`, the
# script) instead of silently yielding the caller's working directory.
ROOT_DIR=$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE:-$0}")" && pwd)

# The `ray` invocations that follow refer to config.yaml by relative path, so
# run them from the script's directory.
pushd "$ROOT_DIR"

#######################################
# Wait for every background job whose PID is given and report whether all of
# them succeeded.
# Arguments: PIDs of background jobs started by this shell (may be none)
# Returns:   0 if every job exited 0 (or no PIDs were given), 1 otherwise
#######################################
wait_for_launches() {
  local pid failed=0
  for pid in "$@"; do
    # Keep waiting after a failure so that no launch is left running unobserved.
    wait "$pid" || failed=1
  done
  return "$failed"
}

# Start one instance per workload.
launch_pids=()
for workload_file in "$ROOT_DIR"/workloads/*; do
  # An unmatched glob is left as the literal pattern; skip it rather than
  # bringing up a cluster named "*".
  [[ -e "$workload_file" ]] || continue
  file_name=$(basename -- "$workload_file")
  # The cluster is named after the workload file minus its final extension.
  workload_name="${file_name%.*}"
  # Launch in the background so the clusters come up in parallel.
  ray up -y config.yaml --cluster-name="$workload_name" &
  launch_pids+=("$!")
done

# Wait for all of the nodes to be up.
# PIDs are recorded at launch time instead of being read back from `jobs -p`,
# because a launch that has already exited may no longer be listed there and
# its failure would go unnoticed.
if ! wait_for_launches "${launch_pids[@]}"; then
  echo "One or more 'ray up' launches failed." >&2
  exit 1
fi

# Start the workloads running.
for workload_file in "$ROOT_DIR"/workloads/*; do
  # An unmatched glob is left as the literal pattern; skip it instead of
  # addressing a cluster named "*".
  [[ -e "$workload_file" ]] || continue
  file_name=$(basename -- "$workload_file")
  # The cluster is named after the workload file minus its final extension.
  workload_name="${file_name%.*}"

  # Copy the workload to the cluster.
  ray rsync_up config.yaml --cluster-name="$workload_name" "$workload_file" "$file_name"

  # Clean up previous runs if relevant.
  # `|| true` keeps the remote command's status at 0 when no tmux server is
  # running; the earlier `| true` only achieved that by piping into `true`.
  ray exec config.yaml --cluster-name="$workload_name" "ray stop; rm -r /tmp/ray; tmux kill-server || true"

  # Start the workload.
  # The file name is shell-escaped because the command string is re-parsed by
  # a shell on the cluster.
  ray exec config.yaml --cluster-name="$workload_name" "python $(printf '%q' "$file_name")" --tmux
done

# Undo the earlier pushd into the script directory.
popd

# Print some helpful information.

#######################################
# Print the commands for tearing down, attaching to, and checking on the
# workload clusters.
# Globals:   ROOT_DIR (read)
# Arguments: none
# Outputs:   writes the help text to stdout
#######################################
print_helpful_information() {
  local workload_file file_name workload_name
  local -a workload_names=()

  # Derive every cluster name once: the workload's file name minus its final
  # extension. The path is quoted so names containing spaces survive, and an
  # unmatched glob (left as the literal pattern) is skipped.
  for workload_file in "$ROOT_DIR"/workloads/*; do
    [[ -e "$workload_file" ]] || continue
    file_name=$(basename -- "$workload_file")
    workload_names+=("${file_name%.*}")
  done

  echo ""
  echo ""
  echo "To kill the instances, use the following commands."
  echo ""
  for workload_name in "${workload_names[@]}"; do
    echo " ray down -y $ROOT_DIR/config.yaml --cluster-name=$workload_name"
  done

  echo ""
  echo ""
  echo "Use the following commands to attach to the relevant drivers."
  echo ""
  for workload_name in "${workload_names[@]}"; do
    echo " ray attach $ROOT_DIR/config.yaml --cluster-name=$workload_name --tmux"
  done

  echo ""
  echo ""
  echo "To check up on the scripts, run the following."
  echo " $ROOT_DIR/check_workloads.sh"
}

print_helpful_information