ray/ci/long_running_tests/start_workloads.sh
Eric Liang 5ab5017c67
[rllib] Fix impala stress test (#5101)
* add copy

* upgrade to tf 1.14

* update

* reduce count to workaround https://github.com/ray-project/ray/issues/5125

* Update impala.py

* placeholder

* comments

* update
2019-07-09 20:22:30 -07:00

74 lines
2.1 KiB
Bash
Executable file

#!/usr/bin/env bash
set -e
ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
pushd "$ROOT_DIR"
# Substitute in the appropriate Ray version and commit in the config file and
# store it in a temporary file.
CLUSTER_CONFIG="config.yaml"
if grep -q RAY_WHEEL_TO_TEST_HERE $CLUSTER_CONFIG; then
echo "You must replace the RAY_WHEEL_TO_TEST_HERE string in $CLUSTER_CONFIG."
exit 1
fi
# Start one instance per workload.
for workload_file in "$ROOT_DIR"/workloads/*; do
file_name=$(basename -- "$workload_file")
workload_name="${file_name%.*}"
ray up -y $CLUSTER_CONFIG --cluster-name="$workload_name" &
done
# Wait for all of the nodes to be up.
wait
status=$?
if [ $status != 0 ]; then
echo "Some update processes failed with $status"
exit 1
fi
# Start the workloads running.
for workload_file in "$ROOT_DIR"/workloads/*; do
file_name=$(basename -- "$workload_file")
workload_name="${file_name%.*}"
(
# Copy the workload to the cluster.
ray rsync_up $CLUSTER_CONFIG --cluster-name="$workload_name" "$workload_file" "$file_name"
# Clean up previous runs if relevant.
ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "source activate tensorflow_p36 && ray stop; rm -r /tmp/ray; tmux kill-server | true"
# Start the workload.
ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "source activate tensorflow_p36 && python $file_name" --tmux
) &
done
# Wait for child processes to finish.
wait
popd
# Print some helpful information.
echo ""
echo ""
echo "Use the following commands to attach to the relevant drivers."
echo ""
for workload_file in "$ROOT_DIR"/workloads/*; do
file_name=$(basename -- "$workload_file")
workload_name="${file_name%.*}"
echo " ray attach $ROOT_DIR/$CLUSTER_CONFIG --cluster-name=$workload_name --tmux"
done
echo ""
echo ""
echo "To shut down all instances, run the following."
echo " $ROOT_DIR/shut_down_workloads.sh"
echo ""
echo ""
echo "To check up on the scripts, run the following."
echo " $ROOT_DIR/check_workloads.sh --load"
echo " $ROOT_DIR/check_workloads.sh --logs"