From 57a5871ea60058d0d09f3a61e80d0e10c461f3cf Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 26 Sep 2019 11:25:09 -0700 Subject: [PATCH] Convert long running stress tests to projects (#5641) --- .../{config.yaml => .rayproject/cluster.yaml} | 5 -- .../.rayproject/project.yaml | 31 ++++++++ ci/long_running_tests/README.rst | 33 ++++++--- ci/long_running_tests/check_workloads.sh | 36 --------- ci/long_running_tests/shut_down_workloads.sh | 18 ----- ci/long_running_tests/start_workloads.sh | 74 ------------------- 6 files changed, 52 insertions(+), 145 deletions(-) rename ci/long_running_tests/{config.yaml => .rayproject/cluster.yaml} (82%) create mode 100644 ci/long_running_tests/.rayproject/project.yaml delete mode 100755 ci/long_running_tests/check_workloads.sh delete mode 100755 ci/long_running_tests/shut_down_workloads.sh delete mode 100755 ci/long_running_tests/start_workloads.sh diff --git a/ci/long_running_tests/config.yaml b/ci/long_running_tests/.rayproject/cluster.yaml similarity index 82% rename from ci/long_running_tests/config.yaml rename to ci/long_running_tests/.rayproject/cluster.yaml index f6ccaa93e..95a70d209 100644 --- a/ci/long_running_tests/config.yaml +++ b/ci/long_running_tests/.rayproject/cluster.yaml @@ -37,11 +37,6 @@ setup_commands: # Install latest TensorFlow - source activate tensorflow_p36 && conda remove -y --force wrapt || true - source activate tensorflow_p36 && pip install -U tensorflow==1.14 - # Install nightly Ray wheels. 
- # Example: https://s3-us-west-2.amazonaws.com/ray-wheels/<<>>/<<>>/ray-<<>>-cp36-cp36m-manylinux1_x86_64.whl - - source activate tensorflow_p36 && pip install -U RAY_WHEEL_TO_TEST_HERE - - source activate tensorflow_p36 && pip install ray[rllib] ray[debug] gym[atari] - - source activate tensorflow_p36 && pip install ray[debug] - echo set-window-option -g mouse on > ~/.tmux.conf - echo 'termcapinfo xterm* ti@:te@' > ~/.screenrc diff --git a/ci/long_running_tests/.rayproject/project.yaml b/ci/long_running_tests/.rayproject/project.yaml new file mode 100644 index 000000000..658ec1b31 --- /dev/null +++ b/ci/long_running_tests/.rayproject/project.yaml @@ -0,0 +1,31 @@ +name: long-running-tests +description: "Ray's long running stress tests" + +cluster: .rayproject/cluster.yaml + +commands: + - name: run + help: "Start a long running stress test." + command: | + # Install nightly Ray wheels. + source activate tensorflow_p36 && pip install -U {{wheel}} + source activate tensorflow_p36 && pip install ray[rllib] ray[debug] gym[atari] + source activate tensorflow_p36 && pip install ray[debug] + source activate tensorflow_p36 && python workloads/{{workload}}.py + params: + - name: wheel + help: "URL to the ray wheel to test (defaults to latest)." + default: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.8.0.dev4-cp36-cp36m-manylinux1_x86_64.whl + - name: workload + help: "Name of the workload to run." + choices: ["actor_deaths", "apex", "impala", "many_actor_tasks", "many_drivers", "many_tasks", "node_failures", "pbt"] + config: + tmux: true + + - name: check-load + command: uptime + help: "Check load of the workload." + + - name: show-output + command: tmux capture-pane -p + help: "Show tail of the workload output." 
diff --git a/ci/long_running_tests/README.rst b/ci/long_running_tests/README.rst index b1192ba8e..c99065ed9 100644 --- a/ci/long_running_tests/README.rst +++ b/ci/long_running_tests/README.rst @@ -1,29 +1,37 @@ Long Running Tests ================== -This directory contains scripts for starting long-running workloads which are -intended to run forever until they fail. +This directory contains the long-running workloads which are intended to run +forever until they fail. To set up the project you need to run + +.. code-block:: bash + + pip install any + any project create + Running the Workloads --------------------- -To run the workloads, first edit the config.yaml and replace -``RAY_WHEEL_TO_TEST_HERE`` with the desired version to test, then run: +You can start all the workloads with: .. code-block:: bash - ./start_workloads.sh + any session start -y run --workload="*" --wheel=https://s3-us-west-2.amazonaws.com/ray-wheels/releases/0.7.5/6da7eff4b20340f92d3fe1160df35caa68922a97/ray-0.7.5-cp36-cp36m-manylinux1_x86_64.whl This will start one EC2 instance per workload and will start the workloads -running (one per instance). Running the ``./start_workloads.sh`` script again -will clean up any state from the previous runs and will start the workloads -again. +running (one per instance). You can start a specific workload by specifying +its name as an argument ``--workload=`` instead of ``"*"``. A list of available options +can be found via ``any session start run --help``. + Check Workload Statuses ----------------------- -To check up on the workloads, run either ``./check_workloads.sh --load``, which -will print the load on each machine, or ``./check_workloads.sh --logs``, which +To check up on the workloads, run either +``any session --name="*" execute check-load``, which +will print the load on each machine, or +``any session --name="*" execute show-output``, which will print the tail of the output for each workload. 
To debug workloads that have failed, you may find it useful to ssh to the @@ -35,9 +43,10 @@ Shut Down the Workloads ----------------------- The instances running the workloads can all be killed by running -``./shut_down_workloads.sh``. +``any session stop --name "*"``. Adding a Workload ----------------- -To create a new workload, simply add a new Python file under ``workloads/``. +To create a new workload, simply add a new Python file under ``workloads/`` and +add the workload in the run command in `.rayproject/project.yaml`. diff --git a/ci/long_running_tests/check_workloads.sh b/ci/long_running_tests/check_workloads.sh deleted file mode 100755 index a7a613848..000000000 --- a/ci/long_running_tests/check_workloads.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash - -# set -x - -ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) - -if [ "$1" == "--load" ]; then - check_load=true -elif [ "$1" == "--logs" ]; then - check_load=false -else - echo "Usage: $0 [--load|--logs]" - exit 1 -fi - -cd "$ROOT_DIR" - -for workload_file in "$ROOT_DIR"/workloads/*; do - file_name=$(basename -- $workload_file) - workload_name="${file_name%.*}" - if $check_load; then - echo -n "$workload_name: " - ray --logging-level=WARNING exec config.yaml --cluster-name="$workload_name" uptime 2>/dev/null || echo "" - else - echo "======================================================================" - echo "WORKLOAD: $workload_name" - echo "======================================================================" - - ray exec config.yaml --cluster-name="$workload_name" "tmux capture-pane -p" - echo "" - echo "ssh to this machine with:" - echo " ray attach $ROOT_DIR/config.yaml --cluster-name=$workload_name" - echo "" - echo "" - fi -done diff --git a/ci/long_running_tests/shut_down_workloads.sh b/ci/long_running_tests/shut_down_workloads.sh deleted file mode 100755 index a4573e1cf..000000000 --- a/ci/long_running_tests/shut_down_workloads.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env 
bash - -set -e - -ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) - -pushd "$ROOT_DIR" - -# Kill all of the workloads. -for workload_file in "$ROOT_DIR"/workloads/*; do - file_name=$(basename -- $workload_file) - workload_name="${file_name%.*}" - ray down -y config.yaml --cluster-name="$workload_name" & -done -# Wait for all of the ray down commands to finish. -wait - -popd diff --git a/ci/long_running_tests/start_workloads.sh b/ci/long_running_tests/start_workloads.sh deleted file mode 100755 index 3d92a0739..000000000 --- a/ci/long_running_tests/start_workloads.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env bash - -set -e - -ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) -pushd "$ROOT_DIR" - -# Substitute in the appropriate Ray version and commit in the config file and -# store it in a temporary file. -CLUSTER_CONFIG="config.yaml" - -if grep -q RAY_WHEEL_TO_TEST_HERE $CLUSTER_CONFIG; then - echo "You must replace the RAY_WHEEL_TO_TEST_HERE string in $CLUSTER_CONFIG." - exit 1 -fi - -# Start one instance per workload. -for workload_file in "$ROOT_DIR"/workloads/*; do - file_name=$(basename -- "$workload_file") - workload_name="${file_name%.*}" - ray up -y $CLUSTER_CONFIG --cluster-name="$workload_name" & -done -# Wait for all of the nodes to be up. -wait - -status=$? -if [ $status != 0 ]; then - echo "Some update processes failed with $status" - exit 1 -fi - -# Start the workloads running. -for workload_file in "$ROOT_DIR"/workloads/*; do - file_name=$(basename -- "$workload_file") - workload_name="${file_name%.*}" - ( - # Copy the workload to the cluster. - ray rsync_up $CLUSTER_CONFIG --cluster-name="$workload_name" "$workload_file" "$file_name" - # Clean up previous runs if relevant. - ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "source activate tensorflow_p36 && ray stop; rm -r /tmp/ray; tmux kill-server | true" - # Start the workload. 
- ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "source activate tensorflow_p36 && python $file_name" --tmux - ) & -done -# Wait for child processes to finish. -wait - -popd - -# Print some helpful information. - -echo "" -echo "" - -echo "Use the following commands to attach to the relevant drivers." -echo "" -for workload_file in "$ROOT_DIR"/workloads/*; do - file_name=$(basename -- "$workload_file") - workload_name="${file_name%.*}" - echo " ray attach $ROOT_DIR/$CLUSTER_CONFIG --cluster-name=$workload_name --tmux" -done - -echo "" -echo "" - -echo "To shut down all instances, run the following." -echo " $ROOT_DIR/shut_down_workloads.sh" - -echo "" -echo "" - -echo "To check up on the scripts, run the following." -echo " $ROOT_DIR/check_workloads.sh --load" -echo " $ROOT_DIR/check_workloads.sh --logs"