Convert long running stress tests to projects (#5641)

Philipp Moritz 2019-09-26 11:25:09 -07:00 committed by GitHub
parent 5ecb02fb80
commit 57a5871ea6
6 changed files with 52 additions and 145 deletions

File: .rayproject/cluster.yaml (cluster config)

@@ -37,11 +37,6 @@ setup_commands:
# Install latest TensorFlow
- source activate tensorflow_p36 && conda remove -y --force wrapt || true
- source activate tensorflow_p36 && pip install -U tensorflow==1.14
# Install nightly Ray wheels.
# Example: https://s3-us-west-2.amazonaws.com/ray-wheels/<<<RAY_BRANCH>>>/<<<RAY_COMMIT>>>/ray-<<<RAY_VERSION>>>-cp36-cp36m-manylinux1_x86_64.whl
- source activate tensorflow_p36 && pip install -U RAY_WHEEL_TO_TEST_HERE
- source activate tensorflow_p36 && pip install ray[rllib] ray[debug] gym[atari]
- source activate tensorflow_p36 && pip install ray[debug]
- echo set-window-option -g mouse on > ~/.tmux.conf
- echo 'termcapinfo xterm* ti@:te@' > ~/.screenrc

File: .rayproject/project.yaml (new file)

@@ -0,0 +1,31 @@
name: long-running-tests
description: "Ray's long running stress tests"
cluster: .rayproject/cluster.yaml

commands:
  - name: run
    help: "Start a long running stress test."
    command: |
      # Install nightly Ray wheels.
      source activate tensorflow_p36 && pip install -U {{wheel}}
      source activate tensorflow_p36 && pip install ray[rllib] ray[debug] gym[atari]
      source activate tensorflow_p36 && pip install ray[debug]
      source activate tensorflow_p36 && python workloads/{{workload}}.py
    params:
      - name: wheel
        help: "URL to the ray wheel to test (defaults to latest)."
        default: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.8.0.dev4-cp36-cp36m-manylinux1_x86_64.whl
      - name: workload
        help: "Name of the workload to run."
        choices: ["actor_deaths", "apex", "impala", "many_actor_tasks", "many_drivers", "many_tasks", "node_failures", "pbt"]
    config:
      tmux: true
  - name: check-load
    command: uptime
    help: "Check load of the workload."
  - name: show-output
    command: tmux capture-pane -p
    help: "Show tail of the workload output."

File: README.rst

@@ -1,29 +1,37 @@
Long Running Tests
==================

This directory contains scripts for starting long-running workloads which are
intended to run forever until they fail.

This directory contains the long-running workloads which are intended to run
forever until they fail. To set up the project, you need to run:

.. code-block:: bash

    pip install any
    any project create
Running the Workloads
---------------------

To run the workloads, first edit the config.yaml and replace
``RAY_WHEEL_TO_TEST_HERE`` with the desired version to test, then run:

You can start all the workloads with:

.. code-block:: bash

    ./start_workloads.sh
    any session start -y run --workload="*" --wheel=https://s3-us-west-2.amazonaws.com/ray-wheels/releases/0.7.5/6da7eff4b20340f92d3fe1160df35caa68922a97/ray-0.7.5-cp36-cp36m-manylinux1_x86_64.whl
This will start one EC2 instance per workload and will start the workloads
running (one per instance). Running the ``./start_workloads.sh`` script again
will clean up any state from the previous runs and will start the workloads
again.
running (one per instance). You can start a specific workload by passing its
name to ``--workload=`` instead of ``"*"``. The list of available workloads can
be obtained via ``any session start run --help``.
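For example, starting just the ``many_tasks`` workload against the default
(latest nightly) wheel should look roughly like this:

    any session start -y run --workload="many_tasks"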
Check Workload Statuses
-----------------------
To check up on the workloads, run either ``./check_workloads.sh --load``, which
will print the load on each machine, or ``./check_workloads.sh --logs``, which
To check up on the workloads, run either
``any session --name="*" execute check-load``, which
will print the load on each machine, or
``any session --name="*" execute show-output``, which
will print the tail of the output for each workload.
To debug workloads that have failed, you may find it useful to ssh to the
@@ -35,9 +43,10 @@ Shut Down the Workloads
-----------------------
The instances running the workloads can all be killed by running
``./shut_down_workloads.sh``.
``any session stop --name "*"``.
Adding a Workload
-----------------
To create a new workload, simply add a new Python file under ``workloads/``.
To create a new workload, simply add a new Python file under ``workloads/`` and
add it to the ``run`` command in ``.rayproject/project.yaml``.
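As a rough sketch of those two steps (the workload name used here is made up
purely for illustration):

    # 1. Create the new workload script; like the existing ones, it should run
    #    until it fails. Copying an existing workload is one way to start.
    cp workloads/many_tasks.py workloads/my_new_workload.py
    # 2. Add "my_new_workload" to the `choices` list of the `workload` parameter
    #    in .rayproject/project.yaml so the run command accepts it.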

File: check_workloads.sh (deleted)

@@ -1,36 +0,0 @@
#!/usr/bin/env bash

# set -x

ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)

if [ "$1" == "--load" ]; then
  check_load=true
elif [ "$1" == "--logs" ]; then
  check_load=false
else
  echo "Usage: $0 [--load|--logs]"
  exit 1
fi

cd "$ROOT_DIR"

for workload_file in "$ROOT_DIR"/workloads/*; do
  file_name=$(basename -- "$workload_file")
  workload_name="${file_name%.*}"
  if $check_load; then
    echo -n "$workload_name: "
    ray --logging-level=WARNING exec config.yaml --cluster-name="$workload_name" uptime 2>/dev/null || echo "<offline>"
  else
    echo "======================================================================"
    echo "WORKLOAD: $workload_name"
    echo "======================================================================"
    ray exec config.yaml --cluster-name="$workload_name" "tmux capture-pane -p"
    echo ""
    echo "ssh to this machine with:"
    echo " ray attach $ROOT_DIR/config.yaml --cluster-name=$workload_name"
    echo ""
    echo ""
  fi
done
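With this script deleted, the same checks are exposed as project commands; a
rough equivalent under the ``any`` CLI syntax shown in the README changes above:

    any session --name="*" execute check-load     # roughly ./check_workloads.sh --load
    any session --name="*" execute show-output    # roughly ./check_workloads.sh --logs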

File: shut_down_workloads.sh (deleted)

@@ -1,18 +0,0 @@
#!/usr/bin/env bash

set -e

ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)

pushd "$ROOT_DIR"

# Kill all of the workloads.
for workload_file in "$ROOT_DIR"/workloads/*; do
  file_name=$(basename -- "$workload_file")
  workload_name="${file_name%.*}"
  ray down -y config.yaml --cluster-name="$workload_name" &
done

# Wait for all of the ray down commands to finish.
wait

popd
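The session-level stop command from the README changes above takes over for
this script; a one-line sketch:

    any session stop --name "*"    # roughly ./shut_down_workloads.sh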

File: start_workloads.sh (deleted)

@@ -1,74 +0,0 @@
#!/usr/bin/env bash

set -e

ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)

pushd "$ROOT_DIR"

# Substitute in the appropriate Ray version and commit in the config file and
# store it in a temporary file.
CLUSTER_CONFIG="config.yaml"
if grep -q RAY_WHEEL_TO_TEST_HERE $CLUSTER_CONFIG; then
  echo "You must replace the RAY_WHEEL_TO_TEST_HERE string in $CLUSTER_CONFIG."
  exit 1
fi

# Start one instance per workload.
for workload_file in "$ROOT_DIR"/workloads/*; do
  file_name=$(basename -- "$workload_file")
  workload_name="${file_name%.*}"
  ray up -y $CLUSTER_CONFIG --cluster-name="$workload_name" &
done

# Wait for all of the nodes to be up.
wait
status=$?
if [ $status != 0 ]; then
  echo "Some update processes failed with $status"
  exit 1
fi

# Start the workloads running.
for workload_file in "$ROOT_DIR"/workloads/*; do
  file_name=$(basename -- "$workload_file")
  workload_name="${file_name%.*}"
  (
    # Copy the workload to the cluster.
    ray rsync_up $CLUSTER_CONFIG --cluster-name="$workload_name" "$workload_file" "$file_name"
    # Clean up previous runs if relevant.
    ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "source activate tensorflow_p36 && ray stop; rm -r /tmp/ray; tmux kill-server | true"
    # Start the workload.
    ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "source activate tensorflow_p36 && python $file_name" --tmux
  ) &
done

# Wait for child processes to finish.
wait

popd

# Print some helpful information.
echo ""
echo ""
echo "Use the following commands to attach to the relevant drivers."
echo ""
for workload_file in "$ROOT_DIR"/workloads/*; do
  file_name=$(basename -- "$workload_file")
  workload_name="${file_name%.*}"
  echo " ray attach $ROOT_DIR/$CLUSTER_CONFIG --cluster-name=$workload_name --tmux"
done
echo ""
echo ""
echo "To shut down all instances, run the following."
echo " $ROOT_DIR/shut_down_workloads.sh"
echo ""
echo ""
echo "To check up on the scripts, run the following."
echo " $ROOT_DIR/check_workloads.sh --load"
echo " $ROOT_DIR/check_workloads.sh --logs"