From 57a5871ea60058d0d09f3a61e80d0e10c461f3cf Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 26 Sep 2019 11:25:09 -0700 Subject: [PATCH] Convert long running stress tests to projects (#5641) --- .../{config.yaml => .rayproject/cluster.yaml} | 5 -- .../.rayproject/project.yaml | 31 ++++++++ ci/long_running_tests/README.rst | 33 ++++++--- ci/long_running_tests/check_workloads.sh | 36 --------- ci/long_running_tests/shut_down_workloads.sh | 18 ----- ci/long_running_tests/start_workloads.sh | 74 ------------------- 6 files changed, 52 insertions(+), 145 deletions(-) rename ci/long_running_tests/{config.yaml => .rayproject/cluster.yaml} (82%) create mode 100644 ci/long_running_tests/.rayproject/project.yaml delete mode 100755 ci/long_running_tests/check_workloads.sh delete mode 100755 ci/long_running_tests/shut_down_workloads.sh delete mode 100755 ci/long_running_tests/start_workloads.sh diff --git a/ci/long_running_tests/config.yaml b/ci/long_running_tests/.rayproject/cluster.yaml similarity index 82% rename from ci/long_running_tests/config.yaml rename to ci/long_running_tests/.rayproject/cluster.yaml index f6ccaa93e..95a70d209 100644 --- a/ci/long_running_tests/config.yaml +++ b/ci/long_running_tests/.rayproject/cluster.yaml @@ -37,11 +37,6 @@ setup_commands: # Install latest TensorFlow - source activate tensorflow_p36 && conda remove -y --force wrapt || true - source activate tensorflow_p36 && pip install -U tensorflow==1.14 - # Install nightly Ray wheels. 
- # Example: https://s3-us-west-2.amazonaws.com/ray-wheels/<<>>/<<>>/ray-<<>>-cp36-cp36m-manylinux1_x86_64.whl - - source activate tensorflow_p36 && pip install -U RAY_WHEEL_TO_TEST_HERE - - source activate tensorflow_p36 && pip install ray[rllib] ray[debug] gym[atari] - - source activate tensorflow_p36 && pip install ray[debug] - echo set-window-option -g mouse on > ~/.tmux.conf - echo 'termcapinfo xterm* ti@:te@' > ~/.screenrc diff --git a/ci/long_running_tests/.rayproject/project.yaml b/ci/long_running_tests/.rayproject/project.yaml new file mode 100644 index 000000000..658ec1b31 --- /dev/null +++ b/ci/long_running_tests/.rayproject/project.yaml @@ -0,0 +1,31 @@ +name: long-running-tests +description: "Ray's long running stress tests" + +cluster: .rayproject/cluster.yaml + +commands: + - name: run + help: "Start a long running stress test." + command: | + # Install nightly Ray wheels. + source activate tensorflow_p36 && pip install -U {{wheel}} + source activate tensorflow_p36 && pip install ray[rllib] ray[debug] gym[atari] + source activate tensorflow_p36 && pip install ray[debug] + source activate tensorflow_p36 && python workloads/{{workload}}.py + params: + - name: wheel + help: "URL to the ray wheel to test (defaults to latest)." + default: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.8.0.dev4-cp36-cp36m-manylinux1_x86_64.whl + - name: workload + help: "Name of the workload to run." + choices: ["actor_deaths", "apex", "impala", "many_actor_tasks", "many_drivers", "many_tasks", "node_failures", "pbt"] + config: + tmux: true + + - name: check-load + command: uptime + help: "Check load of the workload." + + - name: show-output + command: tmux capture-pane -p + help: "Show tail of the workload output." 
diff --git a/ci/long_running_tests/README.rst b/ci/long_running_tests/README.rst index b1192ba8e..c99065ed9 100644 --- a/ci/long_running_tests/README.rst +++ b/ci/long_running_tests/README.rst @@ -1,29 +1,37 @@ Long Running Tests ================== -This directory contains scripts for starting long-running workloads which are -intended to run forever until they fail. +This directory contains the long-running workloads which are intended to run +forever until they fail. To set up the project you need to run + +.. code-block:: bash + + pip install any + any project create + Running the Workloads --------------------- -To run the workloads, first edit the config.yaml and replace -``RAY_WHEEL_TO_TEST_HERE`` with the desired version to test, then run: +You can start all the workloads with: .. code-block:: bash - ./start_workloads.sh + any session start -y run --workload="*" --wheel=https://s3-us-west-2.amazonaws.com/ray-wheels/releases/0.7.5/6da7eff4b20340f92d3fe1160df35caa68922a97/ray-0.7.5-cp36-cp36m-manylinux1_x86_64.whl This will start one EC2 instance per workload and will start the workloads -running (one per instance). Running the ``./start_workloads.sh`` script again -will clean up any state from the previous runs and will start the workloads -again. +running (one per instance). You can start a specific workload by specifying +its name as an argument ``--workload=`` instead of ``"*"``. A list of available options +can be found via ``any session start run --help``. + Check Workload Statuses ----------------------- -To check up on the workloads, run either ``./check_workloads.sh --load``, which -will print the load on each machine, or ``./check_workloads.sh --logs``, which +To check up on the workloads, run either +``any session --name="*" execute check-load``, which +will print the load on each machine, or +``any session --name="*" execute show-output``, which will print the tail of the output for each workload. 
To debug workloads that have failed, you may find it useful to ssh to the @@ -35,9 +43,10 @@ Shut Down the Workloads ----------------------- The instances running the workloads can all be killed by running -``./shut_down_workloads.sh``. +``any session stop --name "*"``. Adding a Workload ----------------- -To create a new workload, simply add a new Python file under ``workloads/``. +To create a new workload, simply add a new Python file under ``workloads/`` and +add the workload in the run command in `.rayproject/project.yaml`. diff --git a/ci/long_running_tests/check_workloads.sh b/ci/long_running_tests/check_workloads.sh deleted file mode 100755 index a7a613848..000000000 --- a/ci/long_running_tests/check_workloads.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash - -# set -x - -ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) - -if [ "$1" == "--load" ]; then - check_load=true -elif [ "$1" == "--logs" ]; then - check_load=false -else - echo "Usage: $0 [--load|--logs]" - exit 1 -fi - -cd "$ROOT_DIR" - -for workload_file in "$ROOT_DIR"/workloads/*; do - file_name=$(basename -- $workload_file) - workload_name="${file_name%.*}" - if $check_load; then - echo -n "$workload_name: " - ray --logging-level=WARNING exec config.yaml --cluster-name="$workload_name" uptime 2>/dev/null || echo "" - else - echo "======================================================================" - echo "WORKLOAD: $workload_name" - echo "======================================================================" - - ray exec config.yaml --cluster-name="$workload_name" "tmux capture-pane -p" - echo "" - echo "ssh to this machine with:" - echo " ray attach $ROOT_DIR/config.yaml --cluster-name=$workload_name" - echo "" - echo "" - fi -done diff --git a/ci/long_running_tests/shut_down_workloads.sh b/ci/long_running_tests/shut_down_workloads.sh deleted file mode 100755 index a4573e1cf..000000000 --- a/ci/long_running_tests/shut_down_workloads.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env 
bash - -set -e - -ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) - -pushd "$ROOT_DIR" - -# Kill all of the workloads. -for workload_file in "$ROOT_DIR"/workloads/*; do - file_name=$(basename -- $workload_file) - workload_name="${file_name%.*}" - ray down -y config.yaml --cluster-name="$workload_name" & -done -# Wait for all of the ray down commands to finish. -wait - -popd diff --git a/ci/long_running_tests/start_workloads.sh b/ci/long_running_tests/start_workloads.sh deleted file mode 100755 index 3d92a0739..000000000 --- a/ci/long_running_tests/start_workloads.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env bash - -set -e - -ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) -pushd "$ROOT_DIR" - -# Substitute in the appropriate Ray version and commit in the config file and -# store it in a temporary file. -CLUSTER_CONFIG="config.yaml" - -if grep -q RAY_WHEEL_TO_TEST_HERE $CLUSTER_CONFIG; then - echo "You must replace the RAY_WHEEL_TO_TEST_HERE string in $CLUSTER_CONFIG." - exit 1 -fi - -# Start one instance per workload. -for workload_file in "$ROOT_DIR"/workloads/*; do - file_name=$(basename -- "$workload_file") - workload_name="${file_name%.*}" - ray up -y $CLUSTER_CONFIG --cluster-name="$workload_name" & -done -# Wait for all of the nodes to be up. -wait - -status=$? -if [ $status != 0 ]; then - echo "Some update processes failed with $status" - exit 1 -fi - -# Start the workloads running. -for workload_file in "$ROOT_DIR"/workloads/*; do - file_name=$(basename -- "$workload_file") - workload_name="${file_name%.*}" - ( - # Copy the workload to the cluster. - ray rsync_up $CLUSTER_CONFIG --cluster-name="$workload_name" "$workload_file" "$file_name" - # Clean up previous runs if relevant. - ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "source activate tensorflow_p36 && ray stop; rm -r /tmp/ray; tmux kill-server | true" - # Start the workload. 
- ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "source activate tensorflow_p36 && python $file_name" --tmux - ) & -done -# Wait for child processes to finish. -wait - -popd - -# Print some helpful information. - -echo "" -echo "" - -echo "Use the following commands to attach to the relevant drivers." -echo "" -for workload_file in "$ROOT_DIR"/workloads/*; do - file_name=$(basename -- "$workload_file") - workload_name="${file_name%.*}" - echo " ray attach $ROOT_DIR/$CLUSTER_CONFIG --cluster-name=$workload_name --tmux" -done - -echo "" -echo "" - -echo "To shut down all instances, run the following." -echo " $ROOT_DIR/shut_down_workloads.sh" - -echo "" -echo "" - -echo "To check up on the scripts, run the following." -echo " $ROOT_DIR/check_workloads.sh --load" -echo " $ROOT_DIR/check_workloads.sh --logs"