Convert long running stress tests to projects (#5641)

Philipp Moritz 2019-09-26 11:25:09 -07:00 committed by GitHub
parent 5ecb02fb80
commit 57a5871ea6
6 changed files with 52 additions and 145 deletions

File: .rayproject/cluster.yaml (cluster config)

@@ -37,11 +37,6 @@ setup_commands:
# Install latest TensorFlow
- source activate tensorflow_p36 && conda remove -y --force wrapt || true
- source activate tensorflow_p36 && pip install -U tensorflow==1.14
# Install nightly Ray wheels.
# Example: https://s3-us-west-2.amazonaws.com/ray-wheels/<<<RAY_BRANCH>>>/<<<RAY_COMMIT>>>/ray-<<<RAY_VERSION>>>-cp36-cp36m-manylinux1_x86_64.whl
- source activate tensorflow_p36 && pip install -U RAY_WHEEL_TO_TEST_HERE
- source activate tensorflow_p36 && pip install ray[rllib] ray[debug] gym[atari]
- source activate tensorflow_p36 && pip install ray[debug]
- echo set-window-option -g mouse on > ~/.tmux.conf
- echo 'termcapinfo xterm* ti@:te@' > ~/.screenrc

File: .rayproject/project.yaml (new file)

@@ -0,0 +1,31 @@
name: long-running-tests
description: "Ray's long running stress tests"
cluster: .rayproject/cluster.yaml

commands:
  - name: run
    help: "Start a long running stress test."
    command: |
      # Install nightly Ray wheels.
      source activate tensorflow_p36 && pip install -U {{wheel}}
      source activate tensorflow_p36 && pip install ray[rllib] ray[debug] gym[atari]
      source activate tensorflow_p36 && pip install ray[debug]
      source activate tensorflow_p36 && python workloads/{{workload}}.py
    params:
      - name: wheel
        help: "URL to the ray wheel to test (defaults to latest)."
        default: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.8.0.dev4-cp36-cp36m-manylinux1_x86_64.whl
      - name: workload
        help: "Name of the workload to run."
        choices: ["actor_deaths", "apex", "impala", "many_actor_tasks", "many_drivers", "many_tasks", "node_failures", "pbt"]
    config:
      tmux: true
  - name: check-load
    command: uptime
    help: "Check load of the workload."
  - name: show-output
    command: tmux capture-pane -p
    help: "Show tail of the workload output."

File: README.rst

@@ -1,29 +1,37 @@
Long Running Tests
==================

This directory contains scripts for starting long-running workloads which are
intended to run forever until they fail.

This directory contains the long-running workloads which are intended to run
forever until they fail. To set up the project, you need to run:

.. code-block:: bash

    pip install any
    any project create
Running the Workloads
---------------------

To run the workloads, first edit the config.yaml and replace
``RAY_WHEEL_TO_TEST_HERE`` with the desired version to test, then run:

You can start all the workloads with:

.. code-block:: bash

    ./start_workloads.sh
    any session start -y run --workload="*" --wheel=https://s3-us-west-2.amazonaws.com/ray-wheels/releases/0.7.5/6da7eff4b20340f92d3fe1160df35caa68922a97/ray-0.7.5-cp36-cp36m-manylinux1_x86_64.whl
This will start one EC2 instance per workload and will start the workloads
running (one per instance). Running the ``./start_workloads.sh`` script again
will clean up any state from the previous runs and will start the workloads
again.
running (one per instance). You can start a specific workload by passing its
name to ``--workload=`` instead of ``"*"``. The list of available workloads can
be obtained via ``any session start run --help``.
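For example, starting just the ``many_tasks`` workload against the default
(latest nightly) wheel should look roughly like this:

    any session start -y run --workload="many_tasks"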
Check Workload Statuses
-----------------------
To check up on the workloads, run either ``./check_workloads.sh --load``, which
will print the load on each machine, or ``./check_workloads.sh --logs``, which
To check up on the workloads, run either
``any session --name="*" execute check-load``, which
will print the load on each machine, or
``any session --name="*" execute show-output``, which
will print the tail of the output for each workload.
To debug workloads that have failed, you may find it useful to ssh to the
@@ -35,9 +43,10 @@ Shut Down the Workloads
-----------------------
The instances running the workloads can all be killed by running
``./shut_down_workloads.sh``.
``any session stop --name "*"``.
Adding a Workload
-----------------
To create a new workload, simply add a new Python file under ``workloads/``.
To create a new workload, simply add a new Python file under ``workloads/`` and
add it to the ``run`` command in ``.rayproject/project.yaml``.
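As a rough sketch of those two steps (the workload name used here is made up
purely for illustration):

    # 1. Create the new workload script; like the existing ones, it should run
    #    until it fails. Copying an existing workload is one way to start.
    cp workloads/many_tasks.py workloads/my_new_workload.py
    # 2. Add "my_new_workload" to the `choices` list of the `workload` parameter
    #    in .rayproject/project.yaml so the run command accepts it.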

File: check_workloads.sh (deleted)

@@ -1,36 +0,0 @@
#!/usr/bin/env bash

# set -x

ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)

if [ "$1" == "--load" ]; then
  check_load=true
elif [ "$1" == "--logs" ]; then
  check_load=false
else
  echo "Usage: $0 [--load|--logs]"
  exit 1
fi

cd "$ROOT_DIR"

for workload_file in "$ROOT_DIR"/workloads/*; do
  file_name=$(basename -- "$workload_file")
  workload_name="${file_name%.*}"
  if $check_load; then
    echo -n "$workload_name: "
    ray --logging-level=WARNING exec config.yaml --cluster-name="$workload_name" uptime 2>/dev/null || echo "<offline>"
  else
    echo "======================================================================"
    echo "WORKLOAD: $workload_name"
    echo "======================================================================"
    ray exec config.yaml --cluster-name="$workload_name" "tmux capture-pane -p"
    echo ""
    echo "ssh to this machine with:"
    echo " ray attach $ROOT_DIR/config.yaml --cluster-name=$workload_name"
    echo ""
    echo ""
  fi
done
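With this script deleted, the same checks are exposed as project commands; a
rough equivalent under the ``any`` CLI syntax shown in the README changes above:

    any session --name="*" execute check-load     # roughly ./check_workloads.sh --load
    any session --name="*" execute show-output    # roughly ./check_workloads.sh --logs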

File: shut_down_workloads.sh (deleted)

@@ -1,18 +0,0 @@
#!/usr/bin/env bash

set -e

ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)

pushd "$ROOT_DIR"

# Kill all of the workloads.
for workload_file in "$ROOT_DIR"/workloads/*; do
  file_name=$(basename -- "$workload_file")
  workload_name="${file_name%.*}"
  ray down -y config.yaml --cluster-name="$workload_name" &
done

# Wait for all of the ray down commands to finish.
wait

popd
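The session-level stop command from the README changes above takes over for
this script; a one-line sketch:

    any session stop --name "*"    # roughly ./shut_down_workloads.sh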

File: start_workloads.sh (deleted)

@@ -1,74 +0,0 @@
#!/usr/bin/env bash

set -e

ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)

pushd "$ROOT_DIR"

# Substitute in the appropriate Ray version and commit in the config file and
# store it in a temporary file.
CLUSTER_CONFIG="config.yaml"
if grep -q RAY_WHEEL_TO_TEST_HERE $CLUSTER_CONFIG; then
  echo "You must replace the RAY_WHEEL_TO_TEST_HERE string in $CLUSTER_CONFIG."
  exit 1
fi

# Start one instance per workload.
for workload_file in "$ROOT_DIR"/workloads/*; do
  file_name=$(basename -- "$workload_file")
  workload_name="${file_name%.*}"
  ray up -y $CLUSTER_CONFIG --cluster-name="$workload_name" &
done

# Wait for all of the nodes to be up.
wait
status=$?
if [ $status != 0 ]; then
  echo "Some update processes failed with $status"
  exit 1
fi

# Start the workloads running.
for workload_file in "$ROOT_DIR"/workloads/*; do
  file_name=$(basename -- "$workload_file")
  workload_name="${file_name%.*}"
  (
    # Copy the workload to the cluster.
    ray rsync_up $CLUSTER_CONFIG --cluster-name="$workload_name" "$workload_file" "$file_name"
    # Clean up previous runs if relevant.
    ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "source activate tensorflow_p36 && ray stop; rm -r /tmp/ray; tmux kill-server | true"
    # Start the workload.
    ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "source activate tensorflow_p36 && python $file_name" --tmux
  ) &
done

# Wait for child processes to finish.
wait

popd

# Print some helpful information.
echo ""
echo ""
echo "Use the following commands to attach to the relevant drivers."
echo ""
for workload_file in "$ROOT_DIR"/workloads/*; do
  file_name=$(basename -- "$workload_file")
  workload_name="${file_name%.*}"
  echo " ray attach $ROOT_DIR/$CLUSTER_CONFIG --cluster-name=$workload_name --tmux"
done
echo ""
echo ""
echo "To shut down all instances, run the following."
echo " $ROOT_DIR/shut_down_workloads.sh"
echo ""
echo ""
echo "To check up on the scripts, run the following."
echo " $ROOT_DIR/check_workloads.sh --load"
echo " $ROOT_DIR/check_workloads.sh --logs"