Convert long running stress tests to projects (#5641)

parent 5ecb02fb80
commit 57a5871ea6

6 changed files with 52 additions and 145 deletions
@@ -37,11 +37,6 @@ setup_commands:
     # Install latest TensorFlow
     - source activate tensorflow_p36 && conda remove -y --force wrapt || true
     - source activate tensorflow_p36 && pip install -U tensorflow==1.14
-    # Install nightly Ray wheels.
-    # Example: https://s3-us-west-2.amazonaws.com/ray-wheels/<<<RAY_BRANCH>>>/<<<RAY_COMMIT>>>/ray-<<<RAY_VERSION>>>-cp36-cp36m-manylinux1_x86_64.whl
-    - source activate tensorflow_p36 && pip install -U RAY_WHEEL_TO_TEST_HERE
-    - source activate tensorflow_p36 && pip install ray[rllib] ray[debug] gym[atari]
-    - source activate tensorflow_p36 && pip install ray[debug]
     - echo set-window-option -g mouse on > ~/.tmux.conf
     - echo 'termcapinfo xterm* ti@:te@' > ~/.screenrc
ci/long_running_tests/.rayproject/project.yaml (new file, 31 lines)
@@ -0,0 +1,31 @@
name: long-running-tests
description: "Ray's long running stress tests"

cluster: .rayproject/cluster.yaml

commands:
  - name: run
    help: "Start a long running stress test."
    command: |
      # Install nightly Ray wheels.
      source activate tensorflow_p36 && pip install -U {{wheel}}
      source activate tensorflow_p36 && pip install ray[rllib] ray[debug] gym[atari]
      source activate tensorflow_p36 && pip install ray[debug]
      source activate tensorflow_p36 && python workloads/{{workload}}.py
    params:
      - name: wheel
        help: "URL to the ray wheel to test (defaults to latest)."
        default: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.8.0.dev4-cp36-cp36m-manylinux1_x86_64.whl
      - name: workload
        help: "Name of the workload to run."
        choices: ["actor_deaths", "apex", "impala", "many_actor_tasks", "many_drivers", "many_tasks", "node_failures", "pbt"]
    config:
      tmux: true

  - name: check-load
    command: uptime
    help: "Check load of the workload."

  - name: show-output
    command: tmux capture-pane -p
    help: "Show tail of the workload output."
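To illustrate the ``{{...}}`` templating above, the sketch below shows roughly what the ``run`` command executes once its parameters are substituted, using ``impala`` (one of the listed workload choices) and the default wheel URL; the actual expansion is handled by the project tooling, so this is only an approximation.

    # Approximate expansion of the "run" command for --workload=impala with
    # the default wheel from project.yaml.
    source activate tensorflow_p36 && pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.8.0.dev4-cp36-cp36m-manylinux1_x86_64.whl
    source activate tensorflow_p36 && pip install ray[rllib] ray[debug] gym[atari]
    source activate tensorflow_p36 && pip install ray[debug]
    source activate tensorflow_p36 && python workloads/impala.py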
ci/long_running_tests/README.rst
@@ -1,29 +1,37 @@
 Long Running Tests
 ==================
 
-This directory contains scripts for starting long-running workloads which are
-intended to run forever until they fail.
+This directory contains the long-running workloads which are intended to run
+forever until they fail. To set up the project you need to run
+
+.. code-block:: bash
+
+    pip install any
+    any project create
+
 
 Running the Workloads
 ---------------------
 
-To run the workloads, first edit the config.yaml and replace
-``RAY_WHEEL_TO_TEST_HERE`` with the desired version to test, then run:
+You can start all the workloads with:
 
 .. code-block:: bash
 
-    ./start_workloads.sh
+    any session start -y run --workload="*" --wheel=https://s3-us-west-2.amazonaws.com/ray-wheels/releases/0.7.5/6da7eff4b20340f92d3fe1160df35caa68922a97/ray-0.7.5-cp36-cp36m-manylinux1_x86_64.whl
 
 This will start one EC2 instance per workload and will start the workloads
-running (one per instance). Running the ``./start_workloads.sh`` script again
-will clean up any state from the previous runs and will start the workloads
-again.
+running (one per instance). You can start a specific workload by passing its
+name to ``--workload=`` instead of ``"*"``. A list of the available workloads
+is printed by ``any session start run --help``.
+
 
 Check Workload Statuses
 -----------------------
 
-To check up on the workloads, run either ``./check_workloads.sh --load``, which
-will print the load on each machine, or ``./check_workloads.sh --logs``, which
+To check up on the workloads, run either
+``any session --name="*" execute check-load``, which
+will print the load on each machine, or
+``any session --name="*" execute show-output``, which
 will print the tail of the output for each workload.
 
 To debug workloads that have failed, you may find it useful to ssh to the
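For example, assuming the project has been set up with ``any project create`` as described above, a single workload can be started and then checked on with commands like the following (the ``impala`` name is just one of the choices in ``.rayproject/project.yaml``, and the wheel is left at its declared default):

    # Start only the "impala" workload; the wheel parameter falls back to the
    # default URL declared in .rayproject/project.yaml.
    any session start -y run --workload="impala"

    # Check the load on the running session(s) and show the tail of their output.
    any session --name="*" execute check-load
    any session --name="*" execute show-output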
@@ -35,9 +43,10 @@ Shut Down the Workloads
 -----------------------
 
 The instances running the workloads can all be killed by running
-``./shut_down_workloads.sh``.
+``any session stop --name "*"``.
 
 Adding a Workload
 -----------------
 
-To create a new workload, simply add a new Python file under ``workloads/``.
+To create a new workload, simply add a new Python file under ``workloads/`` and
+add it to the ``run`` command's ``choices`` in ``.rayproject/project.yaml`` (see the sketch below).
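As a sketch of that last step (the ``my_new_workload`` name below is hypothetical): add the script under ``workloads/``, list its name under the ``workload`` parameter's ``choices`` in ``.rayproject/project.yaml``, and it can then be started and stopped like any other workload:

    # Hypothetical new workload: workloads/my_new_workload.py, plus an entry
    # in the "choices" list of .rayproject/project.yaml.
    any session start -y run --workload="my_new_workload"

    # When finished, tear down all sessions started for the long running tests.
    any session stop --name "*"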
ci/long_running_tests/check_workloads.sh (deleted file)
@@ -1,36 +0,0 @@
#!/usr/bin/env bash

# set -x

ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)

if [ "$1" == "--load" ]; then
    check_load=true
elif [ "$1" == "--logs" ]; then
    check_load=false
else
    echo "Usage: $0 [--load|--logs]"
    exit 1
fi

cd "$ROOT_DIR"

for workload_file in "$ROOT_DIR"/workloads/*; do
    file_name=$(basename -- $workload_file)
    workload_name="${file_name%.*}"
    if $check_load; then
        echo -n "$workload_name: "
        ray --logging-level=WARNING exec config.yaml --cluster-name="$workload_name" uptime 2>/dev/null || echo "<offline>"
    else
        echo "======================================================================"
        echo "WORKLOAD: $workload_name"
        echo "======================================================================"

        ray exec config.yaml --cluster-name="$workload_name" "tmux capture-pane -p"
        echo ""
        echo "ssh to this machine with:"
        echo " ray attach $ROOT_DIR/config.yaml --cluster-name=$workload_name"
        echo ""
        echo ""
    fi
done
ci/long_running_tests/shut_down_workloads.sh (deleted file)
@@ -1,18 +0,0 @@
#!/usr/bin/env bash

set -e

ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)

pushd "$ROOT_DIR"

# Kill all of the workloads.
for workload_file in "$ROOT_DIR"/workloads/*; do
    file_name=$(basename -- $workload_file)
    workload_name="${file_name%.*}"
    ray down -y config.yaml --cluster-name="$workload_name" &
done
# Wait for all of the ray down commands to finish.
wait

popd
ci/long_running_tests/start_workloads.sh (deleted file)
@@ -1,74 +0,0 @@
#!/usr/bin/env bash

set -e

ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
pushd "$ROOT_DIR"

# Substitute in the appropriate Ray version and commit in the config file and
# store it in a temporary file.
CLUSTER_CONFIG="config.yaml"

if grep -q RAY_WHEEL_TO_TEST_HERE $CLUSTER_CONFIG; then
    echo "You must replace the RAY_WHEEL_TO_TEST_HERE string in $CLUSTER_CONFIG."
    exit 1
fi

# Start one instance per workload.
for workload_file in "$ROOT_DIR"/workloads/*; do
    file_name=$(basename -- "$workload_file")
    workload_name="${file_name%.*}"
    ray up -y $CLUSTER_CONFIG --cluster-name="$workload_name" &
done
# Wait for all of the nodes to be up.
wait

status=$?
if [ $status != 0 ]; then
    echo "Some update processes failed with $status"
    exit 1
fi

# Start the workloads running.
for workload_file in "$ROOT_DIR"/workloads/*; do
    file_name=$(basename -- "$workload_file")
    workload_name="${file_name%.*}"
    (
        # Copy the workload to the cluster.
        ray rsync_up $CLUSTER_CONFIG --cluster-name="$workload_name" "$workload_file" "$file_name"
        # Clean up previous runs if relevant.
        ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "source activate tensorflow_p36 && ray stop; rm -r /tmp/ray; tmux kill-server | true"
        # Start the workload.
        ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "source activate tensorflow_p36 && python $file_name" --tmux
    ) &
done
# Wait for child processes to finish.
wait

popd

# Print some helpful information.

echo ""
echo ""

echo "Use the following commands to attach to the relevant drivers."
echo ""
for workload_file in "$ROOT_DIR"/workloads/*; do
    file_name=$(basename -- "$workload_file")
    workload_name="${file_name%.*}"
    echo " ray attach $ROOT_DIR/$CLUSTER_CONFIG --cluster-name=$workload_name --tmux"
done

echo ""
echo ""

echo "To shut down all instances, run the following."
echo " $ROOT_DIR/shut_down_workloads.sh"

echo ""
echo ""

echo "To check up on the scripts, run the following."
echo " $ROOT_DIR/check_workloads.sh --load"
echo " $ROOT_DIR/check_workloads.sh --logs"