mirror of
https://github.com/vale981/ray
synced 2025-03-05 18:11:42 -05:00
Improve release process from 0.8.2 (#7303)
This commit is contained in:
parent
f2faf8d26e
commit
29b08ddc09
31 changed files with 594 additions and 443 deletions
|
@ -6,8 +6,8 @@ forever until they fail. To set up the project you need to run
|
|||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install any
|
||||
any project create
|
||||
pip install anyscale
|
||||
anyscale project create
|
||||
|
||||
|
||||
Running the Workloads
|
||||
|
@ -17,21 +17,21 @@ You can start all the workloads with:
|
|||
|
||||
.. code-block:: bash
|
||||
|
||||
any session start -y run --workload="*" --wheel=https://s3-us-west-2.amazonaws.com/ray-wheels/releases/0.7.5/6da7eff4b20340f92d3fe1160df35caa68922a97/ray-0.7.5-cp36-cp36m-manylinux1_x86_64.whl
|
||||
anyscale session start -y run --workload="*" --wheel=https://s3-us-west-2.amazonaws.com/ray-wheels/releases/0.7.5/6da7eff4b20340f92d3fe1160df35caa68922a97/ray-0.7.5-cp36-cp36m-manylinux1_x86_64.whl
|
||||
|
||||
This will start one EC2 instance per workload and will start the workloads
|
||||
running (one per instance). You can start a specific workload by specifying
|
||||
its name as an argument ``--workload=`` instead of ``"*"``. A list of available options
|
||||
is available via `any session start run --help`.
|
||||
its name as an argument ``--workload=`` instead of ``"*"``. A list of
|
||||
available options is available via `any session start run --help`.
|
||||
|
||||
|
||||
Check Workload Statuses
|
||||
-----------------------
|
||||
|
||||
To check up on the workloads, run either
|
||||
``any session --name="*" execute check-load``, which
|
||||
``anyscale session --name="*" execute check-load``, which
|
||||
will print the load on each machine, or
|
||||
``any session --name="*" execute show-output``, which
|
||||
``anyscale session --name="*" execute show-output``, which
|
||||
will print the tail of the output for each workload.
|
||||
|
||||
To debug workloads that have failed, you may find it useful to ssh to the
|
||||
|
@ -43,7 +43,7 @@ Shut Down the Workloads
|
|||
-----------------------
|
||||
|
||||
The instances running the workloads can all be killed by running
|
||||
``any session stop --name "*"``.
|
||||
``anyscale session stop --name "*"``.
|
||||
|
||||
Adding a Workload
|
||||
-----------------
|
||||
|
|
|
@ -57,15 +57,17 @@ for _ in range(5):
|
|||
time.sleep(0.5)
|
||||
|
||||
connections = int(config.num_replicas * config.max_batch_size * 0.75)
|
||||
proc = subprocess.Popen(
|
||||
[
|
||||
"./hey_linux_amd64", "-c",
|
||||
str(connections), "-z", "360m", "http://127.0.0.1:8000/echo"
|
||||
],
|
||||
stdout=PIPE,
|
||||
stderr=PIPE)
|
||||
print("started load testing")
|
||||
proc.wait()
|
||||
out, err = proc.communicate()
|
||||
print(out.decode())
|
||||
print(err.decode())
|
||||
|
||||
while True:
|
||||
proc = subprocess.Popen(
|
||||
[
|
||||
"./hey_linux_amd64", "-c",
|
||||
str(connections), "-z", "60m", "http://127.0.0.1:8000/echo"
|
||||
],
|
||||
stdout=PIPE,
|
||||
stderr=PIPE)
|
||||
print("started load testing")
|
||||
proc.wait()
|
||||
out, err = proc.communicate()
|
||||
print(out.decode())
|
||||
print(err.decode())
|
||||
|
|
55
ci/microbenchmark/ray-project/cluster.yaml
Normal file
55
ci/microbenchmark/ray-project/cluster.yaml
Normal file
|
@ -0,0 +1,55 @@
|
|||
cluster_name: ray-release-microbenchmark
|
||||
min_workers: 0
|
||||
max_workers: 0
|
||||
target_utilization_fraction: 0.8
|
||||
idle_timeout_minutes: 5
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: aws
|
||||
region: us-west-2
|
||||
availability_zone: us-west-2a
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
|
||||
head_node:
|
||||
InstanceType: m4.16xlarge
|
||||
ImageId: ami-06d51e91cea0dac8d # Ubuntu 18.04
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 150
|
||||
|
||||
worker_nodes:
|
||||
InstanceType: m5.large
|
||||
ImageId: ami-06d51e91cea0dac8d # Ubuntu 18.04
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 150
|
||||
|
||||
# Run workers on spot by default. Comment this out to use on-demand.
|
||||
InstanceMarketOptions:
|
||||
MarketType: spot
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands:
|
||||
# Install latest TensorFlow
|
||||
- echo set-window-option -g mouse on > ~/.tmux.conf
|
||||
- echo 'termcapinfo xterm* ti@:te@' > ~/.screenrc
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
# Install Anaconda.
|
||||
- wget --quiet https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh || true
|
||||
- bash Anaconda3-5.0.1-Linux-x86_64.sh -b -p $HOME/anaconda3 || true
|
||||
- echo 'export PATH="$HOME/anaconda3/bin:$PATH"' >> ~/.bashrc
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
||||
# Command to start ray on the head node. You don't need to change this.
|
||||
head_start_ray_commands: []
|
||||
|
||||
# Command to start ray on worker nodes. You don't need to change this.
|
||||
worker_start_ray_commands: []
|
39
ci/microbenchmark/ray-project/project.yaml
Normal file
39
ci/microbenchmark/ray-project/project.yaml
Normal file
|
@ -0,0 +1,39 @@
|
|||
name: microbenchmark
|
||||
description: "Ray's microbenchmark"
|
||||
|
||||
cluster:
|
||||
config: ray-project/cluster.yaml
|
||||
|
||||
commands:
|
||||
- name: run
|
||||
help: "Start one microbenchmark trial."
|
||||
command: |
|
||||
rm ray-{{ray_version}}-cp36-cp36m-manylinux1_x86_64.whl || true
|
||||
wget https://s3-us-west-2.amazonaws.com/ray-wheels/{{ray_branch}}/{{commit}}/ray-{{ray_version}}-cp36-cp36m-manylinux1_x86_64.whl
|
||||
|
||||
pip uninstall -y -q ray
|
||||
pip install -U ray-{{ray_version}}-cp36-cp36m-manylinux1_x86_64.whl
|
||||
|
||||
OMP_NUM_THREADS=64 ray microbenchmark
|
||||
params:
|
||||
- name: ray_version # Ray version string.
|
||||
default: "0.9.0.dev0"
|
||||
|
||||
- name: commit # Ray commit SHA string.
|
||||
default: "FILL ME IN"
|
||||
|
||||
- name: ray_branch
|
||||
default: "master"
|
||||
config:
|
||||
tmux: true
|
||||
|
||||
# Pathnames for files and directories that should be saved
|
||||
# in a snapshot but that should not be synced with a# session. Pathnames can be relative to the project
|
||||
# directory or absolute. Generally, this should be files
|
||||
# that were created by an active session, such as
|
||||
# application checkpoints and logs.
|
||||
output_files: [
|
||||
# For example, uncomment this to save the logs from the
|
||||
# last ray job.
|
||||
# "/tmp/ray/session_latest",
|
||||
]
|
|
@ -0,0 +1,145 @@
|
|||
# This file runs on a single g3.16xl or p3.16xl node. It is suggested
|
||||
# to run these in a DLAMI / tensorflow_p36 env. Note that RL runs are
|
||||
# inherently high variance, so you'll have to check to see if the
|
||||
# rewards reached seem reasonably in line with previous results.
|
||||
#
|
||||
# You can find the reference results here:
|
||||
# https://github.com/ray-project/ray/tree/master/doc/dev/release_logs
|
||||
atari-impala:
|
||||
env: BreakoutNoFrameskip-v4
|
||||
run: IMPALA
|
||||
num_samples: 4
|
||||
stop:
|
||||
time_total_s: 3600
|
||||
config:
|
||||
sample_batch_size: 50
|
||||
train_batch_size: 500
|
||||
num_workers: 10
|
||||
num_envs_per_worker: 5
|
||||
clip_rewards: True
|
||||
lr_schedule: [
|
||||
[0, 0.0005],
|
||||
[20000000, 0.000000000001],
|
||||
]
|
||||
num_gpus: 1
|
||||
atari-ppo-tf:
|
||||
env: BreakoutNoFrameskip-v4
|
||||
run: PPO
|
||||
num_samples: 4
|
||||
stop:
|
||||
time_total_s: 3600
|
||||
config:
|
||||
lambda: 0.95
|
||||
kl_coeff: 0.5
|
||||
clip_rewards: True
|
||||
clip_param: 0.1
|
||||
vf_clip_param: 10.0
|
||||
entropy_coeff: 0.01
|
||||
train_batch_size: 5000
|
||||
sample_batch_size: 100
|
||||
sgd_minibatch_size: 500
|
||||
num_sgd_iter: 10
|
||||
num_workers: 10
|
||||
num_envs_per_worker: 5
|
||||
batch_mode: truncate_episodes
|
||||
observation_filter: NoFilter
|
||||
vf_share_layers: true
|
||||
num_gpus: 1
|
||||
atari-ppo-torch:
|
||||
env: BreakoutNoFrameskip-v4
|
||||
run: PPO
|
||||
num_samples: 4
|
||||
stop:
|
||||
time_total_s: 3600
|
||||
config:
|
||||
use_pytorch: true,
|
||||
lambda: 0.95
|
||||
kl_coeff: 0.5
|
||||
clip_rewards: True
|
||||
clip_param: 0.1
|
||||
vf_clip_param: 10.0
|
||||
entropy_coeff: 0.01
|
||||
train_batch_size: 5000
|
||||
sample_batch_size: 100
|
||||
sgd_minibatch_size: 500
|
||||
num_sgd_iter: 10
|
||||
num_workers: 10
|
||||
num_envs_per_worker: 5
|
||||
batch_mode: truncate_episodes
|
||||
observation_filter: NoFilter
|
||||
vf_share_layers: true
|
||||
num_gpus: 1
|
||||
apex:
|
||||
env: BreakoutNoFrameskip-v4
|
||||
run: APEX
|
||||
num_samples: 4
|
||||
stop:
|
||||
time_total_s: 3600
|
||||
config:
|
||||
double_q: false
|
||||
dueling: false
|
||||
num_atoms: 1
|
||||
noisy: false
|
||||
n_step: 3
|
||||
lr: .0001
|
||||
adam_epsilon: .00015
|
||||
hiddens: [512]
|
||||
buffer_size: 1000000
|
||||
exploration_config:
|
||||
epsilon_timesteps: 200000
|
||||
final_epsilon: 0.01
|
||||
prioritized_replay_alpha: 0.5
|
||||
final_prioritized_replay_beta: 1.0
|
||||
prioritized_replay_beta_annealing_timesteps: 2000000
|
||||
num_gpus: 1
|
||||
num_workers: 8
|
||||
num_envs_per_worker: 8
|
||||
sample_batch_size: 20
|
||||
train_batch_size: 512
|
||||
target_network_update_freq: 50000
|
||||
timesteps_per_iteration: 25000
|
||||
atari-a2c:
|
||||
env: BreakoutNoFrameskip-v4
|
||||
run: A2C
|
||||
num_samples: 4
|
||||
stop:
|
||||
time_total_s: 3600
|
||||
config:
|
||||
sample_batch_size: 20
|
||||
clip_rewards: True
|
||||
num_workers: 5
|
||||
num_envs_per_worker: 5
|
||||
num_gpus: 1
|
||||
lr_schedule: [
|
||||
[0, 0.0007],
|
||||
[20000000, 0.000000000001],
|
||||
]
|
||||
atari-basic-dqn:
|
||||
env: BreakoutNoFrameskip-v4
|
||||
run: DQN
|
||||
num_samples: 4
|
||||
stop:
|
||||
time_total_s: 3600
|
||||
config:
|
||||
double_q: false
|
||||
dueling: false
|
||||
num_atoms: 1
|
||||
noisy: false
|
||||
prioritized_replay: false
|
||||
n_step: 1
|
||||
target_network_update_freq: 8000
|
||||
lr: .0000625
|
||||
adam_epsilon: .00015
|
||||
hiddens: [512]
|
||||
learning_starts: 20000
|
||||
buffer_size: 1000000
|
||||
sample_batch_size: 4
|
||||
train_batch_size: 32
|
||||
exploration_config:
|
||||
epsilon_timesteps: 200000
|
||||
final_epsilon: 0.01
|
||||
prioritized_replay_alpha: 0.5
|
||||
final_prioritized_replay_beta: 1.0
|
||||
prioritized_replay_beta_annealing_timesteps: 2000000
|
||||
num_gpus: 0.2
|
||||
timesteps_per_iteration: 10000
|
|
@ -0,0 +1,43 @@
|
|||
cluster_name: ray-rllib-regression-tests
|
||||
|
||||
min_workers: 0
|
||||
max_workers: 0
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: aws
|
||||
region: us-west-2
|
||||
availability_zone: us-west-2a
|
||||
cache_stopped_nodes: False
|
||||
|
||||
# How Ray will authenticate with newly launched nodes.
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
|
||||
head_node:
|
||||
InstanceType: p3.16xlarge
|
||||
ImageId: ami-07728e9e2742b0662 # Deep Learning AMI (Ubuntu 16.04)
|
||||
|
||||
# Set primary volume to 25 GiB
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 100
|
||||
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands:
|
||||
- wget --quiet https://s3-us-west-2.amazonaws.com/ray-wheels/releases/{{ray_version}}/{{commit}}/ray-{{ray_version}}-cp36-cp36m-manylinux1_x86_64.whl
|
||||
- source activate tensorflow_p36 && pip install -U ray-{{ray_version}}-cp36-cp36m-manylinux1_x86_64.whl
|
||||
- source activate tensorflow_p36 && pip install ray[rllib] ray[debug]
|
||||
- source activate tensorflow_p36 && pip install boto3==1.4.8 cython==0.29.0
|
||||
|
||||
# Command to start ray on the head node. You don't need to change this.
|
||||
head_start_ray_commands:
|
||||
- source activate tensorflow_p36 && ray stop
|
||||
- ulimit -n 65536; source activate tensorflow_p36 && OMP_NUM_THREADS=1 ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
|
||||
|
||||
# Command to start ray on worker nodes. You don't need to change this.
|
||||
worker_start_ray_commands:
|
||||
- source activate tensorflow_p36 && ray stop
|
||||
- ulimit -n 65536; source activate tensorflow_p36 && OMP_NUM_THREADS=1 ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
|
@ -0,0 +1,53 @@
|
|||
# This file is generated by `ray project create`.
|
||||
|
||||
name: rllib_regression_tests
|
||||
|
||||
# description: A short description of the project.
|
||||
# The URL of the repo this project is part of.
|
||||
# repo: ...
|
||||
|
||||
cluster:
|
||||
config: ray-project/cluster.yaml
|
||||
params:
|
||||
- name: ray_version # Ray version string.
|
||||
default: "0.8.2"
|
||||
|
||||
- name: commit # Ray commit SHA string.
|
||||
default: "f5a1307a608fe5fdbdb04616b22c91f029af329a"
|
||||
|
||||
|
||||
environment:
|
||||
# dockerfile: The dockerfile to be built and ran the commands with.
|
||||
# dockerimage: The docker image to be used to run the project in, e.g. ubuntu:18.04.
|
||||
requirements: ray-project/requirements.txt
|
||||
|
||||
shell: # Shell commands to be ran for environment setup.
|
||||
- echo "Setting up the environment"
|
||||
|
||||
commands:
|
||||
- name: check-load
|
||||
command: uptime
|
||||
help: "Check load of the workload."
|
||||
|
||||
- name: check-gpu
|
||||
command: nvidia-smi
|
||||
help: "Check load of the gpu."
|
||||
|
||||
- name: show-output
|
||||
command: tmux capture-pane -p
|
||||
help: "Show tail of the workoad output."
|
||||
|
||||
- name: run-regression-tests
|
||||
command: source activate tensorflow_p36 && rllib train -f compact-regression-test.yaml
|
||||
help: "Run rllib regression tests"
|
||||
|
||||
# Pathnames for files and directories that should be saved
|
||||
# in a snapshot but that should not be synced with a# session. Pathnames can be relative to the project
|
||||
# directory or absolute. Generally, this should be files
|
||||
# that were created by an active session, such as
|
||||
# application checkpoints and logs.
|
||||
output_files: [
|
||||
# For example, uncomment this to save the logs from the
|
||||
# last ray job.
|
||||
# "/tmp/ray/session_latest",
|
||||
]
|
|
@ -0,0 +1 @@
|
|||
ray[rllib]
|
|
@ -0,0 +1,24 @@
|
|||
# Taken from rllib/tuned_examples/atari_impala_large.yaml
|
||||
|
||||
# Runs on a g3.16xl node with 5 m5.24xl workers
|
||||
# Takes roughly 10 minutes. x10?
|
||||
atari-impala:
|
||||
env:
|
||||
grid_search:
|
||||
- BreakoutNoFrameskip-v4
|
||||
- BeamRiderNoFrameskip-v4
|
||||
- QbertNoFrameskip-v4
|
||||
- SpaceInvadersNoFrameskip-v4
|
||||
run: IMPALA
|
||||
stop:
|
||||
timesteps_total: 30000000
|
||||
config:
|
||||
sample_batch_size: 50
|
||||
train_batch_size: 500
|
||||
num_workers: 128
|
||||
num_envs_per_worker: 5
|
||||
clip_rewards: True
|
||||
lr_schedule: [
|
||||
[0, 0.0005],
|
||||
[20000000, 0.000000000001],
|
||||
]
|
|
@ -3,22 +3,15 @@
|
|||
####################################################################
|
||||
|
||||
# An unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: <<<CLUSTER_NAME>>>
|
||||
cluster_name: ray-rllib-stress-tests
|
||||
|
||||
# The minimum number of workers nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: <<<MIN_WORKERS>>>
|
||||
min_workers: 9
|
||||
|
||||
# The maximum number of workers nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: <<<MAX_WORKERS>>>
|
||||
|
||||
# This executes all commands on all nodes in the docker container,
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
docker:
|
||||
image: "" # e.g., tensorflow/tensorflow:1.5.0-py3
|
||||
container_name: "" # e.g. ray_docker
|
||||
max_workers: 9
|
||||
|
||||
# The autoscaler will scale up the cluster to this target fraction of resource
|
||||
# usage. For example, if a cluster of 10 nodes is 100% busy and
|
||||
|
@ -35,7 +28,7 @@ provider:
|
|||
type: aws
|
||||
region: us-west-2
|
||||
availability_zone: us-west-2a
|
||||
cache_stopped_nodes: false
|
||||
cache_stopped_nodes: False
|
||||
|
||||
# How Ray will authenticate with newly launched nodes.
|
||||
auth:
|
||||
|
@ -50,10 +43,10 @@ auth:
|
|||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
head_node:
|
||||
InstanceType: <<<HEAD_TYPE>>>
|
||||
InstanceType: p3.16xlarge
|
||||
ImageId: ami-07728e9e2742b0662 # Deep Learning AMI (Ubuntu 16.04)
|
||||
|
||||
# You can provision additional disk space with a conf as follows
|
||||
# Set primary volume to 25 GiB
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
|
@ -66,12 +59,19 @@ head_node:
|
|||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
worker_nodes:
|
||||
InstanceType: <<<WORKER_TYPE>>>
|
||||
InstanceType: m4.16xlarge
|
||||
ImageId: ami-07728e9e2742b0662 # Deep Learning AMI (Ubuntu 16.04)
|
||||
|
||||
|
||||
# Set primary volume to 25 GiB
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 100
|
||||
|
||||
# Run workers on spot by default. Comment this out to use on-demand.
|
||||
# InstanceMarketOptions:
|
||||
# MarketType: spot
|
||||
# InstanceMarketOptions:
|
||||
# MarketType: spot
|
||||
# Additional options can be found in the boto docs, e.g.
|
||||
# SpotOptions:
|
||||
# MaxPrice: MAX_HOURLY_PRICE
|
||||
|
@ -87,17 +87,13 @@ file_mounts: {
|
|||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands:
|
||||
- wget --quiet https://s3-us-west-2.amazonaws.com/ray-wheels/releases/<<<RAY_VERSION>>>/<<<RAY_COMMIT>>>/ray-<<<RAY_VERSION>>>-<<<WHEEL_STR>>>-manylinux1_x86_64.whl
|
||||
- source activate tensorflow_p36 && pip install -U ray-<<<RAY_VERSION>>>-<<<WHEEL_STR>>>-manylinux1_x86_64.whl
|
||||
- wget --quiet https://s3-us-west-2.amazonaws.com/ray-wheels/releases/{{ray_version}}/{{commit}}/ray-{{ray_version}}-cp36-cp36m-manylinux1_x86_64.whl
|
||||
- source activate tensorflow_p36 && pip install -U ray-{{ray_version}}-cp36-cp36m-manylinux1_x86_64.whl
|
||||
- source activate tensorflow_p36 && pip install ray[rllib] ray[debug]
|
||||
# Consider uncommenting these if you also want to run apt-get commands during setup
|
||||
# - sudo pkill -9 apt-get || true
|
||||
# - sudo pkill -9 dpkg || true
|
||||
# - sudo dpkg --configure -a
|
||||
- source activate tensorflow_p36 && pip install boto3==1.4.8 cython==0.29.0
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
- pip install boto3==1.4.8 # 1.4.8 adds InstanceMarketOptions
|
||||
head_setup_commands: []
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
|
@ -0,0 +1,49 @@
|
|||
# This file is generated by `ray project create`.
|
||||
|
||||
name: rllib_stress_tests
|
||||
|
||||
# description: A short description of the project.
|
||||
# The URL of the repo this project is part of.
|
||||
# repo: ...
|
||||
|
||||
cluster:
|
||||
config: ray-project/cluster.yaml
|
||||
params:
|
||||
- name: ray_version # Ray version string.
|
||||
default: "0.8.2"
|
||||
|
||||
- name: commit # Ray commit SHA string.
|
||||
default: "f5a1307a608fe5fdbdb04616b22c91f029af329a"
|
||||
|
||||
|
||||
environment:
|
||||
# dockerfile: The dockerfile to be built and ran the commands with.
|
||||
# dockerimage: The docker image to be used to run the project in, e.g. ubuntu:18.04.
|
||||
requirements: ray-project/requirements.txt
|
||||
|
||||
shell: # Shell commands to be ran for environment setup.
|
||||
- echo "Setting up the environment"
|
||||
|
||||
commands:
|
||||
- name: check-load
|
||||
command: uptime
|
||||
help: "Check load of the workload."
|
||||
|
||||
- name: show-output
|
||||
command: tmux capture-pane -p
|
||||
help: "Show tail of the workoad output."
|
||||
|
||||
- name: run-impala
|
||||
command: bash run.sh
|
||||
help: "Run impala stress test"
|
||||
|
||||
# Pathnames for files and directories that should be saved
|
||||
# in a snapshot but that should not be synced with a# session. Pathnames can be relative to the project
|
||||
# directory or absolute. Generally, this should be files
|
||||
# that were created by an active session, such as
|
||||
# application checkpoints and logs.
|
||||
output_files: [
|
||||
# For example, uncomment this to save the logs from the
|
||||
# last ray job.
|
||||
# "/tmp/ray/session_latest",
|
||||
]
|
|
@ -0,0 +1 @@
|
|||
ray[rllib]
|
6
ci/regression_test/rllib_stress_tests/run.sh
Normal file
6
ci/regression_test/rllib_stress_tests/run.sh
Normal file
|
@ -0,0 +1,6 @@
|
|||
|
||||
source activate tensorflow_p36
|
||||
|
||||
python3 wait_cluster.py
|
||||
|
||||
rllib train -f atari_impala_xlarge.yaml --ray-address=auto --queue-trials
|
10
ci/regression_test/rllib_stress_tests/wait_cluster.py
Normal file
10
ci/regression_test/rllib_stress_tests/wait_cluster.py
Normal file
|
@ -0,0 +1,10 @@
|
|||
import ray
|
||||
import time
|
||||
|
||||
ray.init(address="auto")
|
||||
|
||||
curr_nodes = 0
|
||||
while not curr_nodes > 8:
|
||||
print("Waiting for more nodes to come up: {}/{}".format(curr_nodes, 8))
|
||||
curr_nodes = len(ray.nodes())
|
||||
time.sleep(5)
|
|
@ -98,7 +98,7 @@ setup_commands:
|
|||
# - ray/ci/travis/install-bazel.sh
|
||||
- pip install boto3==1.4.8 cython==0.29.0
|
||||
# - cd ray/python; git checkout master; git pull; pip install -e . --verbose
|
||||
- "pip install https://s3-us-west-2.amazonaws.com/ray-wheels/releases/{{ray_version}}/{{commit}}/ray-{{ray_version}}-cp36-cp36m-manylinux1_x86_64.whl"
|
||||
- "pip install https://s3-us-west-2.amazonaws.com/ray-wheels/{{ray_branch}}/{{commit}}/ray-{{ray_version}}-cp36-cp36m-manylinux1_x86_64.whl"
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands: []
|
|
@ -6,11 +6,13 @@ cluster:
|
|||
config: ray-project/cluster.yaml
|
||||
params:
|
||||
- name: ray_version # Ray version string.
|
||||
default: "0.8.1"
|
||||
default: "0.8.2"
|
||||
|
||||
- name: commit # Ray commit SHA string.
|
||||
default: "38ec2e70524a277d5aea307f6c843065ff982da5"
|
||||
default: "f5a1307a608fe5fdbdb04616b22c91f029af329a"
|
||||
|
||||
- name: ray_branch
|
||||
default: "releases/0.8.2"
|
||||
|
||||
commands:
|
||||
- name: test_many_tasks
|
4
ci/stress_tests/.gitignore
vendored
4
ci/stress_tests/.gitignore
vendored
|
@ -1,4 +0,0 @@
|
|||
*.log
|
||||
*temporary.yaml
|
||||
rllib_impala_p36.yaml
|
||||
sgd_p36.yaml
|
|
@ -1,158 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# This script should be run as follows:
|
||||
# ./run_application_stress_tests.sh <ray-version> <ray-commit>
|
||||
# For example, <ray-version> might be 0.7.1
|
||||
# and <ray-commit> might be bc3b6efdb6933d410563ee70f690855c05f25483. The commit
|
||||
# should be the latest commit on the branch "releases/<ray-version>".
|
||||
|
||||
# This script runs all of the application tests.
|
||||
# Currently includes an IMPALA stress test and a SGD stress test on Python 3.6.
|
||||
# All tests use a separate cluster, and each cluster
|
||||
# will be destroyed upon test completion (or failure).
|
||||
|
||||
# Note that if the environment variable DEBUG_MODE is detected,
|
||||
# the clusters will not be automatically shut down after the test runs.
|
||||
|
||||
# This script will exit with code 1 if the test did not run successfully.
|
||||
|
||||
# Show explicitly which commands are currently running. This should only be AFTER
|
||||
# the private key is placed.
|
||||
set -x
|
||||
|
||||
ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
|
||||
RESULT_FILE=$ROOT_DIR/"results-$(date '+%Y-%m-%d_%H-%M-%S').log"
|
||||
|
||||
touch "$RESULT_FILE"
|
||||
echo "Logging to" "$RESULT_FILE"
|
||||
|
||||
if [[ -z "$1" ]]; then
|
||||
echo "ERROR: The first argument must be the Ray version string."
|
||||
exit 1
|
||||
else
|
||||
RAY_VERSION=$1
|
||||
fi
|
||||
|
||||
if [[ -z "$2" ]]; then
|
||||
echo "ERROR: The second argument must be the commit hash to test."
|
||||
exit 1
|
||||
else
|
||||
RAY_COMMIT=$2
|
||||
fi
|
||||
|
||||
echo "Testing ray==$RAY_VERSION at commit $RAY_COMMIT."
|
||||
echo "The wheels used will live under https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_COMMIT/"
|
||||
|
||||
# This function identifies the right string for the Ray wheel.
|
||||
_find_wheel_str(){
|
||||
local python_version=$1
|
||||
# echo "PYTHON_VERSION", $python_version
|
||||
local wheel_str=""
|
||||
if [ "$python_version" == "p27" ]; then
|
||||
wheel_str="cp27-cp27mu"
|
||||
else
|
||||
wheel_str="cp36-cp36m"
|
||||
fi
|
||||
echo $wheel_str
|
||||
}
|
||||
|
||||
# Total time is roughly 25 minutes.
|
||||
# Actual test runtime is roughly 10 minutes.
|
||||
test_impala(){
|
||||
local PYTHON_VERSION=$1
|
||||
local WHEEL_STR=$(_find_wheel_str "$PYTHON_VERSION")
|
||||
|
||||
pushd "$ROOT_DIR"
|
||||
local TEST_NAME="rllib_impala_$PYTHON_VERSION"
|
||||
local CLUSTER="$TEST_NAME.yaml"
|
||||
echo "Creating IMPALA cluster YAML from template."
|
||||
|
||||
cat application_cluster_template.yaml |
|
||||
sed -e "
|
||||
s/<<<RAY_VERSION>>>/$RAY_VERSION/g;
|
||||
s/<<<RAY_COMMIT>>>/$RAY_COMMIT/;
|
||||
s/<<<CLUSTER_NAME>>>/$TEST_NAME/;
|
||||
s/<<<HEAD_TYPE>>>/p3.16xlarge/;
|
||||
s/<<<WORKER_TYPE>>>/m4.16xlarge/;
|
||||
s/<<<MIN_WORKERS>>>/9/;
|
||||
s/<<<MAX_WORKERS>>>/9/;
|
||||
s/<<<PYTHON_VERSION>>>/$PYTHON_VERSION/;
|
||||
s/<<<WHEEL_STR>>>/$WHEEL_STR/;" > "$CLUSTER"
|
||||
|
||||
echo "Try running IMPALA stress test."
|
||||
{
|
||||
RLLIB_DIR=../../python/ray/rllib/
|
||||
ray --logging-level=DEBUG up -y "$CLUSTER" &&
|
||||
ray rsync_up "$CLUSTER" $RLLIB_DIR/tuned_examples/ tuned_examples/ &&
|
||||
# HACK: the test will deadlock if it scales up slowly, so we have to wait
|
||||
# for the cluster to be fully launched first. This is because the first
|
||||
# trial will occupy all the CPU slots if it can, preventing GPU access.
|
||||
sleep 200 &&
|
||||
ray --logging-level=DEBUG exec "$CLUSTER" "source activate tensorflow_p36 && rllib train -f tuned_examples/atari-impala-large.yaml --ray-address='localhost:6379' --queue-trials" &&
|
||||
echo "PASS: IMPALA Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"
|
||||
} || echo "FAIL: IMPALA Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"
|
||||
|
||||
# Tear down cluster.
|
||||
if [ "$DEBUG_MODE" = "" ]; then
|
||||
ray down -y "$CLUSTER"
|
||||
rm "$CLUSTER"
|
||||
else
|
||||
echo "Not tearing down cluster" "$CLUSTER"
|
||||
fi
|
||||
popd
|
||||
}
|
||||
|
||||
# Total runtime is about 20 minutes (if the AWS spot instance order is fulfilled).
|
||||
# Actual test runtime is roughly 10 minutes.
|
||||
test_sgd(){
|
||||
local PYTHON_VERSION=$1
|
||||
local WHEEL_STR=$(_find_wheel_str $PYTHON_VERSION)
|
||||
|
||||
pushd "$ROOT_DIR"
|
||||
local TEST_NAME="sgd_$PYTHON_VERSION"
|
||||
local CLUSTER="$TEST_NAME.yaml"
|
||||
echo "Creating SGD cluster YAML from template."
|
||||
|
||||
cat application_cluster_template.yaml |
|
||||
sed -e "
|
||||
s/<<<RAY_VERSION>>>/$RAY_VERSION/g;
|
||||
s/<<<RAY_COMMIT>>>/$RAY_COMMIT/;
|
||||
s/<<<CLUSTER_NAME>>>/$TEST_NAME/;
|
||||
s/<<<HEAD_TYPE>>>/p3.16xlarge/;
|
||||
s/<<<WORKER_TYPE>>>/p3.16xlarge/;
|
||||
s/<<<MIN_WORKERS>>>/3/;
|
||||
s/<<<MAX_WORKERS>>>/3/;
|
||||
s/<<<PYTHON_VERSION>>>/$PYTHON_VERSION/;
|
||||
s/<<<WHEEL_STR>>>/$WHEEL_STR/;" > "$CLUSTER"
|
||||
|
||||
echo "Try running SGD stress test."
|
||||
{
|
||||
SGD_DIR=$ROOT_DIR/../../python/ray/util/sgd/
|
||||
ray --logging-level=DEBUG up -y "$CLUSTER" &&
|
||||
# TODO: fix submit so that args work
|
||||
ray rsync_up "$CLUSTER" "$SGD_DIR/mnist_example.py" mnist_example.py &&
|
||||
sleep 1 &&
|
||||
ray --logging-level=DEBUG exec "$CLUSTER" "
|
||||
python mnist_example.py --address=localhost:6379 --num-iters=2000 --num-workers=8 --devices-per-worker=2 --gpu" &&
|
||||
echo "PASS: SGD Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"
|
||||
} || echo "FAIL: SGD Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"
|
||||
|
||||
# Tear down cluster.
|
||||
if [ "$DEBUG_MODE" = "" ]; then
|
||||
ray down -y "$CLUSTER"
|
||||
rm "$CLUSTER"
|
||||
else
|
||||
echo "Not tearing down cluster" "$CLUSTER"
|
||||
fi
|
||||
popd
|
||||
}
|
||||
|
||||
# RUN TESTS
|
||||
for PYTHON_VERSION in "p36"
|
||||
do
|
||||
test_impala $PYTHON_VERSION
|
||||
done
|
||||
|
||||
cat "$RESULT_FILE"
|
||||
cat "$RESULT_FILE" | grep FAIL > test.log
|
||||
[ ! -s test.log ] || exit 1
|
|
@ -1,28 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Cause the script to exit if a single command fails.
|
||||
set -e
|
||||
|
||||
# Show explicitly which commands are currently running.
|
||||
set -x
|
||||
|
||||
MEMORY_SIZE="20G"
|
||||
SHM_SIZE="20G"
|
||||
|
||||
docker build -q --no-cache -t ray-project/base-deps docker/base-deps
|
||||
|
||||
# Add Ray source
|
||||
git rev-parse HEAD > ./docker/stress_test/git-rev
|
||||
git archive -o ./docker/stress_test/ray.tar $(git rev-parse HEAD)
|
||||
DOCKER_SHA=$(docker build --no-cache -q -t ray-project/stress_test docker/stress_test)
|
||||
|
||||
echo "Using Docker image" $DOCKER_SHA
|
||||
docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 \
|
||||
-e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e RAY_AWS_SSH_KEY \
|
||||
$DOCKER_SHA \
|
||||
bash /ray/ci/stress_tests/run_stress_tests.sh
|
||||
|
||||
# docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 \
|
||||
# -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e RAY_AWS_SSH_KEY \
|
||||
# $DOCKER_SHA \
|
||||
# bash /ray/ci/stress_tests/run_application_stress_tests.sh
|
|
@ -1,61 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Show explicitly which commands are currently running.
|
||||
set -x
|
||||
|
||||
ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
|
||||
RESULT_FILE=$ROOT_DIR/results-$(date '+%Y-%m-%d_%H-%M-%S').log
|
||||
|
||||
touch "$RESULT_FILE"
|
||||
echo "Logging to" "$RESULT_FILE"
|
||||
|
||||
if [[ -z "$1" ]]; then
|
||||
echo "ERROR: The first argument must be the Ray version string."
|
||||
exit 1
|
||||
else
|
||||
RAY_VERSION=$1
|
||||
fi
|
||||
|
||||
if [[ -z "$2" ]]; then
|
||||
echo "ERROR: The second argument must be the commit hash to test."
|
||||
exit 1
|
||||
else
|
||||
RAY_COMMIT=$2
|
||||
fi
|
||||
|
||||
echo "Testing ray==$RAY_VERSION at commit $RAY_COMMIT."
|
||||
echo "The wheels used will live under https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_COMMIT/"
|
||||
|
||||
run_test(){
|
||||
local test_name=$1
|
||||
|
||||
local CLUSTER="stress_testing_config_temporary.yaml"
|
||||
|
||||
cat stress_testing_config.yaml |
|
||||
sed -e "
|
||||
s/<<<RAY_VERSION>>>/$RAY_VERSION/g;
|
||||
s/<<<RAY_COMMIT>>>/$RAY_COMMIT/;" > "$CLUSTER"
|
||||
|
||||
echo "Try running $test_name."
|
||||
{
|
||||
ray up -y $CLUSTER --cluster-name "$test_name" &&
|
||||
sleep 1 &&
|
||||
ray --logging-level=DEBUG submit "$CLUSTER" --cluster-name "$test_name" "$test_name.py"
|
||||
} || echo "FAIL: $test_name" >> "$RESULT_FILE"
|
||||
|
||||
# Tear down cluster.
|
||||
if [ "$DEBUG_MODE" = "" ]; then
|
||||
ray down -y $CLUSTER --cluster-name "$test_name"
|
||||
rm "$CLUSTER"
|
||||
else
|
||||
echo "Not tearing down cluster" "$CLUSTER"
|
||||
fi
|
||||
}
|
||||
|
||||
pushd "$ROOT_DIR"
|
||||
run_test test_many_tasks
|
||||
run_test test_dead_actors
|
||||
popd
|
||||
|
||||
cat "$RESULT_FILE"
|
||||
[ ! -s "$RESULT_FILE" ] || exit 1
|
|
@ -1,117 +0,0 @@
|
|||
####################################################################
|
||||
# All nodes in this cluster will auto-terminate in 1 hour
|
||||
####################################################################
|
||||
|
||||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: stress-testing
|
||||
|
||||
# The minimum number of worker nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 105
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: 105
|
||||
|
||||
# The autoscaler will scale up the cluster to this target fraction of resource
|
||||
# usage. For example, if a cluster of 10 nodes is 100% busy and
|
||||
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
|
||||
# can be decreased to increase the aggressiveness of upscaling.
|
||||
# This value must be less than 1.0 for scaling to happen.
|
||||
target_utilization_fraction: 0.8
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: aws
|
||||
region: us-west-2
|
||||
availability_zone: us-west-2a
|
||||
cache_stopped_nodes: False
|
||||
|
||||
# How Ray will authenticate with newly launched nodes.
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
# By default Ray creates a new private keypair, but you can also use your own.
|
||||
# If you do so, make sure to also set "KeyName" in the head and worker node
|
||||
# configurations below.
|
||||
# ssh_private_key: /path/to/your/key.pem
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
head_node:
|
||||
InstanceType: m4.16xlarge
|
||||
ImageId: ami-06d51e91cea0dac8d # Ubuntu 18.04
|
||||
|
||||
# Set primary volume to 100 GiB
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 100
|
||||
|
||||
# Additional options in the boto docs.
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
worker_nodes:
|
||||
InstanceType: m4.large
|
||||
ImageId: ami-06d51e91cea0dac8d # Ubuntu 18.04
|
||||
|
||||
# Set primary volume to 100 GiB
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 100
|
||||
|
||||
# Run workers on spot by default. Comment this out to use on-demand.
|
||||
InstanceMarketOptions:
|
||||
MarketType: spot
|
||||
# Additional options can be found in the boto docs, e.g.
|
||||
# SpotOptions:
|
||||
# MaxPrice: MAX_HOURLY_PRICE
|
||||
|
||||
# Additional options in the boto docs.
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
}
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands:
|
||||
# Uncomment these if you want to build ray from source.
|
||||
# - sudo apt-get -qq update
|
||||
# - sudo apt-get install -y build-essential curl unzip
|
||||
# Install Anaconda.
|
||||
- wget --quiet https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh || true
|
||||
- bash Anaconda3-5.0.1-Linux-x86_64.sh -b -p $HOME/anaconda3 || true
|
||||
- echo 'export PATH="$HOME/anaconda3/bin:$PATH"' >> ~/.bashrc
|
||||
# # Build Ray.
|
||||
# - git clone https://github.com/ray-project/ray || true
|
||||
# - ray/ci/travis/install-bazel.sh
|
||||
- pip install boto3==1.4.8 cython==0.29.0
|
||||
# - cd ray/python; git checkout master; git pull; pip install -e . --verbose
|
||||
- pip install https://s3-us-west-2.amazonaws.com/ray-wheels/releases/<<<RAY_VERSION>>>/<<<RAY_COMMIT>>>/ray-<<<RAY_VERSION>>>-cp36-cp36m-manylinux1_x86_64.whl
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands: []
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
||||
# Command to start ray on the head node. You don't need to change this.
|
||||
head_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --head --num-redis-shards=5 --redis-port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml
|
||||
|
||||
# Command to start ray on worker nodes. You don't need to change this.
|
||||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --num-gpus=100
|
|
@ -24,9 +24,10 @@ This document describes the process for creating new releases.
|
|||
|
||||
For a new micro release (e.g., 0.7.1): No action is required.
|
||||
|
||||
4. **Testing:** Before releasing, the following sets of tests should be run. The results
|
||||
of each of these tests for previous releases are checked in under ``doc/dev/release_tests``,
|
||||
and should be compared against to identify any regressions.
|
||||
4. **Testing:** Before releasing, the following sets of tests should be run.
|
||||
The results of each of these tests for previous releases are checked in
|
||||
under ``doc/dev/release_tests``, and should be compared against to identify
|
||||
any regressions.
|
||||
|
||||
1. Long-running tests
|
||||
|
||||
|
@ -38,16 +39,16 @@ This document describes the process for creating new releases.
|
|||
These tests should run for at least 24 hours (printing new iterations and CPU load
|
||||
stable in the AWS console).
|
||||
|
||||
The last hundred lines or so printed by each test should be checked in under
|
||||
``doc/dev/release_logs/<version>``.
|
||||
2. Multi-node regression tests
|
||||
|
||||
2. Stress tests
|
||||
Follow the same instructions as the long-running stress tests. The large-scale distributed
|
||||
regression tests identify potential performance regressions in a distributed environment.
|
||||
The following tests should be run:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ray/ci/stress_tests/run_stress_tests.sh <release-version> <release-commit>
|
||||
ray/ci/stress_tests/run_application_stress_tests.sh <release-version> <release-commit>
|
||||
rllib train -f rllib/tuned_examples/compact-regression-test.yaml
|
||||
- ``ci/regression_test/rllib_regression-tests`` run the compact regression test for rllib.
|
||||
- ``ci/regression_test/rllib_stress_tests`` run multinode 8hr IMPALA trial.
|
||||
- ``ci/regression_test/stress_tests`` contains two tests: ``many_tasks`` and ``dead_actors``.
|
||||
Each of the test runs on 105 spot instances.
|
||||
|
||||
Make sure that these pass. For the RLlib regression tests, see the comment on the
|
||||
file for the pass criteria. For the rest, it will be obvious if they passed.
|
||||
|
@ -59,12 +60,9 @@ This document describes the process for creating new releases.
|
|||
|
||||
3. Microbenchmarks
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ray microbenchmark
|
||||
|
||||
Run `ray microbenchmark` on an `m4.16xl` instance running `Ubuntu 18.04` with `Python 3` to get the
|
||||
latest microbenchmark numbers.
|
||||
Run the ``ci/microbenchmark`` with the commit. Under the hood, the session will
|
||||
run `ray microbenchmark` on an `m4.16xl` instance running `Ubuntu 18.04` with `Python 3`
|
||||
to get the latest microbenchmark numbers.
|
||||
|
||||
The results should be checked in under ``doc/dev/release_logs/<version>``.
|
||||
|
||||
|
@ -82,10 +80,11 @@ This document describes the process for creating new releases.
|
|||
changes/updates/bugfixes and their PR numbers. Once you have a draft, send it
|
||||
out to other Ray developers (especially those who contributed heavily during
|
||||
this release) for feedback. At the end of the release note, you should also
|
||||
add a list of contributors.
|
||||
add a list of contributors. Make sure Ray, Tune, RLlib, Autoscaler are
|
||||
capitalized correctly.
|
||||
|
||||
Run ``doc/dev/get_contributors.py`` to generate the list of commits corresponding
|
||||
to this release and the formatted list of contributors.
|
||||
to this release and the formatted list of contributors.
|
||||
You will need to provide a GitHub personal access token
|
||||
(github.com -> settings -> developer settings -> personal access tokens).
|
||||
|
||||
|
@ -107,14 +106,16 @@ This document describes the process for creating new releases.
|
|||
|
||||
export RAY_HASH=... # e.g., 618147f57fb40368448da3b2fb4fd213828fa12b
|
||||
export RAY_VERSION=... # e.g., 0.7.0
|
||||
pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp27-cp27mu-manylinux1_x86_64.whl
|
||||
|
||||
# Linux Wheels
|
||||
pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp35-cp35m-manylinux1_x86_64.whl
|
||||
pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp36-cp36m-manylinux1_x86_64.whl
|
||||
pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp37-cp37m-manylinux1_x86_64.whl
|
||||
pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp27-cp27m-macosx_10_6_intel.whl
|
||||
pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp35-cp35m-macosx_10_6_intel.whl
|
||||
pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp36-cp36m-macosx_10_6_intel.whl
|
||||
pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp37-cp37m-macosx_10_6_intel.whl
|
||||
|
||||
# Mac Wheels
|
||||
pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp35-cp35m-macosx_10_13_intel.whl
|
||||
pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp36-cp36m-macosx_10_13_intel.whl
|
||||
pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp37-cp37m-macosx_10_13_intel.whl
|
||||
|
||||
8. **Upload to PyPI Test:** Upload the wheels to the PyPI test site using
|
||||
``twine``.
|
||||
|
@ -164,9 +165,14 @@ This document describes the process for creating new releases.
|
|||
|
||||
pip install -U ray
|
||||
|
||||
10. **Improve the release process:** Find some way to improve the release
|
||||
10. **Create a point release on readthedocs page:** In the `read the docs project page`_,
|
||||
mark the release branch as "active" so there is a point release for the documentation.
|
||||
Ask @richardliaw to add you if you don't have access.
|
||||
|
||||
11. **Improve the release process:** Find some way to improve the release
|
||||
process so that whoever manages the release next will have an easier time.
|
||||
|
||||
.. _`sample PR for bumping a minor release version`: https://github.com/ray-project/ray/pull/6303
|
||||
.. _`sample commit for bumping the release branch version`: https://github.com/ray-project/ray/commit/a39325d818339970e51677708d5596f4b8f790ce
|
||||
.. _`GitHub release`: https://github.com/ray-project/ray/releases
|
||||
.. _`read the docs project page`: https://readthedocs.org/projects/ray/
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp35-cp35m-manylinux1_x86_64.whl
|
||||
wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp36-cp36m-manylinux1_x86_64.whl
|
||||
wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp37-cp37m-manylinux1_x86_64.whl
|
||||
wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp35-cp35m-macosx_10_6_intel.whl
|
||||
wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp36-cp36m-macosx_10_6_intel.whl
|
||||
wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp37-cp37m-macosx_10_6_intel.whl
|
||||
wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp35-cp35m-macosx_10_13_intel.whl
|
||||
wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp36-cp36m-macosx_10_13_intel.whl
|
||||
wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp37-cp37m-macosx_10_13_intel.whl
|
||||
|
|
18
doc/dev/release_logs/0.8.2/microbenchmark.txt
Normal file
18
doc/dev/release_logs/0.8.2/microbenchmark.txt
Normal file
|
@ -0,0 +1,18 @@
|
|||
# NOTE: Make sure to run this with OMP_NUM_THREADS=64, otherwise the put gigabytes per
|
||||
# seconds will be reduced. Put latency was reduced due to extra ipc call to raylet
|
||||
# for ref counting.
|
||||
|
||||
single client get calls per second 11743.14 +- 2062.85
|
||||
single client put calls per second 3133.08 +- 89.81
|
||||
single client put gigabytes per second 10.33 +- 7.96
|
||||
multi client put calls per second 3590.16 +- 22.04
|
||||
multi client put gigabytes per second 23.38 +- 0.63
|
||||
single client tasks sync per second 1263.59 +- 63.16
|
||||
single client tasks async per second 13959.14 +- 393.16
|
||||
multi client tasks async per second 42285.81 +- 238.55
|
||||
1:1 actor calls sync per second 2159.21 +- 112.97
|
||||
1:1 actor calls async per second 7048.53 +- 63.8
|
||||
1:1 actor calls concurrent per second 6167.01 +- 75.67
|
||||
1:n actor calls async per second 12241.67 +- 62.13
|
||||
n:n actor calls async per second 41766.33 +- 672.14
|
||||
n:n actor calls with arg async per second 13134.22 +- 71.68
|
36
doc/dev/release_logs/0.8.2/rllib_regression.txt
Normal file
36
doc/dev/release_logs/0.8.2/rllib_regression.txt
Normal file
|
@ -0,0 +1,36 @@
|
|||
== Status ==
|
||||
Memory usage on this node: 43.4/480.3 GiB
|
||||
Using FIFO scheduling algorithm.
|
||||
Resources requested: 0/64 CPUs, 0.0/8 GPUs, 0.0/440.23 GiB heap, 0.0/12.84 GiB objects
|
||||
Result logdir: /home/ubuntu/ray_results/apex
|
||||
Result logdir: /home/ubuntu/ray_results/atari-a2c
|
||||
Result logdir: /home/ubuntu/ray_results/atari-basic-dqn
|
||||
Result logdir: /home/ubuntu/ray_results/atari-impala
|
||||
Result logdir: /home/ubuntu/ray_results/atari-ppo-tf
|
||||
Result logdir: /home/ubuntu/ray_results/atari-ppo-torch
|
||||
Number of trials: 24 (24 TERMINATED)
|
||||
Table truncated to 20 rows. 4 trials (4 TERMINATED) not shown.
|
||||
+--------------------------------------+------------+-------+----------+------------------+---------+--------+
|
||||
| Trial name | status | loc | reward | total time (s) | ts | iter |
|
||||
|--------------------------------------+------------+-------+----------+------------------+---------+--------|
|
||||
| A2C_BreakoutNoFrameskip-v4_c8ad5a48 | TERMINATED | | 139.19 | 3606.77 | 3686000 | 352 |
|
||||
| A2C_BreakoutNoFrameskip-v4_c8ad1c54 | TERMINATED | | 75.56 | 3601.57 | 2932000 | 349 |
|
||||
| A2C_BreakoutNoFrameskip-v4_c8acd28a | TERMINATED | | 131.97 | 3603.39 | 2928000 | 349 |
|
||||
| A2C_BreakoutNoFrameskip-v4_c8ac8d16 | TERMINATED | | 105.42 | 3601.03 | 2901500 | 349 |
|
||||
| DQN_BreakoutNoFrameskip-v4_c8af8a02 | TERMINATED | | 15.81 | 3665.65 | 270000 | 27 |
|
||||
| DQN_BreakoutNoFrameskip-v4_c8af079e | TERMINATED | | 11.32 | 3612.1 | 270000 | 27 |
|
||||
| APEX_BreakoutNoFrameskip-v4_c8ac4694 | TERMINATED | | 50.56 | 3627.89 | 5786880 | 115 |
|
||||
| DQN_BreakoutNoFrameskip-v4_c8ae61ae | TERMINATED | | 7.14 | 3620.61 | 270000 | 27 |
|
||||
| DQN_BreakoutNoFrameskip-v4_c8adbcea | TERMINATED | | 11.24 | 3640.35 | 270000 | 27 |
|
||||
| APEX_BreakoutNoFrameskip-v4_c8abef3c | TERMINATED | | 94.5 | 3625.19 | 5820800 | 115 |
|
||||
| PPO_BreakoutNoFrameskip-v4_c8ab0572 | TERMINATED | | 25.26 | 3603.23 | 1335000 | 267 |
|
||||
| PPO_BreakoutNoFrameskip-v4_c8aabf36 | TERMINATED | | 18.2 | 3603.36 | 1300000 | 260 |
|
||||
| APEX_BreakoutNoFrameskip-v4_c8abaa86 | TERMINATED | | 90.98 | 3627.03 | 7350400 | 116 |
|
||||
| PPO_BreakoutNoFrameskip-v4_c8aa6f5e | TERMINATED | | 17.01 | 3611.01 | 1555000 | 311 |
|
||||
| PPO_BreakoutNoFrameskip-v4_c8aa27e2 | TERMINATED | | 22.41 | 3609.64 | 1545000 | 309 |
|
||||
| PPO_BreakoutNoFrameskip-v4_c8a9e39a | TERMINATED | | 61.25 | 3602.17 | 4475000 | 895 |
|
||||
| PPO_BreakoutNoFrameskip-v4_c8a97978 | TERMINATED | | 28.19 | 3601.33 | 4415000 | 883 |
|
||||
| PPO_BreakoutNoFrameskip-v4_c8a904ca | TERMINATED | | 41.3 | 3600.42 | 4515000 | 903 |
|
||||
| APEX_BreakoutNoFrameskip-v4_c8ab5108 | TERMINATED | | 62.46 | 3626.37 | 5091840 | 114 |
|
||||
| PPO_BreakoutNoFrameskip-v4_c8a88004 | TERMINATED | | 60.44 | 3602.52 | 3380000 | 676 |
|
||||
+--------------------------------------+------------+-------+----------+------------------+---------+--------+
|
|
@ -0,0 +1,14 @@
|
|||
== Status ==
|
||||
Memory usage on this node: 34.6/480.3 GiB
|
||||
Using FIFO scheduling algorithm.
|
||||
Resources requested: 0/640 CPUs, 0/8 GPUs, 0.0/2541.21 GiB heap, 0.0/128.42 GiB objects
|
||||
Result logdir: /home/ubuntu/ray_results/atari-impala
|
||||
Number of trials: 4 (4 TERMINATED)
|
||||
+---------------------------------------------+------------+-------+-----------------------------+----------+------------------+----------+--------+
|
||||
| Trial name | status | loc | env | reward | total time (s) | ts | iter |
|
||||
|---------------------------------------------+------------+-------+-----------------------------+----------+------------------+----------+--------|
|
||||
| IMPALA_BreakoutNoFrameskip-v4_2565545c | TERMINATED | | BreakoutNoFrameskip-v4 | 451.07 | 22555.3 | 30039500 | 381 |
|
||||
| IMPALA_BeamRiderNoFrameskip-v4_2565e804 | TERMINATED | | BeamRiderNoFrameskip-v4 | 3124.8 | 24121.2 | 30057000 | 408 |
|
||||
| IMPALA_QbertNoFrameskip-v4_256671de | TERMINATED | | QbertNoFrameskip-v4 | 8388.25 | 25163.5 | 30080000 | 453 |
|
||||
| IMPALA_SpaceInvadersNoFrameskip-v4_256725ac | TERMINATED | | SpaceInvadersNoFrameskip-v4 | 780.65 | 23148.1 | 30026500 | 384 |
|
||||
+---------------------------------------------+------------+-------+-----------------------------+----------+------------------+----------+--------+
|
|
@ -0,0 +1,4 @@
|
|||
Finished in: 98.49777579307556s
|
||||
Average iteration time: 0.9849753308296204s
|
||||
Max iteration time: 2.9459526538848877s
|
||||
Min iteration time: 0.08075928688049316s
|
15
doc/dev/release_logs/0.8.2/stress_tests/test_many_tasks.txt
Normal file
15
doc/dev/release_logs/0.8.2/stress_tests/test_many_tasks.txt
Normal file
|
@ -0,0 +1,15 @@
|
|||
Stage 0 results:
|
||||
Total time: 22.579216480255127
|
||||
Stage 1 results:
|
||||
Total time: 154.41431832313538
|
||||
Average iteration time: 15.441423058509827
|
||||
Max iteration time: 15.943994760513306
|
||||
Min iteration time: 15.029884099960327
|
||||
Stage 2 results:
|
||||
Total time: 646.7662391662598
|
||||
Average iteration time: 129.35279755592347
|
||||
Max iteration time: 134.80017256736755
|
||||
Min iteration time: 121.44297170639038
|
||||
Stage 3 results:
|
||||
Actor creation time: 0.0635519027709961
|
||||
Total time: 3464.0461547374725
|
Loading…
Add table
Reference in a new issue