[RLlib] Add an RLlib Tune experiment to UserTest suite. (#19807)

* Add an RLlib Tune experiment to UserTest suite.

* Add ray.init()

* Move example script to examples/tune/, so it can be imported as a module.

* add __init__.py so our new module will get included in the Python wheel.

* Add block device to RLlib test instances.

* Reduce disk size a little bit.

* Add metrics reporting

* Allow max of 5 workers to accommodate all the worker tasks.

* revert disk size change.

* Minor updates

* Trigger build

* set max num workers

* Add a compute cfg for autoscaled cpu and gpu nodes.

* use 1gpu instance.

* install tblib for debugging worker crashes.

* Manually upgrade to pytorch 1.9.0

* -y

* torch=1.9.0

* install torch on driver

* bump timeout

* Write a more informative result dict.

* Revert changes to compute config files that are not used.

* add smoke test

* update

* reduce timeout

* Reduce the # of env per worker to 1.

* Small fix for getting trial_states

* Trigger build

* simplify result dict

* lint

* more lint

* fix smoke test

Co-authored-by: Amog Kamsetty <amogkamsetty@yahoo.com>
gjoliver 2021-11-03 17:04:27 -07:00 committed by GitHub
parent 91c730efd0
commit 2c1fa459d4
13 changed files with 198 additions and 3 deletions

@@ -3,7 +3,6 @@ ipython
# Needed for Ray Client error message serialization/deserialization.
tblib
# In TF >v2, GPU support is included in the base package.
tensorflow==2.5.0
tensorflow-probability==0.13.0
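
For context on the tblib entry above: Ray Client ships worker exception tracebacks between processes, and raw traceback objects are not picklable without help. A minimal sketch of what tblib enables (standalone illustration, not part of this diff):

import pickle
import sys

from tblib import pickling_support

# Patch pickle so TracebackType instances can be (de)serialized.
pickling_support.install()

try:
    1 / 0
except ZeroDivisionError:
    exc_info = sys.exc_info()
    payload = pickle.dumps(exc_info)  # fails without pickling_support.install()

# On the receiving side, the traceback can be restored and re-raised.
exc_type, exc_value, tb = pickle.loads(payload)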

@@ -269,6 +269,12 @@ HOROVOD_SETUP_COMMANDS = [
# 3. Use GPUs if applicable
# 4. Have the `use_connect` flag set.
USER_TESTS = {
    "~/ray/release/rllib_tests/rllib_tests.yaml": [
        ConnectTest(
            "connect_tests",
            requirements_file="release/rllib_tests"
            "/connect_driver_requirements.txt")
    ],
    "~/ray/release/ray_lightning_tests/ray_lightning_tests.yaml": [
        ConnectTest(
            "ray_lightning_user_test_latest",

@@ -14,3 +14,8 @@ worker_node_types:
  max_workers: 0
  use_spot: false
  aws:
    BlockDeviceMappings:
      - DeviceName: /dev/sda1
        Ebs:
          VolumeSize: 500

@@ -14,3 +14,8 @@ worker_node_types:
  max_workers: 0
  use_spot: false
  aws:
    BlockDeviceMappings:
      - DeviceName: /dev/sda1
        Ebs:
          VolumeSize: 500

@@ -14,3 +14,8 @@ worker_node_types:
  max_workers: 0
  use_spot: false
  aws:
    BlockDeviceMappings:
      - DeviceName: /dev/sda1
        Ebs:
          VolumeSize: 500

@@ -7,7 +7,8 @@ debian_packages:
python:
  # These dependencies should be handled by requirements_rllib.txt and
  # requirements_ml_docker.txt
  pip_packages: []
  pip_packages:
    - torch==1.9.0  # TODO(amogkam): Remove after nightly images are available.
  conda_packages: []

post_build_cmds:
@@ -16,7 +17,7 @@ post_build_cmds:
  - sudo ln -s /usr/local/cuda /usr/local/nvidia
  - sudo ln -s /usr/local/cuda/lib64/libcusolver.so.11 /usr/local/cuda/lib64/libcusolver.so.10
  - pip install tensorflow==2.5.0
  # END: TO-DO
  # END: TODO
  - pip uninstall -y ray || true
  - pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
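
Aside: the RAY_WHEELS line above uses Jinja-style templating. A minimal sketch of how the default filter resolves (jinja2 assumed; wheel URL illustrative):

from jinja2 import Template

# Renders the env value when RAY_WHEELS is set, else falls back to "ray".
cmd = Template('pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}')
print(cmd.render(env={}))  # -> pip3 install -U ray
print(cmd.render(env={"RAY_WHEELS": "https://example.com/ray-nightly.whl"}))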

@@ -0,0 +1,26 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 10

head_node_type:
  name: head_node
  instance_type: m5.xlarge

worker_node_types:
- name: worker_node_cpu
  instance_type: m5.xlarge
  min_workers: 0
  max_workers: 10
  use_spot: false
- name: worker_node_gpu
  instance_type: g3.4xlarge
  min_workers: 0
  max_workers: 10
  use_spot: false
  aws:
    BlockDeviceMappings:
      - DeviceName: /dev/sda1
        Ebs:
          VolumeSize: 500
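
Both worker groups start at min_workers: 0, so nodes are launched only when a workload requests their resources. A minimal sketch of a task that would make the autoscaler bring up a GPU node (illustrative; assumes a cluster launched from this config):

import ray

ray.init(address="auto")

@ray.remote(num_gpus=1)
def gpu_task():
    # The num_gpus=1 requirement is what prompts the autoscaler to
    # launch a g3.4xlarge worker from the worker_node_gpu group.
    return "ran on a GPU node"

print(ray.get(gpu_task.remote()))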

@@ -0,0 +1,8 @@
# Make sure the driver versions are the same as the cluster versions.
# The cluster uses the ray-ml Docker image.
# The ray-ml Docker image installs dependencies from the ray/python/requirements/ml/ directory.
# We constrain against that requirements file so that the same versions are installed.
-c ../../python/requirements/ml/requirements_dl.txt
tensorflow
torch

@@ -0,0 +1,38 @@
"""Connect tests for Tune & RLlib.

Runs a couple of hard learning tests using Anyscale connect.
"""
import json
import os
import time

import ray

from ray.rllib.examples.tune.framework import run

if __name__ == "__main__":
    addr = os.environ.get("RAY_ADDRESS")
    job_name = os.environ.get("RAY_JOB_NAME", "rllib_connect_tests")
    if addr is not None and addr.startswith("anyscale://"):
        ray.init(address=addr, job_name=job_name)
    else:
        ray.init(address="auto")

    start_time = time.time()
    exp_analysis = run()
    end_time = time.time()

    result = {
        "time_taken": end_time - start_time,
        "trial_states": {
            t.config["framework"]: t.status
            for t in exp_analysis.trials
        },
    }

    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
                                      "/tmp/release_test_out.json")
    with open(test_output_json, "wt") as f:
        json.dump(result, f)

    print("Ok.")

@@ -68,3 +68,14 @@
  smoke_test:
    run:
      timeout: 2000

# Tests that exercise auto-scaling and Anyscale connect.
- name: connect_tests
  cluster:
    app_config: app_config.yaml
    compute_template: auto_scale.yaml

  run:
    use_connect: True
    timeout: 3000
    script: python connect_tests/run_connect_tests.py

@@ -2533,6 +2533,15 @@ py_test(
    args = ["--as-test", "--framework=torch", "--stop-reward=100.0"]
)

py_test(
    name = "examples/tune/framework",
    main = "examples/tune/framework.py",
    tags = ["team:ml", "examples", "examples_F"],
    size = "medium",
    srcs = ["examples/tune/framework.py"],
    args = ["--smoke-test"]
)

py_test(
    name = "examples/two_trainer_workflow_tf",
    main = "examples/two_trainer_workflow.py",

(new empty file: rllib/examples/tune/__init__.py, added so the example can be imported as a module)

@@ -0,0 +1,82 @@
#!/usr/bin/env python3
"""Benchmarking TF against PyTorch on an example task using Ray Tune.
"""
import logging
from pprint import pformat

import ray
from ray import tune
from ray.tune import CLIReporter

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger("tune_framework")


def run(smoke_test=False):
    stop = {"training_iteration": 1 if smoke_test else 50}
    num_workers = 1 if smoke_test else 50
    num_gpus = 0 if smoke_test else 1

    config = {
        "env": "PongNoFrameskip-v4",
        "framework": tune.grid_search(["tf", "torch"]),
        "num_gpus": num_gpus,
        "rollout_fragment_length": 50,
        "train_batch_size": 750,
        "num_workers": num_workers,
        "num_envs_per_worker": 1,
        "clip_rewards": True,
        "num_sgd_iter": 2,
        "vf_loss_coeff": 1.0,
        "clip_param": 0.3,
        "grad_clip": 10,
        "vtrace": True,
        "use_kl_loss": False,
    }
    logger.info("Configuration: \n %s", pformat(config))

    # Run the experiment.
    # TODO(jungong) : maybe add checkpointing.
    return tune.run(
        "APPO",
        config=config,
        stop=stop,
        verbose=1,
        num_samples=1,
        progress_reporter=CLIReporter(
            metric_columns={
                "training_iteration": "iter",
                "time_total_s": "time_total_s",
                "timesteps_total": "ts",
                "snapshots": "snapshots",
                "episodes_this_iter": "train_episodes",
                "episode_reward_mean": "reward_mean",
            },
            sort_by_metric=True,
            max_report_frequency=30,
        ))


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Tune+RLlib Example",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Finish quickly for testing.")
    args = parser.parse_args()

    if args.smoke_test:
        ray.init(num_cpus=2)
    else:
        ray.init()

    run(smoke_test=args.smoke_test)

    ray.shutdown()
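
For a quick local check, the smoke-test path can also be driven directly from Python, which is equivalent to running python framework.py --smoke-test:

import ray
from ray.rllib.examples.tune.framework import run

ray.init(num_cpus=2)
analysis = run(smoke_test=True)  # 1 training iteration, 1 worker, no GPU
ray.shutdown()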