Mirror of https://github.com/vale981/ray (synced 2025-03-05 18:11:42 -05:00)
[RLlib] Add an RLlib Tune experiment to UserTest suite. (#19807)
* Add an RLlib Tune experiment to the UserTest suite.
* Add ray.init().
* Move the example script to examples/tune/ so it can be imported as a module.
* Add __init__.py so the new module gets included in the Python wheel.
* Add a block device to RLlib test instances.
* Reduce disk size a little bit; later revert the disk size change.
* Add metrics reporting.
* Allow a max of 5 workers to accommodate all the worker tasks; set max num workers.
* Minor updates; trigger build.
* Add a compute config for autoscaled CPU and GPU nodes.
* Use a 1-GPU instance.
* Install tblib for debugging worker crashes.
* Manually upgrade to PyTorch 1.9.0 (torch==1.9.0); install torch on the driver.
* Bump timeout; later reduce timeout.
* Write a more informative result dict; simplify the result dict.
* Revert changes to compute config files that are not used.
* Add smoke test; fix smoke test.
* Reduce the number of envs per worker to 1.
* Small fix for getting trial_states.
* Lint.

Co-authored-by: Amog Kamsetty <amogkamsetty@yahoo.com>
This commit is contained in: parent 91c730efd0, commit 2c1fa459d4.
13 changed files with 198 additions and 3 deletions.
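For orientation before the diff itself (this is not part of the commit): the new benchmarking example added here can be exercised locally roughly as follows. The module path, the run() entry point, and the smoke-test behavior come from the files shown further down; the invocation below is only a sketch, equivalent to running `python rllib/examples/tune/framework.py --smoke-test`.

# Sketch: run the new Tune/RLlib framework benchmark in smoke-test mode.
import ray
from ray.rllib.examples.tune.framework import run

ray.init(num_cpus=2)  # small local cluster, as the smoke test does
analysis = run(smoke_test=True)
# One APPO trial per framework ("tf" and "torch"), keyed by trial status.
print({t.config["framework"]: t.status for t in analysis.trials})
ray.shutdown()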
@@ -3,7 +3,6 @@ ipython
 # Needed for Ray Client error message serialization/deserialization.
 tblib
-
 # In TF >v2, GPU support is included in the base package.
 tensorflow==2.5.0
 tensorflow-probability==0.13.0
@@ -269,6 +269,12 @@ HOROVOD_SETUP_COMMANDS = [
 # 3. Use GPUs if applicable
 # 4. Have the `use_connect` flag set.
 USER_TESTS = {
+    "~/ray/release/rllib_tests/rllib_tests.yaml": [
+        ConnectTest(
+            "connect_tests",
+            requirements_file="release/rllib_tests"
+            "/connect_driver_requirements.txt")
+    ],
     "~/ray/release/ray_lightning_tests/ray_lightning_tests.yaml": [
         ConnectTest(
             "ray_lightning_user_test_latest",
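For context, a mapping like USER_TESTS pairs a release-test suite YAML with connect tests whose driver dependencies come from a requirements file. The snippet below is a minimal, self-contained sketch of that pattern only; ConnectTestSketch and the loop are hypothetical stand-ins, not the actual release test runner.

# Hypothetical sketch of the USER_TESTS pattern: install pinned driver deps,
# then hand the test over to whatever launches it.
import subprocess
import sys
from dataclasses import dataclass

@dataclass
class ConnectTestSketch:
    name: str
    requirements_file: str = None

USER_TESTS_SKETCH = {
    "~/ray/release/rllib_tests/rllib_tests.yaml": [
        ConnectTestSketch(
            "connect_tests",
            requirements_file="release/rllib_tests/connect_driver_requirements.txt"),
    ],
}

for suite_yaml, tests in USER_TESTS_SKETCH.items():
    for test in tests:
        if test.requirements_file:
            # Install the pinned driver dependencies before the driver script runs.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "-r", test.requirements_file])
        print(f"would launch {test.name} from {suite_yaml}")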
The following hunk is applied identically to three RLlib compute-config files (adding a 500 GB block device to the test instances):

@@ -14,3 +14,8 @@ worker_node_types:
     max_workers: 0
     use_spot: false
+
+aws:
+  BlockDeviceMappings:
+    - DeviceName: /dev/sda1
+      Ebs:
+        VolumeSize: 500
@@ -7,7 +7,8 @@ debian_packages:
 python:
   # These dependencies should be handled by requirements_rllib.txt and
   # requirements_ml_docker.txt
-  pip_packages: []
+  pip_packages:
+    - torch==1.9.0  # TODO(amogkam): Remove after nightly images are available.
   conda_packages: []

 post_build_cmds:
@@ -16,7 +17,7 @@ post_build_cmds:
   - sudo ln -s /usr/local/cuda /usr/local/nvidia
   - sudo ln -s /usr/local/cuda/lib64/libcusolver.so.11 /usr/local/cuda/lib64/libcusolver.so.10
   - pip install tensorflow==2.5.0
-  # END: TO-DO
+  # END: TODO

   - pip uninstall -y ray || true
   - pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
release/rllib_tests/auto_scale.yaml (new file, 26 lines)

cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 10

head_node_type:
    name: head_node
    instance_type: m5.xlarge

worker_node_types:
  - name: worker_node_cpu
    instance_type: m5.xlarge
    min_workers: 0
    max_workers: 10
    use_spot: false
  - name: worker_node_gpu
    instance_type: g3.4xlarge
    min_workers: 0
    max_workers: 10
    use_spot: false

aws:
  BlockDeviceMappings:
    - DeviceName: /dev/sda1
      Ebs:
        VolumeSize: 500
release/rllib_tests/connect_driver_requirements.txt (new file, 8 lines)

# Make sure the driver versions are the same as cluster versions.
# The cluster uses ray-ml Docker image.
# ray-ml Docker image installs dependencies from ray/python/requirements/ml/ directory.
# We constrain on these requirements file so that the same versions are installed.
-c ../../python/requirements/ml/requirements_dl.txt

tensorflow
torch
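Usage note (an assumption about how this file is meant to be consumed, not something stated in the diff): pip treats the `-c` line inside a requirements file as a constraints include, so installing from this file on the driver machine, e.g. `pip install -r release/rllib_tests/connect_driver_requirements.txt`, pins tensorflow and torch to the versions listed in requirements_dl.txt, keeping the driver in sync with the ray-ml cluster image.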
release/rllib_tests/connect_tests/run_connect_tests.py (new file, 38 lines)

"""Connect tests for Tune & RLlib.

Runs a couple of hard learning tests using Anyscale connect.
"""

import json
import os
import time

import ray
from ray.rllib.examples.tune.framework import run

if __name__ == "__main__":
    addr = os.environ.get("RAY_ADDRESS")
    job_name = os.environ.get("RAY_JOB_NAME", "rllib_connect_tests")
    if addr is not None and addr.startswith("anyscale://"):
        ray.init(address=addr, job_name=job_name)
    else:
        ray.init(address="auto")

    start_time = time.time()
    exp_analysis = run()
    end_time = time.time()

    result = {
        "time_taken": end_time - start_time,
        "trial_states": {
            t.config["framework"]: t.status
            for t in exp_analysis.trials
        },
    }

    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
                                      "/tmp/release_test_out.json")
    with open(test_output_json, "wt") as f:
        json.dump(result, f)

    print("Ok.")
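A brief usage note that follows from the script above: the target cluster is chosen from RAY_ADDRESS (with a fallback to an already-running local cluster via address="auto"), and the result dict is written to TEST_OUTPUT_JSON, defaulting to /tmp/release_test_out.json. The snippet below is only an illustrative sketch for sanity-checking a run, not part of the commit.

# Illustrative only: inspect the result file written by run_connect_tests.py.
import json

with open("/tmp/release_test_out.json") as f:
    result = json.load(f)

# Expect one entry per framework in the grid search ("tf" and "torch").
print(result["time_taken"], result["trial_states"])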
@@ -68,3 +68,14 @@
   smoke_test:
     run:
       timeout: 2000
+
+# Tests that exercise auto-scaling and Anyscale connect.
+- name: connect_tests
+  cluster:
+    app_config: app_config.yaml
+    compute_template: auto_scale.yaml
+
+  run:
+    use_connect: True
+    timeout: 3000
+    script: python connect_tests/run_connect_tests.py
@@ -2533,6 +2533,15 @@ py_test(
     args = ["--as-test", "--framework=torch", "--stop-reward=100.0"]
 )
+
+py_test(
+    name = "examples/tune/framework",
+    main = "examples/tune/framework.py",
+    tags = ["team:ml", "examples", "examples_F"],
+    size = "medium",
+    srcs = ["examples/tune/framework.py"],
+    args = ["--smoke-test"]
+)

 py_test(
     name = "examples/two_trainer_workflow_tf",
     main = "examples/two_trainer_workflow.py",
rllib/examples/tune/__init__.py (new, empty file)
rllib/examples/tune/framework.py (new file, 82 lines)

#!/usr/bin/env python3
""" Benchmarking TF against PyTorch on an example task using Ray Tune.
"""

import logging
from pprint import pformat

import ray
from ray import tune
from ray.tune import CLIReporter

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger("tune_framework")


def run(smoke_test=False):
    stop = {"training_iteration": 1 if smoke_test else 50}
    num_workers = 1 if smoke_test else 50
    num_gpus = 0 if smoke_test else 1

    config = {
        "env": "PongNoFrameskip-v4",
        "framework": tune.grid_search(["tf", "torch"]),
        "num_gpus": num_gpus,
        "rollout_fragment_length": 50,
        "train_batch_size": 750,
        "num_workers": num_workers,
        "num_envs_per_worker": 1,
        "clip_rewards": True,
        "num_sgd_iter": 2,
        "vf_loss_coeff": 1.0,
        "clip_param": 0.3,
        "grad_clip": 10,
        "vtrace": True,
        "use_kl_loss": False,
    }
    logger.info("Configuration: \n %s", pformat(config))

    # Run the experiment.
    # TODO(jungong) : maybe add checkpointing.
    return tune.run(
        "APPO",
        config=config,
        stop=stop,
        verbose=1,
        num_samples=1,
        progress_reporter=CLIReporter(
            metric_columns={
                "training_iteration": "iter",
                "time_total_s": "time_total_s",
                "timesteps_total": "ts",
                "snapshots": "snapshots",
                "episodes_this_iter": "train_episodes",
                "episode_reward_mean": "reward_mean",
            },
            sort_by_metric=True,
            max_report_frequency=30,
        ))


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Tune+RLlib Example",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Finish quickly for testing.")

    args = parser.parse_args()

    if args.smoke_test:
        ray.init(num_cpus=2)
    else:
        ray.init()

    run(smoke_test=args.smoke_test)
    ray.shutdown()
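One design note on the example above: because "framework" is a tune.grid_search over ["tf", "torch"], a single tune.run call launches one APPO trial per framework, which is what lets run_connect_tests.py key its trial_states dict by t.config["framework"]. In Ray Tune versions contemporary with this commit, grid_search is just a marker dict that Tune expands at run time; a tiny illustration (assumption, not part of the commit):

# grid_search over two frameworks yields two trials from one tune.run call.
from ray import tune
print(tune.grid_search(["tf", "torch"]))  # {'grid_search': ['tf', 'torch']}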