Mirror of https://github.com/vale981/ray (synced 2025-03-05 10:01:43 -05:00)
[RLlib] Add an RLlib Tune experiment to UserTest suite. (#19807)
[RLlib] Add an RLlib Tune experiment to UserTest suite.

* Add an RLlib Tune experiment to UserTest suite.
* Add ray.init()
* Move example script to example/tune/, so it can be imported as module.
* add __init__.py so our new module will get included in python wheel.
* Add block device to RLlib test instances.
* Reduce disk size a little bit.
* Add metrics reporting
* Allow max of 5 workers to accommodate all the worker tasks.
* revert disk size change.
* Minor updates
* Trigger build
* set max num workers
* Add a compute cfg for autoscaled cpu and gpu nodes.
* use 1gpu instance.
* install tblib for debugging worker crashes.
* Manually upgrade to pytorch 1.9.0
* -y
* torch=1.9.0
* install torch on driver
* bump timeout
* Write a more informational result dict.
* Revert changes to compute config files that are not used.
* add smoke test
* update
* reduce timeout
* Reduce the # of env per worker to 1.
* Small fix for getting trial_states
* Trigger build
* simplify result dict
* lint
* more lint
* fix smoke test

Co-authored-by: Amog Kamsetty <amogkamsetty@yahoo.com>
Parent: 91c730efd0
Commit: 2c1fa459d4

13 changed files with 198 additions and 3 deletions
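For orientation, the new connect test is a plain Python driver (connect_tests/run_connect_tests.py, added below) that picks up its Ray address and output location from environment variables. A hedged sketch of how it might be invoked end to end; the anyscale:// address, output path, and working directory are illustrative placeholders, not values taken from this commit.

import os
import subprocess

# Placeholder values for illustration only; the real release harness supplies these.
env = dict(os.environ)
env["RAY_ADDRESS"] = "anyscale://example-cloud/example-cluster"  # placeholder
env["RAY_JOB_NAME"] = "rllib_connect_tests"
env["TEST_OUTPUT_JSON"] = "/tmp/release_test_out.json"

# rllib_tests.yaml runs `python connect_tests/run_connect_tests.py`; the working
# directory is an assumption based on the "~/ray/release/rllib_tests" path used
# in USER_TESTS below.
subprocess.run(
    ["python", "connect_tests/run_connect_tests.py"],
    cwd=os.path.expanduser("~/ray/release/rllib_tests"),
    env=env,
    check=True,
)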
@@ -3,7 +3,6 @@ ipython
# Needed for Ray Client error message serialization/deserialization.
tblib


# In TF >v2, GPU support is included in the base package.
tensorflow==2.5.0
tensorflow-probability==0.13.0
@@ -269,6 +269,12 @@ HOROVOD_SETUP_COMMANDS = [
 # 3. Use GPUs if applicable
 # 4. Have the `use_connect` flag set.
 USER_TESTS = {
+    "~/ray/release/rllib_tests/rllib_tests.yaml": [
+        ConnectTest(
+            "connect_tests",
+            requirements_file="release/rllib_tests"
+            "/connect_driver_requirements.txt")
+    ],
     "~/ray/release/ray_lightning_tests/ray_lightning_tests.yaml": [
         ConnectTest(
             "ray_lightning_user_test_latest",
@@ -14,3 +14,8 @@ worker_node_types:
     max_workers: 0
     use_spot: false
 
+aws:
+  BlockDeviceMappings:
+    - DeviceName: /dev/sda1
+      Ebs:
+        VolumeSize: 500
@@ -14,3 +14,8 @@ worker_node_types:
     max_workers: 0
     use_spot: false
 
+aws:
+  BlockDeviceMappings:
+    - DeviceName: /dev/sda1
+      Ebs:
+        VolumeSize: 500
@@ -14,3 +14,8 @@ worker_node_types:
     max_workers: 0
     use_spot: false
 
+aws:
+  BlockDeviceMappings:
+    - DeviceName: /dev/sda1
+      Ebs:
+        VolumeSize: 500
@@ -7,7 +7,8 @@ debian_packages:
 python:
   # These dependencies should be handled by requirements_rllib.txt and
   # requirements_ml_docker.txt
-  pip_packages: []
+  pip_packages:
+    - torch==1.9.0 # TODO(amogkam): Remove after nightly images are available.
   conda_packages: []
 
 post_build_cmds:

@@ -16,7 +17,7 @@ post_build_cmds:
   - sudo ln -s /usr/local/cuda /usr/local/nvidia
   - sudo ln -s /usr/local/cuda/lib64/libcusolver.so.11 /usr/local/cuda/lib64/libcusolver.so.10
   - pip install tensorflow==2.5.0
-  # END: TO-DO
+  # END: TODO
 
   - pip uninstall -y ray || true
   - pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
release/rllib_tests/auto_scale.yaml (new file, 26 lines)
@@ -0,0 +1,26 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 10

head_node_type:
  name: head_node
  instance_type: m5.xlarge

worker_node_types:
  - name: worker_node_cpu
    instance_type: m5.xlarge
    min_workers: 0
    max_workers: 10
    use_spot: false
  - name: worker_node_gpu
    instance_type: g3.4xlarge
    min_workers: 0
    max_workers: 10
    use_spot: false

aws:
  BlockDeviceMappings:
    - DeviceName: /dev/sda1
      Ebs:
        VolumeSize: 500
release/rllib_tests/connect_driver_requirements.txt (new file, 8 lines)
@@ -0,0 +1,8 @@
# Make sure the driver versions are the same as cluster versions.
# The cluster uses ray-ml Docker image.
# ray-ml Docker image installs dependencies from ray/python/requirements/ml/ directory.
# We constrain on these requirements file so that the same versions are installed.
-c ../../python/requirements/ml/requirements_dl.txt

tensorflow
torch
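The comments above explain the intent: the driver must resolve tensorflow and torch to the same versions the ray-ml image on the cluster provides. A minimal sketch, not part of this commit, of checking that assumption at runtime against a running cluster:

import ray
import tensorflow as tf
import torch


@ray.remote
def cluster_versions():
    # Imported inside the task so the versions come from a cluster worker.
    import tensorflow as tf
    import torch
    return tf.__version__, torch.__version__


ray.init(address="auto")
driver_versions = (tf.__version__, torch.__version__)
assert ray.get(cluster_versions.remote()) == driver_versions, \
    "Driver and cluster tensorflow/torch versions differ."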
release/rllib_tests/connect_tests/run_connect_tests.py (new file, 38 lines)
@@ -0,0 +1,38 @@
"""Connect tests for Tune & RLlib.

Runs a couple of hard learning tests using Anyscale connect.
"""

import json
import os
import time

import ray
from ray.rllib.examples.tune.framework import run

if __name__ == "__main__":
    addr = os.environ.get("RAY_ADDRESS")
    job_name = os.environ.get("RAY_JOB_NAME", "rllib_connect_tests")
    if addr is not None and addr.startswith("anyscale://"):
        ray.init(address=addr, job_name=job_name)
    else:
        ray.init(address="auto")

    start_time = time.time()
    exp_analysis = run()
    end_time = time.time()

    result = {
        "time_taken": end_time - start_time,
        "trial_states": {
            t.config["framework"]: t.status
            for t in exp_analysis.trials
        },
    }

    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
                                      "/tmp/release_test_out.json")
    with open(test_output_json, "wt") as f:
        json.dump(result, f)

    print("Ok.")
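The result dict written above is presumably what the release tooling consumes. A hedged illustration of reading it back; the numeric value in the comment is made up, and the status strings are standard Tune trial states.

import json

with open("/tmp/release_test_out.json") as f:
    result = json.load(f)

# Expected shape, e.g.:
# {"time_taken": 2417.3, "trial_states": {"tf": "TERMINATED", "torch": "TERMINATED"}}
print(result["time_taken"])
print(result["trial_states"])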
@@ -68,3 +68,14 @@
   smoke_test:
     run:
       timeout: 2000
+
+# Tests that exercise auto-scaling and Anyscale connect.
+- name: connect_tests
+  cluster:
+    app_config: app_config.yaml
+    compute_template: auto_scale.yaml
+
+  run:
+    use_connect: True
+    timeout: 3000
+    script: python connect_tests/run_connect_tests.py
@@ -2533,6 +2533,15 @@ py_test(
     args = ["--as-test", "--framework=torch", "--stop-reward=100.0"]
 )
 
+py_test(
+    name = "examples/tune/framework",
+    main = "examples/tune/framework.py",
+    tags = ["team:ml", "examples", "examples_F"],
+    size = "medium",
+    srcs = ["examples/tune/framework.py"],
+    args = ["--smoke-test"]
+)
+
 py_test(
     name = "examples/two_trainer_workflow_tf",
     main = "examples/two_trainer_workflow.py",
rllib/examples/tune/__init__.py (new, empty file)
rllib/examples/tune/framework.py (new file, 82 lines)
@@ -0,0 +1,82 @@
#!/usr/bin/env python3
""" Benchmarking TF against PyTorch on an example task using Ray Tune.
"""

import logging
from pprint import pformat

import ray
from ray import tune
from ray.tune import CLIReporter

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger("tune_framework")


def run(smoke_test=False):
    stop = {"training_iteration": 1 if smoke_test else 50}
    num_workers = 1 if smoke_test else 50
    num_gpus = 0 if smoke_test else 1

    config = {
        "env": "PongNoFrameskip-v4",
        "framework": tune.grid_search(["tf", "torch"]),
        "num_gpus": num_gpus,
        "rollout_fragment_length": 50,
        "train_batch_size": 750,
        "num_workers": num_workers,
        "num_envs_per_worker": 1,
        "clip_rewards": True,
        "num_sgd_iter": 2,
        "vf_loss_coeff": 1.0,
        "clip_param": 0.3,
        "grad_clip": 10,
        "vtrace": True,
        "use_kl_loss": False,
    }
    logger.info("Configuration: \n %s", pformat(config))

    # Run the experiment.
    # TODO(jungong) : maybe add checkpointing.
    return tune.run(
        "APPO",
        config=config,
        stop=stop,
        verbose=1,
        num_samples=1,
        progress_reporter=CLIReporter(
            metric_columns={
                "training_iteration": "iter",
                "time_total_s": "time_total_s",
                "timesteps_total": "ts",
                "snapshots": "snapshots",
                "episodes_this_iter": "train_episodes",
                "episode_reward_mean": "reward_mean",
            },
            sort_by_metric=True,
            max_report_frequency=30,
        ))


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Tune+RLlib Example",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Finish quickly for testing.")

    args = parser.parse_args()

    if args.smoke_test:
        ray.init(num_cpus=2)
    else:
        ray.init()

    run(smoke_test=args.smoke_test)
    ray.shutdown()
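Since run() returns the object produced by tune.run(), the example can also be exercised locally. A hedged usage sketch, assuming RLlib's Atari dependencies are installed; it reads the same kind of per-trial information that run_connect_tests.py reports.

import ray
from ray.rllib.examples.tune.framework import run

ray.init(num_cpus=2)
analysis = run(smoke_test=True)  # 1 training iteration, 1 worker, no GPU
for trial in analysis.trials:
    # "framework" is "tf" or "torch", from the grid search in the config.
    print(trial.config["framework"], trial.status,
          trial.last_result.get("episode_reward_mean"))
ray.shutdown()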