diff --git a/python/requirements_ml_docker.txt b/python/requirements_ml_docker.txt index ede4f9b30..70c4413e8 100644 --- a/python/requirements_ml_docker.txt +++ b/python/requirements_ml_docker.txt @@ -3,7 +3,6 @@ ipython # Needed for Ray Client error message serialization/deserialization. tblib - # In TF >v2, GPU support is included in the base package. tensorflow==2.5.0 tensorflow-probability==0.13.0 diff --git a/release/.buildkite/build_pipeline.py b/release/.buildkite/build_pipeline.py index 5d8be2fd0..6b4b3acb4 100644 --- a/release/.buildkite/build_pipeline.py +++ b/release/.buildkite/build_pipeline.py @@ -269,6 +269,12 @@ HOROVOD_SETUP_COMMANDS = [ # 3. Use GPUs if applicable # 4. Have the `use_connect` flag set. USER_TESTS = { + "~/ray/release/rllib_tests/rllib_tests.yaml": [ + ConnectTest( + "connect_tests", + requirements_file="release/rllib_tests" + "/connect_driver_requirements.txt") + ], "~/ray/release/ray_lightning_tests/ray_lightning_tests.yaml": [ ConnectTest( "ray_lightning_user_test_latest", diff --git a/release/rllib_tests/1gpu_4cpus.yaml b/release/rllib_tests/1gpu_4cpus.yaml index 6a5b9b6cb..f9f6ebb92 100644 --- a/release/rllib_tests/1gpu_4cpus.yaml +++ b/release/rllib_tests/1gpu_4cpus.yaml @@ -14,3 +14,8 @@ worker_node_types: max_workers: 0 use_spot: false +aws: + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 500 diff --git a/release/rllib_tests/2gpus_32cpus.yaml b/release/rllib_tests/2gpus_32cpus.yaml index f23bd6a0d..84965d13d 100644 --- a/release/rllib_tests/2gpus_32cpus.yaml +++ b/release/rllib_tests/2gpus_32cpus.yaml @@ -14,3 +14,8 @@ worker_node_types: max_workers: 0 use_spot: false +aws: + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 500 diff --git a/release/rllib_tests/4gpus_64cpus.yaml b/release/rllib_tests/4gpus_64cpus.yaml index 3276552bf..9efae5aa1 100644 --- a/release/rllib_tests/4gpus_64cpus.yaml +++ b/release/rllib_tests/4gpus_64cpus.yaml @@ -14,3 +14,8 @@ worker_node_types: max_workers: 
0 use_spot: false +aws: + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 500 diff --git a/release/rllib_tests/app_config.yaml b/release/rllib_tests/app_config.yaml index 44537d8ca..2aeb12e82 100755 --- a/release/rllib_tests/app_config.yaml +++ b/release/rllib_tests/app_config.yaml @@ -7,7 +7,8 @@ debian_packages: python: # These dependencies should be handled by requirements_rllib.txt and # requirements_ml_docker.txt - pip_packages: [] + pip_packages: + - torch==1.9.0 # TODO(amogkam): Remove after nightly images are available. conda_packages: [] post_build_cmds: @@ -16,7 +17,7 @@ post_build_cmds: - sudo ln -s /usr/local/cuda /usr/local/nvidia - sudo ln -s /usr/local/cuda/lib64/libcusolver.so.11 /usr/local/cuda/lib64/libcusolver.so.10 - pip install tensorflow==2.5.0 - # END: TO-DO + # END: TODO - pip uninstall -y ray || true - pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }} diff --git a/release/rllib_tests/auto_scale.yaml b/release/rllib_tests/auto_scale.yaml new file mode 100644 index 000000000..e68d89c33 --- /dev/null +++ b/release/rllib_tests/auto_scale.yaml @@ -0,0 +1,26 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west-2 + +max_workers: 10 + +head_node_type: + name: head_node + instance_type: m5.xlarge + +worker_node_types: + - name: worker_node_cpu + instance_type: m5.xlarge + min_workers: 0 + max_workers: 10 + use_spot: false + - name: worker_node_gpu + instance_type: g3.4xlarge + min_workers: 0 + max_workers: 10 + use_spot: false + +aws: + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 500 diff --git a/release/rllib_tests/connect_driver_requirements.txt b/release/rllib_tests/connect_driver_requirements.txt new file mode 100644 index 000000000..827da8405 --- /dev/null +++ b/release/rllib_tests/connect_driver_requirements.txt @@ -0,0 +1,8 @@ +# Make sure the driver versions are the same as cluster versions. +# The cluster uses ray-ml Docker image. 
+# ray-ml Docker image installs dependencies from ray/python/requirements/ml/ directory. +# We constrain on this requirements file so that the same versions are installed. +-c ../../python/requirements/ml/requirements_dl.txt + +tensorflow +torch \ No newline at end of file diff --git a/release/rllib_tests/connect_tests/run_connect_tests.py b/release/rllib_tests/connect_tests/run_connect_tests.py new file mode 100644 index 000000000..aaf20deab --- /dev/null +++ b/release/rllib_tests/connect_tests/run_connect_tests.py @@ -0,0 +1,38 @@ +"""Connect tests for Tune & RLlib. + +Runs a couple of hard learning tests using Anyscale connect. +""" + +import json +import os +import time + +import ray
+from ray.rllib.examples.tune.framework import run + +if __name__ == "__main__": + addr = os.environ.get("RAY_ADDRESS") + job_name = os.environ.get("RAY_JOB_NAME", "rllib_connect_tests") + if addr is not None and addr.startswith("anyscale://"): + ray.init(address=addr, job_name=job_name) + else: + ray.init(address="auto") + + start_time = time.time() + exp_analysis = run() + end_time = time.time() + + result = { + "time_taken": end_time - start_time, + "trial_states": { + t.config["framework"]: t.status + for t in exp_analysis.trials + }, + } + + test_output_json = os.environ.get("TEST_OUTPUT_JSON", + "/tmp/release_test_out.json") + with open(test_output_json, "wt") as f: + json.dump(result, f) + + print("Ok.") diff --git a/release/rllib_tests/rllib_tests.yaml b/release/rllib_tests/rllib_tests.yaml index d9da28d28..4423b7f65 100644 --- a/release/rllib_tests/rllib_tests.yaml +++ b/release/rllib_tests/rllib_tests.yaml @@ -68,3 +68,14 @@ smoke_test: run: timeout: 2000 + +# Tests that exercise auto-scaling and Anyscale connect.
+- name: connect_tests + cluster: + app_config: app_config.yaml + compute_template: auto_scale.yaml + + run: + use_connect: True + timeout: 3000 + script: python connect_tests/run_connect_tests.py diff --git a/rllib/BUILD b/rllib/BUILD index 993238fe3..4147b33e8 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2533,6 +2533,15 @@ py_test( args = ["--as-test", "--framework=torch", "--stop-reward=100.0"] ) +py_test( + name = "examples/tune/framework", + main = "examples/tune/framework.py", + tags = ["team:ml", "examples", "examples_F"], + size = "medium", + srcs = ["examples/tune/framework.py"], + args = ["--smoke-test"] +) + py_test( name = "examples/two_trainer_workflow_tf", main = "examples/two_trainer_workflow.py", diff --git a/rllib/examples/tune/__init__.py b/rllib/examples/tune/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rllib/examples/tune/framework.py b/rllib/examples/tune/framework.py new file mode 100644 index 000000000..636859540 --- /dev/null +++ b/rllib/examples/tune/framework.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +""" Benchmarking TF against PyTorch on an example task using Ray Tune. +""" + +import logging +from pprint import pformat + +import ray +from ray import tune +from ray.tune import CLIReporter + +logging.basicConfig(level=logging.WARN) +logger = logging.getLogger("tune_framework") + + +def run(smoke_test=False): + stop = {"training_iteration": 1 if smoke_test else 50} + num_workers = 1 if smoke_test else 50 + num_gpus = 0 if smoke_test else 1 + + config = { + "env": "PongNoFrameskip-v4", + "framework": tune.grid_search(["tf", "torch"]), + "num_gpus": num_gpus, + "rollout_fragment_length": 50, + "train_batch_size": 750, + "num_workers": num_workers, + "num_envs_per_worker": 1, + "clip_rewards": True, + "num_sgd_iter": 2, + "vf_loss_coeff": 1.0, + "clip_param": 0.3, + "grad_clip": 10, + "vtrace": True, + "use_kl_loss": False, + } + logger.info("Configuration: \n %s", pformat(config)) + + # Run the experiment. 
+ # TODO(jungong) : maybe add checkpointing. + return tune.run( + "APPO", + config=config, + stop=stop, + verbose=1, + num_samples=1, + progress_reporter=CLIReporter( + metric_columns={ + "training_iteration": "iter", + "time_total_s": "time_total_s", + "timesteps_total": "ts", + "snapshots": "snapshots", + "episodes_this_iter": "train_episodes", + "episode_reward_mean": "reward_mean", + }, + sort_by_metric=True, + max_report_frequency=30, + )) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser( + description="Tune+RLlib Example", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument( + "--smoke-test", + action="store_true", + default=False, + help="Finish quickly for testing.") + + args = parser.parse_args() + + if args.smoke_test: + ray.init(num_cpus=2) + else: + ray.init() + + run(smoke_test=args.smoke_test) + ray.shutdown()