[RLlib] Add an RLlib Tune experiment to UserTest suite. (#19807)

* Add an RLlib Tune experiment to UserTest suite.

* Add ray.init()

* Move example script to examples/tune/, so it can be imported as a module.

* add __init__.py so our new module will get included in the Python wheel.

* Add block device to RLlib test instances.

* Reduce disk size a little bit.

* Add metrics reporting

* Allow max of 5 workers to accommodate all the worker tasks.

* revert disk size change.

* Minor updates

* Trigger build

* set max num workers

* Add a compute cfg for autoscaled cpu and gpu nodes.

* use 1gpu instance.

* install tblib for debugging worker crashes.

* Manually upgrade to pytorch 1.9.0

* -y

* torch=1.9.0

* install torch on driver

* bump timeout

* Write a more informative result dict.

* Revert changes to compute config files that are not used.

* add smoke test

* update

* reduce timeout

* Reduce the # of env per worker to 1.

* Small fix for getting trial_states

* Trigger build

* simplify result dict

* lint

* more lint

* fix smoke test

Co-authored-by: Amog Kamsetty <amogkamsetty@yahoo.com>
gjoliver 2021-11-03 17:04:27 -07:00 committed by GitHub
parent 91c730efd0
commit 2c1fa459d4
13 changed files with 198 additions and 3 deletions

@@ -3,7 +3,6 @@ ipython
# Needed for Ray Client error message serialization/deserialization.
tblib
# In TF >v2, GPU support is included in the base package.
tensorflow==2.5.0
tensorflow-probability==0.13.0
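
For context on the tblib entry above: Ray Client ships worker exception tracebacks between processes, and raw traceback objects are not picklable without help. A minimal sketch of what tblib enables (standalone illustration, not part of this diff):

import pickle
import sys

from tblib import pickling_support

# Patch pickle so TracebackType instances can be (de)serialized.
pickling_support.install()

try:
    1 / 0
except ZeroDivisionError:
    exc_info = sys.exc_info()
    payload = pickle.dumps(exc_info)  # fails without pickling_support.install()

# On the receiving side, the traceback can be restored and re-raised.
exc_type, exc_value, tb = pickle.loads(payload)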

@@ -269,6 +269,12 @@ HOROVOD_SETUP_COMMANDS = [
# 3. Use GPUs if applicable
# 4. Have the `use_connect` flag set.
USER_TESTS = {
    "~/ray/release/rllib_tests/rllib_tests.yaml": [
        ConnectTest(
            "connect_tests",
            requirements_file="release/rllib_tests"
            "/connect_driver_requirements.txt")
    ],
    "~/ray/release/ray_lightning_tests/ray_lightning_tests.yaml": [
        ConnectTest(
            "ray_lightning_user_test_latest",

@@ -14,3 +14,8 @@ worker_node_types:
  max_workers: 0
  use_spot: false
  aws:
    BlockDeviceMappings:
      - DeviceName: /dev/sda1
        Ebs:
          VolumeSize: 500

@@ -14,3 +14,8 @@ worker_node_types:
  max_workers: 0
  use_spot: false
  aws:
    BlockDeviceMappings:
      - DeviceName: /dev/sda1
        Ebs:
          VolumeSize: 500

@@ -14,3 +14,8 @@ worker_node_types:
  max_workers: 0
  use_spot: false
  aws:
    BlockDeviceMappings:
      - DeviceName: /dev/sda1
        Ebs:
          VolumeSize: 500

@@ -7,7 +7,8 @@ debian_packages:
python:
  # These dependencies should be handled by requirements_rllib.txt and
  # requirements_ml_docker.txt
  pip_packages: []
  pip_packages:
    - torch==1.9.0  # TODO(amogkam): Remove after nightly images are available.
  conda_packages: []

post_build_cmds:
@@ -16,7 +17,7 @@ post_build_cmds:
  - sudo ln -s /usr/local/cuda /usr/local/nvidia
  - sudo ln -s /usr/local/cuda/lib64/libcusolver.so.11 /usr/local/cuda/lib64/libcusolver.so.10
  - pip install tensorflow==2.5.0
  # END: TO-DO
  # END: TODO
  - pip uninstall -y ray || true
  - pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
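
Aside: the RAY_WHEELS line above uses Jinja-style templating. A minimal sketch of how the default filter resolves (jinja2 assumed; wheel URL illustrative):

from jinja2 import Template

# Renders the env value when RAY_WHEELS is set, else falls back to "ray".
cmd = Template('pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}')
print(cmd.render(env={}))  # -> pip3 install -U ray
print(cmd.render(env={"RAY_WHEELS": "https://example.com/ray-nightly.whl"}))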

@@ -0,0 +1,26 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 10

head_node_type:
  name: head_node
  instance_type: m5.xlarge

worker_node_types:
- name: worker_node_cpu
  instance_type: m5.xlarge
  min_workers: 0
  max_workers: 10
  use_spot: false
- name: worker_node_gpu
  instance_type: g3.4xlarge
  min_workers: 0
  max_workers: 10
  use_spot: false
  aws:
    BlockDeviceMappings:
      - DeviceName: /dev/sda1
        Ebs:
          VolumeSize: 500
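
Both worker groups start at min_workers: 0, so nodes are launched only when a workload requests their resources. A minimal sketch of a task that would make the autoscaler bring up a GPU node (illustrative; assumes a cluster launched from this config):

import ray

ray.init(address="auto")

@ray.remote(num_gpus=1)
def gpu_task():
    # The num_gpus=1 requirement is what prompts the autoscaler to
    # launch a g3.4xlarge worker from the worker_node_gpu group.
    return "ran on a GPU node"

print(ray.get(gpu_task.remote()))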

@@ -0,0 +1,8 @@
# Make sure the driver versions are the same as the cluster versions.
# The cluster uses the ray-ml Docker image.
# The ray-ml Docker image installs dependencies from the ray/python/requirements/ml/ directory.
# We constrain against that requirements file so that the same versions are installed.
-c ../../python/requirements/ml/requirements_dl.txt
tensorflow
torch

@@ -0,0 +1,38 @@
"""Connect tests for Tune & RLlib.

Runs a couple of hard learning tests using Anyscale connect.
"""
import json
import os
import time

import ray

from ray.rllib.examples.tune.framework import run

if __name__ == "__main__":
    addr = os.environ.get("RAY_ADDRESS")
    job_name = os.environ.get("RAY_JOB_NAME", "rllib_connect_tests")
    if addr is not None and addr.startswith("anyscale://"):
        ray.init(address=addr, job_name=job_name)
    else:
        ray.init(address="auto")

    start_time = time.time()
    exp_analysis = run()
    end_time = time.time()

    result = {
        "time_taken": end_time - start_time,
        "trial_states": {
            t.config["framework"]: t.status
            for t in exp_analysis.trials
        },
    }

    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
                                      "/tmp/release_test_out.json")
    with open(test_output_json, "wt") as f:
        json.dump(result, f)

    print("Ok.")

@@ -68,3 +68,14 @@
  smoke_test:
    run:
      timeout: 2000

# Tests that exercise auto-scaling and Anyscale connect.
- name: connect_tests
  cluster:
    app_config: app_config.yaml
    compute_template: auto_scale.yaml

  run:
    use_connect: True
    timeout: 3000
    script: python connect_tests/run_connect_tests.py

@@ -2533,6 +2533,15 @@ py_test(
    args = ["--as-test", "--framework=torch", "--stop-reward=100.0"]
)

py_test(
    name = "examples/tune/framework",
    main = "examples/tune/framework.py",
    tags = ["team:ml", "examples", "examples_F"],
    size = "medium",
    srcs = ["examples/tune/framework.py"],
    args = ["--smoke-test"]
)

py_test(
    name = "examples/two_trainer_workflow_tf",
    main = "examples/two_trainer_workflow.py",

(new empty file: rllib/examples/tune/__init__.py, added so the example can be imported as a module)

@@ -0,0 +1,82 @@
#!/usr/bin/env python3
"""Benchmarking TF against PyTorch on an example task using Ray Tune.
"""
import logging
from pprint import pformat

import ray
from ray import tune
from ray.tune import CLIReporter

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger("tune_framework")


def run(smoke_test=False):
    stop = {"training_iteration": 1 if smoke_test else 50}
    num_workers = 1 if smoke_test else 50
    num_gpus = 0 if smoke_test else 1

    config = {
        "env": "PongNoFrameskip-v4",
        "framework": tune.grid_search(["tf", "torch"]),
        "num_gpus": num_gpus,
        "rollout_fragment_length": 50,
        "train_batch_size": 750,
        "num_workers": num_workers,
        "num_envs_per_worker": 1,
        "clip_rewards": True,
        "num_sgd_iter": 2,
        "vf_loss_coeff": 1.0,
        "clip_param": 0.3,
        "grad_clip": 10,
        "vtrace": True,
        "use_kl_loss": False,
    }
    logger.info("Configuration: \n %s", pformat(config))

    # Run the experiment.
    # TODO(jungong) : maybe add checkpointing.
    return tune.run(
        "APPO",
        config=config,
        stop=stop,
        verbose=1,
        num_samples=1,
        progress_reporter=CLIReporter(
            metric_columns={
                "training_iteration": "iter",
                "time_total_s": "time_total_s",
                "timesteps_total": "ts",
                "snapshots": "snapshots",
                "episodes_this_iter": "train_episodes",
                "episode_reward_mean": "reward_mean",
            },
            sort_by_metric=True,
            max_report_frequency=30,
        ))


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Tune+RLlib Example",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Finish quickly for testing.")
    args = parser.parse_args()

    if args.smoke_test:
        ray.init(num_cpus=2)
    else:
        ray.init()

    run(smoke_test=args.smoke_test)

    ray.shutdown()
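
For a quick local check, the smoke-test path can also be driven directly from Python, which is equivalent to running python framework.py --smoke-test:

import ray
from ray.rllib.examples.tune.framework import run

ray.init(num_cpus=2)
analysis = run(smoke_test=True)  # 1 training iteration, 1 worker, no GPU
ray.shutdown()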