Mirror of https://github.com/vale981/ray (synced 2025-03-05 18:11:42 -05:00)
[RLlib] Add an RLlib Tune experiment to UserTest suite. (#19807)
* Add an RLlib Tune experiment to the UserTest suite.
* Add ray.init().
* Move the example script to examples/tune/ so it can be imported as a module.
* Add __init__.py so the new module gets included in the Python wheel.
* Add a block device to RLlib test instances.
* Reduce disk size a little bit; later revert the disk size change.
* Add metrics reporting.
* Allow a max of 5 workers to accommodate all the worker tasks; set max num workers.
* Minor updates; trigger build.
* Add a compute config for autoscaled CPU and GPU nodes.
* Use a 1-GPU instance.
* Install tblib for debugging worker crashes.
* Manually upgrade to PyTorch 1.9.0 (torch==1.9.0); install torch on the driver.
* Bump timeout; later reduce timeout.
* Write a more informative result dict; simplify the result dict.
* Revert changes to compute config files that are not used.
* Add smoke test; fix smoke test.
* Reduce the number of envs per worker to 1.
* Small fix for getting trial_states.
* Lint.

Co-authored-by: Amog Kamsetty <amogkamsetty@yahoo.com>
This commit is contained in: parent 91c730efd0, commit 2c1fa459d4.
13 changed files with 198 additions and 3 deletions.
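For orientation before the diff itself (this is not part of the commit): the new benchmarking example added here can be exercised locally roughly as follows. The module path, the run() entry point, and the smoke-test behavior come from the files shown further down; the invocation below is only a sketch, equivalent to running `python rllib/examples/tune/framework.py --smoke-test`.

# Sketch: run the new Tune/RLlib framework benchmark in smoke-test mode.
import ray
from ray.rllib.examples.tune.framework import run

ray.init(num_cpus=2)  # small local cluster, as the smoke test does
analysis = run(smoke_test=True)
# One APPO trial per framework ("tf" and "torch"), keyed by trial status.
print({t.config["framework"]: t.status for t in analysis.trials})
ray.shutdown()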
@@ -3,7 +3,6 @@ ipython
 # Needed for Ray Client error message serialization/deserialization.
 tblib
-
 # In TF >v2, GPU support is included in the base package.
 tensorflow==2.5.0
 tensorflow-probability==0.13.0
@@ -269,6 +269,12 @@ HOROVOD_SETUP_COMMANDS = [
 # 3. Use GPUs if applicable
 # 4. Have the `use_connect` flag set.
 USER_TESTS = {
+    "~/ray/release/rllib_tests/rllib_tests.yaml": [
+        ConnectTest(
+            "connect_tests",
+            requirements_file="release/rllib_tests"
+            "/connect_driver_requirements.txt")
+    ],
     "~/ray/release/ray_lightning_tests/ray_lightning_tests.yaml": [
         ConnectTest(
             "ray_lightning_user_test_latest",
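For context, a mapping like USER_TESTS pairs a release-test suite YAML with connect tests whose driver dependencies come from a requirements file. The snippet below is a minimal, self-contained sketch of that pattern only; ConnectTestSketch and the loop are hypothetical stand-ins, not the actual release test runner.

# Hypothetical sketch of the USER_TESTS pattern: install pinned driver deps,
# then hand the test over to whatever launches it.
import subprocess
import sys
from dataclasses import dataclass

@dataclass
class ConnectTestSketch:
    name: str
    requirements_file: str = None

USER_TESTS_SKETCH = {
    "~/ray/release/rllib_tests/rllib_tests.yaml": [
        ConnectTestSketch(
            "connect_tests",
            requirements_file="release/rllib_tests/connect_driver_requirements.txt"),
    ],
}

for suite_yaml, tests in USER_TESTS_SKETCH.items():
    for test in tests:
        if test.requirements_file:
            # Install the pinned driver dependencies before the driver script runs.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "-r", test.requirements_file])
        print(f"would launch {test.name} from {suite_yaml}")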
The following hunk is applied identically to three RLlib compute-config files (adding a 500 GB block device to the test instances):

@@ -14,3 +14,8 @@ worker_node_types:
     max_workers: 0
     use_spot: false
+
+aws:
+  BlockDeviceMappings:
+    - DeviceName: /dev/sda1
+      Ebs:
+        VolumeSize: 500
@@ -7,7 +7,8 @@ debian_packages:
 python:
   # These dependencies should be handled by requirements_rllib.txt and
   # requirements_ml_docker.txt
-  pip_packages: []
+  pip_packages:
+    - torch==1.9.0  # TODO(amogkam): Remove after nightly images are available.
   conda_packages: []

 post_build_cmds:
@@ -16,7 +17,7 @@ post_build_cmds:
   - sudo ln -s /usr/local/cuda /usr/local/nvidia
   - sudo ln -s /usr/local/cuda/lib64/libcusolver.so.11 /usr/local/cuda/lib64/libcusolver.so.10
   - pip install tensorflow==2.5.0
-  # END: TO-DO
+  # END: TODO

   - pip uninstall -y ray || true
   - pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
release/rllib_tests/auto_scale.yaml (new file, 26 lines)

cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 10

head_node_type:
    name: head_node
    instance_type: m5.xlarge

worker_node_types:
  - name: worker_node_cpu
    instance_type: m5.xlarge
    min_workers: 0
    max_workers: 10
    use_spot: false
  - name: worker_node_gpu
    instance_type: g3.4xlarge
    min_workers: 0
    max_workers: 10
    use_spot: false

aws:
  BlockDeviceMappings:
    - DeviceName: /dev/sda1
      Ebs:
        VolumeSize: 500
release/rllib_tests/connect_driver_requirements.txt (new file, 8 lines)

# Make sure the driver versions are the same as cluster versions.
# The cluster uses ray-ml Docker image.
# ray-ml Docker image installs dependencies from ray/python/requirements/ml/ directory.
# We constrain on these requirements file so that the same versions are installed.
-c ../../python/requirements/ml/requirements_dl.txt

tensorflow
torch
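Usage note (an assumption about how this file is meant to be consumed, not something stated in the diff): pip treats the `-c` line inside a requirements file as a constraints include, so installing from this file on the driver machine, e.g. `pip install -r release/rllib_tests/connect_driver_requirements.txt`, pins tensorflow and torch to the versions listed in requirements_dl.txt, keeping the driver in sync with the ray-ml cluster image.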
release/rllib_tests/connect_tests/run_connect_tests.py (new file, 38 lines)

"""Connect tests for Tune & RLlib.

Runs a couple of hard learning tests using Anyscale connect.
"""

import json
import os
import time

import ray
from ray.rllib.examples.tune.framework import run

if __name__ == "__main__":
    addr = os.environ.get("RAY_ADDRESS")
    job_name = os.environ.get("RAY_JOB_NAME", "rllib_connect_tests")
    if addr is not None and addr.startswith("anyscale://"):
        ray.init(address=addr, job_name=job_name)
    else:
        ray.init(address="auto")

    start_time = time.time()
    exp_analysis = run()
    end_time = time.time()

    result = {
        "time_taken": end_time - start_time,
        "trial_states": {
            t.config["framework"]: t.status
            for t in exp_analysis.trials
        },
    }

    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
                                      "/tmp/release_test_out.json")
    with open(test_output_json, "wt") as f:
        json.dump(result, f)

    print("Ok.")
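A brief usage note that follows from the script above: the target cluster is chosen from RAY_ADDRESS (with a fallback to an already-running local cluster via address="auto"), and the result dict is written to TEST_OUTPUT_JSON, defaulting to /tmp/release_test_out.json. The snippet below is only an illustrative sketch for sanity-checking a run, not part of the commit.

# Illustrative only: inspect the result file written by run_connect_tests.py.
import json

with open("/tmp/release_test_out.json") as f:
    result = json.load(f)

# Expect one entry per framework in the grid search ("tf" and "torch").
print(result["time_taken"], result["trial_states"])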
@@ -68,3 +68,14 @@
   smoke_test:
     run:
       timeout: 2000
+
+# Tests that exercise auto-scaling and Anyscale connect.
+- name: connect_tests
+  cluster:
+    app_config: app_config.yaml
+    compute_template: auto_scale.yaml
+
+  run:
+    use_connect: True
+    timeout: 3000
+    script: python connect_tests/run_connect_tests.py
@@ -2533,6 +2533,15 @@ py_test(
     args = ["--as-test", "--framework=torch", "--stop-reward=100.0"]
 )
+
+py_test(
+    name = "examples/tune/framework",
+    main = "examples/tune/framework.py",
+    tags = ["team:ml", "examples", "examples_F"],
+    size = "medium",
+    srcs = ["examples/tune/framework.py"],
+    args = ["--smoke-test"]
+)

 py_test(
     name = "examples/two_trainer_workflow_tf",
     main = "examples/two_trainer_workflow.py",
rllib/examples/tune/__init__.py (new, empty file)
rllib/examples/tune/framework.py (new file, 82 lines)

#!/usr/bin/env python3
""" Benchmarking TF against PyTorch on an example task using Ray Tune.
"""

import logging
from pprint import pformat

import ray
from ray import tune
from ray.tune import CLIReporter

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger("tune_framework")


def run(smoke_test=False):
    stop = {"training_iteration": 1 if smoke_test else 50}
    num_workers = 1 if smoke_test else 50
    num_gpus = 0 if smoke_test else 1

    config = {
        "env": "PongNoFrameskip-v4",
        "framework": tune.grid_search(["tf", "torch"]),
        "num_gpus": num_gpus,
        "rollout_fragment_length": 50,
        "train_batch_size": 750,
        "num_workers": num_workers,
        "num_envs_per_worker": 1,
        "clip_rewards": True,
        "num_sgd_iter": 2,
        "vf_loss_coeff": 1.0,
        "clip_param": 0.3,
        "grad_clip": 10,
        "vtrace": True,
        "use_kl_loss": False,
    }
    logger.info("Configuration: \n %s", pformat(config))

    # Run the experiment.
    # TODO(jungong) : maybe add checkpointing.
    return tune.run(
        "APPO",
        config=config,
        stop=stop,
        verbose=1,
        num_samples=1,
        progress_reporter=CLIReporter(
            metric_columns={
                "training_iteration": "iter",
                "time_total_s": "time_total_s",
                "timesteps_total": "ts",
                "snapshots": "snapshots",
                "episodes_this_iter": "train_episodes",
                "episode_reward_mean": "reward_mean",
            },
            sort_by_metric=True,
            max_report_frequency=30,
        ))


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Tune+RLlib Example",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Finish quickly for testing.")

    args = parser.parse_args()

    if args.smoke_test:
        ray.init(num_cpus=2)
    else:
        ray.init()

    run(smoke_test=args.smoke_test)
    ray.shutdown()
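One design note on the example above: because "framework" is a tune.grid_search over ["tf", "torch"], a single tune.run call launches one APPO trial per framework, which is what lets run_connect_tests.py key its trial_states dict by t.config["framework"]. In Ray Tune versions contemporary with this commit, grid_search is just a marker dict that Tune expands at run time; a tiny illustration (assumption, not part of the commit):

# grid_search over two frameworks yields two trials from one tune.run call.
from ray import tune
print(tune.grid_search(["tf", "torch"]))  # {'grid_search': ['tf', 'torch']}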