# ray/release/rllib_tests/stress_tests/cluster.yaml

# Name of the cluster this config describes.
cluster_name: ray-rllib-stress-tests

# Lower and upper worker-count bounds are both 9, so the worker count is
# pinned at 9 rather than left to vary.
min_workers: 9
max_workers: 9

# Idle timeout, in minutes.
# NOTE(review): presumably the autoscaler's idle-node removal threshold; with
# min_workers == max_workers it may have no practical effect -- confirm.
idle_timeout_minutes: 15

# Container settings applied to the cluster's nodes.
docker:
    # GPU variant of the Ray ML image, following the moving "latest" tag.
    image: "anyscale/ray-ml:latest-gpu"
    container_name: "ray_container"
    # Canonical lowercase boolean; the image is pulled before it is run.
    pull_before_run: true

# Cloud provider that hosts the cluster nodes.
provider:
    type: "aws"
    # All nodes are placed in a single availability zone of us-west-2.
    region: "us-west-2"
    availability_zone: "us-west-2a"
    # Stopped-node caching is disabled.
    # NOTE(review): presumably nodes are terminated instead of stopped for
    # reuse -- confirm against the Ray AWS provider documentation.
    cache_stopped_nodes: false

# Node types the cluster may launch. The names defined here are referenced by
# head_node_type and worker_default_node_type elsewhere in this file.
available_node_types:
    # GPU node type; worker bounds are zero, so it is never launched as a
    # worker.
    gpu_ondemand:
        node_config:
            InstanceType: p3.16xlarge
        # Advertised resources. Keep one CPU free for each GPU.
        resources:
            CPU: 56
            GPU: 8
        min_workers: 0
        max_workers: 0
    # CPU-only node type; worker bounds are both 9, fixing the count at 9.
    cpu_ondemand:
        node_config:
            InstanceType: m5.16xlarge
        resources:
            CPU: 64
        min_workers: 9
        max_workers: 9

# Credentials used to reach the nodes over SSH.
auth:
    # Login user for SSH connections.
    ssh_user: ubuntu

# The head node uses the gpu_ondemand type defined under available_node_types.
head_node_type: gpu_ondemand
# Workers default to the cpu_ondemand type defined under available_node_types.
worker_default_node_type: cpu_ondemand

# Shell commands that prepare a node, listed in the order they should run.
setup_commands:
    # Refresh the package index with apt-get rather than the interactive
    # `apt` front end: apt's command-line interface is not guaranteed stable
    # for scripted use, and the install step below already uses apt-get.
    - sudo apt-get update
    # Install GLib plus cuDNN 7, pinned to the build for CUDA 10.1.
    - sudo apt-get install -y libglib2.0-0 libcudnn7=7.6.5.32-1+cuda10.1
    # Unused alternative pin (cuDNN 8 build for CUDA 11.0):
    # libcudnn8=8.0.5.39-1+cuda11.0
    - pip install tensorflow-gpu==2.3.0
    # RAY_WHEEL is a template placeholder filled from the environment.
    # NOTE(review): presumably rendered by the release tooling before this
    # file is parsed as YAML -- confirm.
    - pip install -U {{env["RAY_WHEEL"]}}