---
# Ray cluster configuration for the RLlib stress tests.
# One GPU head node plus a fixed pool of nine CPU workers on AWS, all running
# inside the Docker image named below.

cluster_name: ray-rllib-stress-tests

# Fixed-size cluster: min and max are equal, so the worker count stays at 9.
min_workers: 9
max_workers: 9

idle_timeout_minutes: 15

docker:
  image: anyscale/ray-ml:latest-gpu
  container_name: ray_container
  pull_before_run: true

provider:
  type: aws
  region: us-west-2
  availability_zone: us-west-2a
  cache_stopped_nodes: false

available_node_types:
  # Used only as the head node (see head_node_type), hence zero workers.
  gpu_ondemand:
    node_config:
      InstanceType: p3.16xlarge
    # Keep one CPU free for each GPU
    resources:
      CPU: 56
      GPU: 8
    min_workers: 0
    max_workers: 0
  # Worker pool (see worker_default_node_type); pinned to exactly 9 nodes.
  cpu_ondemand:
    node_config:
      InstanceType: m5.16xlarge
    resources:
      CPU: 64
    min_workers: 9
    max_workers: 9

auth:
  ssh_user: ubuntu

head_node_type: gpu_ondemand
worker_default_node_type: cpu_ondemand

setup_commands:
  - sudo apt update
  # Alternative cuDNN pin, kept for reference: libcudnn8=8.0.5.39-1+cuda11.0
  - sudo apt-get install -y libglib2.0-0 libcudnn7=7.6.5.32-1+cuda10.1
  - pip install tensorflow-gpu==2.3.0
  # NOTE(review): {{env["RAY_WHEEL"]}} is a template placeholder, presumably
  # substituted by the release tooling before this file is parsed - confirm.
  # Quoted so an expanded value containing ' #' or ': ' cannot break the YAML.
  - 'pip install -U {{env["RAY_WHEEL"]}}'