ray/release/rllib_tests/stress_tests/cluster.yaml
Kai Fricke 1d52ab819f
[release] release 1.3.0 results and test updates (#15366)
Convert a number of release tests and add logs for release 1.3.0
2021-05-04 22:10:04 +01:00

44 lines
1,015 B
YAML

# Cluster definition for the RLlib stress tests.
cluster_name: ray-rllib-stress-tests
# Fixed-size cluster: the minimum and maximum worker counts are both 9.
min_workers: 9
max_workers: 9
# NOTE(review): presumably the number of idle minutes after which a worker
# node is removed -- confirm against the Ray cluster config reference.
idle_timeout_minutes: 15
# Container settings. The keys below must be nested under `docker`; at
# column 0 they would be parsed as unrelated top-level keys and `docker`
# itself would be null.
docker:
  image: anyscale/ray-ml:latest-gpu
  container_name: ray_container
  # `latest-gpu` is a moving tag, so the image is pulled again before a run.
  # Written as the canonical lowercase boolean.
  pull_before_run: true
# Cloud provider settings: AWS, pinned to a single availability zone.
# The keys below must be nested under `provider`.
provider:
  type: aws
  region: us-west-2
  availability_zone: us-west-2a
  # NOTE(review): per the Ray cluster config reference, disabling this makes
  # scaled-down nodes get terminated instead of stopped for reuse -- confirm.
  cache_stopped_nodes: false
# Node types available to the cluster. Each type is its own mapping; without
# this nesting the per-type keys (node_config, resources, min_workers,
# max_workers) would be duplicate keys and would clash with the top-level
# worker counts.
available_node_types:
  # GPU type. Its worker counts are 0, so it is only used where referenced
  # explicitly (it is the `head_node_type` of this cluster).
  gpu_ondemand:
    node_config:
      InstanceType: p3.16xlarge
    resources: {"CPU": 56, "GPU": 8}  # Keep one CPU free for each GPU
    min_workers: 0
    max_workers: 0
  # CPU type. Exactly 9 workers, matching the top-level min/max worker counts.
  cpu_ondemand:
    node_config:
      InstanceType: m5.16xlarge
    resources: {"CPU": 64}
    min_workers: 9
    max_workers: 9
# SSH login used to reach the nodes. `ssh_user` must be nested under `auth`.
auth:
  ssh_user: ubuntu
# Both values name entries of `available_node_types`: the head node uses the
# GPU type, and workers default to the CPU type.
head_node_type: gpu_ondemand
worker_default_node_type: cpu_ondemand
# Shell commands executed during node setup, in the order listed.
setup_commands:
  # Use apt-get throughout: it has a stable command-line interface for
  # scripts, whereas `apt` warns when used non-interactively.
  - sudo apt-get update
  # cuDNN 7 built for CUDA 10.1 (presumably to match tensorflow-gpu 2.3.0).
  # Alternative package for CUDA 11.0: libcudnn8=8.0.5.39-1+cuda11.0
  - sudo apt-get install -y libglib2.0-0 libcudnn7=7.6.5.32-1+cuda10.1
  - pip install tensorflow-gpu==2.3.0
  # Template placeholder: expected to be rendered from the RAY_WHEEL entry of
  # `env` before this file is parsed as YAML -- confirm with the test runner.
  - pip install -U {{env["RAY_WHEEL"]}}