ray/release/long_running_distributed_tests/cluster.yaml
Kai Fricke b366500938
[tune] fix long running release test WIP (#14866)
- Use placement groups
- Introduce time between checks for failure testing
- Use gloo instead of nccl
2021-03-25 11:03:22 +01:00

37 lines
676 B
YAML

# Cluster-wide settings.
cluster_name: long-running-distributed-tests
# NOTE(review): presumably the cluster-wide worker cap; a second `max_workers`
# key appears further down with the gpu_2_32 node type -- confirm the two are
# at different nesting levels so they do not collide.
max_workers: 3
# NOTE(review): presumably minutes of idleness before a worker is removed --
# confirm against the Ray autoscaler docs.
idle_timeout_minutes: 15
# Container configuration. The child keys must be indented under `docker:`;
# written flush with it, `docker` parses as null and the three settings become
# unrelated top-level keys.
docker:
  image: anyscale/ray-ml:nightly-gpu
  container_name: ray_container
  # Canonical lowercase boolean, consistent with file_mounts_sync_continuously.
  pull_before_run: true
# Cloud provider configuration. The child keys must be indented under
# `provider:`; written flush with it, `provider` parses as null and generic
# names such as `type` and `region` leak into the top-level mapping.
provider:
  type: aws
  region: us-west-2
  availability_zone: us-west-2a
  # Canonical lowercase boolean, consistent with file_mounts_sync_continuously.
  cache_stopped_nodes: false
# Node types available to the cluster, keyed by node type name.
# Nesting matters here: without it, `max_workers` below duplicates the
# top-level `max_workers` key and the node type itself parses as null.
available_node_types:
  gpu_2_32:
    # Settings under node_config name the AWS instance type to launch.
    node_config:
      InstanceType: g3.8xlarge
    # NOTE(review): resources/min_workers/max_workers are placed as siblings of
    # node_config per the Ray cluster config schema -- confirm against upstream.
    resources: {"CPU": 32, "GPU": 2}
    min_workers: 3
    max_workers: 3
# SSH authentication settings. `ssh_user` must be indented under `auth:`;
# written flush with it, `auth` parses as null.
auth:
  ssh_user: ubuntu
# Both the head node and the default worker use the gpu_2_32 node type.
head_node_type: gpu_2_32
worker_default_node_type: gpu_2_32

file_mounts_sync_continuously: false

# Setup command list. The {{env["RAY_WHEEL"]}} placeholder is left exactly as
# written.
# NOTE(review): presumably substituted by the release tooling before this file
# is loaded -- confirm.
setup_commands:
  - pip install -U {{env["RAY_WHEEL"]}}