[autoscaler][interface] Use multi node types in defaults.yaml and example-full.yaml (#14239)

* random doc typo

* example-full-multi

* left off max workers

* wip

* address comments, modify defaults, wip

* fix

* wip

* reformat more things

* undo useless diff

* space

* max workers

* space

* copy-paste mishaps

* space

* More copy-paste mishaps

* copy-paste issues, space, max_workers

* head_node_type

* legacy yamls

* line undeleted

* correct-gpu

* Remove redundant GPU example.

* Extraneous comment

* whitespace

* example-java.yaml

* Revert "example-java.yaml"

This reverts commit 1e9c0124b9d97e651aaeeb6ec5bf7a4ef2a2df17.

* tests and other things

* doc

* doc

* revert max worker default

* Kubernetes comment

* wip

* wip

* tweak

* Address comments

* test_resource_demand_scheduler fixes

* Head type min/max workers, aws resources

* fix example_cluster2.yaml

* Fix external node type test (compatibility with legacy-style external node types)

* fix test_autoscaler_aws

* gcp-images

* gcp node type names

* fix gcp defaults

* doc format

* typo

* Skip failed Windows tests

* doc string and comment

* assert

* remove contents of default external head and worker

* legacy external failed validation test

* Readability -- define the minimal external config at the top of the file.

* Remove default worker type min worker

* Remove extraneous global min_workers comment.

* per-node-type docker in aws/example-gpu-docker

* ray.worker.small -> ray.worker.default

* fix-docker

* fix gpu docker again

* undo kubernetes experiment

* fix doc

* remove worker max_worker from kubernetes

* remove max_worker from local worker node type

* fix doc again

* py38

* eric-comment

* fix cluster name

* fix-test-autoscaler

* legacy config logic

* pop resources

* Remove min_workers AFTER merge

* comment, warning message

* warning, comment

Commit 1675156a8b (parent ef873be9e8), authored by Dmitri Gekhtman on 2021-03-02 20:16:19 -08:00, committed by GitHub.
32 changed files with 1774 additions and 715 deletions
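
For orientation, here is a hedged sketch of the multi-node-type layout this change standardizes on, written as the Python dict the autoscaler works with after the YAML is loaded. The type names and fields mirror the AWS example-full.yaml in the diff below; the instance types and worker counts are illustrative only.

# Illustrative sketch -- mirrors aws/example-full.yaml from the diff below.
config = {
    "cluster_name": "default",
    "max_workers": 2,  # global cap on workers across all node types
    "available_node_types": {
        "ray.head.default": {
            "min_workers": 0,
            "max_workers": 0,  # the head node type launches no workers
            "resources": {},   # empty -> auto-detected from the instance type
            "node_config": {"InstanceType": "m5.large"},
        },
        "ray.worker.default": {
            "min_workers": 0,  # per-type max_workers defaults to the global value
            "resources": {},
            "node_config": {
                "InstanceType": "m5.large",
                "InstanceMarketOptions": {"MarketType": "spot"},
            },
        },
    },
    "head_node_type": "ray.head.default",  # which node type the head uses
}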


@ -341,14 +341,13 @@ The key is the name of the node type, which is just for debugging purposes.
resources: {"CPU": 2}
min_workers: 0
max_workers: 0
ray.worker.small:
ray.worker.default:
node_config:
InstanceType: m5.large
InstanceMarketOptions:
MarketType: spot
resources: {"CPU": 2}
min_workers: 0
max_workers: 1
.. _cluster-configuration-head-node-type:
@ -1073,12 +1072,12 @@ Minimal configuration
:language: yaml
.. group-tab:: Azure
.. literalinclude:: ../../../python/ray/autoscaler/azure/example-minimal.yaml
:language: yaml
.. group-tab:: GCP
.. literalinclude:: ../../../python/ray/autoscaler/gcp/example-minimal.yaml
:language: yaml
@ -1092,11 +1091,11 @@ Full configuration
:language: yaml
.. group-tab:: Azure
.. literalinclude:: ../../../python/ray/autoscaler/azure/example-full.yaml
:language: yaml
.. group-tab:: GCP
.. literalinclude:: ../../../python/ray/autoscaler/gcp/example-full.yaml
:language: yaml


@ -71,8 +71,14 @@ def fillout_resources_kubernetes(config):
return config
node_types = copy.deepcopy(config["available_node_types"])
for node_type in node_types:
container_data = node_types[node_type]["node_config"]["spec"][
"containers"][0]
node_config = node_types[node_type]["node_config"]
# The next line is for compatibility with configs like
# kubernetes/example-ingress.yaml,
# cf. KubernetesNodeProvider.create_node().
pod = node_config.get("pod", node_config)
container_data = pod["spec"]["containers"][0]
autodetected_resources = get_autodetected_resources(container_data)
if "resources" not in config["available_node_types"][node_type]:
config["available_node_types"][node_type]["resources"] = {}


@ -1,3 +1,4 @@
import copy
import importlib
import logging
import json
@ -11,6 +12,17 @@ logger = logging.getLogger(__name__)
# For caching provider instantiations across API calls of one python session
_provider_instances = {}
# Minimal config for compatibility with legacy-style external configs.
MINIMAL_EXTERNAL_CONFIG = {
"available_node_types": {
"ray.head.default": {},
"ray.worker.default": {},
},
"head_node_type": "ray.head.default",
"head_node": {},
"worker_nodes": {},
}
def _import_aws(provider_config):
from ray.autoscaler._private.aws.node_provider import AWSNodeProvider
@ -192,7 +204,7 @@ def _get_default_config(provider_config):
package outside the autoscaler.
"""
if provider_config["type"] == "external":
return {}
return copy.deepcopy(MINIMAL_EXTERNAL_CONFIG)
load_config = _DEFAULT_CONFIGS.get(provider_config["type"])
if load_config is None:
raise NotImplementedError("Unsupported node provider: {}".format(
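
A small illustration of why the deepcopy matters here (assumption-level, not taken from the Ray test suite): callers merge user config into the returned defaults in place, so returning the module-level MINIMAL_EXTERNAL_CONFIG itself would let one call's mutations leak into every later call.

import copy

MINIMAL_EXTERNAL_CONFIG = {
    "available_node_types": {"ray.head.default": {}, "ray.worker.default": {}},
    "head_node_type": "ray.head.default",
}

defaults = copy.deepcopy(MINIMAL_EXTERNAL_CONFIG)
# Caller mutates its copy...
defaults["available_node_types"]["ray.head.default"]["resources"] = {"CPU": 4}
# ...and the shared constant stays pristine for the next call.
assert "resources" not in MINIMAL_EXTERNAL_CONFIG["available_node_types"]["ray.head.default"]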


@ -1,4 +1,5 @@
import collections
import copy
from datetime import datetime
import logging
import hashlib
@ -103,38 +104,91 @@ def prepare_config(config):
return with_defaults
def rewrite_legacy_yaml_to_available_node_types(
config: Dict[str, Any]) -> Dict[str, Any]:
if "available_node_types" not in config:
# TODO(ameer/ekl/alex): we can also rewrite here many other fields
# that include initialization/setup/start commands and ImageId.
logger.debug("Converting legacy cluster config to multi node types.")
config["available_node_types"] = {
NODE_TYPE_LEGACY_HEAD: {
"node_config": config["head_node"],
"resources": config["head_node"].get("resources") or {},
"min_workers": 0,
"max_workers": 0,
},
NODE_TYPE_LEGACY_WORKER: {
"node_config": config["worker_nodes"],
"resources": config["worker_nodes"].get("resources") or {},
"min_workers": config.get("min_workers", 0),
"max_workers": config.get("max_workers", 0),
},
}
config["head_node_type"] = NODE_TYPE_LEGACY_HEAD
del config["min_workers"]
return config
def fillout_defaults(config: Dict[str, Any]) -> Dict[str, Any]:
defaults = _get_default_config(config["provider"])
defaults.update(config)
defaults["auth"] = defaults.get("auth", {})
defaults = rewrite_legacy_yaml_to_available_node_types(defaults)
return defaults
# Just for clarity:
merged_config = copy.deepcopy(defaults)
# Fill auth field to avoid key errors.
# This field is accessed when calling NodeUpdater but is not relevant to
# certain node providers and is thus left out of some cluster launching
# configs.
merged_config["auth"] = merged_config.get("auth", {})
# A legacy config is one which doesn't have available_node_types,
# but has at least one of head_node or worker_nodes.
is_legacy_config = (("available_node_types" not in config) and
("head_node" in config or "worker_nodes" in config))
# Do merging logic for legacy configs.
if is_legacy_config:
merged_config = merge_legacy_yaml_with_defaults(merged_config)
# Take care of this here, in case a config does not specify any of head,
# workers, node types, but does specify min workers:
merged_config.pop("min_workers", None)
return merged_config
def merge_legacy_yaml_with_defaults(
merged_config: Dict[str, Any]) -> Dict[str, Any]:
"""Rewrite legacy config's available node types after it has been merged
with defaults yaml.
"""
logger.warning("Converting legacy cluster config to multi node types.\n"
"Refer to the docs for examples of multi-node-type "
"autoscaling:\n"
"https://docs.ray.io/en/master/cluster/config.html"
"#full-configuration")
# Get default head and worker types.
default_head_type = merged_config["head_node_type"]
# Default configs are assumed to have two node types -- one for the head
# and one for the workers.
assert len(merged_config["available_node_types"].keys()) == 2
default_worker_type = (merged_config["available_node_types"].keys() -
{default_head_type}).pop()
if merged_config["head_node"]:
# User specified a head node in legacy config.
# Convert it into data for the head's node type.
head_node_info = {
"node_config": merged_config["head_node"],
"resources": merged_config["head_node"].get("resources") or {},
"min_workers": 0,
"max_workers": 0,
}
else:
# Use default data for the head's node type.
head_node_info = merged_config["available_node_types"][
default_head_type]
if merged_config["worker_nodes"]:
# User specified a worker node in legacy config.
# Convert it into data for the workers' node type.
worker_node_info = {
"node_config": merged_config["worker_nodes"],
"resources": merged_config["worker_nodes"].get("resources") or {},
"min_workers": merged_config.get("min_workers", 0),
"max_workers": merged_config["max_workers"],
}
else:
# Use default data for the workers' node type.
worker_node_info = merged_config["available_node_types"][
default_worker_type]
# Rewrite available_node_types.
merged_config["available_node_types"] = {
NODE_TYPE_LEGACY_HEAD: head_node_info,
NODE_TYPE_LEGACY_WORKER: worker_node_info
}
merged_config["head_node_type"] = NODE_TYPE_LEGACY_HEAD
# Resources field in head/worker fields cause node launch to fail.
merged_config["head_node"].pop("resources", None)
merged_config["worker_nodes"].pop("resources", None)
return merged_config
def merge_setup_commands(config):
@ -147,7 +201,6 @@ def merge_setup_commands(config):
def fill_node_type_max_workers(config):
"""Sets default per-node max workers to global max_workers.
This equivalent to setting the default per-node max workers to infinity,
with the only upper constraint coming from the global max_workers.
"""


@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -43,38 +39,63 @@ auth:
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
ray.head.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
ray.worker.default:
# The minimum number of nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -108,15 +129,8 @@ initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands:
# Note: if you're developing Ray, you probably want to create an AMI that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
- echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
# Consider uncommenting these if you also want to run apt-get commands during setup
# - sudo pkill -9 apt-get || true
# - sudo pkill -9 dpkg || true
# - sudo dpkg --configure -a
- echo 'export PATH="$HOME/anaconda3/envs/tensorflow2_latest_p37/bin:$PATH"' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
@ -134,3 +148,6 @@ head_start_ray_commands:
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
head_node: {}
worker_nodes: {}


@ -0,0 +1,148 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: True
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
# Availability zone(s), comma-separated, that nodes may be launched in.
# Nodes are currently spread between zones by a round-robin approach,
# however this implementation detail should not be relied upon.
availability_zone: us-west-2a,us-west-2b
# Whether to allow node reuse. If set to False, nodes will be terminated
# instead of stopped.
cache_stopped_nodes: True # If not present, the default is True.
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076


@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -57,38 +53,66 @@ auth:
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
ray.head.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
ray.worker.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 2
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -146,3 +170,6 @@ head_start_ray_commands:
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
head_node: {}
worker_nodes: {}


@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: gpu-docker
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -23,10 +19,6 @@ docker:
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_nvidia_docker" # e.g. ray_docker
# # Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# worker_image: "rayproject/ray-ml:latest"
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
@ -48,38 +40,74 @@ auth:
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
InstanceType: p2.xlarge
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
# GPU head node.
ray.head.gpu:
# worker_image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: p2.xlarge
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# CPU workers.
ray.worker.default:
# Override global docker setting.
# This node type will run a CPU image,
# rather than the GPU image specified in the global docker settings.
docker:
worker_image: "rayproject/ray-ml:latest-cpu"
# The minimum number of nodes of this type to launch.
# This number should be >= 0.
min_workers: 1
# The maximum number of workers nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 2
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.gpu
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.


@ -2,7 +2,7 @@
cluster_name: minimal
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers. min_workers default to 0.
# node. min_workers default to 0.
max_workers: 1
# Cloud-provider specific configuration.


@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -17,7 +13,7 @@ upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
# Empty object means disabled.
docker: {}
# If a node is idle for this many minutes, it will be removed.
@ -46,30 +42,52 @@ auth:
# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
# Provider-specific config for the head node, e.g. instance type.
head_node:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
ray.head.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config, e.g. instance type.
node_config:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# Provider-specific config for worker nodes, e.g. instance type.
worker_nodes:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
ray.worker.default:
# The minimum number of nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config, e.g. instance type.
node_config:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -134,3 +152,6 @@ head_start_ray_commands:
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
head_node: {}
worker_nodes: {}


@ -19,18 +19,20 @@ upscaling_speed: 1.0
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu"
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_docker"
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: False
pull_before_run: True
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest"
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
@ -42,7 +44,7 @@ provider:
location: westus2
resource_group: ray-cluster
# set subscription id otherwise the default from az cli will be used
# subscription_id: 00000000-0000-0000-0000-000000000000
# subscription_id: 00000000-0000-0000-0000-000000000000
# How Ray will authenticate with newly launched nodes.
auth:
@ -53,27 +55,35 @@ auth:
# changes to this should match what is specified in file_mounts
ssh_public_key: ~/.ssh/id_rsa.pub
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields using defaults.yaml
# More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
# See documentation here: https://docs.microsoft.com/en-us/azure/templates/microsoft.compute/2019-03-01/virtualmachines
# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
# Provider-specific config for the head node, e.g. instance type.
head_node:
azure_arm_parameters:
vmSize: Standard_NC6
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: "1804"
imageSku: 1804-gen2
imageVersion: 20.07.06
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields using defaults.yaml
# Provider-specific config for worker nodes, e.g. instance type.
worker_nodes:
azure_arm_parameters:
vmSize: Standard_NC6
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: "1804"
imageSku: 1804-gen2
imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -83,6 +93,27 @@ file_mounts: {
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
@ -92,20 +123,16 @@ initialization_commands:
# List of shell commands to run to set up nodes.
setup_commands:
# Note: if you're developing Ray, you probably want to create an AMI that
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Consider uncommenting these if you also want to run apt-get commands during setup
# - sudo pkill -9 apt-get || true
# - sudo pkill -9 dpkg || true
# - sudo dpkg --configure -a
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
head_setup_commands:
- pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0
# Custom commands that will be run on worker nodes after common setup.


@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -17,7 +13,7 @@ upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
# Empty object means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
@ -60,30 +56,55 @@ auth:
# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
# Provider-specific config for the head node, e.g. instance type.
head_node:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
ray.head.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config, e.g. instance type.
node_config:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# Provider-specific config for worker nodes, e.g. instance type.
worker_nodes:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
ray.worker.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 2
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config, e.g. instance type.
node_config:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -147,3 +168,6 @@ head_start_ray_commands:
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
head_node: {}
worker_nodes: {}


@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: gpu-docker
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -21,7 +17,7 @@ upscaling_speed: 1.0
docker:
image: "rayproject/ray-ml:latest-gpu"
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_nvidia_docker" # e.g. ray_docker
container_name: "ray_nvidia_docker"
# # Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
@ -45,17 +41,40 @@ auth:
# changes to this should match what is specified in file_mounts
ssh_public_key: ~/.ssh/id_rsa.pub
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields using defaults.yaml
head_node:
azure_arm_parameters:
vmSize: Standard_NC6s_v3
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
ray.head.gpu:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The resources provided by this node type.
resources: {"CPU": 6, "GPU": 1}
# Provider-specific config, e.g. instance type.
node_config:
azure_arm_parameters:
vmSize: Standard_NC6_v3
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields using defaults.yaml
worker_nodes:
azure_arm_parameters:
vmSize: Standard_NC6s_v3
ray.worker.gpu:
# The minimum number of nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 2
# The resources provided by this node type.
resources: {"CPU": 6, "GPU": 1}
# Provider-specific config, e.g. instance type.
node_config:
azure_arm_parameters:
vmSize: Standard_NC6_v3
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.gpu
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -69,7 +88,7 @@ file_mounts: {
# NOTE: rayproject/ray-ml:latest has ray latest bundled
setup_commands: []
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
- pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0


@ -2,7 +2,7 @@
cluster_name: minimal
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers. min_workers default to 0.
# node. min_workers default to 0.
max_workers: 1
# Cloud-provider specific configuration.


@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -39,50 +35,75 @@ auth:
# project wide meta-data.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
head_node:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
ray_head_default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
node_config:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Additional options can be found in in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# If the network interface is specified as below in both head and worker
# nodes, the manual network config is used. Otherwise an existing subnet is
# used. To use a shared subnet, ask the subnet owner to grant permission
# for 'compute.subnetworks.use' to the ray autoscaler account...
# networkInterfaces:
# - kind: compute#networkInterface
# subnetwork: path/to/subnet
# aliasIpRanges: []
ray_worker_small:
# The minimum number of nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
node_config:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Run workers on preemtible instance by default.
# Comment this out to use on-demand.
scheduling:
- preemptible: true
# Additional options can be found in in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# If the network interface is specified as below in both head and worker
# nodes, the manual network config is used. Otherwise an existing subnet is
# used. To use a shared subnet, ask the subnet owner to grant permission
# for 'compute.subnetworks.use' to the ray autoscaler account...
# networkInterfaces:
# - kind: compute#networkInterface
# subnetwork: path/to/subnet
# aliasIpRanges: []
worker_nodes:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Run workers on preemtible instance by default.
# Comment this out to use on-demand.
scheduling:
- preemptible: true
# Additional options can be found in in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# Specify the node type of the head node (as configured above).
head_node_type: ray_head_default
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -159,3 +180,6 @@ worker_start_ray_commands:
ray start
--address=$RAY_HEAD_IP:6379
--object-manager-port=8076
head_node: {}
worker_nodes: {}


@ -0,0 +1,167 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: True
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: gcp
region: us-west1
availability_zone: us-west1-a
project_id: null # Globally unique project id
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below. This requires that you have added the key into the
# project wide meta-data.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
head_node:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Additional options can be found in in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# If the network interface is specified as below in both head and worker
# nodes, the manual network config is used. Otherwise an existing subnet is
# used. To use a shared subnet, ask the subnet owner to grant permission
# for 'compute.subnetworks.use' to the ray autoscaler account...
# networkInterfaces:
# - kind: compute#networkInterface
# subnetwork: path/to/subnet
# aliasIpRanges: []
worker_nodes:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Run workers on preemptible instances by default.
# Comment this out to use on-demand.
scheduling:
- preemptible: true
# Additional options can be found in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
- pip install google-api-python-client==1.7.8
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- >-
ulimit -n 65536;
ray start
--head
--port=6379
--object-manager-port=8076
--autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- >-
ulimit -n 65536;
ray start
--address=$RAY_HEAD_IP:6379
--object-manager-port=8076

View file

@ -1,12 +1,8 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -53,50 +49,78 @@ auth:
# project wide meta-data.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
head_node:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
ray_head_default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
node_config:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Additional options can be found in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# If the network interface is specified as below in both head and worker
# nodes, the manual network config is used. Otherwise an existing subnet is
# used. To use a shared subnet, ask the subnet owner to grant permission
# for 'compute.subnetworks.use' to the ray autoscaler account...
# networkInterfaces:
# - kind: compute#networkInterface
# subnetwork: path/to/subnet
# aliasIpRanges: []
ray_worker_small:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 2
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config for worker nodes of this type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
node_config:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Run workers on preemptible instances by default.
# Comment this out to use on-demand.
scheduling:
- preemptible: true
# Additional options can be found in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# If the network interface is specified as below in both head and worker
# nodes, the manual network config is used. Otherwise an existing subnet is
# used. To use a shared subnet, ask the subnet owner to grant permission
# for 'compute.subnetworks.use' to the ray autoscaler account...
# networkInterfaces:
# - kind: compute#networkInterface
# subnetwork: path/to/subnet
# aliasIpRanges: []
worker_nodes:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Run workers on preemptible instances by default.
# Comment this out to use on-demand.
scheduling:
- preemptible: true
# Additional options can be found in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# Specify the node type of the head node (as configured above).
head_node_type: ray_head_default
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -166,3 +190,6 @@ worker_start_ray_commands:
ray start
--address=$RAY_HEAD_IP:6379
--object-manager-port=8076
head_node: {}
worker_nodes: {}

View file

@ -1,12 +1,8 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: gpu-docker
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -48,58 +44,81 @@ auth:
# project wide meta-data.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
head_node:
machineType: custom-6-16384
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
guestAccelerators:
- acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
acceleratorCount: 1
metadata:
items:
- key: install-nvidia-driver
value: "True"
scheduling:
- onHostMaintenance: TERMINATE
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
ray_head_gpu:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The resources provided by this node type.
resources: {"CPU": 6, "GPU": 1}
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
node_config:
machineType: custom-6-16384
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
guestAccelerators:
- acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
acceleratorCount: 1
metadata:
items:
- key: install-nvidia-driver
value: "True"
scheduling:
- onHostMaintenance: TERMINATE
# Additional options can be found in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
ray_worker_gpu:
# The minimum number of nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 2
# The resources provided by this node type.
resources: {"CPU": 2, "GPU": 1}
# Provider-specific config for worker nodes of this type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
node_config:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
guestAccelerators:
- acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
acceleratorCount: 1
metadata:
items:
- key: install-nvidia-driver
value: "True"
# Run workers on preemptible instances by default.
# Comment this out to use on-demand.
scheduling:
- preemptible: true
- onHostMaintenance: TERMINATE
worker_nodes:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
guestAccelerators:
- acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
acceleratorCount: 1
metadata:
items:
- key: install-nvidia-driver
value: "True"
# Run workers on preemptible instances by default.
# Comment this out to use on-demand.
scheduling:
- preemptible: true
- onHostMaintenance: TERMINATE
# Additional options can be found in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# Specify the node type of the head node (as configured above).
head_node_type: ray_head_gpu
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

View file

@ -2,7 +2,7 @@
cluster_name: minimal
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers. min_workers defaults to 0.
# node. min_workers defaults to 0.
max_workers: 1
# Cloud-provider specific configuration.

View file

@ -96,8 +96,6 @@ available_node_types:
worker_node:
# Minimum number of Ray workers of this Pod type.
min_workers: 0
# Maximum number of Ray workers of this Pod type. Takes precedence over min_workers.
max_workers: 2
node_config:
apiVersion: v1
kind: Pod
@ -136,6 +134,12 @@ available_node_types:
# cause problems for other pods.
memory: 512Mi
head_node:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
node_config:
apiVersion: v1
kind: Pod

View file

@ -139,6 +139,12 @@ available_node_types:
# cause problems for other pods.
memory: 512Mi
head_node:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
node_config:
apiVersion: v1
kind: Pod

View file

@ -17,6 +17,10 @@ spec:
# Specify the allowed pod types for this ray cluster and the resources they provide.
podTypes:
- name: head-node
# Minimum number of Ray workers of this Pod type.
minWorkers: 0
# Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
maxWorkers: 0
podConfig:
apiVersion: v1
kind: Pod

View file

@ -17,6 +17,10 @@ spec:
# Specify the allowed pod types for this ray cluster and the resources they provide.
podTypes:
- name: head-node
# Minimum number of Ray workers of this Pod type.
minWorkers: 0
# Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
maxWorkers: 0
podConfig:
apiVersion: v1
kind: Pod

View file

@ -1,16 +1,8 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default
## NOTE: Typically for local clusters, min_workers == max_workers == len(worker_ips).
## NOTE: Typically for local clusters, max_workers == len(worker_ips).
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
# Typically, min_workers == max_workers == len(worker_ips).
min_workers: 0
# The maximum number of worker nodes to launch in addition to the head node.
# This takes precedence over min_workers.
# Typically, min_workers == max_workers == len(worker_ips).
max_workers: 0
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -42,11 +34,20 @@ auth:
# Optional if an ssh private key is necessary to ssh to the cluster.
# ssh_private_key: ~/.ssh/id_rsa
# Leave this empty.
head_node: {}
# Leave this empty.
worker_nodes: {}
available_node_types:
ray.head.default:
resources: {}
min_workers: 0
max_workers: 0
# Leave this empty
node_config: {}
ray.worker.default:
resources: {}
## NOTE: Typically for local clusters, max_workers == len(worker_ips).
min_workers: 0
# Leave this empty
node_config: {}
head_node_type: ray.head.default
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -97,3 +98,6 @@ head_start_ray_commands:
worker_start_ray_commands:
- ray stop
- ray start --address=$RAY_HEAD_IP:6379
head_node: {}
worker_nodes: {}

View file

@ -2,10 +2,6 @@
# A namespace will be automatically created for each cluster_name in SKE.
cluster_name: default
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
@ -85,174 +81,184 @@ provider:
# Exposing external IP addresses for ray pods isn't currently supported.
use_internal_ips: true
# Kubernetes pod config for the head node pod.
head_node:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-head-
head_node_type: ray.head.default
# Must match the head node service selector above if a head node
# service is required.
labels:
component: ray-head
available_node_types:
ray.head.default:
resources: {"CPU": 1}
min_workers: 0
max_workers: 0
# Kubernetes pod config for the head node pod.
node_config:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-head-
# https://docs.staroid.com/ske/pod.html#pod
pod.staroid.com/spot: "false" # use on-demand instance for head.
# Must match the head node service selector above if a head node
# service is required.
labels:
component: ray-head
# Uncomment to locate the ray head pod on a dedicated Kubernetes node
# (GPU instance is only available for 'dedicated' isolation)
#pod.staroid.com/isolation: dedicated
#pod.staroid.com/instance-type: gpu-1
spec:
automountServiceAccountToken: true
# https://docs.staroid.com/ske/pod.html#pod
pod.staroid.com/spot: "false" # use on-demand instance for head.
# Restarting the head node automatically is not currently supported.
# If the head node goes down, `ray up` must be run again.
restartPolicy: Never
# Uncomment to locate the ray head pod on a dedicated Kubernetes node
# (GPU instance is only available for 'dedicated' isolation)
#pod.staroid.com/isolation: dedicated
#pod.staroid.com/instance-type: gpu-1
spec:
automountServiceAccountToken: true
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
# nfs volume provides a shared volume across all ray-nodes.
- name: nfs-volume
persistentVolumeClaim:
claimName: nfs
# Restarting the head node automatically is not currently supported.
# If the head node goes down, `ray up` must be run again.
restartPolicy: Never
containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
# - screen (used for `ray attach`)
# - kubectl (used by the autoscaler to manage worker pods)
# Image will be overridden when 'image_from_project' is true.
image: rayproject/ray
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 6379 # Redis port.
- containerPort: 6380 # Redis port.
- containerPort: 6381 # Redis port.
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
# nfs volume provides a shared volume across all ray-nodes.
- name: nfs-volume
persistentVolumeClaim:
claimName: nfs
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /nfs
name: nfs-volume
resources:
requests:
cpu: 1000m
memory: 2Gi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu
- name: RAY_ADDRESS
value: "auto"
containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
# - screen (used for `ray attach`)
# - kubectl (used by the autoscaler to manage worker pods)
# Image will be overridden when 'image_from_project' is true.
image: rayproject/ray
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 6379 # Redis port.
- containerPort: 6380 # Redis port.
- containerPort: 6381 # Redis port.
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# Kubernetes pod config for worker node pods.
worker_nodes:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-worker-
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /nfs
name: nfs-volume
resources:
requests:
cpu: 1000m
memory: 2Gi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu
- name: RAY_ADDRESS
value: "auto"
# Must match the worker node service selector above if a worker node
# service is required.
labels:
component: ray-worker
ray.worker.default:
min_workers: 0
resources: {"CPU": 1}
# Kubernetes pod config for worker node pods.
node_config:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-worker-
# https://docs.staroid.com/ske/pod.html#pod
pod.staroid.com/spot: "true" # use spot instance for workers.
# Must match the worker node service selector above if a worker node
# service is required.
labels:
component: ray-worker
# Uncomment to locate ray worker pods on a dedicated Kubernetes node
# (GPU instance is only available for 'dedicated' isolation)
#pod.staroid.com/isolation: dedicated
#pod.staroid.com/instance-type: gpu-1
spec:
serviceAccountName: default
# https://docs.staroid.com/ske/pod.html#pod
pod.staroid.com/spot: "true" # use spot instance for workers.
# Worker nodes will be managed automatically by the head node, so
# do not change the restart policy.
restartPolicy: Never
# Uncomment to locate ray worker pods on a dedicated Kubernetes node
# (GPU instance is only available for 'dedicated' isolation)
#pod.staroid.com/isolation: dedicated
#pod.staroid.com/instance-type: gpu-1
spec:
serviceAccountName: default
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
- name: nfs-volume
persistentVolumeClaim:
claimName: nfs
containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
image: rayproject/autoscaler
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# Worker nodes will be managed automatically by the head node, so
# do not change the restart policy.
restartPolicy: Never
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /nfs
name: nfs-volume
resources:
requests:
cpu: 1000m
memory: 2Gi
limits:
# This memory limit will be detected by ray and split into
# 30% for plasma, and 70% for workers.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
- name: nfs-volume
persistentVolumeClaim:
claimName: nfs
containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
image: rayproject/autoscaler
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /nfs
name: nfs-volume
resources:
requests:
cpu: 1000m
memory: 2Gi
limits:
# This memory limit will be detected by ray and split into
# 30% for plasma, and 70% for workers.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -307,3 +313,6 @@ head_start_ray_commands:
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
head_node: {}
worker_nodes: {}

View file

@ -1,8 +1,8 @@
import pytest
from ray.autoscaler._private.aws.config import _get_vpc_id_or_die, \
bootstrap_aws, \
DEFAULT_AMI
bootstrap_aws, \
DEFAULT_AMI
import ray.tests.aws.utils.stubs as stubs
import ray.tests.aws.utils.helpers as helpers
from ray.tests.aws.utils.constants import AUX_SUBNET, DEFAULT_SUBNET, \
@ -143,8 +143,10 @@ def test_fills_out_amis(iam_client_stub, ec2_client_stub):
stubs.configure_subnet_default(ec2_client_stub)
config = helpers.load_aws_example_config_file("example-full.yaml")
del config["head_node"]["ImageId"]
del config["worker_nodes"]["ImageId"]
del config["available_node_types"]["ray.head.default"]["node_config"][
"ImageId"]
del config["available_node_types"]["ray.worker.default"]["node_config"][
"ImageId"]
# Pass in SG for stub to work
config["head_node"]["SecurityGroupIds"] = ["sg-1234abcd"]

View file

@ -1,4 +1,5 @@
import json
import jsonschema
import os
import shutil
from subprocess import CalledProcessError
@ -264,6 +265,55 @@ SMALL_CLUSTER = {
"worker_start_ray_commands": ["start_ray_worker"],
}
MOCK_DEFAULT_CONFIG = {
"cluster_name": "default",
"max_workers": 2,
"upscaling_speed": 1.0,
"idle_timeout_minutes": 5,
"provider": {
"type": "mock",
"region": "us-east-1",
"availability_zone": "us-east-1a",
},
"docker": {
"image": "example",
"container_name": "mock",
},
"auth": {
"ssh_user": "ubuntu",
"ssh_private_key": os.devnull,
},
"available_node_types": {
"ray.head.default": {
"min_workers": 0,
"max_workers": 0,
"resources": {},
"node_config": {
"head_default_prop": 4
}
},
"ray.worker.default": {
"min_workers": 0,
"max_workers": 2,
"resources": {},
"node_config": {
"worker_default_prop": 7
}
}
},
"head_node_type": "ray.head.default",
"head_node": {},
"worker_nodes": {},
"file_mounts": {},
"cluster_synced_files": [],
"initialization_commands": [],
"setup_commands": [],
"head_setup_commands": [],
"worker_setup_commands": [],
"head_start_ray_commands": [],
"worker_start_ray_commands": [],
}
class LoadMetricsTest(unittest.TestCase):
def testHeartbeat(self):
@ -1645,6 +1695,28 @@ class AutoscalingTest(unittest.TestCase):
config_path, LoadMetrics(), max_failures=0, update_interval_s=0)
assert isinstance(autoscaler.provider, NodeProvider)
def testLegacyExternalNodeScalerMissingFields(self):
"""Should fail to validate legacy external config with missing
head_node, worker_nodes, or both."""
external_config = copy.deepcopy(SMALL_CLUSTER)
external_config["provider"] = {
"type": "external",
"module": "ray.autoscaler.node_provider.NodeProvider",
}
missing_workers, missing_head, missing_both = [
copy.deepcopy(external_config) for _ in range(3)
]
del missing_workers["worker_nodes"]
del missing_head["head_node"]
del missing_both["worker_nodes"]
del missing_both["head_node"]
for faulty_config in missing_workers, missing_head, missing_both:
faulty_config = prepare_config(faulty_config)
with pytest.raises(jsonschema.ValidationError):
validate_config(faulty_config)
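# --- Illustrative aside (a rough sketch, not Ray's actual validator) ---
# The rule the test above relies on, stated as a standalone check: a
# legacy-style external-provider config that carries no available_node_types
# must supply both head_node and worker_nodes, or validation rejects it.
def legacy_external_config_ok(config):
    is_external = config.get("provider", {}).get("type") == "external"
    has_node_types = "available_node_types" in config
    if is_external and not has_node_types:
        return "head_node" in config and "worker_nodes" in config
    return True

assert not legacy_external_config_ok(
    {"provider": {"type": "external"}, "head_node": {}})
assert legacy_external_config_ok(
    {"provider": {"type": "external"}, "head_node": {}, "worker_nodes": {}})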
def testExternalNodeScalerWrongImport(self):
config = SMALL_CLUSTER.copy()
config["provider"] = {

View file

@ -1,4 +1,5 @@
import jsonschema
import logging
import os
import sys
import tempfile
@ -9,10 +10,12 @@ import copy
from unittest.mock import MagicMock, Mock, patch
import pytest
from ray.autoscaler._private.util import prepare_config, validate_config
from ray.autoscaler._private.util import prepare_config, validate_config,\
_get_default_config, merge_setup_commands
from ray.autoscaler._private.providers import _NODE_PROVIDERS
from ray.autoscaler._private.kubernetes.node_provider import\
KubernetesNodeProvider
from ray.autoscaler.tags import NODE_TYPE_LEGACY_HEAD, NODE_TYPE_LEGACY_WORKER
from ray.test_utils import load_test_config, recursive_fnmatch
@ -37,18 +40,19 @@ CONFIG_PATHS = ignore_k8s_operator_configs(CONFIG_PATHS)
class AutoscalingConfigTest(unittest.TestCase):
def testValidateDefaultConfig(self):
for config_path in CONFIG_PATHS:
if "aws/example-multi-node-type.yaml" in config_path:
# aws is tested in testValidateDefaultConfigAWSMultiNodeTypes.
continue
with open(config_path) as f:
config = yaml.safe_load(f)
config = prepare_config(config)
if config["provider"]["type"] == "kubernetes":
KubernetesNodeProvider.fillout_available_node_types_resources(
config)
try:
if "aws/example-multi-node-type.yaml" in config_path:
# aws tested in testValidateDefaultConfigAWSMultiNodeTypes.
continue
with open(config_path) as f:
config = yaml.safe_load(f)
config = prepare_config(config)
if config["provider"]["type"] == "kubernetes":
KubernetesNodeProvider.\
fillout_available_node_types_resources(config)
validate_config(config)
except Exception:
logging.exception("")
self.fail(
f"Config {config_path} did not pass validation test!")
@ -232,7 +236,6 @@ class AutoscalingConfigTest(unittest.TestCase):
self.fail("Failed to validate config with security group name!")
def testMaxWorkerDefault(self):
# Load config, call prepare config, check that default max_workers
# is filled correctly for node types that don't specify it.
# Check that max_workers is untouched for node types
@ -254,7 +257,7 @@ class AutoscalingConfigTest(unittest.TestCase):
# Max workers auto-filled with specified cluster-wide value of 5.
assert config["max_workers"] ==\
prepared_node_types["worker_node_max_unspecified"]["max_workers"]\
== config["max_workers"] == 5
== 5
# Repeat with a config that doesn't specify global max workers.
# Default value of 2 should be pulled in for global max workers.
@ -275,8 +278,87 @@ class AutoscalingConfigTest(unittest.TestCase):
prepared_node_types["worker_node_max_specified"][
"max_workers"] == 3
# Max workers auto-filled with default cluster-wide value of 2.
assert prepared_node_types["worker_node_max_unspecified"][
"max_workers"] == 2
assert prepared_config["max_workers"] ==\
prepared_node_types["worker_node_max_unspecified"]["max_workers"]\
== 2
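# --- Illustrative aside (not Ray's implementation) ---
# A minimal standalone sketch of the fallback behavior asserted above: a node
# type that specifies max_workers keeps its value, while one that omits it
# inherits the cluster-wide max_workers (assumed to default to 2).
def fill_node_type_max_workers(config):
    global_max = config.setdefault("max_workers", 2)
    for node_type in config.get("available_node_types", {}).values():
        node_type.setdefault("max_workers", global_max)
    return config

example = {
    "max_workers": 5,
    "available_node_types": {
        "worker_node_max_specified": {"max_workers": 3},
        "worker_node_max_unspecified": {},
    },
}
filled = fill_node_type_max_workers(example)
assert filled["available_node_types"]["worker_node_max_specified"]["max_workers"] == 3
assert filled["available_node_types"]["worker_node_max_unspecified"]["max_workers"] == 5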
def testFillEdgeLegacyConfigs(self):
# Test edge cases: legacy configs which specify workers but not head
# or vice-versa.
no_head = load_test_config("test_no_head.yaml")
aws_defaults = _get_default_config(no_head["provider"])
head_prepared = prepare_config(no_head)
assert head_prepared["available_node_types"][
"ray-legacy-head-node-type"]["node_config"] ==\
aws_defaults["available_node_types"][
"ray.head.default"]["node_config"]
assert head_prepared["head_node"] == {}
# Custom worker config preserved
node_types = head_prepared["available_node_types"]
worker_type = node_types["ray-legacy-worker-node-type"]
assert worker_type["node_config"] == head_prepared["worker_nodes"] == {
"foo": "bar"
}
no_workers = load_test_config("test_no_workers.yaml")
workers_prepared = prepare_config(no_workers)
assert workers_prepared["available_node_types"][
"ray-legacy-worker-node-type"]["node_config"] ==\
aws_defaults["available_node_types"][
"ray.worker.default"]["node_config"]
assert workers_prepared["worker_nodes"] == {}
# Custom head config preserved
node_types = workers_prepared["available_node_types"]
head_type = node_types["ray-legacy-head-node-type"]
assert head_type["node_config"] == workers_prepared["head_node"] == {
"baz": "qux"
}
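# --- Illustrative aside (not Ray's implementation) ---
# Rough sketch of the edge-case merge checked above: a legacy config that omits
# head_node borrows the provider-default head node_config while its custom
# worker_nodes survives, and symmetrically for a config that omits worker_nodes.
# The default dicts below are hypothetical placeholders.
def fill_legacy_edge_cases(config, default_head, default_workers):
    head = config.get("head_node") or default_head
    workers = config.get("worker_nodes") or default_workers
    return {"head": head, "workers": workers}

default_head = {"InstanceType": "m5.large"}     # hypothetical provider default
default_workers = {"InstanceType": "m5.large"}  # hypothetical provider default
no_head = {"worker_nodes": {"foo": "bar"}}      # mirrors test_no_head.yaml
merged = fill_legacy_edge_cases(no_head, default_head, default_workers)
assert merged["head"] == default_head
assert merged["workers"] == {"foo": "bar"}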
@pytest.mark.skipif(
sys.platform.startswith("win"), reason="Fails on Windows.")
def testExampleFull(self):
"""
Test that example-full yamls are unmodified by prepared_config,
except possibly by having setup_commands merged.
"""
providers = ["aws", "gcp", "azure"]
for provider in providers:
path = os.path.join(RAY_PATH, "autoscaler", provider,
"example-full.yaml")
config = yaml.safe_load(open(path).read())
config_copy = copy.deepcopy(config)
merge_setup_commands(config_copy)
assert config_copy == prepare_config(config)
@pytest.mark.skipif(
sys.platform.startswith("win"), reason="Fails on Windows.")
def testLegacyYaml(self):
# Test correct default-merging behavior for legacy yamls.
providers = ["aws", "gcp", "azure"]
for provider in providers:
path = os.path.join(RAY_PATH, "autoscaler", provider,
"example-full-legacy.yaml")
legacy_config = yaml.safe_load(open(path).read())
# custom head and workers
legacy_config["head_node"] = {"blahblah": 0}
legacy_config["worker_nodes"] = {"halbhalhb": 0}
legacy_config_copy = copy.deepcopy(legacy_config)
prepared_legacy = prepare_config(legacy_config_copy)
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_HEAD]["max_workers"] == 0
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_HEAD]["min_workers"] == 0
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_HEAD]["node_config"] == legacy_config[
"head_node"]
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_WORKER]["max_workers"] == 2
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_WORKER]["min_workers"] == 0
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_WORKER]["node_config"] == legacy_config[
"worker_nodes"]
if __name__ == "__main__":

View file

@ -0,0 +1,123 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes, then the autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: True
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
# Availability zone(s), comma-separated, that nodes may be launched in.
# Nodes are currently spread between zones by a round-robin approach,
# however this implementation detail should not be relied upon.
availability_zone: us-west-2a,us-west-2b
# Whether to allow node reuse. If set to False, nodes will be terminated
# instead of stopped.
cache_stopped_nodes: True # If not present, the default is True.
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
foo: bar
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076

View file

@ -0,0 +1,124 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes, then the autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: True
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
# Availability zone(s), comma-separated, that nodes may be launched in.
# Nodes are currently spread between zones by a round-robin approach,
# however this implementation detail should not be relied upon.
availability_zone: us-west-2a,us-west-2b
# Whether to allow node reuse. If set to False, nodes will be terminated
# instead of stopped.
cache_stopped_nodes: True # If not present, the default is True.
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
baz: qux
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076

View file

@ -195,4 +195,4 @@ class KubernetesOperatorTest(unittest.TestCase):
if __name__ == "__main__":
kubernetes.config.load_kube_config()
sys.exit(pytest.main(["-v", __file__]))
sys.exit(pytest.main(["-sv", __file__]))

View file

@ -5,15 +5,16 @@ import yaml
import tempfile
import shutil
import unittest
from unittest import mock
import copy
import ray
import ray.ray_constants
from ray.autoscaler._private.util import \
rewrite_legacy_yaml_to_available_node_types, format_info_string, \
prepare_config, format_info_string, \
format_info_string_no_node_types
from ray.tests.test_autoscaler import SMALL_CLUSTER, MockProvider, \
MockProcessRunner
from ray.tests.test_autoscaler import SMALL_CLUSTER, MOCK_DEFAULT_CONFIG, \
MockProvider, MockProcessRunner
from ray.autoscaler._private.providers import (_NODE_PROVIDERS,
_clear_provider_cache)
from ray.autoscaler._private.autoscaler import StandardAutoscaler, \
@ -38,6 +39,8 @@ from ray.autoscaler._private.constants import \
from time import sleep
GET_DEFAULT_METHOD = "ray.autoscaler._private.util._get_default_config"
TYPES_A = {
"empty_node": {
"node_config": {
@ -1042,131 +1045,135 @@ def test_get_nodes_to_launch_max_launch_concurrency():
def test_rewrite_legacy_yaml_to_available_node_types():
cluster_config = copy.deepcopy(SMALL_CLUSTER) # Legacy cluster_config.
cluster_config = rewrite_legacy_yaml_to_available_node_types(
cluster_config)
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
"max_workers"] == 0
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
"min_workers"] == 0
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
"node_config"] == SMALL_CLUSTER["head_node"]
with mock.patch(GET_DEFAULT_METHOD, return_value=MOCK_DEFAULT_CONFIG):
cluster_config = copy.deepcopy(SMALL_CLUSTER) # Legacy cluster_config.
cluster_config = prepare_config(cluster_config)
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
"max_workers"] == 0
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
"min_workers"] == 0
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
"node_config"] == SMALL_CLUSTER["head_node"]
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
"node_config"] == SMALL_CLUSTER["worker_nodes"]
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
"max_workers"] == SMALL_CLUSTER["max_workers"]
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
"min_workers"] == SMALL_CLUSTER["min_workers"]
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
"node_config"] == SMALL_CLUSTER["worker_nodes"]
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
"max_workers"] == SMALL_CLUSTER["max_workers"]
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
"min_workers"] == SMALL_CLUSTER["min_workers"]
def test_handle_legacy_cluster_config_yaml():
provider = MockProvider()
head_resources = {"CPU": 8, "GPU": 1}
worker_resources = {"CPU": 32, "GPU": 8}
cluster_config = copy.deepcopy(SMALL_CLUSTER) # Legacy cluster_config.
cluster_config = rewrite_legacy_yaml_to_available_node_types(
cluster_config)
scheduler = ResourceDemandScheduler(
provider,
cluster_config["available_node_types"],
0,
head_node_type=NODE_TYPE_LEGACY_HEAD)
provider.create_node({}, {
TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_HEAD
}, 1)
head_ip = provider.non_terminated_node_ips({})[0]
head_node_id = provider.non_terminated_nodes({})[0]
to_launch = scheduler.get_nodes_to_launch([], {}, [], {}, [],
{head_ip: head_resources})
assert to_launch == {} # Should always be empty with max_workers = 0.
with mock.patch(GET_DEFAULT_METHOD, return_value=MOCK_DEFAULT_CONFIG):
provider = MockProvider()
head_resources = {"CPU": 8, "GPU": 1}
worker_resources = {"CPU": 32, "GPU": 8}
cluster_config = copy.deepcopy(SMALL_CLUSTER) # Legacy cluster_config.
cluster_config = prepare_config(cluster_config)
scheduler = ResourceDemandScheduler(
provider,
cluster_config["available_node_types"],
0,
head_node_type=NODE_TYPE_LEGACY_HEAD)
provider.create_node({}, {
TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_HEAD
}, 1)
head_ip = provider.non_terminated_node_ips({})[0]
head_node_id = provider.non_terminated_nodes({})[0]
to_launch = scheduler.get_nodes_to_launch([], {}, [], {}, [],
{head_ip: head_resources})
assert to_launch == {} # Should always be empty with max_workers = 0.
scheduler.max_workers = 30
min_workers = scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"]
scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"] = 0
to_launch = scheduler.get_nodes_to_launch([head_node_id], {}, [], {}, [],
{head_ip: head_resources})
assert to_launch == {
} # Since the resource demand does not require adding nodes.
to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
[head_resources], {}, [],
{head_ip: head_resources})
assert to_launch == {
} # Since the resource demand does not require adding nodes.
scheduler.max_workers = 30
min_workers = scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
"min_workers"]
scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"] = 0
to_launch = scheduler.get_nodes_to_launch(
[head_node_id], {}, [], {}, [], {head_ip: head_resources})
assert to_launch == {
} # Since the resource demand does not require adding nodes.
to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
[head_resources], {}, [],
{head_ip: head_resources})
assert to_launch == {
} # Since the resource demand does not require adding nodes.
scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"] = min_workers
# Returns min_workers when min_workers>0.
to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
[head_resources], {}, [],
{head_ip: head_resources})
assert to_launch == {NODE_TYPE_LEGACY_WORKER: min_workers}
scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
"min_workers"] = min_workers
# Returns min_workers when min_workers>0.
to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
[head_resources], {}, [],
{head_ip: head_resources})
assert to_launch == {NODE_TYPE_LEGACY_WORKER: min_workers}
provider.create_node({}, {
TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED,
TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_WORKER
}, min_workers)
nodes = provider.non_terminated_nodes({})
to_launch = scheduler.get_nodes_to_launch(nodes, {}, [head_resources], {},
[], {head_ip: head_resources})
assert to_launch == {} # A node is running, at some point it'll connect.
pending_launches = {NODE_TYPE_LEGACY_WORKER: 4}
to_launch = scheduler.get_nodes_to_launch([], pending_launches,
[head_resources], {}, [],
{head_ip: head_resources})
assert to_launch == {} # A node is launching, at some point it'll connect.
provider.create_node({}, {
TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED,
TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_WORKER
}, min_workers)
nodes = provider.non_terminated_nodes({})
to_launch = scheduler.get_nodes_to_launch(
nodes, {}, [head_resources], {}, [], {head_ip: head_resources})
# A node is running, at some point it'll connect.
assert to_launch == {}
pending_launches = {NODE_TYPE_LEGACY_WORKER: 4}
to_launch = scheduler.get_nodes_to_launch([], pending_launches,
[head_resources], {}, [],
{head_ip: head_resources})
# A node is launching, at some point it'll connect.
assert to_launch == {}
# Now assume that we already launched/connected the nodes.
ips = provider.non_terminated_node_ips({})
lm = LoadMetrics()
worker_ips = []
for ip in ips:
if ip == head_ip:
lm.update(ip, head_resources, head_resources, {})
else:
lm.update(ip, worker_resources, worker_resources, {})
worker_ips.append(ip)
# Now assume that we already launched/connected the nodes.
ips = provider.non_terminated_node_ips({})
lm = LoadMetrics()
worker_ips = []
for ip in ips:
if ip == head_ip:
lm.update(ip, head_resources, head_resources, {})
else:
lm.update(ip, worker_resources, worker_resources, {})
worker_ips.append(ip)
assert not scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["resources"]
to_launch = scheduler.get_nodes_to_launch(
nodes, {}, [], {}, [], lm.get_static_node_resources_by_ip())
assert scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
"resources"] == worker_resources
assert to_launch == {}
utilizations = {ip: worker_resources for ip in worker_ips}
utilizations[head_ip] = head_resources
# Requires 4 nodes since worker resources are bigger than head resources.
demands = [worker_resources] * (len(utilizations) + 3)
to_launch = scheduler.get_nodes_to_launch(
nodes, {}, demands, utilizations, [],
lm.get_static_node_resources_by_ip())
# 4 nodes are necessary to meet resource demand, but we never exceed
# max_workers.
assert to_launch == {}
scheduler.max_workers = 10
to_launch = scheduler.get_nodes_to_launch(
nodes, {}, demands, utilizations, [],
lm.get_static_node_resources_by_ip())
# 4 nodes are necessary to meet resource demand, but we never exceed
# max_workers.
assert to_launch == {}
scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["max_workers"] = 10
to_launch = scheduler.get_nodes_to_launch(
nodes, {}, demands, utilizations, [],
lm.get_static_node_resources_by_ip())
# 4 nodes are necessary to meet resource demand.
assert to_launch == {NODE_TYPE_LEGACY_WORKER: 4}
to_launch = scheduler.get_nodes_to_launch(nodes, pending_launches, demands,
utilizations, [],
lm.get_node_resources())
# 0 because there are 4 pending launches and we only need 4.
assert to_launch == {}
to_launch = scheduler.get_nodes_to_launch(nodes, pending_launches,
demands * 2, utilizations, [],
lm.get_node_resources())
# 1 because there are 4 pending launches and we only allow a max of 5.
assert to_launch == {NODE_TYPE_LEGACY_WORKER: 1}
assert not scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["resources"]
to_launch = scheduler.get_nodes_to_launch(
nodes, {}, [], {}, [], lm.get_static_node_resources_by_ip())
assert scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
"resources"] == worker_resources
assert to_launch == {}
utilizations = {ip: worker_resources for ip in worker_ips}
utilizations[head_ip] = head_resources
# Needs 4 nodes since worker resources are bigger than head resources.
demands = [worker_resources] * (len(utilizations) + 3)
to_launch = scheduler.get_nodes_to_launch(
nodes, {}, demands, utilizations, [],
lm.get_static_node_resources_by_ip())
# 4 nodes are necessary to meet resource demand, but we never exceed
# max_workers.
assert to_launch == {}
scheduler.max_workers = 10
to_launch = scheduler.get_nodes_to_launch(
nodes, {}, demands, utilizations, [],
lm.get_static_node_resources_by_ip())
# 4 nodes are necessary to meet resource demand, but we never exceed
# max_workers.
assert to_launch == {}
scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["max_workers"] = 10
to_launch = scheduler.get_nodes_to_launch(
nodes, {}, demands, utilizations, [],
lm.get_static_node_resources_by_ip())
# 4 nodes are necessary to meet resource demand.
assert to_launch == {NODE_TYPE_LEGACY_WORKER: 4}
to_launch = scheduler.get_nodes_to_launch(nodes, pending_launches,
demands, utilizations, [],
lm.get_node_resources())
# 0 because there are 4 pending launches and we only need 4.
assert to_launch == {}
to_launch = scheduler.get_nodes_to_launch(
nodes, pending_launches, demands * 2, utilizations, [],
lm.get_node_resources())
# 1 because there are 4 pending launches and we only allow a max of 5.
assert to_launch == {NODE_TYPE_LEGACY_WORKER: 1}
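# --- Illustrative aside (a rough sketch, not Ray's scheduler) ---
# The arithmetic behind the last two assertions: new launches per node type are
# limited by a launch-concurrency cap (assumed to be 5 here), and launches that
# are already pending count against both the demand and the cap.
def new_launches(needed, pending, launch_cap=5):
    return max(min(needed, launch_cap) - pending, 0)

assert new_launches(needed=4, pending=4) == 0  # pending launches already cover the demand
assert new_launches(needed=8, pending=4) == 1  # cap of 5 minus the 4 already pending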
class LoadMetricsTest(unittest.TestCase):