[autoscaler][interface] Use multi node types in defaults.yaml and example-full.yaml (#14239)

* random doc typo

* example-full-multi

* left off max workers

* wip

* address comments, modify defaults, wip

* fix

* wip

* reformat more things

* undo useless diff

* space

* max workers

* space

* copy-paste mishaps

* space

* More copy-paste mishaps

* copy-paste issues, space, max_workers

* head_node_type

* legacy yamls

* line undeleted

* correct-gpu

* Remove redundant GPU example.

* Extraneous comment

* whitespace

* example-java.yaml

* Revert "example-java.yaml"

This reverts commit 1e9c0124b9d97e651aaeeb6ec5bf7a4ef2a2df17.

* tests and other things

* doc

* doc

* revert max worker default

* Kubernetes comment

* wip

* wip

* tweak

* Address comments

* test_resource_demand_scheduler fixes

* Head type min/max workers, aws resources

* fix example_cluster2.yaml

* Fix external node type test (compatibility with legacy-style external node types)

* fix test_autoscaler_aws

* gcp-images

* gcp node type names

* fix gcp defaults

* doc format

* typo

* Skip failed Windows tests

* doc string and comment

* assert

* remove contents of default external head and worker

* legacy external failed validation test

* Readability -- define the minimal external config at the top of the file.

* Remove default worker type min worker

* Remove extraneous global min_workers comment.

* per-node-type docker in aws/example-gpu-docker

* ray.worker.small -> ray.worker.default

* fix-docker

* fix gpu docker again

* undo kubernetes experiment

* fix doc

* remove worker max_worker from kubernetes

* remove max_worker from local worker node type

* fix doc again

* py38

* eric-comment

* fix cluster name

* fix-test-autoscaler

* legacy config logic

* pop resources

* Remove min_workers AFTER merge

* comment, warning message

* warning, comment
Dmitri Gekhtman 2021-03-02 20:16:19 -08:00 committed by GitHub
parent ef873be9e8
commit 1675156a8b
32 changed files with 1774 additions and 715 deletions

View file

@@ -341,14 +341,13 @@ The key is the name of the node type, which is just for debugging purposes.
     resources: {"CPU": 2}
     min_workers: 0
     max_workers: 0
-  ray.worker.small:
+  ray.worker.default:
     node_config:
       InstanceType: m5.large
       InstanceMarketOptions:
         MarketType: spot
     resources: {"CPU": 2}
     min_workers: 0
-    max_workers: 1

 .. _cluster-configuration-head-node-type:

@@ -1073,12 +1072,12 @@ Minimal configuration
            :language: yaml

    .. group-tab:: Azure

        .. literalinclude:: ../../../python/ray/autoscaler/azure/example-minimal.yaml
            :language: yaml

    .. group-tab:: GCP

        .. literalinclude:: ../../../python/ray/autoscaler/gcp/example-minimal.yaml
            :language: yaml

@@ -1092,11 +1091,11 @@ Full configuration
            :language: yaml

    .. group-tab:: Azure

        .. literalinclude:: ../../../python/ray/autoscaler/azure/example-full.yaml
            :language: yaml

    .. group-tab:: GCP

        .. literalinclude:: ../../../python/ray/autoscaler/gcp/example-full.yaml
            :language: yaml
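For orientation while reading the diffs below, here is a minimal sketch, in Python, of the multi-node-type structure the updated docs describe; the node-type names and instance types are illustrative rather than the shipped defaults.

# Illustrative sketch only: the shape of a multi-node-type cluster config.
cluster_config = {
    "cluster_name": "example",
    "max_workers": 2,
    "available_node_types": {
        "ray.head.default": {
            "node_config": {"InstanceType": "m5.large"},
            "resources": {"CPU": 2},
            "min_workers": 0,
            "max_workers": 0,
        },
        "ray.worker.default": {
            "node_config": {
                "InstanceType": "m5.large",
                "InstanceMarketOptions": {"MarketType": "spot"},
            },
            "resources": {"CPU": 2},
            "min_workers": 0,
        },
    },
    # The head node must reference one of the node types defined above.
    "head_node_type": "ray.head.default",
}

assert cluster_config["head_node_type"] in cluster_config["available_node_types"]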

View file

@@ -71,8 +71,14 @@ def fillout_resources_kubernetes(config):
         return config
     node_types = copy.deepcopy(config["available_node_types"])
     for node_type in node_types:
-        container_data = node_types[node_type]["node_config"]["spec"][
-            "containers"][0]
+        node_config = node_types[node_type]["node_config"]
+        # The next line is for compatibility with configs like
+        # kubernetes/example-ingress.yaml,
+        # cf. KubernetesNodeProvider.create_node().
+        pod = node_config.get("pod", node_config)
+        container_data = pod["spec"]["containers"][0]
         autodetected_resources = get_autodetected_resources(container_data)
         if "resources" not in config["available_node_types"][node_type]:
             config["available_node_types"][node_type]["resources"] = {}

View file

@@ -1,3 +1,4 @@
+import copy
 import importlib
 import logging
 import json
@@ -11,6 +12,17 @@ logger = logging.getLogger(__name__)
 # For caching provider instantiations across API calls of one python session
 _provider_instances = {}

+# Minimal config for compatibility with legacy-style external configs.
+MINIMAL_EXTERNAL_CONFIG = {
+    "available_node_types": {
+        "ray.head.default": {},
+        "ray.worker.default": {},
+    },
+    "head_node_type": "ray.head.default",
+    "head_node": {},
+    "worker_nodes": {},
+}
+

 def _import_aws(provider_config):
     from ray.autoscaler._private.aws.node_provider import AWSNodeProvider
@@ -192,7 +204,7 @@ def _get_default_config(provider_config):
     package outside the autoscaler.
     """
     if provider_config["type"] == "external":
-        return {}
+        return copy.deepcopy(MINIMAL_EXTERNAL_CONFIG)
     load_config = _DEFAULT_CONFIGS.get(provider_config["type"])
     if load_config is None:
         raise NotImplementedError("Unsupported node provider: {}".format(
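A brief sketch of why the external-provider default is returned as a deep copy; the constant is reproduced from the diff above and the mutation below is hypothetical.

import copy

MINIMAL_EXTERNAL_CONFIG = {
    "available_node_types": {
        "ray.head.default": {},
        "ray.worker.default": {},
    },
    "head_node_type": "ray.head.default",
    "head_node": {},
    "worker_nodes": {},
}

def get_default_external_config():
    # Returning a deep copy means later merging/mutation of the defaults
    # (e.g. filling in node_config) cannot corrupt the module-level constant.
    return copy.deepcopy(MINIMAL_EXTERNAL_CONFIG)

cfg = get_default_external_config()
cfg["available_node_types"]["ray.head.default"]["node_config"] = {"kind": "head"}
assert MINIMAL_EXTERNAL_CONFIG["available_node_types"]["ray.head.default"] == {}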

View file

@@ -1,4 +1,5 @@
 import collections
+import copy
 from datetime import datetime
 import logging
 import hashlib
@@ -103,38 +104,91 @@ def prepare_config(config):
     return with_defaults


-def rewrite_legacy_yaml_to_available_node_types(
-        config: Dict[str, Any]) -> Dict[str, Any]:
-
-    if "available_node_types" not in config:
-        # TODO(ameer/ekl/alex): we can also rewrite here many other fields
-        # that include initialization/setup/start commands and ImageId.
-        logger.debug("Converting legacy cluster config to multi node types.")
-        config["available_node_types"] = {
-            NODE_TYPE_LEGACY_HEAD: {
-                "node_config": config["head_node"],
-                "resources": config["head_node"].get("resources") or {},
-                "min_workers": 0,
-                "max_workers": 0,
-            },
-            NODE_TYPE_LEGACY_WORKER: {
-                "node_config": config["worker_nodes"],
-                "resources": config["worker_nodes"].get("resources") or {},
-                "min_workers": config.get("min_workers", 0),
-                "max_workers": config.get("max_workers", 0),
-            },
-        }
-        config["head_node_type"] = NODE_TYPE_LEGACY_HEAD
-    del config["min_workers"]
-    return config
-
-
 def fillout_defaults(config: Dict[str, Any]) -> Dict[str, Any]:
     defaults = _get_default_config(config["provider"])
     defaults.update(config)
-    defaults["auth"] = defaults.get("auth", {})
-    defaults = rewrite_legacy_yaml_to_available_node_types(defaults)
-    return defaults
+
+    # Just for clarity:
+    merged_config = copy.deepcopy(defaults)
+
+    # Fill auth field to avoid key errors.
+    # This field is accessed when calling NodeUpdater but is not relevant to
+    # certain node providers and is thus left out of some cluster launching
+    # configs.
+    merged_config["auth"] = merged_config.get("auth", {})
+
+    # A legacy config is one which doesn't have available_node_types,
+    # but has at least one of head_node or worker_nodes.
+    is_legacy_config = (("available_node_types" not in config) and
+                        ("head_node" in config or "worker_nodes" in config))
+    # Do merging logic for legacy configs.
+    if is_legacy_config:
+        merged_config = merge_legacy_yaml_with_defaults(merged_config)
+    # Take care of this here, in case a config does not specify any of head,
+    # workers, node types, but does specify min workers:
+    merged_config.pop("min_workers", None)
+
+    return merged_config
+
+
+def merge_legacy_yaml_with_defaults(
+        merged_config: Dict[str, Any]) -> Dict[str, Any]:
+    """Rewrite legacy config's available node types after it has been merged
+    with defaults yaml.
+    """
+    logger.warning("Converting legacy cluster config to multi node types.\n"
+                   "Refer to the docs for examples of multi-node-type "
+                   "autoscaling:\n"
+                   "https://docs.ray.io/en/master/cluster/config.html"
+                   "#full-configuration")
+
+    # Get default head and worker types.
+    default_head_type = merged_config["head_node_type"]
+    # Default configs are assumed to have two node types -- one for the head
+    # and one for the workers.
+    assert len(merged_config["available_node_types"].keys()) == 2
+    default_worker_type = (merged_config["available_node_types"].keys() -
+                           {default_head_type}).pop()
+
+    if merged_config["head_node"]:
+        # User specified a head node in legacy config.
+        # Convert it into data for the head's node type.
+        head_node_info = {
+            "node_config": merged_config["head_node"],
+            "resources": merged_config["head_node"].get("resources") or {},
+            "min_workers": 0,
+            "max_workers": 0,
+        }
+    else:
+        # Use default data for the head's node type.
+        head_node_info = merged_config["available_node_types"][
+            default_head_type]
+    if merged_config["worker_nodes"]:
+        # User specified a worker node in legacy config.
+        # Convert it into data for the workers' node type.
+        worker_node_info = {
+            "node_config": merged_config["worker_nodes"],
+            "resources": merged_config["worker_nodes"].get("resources") or {},
+            "min_workers": merged_config.get("min_workers", 0),
+            "max_workers": merged_config["max_workers"],
+        }
+    else:
+        # Use default data for the workers' node type.
+        worker_node_info = merged_config["available_node_types"][
+            default_worker_type]
+
+    # Rewrite available_node_types.
+    merged_config["available_node_types"] = {
+        NODE_TYPE_LEGACY_HEAD: head_node_info,
+        NODE_TYPE_LEGACY_WORKER: worker_node_info
+    }
+    merged_config["head_node_type"] = NODE_TYPE_LEGACY_HEAD
+
+    # Resources field in head/worker fields cause node launch to fail.
+    merged_config["head_node"].pop("resources", None)
+    merged_config["worker_nodes"].pop("resources", None)
+
+    return merged_config


 def merge_setup_commands(config):
@@ -147,7 +201,6 @@ def merge_setup_commands(config):

 def fill_node_type_max_workers(config):
     """Sets default per-node max workers to global max_workers.
-
     This equivalent to setting the default per-node max workers to infinity,
     with the only upper constraint coming from the global max_workers.
     """

View file

@@ -1,12 +1,8 @@
 # An unique identifier for the head node and workers of this cluster.
 cluster_name: default

-# The minimum number of workers nodes to launch in addition to the head
-# node. This number should be >= 0.
-min_workers: 0
-
 # The maximum number of workers nodes to launch in addition to the head
-# node. This takes precedence over min_workers.
+# node.
 max_workers: 2

 # The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -43,38 +39,63 @@ auth:
 # configurations below.
 # ssh_private_key: /path/to/your/key.pem

-# Provider-specific config for the head node, e.g. instance type. By default
-# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
-# For more documentation on available fields, see:
-# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
-head_node:
-    InstanceType: m5.large
-    ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
-
-    # You can provision additional disk space with a conf as follows
-    BlockDeviceMappings:
-        - DeviceName: /dev/sda1
-          Ebs:
-              VolumeSize: 100
-    # Additional options in the boto docs.
-
-# Provider-specific config for worker nodes, e.g. instance type. By default
-# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
-# For more documentation on available fields, see:
-# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
-worker_nodes:
-    InstanceType: m5.large
-    ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
-    # Run workers on spot by default. Comment this out to use on-demand.
-    InstanceMarketOptions:
-        MarketType: spot
-        # Additional options can be found in the boto docs, e.g.
-        #   SpotOptions:
-        #       MaxPrice: MAX_HOURLY_PRICE
-    # Additional options in the boto docs.
+# Tell the autoscaler the allowed node types and the resources they provide.
+# The key is the name of the node type, which is just for debugging purposes.
+# The node config specifies the launch config and physical instance type.
+available_node_types:
+    ray.head.default:
+        # The minimum number of worker nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The maximum number of worker nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 0
+        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
+        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
+        # You can also set custom resources.
+        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
+        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
+        resources: {}
+        # Provider-specific config for this node type, e.g. instance type. By default
+        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+        # For more documentation on available fields, see:
+        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+        node_config:
+            InstanceType: m5.large
+            ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
+            # You can provision additional disk space with a conf as follows
+            BlockDeviceMappings:
+                - DeviceName: /dev/sda1
+                  Ebs:
+                      VolumeSize: 100
+            # Additional options in the boto docs.
+    ray.worker.default:
+        # The minimum number of nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
+        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
+        # You can also set custom resources.
+        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
+        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
+        resources: {}
+        # Provider-specific config for this node type, e.g. instance type. By default
+        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+        # For more documentation on available fields, see:
+        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+        node_config:
+            InstanceType: m5.large
+            ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
+            # Run workers on spot by default. Comment this out to use on-demand.
+            InstanceMarketOptions:
+                MarketType: spot
+                # Additional options can be found in the boto docs, e.g.
+                #   SpotOptions:
+                #       MaxPrice: MAX_HOURLY_PRICE
+            # Additional options in the boto docs.
+
+# Specify the node type of the head node (as configured above).
+head_node_type: ray.head.default

 # Files or directories to copy to the head and worker nodes. The format is a
 # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@@ -108,15 +129,8 @@ initialization_commands: []

 # List of shell commands to run to set up nodes.
 setup_commands:
-    # Note: if you're developing Ray, you probably want to create an AMI that
-    # has your Ray repo pre-cloned. Then, you can replace the pip installs
-    # below with a git checkout <your_sha> (and possibly a recompile).
-    - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
-    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
-    # Consider uncommenting these if you also want to run apt-get commands during setup
-    # - sudo pkill -9 apt-get || true
-    # - sudo pkill -9 dpkg || true
-    # - sudo dpkg --configure -a
+    - echo 'export PATH="$HOME/anaconda3/envs/tensorflow2_latest_p37/bin:$PATH"' >> ~/.bashrc
+    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl

 # Custom commands that will be run on the head node after common setup.
 head_setup_commands:
@@ -134,3 +148,6 @@ head_start_ray_commands:
 worker_start_ray_commands:
     - ray stop
     - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
+
+head_node: {}
+worker_nodes: {}
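The per-node-type resources override documented above feeds into Ray's scheduler; as a hedged illustration (the resource name and quantities are made up, and ray.init is used as a local stand-in for a node advertising them), a task can target such a custom resource like this:

import ray

# Suppose a worker node type advertised
#   resources: {"CPU": 1, "GPU": 1, "custom": 5}
# in its cluster config. Tasks can then request that custom resource.
ray.init(resources={"custom": 5})  # Local stand-in for such a node.

@ray.remote(resources={"custom": 1})
def uses_custom_resource():
    return "scheduled on a node advertising the 'custom' resource"

print(ray.get(uses_custom_resource.remote()))
ray.shutdown()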

View file

@ -0,0 +1,148 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: True
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
# Availability zone(s), comma-separated, that nodes may be launched in.
# Nodes are currently spread between zones by a round-robin approach,
# however this implementation detail should not be relied upon.
availability_zone: us-west-2a,us-west-2b
# Whether to allow node reuse. If set to False, nodes will be terminated
# instead of stopped.
cache_stopped_nodes: True # If not present, the default is True.
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076

View file

@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster. # An unique identifier for the head node and workers of this cluster.
cluster_name: default cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head # The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers. # node.
max_workers: 2 max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed. # The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -57,38 +53,66 @@ auth:
# configurations below. # configurations below.
# ssh_private_key: /path/to/your/key.pem # ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default # Tell the autoscaler the allowed node types and the resources they provide.
# Ray will auto-configure unspecified fields such as SubnetId and KeyName. # The key is the name of the node type, which is just for debugging purposes.
# For more documentation on available fields, see: # The node config specifies the launch config and physical instance type.
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances available_node_types:
head_node: ray.head.default:
InstanceType: m5.large # The minimum number of worker nodes of this type to launch.
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30 # This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
ray.worker.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 2
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# You can provision additional disk space with a conf as follows # Specify the node type of the head node (as configured above).
BlockDeviceMappings: head_node_type: ray.head.default
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# Files or directories to copy to the head and worker nodes. The format is a # Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -146,3 +170,6 @@ head_start_ray_commands:
worker_start_ray_commands: worker_start_ray_commands:
- ray stop - ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
head_node: {}
worker_nodes: {}

View file

@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster. # An unique identifier for the head node and workers of this cluster.
cluster_name: gpu-docker cluster_name: gpu-docker
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head # The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers. # node.
max_workers: 2 max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed. # The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -23,10 +19,6 @@ docker:
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull # image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_nvidia_docker" # e.g. ray_docker container_name: "ray_nvidia_docker" # e.g. ray_docker
# # Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# worker_image: "rayproject/ray-ml:latest"
# If a node is idle for this many minutes, it will be removed. # If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5 idle_timeout_minutes: 5
@ -48,38 +40,74 @@ auth:
# configurations below. # configurations below.
# ssh_private_key: /path/to/your/key.pem # ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default # Tell the autoscaler the allowed node types and the resources they provide.
# Ray will auto-configure unspecified fields such as SubnetId and KeyName. # The key is the name of the node type, which is just for debugging purposes.
# For more documentation on available fields, see: # The node config specifies the launch config and physical instance type.
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances available_node_types:
head_node: # GPU head node.
InstanceType: p2.xlarge ray.head.gpu:
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30 # worker_image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: p2.xlarge
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# CPU workers.
ray.worker.default:
# Override global docker setting.
# This node type will run a CPU image,
# rather than the GPU image specified in the global docker settings.
docker:
worker_image: "rayproject/ray-ml:latest-cpu"
# The minimum number of nodes of this type to launch.
# This number should be >= 0.
min_workers: 1
# The maximum number of workers nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 2
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# You can provision additional disk space with a conf as follows # Specify the node type of the head node (as configured above).
BlockDeviceMappings: head_node_type: ray.head.gpu
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# Files or directories to copy to the head and worker nodes. The format is a # Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
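The gpu-docker example above introduces a per-node-type docker override so CPU workers can pull a CPU image while the head runs a GPU image; a minimal sketch of that shape as the autoscaler sees it (image tags and node-type names mirror the example, everything else is illustrative):

# Sketch: a GPU head with CPU workers, expressed as the dict the autoscaler
# reads from YAML. The worker type overrides the global docker image.
gpu_docker_config = {
    "docker": {"image": "rayproject/ray-ml:latest-gpu",
               "container_name": "ray_nvidia_docker"},
    "available_node_types": {
        "ray.head.gpu": {
            "min_workers": 0,
            "max_workers": 0,
            "resources": {},
            "node_config": {"InstanceType": "p2.xlarge"},
        },
        "ray.worker.default": {
            # Per-node-type override: these workers pull a CPU image instead
            # of the global GPU image.
            "docker": {"worker_image": "rayproject/ray-ml:latest-cpu"},
            "min_workers": 1,
            "max_workers": 2,
            "resources": {},
            "node_config": {"InstanceType": "m5.large"},
        },
    },
    "head_node_type": "ray.head.gpu",
}

assert "docker" in gpu_docker_config["available_node_types"]["ray.worker.default"]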

View file

@@ -2,7 +2,7 @@
 cluster_name: minimal

 # The maximum number of workers nodes to launch in addition to the head
-# node. This takes precedence over min_workers. min_workers default to 0.
+# node. min_workers default to 0.
 max_workers: 1

 # Cloud-provider specific configuration.

View file

@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster. # An unique identifier for the head node and workers of this cluster.
cluster_name: default cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head # The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers. # node.
max_workers: 2 max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed. # The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -17,7 +13,7 @@ upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container, # This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster. # and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled. # Empty object means disabled.
docker: {} docker: {}
# If a node is idle for this many minutes, it will be removed. # If a node is idle for this many minutes, it will be removed.
@ -46,30 +42,52 @@ auth:
# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs # Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below # on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
# Provider-specific config for the head node, e.g. instance type. # Tell the autoscaler the allowed node types and the resources they provide.
head_node: # The key is the name of the node type, which is just for debugging purposes.
azure_arm_parameters: # The node config specifies the launch config and physical instance type.
vmSize: Standard_D2s_v3 available_node_types:
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage ray.head.default:
imagePublisher: microsoft-dsvm # The minimum number of worker nodes of this type to launch.
imageOffer: ubuntu-1804 # This number should be >= 0.
imageSku: 1804-gen2 min_workers: 0
imageVersion: 20.07.06 # The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config, e.g. instance type.
node_config:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# Provider-specific config for worker nodes, e.g. instance type. ray.worker.default:
worker_nodes: # The minimum number of nodes of this type to launch.
azure_arm_parameters: # This number should be >= 0.
vmSize: Standard_D2s_v3 min_workers: 0
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage # The resources provided by this node type.
imagePublisher: microsoft-dsvm resources: {"CPU": 2}
imageOffer: ubuntu-1804 # Provider-specific config, e.g. instance type.
imageSku: 1804-gen2 node_config:
imageVersion: 20.07.06 azure_arm_parameters:
# optionally set priority to use Spot instances vmSize: Standard_D2s_v3
priority: Spot # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
# set a maximum price for spot instances if desired imagePublisher: microsoft-dsvm
# billingProfile: imageOffer: ubuntu-1804
# maxPrice: -1 imageSku: 1804-gen2
imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default
# Files or directories to copy to the head and worker nodes. The format is a # Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -134,3 +152,6 @@ head_start_ray_commands:
worker_start_ray_commands: worker_start_ray_commands:
- ray stop - ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
head_node: {}
worker_nodes: {}

View file

@ -19,18 +19,20 @@ upscaling_speed: 1.0
# and opens all the necessary ports to support the Ray cluster. # and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled. # Empty string means disabled.
docker: docker:
image: "rayproject/ray-ml:latest-gpu" image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull # image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_docker" container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present. # if no cached version is present.
pull_before_run: False pull_before_run: True
run_options: [] # Extra options to pass into "docker run" run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers # Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu" # head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest" # worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed. # If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5 idle_timeout_minutes: 5
@ -42,7 +44,7 @@ provider:
location: westus2 location: westus2
resource_group: ray-cluster resource_group: ray-cluster
# set subscription id otherwise the default from az cli will be used # set subscription id otherwise the default from az cli will be used
# subscription_id: 00000000-0000-0000-0000-000000000000 # subscription_id: 00000000-0000-0000-0000-000000000000
# How Ray will authenticate with newly launched nodes. # How Ray will authenticate with newly launched nodes.
auth: auth:
@ -53,27 +55,35 @@ auth:
# changes to this should match what is specified in file_mounts # changes to this should match what is specified in file_mounts
ssh_public_key: ~/.ssh/id_rsa.pub ssh_public_key: ~/.ssh/id_rsa.pub
# Provider-specific config for the head node, e.g. instance type. By default # More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
# Ray will auto-configure unspecified fields using defaults.yaml # See documentation here: https://docs.microsoft.com/en-us/azure/templates/microsoft.compute/2019-03-01/virtualmachines
# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
# Provider-specific config for the head node, e.g. instance type.
head_node: head_node:
azure_arm_parameters: azure_arm_parameters:
vmSize: Standard_NC6 vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804 imageOffer: ubuntu-1804
imageSku: "1804" imageSku: 1804-gen2
imageVersion: 20.07.06 imageVersion: 20.07.06
# Provider-specific config for worker nodes, e.g. instance type. By default # Provider-specific config for worker nodes, e.g. instance type.
# Ray will auto-configure unspecified fields using defaults.yaml
worker_nodes: worker_nodes:
azure_arm_parameters: azure_arm_parameters:
vmSize: Standard_NC6 vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804 imageOffer: ubuntu-1804
imageSku: "1804" imageSku: 1804-gen2
imageVersion: 20.07.06 imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
# Files or directories to copy to the head and worker nodes. The format is a # Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -83,6 +93,27 @@ file_mounts: {
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub" "/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
} }
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is # List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker # enabled, these commands will run outside the container and before docker
# is setup. # is setup.
@ -92,20 +123,16 @@ initialization_commands:
# List of shell commands to run to set up nodes. # List of shell commands to run to set up nodes.
setup_commands: setup_commands:
# Note: if you're developing Ray, you probably want to create an AMI that # Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs # has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile). # below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc - echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
- echo 'conda activate py37_tensorflow' >> ~/.bashrc - echo 'conda activate py37_tensorflow' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Consider uncommenting these if you also want to run apt-get commands during setup
# - sudo pkill -9 apt-get || true
# - sudo pkill -9 dpkg || true
# - sudo dpkg --configure -a
# Custom commands that will be run on the head node after common setup. # Custom commands that will be run on the head node after common setup.
head_setup_commands: head_setup_commands:
- pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0 - pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0
# Custom commands that will be run on worker nodes after common setup. # Custom commands that will be run on worker nodes after common setup.

View file

@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster. # An unique identifier for the head node and workers of this cluster.
cluster_name: default cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head # The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers. # node.
max_workers: 2 max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed. # The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -17,7 +13,7 @@ upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container, # This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster. # and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled. # Empty object means disabled.
docker: docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull # image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
@ -60,30 +56,55 @@ auth:
# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs # Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below # on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
# Provider-specific config for the head node, e.g. instance type. # Tell the autoscaler the allowed node types and the resources they provide.
head_node: # The key is the name of the node type, which is just for debugging purposes.
azure_arm_parameters: # The node config specifies the launch config and physical instance type.
vmSize: Standard_D2s_v3 available_node_types:
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage ray.head.default:
imagePublisher: microsoft-dsvm # The minimum number of worker nodes of this type to launch.
imageOffer: ubuntu-1804 # This number should be >= 0.
imageSku: 1804-gen2 min_workers: 0
imageVersion: 20.07.06 # The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config, e.g. instance type.
node_config:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# Provider-specific config for worker nodes, e.g. instance type. ray.worker.default:
worker_nodes: # The minimum number of worker nodes of this type to launch.
azure_arm_parameters: # This number should be >= 0.
vmSize: Standard_D2s_v3 min_workers: 0
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage # The maximum number of worker nodes of this type to launch.
imagePublisher: microsoft-dsvm # This takes precedence over min_workers.
imageOffer: ubuntu-1804 max_workers: 2
imageSku: 1804-gen2 # The resources provided by this node type.
imageVersion: 20.07.06 resources: {"CPU": 2}
# optionally set priority to use Spot instances # Provider-specific config, e.g. instance type.
priority: Spot node_config:
# set a maximum price for spot instances if desired azure_arm_parameters:
# billingProfile: vmSize: Standard_D2s_v3
# maxPrice: -1 # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default
# Files or directories to copy to the head and worker nodes. The format is a # Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -147,3 +168,6 @@ head_start_ray_commands:
worker_start_ray_commands: worker_start_ray_commands:
- ray stop - ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
head_node: {}
worker_nodes: {}

View file

@@ -1,12 +1,8 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: gpu-docker

# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -21,7 +17,7 @@ upscaling_speed: 1.0
docker:
    image: "rayproject/ray-ml:latest-gpu"
    # image: rayproject/ray:latest-gpu   # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_nvidia_docker"

    # # Example of running a GPU head with CPU workers
    # head_image: "rayproject/ray-ml:latest-gpu"

@@ -45,17 +41,40 @@ auth:
    # changes to this should match what is specified in file_mounts
    ssh_public_key: ~/.ssh/id_rsa.pub

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.gpu:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 6, "GPU": 1}
        # Provider-specific config, e.g. instance type.
        node_config:
            azure_arm_parameters:
                vmSize: Standard_NC6_v3
    ray.worker.gpu:
        # The minimum number of nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 2
        # The resources provided by this node type.
        resources: {"CPU": 6, "GPU": 1}
        # Provider-specific config, e.g. instance type.
        node_config:
            azure_arm_parameters:
                vmSize: Standard_NC6_v3

# Specify the node type of the head node (as configured above).
head_node_type: ray.head.gpu

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

@@ -69,7 +88,7 @@ file_mounts: {
# NOTE: rayproject/ray-ml:latest has ray latest bundled
setup_commands: []
    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl

# Custom commands that will be run on the head node after common setup.
head_setup_commands:
    - pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0
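The pattern above -- a dict of named node types plus a head_node_type pointer -- is the core of the new schema. A quick illustrative sanity check in Python (assuming the YAML above is saved as example-gpu-docker.yaml and PyYAML is installed; this snippet is not part of the Ray codebase):

import yaml

config = yaml.safe_load(open("example-gpu-docker.yaml"))

# The head type must be one of the declared node types...
assert config["head_node_type"] in config["available_node_types"]

# ...and each node type pairs logical resources (what the autoscaler
# schedules against) with a provider-specific launch config.
for name, node_type in config["available_node_types"].items():
    print(name, node_type.get("resources"), node_type["node_config"])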

@@ -2,7 +2,7 @@
cluster_name: minimal

# The maximum number of worker nodes to launch in addition to the head
# node. min_workers defaults to 0.
max_workers: 1

# Cloud-provider specific configuration.

@@ -1,12 +1,8 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default

# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -39,50 +35,75 @@ auth:
# project wide meta-data.
# ssh_private_key: /path/to/your/key.pem

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray_head_default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config for this node type, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
        # For more documentation on available fields, see:
        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
        node_config:
            machineType: n1-standard-2
            disks:
              - boot: true
                autoDelete: true
                type: PERSISTENT
                initializeParams:
                  diskSizeGb: 50
                  # See https://cloud.google.com/compute/docs/images for more images
                  sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
            # Additional options can be found in the compute docs at
            # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
            # If the network interface is specified as below in both head and worker
            # nodes, the manual network config is used. Otherwise an existing subnet is
            # used. To use a shared subnet, ask the subnet owner to grant permission
            # for 'compute.subnetworks.use' to the ray autoscaler account...
            # networkInterfaces:
            #   - kind: compute#networkInterface
            #     subnetwork: path/to/subnet
            #     aliasIpRanges: []
    ray_worker_small:
        # The minimum number of nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config for this node type, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
        # For more documentation on available fields, see:
        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
        node_config:
            machineType: n1-standard-2
            disks:
              - boot: true
                autoDelete: true
                type: PERSISTENT
                initializeParams:
                  diskSizeGb: 50
                  # See https://cloud.google.com/compute/docs/images for more images
                  sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
            # Run workers on preemptible instances by default.
            # Comment this out to use on-demand.
            scheduling:
              - preemptible: true
            # Additional options can be found in the compute docs at
            # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert

# Specify the node type of the head node (as configured above).
head_node_type: ray_head_default

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

@@ -159,3 +180,6 @@ worker_start_ray_commands:
      ray start
      --address=$RAY_HEAD_IP:6379
      --object-manager-port=8076

head_node: {}
worker_nodes: {}
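With resources declared per node type, the largest cluster this file can reach follows directly from the values above (the head type is capped at max_workers: 0, and the worker type is limited only by the cluster-wide max_workers of 2). A small worked example:

head_cpus = 2      # ray_head_default resources: {"CPU": 2}
worker_cpus = 2    # ray_worker_small resources: {"CPU": 2}
max_workers = 2    # cluster-wide cap; ray_worker_small sets no per-type cap
print("CPUs at full scale:", head_cpus + max_workers * worker_cpus)  # -> 6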

@@ -0,0 +1,167 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default

# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0

# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: True
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: gcp
region: us-west1
availability_zone: us-west1-a
project_id: null # Globally unique project id
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below. This requires that you have added the key into the
# project wide meta-data.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
head_node:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Additional options can be found in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# If the network interface is specified as below in both head and worker
# nodes, the manual network config is used. Otherwise an existing subnet is
# used. To use a shared subnet, ask the subnet owner to grant permission
# for 'compute.subnetworks.use' to the ray autoscaler account...
# networkInterfaces:
# - kind: compute#networkInterface
# subnetwork: path/to/subnet
# aliasIpRanges: []
worker_nodes:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Run workers on preemptible instances by default.
# Comment this out to use on-demand.
scheduling:
- preemptible: true
# Additional options can be found in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
- pip install google-api-python-client==1.7.8
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- >-
ulimit -n 65536;
ray start
--head
--port=6379
--object-manager-port=8076
--autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- >-
ulimit -n 65536;
ray start
--address=$RAY_HEAD_IP:6379
--object-manager-port=8076
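The upscaling_speed comment near the top of this file describes scale-up in chunks proportional to the nodes already running. A rough, hypothetical sketch of that rule (the ceiling and the one-node floor are assumptions for illustration, not the exact autoscaler implementation):

import math

def upscale_chunk(currently_running_nodes, upscaling_speed=1.0):
    # "chunks of upscaling_speed * currently_running_nodes", with an assumed
    # floor of one node so an empty cluster can still grow.
    return max(1, math.ceil(upscaling_speed * currently_running_nodes))

for running in (0, 1, 4, 10):
    print(running, "->", upscale_chunk(running))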

@@ -1,12 +1,8 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default

# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -53,50 +49,78 @@ auth:
# project wide meta-data.
# ssh_private_key: /path/to/your/key.pem

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray_head_default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config for the head node, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
        # For more documentation on available fields, see:
        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
        node_config:
            machineType: n1-standard-2
            disks:
              - boot: true
                autoDelete: true
                type: PERSISTENT
                initializeParams:
                  diskSizeGb: 50
                  # See https://cloud.google.com/compute/docs/images for more images
                  sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
            # Additional options can be found in the compute docs at
            # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
            # If the network interface is specified as below in both head and worker
            # nodes, the manual network config is used. Otherwise an existing subnet is
            # used. To use a shared subnet, ask the subnet owner to grant permission
            # for 'compute.subnetworks.use' to the ray autoscaler account...
            # networkInterfaces:
            #   - kind: compute#networkInterface
            #     subnetwork: path/to/subnet
            #     aliasIpRanges: []
    ray_worker_small:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 2
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config for the head node, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
        # For more documentation on available fields, see:
        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
        node_config:
            machineType: n1-standard-2
            disks:
              - boot: true
                autoDelete: true
                type: PERSISTENT
                initializeParams:
                  diskSizeGb: 50
                  # See https://cloud.google.com/compute/docs/images for more images
                  sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
            # Run workers on preemptible instances by default.
            # Comment this out to use on-demand.
            scheduling:
              - preemptible: true
            # Additional options can be found in the compute docs at
            # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert

# Specify the node type of the head node (as configured above).
head_node_type: ray_head_default

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

@@ -166,3 +190,6 @@ worker_start_ray_commands:
      ray start
      --address=$RAY_HEAD_IP:6379
      --object-manager-port=8076

head_node: {}
worker_nodes: {}

@@ -1,12 +1,8 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: gpu-docker

# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -48,58 +44,81 @@ auth:
# project wide meta-data.
# ssh_private_key: /path/to/your/key.pem

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray_head_gpu:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 6, "GPU": 1}
        # Provider-specific config for the head node, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
        # For more documentation on available fields, see:
        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
        node_config:
            machineType: custom-6-16384
            disks:
              - boot: true
                autoDelete: true
                type: PERSISTENT
                initializeParams:
                  diskSizeGb: 50
                  # See https://cloud.google.com/compute/docs/images for more images
                  sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
            guestAccelerators:
              - acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
                acceleratorCount: 1
            metadata:
              items:
                - key: install-nvidia-driver
                  value: "True"
            scheduling:
              - onHostMaintenance: TERMINATE
    ray_worker_gpu:
        # The minimum number of nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 2
        # The resources provided by this node type.
        resources: {"CPU": 2, "GPU": 1}
        # Provider-specific config for the head node, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
        # For more documentation on available fields, see:
        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
        node_config:
            machineType: n1-standard-2
            disks:
              - boot: true
                autoDelete: true
                type: PERSISTENT
                initializeParams:
                  diskSizeGb: 50
                  # See https://cloud.google.com/compute/docs/images for more images
                  sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
            guestAccelerators:
              - acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
                acceleratorCount: 1
            metadata:
              items:
                - key: install-nvidia-driver
                  value: "True"
            # Run workers on preemptible instances by default.
            # Comment this out to use on-demand.
            scheduling:
              - preemptible: true
              - onHostMaintenance: TERMINATE

# Specify the node type of the head node (as configured above).
head_node_type: ray_head_gpu

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

@@ -2,7 +2,7 @@
cluster_name: minimal

# The maximum number of worker nodes to launch in addition to the head
# node. min_workers defaults to 0.
max_workers: 1

# Cloud-provider specific configuration.

@@ -96,8 +96,6 @@ available_node_types:
    worker_node:
        # Minimum number of Ray workers of this Pod type.
        min_workers: 0
        node_config:
            apiVersion: v1
            kind: Pod
@@ -136,6 +134,12 @@ available_node_types:
                        # cause problems for other pods.
                        memory: 512Mi
    head_node:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        node_config:
            apiVersion: v1
            kind: Pod

@@ -139,6 +139,12 @@ available_node_types:
                        # cause problems for other pods.
                        memory: 512Mi
    head_node:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        node_config:
            apiVersion: v1
            kind: Pod

@@ -17,6 +17,10 @@ spec:
    # Specify the allowed pod types for this ray cluster and the resources they provide.
    podTypes:
    - name: head-node
      # Minimum number of Ray workers of this Pod type.
      minWorkers: 0
      # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
      maxWorkers: 0
      podConfig:
        apiVersion: v1
        kind: Pod

@@ -17,6 +17,10 @@ spec:
    # Specify the allowed pod types for this ray cluster and the resources they provide.
    podTypes:
    - name: head-node
      # Minimum number of Ray workers of this Pod type.
      minWorkers: 0
      # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
      maxWorkers: 0
      podConfig:
        apiVersion: v1
        kind: Pod

@@ -1,16 +1,8 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default

## NOTE: Typically for local clusters, max_workers == len(worker_ips).
max_workers: 0

# The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -42,11 +34,20 @@ auth:
    # Optional if an ssh private key is necessary to ssh to the cluster.
    # ssh_private_key: ~/.ssh/id_rsa

available_node_types:
    ray.head.default:
        resources: {}
        min_workers: 0
        max_workers: 0
        # Leave this empty
        node_config: {}
    ray.worker.default:
        resources: {}
        ## NOTE: Typically for local clusters, max_workers == len(worker_ips).
        min_workers: 0
        # Leave this empty
        node_config: {}
head_node_type: ray.head.default

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

@@ -97,3 +98,6 @@ head_start_ray_commands:
worker_start_ray_commands:
    - ray stop
    - ray start --address=$RAY_HEAD_IP:6379

head_node: {}
worker_nodes: {}
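For local clusters the NOTE above is the whole sizing story: the cap is just the number of machines you listed. A trivial illustration (the addresses are hypothetical):

worker_ips = ["192.168.1.10", "192.168.1.11"]
max_workers = len(worker_ips)  # == 2, per the NOTE above
print(max_workers)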

@@ -2,10 +2,6 @@
# A namespace will be automatically created for each cluster_name in SKE.
cluster_name: default

# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2

@@ -85,174 +81,184 @@ provider:
    # Exposing external IP addresses for ray pods isn't currently supported.
    use_internal_ips: true

head_node_type: ray.head.default

available_node_types:
    ray.head.default:
        resources: {"CPU": 1}
        min_workers: 0
        max_workers: 0
        # Kubernetes pod config for the head node pod.
        node_config:
            apiVersion: v1
            kind: Pod
            metadata:
                # Automatically generates a name for the pod with this prefix.
                generateName: ray-head-

                # Must match the head node service selector above if a head node
                # service is required.
                labels:
                    component: ray-head

                # https://docs.staroid.com/ske/pod.html#pod
                pod.staroid.com/spot: "false"  # use on-demand instance for head.

                # Uncomment to locate ray head to dedicated Kubernetes node
                # (GPU instance is only available for 'dedicated' isolation)
                #pod.staroid.com/isolation: dedicated
                #pod.staroid.com/instance-type: gpu-1
            spec:
                automountServiceAccountToken: true

                # Restarting the head node automatically is not currently supported.
                # If the head node goes down, `ray up` must be run again.
                restartPolicy: Never

                # This volume allocates shared memory for Ray to use for its plasma
                # object store. If you do not provide this, Ray will fall back to
                # /tmp which causes slowdowns if it is not a shared memory volume.
                volumes:
                - name: dshm
                  emptyDir:
                      medium: Memory
                # nfs volume provides a shared volume across all ray-nodes.
                - name: nfs-volume
                  persistentVolumeClaim:
                      claimName: nfs

                containers:
                - name: ray-node
                  imagePullPolicy: Always
                  # You are free (and encouraged) to use your own container image,
                  # but it should have the following installed:
                  #   - rsync (used for `ray rsync` commands and file mounts)
                  #   - screen (used for `ray attach`)
                  #   - kubectl (used by the autoscaler to manage worker pods)
                  # Image will be overridden when 'image_from_project' is true.
                  image: rayproject/ray
                  # Do not change this command - it keeps the pod alive until it is
                  # explicitly killed.
                  command: ["/bin/bash", "-c", "--"]
                  args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
                  ports:
                      - containerPort: 6379  # Redis port.
                      - containerPort: 6380  # Redis port.
                      - containerPort: 6381  # Redis port.
                      - containerPort: 12345  # Ray internal communication.
                      - containerPort: 12346  # Ray internal communication.

                  # This volume allocates shared memory for Ray to use for its plasma
                  # object store. If you do not provide this, Ray will fall back to
                  # /tmp which causes slowdowns if it is not a shared memory volume.
                  volumeMounts:
                      - mountPath: /dev/shm
                        name: dshm
                      - mountPath: /nfs
                        name: nfs-volume
                  resources:
                      requests:
                          cpu: 1000m
                          memory: 2Gi
                      limits:
                          # The maximum memory that this pod is allowed to use. The
                          # limit will be detected by ray and split to use 10% for
                          # redis, 30% for the shared memory object store, and the
                          # rest for application memory. If this limit is not set and
                          # the object store size is not set manually, ray will
                          # allocate a very large object store in each pod that may
                          # cause problems for other pods.
                          memory: 2Gi
                  env:
                      # This is used in the head_start_ray_commands below so that
                      # Ray can spawn the correct number of processes. Omitting this
                      # may lead to degraded performance.
                      - name: MY_CPU_REQUEST
                        valueFrom:
                            resourceFieldRef:
                                resource: requests.cpu
                      - name: RAY_ADDRESS
                        value: "auto"
    ray.worker.default:
        min_workers: 0
        resources: {"CPU": 1}
        # Kubernetes pod config for worker node pods.
        node_config:
            apiVersion: v1
            kind: Pod
            metadata:
                # Automatically generates a name for the pod with this prefix.
                generateName: ray-worker-

                # Must match the worker node service selector above if a worker node
                # service is required.
                labels:
                    component: ray-worker

                # https://docs.staroid.com/ske/pod.html#pod
                pod.staroid.com/spot: "true"  # use spot instance for workers.

                # Uncomment to locate ray head to dedicated Kubernetes node
                # (GPU instance is only available for 'dedicated' isolation)
                #pod.staroid.com/isolation: dedicated
                #pod.staroid.com/instance-type: gpu-1
            spec:
                serviceAccountName: default

                # Worker nodes will be managed automatically by the head node, so
                # do not change the restart policy.
                restartPolicy: Never

                # This volume allocates shared memory for Ray to use for its plasma
                # object store. If you do not provide this, Ray will fall back to
                # /tmp which causes slowdowns if it is not a shared memory volume.
                volumes:
                - name: dshm
                  emptyDir:
                      medium: Memory
                - name: nfs-volume
                  persistentVolumeClaim:
                      claimName: nfs

                containers:
                - name: ray-node
                  imagePullPolicy: Always
                  # You are free (and encouraged) to use your own container image,
                  # but it should have the following installed:
                  #   - rsync (used for `ray rsync` commands and file mounts)
                  image: rayproject/autoscaler
                  # Do not change this command - it keeps the pod alive until it is
                  # explicitly killed.
                  command: ["/bin/bash", "-c", "--"]
                  args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
                  ports:
                      - containerPort: 12345  # Ray internal communication.
                      - containerPort: 12346  # Ray internal communication.

                  # This volume allocates shared memory for Ray to use for its plasma
                  # object store. If you do not provide this, Ray will fall back to
                  # /tmp which causes slowdowns if it is not a shared memory volume.
                  volumeMounts:
                      - mountPath: /dev/shm
                        name: dshm
                      - mountPath: /nfs
                        name: nfs-volume
                  resources:
                      requests:
                          cpu: 1000m
                          memory: 2Gi
                      limits:
                          # This memory limit will be detected by ray and split into
                          # 30% for plasma, and 70% for workers.
                          memory: 2Gi
                  env:
                      # This is used in the head_start_ray_commands below so that
                      # Ray can spawn the correct number of processes. Omitting this
                      # may lead to degraded performance.
                      - name: MY_CPU_REQUEST
                        valueFrom:
                            resourceFieldRef:
                                resource: requests.cpu

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

@@ -307,3 +313,6 @@ head_start_ray_commands:
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076

head_node: {}
worker_nodes: {}
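The container memory limits above are not just bookkeeping: as the head-node comment notes, Ray detects the pod limit and splits it between redis, the shared-memory object store, and application memory. An illustrative split for the 2Gi limit used here (the 10%/30% figures come from that comment; exact behavior depends on the Ray version):

def split_pod_memory(limit_bytes):
    redis = int(limit_bytes * 0.10)          # 10% for redis
    object_store = int(limit_bytes * 0.30)   # 30% for the plasma object store
    application = limit_bytes - redis - object_store  # rest for application memory
    return redis, object_store, application

gib = 1024 ** 3
print(split_pod_memory(2 * gib))  # the 2Gi pod limit from the example above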

@@ -1,8 +1,8 @@
import pytest
from ray.autoscaler._private.aws.config import _get_vpc_id_or_die, \
    bootstrap_aws, \
    DEFAULT_AMI
import ray.tests.aws.utils.stubs as stubs
import ray.tests.aws.utils.helpers as helpers
from ray.tests.aws.utils.constants import AUX_SUBNET, DEFAULT_SUBNET, \
@@ -143,8 +143,10 @@ def test_fills_out_amis(iam_client_stub, ec2_client_stub):
    stubs.configure_subnet_default(ec2_client_stub)

    config = helpers.load_aws_example_config_file("example-full.yaml")
    del config["available_node_types"]["ray.head.default"]["node_config"][
        "ImageId"]
    del config["available_node_types"]["ray.worker.default"]["node_config"][
        "ImageId"]

    # Pass in SG for stub to work
    config["head_node"]["SecurityGroupIds"] = ["sg-1234abcd"]

@@ -1,4 +1,5 @@
import json
import jsonschema
import os
import shutil
from subprocess import CalledProcessError
@@ -264,6 +265,55 @@ SMALL_CLUSTER = {
    "worker_start_ray_commands": ["start_ray_worker"],
}
MOCK_DEFAULT_CONFIG = {
"cluster_name": "default",
"max_workers": 2,
"upscaling_speed": 1.0,
"idle_timeout_minutes": 5,
"provider": {
"type": "mock",
"region": "us-east-1",
"availability_zone": "us-east-1a",
},
"docker": {
"image": "example",
"container_name": "mock",
},
"auth": {
"ssh_user": "ubuntu",
"ssh_private_key": os.devnull,
},
"available_node_types": {
"ray.head.default": {
"min_workers": 0,
"max_workers": 0,
"resources": {},
"node_config": {
"head_default_prop": 4
}
},
"ray.worker.default": {
"min_workers": 0,
"max_workers": 2,
"resources": {},
"node_config": {
"worker_default_prop": 7
}
}
},
"head_node_type": "ray.head.default",
"head_node": {},
"worker_nodes": {},
"file_mounts": {},
"cluster_synced_files": [],
"initialization_commands": [],
"setup_commands": [],
"head_setup_commands": [],
"worker_setup_commands": [],
"head_start_ray_commands": [],
"worker_start_ray_commands": [],
}
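MOCK_DEFAULT_CONFIG mirrors the shape of the new provider defaults, so individual tests can derive a per-test config from it instead of building one by hand. A minimal sketch (the tweaked values are arbitrary):

import copy

config = copy.deepcopy(MOCK_DEFAULT_CONFIG)
config["available_node_types"]["ray.worker.default"]["min_workers"] = 1
config["max_workers"] = 10

# Structural invariants the new schema relies on:
assert config["head_node_type"] in config["available_node_types"]
assert all("node_config" in t for t in config["available_node_types"].values())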
class LoadMetricsTest(unittest.TestCase):
    def testHeartbeat(self):
@@ -1645,6 +1695,28 @@ class AutoscalingTest(unittest.TestCase):
            config_path, LoadMetrics(), max_failures=0, update_interval_s=0)
        assert isinstance(autoscaler.provider, NodeProvider)
def testLegacyExternalNodeScalerMissingFields(self):
"""Should fail to validate legacy external config with missing
head_node, worker_nodes, or both."""
external_config = copy.deepcopy(SMALL_CLUSTER)
external_config["provider"] = {
"type": "external",
"module": "ray.autoscaler.node_provider.NodeProvider",
}
missing_workers, missing_head, missing_both = [
copy.deepcopy(external_config) for _ in range(3)
]
del missing_workers["worker_nodes"]
del missing_head["head_node"]
del missing_both["worker_nodes"]
del missing_both["head_node"]
for faulty_config in missing_workers, missing_head, missing_both:
faulty_config = prepare_config(faulty_config)
with pytest.raises(jsonschema.ValidationError):
validate_config(faulty_config)
    def testExternalNodeScalerWrongImport(self):
        config = SMALL_CLUSTER.copy()
        config["provider"] = {

@@ -1,4 +1,5 @@
import jsonschema
import logging
import os
import sys
import tempfile
@@ -9,10 +10,12 @@ import copy
from unittest.mock import MagicMock, Mock, patch
import pytest

from ray.autoscaler._private.util import prepare_config, validate_config,\
    _get_default_config, merge_setup_commands
from ray.autoscaler._private.providers import _NODE_PROVIDERS
from ray.autoscaler._private.kubernetes.node_provider import\
    KubernetesNodeProvider
from ray.autoscaler.tags import NODE_TYPE_LEGACY_HEAD, NODE_TYPE_LEGACY_WORKER

from ray.test_utils import load_test_config, recursive_fnmatch
@@ -37,18 +40,19 @@ CONFIG_PATHS = ignore_k8s_operator_configs(CONFIG_PATHS)
class AutoscalingConfigTest(unittest.TestCase):
    def testValidateDefaultConfig(self):
        for config_path in CONFIG_PATHS:
            if "aws/example-multi-node-type.yaml" in config_path:
                # aws is tested in testValidateDefaultConfigAWSMultiNodeTypes.
                continue
            with open(config_path) as f:
                config = yaml.safe_load(f)
            config = prepare_config(config)
            if config["provider"]["type"] == "kubernetes":
                KubernetesNodeProvider.fillout_available_node_types_resources(
                    config)
            try:
                validate_config(config)
            except Exception:
                logging.exception("")
                self.fail(
                    f"Config {config_path} did not pass validation test!")
@@ -232,7 +236,6 @@ class AutoscalingConfigTest(unittest.TestCase):
            self.fail("Failed to validate config with security group name!")

    def testMaxWorkerDefault(self):
        # Load config, call prepare config, check that default max_workers
        # is filled correctly for node types that don't specify it.
        # Check that max_workers is untouched for node types
@@ -254,7 +257,7 @@ class AutoscalingConfigTest(unittest.TestCase):
        # Max workers auto-filled with specified cluster-wide value of 5.
        assert config["max_workers"] ==\
            prepared_node_types["worker_node_max_unspecified"]["max_workers"]\
            == 5

        # Repeat with a config that doesn't specify global max workers.
        # Default value of 2 should be pulled in for global max workers.
@@ -275,8 +278,87 @@ class AutoscalingConfigTest(unittest.TestCase):
            prepared_node_types["worker_node_max_specified"][
                "max_workers"] == 3
        # Max workers auto-filled with default cluster-wide value of 2.
        assert prepared_config["max_workers"] ==\
            prepared_node_types["worker_node_max_unspecified"]["max_workers"]\
            == 2
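The assertions above pin down the fill-in rule: a node type that omits max_workers inherits the cluster-wide value, while an explicit per-type value is left untouched. A standalone sketch of that rule (fill_node_type_max_workers is a hypothetical helper written for illustration, not the actual prepare_config code):

def fill_node_type_max_workers(config):
    global_max = config.setdefault("max_workers", 2)  # default global cap of 2
    for node_type in config["available_node_types"].values():
        node_type.setdefault("max_workers", global_max)
    return config

cfg = {
    "max_workers": 5,
    "available_node_types": {
        "worker_node_max_specified": {"max_workers": 3},
        "worker_node_max_unspecified": {},
    },
}
filled = fill_node_type_max_workers(cfg)["available_node_types"]
assert filled["worker_node_max_specified"]["max_workers"] == 3
assert filled["worker_node_max_unspecified"]["max_workers"] == 5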
def testFillEdgeLegacyConfigs(self):
# Test edge cases: legacy configs which specify workers but not head
# or vice-versa.
no_head = load_test_config("test_no_head.yaml")
aws_defaults = _get_default_config(no_head["provider"])
head_prepared = prepare_config(no_head)
assert head_prepared["available_node_types"][
"ray-legacy-head-node-type"]["node_config"] ==\
aws_defaults["available_node_types"][
"ray.head.default"]["node_config"]
assert head_prepared["head_node"] == {}
# Custom worker config preserved
node_types = head_prepared["available_node_types"]
worker_type = node_types["ray-legacy-worker-node-type"]
assert worker_type["node_config"] == head_prepared["worker_nodes"] == {
"foo": "bar"
}
no_workers = load_test_config("test_no_workers.yaml")
workers_prepared = prepare_config(no_workers)
assert workers_prepared["available_node_types"][
"ray-legacy-worker-node-type"]["node_config"] ==\
aws_defaults["available_node_types"][
"ray.worker.default"]["node_config"]
assert workers_prepared["worker_nodes"] == {}
# Custom head config preserved
node_types = workers_prepared["available_node_types"]
head_type = node_types["ray-legacy-head-node-type"]
assert head_type["node_config"] == workers_prepared["head_node"] == {
"baz": "qux"
}
@pytest.mark.skipif(
sys.platform.startswith("win"), reason="Fails on Windows.")
def testExampleFull(self):
"""
Test that example-full yamls are unmodified by prepared_config,
except possibly by having setup_commands merged.
"""
providers = ["aws", "gcp", "azure"]
for provider in providers:
path = os.path.join(RAY_PATH, "autoscaler", provider,
"example-full.yaml")
config = yaml.safe_load(open(path).read())
config_copy = copy.deepcopy(config)
merge_setup_commands(config_copy)
assert config_copy == prepare_config(config)
@pytest.mark.skipif(
sys.platform.startswith("win"), reason="Fails on Windows.")
def testLegacyYaml(self):
# Test correct default-merging behavior for legacy yamls.
providers = ["aws", "gcp", "azure"]
for provider in providers:
path = os.path.join(RAY_PATH, "autoscaler", provider,
"example-full-legacy.yaml")
legacy_config = yaml.safe_load(open(path).read())
# custom head and workers
legacy_config["head_node"] = {"blahblah": 0}
legacy_config["worker_nodes"] = {"halbhalhb": 0}
legacy_config_copy = copy.deepcopy(legacy_config)
prepared_legacy = prepare_config(legacy_config_copy)
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_HEAD]["max_workers"] == 0
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_HEAD]["min_workers"] == 0
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_HEAD]["node_config"] == legacy_config[
"head_node"]
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_WORKER]["max_workers"] == 2
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_WORKER]["min_workers"] == 0
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_WORKER]["node_config"] == legacy_config[
"worker_nodes"]
if __name__ == "__main__":

@@ -0,0 +1,123 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default

# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0

# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: True
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
# Availability zone(s), comma-separated, that nodes may be launched in.
# Nodes are currently spread between zones by a round-robin approach,
# however this implementation detail should not be relied upon.
availability_zone: us-west-2a,us-west-2b
# Whether to allow node reuse. If set to False, nodes will be terminated
# instead of stopped.
cache_stopped_nodes: True # If not present, the default is True.
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
foo: bar
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
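A minimal sketch of driving a config like the one above from Python, assuming a local Ray installation with AWS credentials configured; the file name "cluster.yaml" is a hypothetical local copy of this file and is not part of the commit.

# Sketch: parse the cluster config and bring the cluster up/down via the Ray CLI.
import subprocess
import yaml

with open("cluster.yaml") as f:  # hypothetical copy of the config above
    config = yaml.safe_load(f)

# Basic sanity checks before launching.
assert config["provider"]["type"] == "aws"
assert config["max_workers"] >= 0

# "ray up" creates or updates the cluster; "ray down" tears it back down.
subprocess.run(["ray", "up", "cluster.yaml", "--yes"], check=True)
subprocess.run(["ray", "down", "cluster.yaml", "--yes"], check=True)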

View file

@ -0,0 +1,124 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes, then the autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: True
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
# Availability zone(s), comma-separated, that nodes may be launched in.
# Nodes are currently spread between zones by a round-robin approach,
# however this implementation detail should not be relied upon.
availability_zone: us-west-2a,us-west-2b
# Whether to allow node reuse. If set to False, nodes will be terminated
# instead of stopped.
cache_stopped_nodes: True # If not present, the default is True.
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
baz: qux
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
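A minimal sketch of how a legacy-style config like the one above is normalized, assuming a Ray checkout where the internal helper prepare_config exists (it is the function the tests later in this commit switch to); the file name "legacy-cluster.yaml" is hypothetical.

# Sketch: normalize a legacy config. prepare_config() folds the legacy
# head_node / worker_nodes sections into available_node_types entries,
# as the updated tests below assert.
import yaml
from ray.autoscaler._private.util import prepare_config

with open("legacy-cluster.yaml") as f:  # hypothetical copy of the file above
    legacy_config = yaml.safe_load(f)

prepared = prepare_config(legacy_config)
print(list(prepared["available_node_types"]))  # legacy head/worker node types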

View file

@ -195,4 +195,4 @@ class KubernetesOperatorTest(unittest.TestCase):
 if __name__ == "__main__":
     kubernetes.config.load_kube_config()
-    sys.exit(pytest.main(["-v", __file__]))
+    sys.exit(pytest.main(["-sv", __file__]))

View file

@ -5,15 +5,16 @@ import yaml
 import tempfile
 import shutil
 import unittest
+from unittest import mock
 import copy
 import ray
 import ray.ray_constants
 from ray.autoscaler._private.util import \
-    rewrite_legacy_yaml_to_available_node_types, format_info_string, \
+    prepare_config, format_info_string, \
     format_info_string_no_node_types
-from ray.tests.test_autoscaler import SMALL_CLUSTER, MockProvider, \
-    MockProcessRunner
+from ray.tests.test_autoscaler import SMALL_CLUSTER, MOCK_DEFAULT_CONFIG, \
+    MockProvider, MockProcessRunner
 from ray.autoscaler._private.providers import (_NODE_PROVIDERS,
                                                _clear_provider_cache)
 from ray.autoscaler._private.autoscaler import StandardAutoscaler, \
@ -38,6 +39,8 @@ from ray.autoscaler._private.constants import \
 from time import sleep
 
+GET_DEFAULT_METHOD = "ray.autoscaler._private.util._get_default_config"
+
 TYPES_A = {
     "empty_node": {
         "node_config": {
@ -1042,131 +1045,135 @@ def test_get_nodes_to_launch_max_launch_concurrency():
 def test_rewrite_legacy_yaml_to_available_node_types():
-    cluster_config = copy.deepcopy(SMALL_CLUSTER)  # Legacy cluster_config.
-    cluster_config = rewrite_legacy_yaml_to_available_node_types(
-        cluster_config)
-    assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
-        "max_workers"] == 0
-    assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
-        "min_workers"] == 0
-    assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
-        "node_config"] == SMALL_CLUSTER["head_node"]
-    assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
-        "node_config"] == SMALL_CLUSTER["worker_nodes"]
-    assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
-        "max_workers"] == SMALL_CLUSTER["max_workers"]
-    assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
-        "min_workers"] == SMALL_CLUSTER["min_workers"]
+    with mock.patch(GET_DEFAULT_METHOD, return_value=MOCK_DEFAULT_CONFIG):
+        cluster_config = copy.deepcopy(SMALL_CLUSTER)  # Legacy cluster_config.
+        cluster_config = prepare_config(cluster_config)
+        assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
+            "max_workers"] == 0
+        assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
+            "min_workers"] == 0
+        assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
+            "node_config"] == SMALL_CLUSTER["head_node"]
+        assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
+            "node_config"] == SMALL_CLUSTER["worker_nodes"]
+        assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
+            "max_workers"] == SMALL_CLUSTER["max_workers"]
+        assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
+            "min_workers"] == SMALL_CLUSTER["min_workers"]
 
 def test_handle_legacy_cluster_config_yaml():
-    provider = MockProvider()
-    head_resources = {"CPU": 8, "GPU": 1}
-    worker_resources = {"CPU": 32, "GPU": 8}
-    cluster_config = copy.deepcopy(SMALL_CLUSTER)  # Legacy cluster_config.
-    cluster_config = rewrite_legacy_yaml_to_available_node_types(
-        cluster_config)
-    scheduler = ResourceDemandScheduler(
-        provider,
-        cluster_config["available_node_types"],
-        0,
-        head_node_type=NODE_TYPE_LEGACY_HEAD)
-    provider.create_node({}, {
-        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
-        TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_HEAD
-    }, 1)
-    head_ip = provider.non_terminated_node_ips({})[0]
-    head_node_id = provider.non_terminated_nodes({})[0]
-    to_launch = scheduler.get_nodes_to_launch([], {}, [], {}, [],
-                                              {head_ip: head_resources})
-    assert to_launch == {}  # Should always be empty with max_workers = 0.
-    scheduler.max_workers = 30
-    min_workers = scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"]
-    scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"] = 0
-    to_launch = scheduler.get_nodes_to_launch([head_node_id], {}, [], {}, [],
-                                              {head_ip: head_resources})
-    assert to_launch == {
-    }  # Since the resource demand does not require adding nodes.
-    to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
-                                              [head_resources], {}, [],
-                                              {head_ip: head_resources})
-    assert to_launch == {
-    }  # Since the resource demand does not require adding nodes.
-    scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"] = min_workers
-    # Returns min_workers when min_workers>0.
-    to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
-                                              [head_resources], {}, [],
-                                              {head_ip: head_resources})
-    assert to_launch == {NODE_TYPE_LEGACY_WORKER: min_workers}
-    provider.create_node({}, {
-        TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
-        TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED,
-        TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_WORKER
-    }, min_workers)
-    nodes = provider.non_terminated_nodes({})
-    to_launch = scheduler.get_nodes_to_launch(nodes, {}, [head_resources], {},
-                                              [], {head_ip: head_resources})
-    assert to_launch == {}  # A node is running, at some point it'll connect.
-    pending_launches = {NODE_TYPE_LEGACY_WORKER: 4}
-    to_launch = scheduler.get_nodes_to_launch([], pending_launches,
-                                              [head_resources], {}, [],
-                                              {head_ip: head_resources})
-    assert to_launch == {}  # A node is launching, at some point it'll connect.
-    # Now assume that we already launched/connected the nodes.
-    ips = provider.non_terminated_node_ips({})
-    lm = LoadMetrics()
-    worker_ips = []
-    for ip in ips:
-        if ip == head_ip:
-            lm.update(ip, head_resources, head_resources, {})
-        else:
-            lm.update(ip, worker_resources, worker_resources, {})
-            worker_ips.append(ip)
-    assert not scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["resources"]
-    to_launch = scheduler.get_nodes_to_launch(
-        nodes, {}, [], {}, [], lm.get_static_node_resources_by_ip())
-    assert scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
-        "resources"] == worker_resources
-    assert to_launch == {}
-    utilizations = {ip: worker_resources for ip in worker_ips}
-    utilizations[head_ip] = head_resources
-    # Requires 4 nodes since worker resources is bigger than head reasources.
-    demands = [worker_resources] * (len(utilizations) + 3)
-    to_launch = scheduler.get_nodes_to_launch(
-        nodes, {}, demands, utilizations, [],
-        lm.get_static_node_resources_by_ip())
-    # 4 nodes are necessary to meet resource demand, but we never exceed
-    # max_workers.
-    assert to_launch == {}
-    scheduler.max_workers = 10
-    to_launch = scheduler.get_nodes_to_launch(
-        nodes, {}, demands, utilizations, [],
-        lm.get_static_node_resources_by_ip())
-    # 4 nodes are necessary to meet resource demand, but we never exceed
-    # max_workers.
-    assert to_launch == {}
-    scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["max_workers"] = 10
-    to_launch = scheduler.get_nodes_to_launch(
-        nodes, {}, demands, utilizations, [],
-        lm.get_static_node_resources_by_ip())
-    # 4 nodes are necessary to meet resource demand.
-    assert to_launch == {NODE_TYPE_LEGACY_WORKER: 4}
-    to_launch = scheduler.get_nodes_to_launch(nodes, pending_launches, demands,
-                                              utilizations, [],
-                                              lm.get_node_resources())
-    # 0 because there are 4 pending launches and we only need 4.
-    assert to_launch == {}
-    to_launch = scheduler.get_nodes_to_launch(nodes, pending_launches,
-                                              demands * 2, utilizations, [],
-                                              lm.get_node_resources())
-    # 1 because there are 4 pending launches and we only allow a max of 5.
-    assert to_launch == {NODE_TYPE_LEGACY_WORKER: 1}
+    with mock.patch(GET_DEFAULT_METHOD, return_value=MOCK_DEFAULT_CONFIG):
+        provider = MockProvider()
+        head_resources = {"CPU": 8, "GPU": 1}
+        worker_resources = {"CPU": 32, "GPU": 8}
+        cluster_config = copy.deepcopy(SMALL_CLUSTER)  # Legacy cluster_config.
+        cluster_config = prepare_config(cluster_config)
+        scheduler = ResourceDemandScheduler(
+            provider,
+            cluster_config["available_node_types"],
+            0,
+            head_node_type=NODE_TYPE_LEGACY_HEAD)
+        provider.create_node({}, {
+            TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
+            TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_HEAD
+        }, 1)
+        head_ip = provider.non_terminated_node_ips({})[0]
+        head_node_id = provider.non_terminated_nodes({})[0]
+        to_launch = scheduler.get_nodes_to_launch([], {}, [], {}, [],
+                                                  {head_ip: head_resources})
+        assert to_launch == {}  # Should always be empty with max_workers = 0.
+        scheduler.max_workers = 30
+        min_workers = scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
+            "min_workers"]
+        scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"] = 0
+        to_launch = scheduler.get_nodes_to_launch(
+            [head_node_id], {}, [], {}, [], {head_ip: head_resources})
+        assert to_launch == {
+        }  # Since the resource demand does not require adding nodes.
+        to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
+                                                  [head_resources], {}, [],
+                                                  {head_ip: head_resources})
+        assert to_launch == {
+        }  # Since the resource demand does not require adding nodes.
+        scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
+            "min_workers"] = min_workers
+        # Returns min_workers when min_workers>0.
+        to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
+                                                  [head_resources], {}, [],
+                                                  {head_ip: head_resources})
+        assert to_launch == {NODE_TYPE_LEGACY_WORKER: min_workers}
+        provider.create_node({}, {
+            TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
+            TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED,
+            TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_WORKER
+        }, min_workers)
+        nodes = provider.non_terminated_nodes({})
+        to_launch = scheduler.get_nodes_to_launch(
+            nodes, {}, [head_resources], {}, [], {head_ip: head_resources})
+        # A node is running, at some point it'll connect.
+        assert to_launch == {}
+        pending_launches = {NODE_TYPE_LEGACY_WORKER: 4}
+        to_launch = scheduler.get_nodes_to_launch([], pending_launches,
+                                                  [head_resources], {}, [],
+                                                  {head_ip: head_resources})
+        # A node is launching, at some point it'll connect.
+        assert to_launch == {}
+        # Now assume that we already launched/connected the nodes.
+        ips = provider.non_terminated_node_ips({})
+        lm = LoadMetrics()
+        worker_ips = []
+        for ip in ips:
+            if ip == head_ip:
+                lm.update(ip, head_resources, head_resources, {})
+            else:
+                lm.update(ip, worker_resources, worker_resources, {})
+                worker_ips.append(ip)
+        assert not scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["resources"]
+        to_launch = scheduler.get_nodes_to_launch(
+            nodes, {}, [], {}, [], lm.get_static_node_resources_by_ip())
+        assert scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
+            "resources"] == worker_resources
+        assert to_launch == {}
+        utilizations = {ip: worker_resources for ip in worker_ips}
+        utilizations[head_ip] = head_resources
+        # Needs 4 nodes since worker resources is bigger than head reasources.
+        demands = [worker_resources] * (len(utilizations) + 3)
+        to_launch = scheduler.get_nodes_to_launch(
+            nodes, {}, demands, utilizations, [],
+            lm.get_static_node_resources_by_ip())
+        # 4 nodes are necessary to meet resource demand, but we never exceed
+        # max_workers.
+        assert to_launch == {}
+        scheduler.max_workers = 10
+        to_launch = scheduler.get_nodes_to_launch(
+            nodes, {}, demands, utilizations, [],
+            lm.get_static_node_resources_by_ip())
+        # 4 nodes are necessary to meet resource demand, but we never exceed
+        # max_workers.
+        assert to_launch == {}
+        scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["max_workers"] = 10
+        to_launch = scheduler.get_nodes_to_launch(
+            nodes, {}, demands, utilizations, [],
+            lm.get_static_node_resources_by_ip())
+        # 4 nodes are necessary to meet resource demand.
+        assert to_launch == {NODE_TYPE_LEGACY_WORKER: 4}
+        to_launch = scheduler.get_nodes_to_launch(nodes, pending_launches,
+                                                  demands, utilizations, [],
+                                                  lm.get_node_resources())
+        # 0 because there are 4 pending launches and we only need 4.
+        assert to_launch == {}
+        to_launch = scheduler.get_nodes_to_launch(
+            nodes, pending_launches, demands * 2, utilizations, [],
+            lm.get_node_resources())
+        # 1 because there are 4 pending launches and we only allow a max of 5.
+        assert to_launch == {NODE_TYPE_LEGACY_WORKER: 1}
 
 
 class LoadMetricsTest(unittest.TestCase):
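A minimal sketch of the patching pattern the updated tests rely on, assuming a Ray installation where the internal helper _get_default_config exists; the fake_defaults dictionary here is a hypothetical stand-in for the MOCK_DEFAULT_CONFIG fixture imported from ray.tests.test_autoscaler.

# Sketch: while the patch is active, any call to _get_default_config returns
# the mocked defaults, so legacy configs are normalized against a known fixture
# rather than whichever provider defaults.yaml ships with the wheel.
from unittest import mock

GET_DEFAULT_METHOD = "ray.autoscaler._private.util._get_default_config"
fake_defaults = {"cluster_name": "default", "max_workers": 2}  # hypothetical

with mock.patch(GET_DEFAULT_METHOD, return_value=fake_defaults):
    from ray.autoscaler._private.util import _get_default_config
    # The patched function ignores its arguments and returns the fixture.
    assert _get_default_config({"type": "aws"}) == fake_defaults

Patching the default-config lookup keeps these scheduler tests hermetic, which is why both legacy-config tests above are wrapped in the same context manager.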