Mirror of https://github.com/vale981/ray, synced 2025-03-06 10:31:39 -05:00
[autoscaler][interface] Use multi node types in defaults.yaml and example-full.yaml (#14239)
* random doc typo
* example-full-multi
* left off max workers
* wip
* address comments, modify defaults, wip
* fix
* wip
* reformat more things
* undo useless diff
* space
* max workers
* space
* copy-paste mishaps
* space
* More copy-paste mishaps
* copy-paste issues, space, max_workers
* head_node_type
* legacy yamls
* line undeleted
* correct-gpu
* Remove redundant GPU example.
* Extraneous comment
* whitespace
* example-java.yaml
* Revert "example-java.yaml" -- This reverts commit 1e9c0124b9d97e651aaeeb6ec5bf7a4ef2a2df17.
* tests and other things
* doc
* doc
* revert max worker default
* Kubernetes comment
* wip
* wip
* tweak
* Address comments
* test_resource_demand_scheduler fixes
* Head type min/max workers, aws resources
* fix example_cluster2.yaml
* Fix external node type test (compatibility with legacy-style external node types)
* fix test_autoscaler_aws
* gcp-images
* gcp node type names
* fix gcp defaults
* doc format
* typo
* Skip failed Windows tests
* doc string and comment
* assert
* remove contents of default external head and worker
* legacy external failed validation test
* Readability -- define the minimal external config at the top of the file.
* Remove default worker type min worker
* Remove extraneous global min_workers comment.
* per-node-type docker in aws/example-gpu-docker
* ray.worker.small -> ray.worker.default
* fix-docker
* fix gpu docker again
* undo kubernetes experiment
* fix doc
* remove worker max_worker from kubernetes
* remove max_worker from local worker node type
* fix doc again
* py38
* eric-comment
* fix cluster name
* fix-test-autoscaler
* legacy config logic
* pop resources
* Remove min_workers AFTER merge
* comment, warning message
* warning, comment
parent ef873be9e8
commit 1675156a8b
32 changed files with 1774 additions and 715 deletions
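Before the diff, a quick sketch of the multi-node-type layout this change rolls out across the example configs (assembled from the AWS examples below; the instance type and worker counts are illustrative, not additions of this commit):

    available_node_types:
        ray.head.default:
            min_workers: 0
            max_workers: 0
            resources: {}   # empty means auto-detect from the instance type
            node_config:
                InstanceType: m5.large
        ray.worker.default:
            min_workers: 0
            max_workers: 2
            resources: {}
            node_config:
                InstanceType: m5.large
    head_node_type: ray.head.default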
@@ -341,14 +341,13 @@ The key is the name of the node type, which is just for debugging purposes.
        resources: {"CPU": 2}
        min_workers: 0
        max_workers: 0
-    ray.worker.small:
+    ray.worker.default:
        node_config:
            InstanceType: m5.large
            InstanceMarketOptions:
                MarketType: spot
        resources: {"CPU": 2}
        min_workers: 0
-        max_workers: 1

.. _cluster-configuration-head-node-type:
@@ -1073,12 +1072,12 @@ Minimal configuration
            :language: yaml

    .. group-tab:: Azure

        .. literalinclude:: ../../../python/ray/autoscaler/azure/example-minimal.yaml
            :language: yaml

    .. group-tab:: GCP

        .. literalinclude:: ../../../python/ray/autoscaler/gcp/example-minimal.yaml
            :language: yaml
@@ -1092,11 +1091,11 @@ Full configuration
            :language: yaml

    .. group-tab:: Azure

        .. literalinclude:: ../../../python/ray/autoscaler/azure/example-full.yaml
            :language: yaml

    .. group-tab:: GCP

        .. literalinclude:: ../../../python/ray/autoscaler/gcp/example-full.yaml
            :language: yaml
@@ -71,8 +71,14 @@ def fillout_resources_kubernetes(config):
        return config
    node_types = copy.deepcopy(config["available_node_types"])
    for node_type in node_types:
-        container_data = node_types[node_type]["node_config"]["spec"][
-            "containers"][0]
+        node_config = node_types[node_type]["node_config"]
+        # The next line is for compatibility with configs like
+        # kubernetes/example-ingress.yaml,
+        # cf. KubernetesNodeProvider.create_node().
+        pod = node_config.get("pod", node_config)
+        container_data = pod["spec"]["containers"][0]
+
        autodetected_resources = get_autodetected_resources(container_data)
        if "resources" not in config["available_node_types"][node_type]:
            config["available_node_types"][node_type]["resources"] = {}
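For context, the lookup added above tolerates both node_config shapes; a sketch of the two (the container name and resource values are illustrative placeholders, not part of this diff):

    # Shape 1: node_config is the pod manifest itself.
    node_config:
        spec:
            containers:
                - name: ray-node          # illustrative
                  resources:
                      limits: {cpu: "2"}  # illustrative
    # Shape 2, as in kubernetes/example-ingress.yaml: the pod manifest is
    # nested under a "pod" key, so node_config.get("pod", node_config)
    # resolves to the manifest in either case.
    node_config:
        pod:
            spec:
                containers:
                    - name: ray-node      # illustrative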
@@ -1,3 +1,4 @@
+import copy
import importlib
import logging
import json
@@ -11,6 +12,17 @@ logger = logging.getLogger(__name__)
# For caching provider instantiations across API calls of one python session
_provider_instances = {}

+# Minimal config for compatibility with legacy-style external configs.
+MINIMAL_EXTERNAL_CONFIG = {
+    "available_node_types": {
+        "ray.head.default": {},
+        "ray.worker.default": {},
+    },
+    "head_node_type": "ray.head.default",
+    "head_node": {},
+    "worker_nodes": {},
+}
+

def _import_aws(provider_config):
    from ray.autoscaler._private.aws.node_provider import AWSNodeProvider
@@ -192,7 +204,7 @@ def _get_default_config(provider_config):
       package outside the autoscaler.
    """
    if provider_config["type"] == "external":
-        return {}
+        return copy.deepcopy(MINIMAL_EXTERNAL_CONFIG)
    load_config = _DEFAULT_CONFIGS.get(provider_config["type"])
    if load_config is None:
        raise NotImplementedError("Unsupported node provider: {}".format(
@@ -1,4 +1,5 @@
import collections
+import copy
from datetime import datetime
import logging
import hashlib
@@ -103,38 +104,91 @@ def prepare_config(config):
    return with_defaults


-def rewrite_legacy_yaml_to_available_node_types(
-        config: Dict[str, Any]) -> Dict[str, Any]:
-
-    if "available_node_types" not in config:
-        # TODO(ameer/ekl/alex): we can also rewrite here many other fields
-        # that include initialization/setup/start commands and ImageId.
-        logger.debug("Converting legacy cluster config to multi node types.")
-        config["available_node_types"] = {
-            NODE_TYPE_LEGACY_HEAD: {
-                "node_config": config["head_node"],
-                "resources": config["head_node"].get("resources") or {},
-                "min_workers": 0,
-                "max_workers": 0,
-            },
-            NODE_TYPE_LEGACY_WORKER: {
-                "node_config": config["worker_nodes"],
-                "resources": config["worker_nodes"].get("resources") or {},
-                "min_workers": config.get("min_workers", 0),
-                "max_workers": config.get("max_workers", 0),
-            },
-        }
-        config["head_node_type"] = NODE_TYPE_LEGACY_HEAD
-        del config["min_workers"]
-    return config
-
-
def fillout_defaults(config: Dict[str, Any]) -> Dict[str, Any]:
    defaults = _get_default_config(config["provider"])
    defaults.update(config)
-    defaults["auth"] = defaults.get("auth", {})
-    defaults = rewrite_legacy_yaml_to_available_node_types(defaults)
-    return defaults
+    # Just for clarity:
+    merged_config = copy.deepcopy(defaults)

+    # Fill auth field to avoid key errors.
+    # This field is accessed when calling NodeUpdater but is not relevant to
+    # certain node providers and is thus left out of some cluster launching
+    # configs.
+    merged_config["auth"] = merged_config.get("auth", {})
+
+    # A legacy config is one which doesn't have available_node_types,
+    # but has at least one of head_node or worker_nodes.
+    is_legacy_config = (("available_node_types" not in config) and
+                        ("head_node" in config or "worker_nodes" in config))
+    # Do merging logic for legacy configs.
+    if is_legacy_config:
+        merged_config = merge_legacy_yaml_with_defaults(merged_config)
+    # Take care of this here, in case a config does not specify any of head,
+    # workers, node types, but does specify min workers:
+    merged_config.pop("min_workers", None)
+
+    return merged_config
+
+
+def merge_legacy_yaml_with_defaults(
+        merged_config: Dict[str, Any]) -> Dict[str, Any]:
+    """Rewrite legacy config's available node types after it has been merged
+    with defaults yaml.
+    """
+    logger.warning("Converting legacy cluster config to multi node types.\n"
+                   "Refer to the docs for examples of multi-node-type "
+                   "autoscaling:\n"
+                   "https://docs.ray.io/en/master/cluster/config.html"
+                   "#full-configuration")
+
+    # Get default head and worker types.
+    default_head_type = merged_config["head_node_type"]
+    # Default configs are assumed to have two node types -- one for the head
+    # and one for the workers.
+    assert len(merged_config["available_node_types"].keys()) == 2
+    default_worker_type = (merged_config["available_node_types"].keys() -
+                           {default_head_type}).pop()
+
+    if merged_config["head_node"]:
+        # User specified a head node in legacy config.
+        # Convert it into data for the head's node type.
+        head_node_info = {
+            "node_config": merged_config["head_node"],
+            "resources": merged_config["head_node"].get("resources") or {},
+            "min_workers": 0,
+            "max_workers": 0,
+        }
+    else:
+        # Use default data for the head's node type.
+        head_node_info = merged_config["available_node_types"][
+            default_head_type]
+    if merged_config["worker_nodes"]:
+        # User specified a worker node in legacy config.
+        # Convert it into data for the workers' node type.
+        worker_node_info = {
+            "node_config": merged_config["worker_nodes"],
+            "resources": merged_config["worker_nodes"].get("resources") or {},
+            "min_workers": merged_config.get("min_workers", 0),
+            "max_workers": merged_config["max_workers"],
+        }
+    else:
+        # Use default data for the workers' node type.
+        worker_node_info = merged_config["available_node_types"][
+            default_worker_type]
+
+    # Rewrite available_node_types.
+    merged_config["available_node_types"] = {
+        NODE_TYPE_LEGACY_HEAD: head_node_info,
+        NODE_TYPE_LEGACY_WORKER: worker_node_info
+    }
+    merged_config["head_node_type"] = NODE_TYPE_LEGACY_HEAD
+
+    # Resources field in head/worker fields cause node launch to fail.
+    merged_config["head_node"].pop("resources", None)
+    merged_config["worker_nodes"].pop("resources", None)
+
+    return merged_config


def merge_setup_commands(config):
@@ -147,7 +201,6 @@ def merge_setup_commands(config):

def fill_node_type_max_workers(config):
    """Sets default per-node max workers to global max_workers.

    This equivalent to setting the default per-node max workers to infinity,
    with the only upper constraint coming from the global max_workers.
    """
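To make the legacy conversion concrete, a sketch of a before/after config (the instance type and worker counts are illustrative; the real node-type keys are whatever the NODE_TYPE_LEGACY_HEAD and NODE_TYPE_LEGACY_WORKER constants name):

    # Legacy input:
    head_node:
        InstanceType: m5.large
    worker_nodes:
        InstanceType: m5.large
    min_workers: 1
    max_workers: 2

    # After merge_legacy_yaml_with_defaults (keys shown symbolically):
    available_node_types:
        NODE_TYPE_LEGACY_HEAD:
            node_config: {InstanceType: m5.large}
            resources: {}
            min_workers: 0
            max_workers: 0
        NODE_TYPE_LEGACY_WORKER:
            node_config: {InstanceType: m5.large}
            resources: {}
            min_workers: 1
            max_workers: 2
    head_node_type: NODE_TYPE_LEGACY_HEAD
    # The top-level min_workers key is then popped by fillout_defaults.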
@@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default

-# The minimum number of workers nodes to launch in addition to the head
-# node. This number should be >= 0.
-min_workers: 0
-
# The maximum number of workers nodes to launch in addition to the head
-# node. This takes precedence over min_workers.
+# node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -43,38 +39,63 @@ auth:
# configurations below.
# ssh_private_key: /path/to/your/key.pem

-# Provider-specific config for the head node, e.g. instance type. By default
-# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
-# For more documentation on available fields, see:
-# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
-head_node:
-    InstanceType: m5.large
-    ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
+# Tell the autoscaler the allowed node types and the resources they provide.
+# The key is the name of the node type, which is just for debugging purposes.
+# The node config specifies the launch config and physical instance type.
+available_node_types:
+    ray.head.default:
+        # The minimum number of worker nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The maximum number of worker nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 0
+        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
+        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
+        # You can also set custom resources.
+        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
+        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
+        resources: {}
+        # Provider-specific config for this node type, e.g. instance type. By default
+        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+        # For more documentation on available fields, see:
+        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+        node_config:
+            InstanceType: m5.large
+            ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
+            # You can provision additional disk space with a conf as follows
+            BlockDeviceMappings:
+                - DeviceName: /dev/sda1
+                  Ebs:
+                      VolumeSize: 100
+            # Additional options in the boto docs.
+    ray.worker.default:
+        # The minimum number of nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
+        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
+        # You can also set custom resources.
+        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
+        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
+        resources: {}
+        # Provider-specific config for this node type, e.g. instance type. By default
+        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+        # For more documentation on available fields, see:
+        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+        node_config:
+            InstanceType: m5.large
+            ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
+            # Run workers on spot by default. Comment this out to use on-demand.
+            InstanceMarketOptions:
+                MarketType: spot
+                # Additional options can be found in the boto docs, e.g.
+                #   SpotOptions:
+                #       MaxPrice: MAX_HOURLY_PRICE
+            # Additional options in the boto docs.

-    # You can provision additional disk space with a conf as follows
-    BlockDeviceMappings:
-        - DeviceName: /dev/sda1
-          Ebs:
-              VolumeSize: 100
-
-    # Additional options in the boto docs.
-
-# Provider-specific config for worker nodes, e.g. instance type. By default
-# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
-# For more documentation on available fields, see:
-# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
-worker_nodes:
-    InstanceType: m5.large
-    ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
-
-    # Run workers on spot by default. Comment this out to use on-demand.
-    InstanceMarketOptions:
-        MarketType: spot
-        # Additional options can be found in the boto docs, e.g.
-        #   SpotOptions:
-        #       MaxPrice: MAX_HOURLY_PRICE
-
-    # Additional options in the boto docs.
+# Specify the node type of the head node (as configured above).
+head_node_type: ray.head.default

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||||
|
|
||||||
# List of shell commands to run to set up nodes.
|
# List of shell commands to run to set up nodes.
|
||||||
setup_commands:
|
setup_commands:
|
||||||
# Note: if you're developing Ray, you probably want to create an AMI that
|
- echo 'export PATH="$HOME/anaconda3/envs/tensorflow2_latest_p37/bin:$PATH"' >> ~/.bashrc
|
||||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
|
||||||
- echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
|
|
||||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
|
|
||||||
# Consider uncommenting these if you also want to run apt-get commands during setup
|
|
||||||
# - sudo pkill -9 apt-get || true
|
|
||||||
# - sudo pkill -9 dpkg || true
|
|
||||||
# - sudo dpkg --configure -a
|
|
||||||
|
|
||||||
# Custom commands that will be run on the head node after common setup.
|
# Custom commands that will be run on the head node after common setup.
|
||||||
head_setup_commands:
|
head_setup_commands:
|
||||||
|
@@ -134,3 +148,6 @@ head_start_ray_commands:
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
+
+head_node: {}
+worker_nodes: {}
python/ray/autoscaler/aws/example-full-legacy.yaml (new file, 148 lines)
@@ -0,0 +1,148 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default

# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0

# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0

# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
    image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
    # image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_container"
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
    pull_before_run: True
    run_options: [] # Extra options to pass into "docker run"

    # Example of running a GPU head with CPU workers
    # head_image: "rayproject/ray-ml:latest-gpu"
    # Allow Ray to automatically detect GPUs

    # worker_image: "rayproject/ray-ml:latest-cpu"
    # worker_run_options: []

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider specific configuration.
provider:
    type: aws
    region: us-west-2
    # Availability zone(s), comma-separated, that nodes may be launched in.
    # Nodes are currently spread between zones by a round-robin approach,
    # however this implementation detail should not be relied upon.
    availability_zone: us-west-2a,us-west-2b
    # Whether to allow node reuse. If set to False, nodes will be terminated
    # instead of stopped.
    cache_stopped_nodes: True # If not present, the default is True.

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem

# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
    InstanceType: m5.large
    ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30

    # You can provision additional disk space with a conf as follows
    BlockDeviceMappings:
        - DeviceName: /dev/sda1
          Ebs:
              VolumeSize: 100

    # Additional options in the boto docs.

# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
    InstanceType: m5.large
    ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30

    # Run workers on spot by default. Comment this out to use on-demand.
    InstanceMarketOptions:
        MarketType: spot
        # Additional options can be found in the boto docs, e.g.
        #   SpotOptions:
        #       MaxPrice: MAX_HOURLY_PRICE

    # Additional options in the boto docs.

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
#    "/path1/on/remote/machine": "/path1/on/local/machine",
#    "/path2/on/remote/machine": "/path2/on/local/machine",
}

# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []

# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False

# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
    - "**/.git"
    - "**/.git/**"

# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
    - ".gitignore"

# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []

# List of shell commands to run to set up nodes.
setup_commands: []
    # Note: if you're developing Ray, you probably want to create a Docker image that
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
    # Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl

# Custom commands that will be run on the head node after common setup.
head_setup_commands: []

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default

-# The minimum number of workers nodes to launch in addition to the head
-# node. This number should be >= 0.
-min_workers: 0
-
# The maximum number of workers nodes to launch in addition to the head
-# node. This takes precedence over min_workers.
+# node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -57,38 +53,66 @@ auth:
# configurations below.
# ssh_private_key: /path/to/your/key.pem

-# Provider-specific config for the head node, e.g. instance type. By default
-# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
-# For more documentation on available fields, see:
-# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
-head_node:
-    InstanceType: m5.large
-    ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
+# Tell the autoscaler the allowed node types and the resources they provide.
+# The key is the name of the node type, which is just for debugging purposes.
+# The node config specifies the launch config and physical instance type.
+available_node_types:
+    ray.head.default:
+        # The minimum number of worker nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The maximum number of worker nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 0
+        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
+        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
+        # You can also set custom resources.
+        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
+        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
+        resources: {}
+        # Provider-specific config for this node type, e.g. instance type. By default
+        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+        # For more documentation on available fields, see:
+        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+        node_config:
+            InstanceType: m5.large
+            ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
+            # You can provision additional disk space with a conf as follows
+            BlockDeviceMappings:
+                - DeviceName: /dev/sda1
+                  Ebs:
+                      VolumeSize: 100
+            # Additional options in the boto docs.
+    ray.worker.default:
+        # The minimum number of worker nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The maximum number of worker nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 2
+        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
+        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
+        # You can also set custom resources.
+        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
+        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
+        resources: {}
+        # Provider-specific config for this node type, e.g. instance type. By default
+        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+        # For more documentation on available fields, see:
+        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+        node_config:
+            InstanceType: m5.large
+            ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
+            # Run workers on spot by default. Comment this out to use on-demand.
+            InstanceMarketOptions:
+                MarketType: spot
+                # Additional options can be found in the boto docs, e.g.
+                #   SpotOptions:
+                #       MaxPrice: MAX_HOURLY_PRICE
+            # Additional options in the boto docs.

-    # You can provision additional disk space with a conf as follows
-    BlockDeviceMappings:
-        - DeviceName: /dev/sda1
-          Ebs:
-              VolumeSize: 100
-
-    # Additional options in the boto docs.
-
-# Provider-specific config for worker nodes, e.g. instance type. By default
-# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
-# For more documentation on available fields, see:
-# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
-worker_nodes:
-    InstanceType: m5.large
-    ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
-
-    # Run workers on spot by default. Comment this out to use on-demand.
-    InstanceMarketOptions:
-        MarketType: spot
-        # Additional options can be found in the boto docs, e.g.
-        #   SpotOptions:
-        #       MaxPrice: MAX_HOURLY_PRICE
-
-    # Additional options in the boto docs.
+# Specify the node type of the head node (as configured above).
+head_node_type: ray.head.default

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@@ -146,3 +170,6 @@ head_start_ray_commands:
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
+
+head_node: {}
+worker_nodes: {}
@@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: gpu-docker

-# The minimum number of workers nodes to launch in addition to the head
-# node. This number should be >= 0.
-min_workers: 0
-
# The maximum number of workers nodes to launch in addition to the head
-# node. This takes precedence over min_workers.
+# node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -23,10 +19,6 @@ docker:
    # image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_nvidia_docker" # e.g. ray_docker

-    # # Example of running a GPU head with CPU workers
-    # head_image: "rayproject/ray-ml:latest-gpu"
-
-    # worker_image: "rayproject/ray-ml:latest"
-
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
@@ -48,38 +40,74 @@ auth:
# configurations below.
# ssh_private_key: /path/to/your/key.pem

-# Provider-specific config for the head node, e.g. instance type. By default
-# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
-# For more documentation on available fields, see:
-# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
-head_node:
-    InstanceType: p2.xlarge
-    ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
+# Tell the autoscaler the allowed node types and the resources they provide.
+# The key is the name of the node type, which is just for debugging purposes.
+# The node config specifies the launch config and physical instance type.
+available_node_types:
+    # GPU head node.
+    ray.head.gpu:
+        # worker_image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
+        # The minimum number of worker nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The maximum number of worker nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 0
+        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
+        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
+        # You can also set custom resources.
+        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
+        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
+        resources: {}
+        # Provider-specific config for this node type, e.g. instance type. By default
+        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+        # For more documentation on available fields, see:
+        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+        node_config:
+            InstanceType: p2.xlarge
+            ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
+            # You can provision additional disk space with a conf as follows
+            BlockDeviceMappings:
+                - DeviceName: /dev/sda1
+                  Ebs:
+                      VolumeSize: 100
+            # Additional options in the boto docs.
+    # CPU workers.
+    ray.worker.default:
+        # Override global docker setting.
+        # This node type will run a CPU image,
+        # rather than the GPU image specified in the global docker settings.
+        docker:
+            worker_image: "rayproject/ray-ml:latest-cpu"
+        # The minimum number of nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 1
+        # The maximum number of workers nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 2
+        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
+        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
+        # You can also set custom resources.
+        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
+        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
+        resources: {}
+        # Provider-specific config for this node type, e.g. instance type. By default
+        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+        # For more documentation on available fields, see:
+        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+        node_config:
+            InstanceType: m5.large
+            ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
+            # Run workers on spot by default. Comment this out to use on-demand.
+            InstanceMarketOptions:
+                MarketType: spot
+                # Additional options can be found in the boto docs, e.g.
+                #   SpotOptions:
+                #       MaxPrice: MAX_HOURLY_PRICE
+            # Additional options in the boto docs.

-    # You can provision additional disk space with a conf as follows
-    BlockDeviceMappings:
-        - DeviceName: /dev/sda1
-          Ebs:
-              VolumeSize: 100
-
-    # Additional options in the boto docs.
-
-# Provider-specific config for worker nodes, e.g. instance type. By default
-# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
-# For more documentation on available fields, see:
-# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
-worker_nodes:
-    InstanceType: m5.large
-    ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
-
-    # Run workers on spot by default. Comment this out to use on-demand.
-    InstanceMarketOptions:
-        MarketType: spot
-        # Additional options can be found in the boto docs, e.g.
-        #   SpotOptions:
-        #       MaxPrice: MAX_HOURLY_PRICE
-
-    # Additional options in the boto docs.
+# Specify the node type of the head node (as configured above).
+head_node_type: ray.head.gpu

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
@ -2,7 +2,7 @@
|
||||||
cluster_name: minimal
|
cluster_name: minimal
|
||||||
|
|
||||||
# The maximum number of workers nodes to launch in addition to the head
|
# The maximum number of workers nodes to launch in addition to the head
|
||||||
# node. This takes precedence over min_workers. min_workers default to 0.
|
# node. min_workers default to 0.
|
||||||
max_workers: 1
|
max_workers: 1
|
||||||
|
|
||||||
# Cloud-provider specific configuration.
|
# Cloud-provider specific configuration.
|
||||||
|
|
|
@ -1,12 +1,8 @@
|
||||||
# An unique identifier for the head node and workers of this cluster.
|
# An unique identifier for the head node and workers of this cluster.
|
||||||
cluster_name: default
|
cluster_name: default
|
||||||
|
|
||||||
# The minimum number of workers nodes to launch in addition to the head
|
|
||||||
# node. This number should be >= 0.
|
|
||||||
min_workers: 0
|
|
||||||
|
|
||||||
# The maximum number of workers nodes to launch in addition to the head
|
# The maximum number of workers nodes to launch in addition to the head
|
||||||
# node. This takes precedence over min_workers.
|
# node.
|
||||||
max_workers: 2
|
max_workers: 2
|
||||||
|
|
||||||
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
||||||
|
@@ -17,7 +13,7 @@ upscaling_speed: 1.0

# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
-# Empty string means disabled.
+# Empty object means disabled.
docker: {}

# If a node is idle for this many minutes, it will be removed.
@@ -46,30 +42,52 @@ auth:
# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below

-# Provider-specific config for the head node, e.g. instance type.
-head_node:
-    azure_arm_parameters:
-        vmSize: Standard_D2s_v3
-        # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
-        imagePublisher: microsoft-dsvm
-        imageOffer: ubuntu-1804
-        imageSku: 1804-gen2
-        imageVersion: 20.07.06
+# Tell the autoscaler the allowed node types and the resources they provide.
+# The key is the name of the node type, which is just for debugging purposes.
+# The node config specifies the launch config and physical instance type.
+available_node_types:
+    ray.head.default:
+        # The minimum number of worker nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The maximum number of worker nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 0
+        # The resources provided by this node type.
+        resources: {"CPU": 2}
+        # Provider-specific config, e.g. instance type.
+        node_config:
+            azure_arm_parameters:
+                vmSize: Standard_D2s_v3
+                # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
+                imagePublisher: microsoft-dsvm
+                imageOffer: ubuntu-1804
+                imageSku: 1804-gen2
+                imageVersion: 20.07.06

-# Provider-specific config for worker nodes, e.g. instance type.
-worker_nodes:
-    azure_arm_parameters:
-        vmSize: Standard_D2s_v3
-        # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
-        imagePublisher: microsoft-dsvm
-        imageOffer: ubuntu-1804
-        imageSku: 1804-gen2
-        imageVersion: 20.07.06
-        # optionally set priority to use Spot instances
-        priority: Spot
-        # set a maximum price for spot instances if desired
-        # billingProfile:
-        #     maxPrice: -1
+    ray.worker.default:
+        # The minimum number of nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The resources provided by this node type.
+        resources: {"CPU": 2}
+        # Provider-specific config, e.g. instance type.
+        node_config:
+            azure_arm_parameters:
+                vmSize: Standard_D2s_v3
+                # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
+                imagePublisher: microsoft-dsvm
+                imageOffer: ubuntu-1804
+                imageSku: 1804-gen2
+                imageVersion: 20.07.06
+                # optionally set priority to use Spot instances
+                priority: Spot
+                # set a maximum price for spot instances if desired
+                # billingProfile:
+                #     maxPrice: -1
+
+# Specify the node type of the head node (as configured above).
+head_node_type: ray.head.default

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@@ -134,3 +152,6 @@ head_start_ray_commands:
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
+
+head_node: {}
+worker_nodes: {}
|
@ -19,18 +19,20 @@ upscaling_speed: 1.0
|
||||||
# and opens all the necessary ports to support the Ray cluster.
|
# and opens all the necessary ports to support the Ray cluster.
|
||||||
# Empty string means disabled.
|
# Empty string means disabled.
|
||||||
docker:
|
docker:
|
||||||
image: "rayproject/ray-ml:latest-gpu"
|
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
|
||||||
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
||||||
container_name: "ray_docker"
|
container_name: "ray_container"
|
||||||
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
|
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
|
||||||
# if no cached version is present.
|
# if no cached version is present.
|
||||||
pull_before_run: False
|
pull_before_run: True
|
||||||
run_options: [] # Extra options to pass into "docker run"
|
run_options: [] # Extra options to pass into "docker run"
|
||||||
|
|
||||||
# Example of running a GPU head with CPU workers
|
# Example of running a GPU head with CPU workers
|
||||||
# head_image: "rayproject/ray-ml:latest-gpu"
|
# head_image: "rayproject/ray-ml:latest-gpu"
|
||||||
|
# Allow Ray to automatically detect GPUs
|
||||||
|
|
||||||
# worker_image: "rayproject/ray-ml:latest"
|
# worker_image: "rayproject/ray-ml:latest-cpu"
|
||||||
|
# worker_run_options: []
|
||||||
|
|
||||||
# If a node is idle for this many minutes, it will be removed.
|
# If a node is idle for this many minutes, it will be removed.
|
||||||
idle_timeout_minutes: 5
|
idle_timeout_minutes: 5
|
||||||
|
@@ -42,7 +44,7 @@ provider:
    location: westus2
    resource_group: ray-cluster
    # set subscription id otherwise the default from az cli will be used
    # subscription_id: 00000000-0000-0000-0000-000000000000

# How Ray will authenticate with newly launched nodes.
auth:
@@ -53,27 +55,35 @@ auth:
    # changes to this should match what is specified in file_mounts
    ssh_public_key: ~/.ssh/id_rsa.pub

-# Provider-specific config for the head node, e.g. instance type. By default
-# Ray will auto-configure unspecified fields using defaults.yaml
+# More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
+# See documentation here: https://docs.microsoft.com/en-us/azure/templates/microsoft.compute/2019-03-01/virtualmachines
+# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
+# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
+
+# Provider-specific config for the head node, e.g. instance type.
head_node:
    azure_arm_parameters:
-        vmSize: Standard_NC6
+        vmSize: Standard_D2s_v3
        # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
        imagePublisher: microsoft-dsvm
        imageOffer: ubuntu-1804
-        imageSku: "1804"
+        imageSku: 1804-gen2
        imageVersion: 20.07.06

-# Provider-specific config for worker nodes, e.g. instance type. By default
-# Ray will auto-configure unspecified fields using defaults.yaml
+# Provider-specific config for worker nodes, e.g. instance type.
worker_nodes:
    azure_arm_parameters:
-        vmSize: Standard_NC6
+        vmSize: Standard_D2s_v3
        # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
        imagePublisher: microsoft-dsvm
        imageOffer: ubuntu-1804
-        imageSku: "1804"
+        imageSku: 1804-gen2
        imageVersion: 20.07.06
+        # optionally set priority to use Spot instances
+        priority: Spot
+        # set a maximum price for spot instances if desired
+        # billingProfile:
+        #     maxPrice: -1

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@@ -83,6 +93,27 @@ file_mounts: {
    "/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
}

+# Files or directories to copy from the head node to the worker nodes. The format is a
+# list of paths. The same path on the head node will be copied to the worker node.
+# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
+# you should just use file_mounts. Only use this if you know what you're doing!
+cluster_synced_files: []
+
+# Whether changes to directories in file_mounts or cluster_synced_files in the head node
+# should sync to the worker node continuously
+file_mounts_sync_continuously: False
+
+# Patterns for files to exclude when running rsync up or rsync down
+rsync_exclude:
+    - "**/.git"
+    - "**/.git/**"
+
+# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
+# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
+# as a value, the behavior will match git's behavior for finding and using .gitignore files.
+rsync_filter:
+    - ".gitignore"
+
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
@@ -92,20 +123,16 @@ initialization_commands:

# List of shell commands to run to set up nodes.
setup_commands:
-    # Note: if you're developing Ray, you probably want to create an AMI that
+    # Note: if you're developing Ray, you probably want to create a Docker image that
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
+    # Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
    - echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
-    # - echo 'conda activate py37_pytorch' >> ~/.bashrc
    - echo 'conda activate py37_tensorflow' >> ~/.bashrc
    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
-    # Consider uncommenting these if you also want to run apt-get commands during setup
-    # - sudo pkill -9 apt-get || true
-    # - sudo pkill -9 dpkg || true
-    # - sudo dpkg --configure -a

# Custom commands that will be run on the head node after common setup.
head_setup_commands:
    - pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0

# Custom commands that will be run on worker nodes after common setup.
|
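For readers applying the diff above by hand: the new spot-instance knobs sit under the worker's azure_arm_parameters. A condensed sketch of how the commented lines read once uncommented (per Azure's convention, a maxPrice of -1 means the VM is never evicted on price and you pay at most the on-demand rate; treat the exact billing semantics as Azure's to confirm):

    worker_nodes:
        azure_arm_parameters:
            vmSize: Standard_D2s_v3
            priority: Spot            # request spot capacity
            billingProfile:
                maxPrice: -1          # cap at the on-demand price; no price-based eviction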
@ -1,12 +1,8 @@
 # An unique identifier for the head node and workers of this cluster.
 cluster_name: default

-# The minimum number of workers nodes to launch in addition to the head
-# node. This number should be >= 0.
-min_workers: 0
-
 # The maximum number of workers nodes to launch in addition to the head
-# node. This takes precedence over min_workers.
+# node.
 max_workers: 2

 # The autoscaler will scale up the cluster faster with higher upscaling speed.

@ -17,7 +13,7 @@ upscaling_speed: 1.0

 # This executes all commands on all nodes in the docker container,
 # and opens all the necessary ports to support the Ray cluster.
-# Empty string means disabled.
+# Empty object means disabled.
 docker:
     image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
     # image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull

@ -60,30 +56,55 @@ auth:
 # Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
 # on the head node, so changes to the template must be included in the wheel file used in setup_commands section below

-# Provider-specific config for the head node, e.g. instance type.
-head_node:
-    azure_arm_parameters:
-        vmSize: Standard_D2s_v3
-        # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
-        imagePublisher: microsoft-dsvm
-        imageOffer: ubuntu-1804
-        imageSku: 1804-gen2
-        imageVersion: 20.07.06
-
-# Provider-specific config for worker nodes, e.g. instance type.
-worker_nodes:
-    azure_arm_parameters:
-        vmSize: Standard_D2s_v3
-        # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
-        imagePublisher: microsoft-dsvm
-        imageOffer: ubuntu-1804
-        imageSku: 1804-gen2
-        imageVersion: 20.07.06
-        # optionally set priority to use Spot instances
-        priority: Spot
-        # set a maximum price for spot instances if desired
-        # billingProfile:
-        #     maxPrice: -1
+# Tell the autoscaler the allowed node types and the resources they provide.
+# The key is the name of the node type, which is just for debugging purposes.
+# The node config specifies the launch config and physical instance type.
+available_node_types:
+    ray.head.default:
+        # The minimum number of worker nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The maximum number of worker nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 0
+        # The resources provided by this node type.
+        resources: {"CPU": 2}
+        # Provider-specific config, e.g. instance type.
+        node_config:
+            azure_arm_parameters:
+                vmSize: Standard_D2s_v3
+                # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
+                imagePublisher: microsoft-dsvm
+                imageOffer: ubuntu-1804
+                imageSku: 1804-gen2
+                imageVersion: 20.07.06
+    ray.worker.default:
+        # The minimum number of worker nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The maximum number of worker nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 2
+        # The resources provided by this node type.
+        resources: {"CPU": 2}
+        # Provider-specific config, e.g. instance type.
+        node_config:
+            azure_arm_parameters:
+                vmSize: Standard_D2s_v3
+                # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
+                imagePublisher: microsoft-dsvm
+                imageOffer: ubuntu-1804
+                imageSku: 1804-gen2
+                imageVersion: 20.07.06
+                # optionally set priority to use Spot instances
+                priority: Spot
+                # set a maximum price for spot instances if desired
+                # billingProfile:
+                #     maxPrice: -1
+
+# Specify the node type of the head node (as configured above).
+head_node_type: ray.head.default

 # Files or directories to copy to the head and worker nodes. The format is a
 # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

@ -147,3 +168,6 @@ head_start_ray_commands:
 worker_start_ray_commands:
     - ray stop
     - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
+
+head_node: {}
+worker_nodes: {}
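Stripped of comments, the schema introduced above reduces to a small shape worth internalizing: a map of named node types, each with scaling bounds, advertised resources, and a provider payload, plus one pointer naming the head type. A minimal sketch, using only names and values taken from the diff (the node_config bodies are elided here):

    available_node_types:
        ray.head.default:
            min_workers: 0
            max_workers: 0            # the head never scales as a worker
            resources: {"CPU": 2}
            node_config: {}           # provider-specific launch config goes here
        ray.worker.default:
            min_workers: 0
            max_workers: 2            # takes precedence over min_workers
            resources: {"CPU": 2}
            node_config: {}
    head_node_type: ray.head.default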
@ -1,12 +1,8 @@
 # An unique identifier for the head node and workers of this cluster.
 cluster_name: gpu-docker

-# The minimum number of workers nodes to launch in addition to the head
-# node. This number should be >= 0.
-min_workers: 0
-
 # The maximum number of workers nodes to launch in addition to the head
-# node. This takes precedence over min_workers.
+# node.
 max_workers: 2

 # The autoscaler will scale up the cluster faster with higher upscaling speed.

@ -21,7 +17,7 @@ upscaling_speed: 1.0
 docker:
     image: "rayproject/ray-ml:latest-gpu"
     # image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
-    container_name: "ray_nvidia_docker" # e.g. ray_docker
+    container_name: "ray_nvidia_docker"

     # # Example of running a GPU head with CPU workers
     # head_image: "rayproject/ray-ml:latest-gpu"

@ -45,17 +41,40 @@ auth:
 # changes to this should match what is specified in file_mounts
 ssh_public_key: ~/.ssh/id_rsa.pub

-# Provider-specific config for the head node, e.g. instance type. By default
-# Ray will auto-configure unspecified fields using defaults.yaml
-head_node:
-    azure_arm_parameters:
-        vmSize: Standard_NC6s_v3
-
-# Provider-specific config for worker nodes, e.g. instance type. By default
-# Ray will auto-configure unspecified fields using defaults.yaml
-worker_nodes:
-    azure_arm_parameters:
-        vmSize: Standard_NC6s_v3
+# Tell the autoscaler the allowed node types and the resources they provide.
+# The key is the name of the node type, which is just for debugging purposes.
+# The node config specifies the launch config and physical instance type.
+available_node_types:
+    ray.head.gpu:
+        # The minimum number of worker nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The maximum number of worker nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 0
+        # The resources provided by this node type.
+        resources: {"CPU": 6, "GPU": 1}
+        # Provider-specific config, e.g. instance type.
+        node_config:
+            azure_arm_parameters:
+                vmSize: Standard_NC6_v3
+    ray.worker.gpu:
+        # The minimum number of nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The maximum number of workers nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 2
+        # The resources provided by this node type.
+        resources: {"CPU": 6, "GPU": 1}
+        # Provider-specific config, e.g. instance type.
+        node_config:
+            azure_arm_parameters:
+                vmSize: Standard_NC6_v3
+
+# Specify the node type of the head node (as configured above).
+head_node_type: ray.head.gpu

 # Files or directories to copy to the head and worker nodes. The format is a
 # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

@ -69,7 +88,7 @@ file_mounts: {
 # NOTE: rayproject/ray-ml:latest has ray latest bundled
 setup_commands: []
 # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl

 # Custom commands that will be run on the head node after common setup.
 head_setup_commands:
     - pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0
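The commented block in the docker section above hints at mixed images. A sketch of how it reads uncommented, if you want a GPU head with CPU workers (field names are exactly the ones shown in the comments; everything else is assumed unchanged):

    docker:
        container_name: "ray_nvidia_docker"
        head_image: "rayproject/ray-ml:latest-gpu"
        worker_image: "rayproject/ray-ml:latest-cpu"
        worker_run_options: []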
@ -2,7 +2,7 @@
 cluster_name: minimal

 # The maximum number of workers nodes to launch in addition to the head
-# node. This takes precedence over min_workers. min_workers default to 0.
+# node. min_workers default to 0.
 max_workers: 1

 # Cloud-provider specific configuration.
@ -1,12 +1,8 @@
 # An unique identifier for the head node and workers of this cluster.
 cluster_name: default

-# The minimum number of workers nodes to launch in addition to the head
-# node. This number should be >= 0.
-min_workers: 0
-
 # The maximum number of workers nodes to launch in addition to the head
-# node. This takes precedence over min_workers.
+# node.
 max_workers: 2

 # The autoscaler will scale up the cluster faster with higher upscaling speed.

@ -39,50 +35,75 @@ auth:
 # project wide meta-data.
 # ssh_private_key: /path/to/your/key.pem

-# Provider-specific config for the head node, e.g. instance type. By default
-# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
-# For more documentation on available fields, see:
-# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
-head_node:
-    machineType: n1-standard-2
-    disks:
-      - boot: true
-        autoDelete: true
-        type: PERSISTENT
-        initializeParams:
-          diskSizeGb: 50
-          # See https://cloud.google.com/compute/docs/images for more images
-          sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
+# Tell the autoscaler the allowed node types and the resources they provide.
+# The key is the name of the node type, which is just for debugging purposes.
+# The node config specifies the launch config and physical instance type.
+available_node_types:
+    ray_head_default:
+        # The minimum number of worker nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The maximum number of worker nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 0
+        # The resources provided by this node type.
+        resources: {"CPU": 2}
+        # Provider-specific config for this node type, e.g. instance type. By default
+        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
+        # For more documentation on available fields, see:
+        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+        node_config:
+            machineType: n1-standard-2
+            disks:
+              - boot: true
+                autoDelete: true
+                type: PERSISTENT
+                initializeParams:
+                  diskSizeGb: 50
+                  # See https://cloud.google.com/compute/docs/images for more images
+                  sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
+
+            # Additional options can be found in in the compute docs at
+            # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+
+            # If the network interface is specified as below in both head and worker
+            # nodes, the manual network config is used. Otherwise an existing subnet is
+            # used. To use a shared subnet, ask the subnet owner to grant permission
+            # for 'compute.subnetworks.use' to the ray autoscaler account...
+            # networkInterfaces:
+            #   - kind: compute#networkInterface
+            #     subnetwork: path/to/subnet
+            #     aliasIpRanges: []
+    ray_worker_small:
+        # The minimum number of nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The resources provided by this node type.
+        resources: {"CPU": 2}
+        # Provider-specific config for this node type, e.g. instance type. By default
+        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
+        # For more documentation on available fields, see:
+        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+        node_config:
+            machineType: n1-standard-2
+            disks:
+              - boot: true
+                autoDelete: true
+                type: PERSISTENT
+                initializeParams:
+                  diskSizeGb: 50
+                  # See https://cloud.google.com/compute/docs/images for more images
+                  sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
+            # Run workers on preemtible instance by default.
+            # Comment this out to use on-demand.
+            scheduling:
+              - preemptible: true

 # Additional options can be found in in the compute docs at
 # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert

-# If the network interface is specified as below in both head and worker
-# nodes, the manual network config is used. Otherwise an existing subnet is
-# used. To use a shared subnet, ask the subnet owner to grant permission
-# for 'compute.subnetworks.use' to the ray autoscaler account...
-# networkInterfaces:
-#   - kind: compute#networkInterface
-#     subnetwork: path/to/subnet
-#     aliasIpRanges: []
-
-worker_nodes:
-    machineType: n1-standard-2
-    disks:
-      - boot: true
-        autoDelete: true
-        type: PERSISTENT
-        initializeParams:
-          diskSizeGb: 50
-          # See https://cloud.google.com/compute/docs/images for more images
-          sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
-    # Run workers on preemtible instance by default.
-    # Comment this out to use on-demand.
-    scheduling:
-      - preemptible: true
-
-# Additional options can be found in in the compute docs at
-# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+# Specify the node type of the head node (as configured above).
+head_node_type: ray_head_default

 # Files or directories to copy to the head and worker nodes. The format is a
 # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

@ -159,3 +180,6 @@ worker_start_ray_commands:
     ray start
     --address=$RAY_HEAD_IP:6379
     --object-manager-port=8076
+
+head_node: {}
+worker_nodes: {}
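The networkInterfaces comment above describes an opt-in manual network config. Uncommented, under node_config it would read roughly as follows (the subnet path is the placeholder the comment itself uses; substitute your own, and note the 'compute.subnetworks.use' permission requirement for shared subnets):

    node_config:
        machineType: n1-standard-2
        networkInterfaces:
          - kind: compute#networkInterface
            subnetwork: path/to/subnet   # placeholder; supply your subnet path
            aliasIpRanges: []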
python/ray/autoscaler/gcp/example-full-legacy.yaml (new file, 167 lines)
@ -0,0 +1,167 @@
+# An unique identifier for the head node and workers of this cluster.
+cluster_name: default
+
+# The minimum number of workers nodes to launch in addition to the head
+# node. This number should be >= 0.
+min_workers: 0
+
+# The maximum number of workers nodes to launch in addition to the head
+# node. This takes precedence over min_workers.
+max_workers: 2
+
+# The autoscaler will scale up the cluster faster with higher upscaling speed.
+# E.g., if the task requires adding more nodes then autoscaler will gradually
+# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
+# This number should be > 0.
+upscaling_speed: 1.0
+
+# This executes all commands on all nodes in the docker container,
+# and opens all the necessary ports to support the Ray cluster.
+# Empty string means disabled.
+docker:
+    image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
+    # image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
+    container_name: "ray_container"
+    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
+    # if no cached version is present.
+    pull_before_run: True
+    run_options: []  # Extra options to pass into "docker run"
+
+    # Example of running a GPU head with CPU workers
+    # head_image: "rayproject/ray-ml:latest-gpu"
+    # Allow Ray to automatically detect GPUs
+
+    # worker_image: "rayproject/ray-ml:latest-cpu"
+    # worker_run_options: []
+
+# If a node is idle for this many minutes, it will be removed.
+idle_timeout_minutes: 5
+
+# Cloud-provider specific configuration.
+provider:
+    type: gcp
+    region: us-west1
+    availability_zone: us-west1-a
+    project_id: null # Globally unique project id
+
+# How Ray will authenticate with newly launched nodes.
+auth:
+    ssh_user: ubuntu
+# By default Ray creates a new private keypair, but you can also use your own.
+# If you do so, make sure to also set "KeyName" in the head and worker node
+# configurations below. This requires that you have added the key into the
+# project wide meta-data.
+# ssh_private_key: /path/to/your/key.pem
+
+# Provider-specific config for the head node, e.g. instance type. By default
+# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
+# For more documentation on available fields, see:
+# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+head_node:
+    machineType: n1-standard-2
+    disks:
+      - boot: true
+        autoDelete: true
+        type: PERSISTENT
+        initializeParams:
+          diskSizeGb: 50
+          # See https://cloud.google.com/compute/docs/images for more images
+          sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
+
+    # Additional options can be found in in the compute docs at
+    # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+
+    # If the network interface is specified as below in both head and worker
+    # nodes, the manual network config is used. Otherwise an existing subnet is
+    # used. To use a shared subnet, ask the subnet owner to grant permission
+    # for 'compute.subnetworks.use' to the ray autoscaler account...
+    # networkInterfaces:
+    #   - kind: compute#networkInterface
+    #     subnetwork: path/to/subnet
+    #     aliasIpRanges: []
+
+worker_nodes:
+    machineType: n1-standard-2
+    disks:
+      - boot: true
+        autoDelete: true
+        type: PERSISTENT
+        initializeParams:
+          diskSizeGb: 50
+          # See https://cloud.google.com/compute/docs/images for more images
+          sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
+    # Run workers on preemtible instance by default.
+    # Comment this out to use on-demand.
+    scheduling:
+      - preemptible: true
+
+    # Additional options can be found in in the compute docs at
+    # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+
+# Files or directories to copy to the head and worker nodes. The format is a
+# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
+file_mounts: {
+#    "/path1/on/remote/machine": "/path1/on/local/machine",
+#    "/path2/on/remote/machine": "/path2/on/local/machine",
+}
+
+# Files or directories to copy from the head node to the worker nodes. The format is a
+# list of paths. The same path on the head node will be copied to the worker node.
+# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
+# you should just use file_mounts. Only use this if you know what you're doing!
+cluster_synced_files: []
+
+# Whether changes to directories in file_mounts or cluster_synced_files in the head node
+# should sync to the worker node continuously
+file_mounts_sync_continuously: False
+
+# Patterns for files to exclude when running rsync up or rsync down
+rsync_exclude:
+    - "**/.git"
+    - "**/.git/**"
+
+# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
+# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
+# as a value, the behavior will match git's behavior for finding and using .gitignore files.
+rsync_filter:
+    - ".gitignore"
+
+# List of commands that will be run before `setup_commands`. If docker is
+# enabled, these commands will run outside the container and before docker
+# is setup.
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+setup_commands: []
+    # Note: if you're developing Ray, you probably want to create a Docker image that
+    # has your Ray repo pre-cloned. Then, you can replace the pip installs
+    # below with a git checkout <your_sha> (and possibly a recompile).
+    # Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
+    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
+
+# Custom commands that will be run on the head node after common setup.
+head_setup_commands:
+  - pip install google-api-python-client==1.7.8
+
+# Custom commands that will be run on worker nodes after common setup.
+worker_setup_commands: []
+
+# Command to start ray on the head node. You don't need to change this.
+head_start_ray_commands:
+  - ray stop
+  - >-
+      ulimit -n 65536;
+      ray start
+      --head
+      --port=6379
+      --object-manager-port=8076
+      --autoscaling-config=~/ray_bootstrap_config.yaml
+
+# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands:
+  - ray stop
+  - >-
+      ulimit -n 65536;
+      ray start
+      --address=$RAY_HEAD_IP:6379
+      --object-manager-port=8076
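One stylistic detail in the file above: the start commands use YAML's >- folded block scalar, which joins the indented lines with spaces and drops the trailing newline, so each list entry reaches the shell as a single command line. For example, the worker entry folds as follows:

    # folds to: "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076"
    - >-
      ulimit -n 65536;
      ray start
      --address=$RAY_HEAD_IP:6379
      --object-manager-port=8076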
@ -1,12 +1,8 @@
 # An unique identifier for the head node and workers of this cluster.
 cluster_name: default

-# The minimum number of workers nodes to launch in addition to the head
-# node. This number should be >= 0.
-min_workers: 0
-
 # The maximum number of workers nodes to launch in addition to the head
-# node. This takes precedence over min_workers.
+# node.
 max_workers: 2

 # The autoscaler will scale up the cluster faster with higher upscaling speed.

@ -53,50 +49,78 @@ auth:
 # project wide meta-data.
 # ssh_private_key: /path/to/your/key.pem

-# Provider-specific config for the head node, e.g. instance type. By default
-# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
-# For more documentation on available fields, see:
-# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
-head_node:
-    machineType: n1-standard-2
-    disks:
-      - boot: true
-        autoDelete: true
-        type: PERSISTENT
-        initializeParams:
-          diskSizeGb: 50
-          # See https://cloud.google.com/compute/docs/images for more images
-          sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
+# Tell the autoscaler the allowed node types and the resources they provide.
+# The key is the name of the node type, which is just for debugging purposes.
+# The node config specifies the launch config and physical instance type.
+available_node_types:
+    ray_head_default:
+        # The minimum number of worker nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The maximum number of worker nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 0
+        # The resources provided by this node type.
+        resources: {"CPU": 2}
+        # Provider-specific config for the head node, e.g. instance type. By default
+        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
+        # For more documentation on available fields, see:
+        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+        node_config:
+            machineType: n1-standard-2
+            disks:
+              - boot: true
+                autoDelete: true
+                type: PERSISTENT
+                initializeParams:
+                  diskSizeGb: 50
+                  # See https://cloud.google.com/compute/docs/images for more images
+                  sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
+
+            # Additional options can be found in in the compute docs at
+            # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+
+            # If the network interface is specified as below in both head and worker
+            # nodes, the manual network config is used. Otherwise an existing subnet is
+            # used. To use a shared subnet, ask the subnet owner to grant permission
+            # for 'compute.subnetworks.use' to the ray autoscaler account...
+            # networkInterfaces:
+            #   - kind: compute#networkInterface
+            #     subnetwork: path/to/subnet
+            #     aliasIpRanges: []
+    ray_worker_small:
+        # The minimum number of worker nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The maximum number of worker nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 2
+        # The resources provided by this node type.
+        resources: {"CPU": 2}
+        # Provider-specific config for the head node, e.g. instance type. By default
+        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
+        # For more documentation on available fields, see:
+        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+        node_config:
+            machineType: n1-standard-2
+            disks:
+              - boot: true
+                autoDelete: true
+                type: PERSISTENT
+                initializeParams:
+                  diskSizeGb: 50
+                  # See https://cloud.google.com/compute/docs/images for more images
+                  sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
+            # Run workers on preemtible instance by default.
+            # Comment this out to use on-demand.
+            scheduling:
+              - preemptible: true

 # Additional options can be found in in the compute docs at
 # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert

-# If the network interface is specified as below in both head and worker
-# nodes, the manual network config is used. Otherwise an existing subnet is
-# used. To use a shared subnet, ask the subnet owner to grant permission
-# for 'compute.subnetworks.use' to the ray autoscaler account...
-# networkInterfaces:
-#   - kind: compute#networkInterface
-#     subnetwork: path/to/subnet
-#     aliasIpRanges: []
-
-worker_nodes:
-    machineType: n1-standard-2
-    disks:
-      - boot: true
-        autoDelete: true
-        type: PERSISTENT
-        initializeParams:
-          diskSizeGb: 50
-          # See https://cloud.google.com/compute/docs/images for more images
-          sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
-    # Run workers on preemtible instance by default.
-    # Comment this out to use on-demand.
-    scheduling:
-      - preemptible: true
-
-# Additional options can be found in in the compute docs at
-# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+# Specify the node type of the head node (as configured above).
+head_node_type: ray_head_default

 # Files or directories to copy to the head and worker nodes. The format is a
 # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

@ -166,3 +190,6 @@ worker_start_ray_commands:
     ray start
     --address=$RAY_HEAD_IP:6379
     --object-manager-port=8076
+
+head_node: {}
+worker_nodes: {}
@ -1,12 +1,8 @@
 # An unique identifier for the head node and workers of this cluster.
 cluster_name: gpu-docker

-# The minimum number of workers nodes to launch in addition to the head
-# node. This number should be >= 0.
-min_workers: 0
-
 # The maximum number of workers nodes to launch in addition to the head
-# node. This takes precedence over min_workers.
+# node.
 max_workers: 2

 # The autoscaler will scale up the cluster faster with higher upscaling speed.

@ -48,58 +44,81 @@ auth:
 # project wide meta-data.
 # ssh_private_key: /path/to/your/key.pem

-# Provider-specific config for the head node, e.g. instance type. By default
-# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
-# For more documentation on available fields, see:
-# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
-head_node:
-    machineType: custom-6-16384
-    disks:
-      - boot: true
-        autoDelete: true
-        type: PERSISTENT
-        initializeParams:
-          diskSizeGb: 50
-          # See https://cloud.google.com/compute/docs/images for more images
-          sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
-    guestAccelerators:
-      - acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
-        acceleratorCount: 1
-    metadata:
-      items:
-        - key: install-nvidia-driver
-          value: "True"
-    scheduling:
-      - onHostMaintenance: TERMINATE
-
-    # Additional options can be found in in the compute docs at
-    # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
-
-worker_nodes:
-    machineType: n1-standard-2
-    disks:
-      - boot: true
-        autoDelete: true
-        type: PERSISTENT
-        initializeParams:
-          diskSizeGb: 50
-          # See https://cloud.google.com/compute/docs/images for more images
-          sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
-    guestAccelerators:
-      - acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
-        acceleratorCount: 1
-    metadata:
-      items:
-        - key: install-nvidia-driver
-          value: "True"
-    # Run workers on preemtible instance by default.
-    # Comment this out to use on-demand.
-    scheduling:
-      - preemptible: true
-      - onHostMaintenance: TERMINATE
-
-    # Additional options can be found in in the compute docs at
-    # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+# Tell the autoscaler the allowed node types and the resources they provide.
+# The key is the name of the node type, which is just for debugging purposes.
+# The node config specifies the launch config and physical instance type.
+available_node_types:
+    ray_head_gpu:
+        # The minimum number of worker nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The maximum number of worker nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 0
+        # The resources provided by this node type.
+        resources: {"CPU": 6, "GPU": 1}
+        # Provider-specific config for the head node, e.g. instance type. By default
+        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
+        # For more documentation on available fields, see:
+        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+        node_config:
+            machineType: custom-6-16384
+            disks:
+              - boot: true
+                autoDelete: true
+                type: PERSISTENT
+                initializeParams:
+                  diskSizeGb: 50
+                  # See https://cloud.google.com/compute/docs/images for more images
+                  sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
+            guestAccelerators:
+              - acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
+                acceleratorCount: 1
+            metadata:
+              items:
+                - key: install-nvidia-driver
+                  value: "True"
+            scheduling:
+              - onHostMaintenance: TERMINATE
+
+    ray_worker_gpu:
+        # The minimum number of nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The maximum number of workers nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 2
+        # The resources provided by this node type.
+        resources: {"CPU": 2, "GPU": 1}
+        # Provider-specific config for the head node, e.g. instance type. By default
+        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
+        # For more documentation on available fields, see:
+        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+        node_config:
+            machineType: n1-standard-2
+            disks:
+              - boot: true
+                autoDelete: true
+                type: PERSISTENT
+                initializeParams:
+                  diskSizeGb: 50
+                  # See https://cloud.google.com/compute/docs/images for more images
+                  sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
+            guestAccelerators:
+              - acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
+                acceleratorCount: 1
+            metadata:
+              items:
+                - key: install-nvidia-driver
+                  value: "True"
+            # Run workers on preemtible instance by default.
+            # Comment this out to use on-demand.
+            scheduling:
+              - preemptible: true
+              - onHostMaintenance: TERMINATE
+
+# Specify the node type of the head node (as configured above).
+head_node_type: ray_head_gpu

 # Files or directories to copy to the head and worker nodes. The format is a
 # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
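The GPU node_config above combines three GCP requirements that are easy to miss when adapting it: an accelerator attachment, the driver-install metadata flag, and a host-maintenance policy (GCE cannot live-migrate VMs with attached GPUs, hence TERMINATE). Condensed, with the project id left as the placeholder the diff itself uses:

    node_config:
        machineType: n1-standard-2
        guestAccelerators:
          - acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
            acceleratorCount: 1
        metadata:
          items:
            - key: install-nvidia-driver
              value: "True"
        scheduling:
          - onHostMaintenance: TERMINATE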
@ -2,7 +2,7 @@
 cluster_name: minimal

 # The maximum number of worker nodes to launch in addition to the head
-# node. This takes precedence over min_workers. min_workers default to 0.
+# node. min_workers default to 0.
 max_workers: 1

 # Cloud-provider specific configuration.
@ -96,8 +96,6 @@ available_node_types:
     worker_node:
         # Minimum number of Ray workers of this Pod type.
         min_workers: 0
-        # Maximum number of Ray workers of this Pod type. Takes precedence over min_workers.
-        max_workers: 2
         node_config:
             apiVersion: v1
             kind: Pod
|
@ -136,6 +134,12 @@ available_node_types:
|
||||||
# cause problems for other pods.
|
# cause problems for other pods.
|
||||||
memory: 512Mi
|
memory: 512Mi
|
||||||
head_node:
|
head_node:
|
||||||
|
# The minimum number of worker nodes of this type to launch.
|
||||||
|
# This number should be >= 0.
|
||||||
|
min_workers: 0
|
||||||
|
# The maximum number of worker nodes of this type to launch.
|
||||||
|
# This takes precedence over min_workers.
|
||||||
|
max_workers: 0
|
||||||
node_config:
|
node_config:
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: Pod
|
kind: Pod
|
||||||
|
|
|
@ -139,6 +139,12 @@ available_node_types:
             # cause problems for other pods.
             memory: 512Mi
     head_node:
+        # The minimum number of worker nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The maximum number of worker nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 0
         node_config:
             apiVersion: v1
             kind: Pod
@ -17,6 +17,10 @@ spec:
 # Specify the allowed pod types for this ray cluster and the resources they provide.
 podTypes:
 - name: head-node
+  # Minimum number of Ray workers of this Pod type.
+  minWorkers: 0
+  # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
+  maxWorkers: 0
   podConfig:
     apiVersion: v1
     kind: Pod
@ -17,6 +17,10 @@ spec:
 # Specify the allowed pod types for this ray cluster and the resources they provide.
 podTypes:
 - name: head-node
+  # Minimum number of Ray workers of this Pod type.
+  minWorkers: 0
+  # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
+  maxWorkers: 0
   podConfig:
     apiVersion: v1
     kind: Pod
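The operator CRD mirrors the cluster-config semantics with camelCase keys. A pod type that should hold a fixed-size group of two workers would pin both bounds; a sketch using only fields shown above (the worker pod type name is illustrative):

    podTypes:
    - name: worker-nodes        # illustrative name
      minWorkers: 2
      maxWorkers: 2             # maxWorkers takes precedence over minWorkers
      podConfig:
        apiVersion: v1
        kind: Pod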
@ -1,16 +1,8 @@
 # An unique identifier for the head node and workers of this cluster.
 cluster_name: default

-## NOTE: Typically for local clusters, min_workers == max_workers == len(worker_ips).
+## NOTE: Typically for local clusters, max_workers == len(worker_ips).

-# The minimum number of workers nodes to launch in addition to the head
-# node. This number should be >= 0.
-# Typically, min_workers == max_workers == len(worker_ips).
-min_workers: 0
-
-# The maximum number of workers nodes to launch in addition to the head node.
-# This takes precedence over min_workers.
-# Typically, min_workers == max_workers == len(worker_ips).
 max_workers: 0

 # The autoscaler will scale up the cluster faster with higher upscaling speed.

@ -42,11 +34,20 @@ auth:
 # Optional if an ssh private key is necessary to ssh to the cluster.
 # ssh_private_key: ~/.ssh/id_rsa

-# Leave this empty.
-head_node: {}
-
-# Leave this empty.
-worker_nodes: {}
+available_node_types:
+    ray.head.default:
+        resources: {}
+        min_workers: 0
+        max_workers: 0
+        # Leave this empty
+        node_config: {}
+    ray.worker.default:
+        resources: {}
+        ## NOTE: Typically for local clusters, max_workers == len(worker_ips).
+        min_workers: 0
+        # Leave this empty
+        node_config: {}
+head_node_type: ray.head.default

 # Files or directories to copy to the head and worker nodes. The format is a
 # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

@ -97,3 +98,6 @@ head_start_ray_commands:
 worker_start_ray_commands:
     - ray stop
     - ray start --address=$RAY_HEAD_IP:6379
+
+head_node: {}
+worker_nodes: {}
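For the local provider the NOTE above is the key invariant: with statically provisioned machines, max_workers should equal len(worker_ips). A sketch of the matching pieces for a two-machine worker pool; the IPs are placeholders, and the provider fields (head_ip, worker_ips) follow the local node provider's schema rather than anything shown in this diff, so verify them against the full file:

    max_workers: 2
    provider:
        type: local
        head_ip: 192.168.0.1                      # placeholder
        worker_ips: [192.168.0.2, 192.168.0.3]    # placeholders; len == max_workers
    available_node_types:
        ray.worker.default:
            resources: {}
            min_workers: 2
            node_config: {}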
@ -2,10 +2,6 @@
|
||||||
# A namespace will be automatically created for each cluster_name in SKE.
|
# A namespace will be automatically created for each cluster_name in SKE.
|
||||||
cluster_name: default
|
cluster_name: default
|
||||||
|
|
||||||
# The minimum number of workers nodes to launch in addition to the head
|
|
||||||
# node. This number should be >= 0.
|
|
||||||
min_workers: 0
|
|
||||||
|
|
||||||
# The maximum number of workers nodes to launch in addition to the head
|
# The maximum number of workers nodes to launch in addition to the head
|
||||||
# node. This takes precedence over min_workers.
|
# node. This takes precedence over min_workers.
|
||||||
max_workers: 2
|
max_workers: 2
|
||||||
|
@ -85,174 +81,184 @@ provider:
|
||||||
# Exposing external IP addresses for ray pods isn't currently supported.
|
# Exposing external IP addresses for ray pods isn't currently supported.
|
||||||
use_internal_ips: true
|
use_internal_ips: true
|
||||||
|
|
||||||
# Kubernetes pod config for the head node pod.
|
head_node_type: ray.head.default
|
||||||
head_node:
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Pod
|
|
||||||
metadata:
|
|
||||||
# Automatically generates a name for the pod with this prefix.
|
|
||||||
generateName: ray-head-
|
|
||||||
|
|
||||||
# Must match the head node service selector above if a head node
|
available_node_types:
|
||||||
# service is required.
|
ray.head.default:
|
||||||
labels:
|
resources: {"CPU": 1}
|
||||||
component: ray-head
|
min_workers: 0
|
||||||
|
max_workers: 0
|
||||||
|
# Kubernetes pod config for the head node pod.
|
||||||
|
node_config:
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Pod
|
||||||
|
metadata:
|
||||||
|
# Automatically generates a name for the pod with this prefix.
|
||||||
|
generateName: ray-head-
|
||||||
|
|
||||||
# https://docs.staroid.com/ske/pod.html#pod
|
# Must match the head node service selector above if a head node
|
||||||
pod.staroid.com/spot: "false" # use on-demand instance for head.
|
# service is required.
|
||||||
|
labels:
|
||||||
|
component: ray-head
|
||||||
|
|
||||||
# Uncomment to locate ray head to dedicated Kubernetes node
|
# https://docs.staroid.com/ske/pod.html#pod
|
||||||
# (GPU instance is only available for 'dedicated' isolation)
|
pod.staroid.com/spot: "false" # use on-demand instance for head.
|
||||||
#pod.staroid.com/isolation: dedicated
|
|
||||||
#pod.staroid.com/instance-type: gpu-1
|
|
||||||
spec:
|
|
||||||
automountServiceAccountToken: true
|
|
||||||
|
|
||||||
# Restarting the head node automatically is not currently supported.
|
# Uncomment to locate ray head to dedicated Kubernetes node
|
||||||
# If the head node goes down, `ray up` must be run again.
|
# (GPU instance is only available for 'dedicated' isolation)
|
||||||
restartPolicy: Never
|
#pod.staroid.com/isolation: dedicated
|
||||||
|
#pod.staroid.com/instance-type: gpu-1
|
||||||
|
spec:
|
||||||
|
automountServiceAccountToken: true
|
||||||
|
|
||||||
# This volume allocates shared memory for Ray to use for its plasma
|
# Restarting the head node automatically is not currently supported.
|
||||||
# object store. If you do not provide this, Ray will fall back to
|
# If the head node goes down, `ray up` must be run again.
|
||||||
# /tmp which cause slowdowns if is not a shared memory volume.
|
restartPolicy: Never
|
||||||
volumes:
|
|
||||||
- name: dshm
|
|
||||||
emptyDir:
|
|
||||||
medium: Memory
|
|
||||||
# nfs volume provides a shared volume across all ray-nodes.
|
|
||||||
- name: nfs-volume
|
|
||||||
persistentVolumeClaim:
|
|
||||||
claimName: nfs
|
|
||||||
|
|
||||||
containers:
|
# This volume allocates shared memory for Ray to use for its plasma
|
||||||
- name: ray-node
|
# object store. If you do not provide this, Ray will fall back to
|
||||||
imagePullPolicy: Always
|
# /tmp which cause slowdowns if is not a shared memory volume.
|
||||||
# You are free (and encouraged) to use your own container image,
|
volumes:
|
||||||
# but it should have the following installed:
|
- name: dshm
|
||||||
# - rsync (used for `ray rsync` commands and file mounts)
|
emptyDir:
|
||||||
# - screen (used for `ray attach`)
|
medium: Memory
|
||||||
# - kubectl (used by the autoscaler to manage worker pods)
|
# nfs volume provides a shared volume across all ray-nodes.
|
||||||
# Image will be overridden when 'image_from_project' is true.
|
- name: nfs-volume
|
||||||
image: rayproject/ray
|
persistentVolumeClaim:
|
||||||
# Do not change this command - it keeps the pod alive until it is
|
          claimName: nfs
      containers:
      - name: ray-node
        imagePullPolicy: Always
        # You are free (and encouraged) to use your own container image,
        # but it should have the following installed:
        # - rsync (used for `ray rsync` commands and file mounts)
        # - screen (used for `ray attach`)
        # - kubectl (used by the autoscaler to manage worker pods)
        # Image will be overridden when 'image_from_project' is true.
        image: rayproject/ray
        # Do not change this command - it keeps the pod alive until it is
        # explicitly killed.
        command: ["/bin/bash", "-c", "--"]
        args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
        ports:
        - containerPort: 6379  # Redis port.
        - containerPort: 6380  # Redis port.
        - containerPort: 6381  # Redis port.
        - containerPort: 12345  # Ray internal communication.
        - containerPort: 12346  # Ray internal communication.

        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp, which causes slowdowns if it is not a shared memory volume.
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
        - mountPath: /nfs
          name: nfs-volume
        resources:
          requests:
            cpu: 1000m
            memory: 2Gi
          limits:
            # The maximum memory that this pod is allowed to use. The
            # limit will be detected by ray and split to use 10% for
            # redis, 30% for the shared memory object store, and the
            # rest for application memory. If this limit is not set and
            # the object store size is not set manually, ray will
            # allocate a very large object store in each pod that may
            # cause problems for other pods.
            memory: 2Gi
        env:
          # This is used in the head_start_ray_commands below so that
          # Ray can spawn the correct number of processes. Omitting this
          # may lead to degraded performance.
          - name: MY_CPU_REQUEST
            valueFrom:
              resourceFieldRef:
                resource: requests.cpu
          - name: RAY_ADDRESS
            value: "auto"
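The comment on resources.limits.memory above fixes the split Ray applies when it detects a pod memory limit. A rough, illustrative sketch of that arithmetic follows; the exact accounting lives in Ray's startup code, so treat the numbers as an assumption for illustration.

    # Illustrative only: mirrors the 10%/30%/rest split described in the
    # pod-spec comment above; Ray's actual accounting happens at startup.
    pod_memory_limit = 2 * 1024**3  # resources.limits.memory: 2Gi

    redis_memory = int(0.10 * pod_memory_limit)         # 10% for redis
    object_store_memory = int(0.30 * pod_memory_limit)  # 30% for plasma
    application_memory = (
        pod_memory_limit - redis_memory - object_store_memory)  # the rest

    print(redis_memory, object_store_memory, application_memory)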
  ray.worker.default:
    min_workers: 0
    resources: {"CPU": 1}
    # Kubernetes pod config for worker node pods.
    node_config:
      apiVersion: v1
      kind: Pod
      metadata:
        # Automatically generates a name for the pod with this prefix.
        generateName: ray-worker-

        # Must match the worker node service selector above if a worker node
        # service is required.
        labels:
          component: ray-worker

        # https://docs.staroid.com/ske/pod.html#pod
        pod.staroid.com/spot: "true"  # use spot instance for workers.

        # Uncomment to locate ray head to dedicated Kubernetes node
        # (GPU instance is only available for 'dedicated' isolation)
        #pod.staroid.com/isolation: dedicated
        #pod.staroid.com/instance-type: gpu-1
      spec:
        serviceAccountName: default

        # Worker nodes will be managed automatically by the head node, so
        # do not change the restart policy.
        restartPolicy: Never

        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp, which causes slowdowns if it is not a shared memory volume.
        volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        - name: nfs-volume
          persistentVolumeClaim:
            claimName: nfs
        containers:
        - name: ray-node
          imagePullPolicy: Always
          # You are free (and encouraged) to use your own container image,
          # but it should have the following installed:
          # - rsync (used for `ray rsync` commands and file mounts)
          image: rayproject/autoscaler
          # Do not change this command - it keeps the pod alive until it is
          # explicitly killed.
          command: ["/bin/bash", "-c", "--"]
          args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
          ports:
          - containerPort: 12345  # Ray internal communication.
          - containerPort: 12346  # Ray internal communication.

          # This volume allocates shared memory for Ray to use for its plasma
          # object store. If you do not provide this, Ray will fall back to
          # /tmp, which causes slowdowns if it is not a shared memory volume.
          volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          - mountPath: /nfs
            name: nfs-volume
          resources:
            requests:
              cpu: 1000m
              memory: 2Gi
            limits:
              # This memory limit will be detected by ray and split into
              # 30% for plasma, and 70% for workers.
              memory: 2Gi
          env:
            # This is used in the head_start_ray_commands below so that
            # Ray can spawn the correct number of processes. Omitting this
            # may lead to degraded performance.
            - name: MY_CPU_REQUEST
              valueFrom:
                resourceFieldRef:
                  resource: requests.cpu

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@@ -307,3 +313,6 @@ head_start_ray_commands:
 worker_start_ray_commands:
     - ray stop
     - ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
+
+head_node: {}
+worker_nodes: {}
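The --num-cpus=$MY_CPU_REQUEST flag above is where the downward-API env var defined in the pod specs pays off. A minimal sketch of what the container-side command amounts to; the env lookup mirrors the config above, while the fallback defaults here are assumptions for illustration.

    import os
    import shlex

    # MY_CPU_REQUEST is injected by Kubernetes via resourceFieldRef
    # (requests.cpu), so the start command can size Ray's worker pool to
    # the pod's CPU request.
    num_cpus = os.environ.get("MY_CPU_REQUEST", "1")       # fallback is illustrative
    head_ip = os.environ.get("RAY_HEAD_IP", "127.0.0.1")   # fallback is illustrative
    cmd = (f"ray start --num-cpus={num_cpus} "
           f"--address={head_ip}:6379 --object-manager-port=8076")
    print(shlex.split(cmd))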
@@ -1,8 +1,8 @@
 import pytest

 from ray.autoscaler._private.aws.config import _get_vpc_id_or_die, \
     bootstrap_aws, \
     DEFAULT_AMI
 import ray.tests.aws.utils.stubs as stubs
 import ray.tests.aws.utils.helpers as helpers
 from ray.tests.aws.utils.constants import AUX_SUBNET, DEFAULT_SUBNET, \
@@ -143,8 +143,10 @@ def test_fills_out_amis(iam_client_stub, ec2_client_stub):
     stubs.configure_subnet_default(ec2_client_stub)

     config = helpers.load_aws_example_config_file("example-full.yaml")
-    del config["head_node"]["ImageId"]
-    del config["worker_nodes"]["ImageId"]
+    del config["available_node_types"]["ray.head.default"]["node_config"][
+        "ImageId"]
+    del config["available_node_types"]["ray.worker.default"]["node_config"][
+        "ImageId"]

     # Pass in SG for stub to work
     config["head_node"]["SecurityGroupIds"] = ["sg-1234abcd"]
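The deletions above track the schema move this PR makes: ImageId now lives under available_node_types.<type>.node_config instead of the top-level head_node/worker_nodes fields. A hedged helper sketching that lookup; the fallback order is illustrative, not Ray's actual resolution logic.

    def head_image_id(config):
        """Illustrative: prefer the multi-node-type schema, fall back to legacy."""
        try:
            return config["available_node_types"]["ray.head.default"][
                "node_config"]["ImageId"]
        except KeyError:
            return config["head_node"]["ImageId"]  # legacy schema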
@@ -1,4 +1,5 @@
 import json
+import jsonschema
 import os
 import shutil
 from subprocess import CalledProcessError
@@ -264,6 +265,55 @@ SMALL_CLUSTER = {
     "worker_start_ray_commands": ["start_ray_worker"],
 }

+MOCK_DEFAULT_CONFIG = {
+    "cluster_name": "default",
+    "max_workers": 2,
+    "upscaling_speed": 1.0,
+    "idle_timeout_minutes": 5,
+    "provider": {
+        "type": "mock",
+        "region": "us-east-1",
+        "availability_zone": "us-east-1a",
+    },
+    "docker": {
+        "image": "example",
+        "container_name": "mock",
+    },
+    "auth": {
+        "ssh_user": "ubuntu",
+        "ssh_private_key": os.devnull,
+    },
+    "available_node_types": {
+        "ray.head.default": {
+            "min_workers": 0,
+            "max_workers": 0,
+            "resources": {},
+            "node_config": {
+                "head_default_prop": 4
+            }
+        },
+        "ray.worker.default": {
+            "min_workers": 0,
+            "max_workers": 2,
+            "resources": {},
+            "node_config": {
+                "worker_default_prop": 7
+            }
+        }
+    },
+    "head_node_type": "ray.head.default",
+    "head_node": {},
+    "worker_nodes": {},
+    "file_mounts": {},
+    "cluster_synced_files": [],
+    "initialization_commands": [],
+    "setup_commands": [],
+    "head_setup_commands": [],
+    "worker_setup_commands": [],
+    "head_start_ray_commands": [],
+    "worker_start_ray_commands": [],
+}
+

 class LoadMetricsTest(unittest.TestCase):
     def testHeartbeat(self):
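MOCK_DEFAULT_CONFIG exists so later tests can pin down exactly what prepare_config merges against. A minimal sketch of the pattern, using only names that appear elsewhere in this diff (GET_DEFAULT_METHOD, SMALL_CLUSTER, prepare_config):

    import copy
    from unittest import mock

    from ray.autoscaler._private.util import prepare_config
    from ray.tests.test_autoscaler import SMALL_CLUSTER, MOCK_DEFAULT_CONFIG

    GET_DEFAULT_METHOD = "ray.autoscaler._private.util._get_default_config"

    # With the loader patched, merging is deterministic: defaults come from
    # MOCK_DEFAULT_CONFIG instead of a real provider's defaults.yaml.
    with mock.patch(GET_DEFAULT_METHOD, return_value=MOCK_DEFAULT_CONFIG):
        prepared = prepare_config(copy.deepcopy(SMALL_CLUSTER))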
@@ -1645,6 +1695,28 @@ class AutoscalingTest(unittest.TestCase):
             config_path, LoadMetrics(), max_failures=0, update_interval_s=0)
         assert isinstance(autoscaler.provider, NodeProvider)

+    def testLegacyExternalNodeScalerMissingFields(self):
+        """Should fail to validate legacy external config with missing
+        head_node, worker_nodes, or both."""
+        external_config = copy.deepcopy(SMALL_CLUSTER)
+        external_config["provider"] = {
+            "type": "external",
+            "module": "ray.autoscaler.node_provider.NodeProvider",
+        }
+
+        missing_workers, missing_head, missing_both = [
+            copy.deepcopy(external_config) for _ in range(3)
+        ]
+        del missing_workers["worker_nodes"]
+        del missing_head["head_node"]
+        del missing_both["worker_nodes"]
+        del missing_both["head_node"]
+
+        for faulty_config in missing_workers, missing_head, missing_both:
+            faulty_config = prepare_config(faulty_config)
+            with pytest.raises(jsonschema.ValidationError):
+                validate_config(faulty_config)
+
     def testExternalNodeScalerWrongImport(self):
         config = SMALL_CLUSTER.copy()
         config["provider"] = {
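For readers unfamiliar with jsonschema, the ValidationError the new test expects is the library's standard failure mode when a required key is absent. A toy illustration; this schema is a stand-in, not Ray's actual autoscaler schema.

    import jsonschema

    schema = {"type": "object", "required": ["head_node", "worker_nodes"]}

    try:
        # Missing "worker_nodes", like the faulty configs above.
        jsonschema.validate(instance={"head_node": {}}, schema=schema)
    except jsonschema.ValidationError as err:
        print("rejected:", err.message)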
@@ -1,4 +1,5 @@
 import jsonschema
+import logging
 import os
 import sys
 import tempfile
@@ -9,10 +10,12 @@ import copy
 from unittest.mock import MagicMock, Mock, patch
 import pytest

-from ray.autoscaler._private.util import prepare_config, validate_config
+from ray.autoscaler._private.util import prepare_config, validate_config,\
+    _get_default_config, merge_setup_commands
 from ray.autoscaler._private.providers import _NODE_PROVIDERS
 from ray.autoscaler._private.kubernetes.node_provider import\
     KubernetesNodeProvider
+from ray.autoscaler.tags import NODE_TYPE_LEGACY_HEAD, NODE_TYPE_LEGACY_WORKER

 from ray.test_utils import load_test_config, recursive_fnmatch
@@ -37,18 +40,19 @@ CONFIG_PATHS = ignore_k8s_operator_configs(CONFIG_PATHS)
 class AutoscalingConfigTest(unittest.TestCase):
     def testValidateDefaultConfig(self):
         for config_path in CONFIG_PATHS:
-            if "aws/example-multi-node-type.yaml" in config_path:
-                # aws is tested in testValidateDefaultConfigAWSMultiNodeTypes.
-                continue
-            with open(config_path) as f:
-                config = yaml.safe_load(f)
-            config = prepare_config(config)
-            if config["provider"]["type"] == "kubernetes":
-                KubernetesNodeProvider.fillout_available_node_types_resources(
-                    config)
             try:
+                if "aws/example-multi-node-type.yaml" in config_path:
+                    # aws tested in testValidateDefaultConfigAWSMultiNodeTypes.
+                    continue
+                with open(config_path) as f:
+                    config = yaml.safe_load(f)
+                config = prepare_config(config)
+                if config["provider"]["type"] == "kubernetes":
+                    KubernetesNodeProvider.\
+                        fillout_available_node_types_resources(config)
                 validate_config(config)
             except Exception:
+                logging.exception("")
                 self.fail(
                     f"Config {config_path} did not pass validation test!")
@@ -232,7 +236,6 @@ class AutoscalingConfigTest(unittest.TestCase):
             self.fail("Failed to validate config with security group name!")

     def testMaxWorkerDefault(self):
-
         # Load config, call prepare config, check that default max_workers
         # is filled correctly for node types that don't specify it.
         # Check that max_workers is untouched for node types
@@ -254,7 +257,7 @@ class AutoscalingConfigTest(unittest.TestCase):
         # Max workers auto-filled with specified cluster-wide value of 5.
         assert config["max_workers"] ==\
             prepared_node_types["worker_node_max_unspecified"]["max_workers"]\
-            == config["max_workers"] == 5
+            == 5

         # Repeat with a config that doesn't specify global max workers.
         # Default value of 2 should be pulled in for global max workers.
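The assertions in these two hunks encode a simple fill rule for max_workers. A sketch of that rule under stated assumptions; this is illustrative, not the autoscaler's real code path.

    def fill_default_max_workers(config, global_default=2):
        # The cluster-wide max_workers falls back to the schema default (2)...
        config.setdefault("max_workers", global_default)
        # ...and node types that omit max_workers inherit the cluster value,
        # while explicit per-type values (e.g. 3 above) are left untouched.
        for node_type in config.get("available_node_types", {}).values():
            node_type.setdefault("max_workers", config["max_workers"])
        return config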
@@ -275,8 +278,87 @@ class AutoscalingConfigTest(unittest.TestCase):
             prepared_node_types["worker_node_max_specified"][
                 "max_workers"] == 3
         # Max workers auto-filled with default cluster-wide value of 2.
-        assert prepared_node_types["worker_node_max_unspecified"][
-            "max_workers"] == 2
+        assert prepared_config["max_workers"] ==\
+            prepared_node_types["worker_node_max_unspecified"]["max_workers"]\
+            == 2
+
+    def testFillEdgeLegacyConfigs(self):
+        # Test edge cases: legacy configs which specify workers but not head
+        # or vice-versa.
+        no_head = load_test_config("test_no_head.yaml")
+        aws_defaults = _get_default_config(no_head["provider"])
+        head_prepared = prepare_config(no_head)
+        assert head_prepared["available_node_types"][
+            "ray-legacy-head-node-type"]["node_config"] ==\
+            aws_defaults["available_node_types"][
+                "ray.head.default"]["node_config"]
+        assert head_prepared["head_node"] == {}
+        # Custom worker config preserved
+        node_types = head_prepared["available_node_types"]
+        worker_type = node_types["ray-legacy-worker-node-type"]
+        assert worker_type["node_config"] == head_prepared["worker_nodes"] == {
+            "foo": "bar"
+        }
+
+        no_workers = load_test_config("test_no_workers.yaml")
+        workers_prepared = prepare_config(no_workers)
+        assert workers_prepared["available_node_types"][
+            "ray-legacy-worker-node-type"]["node_config"] ==\
+            aws_defaults["available_node_types"][
+                "ray.worker.default"]["node_config"]
+        assert workers_prepared["worker_nodes"] == {}
+        # Custom head config preserved
+        node_types = workers_prepared["available_node_types"]
+        head_type = node_types["ray-legacy-head-node-type"]
+        assert head_type["node_config"] == workers_prepared["head_node"] == {
+            "baz": "qux"
+        }
+
+    @pytest.mark.skipif(
+        sys.platform.startswith("win"), reason="Fails on Windows.")
+    def testExampleFull(self):
+        """
+        Test that example-full yamls are unmodified by prepared_config,
+        except possibly by having setup_commands merged.
+        """
+        providers = ["aws", "gcp", "azure"]
+        for provider in providers:
+            path = os.path.join(RAY_PATH, "autoscaler", provider,
+                                "example-full.yaml")
+            config = yaml.safe_load(open(path).read())
+            config_copy = copy.deepcopy(config)
+            merge_setup_commands(config_copy)
+            assert config_copy == prepare_config(config)
+
+    @pytest.mark.skipif(
+        sys.platform.startswith("win"), reason="Fails on Windows.")
+    def testLegacyYaml(self):
+        # Test correct default-merging behavior for legacy yamls.
+        providers = ["aws", "gcp", "azure"]
+        for provider in providers:
+            path = os.path.join(RAY_PATH, "autoscaler", provider,
+                                "example-full-legacy.yaml")
+            legacy_config = yaml.safe_load(open(path).read())
+            # custom head and workers
+            legacy_config["head_node"] = {"blahblah": 0}
+            legacy_config["worker_nodes"] = {"halbhalhb": 0}
+            legacy_config_copy = copy.deepcopy(legacy_config)
+            prepared_legacy = prepare_config(legacy_config_copy)
+            assert prepared_legacy["available_node_types"][
+                NODE_TYPE_LEGACY_HEAD]["max_workers"] == 0
+            assert prepared_legacy["available_node_types"][
+                NODE_TYPE_LEGACY_HEAD]["min_workers"] == 0
+            assert prepared_legacy["available_node_types"][
+                NODE_TYPE_LEGACY_HEAD]["node_config"] == legacy_config[
+                    "head_node"]
+
+            assert prepared_legacy["available_node_types"][
+                NODE_TYPE_LEGACY_WORKER]["max_workers"] == 2
+            assert prepared_legacy["available_node_types"][
+                NODE_TYPE_LEGACY_WORKER]["min_workers"] == 0
+            assert prepared_legacy["available_node_types"][
+                NODE_TYPE_LEGACY_WORKER]["node_config"] == legacy_config[
+                    "worker_nodes"]


 if __name__ == "__main__":
python/ray/tests/test_cli_patterns/test_no_head.yaml  (new file, 123 lines)
@@ -0,0 +1,123 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default

# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0

# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0

# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
    image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
    # image: rayproject/ray:latest-gpu   # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_container"
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
    pull_before_run: True
    run_options: []  # Extra options to pass into "docker run"

    # Example of running a GPU head with CPU workers
    # head_image: "rayproject/ray-ml:latest-gpu"
    # Allow Ray to automatically detect GPUs

    # worker_image: "rayproject/ray-ml:latest-cpu"
    # worker_run_options: []

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider specific configuration.
provider:
    type: aws
    region: us-west-2
    # Availability zone(s), comma-separated, that nodes may be launched in.
    # Nodes are currently spread between zones by a round-robin approach,
    # however this implementation detail should not be relied upon.
    availability_zone: us-west-2a,us-west-2b
    # Whether to allow node reuse. If set to False, nodes will be terminated
    # instead of stopped.
    cache_stopped_nodes: True # If not present, the default is True.

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
#    ssh_private_key: /path/to/your/key.pem

# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
    foo: bar

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
#    "/path1/on/remote/machine": "/path1/on/local/machine",
#    "/path2/on/remote/machine": "/path2/on/local/machine",
}

# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []

# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False

# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
    - "**/.git"
    - "**/.git/**"

# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
    - ".gitignore"

# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []

# List of shell commands to run to set up nodes.
setup_commands: []
    # Note: if you're developing Ray, you probably want to create a Docker image that
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
    # Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl

# Custom commands that will be run on the head node after common setup.
head_setup_commands: []

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
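The upscaling_speed comment in the YAML above implies chunked scale-up. Roughly, and as a simplification rather than the autoscaler's exact formula:

    def next_target(current_nodes, desired_nodes, upscaling_speed=1.0):
        # Scale up in chunks of upscaling_speed * currently_running_nodes,
        # always making progress even from a tiny cluster.
        chunk = max(1, int(upscaling_speed * max(current_nodes, 1)))
        return min(desired_nodes, current_nodes + chunk)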
python/ray/tests/test_cli_patterns/test_no_workers.yaml  (new file, 124 lines)
@@ -0,0 +1,124 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default

# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0

# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0

# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
    image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
    # image: rayproject/ray:latest-gpu   # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_container"
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
    pull_before_run: True
    run_options: []  # Extra options to pass into "docker run"

    # Example of running a GPU head with CPU workers
    # head_image: "rayproject/ray-ml:latest-gpu"
    # Allow Ray to automatically detect GPUs

    # worker_image: "rayproject/ray-ml:latest-cpu"
    # worker_run_options: []

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider specific configuration.
provider:
    type: aws
    region: us-west-2
    # Availability zone(s), comma-separated, that nodes may be launched in.
    # Nodes are currently spread between zones by a round-robin approach,
    # however this implementation detail should not be relied upon.
    availability_zone: us-west-2a,us-west-2b
    # Whether to allow node reuse. If set to False, nodes will be terminated
    # instead of stopped.
    cache_stopped_nodes: True # If not present, the default is True.

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
#    ssh_private_key: /path/to/your/key.pem

# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
    baz: qux

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
#    "/path1/on/remote/machine": "/path1/on/local/machine",
#    "/path2/on/remote/machine": "/path2/on/local/machine",
}

# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []

# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False

# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
    - "**/.git"
    - "**/.git/**"

# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
    - ".gitignore"

# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []

# List of shell commands to run to set up nodes.
setup_commands: []
    # Note: if you're developing Ray, you probably want to create a Docker image that
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
    # Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl

# Custom commands that will be run on the head node after common setup.
head_setup_commands: []

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -195,4 +195,4 @@ class KubernetesOperatorTest(unittest.TestCase):

 if __name__ == "__main__":
     kubernetes.config.load_kube_config()
-    sys.exit(pytest.main(["-v", __file__]))
+    sys.exit(pytest.main(["-sv", __file__]))
@@ -5,15 +5,16 @@ import yaml
 import tempfile
 import shutil
 import unittest
+from unittest import mock
 import copy

 import ray
 import ray.ray_constants
 from ray.autoscaler._private.util import \
-    rewrite_legacy_yaml_to_available_node_types, format_info_string, \
+    prepare_config, format_info_string, \
     format_info_string_no_node_types
-from ray.tests.test_autoscaler import SMALL_CLUSTER, MockProvider, \
-    MockProcessRunner
+from ray.tests.test_autoscaler import SMALL_CLUSTER, MOCK_DEFAULT_CONFIG, \
+    MockProvider, MockProcessRunner
 from ray.autoscaler._private.providers import (_NODE_PROVIDERS,
                                                _clear_provider_cache)
 from ray.autoscaler._private.autoscaler import StandardAutoscaler, \
@@ -38,6 +39,8 @@ from ray.autoscaler._private.constants import \

 from time import sleep

+GET_DEFAULT_METHOD = "ray.autoscaler._private.util._get_default_config"
+
 TYPES_A = {
     "empty_node": {
         "node_config": {
@@ -1042,131 +1045,135 @@ def test_get_nodes_to_launch_max_launch_concurrency():


def test_rewrite_legacy_yaml_to_available_node_types():
    with mock.patch(GET_DEFAULT_METHOD, return_value=MOCK_DEFAULT_CONFIG):
        cluster_config = copy.deepcopy(SMALL_CLUSTER)  # Legacy cluster_config.
        cluster_config = prepare_config(cluster_config)
        assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
            "max_workers"] == 0
        assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
            "min_workers"] == 0
        assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
            "node_config"] == SMALL_CLUSTER["head_node"]

        assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
            "node_config"] == SMALL_CLUSTER["worker_nodes"]
        assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
            "max_workers"] == SMALL_CLUSTER["max_workers"]
        assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
            "min_workers"] == SMALL_CLUSTER["min_workers"]


def test_handle_legacy_cluster_config_yaml():
    with mock.patch(GET_DEFAULT_METHOD, return_value=MOCK_DEFAULT_CONFIG):
        provider = MockProvider()
        head_resources = {"CPU": 8, "GPU": 1}
        worker_resources = {"CPU": 32, "GPU": 8}
        cluster_config = copy.deepcopy(SMALL_CLUSTER)  # Legacy cluster_config.
        cluster_config = prepare_config(cluster_config)
        scheduler = ResourceDemandScheduler(
            provider,
            cluster_config["available_node_types"],
            0,
            head_node_type=NODE_TYPE_LEGACY_HEAD)
        provider.create_node({}, {
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
            TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_HEAD
        }, 1)
        head_ip = provider.non_terminated_node_ips({})[0]
        head_node_id = provider.non_terminated_nodes({})[0]
        to_launch = scheduler.get_nodes_to_launch([], {}, [], {}, [],
                                                  {head_ip: head_resources})
        assert to_launch == {}  # Should always be empty with max_workers = 0.

        scheduler.max_workers = 30
        min_workers = scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
            "min_workers"]
        scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"] = 0
        to_launch = scheduler.get_nodes_to_launch(
            [head_node_id], {}, [], {}, [], {head_ip: head_resources})
        assert to_launch == {
        }  # Since the resource demand does not require adding nodes.
        to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
                                                  [head_resources], {}, [],
                                                  {head_ip: head_resources})
        assert to_launch == {
        }  # Since the resource demand does not require adding nodes.

        scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
            "min_workers"] = min_workers
        # Returns min_workers when min_workers>0.
        to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
                                                  [head_resources], {}, [],
                                                  {head_ip: head_resources})
        assert to_launch == {NODE_TYPE_LEGACY_WORKER: min_workers}

        provider.create_node({}, {
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
            TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED,
            TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_WORKER
        }, min_workers)
        nodes = provider.non_terminated_nodes({})
        to_launch = scheduler.get_nodes_to_launch(
            nodes, {}, [head_resources], {}, [], {head_ip: head_resources})
        # A node is running, at some point it'll connect.
        assert to_launch == {}
        pending_launches = {NODE_TYPE_LEGACY_WORKER: 4}
        to_launch = scheduler.get_nodes_to_launch([], pending_launches,
                                                  [head_resources], {}, [],
                                                  {head_ip: head_resources})
        # A node is launching, at some point it'll connect.
        assert to_launch == {}

        # Now assume that we already launched/connected the nodes.
        ips = provider.non_terminated_node_ips({})
        lm = LoadMetrics()
        worker_ips = []
        for ip in ips:
            if ip == head_ip:
                lm.update(ip, head_resources, head_resources, {})
            else:
                lm.update(ip, worker_resources, worker_resources, {})
                worker_ips.append(ip)

        assert not scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["resources"]
        to_launch = scheduler.get_nodes_to_launch(
            nodes, {}, [], {}, [], lm.get_static_node_resources_by_ip())
        assert scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
            "resources"] == worker_resources
        assert to_launch == {}
        utilizations = {ip: worker_resources for ip in worker_ips}
        utilizations[head_ip] = head_resources
        # Needs 4 nodes since worker resources is bigger than head resources.
        demands = [worker_resources] * (len(utilizations) + 3)
        to_launch = scheduler.get_nodes_to_launch(
            nodes, {}, demands, utilizations, [],
            lm.get_static_node_resources_by_ip())
        # 4 nodes are necessary to meet resource demand, but we never exceed
        # max_workers.
        assert to_launch == {}
        scheduler.max_workers = 10
        to_launch = scheduler.get_nodes_to_launch(
            nodes, {}, demands, utilizations, [],
            lm.get_static_node_resources_by_ip())
        # 4 nodes are necessary to meet resource demand, but we never exceed
        # max_workers.
        assert to_launch == {}
        scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["max_workers"] = 10
        to_launch = scheduler.get_nodes_to_launch(
            nodes, {}, demands, utilizations, [],
            lm.get_static_node_resources_by_ip())
        # 4 nodes are necessary to meet resource demand.
        assert to_launch == {NODE_TYPE_LEGACY_WORKER: 4}
        to_launch = scheduler.get_nodes_to_launch(nodes, pending_launches,
                                                  demands, utilizations, [],
                                                  lm.get_node_resources())
        # 0 because there are 4 pending launches and we only need 4.
        assert to_launch == {}
        to_launch = scheduler.get_nodes_to_launch(
            nodes, pending_launches, demands * 2, utilizations, [],
            lm.get_node_resources())
        # 1 because there are 4 pending launches and we only allow a max of 5.
        assert to_launch == {NODE_TYPE_LEGACY_WORKER: 1}


class LoadMetricsTest(unittest.TestCase):
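The comments in the test above ("we never exceed max_workers", "we only allow a max of 5") boil down to two caps on each launch decision. A hedged sketch of that arithmetic, as a simplification of ResourceDemandScheduler's real logic:

    def workers_to_launch(needed, running, pending, max_workers,
                          max_concurrent=5):
        # Cap by the configured max_workers (running and pending nodes both
        # count against it)...
        by_max_workers = max_workers - running - pending
        # ...and by the concurrent-launch limit (pending launches count too).
        by_concurrency = max_concurrent - pending
        return max(0, min(needed, by_max_workers, by_concurrency))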