mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
[autoscaler][interface] Use multi node types in defaults.yaml and example-full.yaml (#14239)
* random doc typo * example-full-multi * left off max workers * wip * address comments, modify defaults, wip * fix * wip * reformat more things * undo useless diff * space * max workers * space * copy-paste mishaps * space * More copy-paste mishaps * copy-paste issues, space, max_workers * head_node_type * legacy yamls * line undeleted * correct-gpu * Remove redundant GPU example. * Extraneous comment * whitespace * example-java.yaml * Revert "example-java.yaml" This reverts commit 1e9c0124b9d97e651aaeeb6ec5bf7a4ef2a2df17. * tests and other things * doc * doc * revert max worker default * Kubernetes comment * wip * wip * tweak * Address comments * test_resource_demand_scheduler fixes * Head type min/max workers, aws resources * fix example_cluster2.yaml * Fix external node type test (compatibility with legacy-style external node types) * fix test_autoscaler_aws * gcp-images * gcp node type names * fix gcp defaults * doc format * typo * Skip failed Windows tests * doc string and comment * assert * remove contents of default external head and worker * legacy external failed validation test * Readability -- define the minimal external config at the top of the file. * Remove default worker type min worker * Remove extraneous global min_workers comment. * per-node-type docker in aws/example-gpu-docker * ray.worker.small -> ray.worker.default * fix-docker * fix gpu docker again * undo kubernetes experiment * fix doc * remove worker max_worker from kubernetes * remove max_worker from local worker node type * fix doc again * py38 * eric-comment * fix cluster name * fix-test-autoscaler * legacy config logic * pop resources * Remove min_workers AFTER merge * comment, warning message * warning, comment
This commit is contained in:
parent
ef873be9e8
commit
1675156a8b
32 changed files with 1774 additions and 715 deletions
|
@ -341,14 +341,13 @@ The key is the name of the node type, which is just for debugging purposes.
|
|||
resources: {"CPU": 2}
|
||||
min_workers: 0
|
||||
max_workers: 0
|
||||
ray.worker.small:
|
||||
ray.worker.default:
|
||||
node_config:
|
||||
InstanceType: m5.large
|
||||
InstanceMarketOptions:
|
||||
MarketType: spot
|
||||
resources: {"CPU": 2}
|
||||
min_workers: 0
|
||||
max_workers: 1
|
||||
|
||||
.. _cluster-configuration-head-node-type:
|
||||
|
||||
|
@ -1073,12 +1072,12 @@ Minimal configuration
|
|||
:language: yaml
|
||||
|
||||
.. group-tab:: Azure
|
||||
|
||||
|
||||
.. literalinclude:: ../../../python/ray/autoscaler/azure/example-minimal.yaml
|
||||
:language: yaml
|
||||
|
||||
.. group-tab:: GCP
|
||||
|
||||
|
||||
.. literalinclude:: ../../../python/ray/autoscaler/gcp/example-minimal.yaml
|
||||
:language: yaml
|
||||
|
||||
|
@ -1092,11 +1091,11 @@ Full configuration
|
|||
:language: yaml
|
||||
|
||||
.. group-tab:: Azure
|
||||
|
||||
|
||||
.. literalinclude:: ../../../python/ray/autoscaler/azure/example-full.yaml
|
||||
:language: yaml
|
||||
|
||||
.. group-tab:: GCP
|
||||
|
||||
|
||||
.. literalinclude:: ../../../python/ray/autoscaler/gcp/example-full.yaml
|
||||
:language: yaml
|
||||
|
|
|
@ -71,8 +71,14 @@ def fillout_resources_kubernetes(config):
|
|||
return config
|
||||
node_types = copy.deepcopy(config["available_node_types"])
|
||||
for node_type in node_types:
|
||||
container_data = node_types[node_type]["node_config"]["spec"][
|
||||
"containers"][0]
|
||||
|
||||
node_config = node_types[node_type]["node_config"]
|
||||
# The next line is for compatibility with configs like
|
||||
# kubernetes/example-ingress.yaml,
|
||||
# cf. KubernetesNodeProvider.create_node().
|
||||
pod = node_config.get("pod", node_config)
|
||||
container_data = pod["spec"]["containers"][0]
|
||||
|
||||
autodetected_resources = get_autodetected_resources(container_data)
|
||||
if "resources" not in config["available_node_types"][node_type]:
|
||||
config["available_node_types"][node_type]["resources"] = {}
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import copy
|
||||
import importlib
|
||||
import logging
|
||||
import json
|
||||
|
@ -11,6 +12,17 @@ logger = logging.getLogger(__name__)
|
|||
# For caching provider instantiations across API calls of one python session
|
||||
_provider_instances = {}
|
||||
|
||||
# Minimal config for compatibility with legacy-style external configs.
|
||||
MINIMAL_EXTERNAL_CONFIG = {
|
||||
"available_node_types": {
|
||||
"ray.head.default": {},
|
||||
"ray.worker.default": {},
|
||||
},
|
||||
"head_node_type": "ray.head.default",
|
||||
"head_node": {},
|
||||
"worker_nodes": {},
|
||||
}
|
||||
|
||||
|
||||
def _import_aws(provider_config):
|
||||
from ray.autoscaler._private.aws.node_provider import AWSNodeProvider
|
||||
|
@ -192,7 +204,7 @@ def _get_default_config(provider_config):
|
|||
package outside the autoscaler.
|
||||
"""
|
||||
if provider_config["type"] == "external":
|
||||
return {}
|
||||
return copy.deepcopy(MINIMAL_EXTERNAL_CONFIG)
|
||||
load_config = _DEFAULT_CONFIGS.get(provider_config["type"])
|
||||
if load_config is None:
|
||||
raise NotImplementedError("Unsupported node provider: {}".format(
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import collections
|
||||
import copy
|
||||
from datetime import datetime
|
||||
import logging
|
||||
import hashlib
|
||||
|
@ -103,38 +104,91 @@ def prepare_config(config):
|
|||
return with_defaults
|
||||
|
||||
|
||||
def rewrite_legacy_yaml_to_available_node_types(
|
||||
config: Dict[str, Any]) -> Dict[str, Any]:
|
||||
|
||||
if "available_node_types" not in config:
|
||||
# TODO(ameer/ekl/alex): we can also rewrite here many other fields
|
||||
# that include initialization/setup/start commands and ImageId.
|
||||
logger.debug("Converting legacy cluster config to multi node types.")
|
||||
config["available_node_types"] = {
|
||||
NODE_TYPE_LEGACY_HEAD: {
|
||||
"node_config": config["head_node"],
|
||||
"resources": config["head_node"].get("resources") or {},
|
||||
"min_workers": 0,
|
||||
"max_workers": 0,
|
||||
},
|
||||
NODE_TYPE_LEGACY_WORKER: {
|
||||
"node_config": config["worker_nodes"],
|
||||
"resources": config["worker_nodes"].get("resources") or {},
|
||||
"min_workers": config.get("min_workers", 0),
|
||||
"max_workers": config.get("max_workers", 0),
|
||||
},
|
||||
}
|
||||
config["head_node_type"] = NODE_TYPE_LEGACY_HEAD
|
||||
del config["min_workers"]
|
||||
return config
|
||||
|
||||
|
||||
def fillout_defaults(config: Dict[str, Any]) -> Dict[str, Any]:
|
||||
defaults = _get_default_config(config["provider"])
|
||||
defaults.update(config)
|
||||
defaults["auth"] = defaults.get("auth", {})
|
||||
defaults = rewrite_legacy_yaml_to_available_node_types(defaults)
|
||||
return defaults
|
||||
|
||||
# Just for clarity:
|
||||
merged_config = copy.deepcopy(defaults)
|
||||
|
||||
# Fill auth field to avoid key errors.
|
||||
# This field is accessed when calling NodeUpdater but is not relevant to
|
||||
# certain node providers and is thus left out of some cluster launching
|
||||
# configs.
|
||||
merged_config["auth"] = merged_config.get("auth", {})
|
||||
|
||||
# A legacy config is one which doesn't have available_node_types,
|
||||
# but has at least one of head_node or worker_nodes.
|
||||
is_legacy_config = (("available_node_types" not in config) and
|
||||
("head_node" in config or "worker_nodes" in config))
|
||||
# Do merging logic for legacy configs.
|
||||
if is_legacy_config:
|
||||
merged_config = merge_legacy_yaml_with_defaults(merged_config)
|
||||
# Take care of this here, in case a config does not specify any of head,
|
||||
# workers, node types, but does specify min workers:
|
||||
merged_config.pop("min_workers", None)
|
||||
|
||||
return merged_config
|
||||
|
||||
|
||||
def merge_legacy_yaml_with_defaults(
|
||||
merged_config: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Rewrite legacy config's available node types after it has been merged
|
||||
with defaults yaml.
|
||||
"""
|
||||
logger.warning("Converting legacy cluster config to multi node types.\n"
|
||||
"Refer to the docs for examples of multi-node-type "
|
||||
"autoscaling:\n"
|
||||
"https://docs.ray.io/en/master/cluster/config.html"
|
||||
"#full-configuration")
|
||||
|
||||
# Get default head and worker types.
|
||||
default_head_type = merged_config["head_node_type"]
|
||||
# Default configs are assumed to have two node types -- one for the head
|
||||
# and one for the workers.
|
||||
assert len(merged_config["available_node_types"].keys()) == 2
|
||||
default_worker_type = (merged_config["available_node_types"].keys() -
|
||||
{default_head_type}).pop()
|
||||
|
||||
if merged_config["head_node"]:
|
||||
# User specified a head node in legacy config.
|
||||
# Convert it into data for the head's node type.
|
||||
head_node_info = {
|
||||
"node_config": merged_config["head_node"],
|
||||
"resources": merged_config["head_node"].get("resources") or {},
|
||||
"min_workers": 0,
|
||||
"max_workers": 0,
|
||||
}
|
||||
else:
|
||||
# Use default data for the head's node type.
|
||||
head_node_info = merged_config["available_node_types"][
|
||||
default_head_type]
|
||||
if merged_config["worker_nodes"]:
|
||||
# User specified a worker node in legacy config.
|
||||
# Convert it into data for the workers' node type.
|
||||
worker_node_info = {
|
||||
"node_config": merged_config["worker_nodes"],
|
||||
"resources": merged_config["worker_nodes"].get("resources") or {},
|
||||
"min_workers": merged_config.get("min_workers", 0),
|
||||
"max_workers": merged_config["max_workers"],
|
||||
}
|
||||
else:
|
||||
# Use default data for the workers' node type.
|
||||
worker_node_info = merged_config["available_node_types"][
|
||||
default_worker_type]
|
||||
|
||||
# Rewrite available_node_types.
|
||||
merged_config["available_node_types"] = {
|
||||
NODE_TYPE_LEGACY_HEAD: head_node_info,
|
||||
NODE_TYPE_LEGACY_WORKER: worker_node_info
|
||||
}
|
||||
merged_config["head_node_type"] = NODE_TYPE_LEGACY_HEAD
|
||||
|
||||
# Resources field in head/worker fields cause node launch to fail.
|
||||
merged_config["head_node"].pop("resources", None)
|
||||
merged_config["worker_nodes"].pop("resources", None)
|
||||
|
||||
return merged_config
|
||||
|
||||
|
||||
def merge_setup_commands(config):
|
||||
|
@ -147,7 +201,6 @@ def merge_setup_commands(config):
|
|||
|
||||
def fill_node_type_max_workers(config):
|
||||
"""Sets default per-node max workers to global max_workers.
|
||||
|
||||
This equivalent to setting the default per-node max workers to infinity,
|
||||
with the only upper constraint coming from the global max_workers.
|
||||
"""
|
||||
|
|
|
@ -1,12 +1,8 @@
|
|||
# An unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: default
|
||||
|
||||
# The minimum number of workers nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of workers nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
# node.
|
||||
max_workers: 2
|
||||
|
||||
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
||||
|
@ -43,38 +39,63 @@ auth:
|
|||
# configurations below.
|
||||
# ssh_private_key: /path/to/your/key.pem
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
head_node:
|
||||
InstanceType: m5.large
|
||||
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
|
||||
# Tell the autoscaler the allowed node types and the resources they provide.
|
||||
# The key is the name of the node type, which is just for debugging purposes.
|
||||
# The node config specifies the launch config and physical instance type.
|
||||
available_node_types:
|
||||
ray.head.default:
|
||||
# The minimum number of worker nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The maximum number of worker nodes of this type to launch.
|
||||
# This takes precedence over min_workers.
|
||||
max_workers: 0
|
||||
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
|
||||
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
|
||||
# You can also set custom resources.
|
||||
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
|
||||
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
|
||||
resources: {}
|
||||
# Provider-specific config for this node type, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
node_config:
|
||||
InstanceType: m5.large
|
||||
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
|
||||
# You can provision additional disk space with a conf as follows
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 100
|
||||
# Additional options in the boto docs.
|
||||
ray.worker.default:
|
||||
# The minimum number of nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
|
||||
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
|
||||
# You can also set custom resources.
|
||||
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
|
||||
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
|
||||
resources: {}
|
||||
# Provider-specific config for this node type, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
node_config:
|
||||
InstanceType: m5.large
|
||||
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
|
||||
# Run workers on spot by default. Comment this out to use on-demand.
|
||||
InstanceMarketOptions:
|
||||
MarketType: spot
|
||||
# Additional options can be found in the boto docs, e.g.
|
||||
# SpotOptions:
|
||||
# MaxPrice: MAX_HOURLY_PRICE
|
||||
# Additional options in the boto docs.
|
||||
|
||||
# You can provision additional disk space with a conf as follows
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 100
|
||||
|
||||
# Additional options in the boto docs.
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
worker_nodes:
|
||||
InstanceType: m5.large
|
||||
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
|
||||
|
||||
# Run workers on spot by default. Comment this out to use on-demand.
|
||||
InstanceMarketOptions:
|
||||
MarketType: spot
|
||||
# Additional options can be found in the boto docs, e.g.
|
||||
# SpotOptions:
|
||||
# MaxPrice: MAX_HOURLY_PRICE
|
||||
|
||||
# Additional options in the boto docs.
|
||||
# Specify the node type of the head node (as configured above).
|
||||
head_node_type: ray.head.default
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
|
@ -108,15 +129,8 @@ initialization_commands: []
|
|||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands:
|
||||
# Note: if you're developing Ray, you probably want to create an AMI that
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
- echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
|
||||
# Consider uncommenting these if you also want to run apt-get commands during setup
|
||||
# - sudo pkill -9 apt-get || true
|
||||
# - sudo pkill -9 dpkg || true
|
||||
# - sudo dpkg --configure -a
|
||||
- echo 'export PATH="$HOME/anaconda3/envs/tensorflow2_latest_p37/bin:$PATH"' >> ~/.bashrc
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
|
@ -134,3 +148,6 @@ head_start_ray_commands:
|
|||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
||||
|
||||
head_node: {}
|
||||
worker_nodes: {}
|
||||
|
|
148
python/ray/autoscaler/aws/example-full-legacy.yaml
Normal file
148
python/ray/autoscaler/aws/example-full-legacy.yaml
Normal file
|
@ -0,0 +1,148 @@
|
|||
# An unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: default
|
||||
|
||||
# The minimum number of workers nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of workers nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
|
||||
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
||||
# E.g., if the task requires adding more nodes then autoscaler will gradually
|
||||
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
|
||||
# This number should be > 0.
|
||||
upscaling_speed: 1.0
|
||||
|
||||
# This executes all commands on all nodes in the docker container,
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
docker:
|
||||
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
|
||||
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
||||
container_name: "ray_container"
|
||||
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
|
||||
# if no cached version is present.
|
||||
pull_before_run: True
|
||||
run_options: [] # Extra options to pass into "docker run"
|
||||
|
||||
# Example of running a GPU head with CPU workers
|
||||
# head_image: "rayproject/ray-ml:latest-gpu"
|
||||
# Allow Ray to automatically detect GPUs
|
||||
|
||||
# worker_image: "rayproject/ray-ml:latest-cpu"
|
||||
# worker_run_options: []
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: aws
|
||||
region: us-west-2
|
||||
# Availability zone(s), comma-separated, that nodes may be launched in.
|
||||
# Nodes are currently spread between zones by a round-robin approach,
|
||||
# however this implementation detail should not be relied upon.
|
||||
availability_zone: us-west-2a,us-west-2b
|
||||
# Whether to allow node reuse. If set to False, nodes will be terminated
|
||||
# instead of stopped.
|
||||
cache_stopped_nodes: True # If not present, the default is True.
|
||||
|
||||
# How Ray will authenticate with newly launched nodes.
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
# By default Ray creates a new private keypair, but you can also use your own.
|
||||
# If you do so, make sure to also set "KeyName" in the head and worker node
|
||||
# configurations below.
|
||||
# ssh_private_key: /path/to/your/key.pem
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
head_node:
|
||||
InstanceType: m5.large
|
||||
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
|
||||
|
||||
# You can provision additional disk space with a conf as follows
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 100
|
||||
|
||||
# Additional options in the boto docs.
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
worker_nodes:
|
||||
InstanceType: m5.large
|
||||
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
|
||||
|
||||
# Run workers on spot by default. Comment this out to use on-demand.
|
||||
InstanceMarketOptions:
|
||||
MarketType: spot
|
||||
# Additional options can be found in the boto docs, e.g.
|
||||
# SpotOptions:
|
||||
# MaxPrice: MAX_HOURLY_PRICE
|
||||
|
||||
# Additional options in the boto docs.
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
}
|
||||
|
||||
# Files or directories to copy from the head node to the worker nodes. The format is a
|
||||
# list of paths. The same path on the head node will be copied to the worker node.
|
||||
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
|
||||
# you should just use file_mounts. Only use this if you know what you're doing!
|
||||
cluster_synced_files: []
|
||||
|
||||
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
|
||||
# should sync to the worker node continuously
|
||||
file_mounts_sync_continuously: False
|
||||
|
||||
# Patterns for files to exclude when running rsync up or rsync down
|
||||
rsync_exclude:
|
||||
- "**/.git"
|
||||
- "**/.git/**"
|
||||
|
||||
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
|
||||
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
|
||||
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
|
||||
rsync_filter:
|
||||
- ".gitignore"
|
||||
|
||||
# List of commands that will be run before `setup_commands`. If docker is
|
||||
# enabled, these commands will run outside the container and before docker
|
||||
# is setup.
|
||||
initialization_commands: []
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands: []
|
||||
# Note: if you're developing Ray, you probably want to create a Docker image that
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands: []
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
||||
# Command to start ray on the head node. You don't need to change this.
|
||||
head_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
|
||||
|
||||
# Command to start ray on worker nodes. You don't need to change this.
|
||||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
|
@ -1,12 +1,8 @@
|
|||
# An unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: default
|
||||
|
||||
# The minimum number of workers nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of workers nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
# node.
|
||||
max_workers: 2
|
||||
|
||||
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
||||
|
@ -57,38 +53,66 @@ auth:
|
|||
# configurations below.
|
||||
# ssh_private_key: /path/to/your/key.pem
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
head_node:
|
||||
InstanceType: m5.large
|
||||
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
|
||||
# Tell the autoscaler the allowed node types and the resources they provide.
|
||||
# The key is the name of the node type, which is just for debugging purposes.
|
||||
# The node config specifies the launch config and physical instance type.
|
||||
available_node_types:
|
||||
ray.head.default:
|
||||
# The minimum number of worker nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The maximum number of worker nodes of this type to launch.
|
||||
# This takes precedence over min_workers.
|
||||
max_workers: 0
|
||||
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
|
||||
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
|
||||
# You can also set custom resources.
|
||||
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
|
||||
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
|
||||
resources: {}
|
||||
# Provider-specific config for this node type, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
node_config:
|
||||
InstanceType: m5.large
|
||||
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
|
||||
# You can provision additional disk space with a conf as follows
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 100
|
||||
# Additional options in the boto docs.
|
||||
ray.worker.default:
|
||||
# The minimum number of worker nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The maximum number of worker nodes of this type to launch.
|
||||
# This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
|
||||
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
|
||||
# You can also set custom resources.
|
||||
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
|
||||
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
|
||||
resources: {}
|
||||
# Provider-specific config for this node type, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
node_config:
|
||||
InstanceType: m5.large
|
||||
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
|
||||
# Run workers on spot by default. Comment this out to use on-demand.
|
||||
InstanceMarketOptions:
|
||||
MarketType: spot
|
||||
# Additional options can be found in the boto docs, e.g.
|
||||
# SpotOptions:
|
||||
# MaxPrice: MAX_HOURLY_PRICE
|
||||
# Additional options in the boto docs.
|
||||
|
||||
# You can provision additional disk space with a conf as follows
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 100
|
||||
|
||||
# Additional options in the boto docs.
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
worker_nodes:
|
||||
InstanceType: m5.large
|
||||
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
|
||||
|
||||
# Run workers on spot by default. Comment this out to use on-demand.
|
||||
InstanceMarketOptions:
|
||||
MarketType: spot
|
||||
# Additional options can be found in the boto docs, e.g.
|
||||
# SpotOptions:
|
||||
# MaxPrice: MAX_HOURLY_PRICE
|
||||
|
||||
# Additional options in the boto docs.
|
||||
# Specify the node type of the head node (as configured above).
|
||||
head_node_type: ray.head.default
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
|
@ -146,3 +170,6 @@ head_start_ray_commands:
|
|||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
||||
|
||||
head_node: {}
|
||||
worker_nodes: {}
|
||||
|
|
|
@ -1,12 +1,8 @@
|
|||
# An unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: gpu-docker
|
||||
|
||||
# The minimum number of workers nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of workers nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
# node.
|
||||
max_workers: 2
|
||||
|
||||
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
||||
|
@ -23,10 +19,6 @@ docker:
|
|||
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
||||
container_name: "ray_nvidia_docker" # e.g. ray_docker
|
||||
|
||||
# # Example of running a GPU head with CPU workers
|
||||
# head_image: "rayproject/ray-ml:latest-gpu"
|
||||
|
||||
# worker_image: "rayproject/ray-ml:latest"
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
|
@ -48,38 +40,74 @@ auth:
|
|||
# configurations below.
|
||||
# ssh_private_key: /path/to/your/key.pem
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
head_node:
|
||||
InstanceType: p2.xlarge
|
||||
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
|
||||
# Tell the autoscaler the allowed node types and the resources they provide.
|
||||
# The key is the name of the node type, which is just for debugging purposes.
|
||||
# The node config specifies the launch config and physical instance type.
|
||||
available_node_types:
|
||||
# GPU head node.
|
||||
ray.head.gpu:
|
||||
# worker_image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
||||
# The minimum number of worker nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The maximum number of worker nodes of this type to launch.
|
||||
# This takes precedence over min_workers.
|
||||
max_workers: 0
|
||||
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
|
||||
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
|
||||
# You can also set custom resources.
|
||||
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
|
||||
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
|
||||
resources: {}
|
||||
# Provider-specific config for this node type, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
node_config:
|
||||
InstanceType: p2.xlarge
|
||||
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
|
||||
# You can provision additional disk space with a conf as follows
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 100
|
||||
# Additional options in the boto docs.
|
||||
# CPU workers.
|
||||
ray.worker.default:
|
||||
# Override global docker setting.
|
||||
# This node type will run a CPU image,
|
||||
# rather than the GPU image specified in the global docker settings.
|
||||
docker:
|
||||
worker_image: "rayproject/ray-ml:latest-cpu"
|
||||
# The minimum number of nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 1
|
||||
# The maximum number of worker nodes of this type to launch.
|
||||
# This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
|
||||
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
|
||||
# You can also set custom resources.
|
||||
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
|
||||
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
|
||||
resources: {}
|
||||
# Provider-specific config for this node type, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
node_config:
|
||||
InstanceType: m5.large
|
||||
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
|
||||
# Run workers on spot by default. Comment this out to use on-demand.
|
||||
InstanceMarketOptions:
|
||||
MarketType: spot
|
||||
# Additional options can be found in the boto docs, e.g.
|
||||
# SpotOptions:
|
||||
# MaxPrice: MAX_HOURLY_PRICE
|
||||
# Additional options in the boto docs.
|
||||
|
||||
# You can provision additional disk space with a conf as follows
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 100
|
||||
|
||||
# Additional options in the boto docs.
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
worker_nodes:
|
||||
InstanceType: m5.large
|
||||
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
|
||||
|
||||
# Run workers on spot by default. Comment this out to use on-demand.
|
||||
InstanceMarketOptions:
|
||||
MarketType: spot
|
||||
# Additional options can be found in the boto docs, e.g.
|
||||
# SpotOptions:
|
||||
# MaxPrice: MAX_HOURLY_PRICE
|
||||
|
||||
# Additional options in the boto docs.
|
||||
# Specify the node type of the head node (as configured above).
|
||||
head_node_type: ray.head.gpu
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
cluster_name: minimal
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers. min_workers defaults to 0.
|
||||
# node. min_workers defaults to 0.
|
||||
max_workers: 1
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
|
|
|
@ -1,12 +1,8 @@
|
|||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: default
|
||||
|
||||
# The minimum number of worker nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
# node.
|
||||
max_workers: 2
|
||||
|
||||
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
||||
|
@ -17,7 +13,7 @@ upscaling_speed: 1.0
|
|||
|
||||
# This executes all commands on all nodes in the docker container,
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
# Empty object means disabled.
|
||||
docker: {}
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
|
@ -46,30 +42,52 @@ auth:
|
|||
# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
|
||||
# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type.
|
||||
head_node:
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_D2s_v3
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.07.06
|
||||
# Tell the autoscaler the allowed node types and the resources they provide.
|
||||
# The key is the name of the node type, which is just for debugging purposes.
|
||||
# The node config specifies the launch config and physical instance type.
|
||||
available_node_types:
|
||||
ray.head.default:
|
||||
# The minimum number of worker nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The maximum number of worker nodes of this type to launch.
|
||||
# This takes precedence over min_workers.
|
||||
max_workers: 0
|
||||
# The resources provided by this node type.
|
||||
resources: {"CPU": 2}
|
||||
# Provider-specific config, e.g. instance type.
|
||||
node_config:
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_D2s_v3
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.07.06
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type.
|
||||
worker_nodes:
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_D2s_v3
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.07.06
|
||||
# optionally set priority to use Spot instances
|
||||
priority: Spot
|
||||
# set a maximum price for spot instances if desired
|
||||
# billingProfile:
|
||||
# maxPrice: -1
|
||||
ray.worker.default:
|
||||
# The minimum number of nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The resources provided by this node type.
|
||||
resources: {"CPU": 2}
|
||||
# Provider-specific config, e.g. instance type.
|
||||
node_config:
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_D2s_v3
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.07.06
|
||||
# optionally set priority to use Spot instances
|
||||
priority: Spot
|
||||
# set a maximum price for spot instances if desired
|
||||
# billingProfile:
|
||||
# maxPrice: -1
|
||||
|
||||
# Specify the node type of the head node (as configured above).
|
||||
head_node_type: ray.head.default
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
|
@ -134,3 +152,6 @@ head_start_ray_commands:
|
|||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
||||
|
||||
head_node: {}
|
||||
worker_nodes: {}
|
||||
|
|
|
@ -19,18 +19,20 @@ upscaling_speed: 1.0
|
|||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
docker:
|
||||
image: "rayproject/ray-ml:latest-gpu"
|
||||
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
|
||||
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
||||
container_name: "ray_docker"
|
||||
container_name: "ray_container"
|
||||
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
|
||||
# if no cached version is present.
|
||||
pull_before_run: False
|
||||
pull_before_run: True
|
||||
run_options: [] # Extra options to pass into "docker run"
|
||||
|
||||
# Example of running a GPU head with CPU workers
|
||||
# head_image: "rayproject/ray-ml:latest-gpu"
|
||||
# Allow Ray to automatically detect GPUs
|
||||
|
||||
# worker_image: "rayproject/ray-ml:latest"
|
||||
# worker_image: "rayproject/ray-ml:latest-cpu"
|
||||
# worker_run_options: []
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
|
@ -42,7 +44,7 @@ provider:
|
|||
location: westus2
|
||||
resource_group: ray-cluster
|
||||
# set subscription id otherwise the default from az cli will be used
|
||||
# subscription_id: 00000000-0000-0000-0000-000000000000
|
||||
# subscription_id: 00000000-0000-0000-0000-000000000000
|
||||
|
||||
# How Ray will authenticate with newly launched nodes.
|
||||
auth:
|
||||
|
@ -53,27 +55,35 @@ auth:
|
|||
# changes to this should match what is specified in file_mounts
|
||||
ssh_public_key: ~/.ssh/id_rsa.pub
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields using defaults.yaml
|
||||
# More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
|
||||
# See documentation here: https://docs.microsoft.com/en-us/azure/templates/microsoft.compute/2019-03-01/virtualmachines
|
||||
# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
|
||||
# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type.
|
||||
head_node:
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_NC6
|
||||
vmSize: Standard_D2s_v3
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: "1804"
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.07.06
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields using defaults.yaml
|
||||
# Provider-specific config for worker nodes, e.g. instance type.
|
||||
worker_nodes:
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_NC6
|
||||
vmSize: Standard_D2s_v3
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: "1804"
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.07.06
|
||||
# optionally set priority to use Spot instances
|
||||
priority: Spot
|
||||
# set a maximum price for spot instances if desired
|
||||
# billingProfile:
|
||||
# maxPrice: -1
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
|
@ -83,6 +93,27 @@ file_mounts: {
|
|||
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
|
||||
}
|
||||
|
||||
# Files or directories to copy from the head node to the worker nodes. The format is a
|
||||
# list of paths. The same path on the head node will be copied to the worker node.
|
||||
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
|
||||
# you should just use file_mounts. Only use this if you know what you're doing!
|
||||
cluster_synced_files: []
|
||||
|
||||
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
|
||||
# should sync to the worker node continuously
|
||||
file_mounts_sync_continuously: False
|
||||
|
||||
# Patterns for files to exclude when running rsync up or rsync down
|
||||
rsync_exclude:
|
||||
- "**/.git"
|
||||
- "**/.git/**"
|
||||
|
||||
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
|
||||
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
|
||||
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
|
||||
rsync_filter:
|
||||
- ".gitignore"
|
||||
|
||||
# List of commands that will be run before `setup_commands`. If docker is
|
||||
# enabled, these commands will run outside the container and before docker
|
||||
# is setup.
|
||||
|
@ -92,20 +123,16 @@ initialization_commands:
|
|||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands:
|
||||
# Note: if you're developing Ray, you probably want to create an AMI that
|
||||
# Note: if you're developing Ray, you probably want to create a Docker image that
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
|
||||
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
|
||||
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
|
||||
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
# Consider uncommenting these if you also want to run apt-get commands during setup
|
||||
# - sudo pkill -9 apt-get || true
|
||||
# - sudo pkill -9 dpkg || true
|
||||
# - sudo dpkg --configure -a
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
head_setup_commands:
|
||||
- pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
|
@ -1,12 +1,8 @@
|
|||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: default
|
||||
|
||||
# The minimum number of worker nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
# node.
|
||||
max_workers: 2
|
||||
|
||||
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
||||
|
@ -17,7 +13,7 @@ upscaling_speed: 1.0
|
|||
|
||||
# This executes all commands on all nodes in the docker container,
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
# Empty object means disabled.
|
||||
docker:
|
||||
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
|
||||
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
||||
|
@ -60,30 +56,55 @@ auth:
|
|||
# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
|
||||
# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type.
|
||||
head_node:
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_D2s_v3
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.07.06
|
||||
# Tell the autoscaler the allowed node types and the resources they provide.
|
||||
# The key is the name of the node type, which is just for debugging purposes.
|
||||
# The node config specifies the launch config and physical instance type.
|
||||
available_node_types:
|
||||
ray.head.default:
|
||||
# The minimum number of worker nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The maximum number of worker nodes of this type to launch.
|
||||
# This takes precedence over min_workers.
|
||||
max_workers: 0
|
||||
# The resources provided by this node type.
|
||||
resources: {"CPU": 2}
|
||||
# Provider-specific config, e.g. instance type.
|
||||
node_config:
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_D2s_v3
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.07.06
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type.
|
||||
worker_nodes:
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_D2s_v3
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.07.06
|
||||
# optionally set priority to use Spot instances
|
||||
priority: Spot
|
||||
# set a maximum price for spot instances if desired
|
||||
# billingProfile:
|
||||
# maxPrice: -1
|
||||
ray.worker.default:
|
||||
# The minimum number of worker nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The maximum number of worker nodes of this type to launch.
|
||||
# This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
# The resources provided by this node type.
|
||||
resources: {"CPU": 2}
|
||||
# Provider-specific config, e.g. instance type.
|
||||
node_config:
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_D2s_v3
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.07.06
|
||||
# optionally set priority to use Spot instances
|
||||
priority: Spot
|
||||
# set a maximum price for spot instances if desired
|
||||
# billingProfile:
|
||||
# maxPrice: -1
|
||||
|
||||
# Specify the node type of the head node (as configured above).
|
||||
head_node_type: ray.head.default
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
|
@ -147,3 +168,6 @@ head_start_ray_commands:
|
|||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
||||
|
||||
head_node: {}
|
||||
worker_nodes: {}
|
||||
|
|
|
@ -1,12 +1,8 @@
|
|||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: gpu-docker
|
||||
|
||||
# The minimum number of worker nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
# node.
|
||||
max_workers: 2
|
||||
|
||||
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
||||
|
@ -21,7 +17,7 @@ upscaling_speed: 1.0
|
|||
docker:
|
||||
image: "rayproject/ray-ml:latest-gpu"
|
||||
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
||||
container_name: "ray_nvidia_docker" # e.g. ray_docker
|
||||
container_name: "ray_nvidia_docker"
|
||||
|
||||
# # Example of running a GPU head with CPU workers
|
||||
# head_image: "rayproject/ray-ml:latest-gpu"
|
||||
|
@ -45,17 +41,40 @@ auth:
|
|||
# changes to this should match what is specified in file_mounts
|
||||
ssh_public_key: ~/.ssh/id_rsa.pub
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields using defaults.yaml
|
||||
head_node:
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_NC6s_v3
|
||||
# Tell the autoscaler the allowed node types and the resources they provide.
|
||||
# The key is the name of the node type, which is just for debugging purposes.
|
||||
# The node config specifies the launch config and physical instance type.
|
||||
available_node_types:
|
||||
ray.head.gpu:
|
||||
# The minimum number of worker nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The maximum number of worker nodes of this type to launch.
|
||||
# This takes precedence over min_workers.
|
||||
max_workers: 0
|
||||
# The resources provided by this node type.
|
||||
resources: {"CPU": 6, "GPU": 1}
|
||||
# Provider-specific config, e.g. instance type.
|
||||
node_config:
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_NC6s_v3
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields using defaults.yaml
|
||||
worker_nodes:
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_NC6s_v3
|
||||
ray.worker.gpu:
|
||||
# The minimum number of nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The maximum number of worker nodes of this type to launch.
|
||||
# This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
# The resources provided by this node type.
|
||||
resources: {"CPU": 6, "GPU": 1}
|
||||
# Provider-specific config, e.g. instance type.
|
||||
node_config:
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_NC6s_v3
|
||||
|
||||
# Specify the node type of the head node (as configured above).
|
||||
head_node_type: ray.head.gpu
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
|
@ -69,7 +88,7 @@ file_mounts: {
|
|||
# NOTE: rayproject/ray-ml:latest has ray latest bundled
|
||||
setup_commands: []
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
- pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
cluster_name: minimal
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers. min_workers defaults to 0.
|
||||
# node. min_workers defaults to 0.
|
||||
max_workers: 1
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
|
|
|
@ -1,12 +1,8 @@
|
|||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: default
|
||||
|
||||
# The minimum number of worker nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
# node.
|
||||
max_workers: 2
|
||||
|
||||
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
||||
|
@ -39,50 +35,75 @@ auth:
|
|||
# project wide meta-data.
|
||||
# ssh_private_key: /path/to/your/key.pem
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
|
||||
# For more documentation on available fields, see:
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
head_node:
|
||||
machineType: n1-standard-2
|
||||
disks:
|
||||
- boot: true
|
||||
autoDelete: true
|
||||
type: PERSISTENT
|
||||
initializeParams:
|
||||
diskSizeGb: 50
|
||||
# See https://cloud.google.com/compute/docs/images for more images
|
||||
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
|
||||
# Tell the autoscaler the allowed node types and the resources they provide.
|
||||
# The key is the name of the node type, which is just for debugging purposes.
|
||||
# The node config specifies the launch config and physical instance type.
|
||||
available_node_types:
|
||||
ray_head_default:
|
||||
# The minimum number of worker nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The maximum number of worker nodes of this type to launch.
|
||||
# This takes precedence over min_workers.
|
||||
max_workers: 0
|
||||
# The resources provided by this node type.
|
||||
resources: {"CPU": 2}
|
||||
# Provider-specific config for this node type, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
|
||||
# For more documentation on available fields, see:
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
node_config:
|
||||
machineType: n1-standard-2
|
||||
disks:
|
||||
- boot: true
|
||||
autoDelete: true
|
||||
type: PERSISTENT
|
||||
initializeParams:
|
||||
diskSizeGb: 50
|
||||
# See https://cloud.google.com/compute/docs/images for more images
|
||||
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
|
||||
|
||||
# Additional options can be found in the compute docs at
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
|
||||
# If the network interface is specified as below in both head and worker
|
||||
# nodes, the manual network config is used. Otherwise an existing subnet is
|
||||
# used. To use a shared subnet, ask the subnet owner to grant permission
|
||||
# for 'compute.subnetworks.use' to the ray autoscaler account...
|
||||
# networkInterfaces:
|
||||
# - kind: compute#networkInterface
|
||||
# subnetwork: path/to/subnet
|
||||
# aliasIpRanges: []
|
||||
ray_worker_small:
|
||||
# The minimum number of nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The resources provided by this node type.
|
||||
resources: {"CPU": 2}
|
||||
# Provider-specific config for this node type, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
|
||||
# For more documentation on available fields, see:
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
node_config:
|
||||
machineType: n1-standard-2
|
||||
disks:
|
||||
- boot: true
|
||||
autoDelete: true
|
||||
type: PERSISTENT
|
||||
initializeParams:
|
||||
diskSizeGb: 50
|
||||
# See https://cloud.google.com/compute/docs/images for more images
|
||||
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
|
||||
# Run workers on preemptible instances by default.
|
||||
# Comment this out to use on-demand.
|
||||
scheduling:
|
||||
- preemptible: true
|
||||
|
||||
# Additional options can be found in the compute docs at
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
|
||||
# If the network interface is specified as below in both head and worker
|
||||
# nodes, the manual network config is used. Otherwise an existing subnet is
|
||||
# used. To use a shared subnet, ask the subnet owner to grant permission
|
||||
# for 'compute.subnetworks.use' to the ray autoscaler account...
|
||||
# networkInterfaces:
|
||||
# - kind: compute#networkInterface
|
||||
# subnetwork: path/to/subnet
|
||||
# aliasIpRanges: []
|
||||
|
||||
worker_nodes:
|
||||
machineType: n1-standard-2
|
||||
disks:
|
||||
- boot: true
|
||||
autoDelete: true
|
||||
type: PERSISTENT
|
||||
initializeParams:
|
||||
diskSizeGb: 50
|
||||
# See https://cloud.google.com/compute/docs/images for more images
|
||||
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
|
||||
# Run workers on preemptible instances by default.
|
||||
# Comment this out to use on-demand.
|
||||
scheduling:
|
||||
- preemptible: true
|
||||
|
||||
# Additional options can be found in the compute docs at
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
# Specify the node type of the head node (as configured above).
|
||||
head_node_type: ray_head_default
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
|
@ -159,3 +180,6 @@ worker_start_ray_commands:
|
|||
ray start
|
||||
--address=$RAY_HEAD_IP:6379
|
||||
--object-manager-port=8076
|
||||
|
||||
head_node: {}
|
||||
worker_nodes: {}
|
||||
|
|
167
python/ray/autoscaler/gcp/example-full-legacy.yaml
Normal file
167
python/ray/autoscaler/gcp/example-full-legacy.yaml
Normal file
|
@ -0,0 +1,167 @@
|
|||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: default
|
||||
|
||||
# The minimum number of worker nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
|
||||
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
||||
# E.g., if the task requires adding more nodes then autoscaler will gradually
|
||||
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
|
||||
# This number should be > 0.
|
||||
upscaling_speed: 1.0
|
||||
|
||||
# This executes all commands on all nodes in the docker container,
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
docker:
|
||||
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
|
||||
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
||||
container_name: "ray_container"
|
||||
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
|
||||
# if no cached version is present.
|
||||
pull_before_run: True
|
||||
run_options: [] # Extra options to pass into "docker run"
|
||||
|
||||
# Example of running a GPU head with CPU workers
|
||||
# head_image: "rayproject/ray-ml:latest-gpu"
|
||||
# Allow Ray to automatically detect GPUs
|
||||
|
||||
# worker_image: "rayproject/ray-ml:latest-cpu"
|
||||
# worker_run_options: []
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: gcp
|
||||
region: us-west1
|
||||
availability_zone: us-west1-a
|
||||
project_id: null # Globally unique project id
|
||||
|
||||
# How Ray will authenticate with newly launched nodes.
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
# By default Ray creates a new private keypair, but you can also use your own.
|
||||
# If you do so, make sure to also set "KeyName" in the head and worker node
|
||||
# configurations below. This requires that you have added the key into the
|
||||
# project wide meta-data.
|
||||
# ssh_private_key: /path/to/your/key.pem
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
|
||||
# For more documentation on available fields, see:
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
head_node:
|
||||
machineType: n1-standard-2
|
||||
disks:
|
||||
- boot: true
|
||||
autoDelete: true
|
||||
type: PERSISTENT
|
||||
initializeParams:
|
||||
diskSizeGb: 50
|
||||
# See https://cloud.google.com/compute/docs/images for more images
|
||||
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
|
||||
|
||||
# Additional options can be found in the compute docs at
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
|
||||
# If the network interface is specified as below in both head and worker
|
||||
# nodes, the manual network config is used. Otherwise an existing subnet is
|
||||
# used. To use a shared subnet, ask the subnet owner to grant permission
|
||||
# for 'compute.subnetworks.use' to the ray autoscaler account...
|
||||
# networkInterfaces:
|
||||
# - kind: compute#networkInterface
|
||||
# subnetwork: path/to/subnet
|
||||
# aliasIpRanges: []
|
||||
|
||||
worker_nodes:
|
||||
machineType: n1-standard-2
|
||||
disks:
|
||||
- boot: true
|
||||
autoDelete: true
|
||||
type: PERSISTENT
|
||||
initializeParams:
|
||||
diskSizeGb: 50
|
||||
# See https://cloud.google.com/compute/docs/images for more images
|
||||
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
|
||||
# Run workers on preemptible instances by default.
|
||||
# Comment this out to use on-demand.
|
||||
scheduling:
|
||||
- preemptible: true
|
||||
|
||||
# Additional options can be found in the compute docs at
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
}
|
||||
|
||||
# Files or directories to copy from the head node to the worker nodes. The format is a
|
||||
# list of paths. The same path on the head node will be copied to the worker node.
|
||||
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
|
||||
# you should just use file_mounts. Only use this if you know what you're doing!
|
||||
cluster_synced_files: []
|
||||
|
||||
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
|
||||
# should sync to the worker node continuously
|
||||
file_mounts_sync_continuously: False
|
||||
|
||||
# Patterns for files to exclude when running rsync up or rsync down
|
||||
rsync_exclude:
|
||||
- "**/.git"
|
||||
- "**/.git/**"
|
||||
|
||||
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
|
||||
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
|
||||
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
|
||||
rsync_filter:
|
||||
- ".gitignore"
|
||||
|
||||
# List of commands that will be run before `setup_commands`. If docker is
|
||||
# enabled, these commands will run outside the container and before docker
|
||||
# is setup.
|
||||
initialization_commands: []
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands: []
|
||||
# Note: if you're developing Ray, you probably want to create a Docker image that
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
- pip install google-api-python-client==1.7.8
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
||||
# Command to start ray on the head node. You don't need to change this.
|
||||
head_start_ray_commands:
|
||||
- ray stop
|
||||
- >-
|
||||
ulimit -n 65536;
|
||||
ray start
|
||||
--head
|
||||
--port=6379
|
||||
--object-manager-port=8076
|
||||
--autoscaling-config=~/ray_bootstrap_config.yaml
|
||||
# Command to start ray on worker nodes. You don't need to change this.
|
||||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- >-
|
||||
ulimit -n 65536;
|
||||
ray start
|
||||
--address=$RAY_HEAD_IP:6379
|
||||
--object-manager-port=8076
|
|
@ -1,12 +1,8 @@
|
|||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: default
|
||||
|
||||
# The minimum number of worker nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
# node.
|
||||
max_workers: 2
|
||||
|
||||
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
||||
|
@ -53,50 +49,78 @@ auth:
|
|||
# project wide meta-data.
|
||||
# ssh_private_key: /path/to/your/key.pem
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
|
||||
# For more documentation on available fields, see:
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
head_node:
|
||||
machineType: n1-standard-2
|
||||
disks:
|
||||
- boot: true
|
||||
autoDelete: true
|
||||
type: PERSISTENT
|
||||
initializeParams:
|
||||
diskSizeGb: 50
|
||||
# See https://cloud.google.com/compute/docs/images for more images
|
||||
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
|
||||
# Tell the autoscaler the allowed node types and the resources they provide.
|
||||
# The key is the name of the node type, which is just for debugging purposes.
|
||||
# The node config specifies the launch config and physical instance type.
|
||||
available_node_types:
|
||||
ray_head_default:
|
||||
# The minimum number of worker nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The maximum number of worker nodes of this type to launch.
|
||||
# This takes precedence over min_workers.
|
||||
max_workers: 0
|
||||
# The resources provided by this node type.
|
||||
resources: {"CPU": 2}
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
|
||||
# For more documentation on available fields, see:
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
node_config:
|
||||
machineType: n1-standard-2
|
||||
disks:
|
||||
- boot: true
|
||||
autoDelete: true
|
||||
type: PERSISTENT
|
||||
initializeParams:
|
||||
diskSizeGb: 50
|
||||
# See https://cloud.google.com/compute/docs/images for more images
|
||||
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
|
||||
|
||||
# Additional options can be found in the compute docs at
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
|
||||
# If the network interface is specified as below in both head and worker
|
||||
# nodes, the manual network config is used. Otherwise an existing subnet is
|
||||
# used. To use a shared subnet, ask the subnet owner to grant permission
|
||||
# for 'compute.subnetworks.use' to the ray autoscaler account...
|
||||
# networkInterfaces:
|
||||
# - kind: compute#networkInterface
|
||||
# subnetwork: path/to/subnet
|
||||
# aliasIpRanges: []
|
||||
ray_worker_small:
|
||||
# The minimum number of worker nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The maximum number of worker nodes of this type to launch.
|
||||
# This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
# The resources provided by this node type.
|
||||
resources: {"CPU": 2}
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
|
||||
# For more documentation on available fields, see:
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
node_config:
|
||||
machineType: n1-standard-2
|
||||
disks:
|
||||
- boot: true
|
||||
autoDelete: true
|
||||
type: PERSISTENT
|
||||
initializeParams:
|
||||
diskSizeGb: 50
|
||||
# See https://cloud.google.com/compute/docs/images for more images
|
||||
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
|
||||
# Run workers on preemptible instances by default.
|
||||
# Comment this out to use on-demand.
|
||||
scheduling:
|
||||
- preemptible: true
|
||||
|
||||
# Additional options can be found in the compute docs at
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
|
||||
# If the network interface is specified as below in both head and worker
|
||||
# nodes, the manual network config is used. Otherwise an existing subnet is
|
||||
# used. To use a shared subnet, ask the subnet owner to grant permission
|
||||
# for 'compute.subnetworks.use' to the ray autoscaler account...
|
||||
# networkInterfaces:
|
||||
# - kind: compute#networkInterface
|
||||
# subnetwork: path/to/subnet
|
||||
# aliasIpRanges: []
|
||||
|
||||
worker_nodes:
|
||||
machineType: n1-standard-2
|
||||
disks:
|
||||
- boot: true
|
||||
autoDelete: true
|
||||
type: PERSISTENT
|
||||
initializeParams:
|
||||
diskSizeGb: 50
|
||||
# See https://cloud.google.com/compute/docs/images for more images
|
||||
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
|
||||
# Run workers on preemptible instances by default.
|
||||
# Comment this out to use on-demand.
|
||||
scheduling:
|
||||
- preemptible: true
|
||||
|
||||
# Additional options can be found in the compute docs at
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
# Specify the node type of the head node (as configured above).
|
||||
head_node_type: ray_head_default
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
|
@ -166,3 +190,6 @@ worker_start_ray_commands:
|
|||
ray start
|
||||
--address=$RAY_HEAD_IP:6379
|
||||
--object-manager-port=8076
|
||||
|
||||
head_node: {}
|
||||
worker_nodes: {}
|
||||
|
|
|
@ -1,12 +1,8 @@
|
|||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: gpu-docker
|
||||
|
||||
# The minimum number of worker nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
# node.
|
||||
max_workers: 2
|
||||
|
||||
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
||||
|
@ -48,58 +44,81 @@ auth:
|
|||
# project wide meta-data.
|
||||
# ssh_private_key: /path/to/your/key.pem
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
|
||||
# For more documentation on available fields, see:
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
head_node:
|
||||
machineType: custom-6-16384
|
||||
disks:
|
||||
- boot: true
|
||||
autoDelete: true
|
||||
type: PERSISTENT
|
||||
initializeParams:
|
||||
diskSizeGb: 50
|
||||
# See https://cloud.google.com/compute/docs/images for more images
|
||||
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
|
||||
guestAccelerators:
|
||||
- acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
|
||||
acceleratorCount: 1
|
||||
metadata:
|
||||
items:
|
||||
- key: install-nvidia-driver
|
||||
value: "True"
|
||||
scheduling:
|
||||
- onHostMaintenance: TERMINATE
|
||||
# Tell the autoscaler the allowed node types and the resources they provide.
|
||||
# The key is the name of the node type, which is just for debugging purposes.
|
||||
# The node config specifies the launch config and physical instance type.
|
||||
available_node_types:
|
||||
ray_head_gpu:
|
||||
# The minimum number of worker nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The maximum number of worker nodes of this type to launch.
|
||||
# This takes precedence over min_workers.
|
||||
max_workers: 0
|
||||
# The resources provided by this node type.
|
||||
resources: {"CPU": 6, "GPU": 1}
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
|
||||
# For more documentation on available fields, see:
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
node_config:
|
||||
machineType: custom-6-16384
|
||||
disks:
|
||||
- boot: true
|
||||
autoDelete: true
|
||||
type: PERSISTENT
|
||||
initializeParams:
|
||||
diskSizeGb: 50
|
||||
# See https://cloud.google.com/compute/docs/images for more images
|
||||
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
|
||||
guestAccelerators:
|
||||
- acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
|
||||
acceleratorCount: 1
|
||||
metadata:
|
||||
items:
|
||||
- key: install-nvidia-driver
|
||||
value: "True"
|
||||
scheduling:
|
||||
- onHostMaintenance: TERMINATE
|
||||
|
||||
# Additional options can be found in the compute docs at
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
ray_worker_gpu:
|
||||
# The minimum number of nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The maximum number of worker nodes of this type to launch.
|
||||
# This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
# The resources provided by this node type.
|
||||
resources: {"CPU": 2, "GPU": 1}
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
|
||||
# For more documentation on available fields, see:
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
node_config:
|
||||
machineType: n1-standard-2
|
||||
disks:
|
||||
- boot: true
|
||||
autoDelete: true
|
||||
type: PERSISTENT
|
||||
initializeParams:
|
||||
diskSizeGb: 50
|
||||
# See https://cloud.google.com/compute/docs/images for more images
|
||||
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
|
||||
guestAccelerators:
|
||||
- acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
|
||||
acceleratorCount: 1
|
||||
metadata:
|
||||
items:
|
||||
- key: install-nvidia-driver
|
||||
value: "True"
|
||||
# Run workers on preemptible instances by default.
|
||||
# Comment this out to use on-demand.
|
||||
scheduling:
|
||||
- preemptible: true
|
||||
- onHostMaintenance: TERMINATE
|
||||
|
||||
worker_nodes:
|
||||
machineType: n1-standard-2
|
||||
disks:
|
||||
- boot: true
|
||||
autoDelete: true
|
||||
type: PERSISTENT
|
||||
initializeParams:
|
||||
diskSizeGb: 50
|
||||
# See https://cloud.google.com/compute/docs/images for more images
|
||||
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
|
||||
guestAccelerators:
|
||||
- acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
|
||||
acceleratorCount: 1
|
||||
metadata:
|
||||
items:
|
||||
- key: install-nvidia-driver
|
||||
value: "True"
|
||||
# Run workers on preemptible instances by default.
|
||||
# Comment this out to use on-demand.
|
||||
scheduling:
|
||||
- preemptible: true
|
||||
- onHostMaintenance: TERMINATE
|
||||
|
||||
# Additional options can be found in in the compute docs at
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
# Specify the node type of the head node (as configured above).
|
||||
head_node_type: ray_head_gpu
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
cluster_name: minimal
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers. min_workers default to 0.
|
||||
# node. min_workers defaults to 0.
|
||||
max_workers: 1
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
|
|
|
@ -96,8 +96,6 @@ available_node_types:
|
|||
worker_node:
|
||||
# Minimum number of Ray workers of this Pod type.
|
||||
min_workers: 0
|
||||
# Maximum number of Ray workers of this Pod type. Takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
node_config:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
|
@ -136,6 +134,12 @@ available_node_types:
|
|||
# cause problems for other pods.
|
||||
memory: 512Mi
|
||||
head_node:
|
||||
# The minimum number of worker nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The maximum number of worker nodes of this type to launch.
|
||||
# This takes precedence over min_workers.
|
||||
max_workers: 0
|
||||
node_config:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
|
|
|
@ -139,6 +139,12 @@ available_node_types:
|
|||
# cause problems for other pods.
|
||||
memory: 512Mi
|
||||
head_node:
|
||||
# The minimum number of worker nodes of this type to launch.
|
||||
# This number should be >= 0.
|
||||
min_workers: 0
|
||||
# The maximum number of worker nodes of this type to launch.
|
||||
# This takes precedence over min_workers.
|
||||
max_workers: 0
|
||||
node_config:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
|
|
|
@ -17,6 +17,10 @@ spec:
|
|||
# Specify the allowed pod types for this ray cluster and the resources they provide.
|
||||
podTypes:
|
||||
- name: head-node
|
||||
# Minimum number of Ray workers of this Pod type.
|
||||
minWorkers: 0
|
||||
# Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
|
||||
maxWorkers: 0
|
||||
podConfig:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
|
|
|
@ -17,6 +17,10 @@ spec:
|
|||
# Specify the allowed pod types for this ray cluster and the resources they provide.
|
||||
podTypes:
|
||||
- name: head-node
|
||||
# Minimum number of Ray workers of this Pod type.
|
||||
minWorkers: 0
|
||||
# Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
|
||||
maxWorkers: 0
|
||||
podConfig:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
|
|
|
@ -1,16 +1,8 @@
|
|||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: default
|
||||
|
||||
## NOTE: Typically for local clusters, min_workers == max_workers == len(worker_ips).
|
||||
## NOTE: Typically for local clusters, max_workers == len(worker_ips).
|
||||
|
||||
# The minimum number of worker nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
# Typically, min_workers == max_workers == len(worker_ips).
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head node.
|
||||
# This takes precedence over min_workers.
|
||||
# Typically, min_workers == max_workers == len(worker_ips).
|
||||
max_workers: 0
|
||||
|
||||
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
||||
|
@ -42,11 +34,20 @@ auth:
|
|||
# Optional if an ssh private key is necessary to ssh to the cluster.
|
||||
# ssh_private_key: ~/.ssh/id_rsa
|
||||
|
||||
# Leave this empty.
|
||||
head_node: {}
|
||||
|
||||
# Leave this empty.
|
||||
worker_nodes: {}
|
||||
available_node_types:
|
||||
ray.head.default:
|
||||
resources: {}
|
||||
min_workers: 0
|
||||
max_workers: 0
|
||||
# Leave this empty
|
||||
node_config: {}
|
||||
ray.worker.default:
|
||||
resources: {}
|
||||
## NOTE: Typically for local clusters, max_workers == len(worker_ips).
|
||||
min_workers: 0
|
||||
# Leave this empty
|
||||
node_config: {}
|
||||
head_node_type: ray.head.default
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
|
@ -97,3 +98,6 @@ head_start_ray_commands:
|
|||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ray start --address=$RAY_HEAD_IP:6379
|
||||
|
||||
head_node: {}
|
||||
worker_nodes: {}
|
||||
|
|
|
@ -2,10 +2,6 @@
|
|||
# A namespace will be automatically created for each cluster_name in SKE.
|
||||
cluster_name: default
|
||||
|
||||
# The minimum number of worker nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
|
@ -85,174 +81,184 @@ provider:
|
|||
# Exposing external IP addresses for ray pods isn't currently supported.
|
||||
use_internal_ips: true
|
||||
|
||||
# Kubernetes pod config for the head node pod.
|
||||
head_node:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
# Automatically generates a name for the pod with this prefix.
|
||||
generateName: ray-head-
|
||||
head_node_type: ray.head.default
|
||||
|
||||
# Must match the head node service selector above if a head node
|
||||
# service is required.
|
||||
labels:
|
||||
component: ray-head
|
||||
available_node_types:
|
||||
ray.head.default:
|
||||
resources: {"CPU": 1}
|
||||
min_workers: 0
|
||||
max_workers: 0
|
||||
# Kubernetes pod config for the head node pod.
|
||||
node_config:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
# Automatically generates a name for the pod with this prefix.
|
||||
generateName: ray-head-
|
||||
|
||||
# https://docs.staroid.com/ske/pod.html#pod
|
||||
pod.staroid.com/spot: "false" # use on-demand instance for head.
|
||||
# Must match the head node service selector above if a head node
|
||||
# service is required.
|
||||
labels:
|
||||
component: ray-head
|
||||
|
||||
# Uncomment to locate ray head to dedicated Kubernetes node
|
||||
# (GPU instance is only available for 'dedicated' isolation)
|
||||
#pod.staroid.com/isolation: dedicated
|
||||
#pod.staroid.com/instance-type: gpu-1
|
||||
spec:
|
||||
automountServiceAccountToken: true
|
||||
# https://docs.staroid.com/ske/pod.html#pod
|
||||
pod.staroid.com/spot: "false" # use on-demand instance for head.
|
||||
|
||||
# Restarting the head node automatically is not currently supported.
|
||||
# If the head node goes down, `ray up` must be run again.
|
||||
restartPolicy: Never
|
||||
# Uncomment to locate ray head to dedicated Kubernetes node
|
||||
# (GPU instance is only available for 'dedicated' isolation)
|
||||
#pod.staroid.com/isolation: dedicated
|
||||
#pod.staroid.com/instance-type: gpu-1
|
||||
spec:
|
||||
automountServiceAccountToken: true
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp which causes slowdowns if it is not a shared memory volume.
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
# nfs volume provides a shared volume across all ray-nodes.
|
||||
- name: nfs-volume
|
||||
persistentVolumeClaim:
|
||||
claimName: nfs
|
||||
# Restarting the head node automatically is not currently supported.
|
||||
# If the head node goes down, `ray up` must be run again.
|
||||
restartPolicy: Never
|
||||
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: Always
|
||||
# You are free (and encouraged) to use your own container image,
|
||||
# but it should have the following installed:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
# - screen (used for `ray attach`)
|
||||
# - kubectl (used by the autoscaler to manage worker pods)
|
||||
# Image will be overridden when 'image_from_project' is true.
|
||||
image: rayproject/ray
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
|
||||
ports:
|
||||
- containerPort: 6379 # Redis port.
|
||||
- containerPort: 6380 # Redis port.
|
||||
- containerPort: 6381 # Redis port.
|
||||
- containerPort: 12345 # Ray internal communication.
|
||||
- containerPort: 12346 # Ray internal communication.
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp which causes slowdowns if it is not a shared memory volume.
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
# nfs volume provides a shared volume across all ray-nodes.
|
||||
- name: nfs-volume
|
||||
persistentVolumeClaim:
|
||||
claimName: nfs
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp which cause slowdowns if is not a shared memory volume.
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /nfs
|
||||
name: nfs-volume
|
||||
resources:
|
||||
requests:
|
||||
cpu: 1000m
|
||||
memory: 2Gi
|
||||
limits:
|
||||
# The maximum memory that this pod is allowed to use. The
|
||||
# limit will be detected by ray and split to use 10% for
|
||||
# redis, 30% for the shared memory object store, and the
|
||||
# rest for application memory. If this limit is not set and
|
||||
# the object store size is not set manually, ray will
|
||||
# allocate a very large object store in each pod that may
|
||||
# cause problems for other pods.
|
||||
memory: 2Gi
|
||||
env:
|
||||
# This is used in the head_start_ray_commands below so that
|
||||
# Ray can spawn the correct number of processes. Omitting this
|
||||
# may lead to degraded performance.
|
||||
- name: MY_CPU_REQUEST
|
||||
valueFrom:
|
||||
resourceFieldRef:
|
||||
resource: requests.cpu
|
||||
- name: RAY_ADDRESS
|
||||
value: "auto"
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: Always
|
||||
# You are free (and encouraged) to use your own container image,
|
||||
# but it should have the following installed:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
# - screen (used for `ray attach`)
|
||||
# - kubectl (used by the autoscaler to manage worker pods)
|
||||
# Image will be overridden when 'image_from_project' is true.
|
||||
image: rayproject/ray
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
|
||||
ports:
|
||||
- containerPort: 6379 # Redis port.
|
||||
- containerPort: 6380 # Redis port.
|
||||
- containerPort: 6381 # Redis port.
|
||||
- containerPort: 12345 # Ray internal communication.
|
||||
- containerPort: 12346 # Ray internal communication.
|
||||
|
||||
# Kubernetes pod config for worker node pods.
|
||||
worker_nodes:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
# Automatically generates a name for the pod with this prefix.
|
||||
generateName: ray-worker-
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp which causes slowdowns if it is not a shared memory volume.
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /nfs
|
||||
name: nfs-volume
|
||||
resources:
|
||||
requests:
|
||||
cpu: 1000m
|
||||
memory: 2Gi
|
||||
limits:
|
||||
# The maximum memory that this pod is allowed to use. The
|
||||
# limit will be detected by ray and split to use 10% for
|
||||
# redis, 30% for the shared memory object store, and the
|
||||
# rest for application memory. If this limit is not set and
|
||||
# the object store size is not set manually, ray will
|
||||
# allocate a very large object store in each pod that may
|
||||
# cause problems for other pods.
|
||||
memory: 2Gi
|
||||
env:
|
||||
# This is used in the head_start_ray_commands below so that
|
||||
# Ray can spawn the correct number of processes. Omitting this
|
||||
# may lead to degraded performance.
|
||||
- name: MY_CPU_REQUEST
|
||||
valueFrom:
|
||||
resourceFieldRef:
|
||||
resource: requests.cpu
|
||||
- name: RAY_ADDRESS
|
||||
value: "auto"
|
||||
|
||||
# Must match the worker node service selector above if a worker node
|
||||
# service is required.
|
||||
labels:
|
||||
component: ray-worker
|
||||
ray.worker.default:
|
||||
min_workers: 0
|
||||
resources: {"CPU": 1}
|
||||
# Kubernetes pod config for worker node pods.
|
||||
node_config:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
# Automatically generates a name for the pod with this prefix.
|
||||
generateName: ray-worker-
|
||||
|
||||
# https://docs.staroid.com/ske/pod.html#pod
|
||||
pod.staroid.com/spot: "true" # use spot instance for workers.
|
||||
# Must match the worker node service selector above if a worker node
|
||||
# service is required.
|
||||
labels:
|
||||
component: ray-worker
|
||||
|
||||
# Uncomment to locate ray workers on a dedicated Kubernetes node
|
||||
# (GPU instance is only available for 'dedicated' isolation)
|
||||
#pod.staroid.com/isolation: dedicated
|
||||
#pod.staroid.com/instance-type: gpu-1
|
||||
spec:
|
||||
serviceAccountName: default
|
||||
# https://docs.staroid.com/ske/pod.html#pod
|
||||
pod.staroid.com/spot: "true" # use spot instance for workers.
|
||||
|
||||
# Worker nodes will be managed automatically by the head node, so
|
||||
# do not change the restart policy.
|
||||
restartPolicy: Never
|
||||
# Uncomment to locate ray head to dedicated Kubernetes node
|
||||
# (GPU instance is only available for 'dedicated' isolation)
|
||||
#pod.staroid.com/isolation: dedicated
|
||||
#pod.staroid.com/instance-type: gpu-1
|
||||
spec:
|
||||
serviceAccountName: default
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp which cause slowdowns if is not a shared memory volume.
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
- name: nfs-volume
|
||||
persistentVolumeClaim:
|
||||
claimName: nfs
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: Always
|
||||
# You are free (and encouraged) to use your own container image,
|
||||
# but it should have the following installed:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
image: rayproject/autoscaler
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
|
||||
ports:
|
||||
- containerPort: 12345 # Ray internal communication.
|
||||
- containerPort: 12346 # Ray internal communication.
|
||||
# Worker nodes will be managed automatically by the head node, so
|
||||
# do not change the restart policy.
|
||||
restartPolicy: Never
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp which cause slowdowns if is not a shared memory volume.
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /nfs
|
||||
name: nfs-volume
|
||||
resources:
|
||||
requests:
|
||||
cpu: 1000m
|
||||
memory: 2Gi
|
||||
limits:
|
||||
# This memory limit will be detected by ray and split into
|
||||
# 30% for plasma, and 70% for workers.
|
||||
memory: 2Gi
|
||||
env:
|
||||
# This is used in the head_start_ray_commands below so that
|
||||
# Ray can spawn the correct number of processes. Omitting this
|
||||
# may lead to degraded performance.
|
||||
- name: MY_CPU_REQUEST
|
||||
valueFrom:
|
||||
resourceFieldRef:
|
||||
resource: requests.cpu
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp which cause slowdowns if is not a shared memory volume.
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
- name: nfs-volume
|
||||
persistentVolumeClaim:
|
||||
claimName: nfs
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: Always
|
||||
# You are free (and encouraged) to use your own container image,
|
||||
# but it should have the following installed:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
image: rayproject/autoscaler
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
|
||||
ports:
|
||||
- containerPort: 12345 # Ray internal communication.
|
||||
- containerPort: 12346 # Ray internal communication.
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp which cause slowdowns if is not a shared memory volume.
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /nfs
|
||||
name: nfs-volume
|
||||
resources:
|
||||
requests:
|
||||
cpu: 1000m
|
||||
memory: 2Gi
|
||||
limits:
|
||||
# This memory limit will be detected by ray and split into
|
||||
# 30% for plasma, and 70% for workers.
|
||||
memory: 2Gi
|
||||
env:
|
||||
# This is used in the head_start_ray_commands below so that
|
||||
# Ray can spawn the correct number of processes. Omitting this
|
||||
# may lead to degraded performance.
|
||||
- name: MY_CPU_REQUEST
|
||||
valueFrom:
|
||||
resourceFieldRef:
|
||||
resource: requests.cpu
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
|
@ -307,3 +313,6 @@ head_start_ray_commands:
|
|||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
||||
|
||||
head_node: {}
|
||||
worker_nodes: {}
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
import pytest
|
||||
|
||||
from ray.autoscaler._private.aws.config import _get_vpc_id_or_die, \
|
||||
bootstrap_aws, \
|
||||
DEFAULT_AMI
|
||||
bootstrap_aws, \
|
||||
DEFAULT_AMI
|
||||
import ray.tests.aws.utils.stubs as stubs
|
||||
import ray.tests.aws.utils.helpers as helpers
|
||||
from ray.tests.aws.utils.constants import AUX_SUBNET, DEFAULT_SUBNET, \
|
||||
|
@ -143,8 +143,10 @@ def test_fills_out_amis(iam_client_stub, ec2_client_stub):
|
|||
stubs.configure_subnet_default(ec2_client_stub)
|
||||
|
||||
config = helpers.load_aws_example_config_file("example-full.yaml")
|
||||
del config["head_node"]["ImageId"]
|
||||
del config["worker_nodes"]["ImageId"]
|
||||
del config["available_node_types"]["ray.head.default"]["node_config"][
|
||||
"ImageId"]
|
||||
del config["available_node_types"]["ray.worker.default"]["node_config"][
|
||||
"ImageId"]
|
||||
|
||||
# Pass in SG for stub to work
|
||||
config["head_node"]["SecurityGroupIds"] = ["sg-1234abcd"]
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import json
|
||||
import jsonschema
|
||||
import os
|
||||
import shutil
|
||||
from subprocess import CalledProcessError
|
||||
|
@ -264,6 +265,55 @@ SMALL_CLUSTER = {
|
|||
"worker_start_ray_commands": ["start_ray_worker"],
|
||||
}
|
||||
|
||||
MOCK_DEFAULT_CONFIG = {
|
||||
"cluster_name": "default",
|
||||
"max_workers": 2,
|
||||
"upscaling_speed": 1.0,
|
||||
"idle_timeout_minutes": 5,
|
||||
"provider": {
|
||||
"type": "mock",
|
||||
"region": "us-east-1",
|
||||
"availability_zone": "us-east-1a",
|
||||
},
|
||||
"docker": {
|
||||
"image": "example",
|
||||
"container_name": "mock",
|
||||
},
|
||||
"auth": {
|
||||
"ssh_user": "ubuntu",
|
||||
"ssh_private_key": os.devnull,
|
||||
},
|
||||
"available_node_types": {
|
||||
"ray.head.default": {
|
||||
"min_workers": 0,
|
||||
"max_workers": 0,
|
||||
"resources": {},
|
||||
"node_config": {
|
||||
"head_default_prop": 4
|
||||
}
|
||||
},
|
||||
"ray.worker.default": {
|
||||
"min_workers": 0,
|
||||
"max_workers": 2,
|
||||
"resources": {},
|
||||
"node_config": {
|
||||
"worker_default_prop": 7
|
||||
}
|
||||
}
|
||||
},
|
||||
"head_node_type": "ray.head.default",
|
||||
"head_node": {},
|
||||
"worker_nodes": {},
|
||||
"file_mounts": {},
|
||||
"cluster_synced_files": [],
|
||||
"initialization_commands": [],
|
||||
"setup_commands": [],
|
||||
"head_setup_commands": [],
|
||||
"worker_setup_commands": [],
|
||||
"head_start_ray_commands": [],
|
||||
"worker_start_ray_commands": [],
|
||||
}
|
||||
|
||||
|
||||
class LoadMetricsTest(unittest.TestCase):
|
||||
def testHeartbeat(self):
|
||||
|
@ -1645,6 +1695,28 @@ class AutoscalingTest(unittest.TestCase):
|
|||
config_path, LoadMetrics(), max_failures=0, update_interval_s=0)
|
||||
assert isinstance(autoscaler.provider, NodeProvider)
|
||||
|
||||
def testLegacyExternalNodeScalerMissingFields(self):
|
||||
"""Should fail to validate legacy external config with missing
|
||||
head_node, worker_nodes, or both."""
|
||||
external_config = copy.deepcopy(SMALL_CLUSTER)
|
||||
external_config["provider"] = {
|
||||
"type": "external",
|
||||
"module": "ray.autoscaler.node_provider.NodeProvider",
|
||||
}
|
||||
|
||||
missing_workers, missing_head, missing_both = [
|
||||
copy.deepcopy(external_config) for _ in range(3)
|
||||
]
|
||||
del missing_workers["worker_nodes"]
|
||||
del missing_head["head_node"]
|
||||
del missing_both["worker_nodes"]
|
||||
del missing_both["head_node"]
|
||||
|
||||
for faulty_config in missing_workers, missing_head, missing_both:
|
||||
faulty_config = prepare_config(faulty_config)
|
||||
with pytest.raises(jsonschema.ValidationError):
|
||||
validate_config(faulty_config)
|
||||
|
||||
def testExternalNodeScalerWrongImport(self):
|
||||
config = SMALL_CLUSTER.copy()
|
||||
config["provider"] = {
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import jsonschema
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
|
@ -9,10 +10,12 @@ import copy
|
|||
from unittest.mock import MagicMock, Mock, patch
|
||||
import pytest
|
||||
|
||||
from ray.autoscaler._private.util import prepare_config, validate_config
|
||||
from ray.autoscaler._private.util import prepare_config, validate_config,\
|
||||
_get_default_config, merge_setup_commands
|
||||
from ray.autoscaler._private.providers import _NODE_PROVIDERS
|
||||
from ray.autoscaler._private.kubernetes.node_provider import\
|
||||
KubernetesNodeProvider
|
||||
from ray.autoscaler.tags import NODE_TYPE_LEGACY_HEAD, NODE_TYPE_LEGACY_WORKER
|
||||
|
||||
from ray.test_utils import load_test_config, recursive_fnmatch
|
||||
|
||||
|
@ -37,18 +40,19 @@ CONFIG_PATHS = ignore_k8s_operator_configs(CONFIG_PATHS)
|
|||
class AutoscalingConfigTest(unittest.TestCase):
|
||||
def testValidateDefaultConfig(self):
|
||||
for config_path in CONFIG_PATHS:
|
||||
if "aws/example-multi-node-type.yaml" in config_path:
|
||||
# aws is tested in testValidateDefaultConfigAWSMultiNodeTypes.
|
||||
continue
|
||||
with open(config_path) as f:
|
||||
config = yaml.safe_load(f)
|
||||
config = prepare_config(config)
|
||||
if config["provider"]["type"] == "kubernetes":
|
||||
KubernetesNodeProvider.fillout_available_node_types_resources(
|
||||
config)
|
||||
try:
|
||||
if "aws/example-multi-node-type.yaml" in config_path:
|
||||
# aws tested in testValidateDefaultConfigAWSMultiNodeTypes.
|
||||
continue
|
||||
with open(config_path) as f:
|
||||
config = yaml.safe_load(f)
|
||||
config = prepare_config(config)
|
||||
if config["provider"]["type"] == "kubernetes":
|
||||
KubernetesNodeProvider.\
|
||||
fillout_available_node_types_resources(config)
|
||||
validate_config(config)
|
||||
except Exception:
|
||||
logging.exception("")
|
||||
self.fail(
|
||||
f"Config {config_path} did not pass validation test!")
|
||||
|
||||
|
@ -232,7 +236,6 @@ class AutoscalingConfigTest(unittest.TestCase):
|
|||
self.fail("Failed to validate config with security group name!")
|
||||
|
||||
def testMaxWorkerDefault(self):
|
||||
|
||||
# Load config, call prepare config, check that default max_workers
|
||||
# is filled correctly for node types that don't specify it.
|
||||
# Check that max_workers is untouched for node types
|
||||
|
@ -254,7 +257,7 @@ class AutoscalingConfigTest(unittest.TestCase):
|
|||
# Max workers auto-filled with specified cluster-wide value of 5.
|
||||
assert config["max_workers"] ==\
|
||||
prepared_node_types["worker_node_max_unspecified"]["max_workers"]\
|
||||
== config["max_workers"] == 5
|
||||
== 5
|
||||
|
||||
# Repeat with a config that doesn't specify global max workers.
|
||||
# Default value of 2 should be pulled in for global max workers.
|
||||
|
@ -275,8 +278,87 @@ class AutoscalingConfigTest(unittest.TestCase):
|
|||
prepared_node_types["worker_node_max_specified"][
|
||||
"max_workers"] == 3
|
||||
# Max workers auto-filled with default cluster-wide value of 2.
|
||||
assert prepared_node_types["worker_node_max_unspecified"][
|
||||
"max_workers"] == 2
|
||||
assert prepared_config["max_workers"] ==\
|
||||
prepared_node_types["worker_node_max_unspecified"]["max_workers"]\
|
||||
== 2
|
||||
|
||||
def testFillEdgeLegacyConfigs(self):
|
||||
# Test edge cases: legacy configs which specify workers but not head
|
||||
# or vice-versa.
|
||||
no_head = load_test_config("test_no_head.yaml")
|
||||
aws_defaults = _get_default_config(no_head["provider"])
|
||||
head_prepared = prepare_config(no_head)
|
||||
assert head_prepared["available_node_types"][
|
||||
"ray-legacy-head-node-type"]["node_config"] ==\
|
||||
aws_defaults["available_node_types"][
|
||||
"ray.head.default"]["node_config"]
|
||||
assert head_prepared["head_node"] == {}
|
||||
# Custom worker config preserved
|
||||
node_types = head_prepared["available_node_types"]
|
||||
worker_type = node_types["ray-legacy-worker-node-type"]
|
||||
assert worker_type["node_config"] == head_prepared["worker_nodes"] == {
|
||||
"foo": "bar"
|
||||
}
|
||||
|
||||
no_workers = load_test_config("test_no_workers.yaml")
|
||||
workers_prepared = prepare_config(no_workers)
|
||||
assert workers_prepared["available_node_types"][
|
||||
"ray-legacy-worker-node-type"]["node_config"] ==\
|
||||
aws_defaults["available_node_types"][
|
||||
"ray.worker.default"]["node_config"]
|
||||
assert workers_prepared["worker_nodes"] == {}
|
||||
# Custom head config preserved
|
||||
node_types = workers_prepared["available_node_types"]
|
||||
head_type = node_types["ray-legacy-head-node-type"]
|
||||
assert head_type["node_config"] == workers_prepared["head_node"] == {
|
||||
"baz": "qux"
|
||||
}
|
||||
|
||||
@pytest.mark.skipif(
|
||||
sys.platform.startswith("win"), reason="Fails on Windows.")
|
||||
def testExampleFull(self):
|
||||
"""
|
||||
Test that example-full yamls are unmodified by prepared_config,
|
||||
except possibly by having setup_commands merged.
|
||||
"""
|
||||
providers = ["aws", "gcp", "azure"]
|
||||
for provider in providers:
|
||||
path = os.path.join(RAY_PATH, "autoscaler", provider,
|
||||
"example-full.yaml")
|
||||
config = yaml.safe_load(open(path).read())
|
||||
config_copy = copy.deepcopy(config)
|
||||
merge_setup_commands(config_copy)
|
||||
assert config_copy == prepare_config(config)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
sys.platform.startswith("win"), reason="Fails on Windows.")
|
||||
def testLegacyYaml(self):
|
||||
# Test correct default-merging behavior for legacy yamls.
|
||||
providers = ["aws", "gcp", "azure"]
|
||||
for provider in providers:
|
||||
path = os.path.join(RAY_PATH, "autoscaler", provider,
|
||||
"example-full-legacy.yaml")
|
||||
legacy_config = yaml.safe_load(open(path).read())
|
||||
# custom head and workers
|
||||
legacy_config["head_node"] = {"blahblah": 0}
|
||||
legacy_config["worker_nodes"] = {"halbhalhb": 0}
|
||||
legacy_config_copy = copy.deepcopy(legacy_config)
|
||||
prepared_legacy = prepare_config(legacy_config_copy)
|
||||
assert prepared_legacy["available_node_types"][
|
||||
NODE_TYPE_LEGACY_HEAD]["max_workers"] == 0
|
||||
assert prepared_legacy["available_node_types"][
|
||||
NODE_TYPE_LEGACY_HEAD]["min_workers"] == 0
|
||||
assert prepared_legacy["available_node_types"][
|
||||
NODE_TYPE_LEGACY_HEAD]["node_config"] == legacy_config[
|
||||
"head_node"]
|
||||
|
||||
assert prepared_legacy["available_node_types"][
|
||||
NODE_TYPE_LEGACY_WORKER]["max_workers"] == 2
|
||||
assert prepared_legacy["available_node_types"][
|
||||
NODE_TYPE_LEGACY_WORKER]["min_workers"] == 0
|
||||
assert prepared_legacy["available_node_types"][
|
||||
NODE_TYPE_LEGACY_WORKER]["node_config"] == legacy_config[
|
||||
"worker_nodes"]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
123
python/ray/tests/test_cli_patterns/test_no_head.yaml
Normal file
123
python/ray/tests/test_cli_patterns/test_no_head.yaml
Normal file
|
@ -0,0 +1,123 @@
|
|||
# An unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: default
|
||||
|
||||
# The minimum number of workers nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of workers nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
|
||||
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
||||
# E.g., if the task requires adding more nodes then autoscaler will gradually
|
||||
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
|
||||
# This number should be > 0.
|
||||
upscaling_speed: 1.0
|
||||
|
||||
# This executes all commands on all nodes in the docker container,
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
docker:
|
||||
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
|
||||
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
||||
container_name: "ray_container"
|
||||
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
|
||||
# if no cached version is present.
|
||||
pull_before_run: True
|
||||
run_options: [] # Extra options to pass into "docker run"
|
||||
|
||||
# Example of running a GPU head with CPU workers
|
||||
# head_image: "rayproject/ray-ml:latest-gpu"
|
||||
# Allow Ray to automatically detect GPUs
|
||||
|
||||
# worker_image: "rayproject/ray-ml:latest-cpu"
|
||||
# worker_run_options: []
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: aws
|
||||
region: us-west-2
|
||||
# Availability zone(s), comma-separated, that nodes may be launched in.
|
||||
# Nodes are currently spread between zones by a round-robin approach,
|
||||
# however this implementation detail should not be relied upon.
|
||||
availability_zone: us-west-2a,us-west-2b
|
||||
# Whether to allow node reuse. If set to False, nodes will be terminated
|
||||
# instead of stopped.
|
||||
cache_stopped_nodes: True # If not present, the default is True.
|
||||
|
||||
# How Ray will authenticate with newly launched nodes.
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
# By default Ray creates a new private keypair, but you can also use your own.
|
||||
# If you do so, make sure to also set "KeyName" in the head and worker node
|
||||
# configurations below.
|
||||
# ssh_private_key: /path/to/your/key.pem
|
||||
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
worker_nodes:
|
||||
foo: bar
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
}
|
||||
|
||||
# Files or directories to copy from the head node to the worker nodes. The format is a
|
||||
# list of paths. The same path on the head node will be copied to the worker node.
|
||||
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
|
||||
# you should just use file_mounts. Only use this if you know what you're doing!
|
||||
cluster_synced_files: []
|
||||
|
||||
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
|
||||
# should sync to the worker node continuously
|
||||
file_mounts_sync_continuously: False
|
||||
|
||||
# Patterns for files to exclude when running rsync up or rsync down
|
||||
rsync_exclude:
|
||||
- "**/.git"
|
||||
- "**/.git/**"
|
||||
|
||||
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
|
||||
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
|
||||
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
|
||||
rsync_filter:
|
||||
- ".gitignore"
|
||||
|
||||
# List of commands that will be run before `setup_commands`. If docker is
|
||||
# enabled, these commands will run outside the container and before docker
|
||||
# is setup.
|
||||
initialization_commands: []
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands: []
|
||||
# Note: if you're developing Ray, you probably want to create a Docker image that
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands: []
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
||||
# Command to start ray on the head node. You don't need to change this.
|
||||
head_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
|
||||
|
||||
# Command to start ray on worker nodes. You don't need to change this.
|
||||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
124
python/ray/tests/test_cli_patterns/test_no_workers.yaml
Normal file
124
python/ray/tests/test_cli_patterns/test_no_workers.yaml
Normal file
|
@ -0,0 +1,124 @@
|
|||
|
||||
# An unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: default
|
||||
|
||||
# The minimum number of workers nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of workers nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
|
||||
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
||||
# E.g., if the task requires adding more nodes then autoscaler will gradually
|
||||
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
|
||||
# This number should be > 0.
|
||||
upscaling_speed: 1.0
|
||||
|
||||
# This executes all commands on all nodes in the docker container,
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
docker:
|
||||
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
|
||||
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
||||
container_name: "ray_container"
|
||||
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
|
||||
# if no cached version is present.
|
||||
pull_before_run: True
|
||||
run_options: [] # Extra options to pass into "docker run"
|
||||
|
||||
# Example of running a GPU head with CPU workers
|
||||
# head_image: "rayproject/ray-ml:latest-gpu"
|
||||
# Allow Ray to automatically detect GPUs
|
||||
|
||||
# worker_image: "rayproject/ray-ml:latest-cpu"
|
||||
# worker_run_options: []
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: aws
|
||||
region: us-west-2
|
||||
# Availability zone(s), comma-separated, that nodes may be launched in.
|
||||
# Nodes are currently spread between zones by a round-robin approach,
|
||||
# however this implementation detail should not be relied upon.
|
||||
availability_zone: us-west-2a,us-west-2b
|
||||
# Whether to allow node reuse. If set to False, nodes will be terminated
|
||||
# instead of stopped.
|
||||
cache_stopped_nodes: True # If not present, the default is True.
|
||||
|
||||
# How Ray will authenticate with newly launched nodes.
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
# By default Ray creates a new private keypair, but you can also use your own.
|
||||
# If you do so, make sure to also set "KeyName" in the head and worker node
|
||||
# configurations below.
|
||||
# ssh_private_key: /path/to/your/key.pem
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
head_node:
|
||||
baz: qux
|
||||
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
}
|
||||
|
||||
# Files or directories to copy from the head node to the worker nodes. The format is a
|
||||
# list of paths. The same path on the head node will be copied to the worker node.
|
||||
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
|
||||
# you should just use file_mounts. Only use this if you know what you're doing!
|
||||
cluster_synced_files: []
|
||||
|
||||
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
|
||||
# should sync to the worker node continuously
|
||||
file_mounts_sync_continuously: False
|
||||
|
||||
# Patterns for files to exclude when running rsync up or rsync down
|
||||
rsync_exclude:
|
||||
- "**/.git"
|
||||
- "**/.git/**"
|
||||
|
||||
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
|
||||
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
|
||||
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
|
||||
rsync_filter:
|
||||
- ".gitignore"
|
||||
|
||||
# List of commands that will be run before `setup_commands`. If docker is
|
||||
# enabled, these commands will run outside the container and before docker
|
||||
# is setup.
|
||||
initialization_commands: []
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands: []
|
||||
# Note: if you're developing Ray, you probably want to create a Docker image that
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands: []
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
||||
# Command to start ray on the head node. You don't need to change this.
|
||||
head_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
|
||||
|
||||
# Command to start ray on worker nodes. You don't need to change this.
|
||||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
|
@ -195,4 +195,4 @@ class KubernetesOperatorTest(unittest.TestCase):
|
|||
|
||||
if __name__ == "__main__":
|
||||
kubernetes.config.load_kube_config()
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
sys.exit(pytest.main(["-sv", __file__]))
|
||||
|
|
|
@ -5,15 +5,16 @@ import yaml
|
|||
import tempfile
|
||||
import shutil
|
||||
import unittest
|
||||
from unittest import mock
|
||||
import copy
|
||||
|
||||
import ray
|
||||
import ray.ray_constants
|
||||
from ray.autoscaler._private.util import \
|
||||
rewrite_legacy_yaml_to_available_node_types, format_info_string, \
|
||||
prepare_config, format_info_string, \
|
||||
format_info_string_no_node_types
|
||||
from ray.tests.test_autoscaler import SMALL_CLUSTER, MockProvider, \
|
||||
MockProcessRunner
|
||||
from ray.tests.test_autoscaler import SMALL_CLUSTER, MOCK_DEFAULT_CONFIG, \
|
||||
MockProvider, MockProcessRunner
|
||||
from ray.autoscaler._private.providers import (_NODE_PROVIDERS,
|
||||
_clear_provider_cache)
|
||||
from ray.autoscaler._private.autoscaler import StandardAutoscaler, \
|
||||
|
@ -38,6 +39,8 @@ from ray.autoscaler._private.constants import \
|
|||
|
||||
from time import sleep
|
||||
|
||||
GET_DEFAULT_METHOD = "ray.autoscaler._private.util._get_default_config"
|
||||
|
||||
TYPES_A = {
|
||||
"empty_node": {
|
||||
"node_config": {
|
||||
|
@ -1042,131 +1045,135 @@ def test_get_nodes_to_launch_max_launch_concurrency():
|
|||
|
||||
|
||||
def test_rewrite_legacy_yaml_to_available_node_types():
|
||||
cluster_config = copy.deepcopy(SMALL_CLUSTER) # Legacy cluster_config.
|
||||
cluster_config = rewrite_legacy_yaml_to_available_node_types(
|
||||
cluster_config)
|
||||
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
|
||||
"max_workers"] == 0
|
||||
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
|
||||
"min_workers"] == 0
|
||||
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
|
||||
"node_config"] == SMALL_CLUSTER["head_node"]
|
||||
with mock.patch(GET_DEFAULT_METHOD, return_value=MOCK_DEFAULT_CONFIG):
|
||||
cluster_config = copy.deepcopy(SMALL_CLUSTER) # Legacy cluster_config.
|
||||
cluster_config = prepare_config(cluster_config)
|
||||
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
|
||||
"max_workers"] == 0
|
||||
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
|
||||
"min_workers"] == 0
|
||||
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
|
||||
"node_config"] == SMALL_CLUSTER["head_node"]
|
||||
|
||||
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
|
||||
"node_config"] == SMALL_CLUSTER["worker_nodes"]
|
||||
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
|
||||
"max_workers"] == SMALL_CLUSTER["max_workers"]
|
||||
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
|
||||
"min_workers"] == SMALL_CLUSTER["min_workers"]
|
||||
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
|
||||
"node_config"] == SMALL_CLUSTER["worker_nodes"]
|
||||
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
|
||||
"max_workers"] == SMALL_CLUSTER["max_workers"]
|
||||
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
|
||||
"min_workers"] == SMALL_CLUSTER["min_workers"]
|
||||
|
||||
|
||||
def test_handle_legacy_cluster_config_yaml():
|
||||
provider = MockProvider()
|
||||
head_resources = {"CPU": 8, "GPU": 1}
|
||||
worker_resources = {"CPU": 32, "GPU": 8}
|
||||
cluster_config = copy.deepcopy(SMALL_CLUSTER) # Legacy cluster_config.
|
||||
cluster_config = rewrite_legacy_yaml_to_available_node_types(
|
||||
cluster_config)
|
||||
scheduler = ResourceDemandScheduler(
|
||||
provider,
|
||||
cluster_config["available_node_types"],
|
||||
0,
|
||||
head_node_type=NODE_TYPE_LEGACY_HEAD)
|
||||
provider.create_node({}, {
|
||||
TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
|
||||
TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_HEAD
|
||||
}, 1)
|
||||
head_ip = provider.non_terminated_node_ips({})[0]
|
||||
head_node_id = provider.non_terminated_nodes({})[0]
|
||||
to_launch = scheduler.get_nodes_to_launch([], {}, [], {}, [],
|
||||
{head_ip: head_resources})
|
||||
assert to_launch == {} # Should always be empty with max_workers = 0.
|
||||
with mock.patch(GET_DEFAULT_METHOD, return_value=MOCK_DEFAULT_CONFIG):
|
||||
provider = MockProvider()
|
||||
head_resources = {"CPU": 8, "GPU": 1}
|
||||
worker_resources = {"CPU": 32, "GPU": 8}
|
||||
cluster_config = copy.deepcopy(SMALL_CLUSTER) # Legacy cluster_config.
|
||||
cluster_config = prepare_config(cluster_config)
|
||||
scheduler = ResourceDemandScheduler(
|
||||
provider,
|
||||
cluster_config["available_node_types"],
|
||||
0,
|
||||
head_node_type=NODE_TYPE_LEGACY_HEAD)
|
||||
provider.create_node({}, {
|
||||
TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
|
||||
TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_HEAD
|
||||
}, 1)
|
||||
head_ip = provider.non_terminated_node_ips({})[0]
|
||||
head_node_id = provider.non_terminated_nodes({})[0]
|
||||
to_launch = scheduler.get_nodes_to_launch([], {}, [], {}, [],
|
||||
{head_ip: head_resources})
|
||||
assert to_launch == {} # Should always be empty with max_workers = 0.
|
||||
|
||||
scheduler.max_workers = 30
|
||||
min_workers = scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"]
|
||||
scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"] = 0
|
||||
to_launch = scheduler.get_nodes_to_launch([head_node_id], {}, [], {}, [],
|
||||
{head_ip: head_resources})
|
||||
assert to_launch == {
|
||||
} # Since the resource demand does not require adding nodes.
|
||||
to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
|
||||
[head_resources], {}, [],
|
||||
{head_ip: head_resources})
|
||||
assert to_launch == {
|
||||
} # Since the resource demand does not require adding nodes.
|
||||
scheduler.max_workers = 30
|
||||
min_workers = scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
|
||||
"min_workers"]
|
||||
scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"] = 0
|
||||
to_launch = scheduler.get_nodes_to_launch(
|
||||
[head_node_id], {}, [], {}, [], {head_ip: head_resources})
|
||||
assert to_launch == {
|
||||
} # Since the resource demand does not require adding nodes.
|
||||
to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
|
||||
[head_resources], {}, [],
|
||||
{head_ip: head_resources})
|
||||
assert to_launch == {
|
||||
} # Since the resource demand does not require adding nodes.
|
||||
|
||||
scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"] = min_workers
|
||||
# Returns min_workers when min_workers>0.
|
||||
to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
|
||||
[head_resources], {}, [],
|
||||
{head_ip: head_resources})
|
||||
assert to_launch == {NODE_TYPE_LEGACY_WORKER: min_workers}
|
||||
scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
|
||||
"min_workers"] = min_workers
|
||||
# Returns min_workers when min_workers>0.
|
||||
to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
|
||||
[head_resources], {}, [],
|
||||
{head_ip: head_resources})
|
||||
assert to_launch == {NODE_TYPE_LEGACY_WORKER: min_workers}
|
||||
|
||||
provider.create_node({}, {
|
||||
TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
|
||||
TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED,
|
||||
TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_WORKER
|
||||
}, min_workers)
|
||||
nodes = provider.non_terminated_nodes({})
|
||||
to_launch = scheduler.get_nodes_to_launch(nodes, {}, [head_resources], {},
|
||||
[], {head_ip: head_resources})
|
||||
assert to_launch == {} # A node is running, at some point it'll connect.
|
||||
pending_launches = {NODE_TYPE_LEGACY_WORKER: 4}
|
||||
to_launch = scheduler.get_nodes_to_launch([], pending_launches,
|
||||
[head_resources], {}, [],
|
||||
{head_ip: head_resources})
|
||||
assert to_launch == {} # A node is launching, at some point it'll connect.
|
||||
provider.create_node({}, {
|
||||
TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
|
||||
TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED,
|
||||
TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_WORKER
|
||||
}, min_workers)
|
||||
nodes = provider.non_terminated_nodes({})
|
||||
to_launch = scheduler.get_nodes_to_launch(
|
||||
nodes, {}, [head_resources], {}, [], {head_ip: head_resources})
|
||||
# A node is running, at some point it'll connect.
|
||||
assert to_launch == {}
|
||||
pending_launches = {NODE_TYPE_LEGACY_WORKER: 4}
|
||||
to_launch = scheduler.get_nodes_to_launch([], pending_launches,
|
||||
[head_resources], {}, [],
|
||||
{head_ip: head_resources})
|
||||
# A node is launching, at some point it'll connect.
|
||||
assert to_launch == {}
|
||||
|
||||
# Now assume that we already launched/connected the nodes.
|
||||
ips = provider.non_terminated_node_ips({})
|
||||
lm = LoadMetrics()
|
||||
worker_ips = []
|
||||
for ip in ips:
|
||||
if ip == head_ip:
|
||||
lm.update(ip, head_resources, head_resources, {})
|
||||
else:
|
||||
lm.update(ip, worker_resources, worker_resources, {})
|
||||
worker_ips.append(ip)
|
||||
# Now assume that we already launched/connected the nodes.
|
||||
ips = provider.non_terminated_node_ips({})
|
||||
lm = LoadMetrics()
|
||||
worker_ips = []
|
||||
for ip in ips:
|
||||
if ip == head_ip:
|
||||
lm.update(ip, head_resources, head_resources, {})
|
||||
else:
|
||||
lm.update(ip, worker_resources, worker_resources, {})
|
||||
worker_ips.append(ip)
|
||||
|
||||
assert not scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["resources"]
|
||||
to_launch = scheduler.get_nodes_to_launch(
|
||||
nodes, {}, [], {}, [], lm.get_static_node_resources_by_ip())
|
||||
assert scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
|
||||
"resources"] == worker_resources
|
||||
assert to_launch == {}
|
||||
utilizations = {ip: worker_resources for ip in worker_ips}
|
||||
utilizations[head_ip] = head_resources
|
||||
# Requires 4 nodes since worker resources is bigger than head reasources.
|
||||
demands = [worker_resources] * (len(utilizations) + 3)
|
||||
to_launch = scheduler.get_nodes_to_launch(
|
||||
nodes, {}, demands, utilizations, [],
|
||||
lm.get_static_node_resources_by_ip())
|
||||
# 4 nodes are necessary to meet resource demand, but we never exceed
|
||||
# max_workers.
|
||||
assert to_launch == {}
|
||||
scheduler.max_workers = 10
|
||||
to_launch = scheduler.get_nodes_to_launch(
|
||||
nodes, {}, demands, utilizations, [],
|
||||
lm.get_static_node_resources_by_ip())
|
||||
# 4 nodes are necessary to meet resource demand, but we never exceed
|
||||
# max_workers.
|
||||
assert to_launch == {}
|
||||
scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["max_workers"] = 10
|
||||
to_launch = scheduler.get_nodes_to_launch(
|
||||
nodes, {}, demands, utilizations, [],
|
||||
lm.get_static_node_resources_by_ip())
|
||||
# 4 nodes are necessary to meet resource demand.
|
||||
assert to_launch == {NODE_TYPE_LEGACY_WORKER: 4}
|
||||
to_launch = scheduler.get_nodes_to_launch(nodes, pending_launches, demands,
|
||||
utilizations, [],
|
||||
lm.get_node_resources())
|
||||
# 0 because there are 4 pending launches and we only need 4.
|
||||
assert to_launch == {}
|
||||
to_launch = scheduler.get_nodes_to_launch(nodes, pending_launches,
|
||||
demands * 2, utilizations, [],
|
||||
lm.get_node_resources())
|
||||
# 1 because there are 4 pending launches and we only allow a max of 5.
|
||||
assert to_launch == {NODE_TYPE_LEGACY_WORKER: 1}
|
||||
assert not scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["resources"]
|
||||
to_launch = scheduler.get_nodes_to_launch(
|
||||
nodes, {}, [], {}, [], lm.get_static_node_resources_by_ip())
|
||||
assert scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
|
||||
"resources"] == worker_resources
|
||||
assert to_launch == {}
|
||||
utilizations = {ip: worker_resources for ip in worker_ips}
|
||||
utilizations[head_ip] = head_resources
|
||||
# Needs 4 nodes since worker resources is bigger than head reasources.
|
||||
demands = [worker_resources] * (len(utilizations) + 3)
|
||||
to_launch = scheduler.get_nodes_to_launch(
|
||||
nodes, {}, demands, utilizations, [],
|
||||
lm.get_static_node_resources_by_ip())
|
||||
# 4 nodes are necessary to meet resource demand, but we never exceed
|
||||
# max_workers.
|
||||
assert to_launch == {}
|
||||
scheduler.max_workers = 10
|
||||
to_launch = scheduler.get_nodes_to_launch(
|
||||
nodes, {}, demands, utilizations, [],
|
||||
lm.get_static_node_resources_by_ip())
|
||||
# 4 nodes are necessary to meet resource demand, but we never exceed
|
||||
# max_workers.
|
||||
assert to_launch == {}
|
||||
scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["max_workers"] = 10
|
||||
to_launch = scheduler.get_nodes_to_launch(
|
||||
nodes, {}, demands, utilizations, [],
|
||||
lm.get_static_node_resources_by_ip())
|
||||
# 4 nodes are necessary to meet resource demand.
|
||||
assert to_launch == {NODE_TYPE_LEGACY_WORKER: 4}
|
||||
to_launch = scheduler.get_nodes_to_launch(nodes, pending_launches,
|
||||
demands, utilizations, [],
|
||||
lm.get_node_resources())
|
||||
# 0 because there are 4 pending launches and we only need 4.
|
||||
assert to_launch == {}
|
||||
to_launch = scheduler.get_nodes_to_launch(
|
||||
nodes, pending_launches, demands * 2, utilizations, [],
|
||||
lm.get_node_resources())
|
||||
# 1 because there are 4 pending launches and we only allow a max of 5.
|
||||
assert to_launch == {NODE_TYPE_LEGACY_WORKER: 1}
|
||||
|
||||
|
||||
class LoadMetricsTest(unittest.TestCase):
|
||||
|
|
Loading…
Add table
Reference in a new issue