[autoscaler][interface] Use multi node types in defaults.yaml and example-full.yaml (#14239)

* random doc typo

* example-full-multi

* left off max workers

* wip

* address comments, modify defaults, wip

* fix

* wip

* reformat more things

* undo useless diff

* space

* max workers

* space

* copy-paste mishaps

* space

* More copy-paste mishaps

* copy-paste issues, space, max_workers

* head_node_type

* legacy yamls

* line undeleted

* correct-gpu

* Remove redundant GPU example.

* Extraneous comment

* whitespace

* example-java.yaml

* Revert "example-java.yaml"

This reverts commit 1e9c0124b9d97e651aaeeb6ec5bf7a4ef2a2df17.

* tests and other things

* doc

* doc

* revert max worker default

* Kubernetes comment

* wip

* wip

* tweak

* Address comments

* test_resource_demand_scheduler fixes

* Head type min/max workers, aws resources

* fix example_cluster2.yaml

* Fix external node type test (compatibility with legacy-style external node types)

* fix test_autoscaler_aws

* gcp-images

* gcp node type names

* fix gcp defaults

* doc format

* typo

* Skip failed Windows tests

* doc string and comment

* assert

* remove contents of default external head and worker

* legacy external failed validation test

* Readability -- define the minimal external config at the top of the file.

* Remove default worker type min worker

* Remove extraneous global min_workers comment.

* per-node-type docker in aws/example-gpu-docker

* ray.worker.small -> ray.worker.default

* fix-docker

* fix gpu docker again

* undo kubernetes experiment

* fix doc

* remove worker max_worker from kubernetes

* remove max_worker from local worker node type

* fix doc again

* py38

* eric-comment

* fix cluster name

* fix-test-autoscaler

* legacy config logic

* pop resources

* Remove min_workers AFTER merge

* comment, warning message

* warning, comment
Dmitri Gekhtman 2021-03-02 20:16:19 -08:00 committed by GitHub
parent ef873be9e8
commit 1675156a8b
32 changed files with 1774 additions and 715 deletions

View file

@@ -341,14 +341,13 @@ The key is the name of the node type, which is just for debugging purposes.
     resources: {"CPU": 2}
     min_workers: 0
     max_workers: 0
-  ray.worker.small:
+  ray.worker.default:
     node_config:
       InstanceType: m5.large
       InstanceMarketOptions:
         MarketType: spot
     resources: {"CPU": 2}
     min_workers: 0
-    max_workers: 1

 .. _cluster-configuration-head-node-type:

@@ -1073,12 +1072,12 @@ Minimal configuration
            :language: yaml

    .. group-tab:: Azure

        .. literalinclude:: ../../../python/ray/autoscaler/azure/example-minimal.yaml
            :language: yaml

    .. group-tab:: GCP

        .. literalinclude:: ../../../python/ray/autoscaler/gcp/example-minimal.yaml
            :language: yaml

@@ -1092,11 +1091,11 @@ Full configuration
            :language: yaml

    .. group-tab:: Azure

        .. literalinclude:: ../../../python/ray/autoscaler/azure/example-full.yaml
            :language: yaml

    .. group-tab:: GCP

        .. literalinclude:: ../../../python/ray/autoscaler/gcp/example-full.yaml
            :language: yaml
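For orientation while reading the diffs below, here is a minimal sketch, in Python, of the multi-node-type structure the updated docs describe; the node-type names and instance types are illustrative rather than the shipped defaults.

# Illustrative sketch only: the shape of a multi-node-type cluster config.
cluster_config = {
    "cluster_name": "example",
    "max_workers": 2,
    "available_node_types": {
        "ray.head.default": {
            "node_config": {"InstanceType": "m5.large"},
            "resources": {"CPU": 2},
            "min_workers": 0,
            "max_workers": 0,
        },
        "ray.worker.default": {
            "node_config": {
                "InstanceType": "m5.large",
                "InstanceMarketOptions": {"MarketType": "spot"},
            },
            "resources": {"CPU": 2},
            "min_workers": 0,
        },
    },
    # The head node must reference one of the node types defined above.
    "head_node_type": "ray.head.default",
}

assert cluster_config["head_node_type"] in cluster_config["available_node_types"]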

View file

@@ -71,8 +71,14 @@ def fillout_resources_kubernetes(config):
         return config
     node_types = copy.deepcopy(config["available_node_types"])
     for node_type in node_types:
-        container_data = node_types[node_type]["node_config"]["spec"][
-            "containers"][0]
+        node_config = node_types[node_type]["node_config"]
+        # The next line is for compatibility with configs like
+        # kubernetes/example-ingress.yaml,
+        # cf. KubernetesNodeProvider.create_node().
+        pod = node_config.get("pod", node_config)
+        container_data = pod["spec"]["containers"][0]
         autodetected_resources = get_autodetected_resources(container_data)
         if "resources" not in config["available_node_types"][node_type]:
             config["available_node_types"][node_type]["resources"] = {}

View file

@@ -1,3 +1,4 @@
+import copy
 import importlib
 import logging
 import json
@@ -11,6 +12,17 @@ logger = logging.getLogger(__name__)
 # For caching provider instantiations across API calls of one python session
 _provider_instances = {}

+# Minimal config for compatibility with legacy-style external configs.
+MINIMAL_EXTERNAL_CONFIG = {
+    "available_node_types": {
+        "ray.head.default": {},
+        "ray.worker.default": {},
+    },
+    "head_node_type": "ray.head.default",
+    "head_node": {},
+    "worker_nodes": {},
+}
+

 def _import_aws(provider_config):
     from ray.autoscaler._private.aws.node_provider import AWSNodeProvider
@@ -192,7 +204,7 @@ def _get_default_config(provider_config):
     package outside the autoscaler.
     """
     if provider_config["type"] == "external":
-        return {}
+        return copy.deepcopy(MINIMAL_EXTERNAL_CONFIG)
     load_config = _DEFAULT_CONFIGS.get(provider_config["type"])
     if load_config is None:
         raise NotImplementedError("Unsupported node provider: {}".format(
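A brief sketch of why the external-provider default is returned as a deep copy; the constant is reproduced from the diff above and the mutation below is hypothetical.

import copy

MINIMAL_EXTERNAL_CONFIG = {
    "available_node_types": {
        "ray.head.default": {},
        "ray.worker.default": {},
    },
    "head_node_type": "ray.head.default",
    "head_node": {},
    "worker_nodes": {},
}

def get_default_external_config():
    # Returning a deep copy means later merging/mutation of the defaults
    # (e.g. filling in node_config) cannot corrupt the module-level constant.
    return copy.deepcopy(MINIMAL_EXTERNAL_CONFIG)

cfg = get_default_external_config()
cfg["available_node_types"]["ray.head.default"]["node_config"] = {"kind": "head"}
assert MINIMAL_EXTERNAL_CONFIG["available_node_types"]["ray.head.default"] == {}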

View file

@@ -1,4 +1,5 @@
 import collections
+import copy
 from datetime import datetime
 import logging
 import hashlib
@@ -103,38 +104,91 @@ def prepare_config(config):
     return with_defaults


-def rewrite_legacy_yaml_to_available_node_types(
-        config: Dict[str, Any]) -> Dict[str, Any]:
-
-    if "available_node_types" not in config:
-        # TODO(ameer/ekl/alex): we can also rewrite here many other fields
-        # that include initialization/setup/start commands and ImageId.
-        logger.debug("Converting legacy cluster config to multi node types.")
-        config["available_node_types"] = {
-            NODE_TYPE_LEGACY_HEAD: {
-                "node_config": config["head_node"],
-                "resources": config["head_node"].get("resources") or {},
-                "min_workers": 0,
-                "max_workers": 0,
-            },
-            NODE_TYPE_LEGACY_WORKER: {
-                "node_config": config["worker_nodes"],
-                "resources": config["worker_nodes"].get("resources") or {},
-                "min_workers": config.get("min_workers", 0),
-                "max_workers": config.get("max_workers", 0),
-            },
-        }
-        config["head_node_type"] = NODE_TYPE_LEGACY_HEAD
-    del config["min_workers"]
-    return config
-
-
 def fillout_defaults(config: Dict[str, Any]) -> Dict[str, Any]:
     defaults = _get_default_config(config["provider"])
     defaults.update(config)
-    defaults["auth"] = defaults.get("auth", {})
-    defaults = rewrite_legacy_yaml_to_available_node_types(defaults)
-    return defaults
+
+    # Just for clarity:
+    merged_config = copy.deepcopy(defaults)
+
+    # Fill auth field to avoid key errors.
+    # This field is accessed when calling NodeUpdater but is not relevant to
+    # certain node providers and is thus left out of some cluster launching
+    # configs.
+    merged_config["auth"] = merged_config.get("auth", {})
+
+    # A legacy config is one which doesn't have available_node_types,
+    # but has at least one of head_node or worker_nodes.
+    is_legacy_config = (("available_node_types" not in config) and
+                        ("head_node" in config or "worker_nodes" in config))
+    # Do merging logic for legacy configs.
+    if is_legacy_config:
+        merged_config = merge_legacy_yaml_with_defaults(merged_config)
+    # Take care of this here, in case a config does not specify any of head,
+    # workers, node types, but does specify min workers:
+    merged_config.pop("min_workers", None)
+
+    return merged_config
+
+
+def merge_legacy_yaml_with_defaults(
+        merged_config: Dict[str, Any]) -> Dict[str, Any]:
+    """Rewrite legacy config's available node types after it has been merged
+    with defaults yaml.
+    """
+    logger.warning("Converting legacy cluster config to multi node types.\n"
+                   "Refer to the docs for examples of multi-node-type "
+                   "autoscaling:\n"
+                   "https://docs.ray.io/en/master/cluster/config.html"
+                   "#full-configuration")
+
+    # Get default head and worker types.
+    default_head_type = merged_config["head_node_type"]
+    # Default configs are assumed to have two node types -- one for the head
+    # and one for the workers.
+    assert len(merged_config["available_node_types"].keys()) == 2
+    default_worker_type = (merged_config["available_node_types"].keys() -
+                           {default_head_type}).pop()
+
+    if merged_config["head_node"]:
+        # User specified a head node in legacy config.
+        # Convert it into data for the head's node type.
+        head_node_info = {
+            "node_config": merged_config["head_node"],
+            "resources": merged_config["head_node"].get("resources") or {},
+            "min_workers": 0,
+            "max_workers": 0,
+        }
+    else:
+        # Use default data for the head's node type.
+        head_node_info = merged_config["available_node_types"][
+            default_head_type]
+    if merged_config["worker_nodes"]:
+        # User specified a worker node in legacy config.
+        # Convert it into data for the workers' node type.
+        worker_node_info = {
+            "node_config": merged_config["worker_nodes"],
+            "resources": merged_config["worker_nodes"].get("resources") or {},
+            "min_workers": merged_config.get("min_workers", 0),
+            "max_workers": merged_config["max_workers"],
+        }
+    else:
+        # Use default data for the workers' node type.
+        worker_node_info = merged_config["available_node_types"][
+            default_worker_type]
+
+    # Rewrite available_node_types.
+    merged_config["available_node_types"] = {
+        NODE_TYPE_LEGACY_HEAD: head_node_info,
+        NODE_TYPE_LEGACY_WORKER: worker_node_info
+    }
+    merged_config["head_node_type"] = NODE_TYPE_LEGACY_HEAD
+
+    # Resources field in head/worker fields cause node launch to fail.
+    merged_config["head_node"].pop("resources", None)
+    merged_config["worker_nodes"].pop("resources", None)
+
+    return merged_config


 def merge_setup_commands(config):
@@ -147,7 +201,6 @@ def merge_setup_commands(config):

 def fill_node_type_max_workers(config):
     """Sets default per-node max workers to global max_workers.
-
     This equivalent to setting the default per-node max workers to infinity,
     with the only upper constraint coming from the global max_workers.
     """

View file

@@ -1,12 +1,8 @@
 # An unique identifier for the head node and workers of this cluster.
 cluster_name: default

-# The minimum number of workers nodes to launch in addition to the head
-# node. This number should be >= 0.
-min_workers: 0
-
 # The maximum number of workers nodes to launch in addition to the head
-# node. This takes precedence over min_workers.
+# node.
 max_workers: 2

 # The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -43,38 +39,63 @@ auth:
 # configurations below.
 # ssh_private_key: /path/to/your/key.pem

-# Provider-specific config for the head node, e.g. instance type. By default
-# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
-# For more documentation on available fields, see:
-# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
-head_node:
-    InstanceType: m5.large
-    ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
-
-    # You can provision additional disk space with a conf as follows
-    BlockDeviceMappings:
-        - DeviceName: /dev/sda1
-          Ebs:
-              VolumeSize: 100
-    # Additional options in the boto docs.
-
-# Provider-specific config for worker nodes, e.g. instance type. By default
-# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
-# For more documentation on available fields, see:
-# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
-worker_nodes:
-    InstanceType: m5.large
-    ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
-    # Run workers on spot by default. Comment this out to use on-demand.
-    InstanceMarketOptions:
-        MarketType: spot
-        # Additional options can be found in the boto docs, e.g.
-        #   SpotOptions:
-        #       MaxPrice: MAX_HOURLY_PRICE
-    # Additional options in the boto docs.
+# Tell the autoscaler the allowed node types and the resources they provide.
+# The key is the name of the node type, which is just for debugging purposes.
+# The node config specifies the launch config and physical instance type.
+available_node_types:
+    ray.head.default:
+        # The minimum number of worker nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The maximum number of worker nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 0
+        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
+        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
+        # You can also set custom resources.
+        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
+        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
+        resources: {}
+        # Provider-specific config for this node type, e.g. instance type. By default
+        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+        # For more documentation on available fields, see:
+        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+        node_config:
+            InstanceType: m5.large
+            ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
+            # You can provision additional disk space with a conf as follows
+            BlockDeviceMappings:
+                - DeviceName: /dev/sda1
+                  Ebs:
+                      VolumeSize: 100
+            # Additional options in the boto docs.
+    ray.worker.default:
+        # The minimum number of nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 0
+        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
+        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
+        # You can also set custom resources.
+        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
+        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
+        resources: {}
+        # Provider-specific config for this node type, e.g. instance type. By default
+        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+        # For more documentation on available fields, see:
+        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+        node_config:
+            InstanceType: m5.large
+            ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
+            # Run workers on spot by default. Comment this out to use on-demand.
+            InstanceMarketOptions:
+                MarketType: spot
+                # Additional options can be found in the boto docs, e.g.
+                #   SpotOptions:
+                #       MaxPrice: MAX_HOURLY_PRICE
+            # Additional options in the boto docs.
+
+# Specify the node type of the head node (as configured above).
+head_node_type: ray.head.default

 # Files or directories to copy to the head and worker nodes. The format is a
 # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@@ -108,15 +129,8 @@ initialization_commands: []

 # List of shell commands to run to set up nodes.
 setup_commands:
-    # Note: if you're developing Ray, you probably want to create an AMI that
-    # has your Ray repo pre-cloned. Then, you can replace the pip installs
-    # below with a git checkout <your_sha> (and possibly a recompile).
-    - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
-    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
-    # Consider uncommenting these if you also want to run apt-get commands during setup
-    # - sudo pkill -9 apt-get || true
-    # - sudo pkill -9 dpkg || true
-    # - sudo dpkg --configure -a
+    - echo 'export PATH="$HOME/anaconda3/envs/tensorflow2_latest_p37/bin:$PATH"' >> ~/.bashrc
+    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl

 # Custom commands that will be run on the head node after common setup.
 head_setup_commands:
@@ -134,3 +148,6 @@ head_start_ray_commands:
 worker_start_ray_commands:
     - ray stop
     - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
+
+head_node: {}
+worker_nodes: {}
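The per-node-type resources override documented above feeds into Ray's scheduler; as a hedged illustration (the resource name and quantities are made up, and ray.init is used as a local stand-in for a node advertising them), a task can target such a custom resource like this:

import ray

# Suppose a worker node type advertised
#   resources: {"CPU": 1, "GPU": 1, "custom": 5}
# in its cluster config. Tasks can then request that custom resource.
ray.init(resources={"custom": 5})  # Local stand-in for such a node.

@ray.remote(resources={"custom": 1})
def uses_custom_resource():
    return "scheduled on a node advertising the 'custom' resource"

print(ray.get(uses_custom_resource.remote()))
ray.shutdown()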

View file

@ -0,0 +1,148 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: True
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
# Availability zone(s), comma-separated, that nodes may be launched in.
# Nodes are currently spread between zones by a round-robin approach,
# however this implementation detail should not be relied upon.
availability_zone: us-west-2a,us-west-2b
# Whether to allow node reuse. If set to False, nodes will be terminated
# instead of stopped.
cache_stopped_nodes: True # If not present, the default is True.
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076

View file

@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster. # An unique identifier for the head node and workers of this cluster.
cluster_name: default cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head # The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers. # node.
max_workers: 2 max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed. # The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -57,38 +53,66 @@ auth:
# configurations below. # configurations below.
# ssh_private_key: /path/to/your/key.pem # ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default # Tell the autoscaler the allowed node types and the resources they provide.
# Ray will auto-configure unspecified fields such as SubnetId and KeyName. # The key is the name of the node type, which is just for debugging purposes.
# For more documentation on available fields, see: # The node config specifies the launch config and physical instance type.
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances available_node_types:
head_node: ray.head.default:
InstanceType: m5.large # The minimum number of worker nodes of this type to launch.
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30 # This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
ray.worker.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 2
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# You can provision additional disk space with a conf as follows # Specify the node type of the head node (as configured above).
BlockDeviceMappings: head_node_type: ray.head.default
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# Files or directories to copy to the head and worker nodes. The format is a # Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -146,3 +170,6 @@ head_start_ray_commands:
worker_start_ray_commands: worker_start_ray_commands:
- ray stop - ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
head_node: {}
worker_nodes: {}

View file

@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster. # An unique identifier for the head node and workers of this cluster.
cluster_name: gpu-docker cluster_name: gpu-docker
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head # The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers. # node.
max_workers: 2 max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed. # The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -23,10 +19,6 @@ docker:
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull # image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_nvidia_docker" # e.g. ray_docker container_name: "ray_nvidia_docker" # e.g. ray_docker
# # Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# worker_image: "rayproject/ray-ml:latest"
# If a node is idle for this many minutes, it will be removed. # If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5 idle_timeout_minutes: 5
@ -48,38 +40,74 @@ auth:
# configurations below. # configurations below.
# ssh_private_key: /path/to/your/key.pem # ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default # Tell the autoscaler the allowed node types and the resources they provide.
# Ray will auto-configure unspecified fields such as SubnetId and KeyName. # The key is the name of the node type, which is just for debugging purposes.
# For more documentation on available fields, see: # The node config specifies the launch config and physical instance type.
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances available_node_types:
head_node: # GPU head node.
InstanceType: p2.xlarge ray.head.gpu:
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30 # worker_image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: p2.xlarge
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# CPU workers.
ray.worker.default:
# Override global docker setting.
# This node type will run a CPU image,
# rather than the GPU image specified in the global docker settings.
docker:
worker_image: "rayproject/ray-ml:latest-cpu"
# The minimum number of nodes of this type to launch.
# This number should be >= 0.
min_workers: 1
# The maximum number of workers nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 2
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# You can provision additional disk space with a conf as follows # Specify the node type of the head node (as configured above).
BlockDeviceMappings: head_node_type: ray.head.gpu
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# Files or directories to copy to the head and worker nodes. The format is a # Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
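The gpu-docker example above introduces a per-node-type docker override so CPU workers can pull a CPU image while the head runs a GPU image; a minimal sketch of that shape as the autoscaler sees it (image tags and node-type names mirror the example, everything else is illustrative):

# Sketch: a GPU head with CPU workers, expressed as the dict the autoscaler
# reads from YAML. The worker type overrides the global docker image.
gpu_docker_config = {
    "docker": {"image": "rayproject/ray-ml:latest-gpu",
               "container_name": "ray_nvidia_docker"},
    "available_node_types": {
        "ray.head.gpu": {
            "min_workers": 0,
            "max_workers": 0,
            "resources": {},
            "node_config": {"InstanceType": "p2.xlarge"},
        },
        "ray.worker.default": {
            # Per-node-type override: these workers pull a CPU image instead
            # of the global GPU image.
            "docker": {"worker_image": "rayproject/ray-ml:latest-cpu"},
            "min_workers": 1,
            "max_workers": 2,
            "resources": {},
            "node_config": {"InstanceType": "m5.large"},
        },
    },
    "head_node_type": "ray.head.gpu",
}

assert "docker" in gpu_docker_config["available_node_types"]["ray.worker.default"]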

View file

@@ -2,7 +2,7 @@
 cluster_name: minimal

 # The maximum number of workers nodes to launch in addition to the head
-# node. This takes precedence over min_workers. min_workers default to 0.
+# node. min_workers default to 0.
 max_workers: 1

 # Cloud-provider specific configuration.

View file

@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster. # An unique identifier for the head node and workers of this cluster.
cluster_name: default cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head # The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers. # node.
max_workers: 2 max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed. # The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -17,7 +13,7 @@ upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container, # This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster. # and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled. # Empty object means disabled.
docker: {} docker: {}
# If a node is idle for this many minutes, it will be removed. # If a node is idle for this many minutes, it will be removed.
@ -46,30 +42,52 @@ auth:
# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs # Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below # on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
# Provider-specific config for the head node, e.g. instance type. # Tell the autoscaler the allowed node types and the resources they provide.
head_node: # The key is the name of the node type, which is just for debugging purposes.
azure_arm_parameters: # The node config specifies the launch config and physical instance type.
vmSize: Standard_D2s_v3 available_node_types:
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage ray.head.default:
imagePublisher: microsoft-dsvm # The minimum number of worker nodes of this type to launch.
imageOffer: ubuntu-1804 # This number should be >= 0.
imageSku: 1804-gen2 min_workers: 0
imageVersion: 20.07.06 # The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config, e.g. instance type.
node_config:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# Provider-specific config for worker nodes, e.g. instance type. ray.worker.default:
worker_nodes: # The minimum number of nodes of this type to launch.
azure_arm_parameters: # This number should be >= 0.
vmSize: Standard_D2s_v3 min_workers: 0
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage # The resources provided by this node type.
imagePublisher: microsoft-dsvm resources: {"CPU": 2}
imageOffer: ubuntu-1804 # Provider-specific config, e.g. instance type.
imageSku: 1804-gen2 node_config:
imageVersion: 20.07.06 azure_arm_parameters:
# optionally set priority to use Spot instances vmSize: Standard_D2s_v3
priority: Spot # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
# set a maximum price for spot instances if desired imagePublisher: microsoft-dsvm
# billingProfile: imageOffer: ubuntu-1804
# maxPrice: -1 imageSku: 1804-gen2
imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default
# Files or directories to copy to the head and worker nodes. The format is a # Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -134,3 +152,6 @@ head_start_ray_commands:
worker_start_ray_commands: worker_start_ray_commands:
- ray stop - ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
head_node: {}
worker_nodes: {}

View file

@ -19,18 +19,20 @@ upscaling_speed: 1.0
# and opens all the necessary ports to support the Ray cluster. # and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled. # Empty string means disabled.
docker: docker:
image: "rayproject/ray-ml:latest-gpu" image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull # image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_docker" container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present. # if no cached version is present.
pull_before_run: False pull_before_run: True
run_options: [] # Extra options to pass into "docker run" run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers # Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu" # head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest" # worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed. # If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5 idle_timeout_minutes: 5
@ -42,7 +44,7 @@ provider:
location: westus2 location: westus2
resource_group: ray-cluster resource_group: ray-cluster
# set subscription id otherwise the default from az cli will be used # set subscription id otherwise the default from az cli will be used
# subscription_id: 00000000-0000-0000-0000-000000000000 # subscription_id: 00000000-0000-0000-0000-000000000000
# How Ray will authenticate with newly launched nodes. # How Ray will authenticate with newly launched nodes.
auth: auth:
@ -53,27 +55,35 @@ auth:
# changes to this should match what is specified in file_mounts # changes to this should match what is specified in file_mounts
ssh_public_key: ~/.ssh/id_rsa.pub ssh_public_key: ~/.ssh/id_rsa.pub
# Provider-specific config for the head node, e.g. instance type. By default # More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
# Ray will auto-configure unspecified fields using defaults.yaml # See documentation here: https://docs.microsoft.com/en-us/azure/templates/microsoft.compute/2019-03-01/virtualmachines
# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
# Provider-specific config for the head node, e.g. instance type.
head_node: head_node:
azure_arm_parameters: azure_arm_parameters:
vmSize: Standard_NC6 vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804 imageOffer: ubuntu-1804
imageSku: "1804" imageSku: 1804-gen2
imageVersion: 20.07.06 imageVersion: 20.07.06
# Provider-specific config for worker nodes, e.g. instance type. By default # Provider-specific config for worker nodes, e.g. instance type.
# Ray will auto-configure unspecified fields using defaults.yaml
worker_nodes: worker_nodes:
azure_arm_parameters: azure_arm_parameters:
vmSize: Standard_NC6 vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804 imageOffer: ubuntu-1804
imageSku: "1804" imageSku: 1804-gen2
imageVersion: 20.07.06 imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
# Files or directories to copy to the head and worker nodes. The format is a # Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -83,6 +93,27 @@ file_mounts: {
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub" "/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
} }
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is # List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker # enabled, these commands will run outside the container and before docker
# is setup. # is setup.
@ -92,20 +123,16 @@ initialization_commands:
# List of shell commands to run to set up nodes. # List of shell commands to run to set up nodes.
setup_commands: setup_commands:
# Note: if you're developing Ray, you probably want to create an AMI that # Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs # has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile). # below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc - echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
- echo 'conda activate py37_tensorflow' >> ~/.bashrc - echo 'conda activate py37_tensorflow' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Consider uncommenting these if you also want to run apt-get commands during setup
# - sudo pkill -9 apt-get || true
# - sudo pkill -9 dpkg || true
# - sudo dpkg --configure -a
# Custom commands that will be run on the head node after common setup. # Custom commands that will be run on the head node after common setup.
head_setup_commands: head_setup_commands:
- pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0 - pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0
# Custom commands that will be run on worker nodes after common setup. # Custom commands that will be run on worker nodes after common setup.

View file

@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster. # An unique identifier for the head node and workers of this cluster.
cluster_name: default cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head # The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers. # node.
max_workers: 2 max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed. # The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -17,7 +13,7 @@ upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container, # This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster. # and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled. # Empty object means disabled.
docker: docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull # image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
@ -60,30 +56,55 @@ auth:
# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs # Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below # on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
# Provider-specific config for the head node, e.g. instance type. # Tell the autoscaler the allowed node types and the resources they provide.
head_node: # The key is the name of the node type, which is just for debugging purposes.
azure_arm_parameters: # The node config specifies the launch config and physical instance type.
vmSize: Standard_D2s_v3 available_node_types:
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage ray.head.default:
imagePublisher: microsoft-dsvm # The minimum number of worker nodes of this type to launch.
imageOffer: ubuntu-1804 # This number should be >= 0.
imageSku: 1804-gen2 min_workers: 0
imageVersion: 20.07.06 # The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config, e.g. instance type.
node_config:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# Provider-specific config for worker nodes, e.g. instance type. ray.worker.default:
worker_nodes: # The minimum number of worker nodes of this type to launch.
azure_arm_parameters: # This number should be >= 0.
vmSize: Standard_D2s_v3 min_workers: 0
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage # The maximum number of worker nodes of this type to launch.
imagePublisher: microsoft-dsvm # This takes precedence over min_workers.
imageOffer: ubuntu-1804 max_workers: 2
imageSku: 1804-gen2 # The resources provided by this node type.
imageVersion: 20.07.06 resources: {"CPU": 2}
# optionally set priority to use Spot instances # Provider-specific config, e.g. instance type.
priority: Spot node_config:
# set a maximum price for spot instances if desired azure_arm_parameters:
# billingProfile: vmSize: Standard_D2s_v3
# maxPrice: -1 # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default
# Files or directories to copy to the head and worker nodes. The format is a # Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -147,3 +168,6 @@ head_start_ray_commands:
worker_start_ray_commands: worker_start_ray_commands:
- ray stop - ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
head_node: {}
worker_nodes: {}

View file

@@ -1,12 +1,8 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: gpu-docker

# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -21,7 +17,7 @@ upscaling_speed: 1.0
docker:
    image: "rayproject/ray-ml:latest-gpu"
    # image: rayproject/ray:latest-gpu   # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_nvidia_docker"

    # # Example of running a GPU head with CPU workers
    # head_image: "rayproject/ray-ml:latest-gpu"

@@ -45,17 +41,40 @@ auth:
    # changes to this should match what is specified in file_mounts
    ssh_public_key: ~/.ssh/id_rsa.pub

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.gpu:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 6, "GPU": 1}
        # Provider-specific config, e.g. instance type.
        node_config:
            azure_arm_parameters:
                vmSize: Standard_NC6_v3
    ray.worker.gpu:
        # The minimum number of nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 2
        # The resources provided by this node type.
        resources: {"CPU": 6, "GPU": 1}
        # Provider-specific config, e.g. instance type.
        node_config:
            azure_arm_parameters:
                vmSize: Standard_NC6_v3

# Specify the node type of the head node (as configured above).
head_node_type: ray.head.gpu

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

@@ -69,7 +88,7 @@ file_mounts: {
# NOTE: rayproject/ray-ml:latest has ray latest bundled
setup_commands: []
    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl

# Custom commands that will be run on the head node after common setup.
head_setup_commands:
    - pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0
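The pattern above -- a dict of named node types plus a head_node_type pointer -- is the core of the new schema. A quick illustrative sanity check in Python (assuming the YAML above is saved as example-gpu-docker.yaml and PyYAML is installed; this snippet is not part of the Ray codebase):

import yaml

config = yaml.safe_load(open("example-gpu-docker.yaml"))

# The head type must be one of the declared node types...
assert config["head_node_type"] in config["available_node_types"]

# ...and each node type pairs logical resources (what the autoscaler
# schedules against) with a provider-specific launch config.
for name, node_type in config["available_node_types"].items():
    print(name, node_type.get("resources"), node_type["node_config"])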

@@ -2,7 +2,7 @@
cluster_name: minimal

# The maximum number of worker nodes to launch in addition to the head
# node. min_workers defaults to 0.
max_workers: 1

# Cloud-provider specific configuration.

@@ -1,12 +1,8 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default

# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -39,50 +35,75 @@ auth:
# project wide meta-data.
# ssh_private_key: /path/to/your/key.pem

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray_head_default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config for this node type, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
        # For more documentation on available fields, see:
        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
        node_config:
            machineType: n1-standard-2
            disks:
              - boot: true
                autoDelete: true
                type: PERSISTENT
                initializeParams:
                  diskSizeGb: 50
                  # See https://cloud.google.com/compute/docs/images for more images
                  sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
            # Additional options can be found in the compute docs at
            # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
            # If the network interface is specified as below in both head and worker
            # nodes, the manual network config is used. Otherwise an existing subnet is
            # used. To use a shared subnet, ask the subnet owner to grant permission
            # for 'compute.subnetworks.use' to the ray autoscaler account...
            # networkInterfaces:
            #   - kind: compute#networkInterface
            #     subnetwork: path/to/subnet
            #     aliasIpRanges: []
    ray_worker_small:
        # The minimum number of nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config for this node type, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
        # For more documentation on available fields, see:
        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
        node_config:
            machineType: n1-standard-2
            disks:
              - boot: true
                autoDelete: true
                type: PERSISTENT
                initializeParams:
                  diskSizeGb: 50
                  # See https://cloud.google.com/compute/docs/images for more images
                  sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
            # Run workers on preemptible instances by default.
            # Comment this out to use on-demand.
            scheduling:
              - preemptible: true
            # Additional options can be found in the compute docs at
            # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert

# Specify the node type of the head node (as configured above).
head_node_type: ray_head_default

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

@@ -159,3 +180,6 @@ worker_start_ray_commands:
      ray start
      --address=$RAY_HEAD_IP:6379
      --object-manager-port=8076

head_node: {}
worker_nodes: {}
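With resources declared per node type, the largest cluster this file can reach follows directly from the values above (the head type is capped at max_workers: 0, and the worker type is limited only by the cluster-wide max_workers of 2). A small worked example:

head_cpus = 2      # ray_head_default resources: {"CPU": 2}
worker_cpus = 2    # ray_worker_small resources: {"CPU": 2}
max_workers = 2    # cluster-wide cap; ray_worker_small sets no per-type cap
print("CPUs at full scale:", head_cpus + max_workers * worker_cpus)  # -> 6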

@@ -0,0 +1,167 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default

# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0

# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: True
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: gcp
region: us-west1
availability_zone: us-west1-a
project_id: null # Globally unique project id
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below. This requires that you have added the key into the
# project wide meta-data.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
head_node:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Additional options can be found in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# If the network interface is specified as below in both head and worker
# nodes, the manual network config is used. Otherwise an existing subnet is
# used. To use a shared subnet, ask the subnet owner to grant permission
# for 'compute.subnetworks.use' to the ray autoscaler account...
# networkInterfaces:
# - kind: compute#networkInterface
# subnetwork: path/to/subnet
# aliasIpRanges: []
worker_nodes:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Run workers on preemptible instances by default.
# Comment this out to use on-demand.
scheduling:
- preemptible: true
# Additional options can be found in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
- pip install google-api-python-client==1.7.8
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- >-
ulimit -n 65536;
ray start
--head
--port=6379
--object-manager-port=8076
--autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- >-
ulimit -n 65536;
ray start
--address=$RAY_HEAD_IP:6379
--object-manager-port=8076
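The upscaling_speed comment near the top of this file describes scale-up in chunks proportional to the nodes already running. A rough, hypothetical sketch of that rule (the ceiling and the one-node floor are assumptions for illustration, not the exact autoscaler implementation):

import math

def upscale_chunk(currently_running_nodes, upscaling_speed=1.0):
    # "chunks of upscaling_speed * currently_running_nodes", with an assumed
    # floor of one node so an empty cluster can still grow.
    return max(1, math.ceil(upscaling_speed * currently_running_nodes))

for running in (0, 1, 4, 10):
    print(running, "->", upscale_chunk(running))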

@@ -1,12 +1,8 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default

# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -53,50 +49,78 @@ auth:
# project wide meta-data.
# ssh_private_key: /path/to/your/key.pem

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray_head_default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config for the head node, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
        # For more documentation on available fields, see:
        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
        node_config:
            machineType: n1-standard-2
            disks:
              - boot: true
                autoDelete: true
                type: PERSISTENT
                initializeParams:
                  diskSizeGb: 50
                  # See https://cloud.google.com/compute/docs/images for more images
                  sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
            # Additional options can be found in the compute docs at
            # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
            # If the network interface is specified as below in both head and worker
            # nodes, the manual network config is used. Otherwise an existing subnet is
            # used. To use a shared subnet, ask the subnet owner to grant permission
            # for 'compute.subnetworks.use' to the ray autoscaler account...
            # networkInterfaces:
            #   - kind: compute#networkInterface
            #     subnetwork: path/to/subnet
            #     aliasIpRanges: []
    ray_worker_small:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 2
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config for the head node, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
        # For more documentation on available fields, see:
        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
        node_config:
            machineType: n1-standard-2
            disks:
              - boot: true
                autoDelete: true
                type: PERSISTENT
                initializeParams:
                  diskSizeGb: 50
                  # See https://cloud.google.com/compute/docs/images for more images
                  sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
            # Run workers on preemptible instances by default.
            # Comment this out to use on-demand.
            scheduling:
              - preemptible: true
            # Additional options can be found in the compute docs at
            # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert

# Specify the node type of the head node (as configured above).
head_node_type: ray_head_default

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

@@ -166,3 +190,6 @@ worker_start_ray_commands:
      ray start
      --address=$RAY_HEAD_IP:6379
      --object-manager-port=8076

head_node: {}
worker_nodes: {}

@@ -1,12 +1,8 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: gpu-docker

# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -48,58 +44,81 @@ auth:
# project wide meta-data.
# ssh_private_key: /path/to/your/key.pem

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray_head_gpu:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 6, "GPU": 1}
        # Provider-specific config for the head node, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
        # For more documentation on available fields, see:
        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
        node_config:
            machineType: custom-6-16384
            disks:
              - boot: true
                autoDelete: true
                type: PERSISTENT
                initializeParams:
                  diskSizeGb: 50
                  # See https://cloud.google.com/compute/docs/images for more images
                  sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
            guestAccelerators:
              - acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
                acceleratorCount: 1
            metadata:
              items:
                - key: install-nvidia-driver
                  value: "True"
            scheduling:
              - onHostMaintenance: TERMINATE
    ray_worker_gpu:
        # The minimum number of nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 2
        # The resources provided by this node type.
        resources: {"CPU": 2, "GPU": 1}
        # Provider-specific config for the head node, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
        # For more documentation on available fields, see:
        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
        node_config:
            machineType: n1-standard-2
            disks:
              - boot: true
                autoDelete: true
                type: PERSISTENT
                initializeParams:
                  diskSizeGb: 50
                  # See https://cloud.google.com/compute/docs/images for more images
                  sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
            guestAccelerators:
              - acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
                acceleratorCount: 1
            metadata:
              items:
                - key: install-nvidia-driver
                  value: "True"
            # Run workers on preemptible instances by default.
            # Comment this out to use on-demand.
            scheduling:
              - preemptible: true
              - onHostMaintenance: TERMINATE

# Specify the node type of the head node (as configured above).
head_node_type: ray_head_gpu

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

@@ -2,7 +2,7 @@
cluster_name: minimal

# The maximum number of worker nodes to launch in addition to the head
# node. min_workers defaults to 0.
max_workers: 1

# Cloud-provider specific configuration.

@@ -96,8 +96,6 @@ available_node_types:
    worker_node:
        # Minimum number of Ray workers of this Pod type.
        min_workers: 0
        node_config:
            apiVersion: v1
            kind: Pod
@@ -136,6 +134,12 @@ available_node_types:
                        # cause problems for other pods.
                        memory: 512Mi
    head_node:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        node_config:
            apiVersion: v1
            kind: Pod

@@ -139,6 +139,12 @@ available_node_types:
                        # cause problems for other pods.
                        memory: 512Mi
    head_node:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        node_config:
            apiVersion: v1
            kind: Pod

@@ -17,6 +17,10 @@ spec:
    # Specify the allowed pod types for this ray cluster and the resources they provide.
    podTypes:
    - name: head-node
      # Minimum number of Ray workers of this Pod type.
      minWorkers: 0
      # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
      maxWorkers: 0
      podConfig:
        apiVersion: v1
        kind: Pod

@@ -17,6 +17,10 @@ spec:
    # Specify the allowed pod types for this ray cluster and the resources they provide.
    podTypes:
    - name: head-node
      # Minimum number of Ray workers of this Pod type.
      minWorkers: 0
      # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
      maxWorkers: 0
      podConfig:
        apiVersion: v1
        kind: Pod

@@ -1,16 +1,8 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default

## NOTE: Typically for local clusters, max_workers == len(worker_ips).
max_workers: 0

# The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -42,11 +34,20 @@ auth:
    # Optional if an ssh private key is necessary to ssh to the cluster.
    # ssh_private_key: ~/.ssh/id_rsa

available_node_types:
    ray.head.default:
        resources: {}
        min_workers: 0
        max_workers: 0
        # Leave this empty
        node_config: {}
    ray.worker.default:
        resources: {}
        ## NOTE: Typically for local clusters, max_workers == len(worker_ips).
        min_workers: 0
        # Leave this empty
        node_config: {}
head_node_type: ray.head.default

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

@@ -97,3 +98,6 @@ head_start_ray_commands:
worker_start_ray_commands:
    - ray stop
    - ray start --address=$RAY_HEAD_IP:6379

head_node: {}
worker_nodes: {}
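For local clusters the NOTE above is the whole sizing story: the cap is just the number of machines you listed. A trivial illustration (the addresses are hypothetical):

worker_ips = ["192.168.1.10", "192.168.1.11"]
max_workers = len(worker_ips)  # == 2, per the NOTE above
print(max_workers)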

@@ -2,10 +2,6 @@
# A namespace will be automatically created for each cluster_name in SKE.
cluster_name: default

# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2

@@ -85,174 +81,184 @@ provider:
    # Exposing external IP addresses for ray pods isn't currently supported.
    use_internal_ips: true

head_node_type: ray.head.default

available_node_types:
    ray.head.default:
        resources: {"CPU": 1}
        min_workers: 0
        max_workers: 0
        # Kubernetes pod config for the head node pod.
        node_config:
            apiVersion: v1
            kind: Pod
            metadata:
                # Automatically generates a name for the pod with this prefix.
                generateName: ray-head-

                # Must match the head node service selector above if a head node
                # service is required.
                labels:
                    component: ray-head

                # https://docs.staroid.com/ske/pod.html#pod
                pod.staroid.com/spot: "false"  # use on-demand instance for head.

                # Uncomment to locate ray head to dedicated Kubernetes node
                # (GPU instance is only available for 'dedicated' isolation)
                #pod.staroid.com/isolation: dedicated
                #pod.staroid.com/instance-type: gpu-1
            spec:
                automountServiceAccountToken: true

                # Restarting the head node automatically is not currently supported.
                # If the head node goes down, `ray up` must be run again.
                restartPolicy: Never

                # This volume allocates shared memory for Ray to use for its plasma
                # object store. If you do not provide this, Ray will fall back to
                # /tmp which causes slowdowns if it is not a shared memory volume.
                volumes:
                - name: dshm
                  emptyDir:
                      medium: Memory
                # nfs volume provides a shared volume across all ray-nodes.
                - name: nfs-volume
                  persistentVolumeClaim:
                      claimName: nfs

                containers:
                - name: ray-node
                  imagePullPolicy: Always
                  # You are free (and encouraged) to use your own container image,
                  # but it should have the following installed:
                  #   - rsync (used for `ray rsync` commands and file mounts)
                  #   - screen (used for `ray attach`)
                  #   - kubectl (used by the autoscaler to manage worker pods)
                  # Image will be overridden when 'image_from_project' is true.
                  image: rayproject/ray
                  # Do not change this command - it keeps the pod alive until it is
                  # explicitly killed.
                  command: ["/bin/bash", "-c", "--"]
                  args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
                  ports:
                      - containerPort: 6379  # Redis port.
                      - containerPort: 6380  # Redis port.
                      - containerPort: 6381  # Redis port.
                      - containerPort: 12345  # Ray internal communication.
                      - containerPort: 12346  # Ray internal communication.

                  # This volume allocates shared memory for Ray to use for its plasma
                  # object store. If you do not provide this, Ray will fall back to
                  # /tmp which causes slowdowns if it is not a shared memory volume.
                  volumeMounts:
                      - mountPath: /dev/shm
                        name: dshm
                      - mountPath: /nfs
                        name: nfs-volume
                  resources:
                      requests:
                          cpu: 1000m
                          memory: 2Gi
                      limits:
                          # The maximum memory that this pod is allowed to use. The
                          # limit will be detected by ray and split to use 10% for
                          # redis, 30% for the shared memory object store, and the
                          # rest for application memory. If this limit is not set and
                          # the object store size is not set manually, ray will
                          # allocate a very large object store in each pod that may
                          # cause problems for other pods.
                          memory: 2Gi
                  env:
                      # This is used in the head_start_ray_commands below so that
                      # Ray can spawn the correct number of processes. Omitting this
                      # may lead to degraded performance.
                      - name: MY_CPU_REQUEST
                        valueFrom:
                            resourceFieldRef:
                                resource: requests.cpu
                      - name: RAY_ADDRESS
                        value: "auto"
    ray.worker.default:
        min_workers: 0
        resources: {"CPU": 1}
        # Kubernetes pod config for worker node pods.
        node_config:
            apiVersion: v1
            kind: Pod
            metadata:
                # Automatically generates a name for the pod with this prefix.
                generateName: ray-worker-

                # Must match the worker node service selector above if a worker node
                # service is required.
                labels:
                    component: ray-worker

                # https://docs.staroid.com/ske/pod.html#pod
                pod.staroid.com/spot: "true"  # use spot instance for workers.

                # Uncomment to locate ray head to dedicated Kubernetes node
                # (GPU instance is only available for 'dedicated' isolation)
                #pod.staroid.com/isolation: dedicated
                #pod.staroid.com/instance-type: gpu-1
            spec:
                serviceAccountName: default

                # Worker nodes will be managed automatically by the head node, so
                # do not change the restart policy.
                restartPolicy: Never

                # This volume allocates shared memory for Ray to use for its plasma
                # object store. If you do not provide this, Ray will fall back to
                # /tmp which causes slowdowns if it is not a shared memory volume.
                volumes:
                - name: dshm
                  emptyDir:
                      medium: Memory
                - name: nfs-volume
                  persistentVolumeClaim:
                      claimName: nfs

                containers:
                - name: ray-node
                  imagePullPolicy: Always
                  # You are free (and encouraged) to use your own container image,
                  # but it should have the following installed:
                  #   - rsync (used for `ray rsync` commands and file mounts)
                  image: rayproject/autoscaler
                  # Do not change this command - it keeps the pod alive until it is
                  # explicitly killed.
                  command: ["/bin/bash", "-c", "--"]
                  args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
                  ports:
                      - containerPort: 12345  # Ray internal communication.
                      - containerPort: 12346  # Ray internal communication.

                  # This volume allocates shared memory for Ray to use for its plasma
                  # object store. If you do not provide this, Ray will fall back to
                  # /tmp which causes slowdowns if it is not a shared memory volume.
                  volumeMounts:
                      - mountPath: /dev/shm
                        name: dshm
                      - mountPath: /nfs
                        name: nfs-volume
                  resources:
                      requests:
                          cpu: 1000m
                          memory: 2Gi
                      limits:
                          # This memory limit will be detected by ray and split into
                          # 30% for plasma, and 70% for workers.
                          memory: 2Gi
                  env:
                      # This is used in the head_start_ray_commands below so that
                      # Ray can spawn the correct number of processes. Omitting this
                      # may lead to degraded performance.
                      - name: MY_CPU_REQUEST
                        valueFrom:
                            resourceFieldRef:
                                resource: requests.cpu

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

@@ -307,3 +313,6 @@ head_start_ray_commands:
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076

head_node: {}
worker_nodes: {}
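The container memory limits above are not just bookkeeping: as the head-node comment notes, Ray detects the pod limit and splits it between redis, the shared-memory object store, and application memory. An illustrative split for the 2Gi limit used here (the 10%/30% figures come from that comment; exact behavior depends on the Ray version):

def split_pod_memory(limit_bytes):
    redis = int(limit_bytes * 0.10)          # 10% for redis
    object_store = int(limit_bytes * 0.30)   # 30% for the plasma object store
    application = limit_bytes - redis - object_store  # rest for application memory
    return redis, object_store, application

gib = 1024 ** 3
print(split_pod_memory(2 * gib))  # the 2Gi pod limit from the example above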

@@ -1,8 +1,8 @@
import pytest
from ray.autoscaler._private.aws.config import _get_vpc_id_or_die, \
    bootstrap_aws, \
    DEFAULT_AMI
import ray.tests.aws.utils.stubs as stubs
import ray.tests.aws.utils.helpers as helpers
from ray.tests.aws.utils.constants import AUX_SUBNET, DEFAULT_SUBNET, \
@@ -143,8 +143,10 @@ def test_fills_out_amis(iam_client_stub, ec2_client_stub):
    stubs.configure_subnet_default(ec2_client_stub)

    config = helpers.load_aws_example_config_file("example-full.yaml")
    del config["available_node_types"]["ray.head.default"]["node_config"][
        "ImageId"]
    del config["available_node_types"]["ray.worker.default"]["node_config"][
        "ImageId"]

    # Pass in SG for stub to work
    config["head_node"]["SecurityGroupIds"] = ["sg-1234abcd"]

@@ -1,4 +1,5 @@
import json
import jsonschema
import os
import shutil
from subprocess import CalledProcessError
@@ -264,6 +265,55 @@ SMALL_CLUSTER = {
    "worker_start_ray_commands": ["start_ray_worker"],
}
MOCK_DEFAULT_CONFIG = {
"cluster_name": "default",
"max_workers": 2,
"upscaling_speed": 1.0,
"idle_timeout_minutes": 5,
"provider": {
"type": "mock",
"region": "us-east-1",
"availability_zone": "us-east-1a",
},
"docker": {
"image": "example",
"container_name": "mock",
},
"auth": {
"ssh_user": "ubuntu",
"ssh_private_key": os.devnull,
},
"available_node_types": {
"ray.head.default": {
"min_workers": 0,
"max_workers": 0,
"resources": {},
"node_config": {
"head_default_prop": 4
}
},
"ray.worker.default": {
"min_workers": 0,
"max_workers": 2,
"resources": {},
"node_config": {
"worker_default_prop": 7
}
}
},
"head_node_type": "ray.head.default",
"head_node": {},
"worker_nodes": {},
"file_mounts": {},
"cluster_synced_files": [],
"initialization_commands": [],
"setup_commands": [],
"head_setup_commands": [],
"worker_setup_commands": [],
"head_start_ray_commands": [],
"worker_start_ray_commands": [],
}
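MOCK_DEFAULT_CONFIG mirrors the shape of the new provider defaults, so individual tests can derive a per-test config from it instead of building one by hand. A minimal sketch (the tweaked values are arbitrary):

import copy

config = copy.deepcopy(MOCK_DEFAULT_CONFIG)
config["available_node_types"]["ray.worker.default"]["min_workers"] = 1
config["max_workers"] = 10

# Structural invariants the new schema relies on:
assert config["head_node_type"] in config["available_node_types"]
assert all("node_config" in t for t in config["available_node_types"].values())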
class LoadMetricsTest(unittest.TestCase):
    def testHeartbeat(self):
@@ -1645,6 +1695,28 @@ class AutoscalingTest(unittest.TestCase):
            config_path, LoadMetrics(), max_failures=0, update_interval_s=0)
        assert isinstance(autoscaler.provider, NodeProvider)
def testLegacyExternalNodeScalerMissingFields(self):
"""Should fail to validate legacy external config with missing
head_node, worker_nodes, or both."""
external_config = copy.deepcopy(SMALL_CLUSTER)
external_config["provider"] = {
"type": "external",
"module": "ray.autoscaler.node_provider.NodeProvider",
}
missing_workers, missing_head, missing_both = [
copy.deepcopy(external_config) for _ in range(3)
]
del missing_workers["worker_nodes"]
del missing_head["head_node"]
del missing_both["worker_nodes"]
del missing_both["head_node"]
for faulty_config in missing_workers, missing_head, missing_both:
faulty_config = prepare_config(faulty_config)
with pytest.raises(jsonschema.ValidationError):
validate_config(faulty_config)
    def testExternalNodeScalerWrongImport(self):
        config = SMALL_CLUSTER.copy()
        config["provider"] = {

@@ -1,4 +1,5 @@
import jsonschema
import logging
import os
import sys
import tempfile
@@ -9,10 +10,12 @@ import copy
from unittest.mock import MagicMock, Mock, patch
import pytest

from ray.autoscaler._private.util import prepare_config, validate_config,\
    _get_default_config, merge_setup_commands
from ray.autoscaler._private.providers import _NODE_PROVIDERS
from ray.autoscaler._private.kubernetes.node_provider import\
    KubernetesNodeProvider
from ray.autoscaler.tags import NODE_TYPE_LEGACY_HEAD, NODE_TYPE_LEGACY_WORKER

from ray.test_utils import load_test_config, recursive_fnmatch
@@ -37,18 +40,19 @@ CONFIG_PATHS = ignore_k8s_operator_configs(CONFIG_PATHS)
class AutoscalingConfigTest(unittest.TestCase):
    def testValidateDefaultConfig(self):
        for config_path in CONFIG_PATHS:
            if "aws/example-multi-node-type.yaml" in config_path:
                # aws is tested in testValidateDefaultConfigAWSMultiNodeTypes.
                continue
            with open(config_path) as f:
                config = yaml.safe_load(f)
            config = prepare_config(config)
            if config["provider"]["type"] == "kubernetes":
                KubernetesNodeProvider.fillout_available_node_types_resources(
                    config)
            try:
                validate_config(config)
            except Exception:
                logging.exception("")
                self.fail(
                    f"Config {config_path} did not pass validation test!")
@@ -232,7 +236,6 @@ class AutoscalingConfigTest(unittest.TestCase):
            self.fail("Failed to validate config with security group name!")

    def testMaxWorkerDefault(self):
        # Load config, call prepare config, check that default max_workers
        # is filled correctly for node types that don't specify it.
        # Check that max_workers is untouched for node types
@@ -254,7 +257,7 @@ class AutoscalingConfigTest(unittest.TestCase):
        # Max workers auto-filled with specified cluster-wide value of 5.
        assert config["max_workers"] ==\
            prepared_node_types["worker_node_max_unspecified"]["max_workers"]\
            == 5

        # Repeat with a config that doesn't specify global max workers.
        # Default value of 2 should be pulled in for global max workers.
@@ -275,8 +278,87 @@ class AutoscalingConfigTest(unittest.TestCase):
            prepared_node_types["worker_node_max_specified"][
                "max_workers"] == 3
        # Max workers auto-filled with default cluster-wide value of 2.
        assert prepared_config["max_workers"] ==\
            prepared_node_types["worker_node_max_unspecified"]["max_workers"]\
            == 2
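The assertions above pin down the fill-in rule: a node type that omits max_workers inherits the cluster-wide value, while an explicit per-type value is left untouched. A standalone sketch of that rule (fill_node_type_max_workers is a hypothetical helper written for illustration, not the actual prepare_config code):

def fill_node_type_max_workers(config):
    global_max = config.setdefault("max_workers", 2)  # default global cap of 2
    for node_type in config["available_node_types"].values():
        node_type.setdefault("max_workers", global_max)
    return config

cfg = {
    "max_workers": 5,
    "available_node_types": {
        "worker_node_max_specified": {"max_workers": 3},
        "worker_node_max_unspecified": {},
    },
}
filled = fill_node_type_max_workers(cfg)["available_node_types"]
assert filled["worker_node_max_specified"]["max_workers"] == 3
assert filled["worker_node_max_unspecified"]["max_workers"] == 5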
def testFillEdgeLegacyConfigs(self):
# Test edge cases: legacy configs which specify workers but not head
# or vice-versa.
no_head = load_test_config("test_no_head.yaml")
aws_defaults = _get_default_config(no_head["provider"])
head_prepared = prepare_config(no_head)
assert head_prepared["available_node_types"][
"ray-legacy-head-node-type"]["node_config"] ==\
aws_defaults["available_node_types"][
"ray.head.default"]["node_config"]
assert head_prepared["head_node"] == {}
# Custom worker config preserved
node_types = head_prepared["available_node_types"]
worker_type = node_types["ray-legacy-worker-node-type"]
assert worker_type["node_config"] == head_prepared["worker_nodes"] == {
"foo": "bar"
}
no_workers = load_test_config("test_no_workers.yaml")
workers_prepared = prepare_config(no_workers)
assert workers_prepared["available_node_types"][
"ray-legacy-worker-node-type"]["node_config"] ==\
aws_defaults["available_node_types"][
"ray.worker.default"]["node_config"]
assert workers_prepared["worker_nodes"] == {}
# Custom head config preserved
node_types = workers_prepared["available_node_types"]
head_type = node_types["ray-legacy-head-node-type"]
assert head_type["node_config"] == workers_prepared["head_node"] == {
"baz": "qux"
}
@pytest.mark.skipif(
sys.platform.startswith("win"), reason="Fails on Windows.")
def testExampleFull(self):
"""
Test that example-full yamls are unmodified by prepared_config,
except possibly by having setup_commands merged.
"""
providers = ["aws", "gcp", "azure"]
for provider in providers:
path = os.path.join(RAY_PATH, "autoscaler", provider,
"example-full.yaml")
config = yaml.safe_load(open(path).read())
config_copy = copy.deepcopy(config)
merge_setup_commands(config_copy)
assert config_copy == prepare_config(config)
@pytest.mark.skipif(
sys.platform.startswith("win"), reason="Fails on Windows.")
def testLegacyYaml(self):
# Test correct default-merging behavior for legacy yamls.
providers = ["aws", "gcp", "azure"]
for provider in providers:
path = os.path.join(RAY_PATH, "autoscaler", provider,
"example-full-legacy.yaml")
legacy_config = yaml.safe_load(open(path).read())
# custom head and workers
legacy_config["head_node"] = {"blahblah": 0}
legacy_config["worker_nodes"] = {"halbhalhb": 0}
legacy_config_copy = copy.deepcopy(legacy_config)
prepared_legacy = prepare_config(legacy_config_copy)
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_HEAD]["max_workers"] == 0
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_HEAD]["min_workers"] == 0
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_HEAD]["node_config"] == legacy_config[
"head_node"]
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_WORKER]["max_workers"] == 2
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_WORKER]["min_workers"] == 0
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_WORKER]["node_config"] == legacy_config[
"worker_nodes"]
if __name__ == "__main__":

@@ -0,0 +1,123 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default

# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0

# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: True
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
# Availability zone(s), comma-separated, that nodes may be launched in.
# Nodes are currently spread between zones by a round-robin approach,
# however this implementation detail should not be relied upon.
availability_zone: us-west-2a,us-west-2b
# Whether to allow node reuse. If set to False, nodes will be terminated
# instead of stopped.
cache_stopped_nodes: True # If not present, the default is True.
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
foo: bar
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
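A minimal sketch of driving a config like the one above from Python, assuming a local Ray installation with AWS credentials configured; the file name "cluster.yaml" is a hypothetical local copy of this file and is not part of the commit.

# Sketch: parse the cluster config and bring the cluster up/down via the Ray CLI.
import subprocess
import yaml

with open("cluster.yaml") as f:  # hypothetical copy of the config above
    config = yaml.safe_load(f)

# Basic sanity checks before launching.
assert config["provider"]["type"] == "aws"
assert config["max_workers"] >= 0

# "ray up" creates or updates the cluster; "ray down" tears it back down.
subprocess.run(["ray", "up", "cluster.yaml", "--yes"], check=True)
subprocess.run(["ray", "down", "cluster.yaml", "--yes"], check=True)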

View file

@ -0,0 +1,124 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes, then the autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: True
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
# Availability zone(s), comma-separated, that nodes may be launched in.
# Nodes are currently spread between zones by a round-robin approach,
# however this implementation detail should not be relied upon.
availability_zone: us-west-2a,us-west-2b
# Whether to allow node reuse. If set to False, nodes will be terminated
# instead of stopped.
cache_stopped_nodes: True # If not present, the default is True.
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
baz: qux
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
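A minimal sketch of how a legacy-style config like the one above is normalized, assuming a Ray checkout where the internal helper prepare_config exists (it is the function the tests later in this commit switch to); the file name "legacy-cluster.yaml" is hypothetical.

# Sketch: normalize a legacy config. prepare_config() folds the legacy
# head_node / worker_nodes sections into available_node_types entries,
# as the updated tests below assert.
import yaml
from ray.autoscaler._private.util import prepare_config

with open("legacy-cluster.yaml") as f:  # hypothetical copy of the file above
    legacy_config = yaml.safe_load(f)

prepared = prepare_config(legacy_config)
print(list(prepared["available_node_types"]))  # legacy head/worker node types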

View file

@ -195,4 +195,4 @@ class KubernetesOperatorTest(unittest.TestCase):
 if __name__ == "__main__":
     kubernetes.config.load_kube_config()
-    sys.exit(pytest.main(["-v", __file__]))
+    sys.exit(pytest.main(["-sv", __file__]))

View file

@ -5,15 +5,16 @@ import yaml
 import tempfile
 import shutil
 import unittest
+from unittest import mock
 import copy
 import ray
 import ray.ray_constants
 from ray.autoscaler._private.util import \
-    rewrite_legacy_yaml_to_available_node_types, format_info_string, \
+    prepare_config, format_info_string, \
     format_info_string_no_node_types
-from ray.tests.test_autoscaler import SMALL_CLUSTER, MockProvider, \
-    MockProcessRunner
+from ray.tests.test_autoscaler import SMALL_CLUSTER, MOCK_DEFAULT_CONFIG, \
+    MockProvider, MockProcessRunner
 from ray.autoscaler._private.providers import (_NODE_PROVIDERS,
                                                _clear_provider_cache)
 from ray.autoscaler._private.autoscaler import StandardAutoscaler, \
@ -38,6 +39,8 @@ from ray.autoscaler._private.constants import \
 from time import sleep
 
+GET_DEFAULT_METHOD = "ray.autoscaler._private.util._get_default_config"
+
 TYPES_A = {
     "empty_node": {
         "node_config": {
@ -1042,131 +1045,135 @@ def test_get_nodes_to_launch_max_launch_concurrency():
 def test_rewrite_legacy_yaml_to_available_node_types():
-    cluster_config = copy.deepcopy(SMALL_CLUSTER)  # Legacy cluster_config.
-    cluster_config = rewrite_legacy_yaml_to_available_node_types(
-        cluster_config)
-    assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
-        "max_workers"] == 0
-    assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
-        "min_workers"] == 0
-    assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
-        "node_config"] == SMALL_CLUSTER["head_node"]
-    assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
-        "node_config"] == SMALL_CLUSTER["worker_nodes"]
-    assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
-        "max_workers"] == SMALL_CLUSTER["max_workers"]
-    assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
-        "min_workers"] == SMALL_CLUSTER["min_workers"]
+    with mock.patch(GET_DEFAULT_METHOD, return_value=MOCK_DEFAULT_CONFIG):
+        cluster_config = copy.deepcopy(SMALL_CLUSTER)  # Legacy cluster_config.
+        cluster_config = prepare_config(cluster_config)
+        assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
+            "max_workers"] == 0
+        assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
+            "min_workers"] == 0
+        assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
+            "node_config"] == SMALL_CLUSTER["head_node"]
+        assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
+            "node_config"] == SMALL_CLUSTER["worker_nodes"]
+        assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
+            "max_workers"] == SMALL_CLUSTER["max_workers"]
+        assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
+            "min_workers"] == SMALL_CLUSTER["min_workers"]
 
 def test_handle_legacy_cluster_config_yaml():
-    provider = MockProvider()
-    head_resources = {"CPU": 8, "GPU": 1}
-    worker_resources = {"CPU": 32, "GPU": 8}
-    cluster_config = copy.deepcopy(SMALL_CLUSTER)  # Legacy cluster_config.
-    cluster_config = rewrite_legacy_yaml_to_available_node_types(
-        cluster_config)
-    scheduler = ResourceDemandScheduler(
-        provider,
-        cluster_config["available_node_types"],
-        0,
-        head_node_type=NODE_TYPE_LEGACY_HEAD)
-    provider.create_node({}, {
-        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
-        TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_HEAD
-    }, 1)
-    head_ip = provider.non_terminated_node_ips({})[0]
-    head_node_id = provider.non_terminated_nodes({})[0]
-    to_launch = scheduler.get_nodes_to_launch([], {}, [], {}, [],
-                                              {head_ip: head_resources})
-    assert to_launch == {}  # Should always be empty with max_workers = 0.
-    scheduler.max_workers = 30
-    min_workers = scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"]
-    scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"] = 0
-    to_launch = scheduler.get_nodes_to_launch([head_node_id], {}, [], {}, [],
-                                              {head_ip: head_resources})
-    assert to_launch == {
-    }  # Since the resource demand does not require adding nodes.
-    to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
-                                              [head_resources], {}, [],
-                                              {head_ip: head_resources})
-    assert to_launch == {
-    }  # Since the resource demand does not require adding nodes.
-    scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"] = min_workers
-    # Returns min_workers when min_workers>0.
-    to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
-                                              [head_resources], {}, [],
-                                              {head_ip: head_resources})
-    assert to_launch == {NODE_TYPE_LEGACY_WORKER: min_workers}
-    provider.create_node({}, {
-        TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
-        TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED,
-        TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_WORKER
-    }, min_workers)
-    nodes = provider.non_terminated_nodes({})
-    to_launch = scheduler.get_nodes_to_launch(nodes, {}, [head_resources], {},
-                                              [], {head_ip: head_resources})
-    assert to_launch == {}  # A node is running, at some point it'll connect.
-    pending_launches = {NODE_TYPE_LEGACY_WORKER: 4}
-    to_launch = scheduler.get_nodes_to_launch([], pending_launches,
-                                              [head_resources], {}, [],
-                                              {head_ip: head_resources})
-    assert to_launch == {}  # A node is launching, at some point it'll connect.
-    # Now assume that we already launched/connected the nodes.
-    ips = provider.non_terminated_node_ips({})
-    lm = LoadMetrics()
-    worker_ips = []
-    for ip in ips:
-        if ip == head_ip:
-            lm.update(ip, head_resources, head_resources, {})
-        else:
-            lm.update(ip, worker_resources, worker_resources, {})
-            worker_ips.append(ip)
-    assert not scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["resources"]
-    to_launch = scheduler.get_nodes_to_launch(
-        nodes, {}, [], {}, [], lm.get_static_node_resources_by_ip())
-    assert scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
-        "resources"] == worker_resources
-    assert to_launch == {}
-    utilizations = {ip: worker_resources for ip in worker_ips}
-    utilizations[head_ip] = head_resources
-    # Requires 4 nodes since worker resources is bigger than head reasources.
-    demands = [worker_resources] * (len(utilizations) + 3)
-    to_launch = scheduler.get_nodes_to_launch(
-        nodes, {}, demands, utilizations, [],
-        lm.get_static_node_resources_by_ip())
-    # 4 nodes are necessary to meet resource demand, but we never exceed
-    # max_workers.
-    assert to_launch == {}
-    scheduler.max_workers = 10
-    to_launch = scheduler.get_nodes_to_launch(
-        nodes, {}, demands, utilizations, [],
-        lm.get_static_node_resources_by_ip())
-    # 4 nodes are necessary to meet resource demand, but we never exceed
-    # max_workers.
-    assert to_launch == {}
-    scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["max_workers"] = 10
-    to_launch = scheduler.get_nodes_to_launch(
-        nodes, {}, demands, utilizations, [],
-        lm.get_static_node_resources_by_ip())
-    # 4 nodes are necessary to meet resource demand.
-    assert to_launch == {NODE_TYPE_LEGACY_WORKER: 4}
-    to_launch = scheduler.get_nodes_to_launch(nodes, pending_launches, demands,
-                                              utilizations, [],
-                                              lm.get_node_resources())
-    # 0 because there are 4 pending launches and we only need 4.
-    assert to_launch == {}
-    to_launch = scheduler.get_nodes_to_launch(nodes, pending_launches,
-                                              demands * 2, utilizations, [],
-                                              lm.get_node_resources())
-    # 1 because there are 4 pending launches and we only allow a max of 5.
-    assert to_launch == {NODE_TYPE_LEGACY_WORKER: 1}
+    with mock.patch(GET_DEFAULT_METHOD, return_value=MOCK_DEFAULT_CONFIG):
+        provider = MockProvider()
+        head_resources = {"CPU": 8, "GPU": 1}
+        worker_resources = {"CPU": 32, "GPU": 8}
+        cluster_config = copy.deepcopy(SMALL_CLUSTER)  # Legacy cluster_config.
+        cluster_config = prepare_config(cluster_config)
+        scheduler = ResourceDemandScheduler(
+            provider,
+            cluster_config["available_node_types"],
+            0,
+            head_node_type=NODE_TYPE_LEGACY_HEAD)
+        provider.create_node({}, {
+            TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
+            TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_HEAD
+        }, 1)
+        head_ip = provider.non_terminated_node_ips({})[0]
+        head_node_id = provider.non_terminated_nodes({})[0]
+        to_launch = scheduler.get_nodes_to_launch([], {}, [], {}, [],
+                                                  {head_ip: head_resources})
+        assert to_launch == {}  # Should always be empty with max_workers = 0.
+        scheduler.max_workers = 30
+        min_workers = scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
+            "min_workers"]
+        scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"] = 0
+        to_launch = scheduler.get_nodes_to_launch(
+            [head_node_id], {}, [], {}, [], {head_ip: head_resources})
+        assert to_launch == {
+        }  # Since the resource demand does not require adding nodes.
+        to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
+                                                  [head_resources], {}, [],
+                                                  {head_ip: head_resources})
+        assert to_launch == {
+        }  # Since the resource demand does not require adding nodes.
+        scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
+            "min_workers"] = min_workers
+        # Returns min_workers when min_workers>0.
+        to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
+                                                  [head_resources], {}, [],
+                                                  {head_ip: head_resources})
+        assert to_launch == {NODE_TYPE_LEGACY_WORKER: min_workers}
+        provider.create_node({}, {
+            TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
+            TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED,
+            TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_WORKER
+        }, min_workers)
+        nodes = provider.non_terminated_nodes({})
+        to_launch = scheduler.get_nodes_to_launch(
+            nodes, {}, [head_resources], {}, [], {head_ip: head_resources})
+        # A node is running, at some point it'll connect.
+        assert to_launch == {}
+        pending_launches = {NODE_TYPE_LEGACY_WORKER: 4}
+        to_launch = scheduler.get_nodes_to_launch([], pending_launches,
+                                                  [head_resources], {}, [],
+                                                  {head_ip: head_resources})
+        # A node is launching, at some point it'll connect.
+        assert to_launch == {}
+        # Now assume that we already launched/connected the nodes.
+        ips = provider.non_terminated_node_ips({})
+        lm = LoadMetrics()
+        worker_ips = []
+        for ip in ips:
+            if ip == head_ip:
+                lm.update(ip, head_resources, head_resources, {})
+            else:
+                lm.update(ip, worker_resources, worker_resources, {})
+                worker_ips.append(ip)
+        assert not scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["resources"]
+        to_launch = scheduler.get_nodes_to_launch(
+            nodes, {}, [], {}, [], lm.get_static_node_resources_by_ip())
+        assert scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
+            "resources"] == worker_resources
+        assert to_launch == {}
+        utilizations = {ip: worker_resources for ip in worker_ips}
+        utilizations[head_ip] = head_resources
+        # Needs 4 nodes since worker resources is bigger than head reasources.
+        demands = [worker_resources] * (len(utilizations) + 3)
+        to_launch = scheduler.get_nodes_to_launch(
+            nodes, {}, demands, utilizations, [],
+            lm.get_static_node_resources_by_ip())
+        # 4 nodes are necessary to meet resource demand, but we never exceed
+        # max_workers.
+        assert to_launch == {}
+        scheduler.max_workers = 10
+        to_launch = scheduler.get_nodes_to_launch(
+            nodes, {}, demands, utilizations, [],
+            lm.get_static_node_resources_by_ip())
+        # 4 nodes are necessary to meet resource demand, but we never exceed
+        # max_workers.
+        assert to_launch == {}
+        scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["max_workers"] = 10
+        to_launch = scheduler.get_nodes_to_launch(
+            nodes, {}, demands, utilizations, [],
+            lm.get_static_node_resources_by_ip())
+        # 4 nodes are necessary to meet resource demand.
+        assert to_launch == {NODE_TYPE_LEGACY_WORKER: 4}
+        to_launch = scheduler.get_nodes_to_launch(nodes, pending_launches,
+                                                  demands, utilizations, [],
+                                                  lm.get_node_resources())
+        # 0 because there are 4 pending launches and we only need 4.
+        assert to_launch == {}
+        to_launch = scheduler.get_nodes_to_launch(
+            nodes, pending_launches, demands * 2, utilizations, [],
+            lm.get_node_resources())
+        # 1 because there are 4 pending launches and we only allow a max of 5.
+        assert to_launch == {NODE_TYPE_LEGACY_WORKER: 1}
 
 
 class LoadMetricsTest(unittest.TestCase):
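A minimal sketch of the patching pattern the updated tests rely on, assuming a Ray installation where the internal helper _get_default_config exists; the fake_defaults dictionary here is a hypothetical stand-in for the MOCK_DEFAULT_CONFIG fixture imported from ray.tests.test_autoscaler.

# Sketch: while the patch is active, any call to _get_default_config returns
# the mocked defaults, so legacy configs are normalized against a known fixture
# rather than whichever provider defaults.yaml ships with the wheel.
from unittest import mock

GET_DEFAULT_METHOD = "ray.autoscaler._private.util._get_default_config"
fake_defaults = {"cluster_name": "default", "max_workers": 2}  # hypothetical

with mock.patch(GET_DEFAULT_METHOD, return_value=fake_defaults):
    from ray.autoscaler._private.util import _get_default_config
    # The patched function ignores its arguments and returns the fixture.
    assert _get_default_config({"type": "aws"}) == fake_defaults

Patching the default-config lookup keeps these scheduler tests hermetic, which is why both legacy-config tests above are wrapped in the same context manager.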