[autoscaler][interface] Use multi node types in defaults.yaml and example-full.yaml (#14239)

* random doc typo

* example-full-multi

* left off max workers

* wip

* address comments, modify defaults, wip

* fix

* wip

* reformat more things

* undo useless diff

* space

* max workers

* space

* copy-paste mishaps

* space

* More copy-paste mishaps

* copy-paste issues, space, max_workers

* head_node_type

* legacy yamls

* line undeleted

* correct-gpu

* Remove redundant GPU example.

* Extraneous comment

* whitespace

* example-java.yaml

* Revert "example-java.yaml"

This reverts commit 1e9c0124b9d97e651aaeeb6ec5bf7a4ef2a2df17.

* tests and other things

* doc

* doc

* revert max worker default

* Kubernetes comment

* wip

* wip

* tweak

* Address comments

* test_resource_demand_scheduler fixes

* Head type min/max workers, aws resources

* fix example_cluster2.yaml

* Fix external node type test (compatibility with legacy-style external node types)

* fix test_autoscaler_aws

* gcp-images

* gcp node type names

* fix gcp defaults

* doc format

* typo

* Skip failed Windows tests

* doc string and comment

* assert

* remove contents of default external head and worker

* legacy external failed validation test

* Readability -- define the minimal external config at the top of the file.

* Remove default worker type min worker

* Remove extraneous global min_workers comment.

* per-node-type docker in aws/example-gpu-docker

* ray.worker.small -> ray.worker.default

* fix-docker

* fix gpu docker again

* undo kubernetes experiment

* fix doc

* remove worker max_worker from kubernetes

* remove max_worker from local worker node type

* fix doc again

* py38

* eric-comment

* fix cluster name

* fix-test-autoscaler

* legacy config logic

* pop resources

* Remove min_workers AFTER merge

* comment, warning message

* warning, comment

Commit 1675156a8b (parent ef873be9e8), authored by Dmitri Gekhtman on 2021-03-02 20:16:19 -08:00, committed by GitHub.
32 changed files with 1774 additions and 715 deletions
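
For orientation, here is a hedged sketch of the multi-node-type layout this change standardizes on, written as the Python dict the autoscaler works with after the YAML is loaded. The type names and fields mirror the AWS example-full.yaml in the diff below; the instance types and worker counts are illustrative only.

# Illustrative sketch -- mirrors aws/example-full.yaml from the diff below.
config = {
    "cluster_name": "default",
    "max_workers": 2,  # global cap on workers across all node types
    "available_node_types": {
        "ray.head.default": {
            "min_workers": 0,
            "max_workers": 0,  # the head node type launches no workers
            "resources": {},   # empty -> auto-detected from the instance type
            "node_config": {"InstanceType": "m5.large"},
        },
        "ray.worker.default": {
            "min_workers": 0,  # per-type max_workers defaults to the global value
            "resources": {},
            "node_config": {
                "InstanceType": "m5.large",
                "InstanceMarketOptions": {"MarketType": "spot"},
            },
        },
    },
    "head_node_type": "ray.head.default",  # which node type the head uses
}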


@ -341,14 +341,13 @@ The key is the name of the node type, which is just for debugging purposes.
resources: {"CPU": 2}
min_workers: 0
max_workers: 0
ray.worker.small:
ray.worker.default:
node_config:
InstanceType: m5.large
InstanceMarketOptions:
MarketType: spot
resources: {"CPU": 2}
min_workers: 0
max_workers: 1
.. _cluster-configuration-head-node-type:
@ -1073,12 +1072,12 @@ Minimal configuration
:language: yaml
.. group-tab:: Azure
.. literalinclude:: ../../../python/ray/autoscaler/azure/example-minimal.yaml
:language: yaml
.. group-tab:: GCP
.. literalinclude:: ../../../python/ray/autoscaler/gcp/example-minimal.yaml
:language: yaml
@ -1092,11 +1091,11 @@ Full configuration
:language: yaml
.. group-tab:: Azure
.. literalinclude:: ../../../python/ray/autoscaler/azure/example-full.yaml
:language: yaml
.. group-tab:: GCP
.. literalinclude:: ../../../python/ray/autoscaler/gcp/example-full.yaml
:language: yaml


@ -71,8 +71,14 @@ def fillout_resources_kubernetes(config):
return config
node_types = copy.deepcopy(config["available_node_types"])
for node_type in node_types:
container_data = node_types[node_type]["node_config"]["spec"][
"containers"][0]
node_config = node_types[node_type]["node_config"]
# The next line is for compatibility with configs like
# kubernetes/example-ingress.yaml,
# cf. KubernetesNodeProvider.create_node().
pod = node_config.get("pod", node_config)
container_data = pod["spec"]["containers"][0]
autodetected_resources = get_autodetected_resources(container_data)
if "resources" not in config["available_node_types"][node_type]:
config["available_node_types"][node_type]["resources"] = {}


@ -1,3 +1,4 @@
import copy
import importlib
import logging
import json
@ -11,6 +12,17 @@ logger = logging.getLogger(__name__)
# For caching provider instantiations across API calls of one python session
_provider_instances = {}
# Minimal config for compatibility with legacy-style external configs.
MINIMAL_EXTERNAL_CONFIG = {
"available_node_types": {
"ray.head.default": {},
"ray.worker.default": {},
},
"head_node_type": "ray.head.default",
"head_node": {},
"worker_nodes": {},
}
def _import_aws(provider_config):
from ray.autoscaler._private.aws.node_provider import AWSNodeProvider
@ -192,7 +204,7 @@ def _get_default_config(provider_config):
package outside the autoscaler.
"""
if provider_config["type"] == "external":
return {}
return copy.deepcopy(MINIMAL_EXTERNAL_CONFIG)
load_config = _DEFAULT_CONFIGS.get(provider_config["type"])
if load_config is None:
raise NotImplementedError("Unsupported node provider: {}".format(
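
A small illustration of why the deepcopy matters here (assumption-level, not taken from the Ray test suite): callers merge user config into the returned defaults in place, so returning the module-level MINIMAL_EXTERNAL_CONFIG itself would let one call's mutations leak into every later call.

import copy

MINIMAL_EXTERNAL_CONFIG = {
    "available_node_types": {"ray.head.default": {}, "ray.worker.default": {}},
    "head_node_type": "ray.head.default",
}

defaults = copy.deepcopy(MINIMAL_EXTERNAL_CONFIG)
# Caller mutates its copy...
defaults["available_node_types"]["ray.head.default"]["resources"] = {"CPU": 4}
# ...and the shared constant stays pristine for the next call.
assert "resources" not in MINIMAL_EXTERNAL_CONFIG["available_node_types"]["ray.head.default"]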


@ -1,4 +1,5 @@
import collections
import copy
from datetime import datetime
import logging
import hashlib
@ -103,38 +104,91 @@ def prepare_config(config):
return with_defaults
def rewrite_legacy_yaml_to_available_node_types(
config: Dict[str, Any]) -> Dict[str, Any]:
if "available_node_types" not in config:
# TODO(ameer/ekl/alex): we can also rewrite here many other fields
# that include initialization/setup/start commands and ImageId.
logger.debug("Converting legacy cluster config to multi node types.")
config["available_node_types"] = {
NODE_TYPE_LEGACY_HEAD: {
"node_config": config["head_node"],
"resources": config["head_node"].get("resources") or {},
"min_workers": 0,
"max_workers": 0,
},
NODE_TYPE_LEGACY_WORKER: {
"node_config": config["worker_nodes"],
"resources": config["worker_nodes"].get("resources") or {},
"min_workers": config.get("min_workers", 0),
"max_workers": config.get("max_workers", 0),
},
}
config["head_node_type"] = NODE_TYPE_LEGACY_HEAD
del config["min_workers"]
return config
def fillout_defaults(config: Dict[str, Any]) -> Dict[str, Any]:
defaults = _get_default_config(config["provider"])
defaults.update(config)
defaults["auth"] = defaults.get("auth", {})
defaults = rewrite_legacy_yaml_to_available_node_types(defaults)
return defaults
# Just for clarity:
merged_config = copy.deepcopy(defaults)
# Fill auth field to avoid key errors.
# This field is accessed when calling NodeUpdater but is not relevant to
# certain node providers and is thus left out of some cluster launching
# configs.
merged_config["auth"] = merged_config.get("auth", {})
# A legacy config is one which doesn't have available_node_types,
# but has at least one of head_node or worker_nodes.
is_legacy_config = (("available_node_types" not in config) and
("head_node" in config or "worker_nodes" in config))
# Do merging logic for legacy configs.
if is_legacy_config:
merged_config = merge_legacy_yaml_with_defaults(merged_config)
# Take care of this here, in case a config does not specify any of head,
# workers, node types, but does specify min workers:
merged_config.pop("min_workers", None)
return merged_config
def merge_legacy_yaml_with_defaults(
merged_config: Dict[str, Any]) -> Dict[str, Any]:
"""Rewrite legacy config's available node types after it has been merged
with defaults yaml.
"""
logger.warning("Converting legacy cluster config to multi node types.\n"
"Refer to the docs for examples of multi-node-type "
"autoscaling:\n"
"https://docs.ray.io/en/master/cluster/config.html"
"#full-configuration")
# Get default head and worker types.
default_head_type = merged_config["head_node_type"]
# Default configs are assumed to have two node types -- one for the head
# and one for the workers.
assert len(merged_config["available_node_types"].keys()) == 2
default_worker_type = (merged_config["available_node_types"].keys() -
{default_head_type}).pop()
if merged_config["head_node"]:
# User specified a head node in legacy config.
# Convert it into data for the head's node type.
head_node_info = {
"node_config": merged_config["head_node"],
"resources": merged_config["head_node"].get("resources") or {},
"min_workers": 0,
"max_workers": 0,
}
else:
# Use default data for the head's node type.
head_node_info = merged_config["available_node_types"][
default_head_type]
if merged_config["worker_nodes"]:
# User specified a worker node in legacy config.
# Convert it into data for the workers' node type.
worker_node_info = {
"node_config": merged_config["worker_nodes"],
"resources": merged_config["worker_nodes"].get("resources") or {},
"min_workers": merged_config.get("min_workers", 0),
"max_workers": merged_config["max_workers"],
}
else:
# Use default data for the workers' node type.
worker_node_info = merged_config["available_node_types"][
default_worker_type]
# Rewrite available_node_types.
merged_config["available_node_types"] = {
NODE_TYPE_LEGACY_HEAD: head_node_info,
NODE_TYPE_LEGACY_WORKER: worker_node_info
}
merged_config["head_node_type"] = NODE_TYPE_LEGACY_HEAD
# Resources field in head/worker fields cause node launch to fail.
merged_config["head_node"].pop("resources", None)
merged_config["worker_nodes"].pop("resources", None)
return merged_config
def merge_setup_commands(config):
@ -147,7 +201,6 @@ def merge_setup_commands(config):
def fill_node_type_max_workers(config):
"""Sets default per-node max workers to global max_workers.
This equivalent to setting the default per-node max workers to infinity,
with the only upper constraint coming from the global max_workers.
"""


@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -43,38 +39,63 @@ auth:
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
ray.head.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
ray.worker.default:
# The minimum number of nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -108,15 +129,8 @@ initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands:
# Note: if you're developing Ray, you probably want to create an AMI that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
- echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
# Consider uncommenting these if you also want to run apt-get commands during setup
# - sudo pkill -9 apt-get || true
# - sudo pkill -9 dpkg || true
# - sudo dpkg --configure -a
- echo 'export PATH="$HOME/anaconda3/envs/tensorflow2_latest_p37/bin:$PATH"' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
@ -134,3 +148,6 @@ head_start_ray_commands:
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
head_node: {}
worker_nodes: {}


@ -0,0 +1,148 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: True
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
# Availability zone(s), comma-separated, that nodes may be launched in.
# Nodes are currently spread between zones by a round-robin approach,
# however this implementation detail should not be relied upon.
availability_zone: us-west-2a,us-west-2b
# Whether to allow node reuse. If set to False, nodes will be terminated
# instead of stopped.
cache_stopped_nodes: True # If not present, the default is True.
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076


@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -57,38 +53,66 @@ auth:
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
ray.head.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
ray.worker.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 2
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -146,3 +170,6 @@ head_start_ray_commands:
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
head_node: {}
worker_nodes: {}


@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: gpu-docker
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -23,10 +19,6 @@ docker:
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_nvidia_docker" # e.g. ray_docker
# # Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# worker_image: "rayproject/ray-ml:latest"
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
@ -48,38 +40,74 @@ auth:
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
InstanceType: p2.xlarge
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
# GPU head node.
ray.head.gpu:
# worker_image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: p2.xlarge
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# CPU workers.
ray.worker.default:
# Override global docker setting.
# This node type will run a CPU image,
# rather than the GPU image specified in the global docker settings.
docker:
worker_image: "rayproject/ray-ml:latest-cpu"
# The minimum number of nodes of this type to launch.
# This number should be >= 0.
min_workers: 1
# The maximum number of workers nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 2
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.gpu
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.


@ -2,7 +2,7 @@
cluster_name: minimal
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers. min_workers default to 0.
# node. min_workers default to 0.
max_workers: 1
# Cloud-provider specific configuration.


@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -17,7 +13,7 @@ upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
# Empty object means disabled.
docker: {}
# If a node is idle for this many minutes, it will be removed.
@ -46,30 +42,52 @@ auth:
# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
# Provider-specific config for the head node, e.g. instance type.
head_node:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
ray.head.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config, e.g. instance type.
node_config:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# Provider-specific config for worker nodes, e.g. instance type.
worker_nodes:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
ray.worker.default:
# The minimum number of nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config, e.g. instance type.
node_config:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -134,3 +152,6 @@ head_start_ray_commands:
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
head_node: {}
worker_nodes: {}


@ -19,18 +19,20 @@ upscaling_speed: 1.0
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu"
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_docker"
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: False
pull_before_run: True
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest"
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
@ -42,7 +44,7 @@ provider:
location: westus2
resource_group: ray-cluster
# set subscription id otherwise the default from az cli will be used
# subscription_id: 00000000-0000-0000-0000-000000000000
# subscription_id: 00000000-0000-0000-0000-000000000000
# How Ray will authenticate with newly launched nodes.
auth:
@ -53,27 +55,35 @@ auth:
# changes to this should match what is specified in file_mounts
ssh_public_key: ~/.ssh/id_rsa.pub
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields using defaults.yaml
# More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
# See documentation here: https://docs.microsoft.com/en-us/azure/templates/microsoft.compute/2019-03-01/virtualmachines
# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
# Provider-specific config for the head node, e.g. instance type.
head_node:
azure_arm_parameters:
vmSize: Standard_NC6
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: "1804"
imageSku: 1804-gen2
imageVersion: 20.07.06
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields using defaults.yaml
# Provider-specific config for worker nodes, e.g. instance type.
worker_nodes:
azure_arm_parameters:
vmSize: Standard_NC6
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: "1804"
imageSku: 1804-gen2
imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -83,6 +93,27 @@ file_mounts: {
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
@ -92,20 +123,16 @@ initialization_commands:
# List of shell commands to run to set up nodes.
setup_commands:
# Note: if you're developing Ray, you probably want to create an AMI that
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Consider uncommenting these if you also want to run apt-get commands during setup
# - sudo pkill -9 apt-get || true
# - sudo pkill -9 dpkg || true
# - sudo dpkg --configure -a
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
head_setup_commands:
- pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0
# Custom commands that will be run on worker nodes after common setup.


@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -17,7 +13,7 @@ upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
# Empty object means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
@ -60,30 +56,55 @@ auth:
# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
# Provider-specific config for the head node, e.g. instance type.
head_node:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
ray.head.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config, e.g. instance type.
node_config:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# Provider-specific config for worker nodes, e.g. instance type.
worker_nodes:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
ray.worker.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 2
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config, e.g. instance type.
node_config:
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -147,3 +168,6 @@ head_start_ray_commands:
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
head_node: {}
worker_nodes: {}


@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: gpu-docker
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -21,7 +17,7 @@ upscaling_speed: 1.0
docker:
image: "rayproject/ray-ml:latest-gpu"
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_nvidia_docker" # e.g. ray_docker
container_name: "ray_nvidia_docker"
# # Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
@ -45,17 +41,40 @@ auth:
# changes to this should match what is specified in file_mounts
ssh_public_key: ~/.ssh/id_rsa.pub
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields using defaults.yaml
head_node:
azure_arm_parameters:
vmSize: Standard_NC6s_v3
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
ray.head.gpu:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The resources provided by this node type.
resources: {"CPU": 6, "GPU": 1}
# Provider-specific config, e.g. instance type.
node_config:
azure_arm_parameters:
vmSize: Standard_NC6_v3
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields using defaults.yaml
worker_nodes:
azure_arm_parameters:
vmSize: Standard_NC6s_v3
ray.worker.gpu:
# The minimum number of nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 2
# The resources provided by this node type.
resources: {"CPU": 6, "GPU": 1}
# Provider-specific config, e.g. instance type.
node_config:
azure_arm_parameters:
vmSize: Standard_NC6_v3
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.gpu
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -69,7 +88,7 @@ file_mounts: {
# NOTE: rayproject/ray-ml:latest has ray latest bundled
setup_commands: []
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
- pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0


@ -2,7 +2,7 @@
cluster_name: minimal
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers. min_workers default to 0.
# node. min_workers default to 0.
max_workers: 1
# Cloud-provider specific configuration.


@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -39,50 +35,75 @@ auth:
# project wide meta-data.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
head_node:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
ray_head_default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
node_config:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Additional options can be found in in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# If the network interface is specified as below in both head and worker
# nodes, the manual network config is used. Otherwise an existing subnet is
# used. To use a shared subnet, ask the subnet owner to grant permission
# for 'compute.subnetworks.use' to the ray autoscaler account...
# networkInterfaces:
# - kind: compute#networkInterface
# subnetwork: path/to/subnet
# aliasIpRanges: []
ray_worker_small:
# The minimum number of nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
node_config:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Run workers on preemtible instance by default.
# Comment this out to use on-demand.
scheduling:
- preemptible: true
# Additional options can be found in in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# If the network interface is specified as below in both head and worker
# nodes, the manual network config is used. Otherwise an existing subnet is
# used. To use a shared subnet, ask the subnet owner to grant permission
# for 'compute.subnetworks.use' to the ray autoscaler account...
# networkInterfaces:
# - kind: compute#networkInterface
# subnetwork: path/to/subnet
# aliasIpRanges: []
worker_nodes:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Run workers on preemtible instance by default.
# Comment this out to use on-demand.
scheduling:
- preemptible: true
# Additional options can be found in in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# Specify the node type of the head node (as configured above).
head_node_type: ray_head_default
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -159,3 +180,6 @@ worker_start_ray_commands:
ray start
--address=$RAY_HEAD_IP:6379
--object-manager-port=8076
head_node: {}
worker_nodes: {}


@ -0,0 +1,167 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: True
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: gcp
region: us-west1
availability_zone: us-west1-a
project_id: null # Globally unique project id
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below. This requires that you have added the key into the
# project wide meta-data.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
head_node:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Additional options can be found in in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# If the network interface is specified as below in both head and worker
# nodes, the manual network config is used. Otherwise an existing subnet is
# used. To use a shared subnet, ask the subnet owner to grant permission
# for 'compute.subnetworks.use' to the ray autoscaler account...
# networkInterfaces:
# - kind: compute#networkInterface
# subnetwork: path/to/subnet
# aliasIpRanges: []
worker_nodes:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Run workers on preemptible instances by default.
# Comment this out to use on-demand.
scheduling:
- preemptible: true
# Additional options can be found in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
- pip install google-api-python-client==1.7.8
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- >-
ulimit -n 65536;
ray start
--head
--port=6379
--object-manager-port=8076
--autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- >-
ulimit -n 65536;
ray start
--address=$RAY_HEAD_IP:6379
--object-manager-port=8076

View file

@ -1,12 +1,8 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -53,50 +49,78 @@ auth:
# project wide meta-data.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
head_node:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
ray_head_default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
node_config:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Additional options can be found in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# If the network interface is specified as below in both head and worker
# nodes, the manual network config is used. Otherwise an existing subnet is
# used. To use a shared subnet, ask the subnet owner to grant permission
# for 'compute.subnetworks.use' to the ray autoscaler account...
# networkInterfaces:
# - kind: compute#networkInterface
# subnetwork: path/to/subnet
# aliasIpRanges: []
ray_worker_small:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 2
# The resources provided by this node type.
resources: {"CPU": 2}
# Provider-specific config for worker nodes of this type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
node_config:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Run workers on preemptible instances by default.
# Comment this out to use on-demand.
scheduling:
- preemptible: true
# Additional options can be found in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# If the network interface is specified as below in both head and worker
# nodes, the manual network config is used. Otherwise an existing subnet is
# used. To use a shared subnet, ask the subnet owner to grant permission
# for 'compute.subnetworks.use' to the ray autoscaler account...
# networkInterfaces:
# - kind: compute#networkInterface
# subnetwork: path/to/subnet
# aliasIpRanges: []
worker_nodes:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
# Run workers on preemptible instances by default.
# Comment this out to use on-demand.
scheduling:
- preemptible: true
# Additional options can be found in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# Specify the node type of the head node (as configured above).
head_node_type: ray_head_default
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -166,3 +190,6 @@ worker_start_ray_commands:
ray start
--address=$RAY_HEAD_IP:6379
--object-manager-port=8076
head_node: {}
worker_nodes: {}

View file

@ -1,12 +1,8 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: gpu-docker
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -48,58 +44,81 @@ auth:
# project wide meta-data.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
head_node:
machineType: custom-6-16384
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
guestAccelerators:
- acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
acceleratorCount: 1
metadata:
items:
- key: install-nvidia-driver
value: "True"
scheduling:
- onHostMaintenance: TERMINATE
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
ray_head_gpu:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The resources provided by this node type.
resources: {"CPU": 6, "GPU": 1}
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
node_config:
machineType: custom-6-16384
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
guestAccelerators:
- acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
acceleratorCount: 1
metadata:
items:
- key: install-nvidia-driver
value: "True"
scheduling:
- onHostMaintenance: TERMINATE
# Additional options can be found in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
ray_worker_gpu:
# The minimum number of nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 2
# The resources provided by this node type.
resources: {"CPU": 2, "GPU": 1}
# Provider-specific config for worker nodes of this type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
node_config:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
guestAccelerators:
- acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
acceleratorCount: 1
metadata:
items:
- key: install-nvidia-driver
value: "True"
# Run workers on preemptible instances by default.
# Comment this out to use on-demand.
scheduling:
- preemptible: true
- onHostMaintenance: TERMINATE
worker_nodes:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
guestAccelerators:
- acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
acceleratorCount: 1
metadata:
items:
- key: install-nvidia-driver
value: "True"
# Run workers on preemptible instances by default.
# Comment this out to use on-demand.
scheduling:
- preemptible: true
- onHostMaintenance: TERMINATE
# Additional options can be found in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# Specify the node type of the head node (as configured above).
head_node_type: ray_head_gpu
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.

View file

@ -2,7 +2,7 @@
cluster_name: minimal
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers. min_workers defaults to 0.
# node. min_workers defaults to 0.
max_workers: 1
# Cloud-provider specific configuration.

View file

@ -96,8 +96,6 @@ available_node_types:
worker_node:
# Minimum number of Ray workers of this Pod type.
min_workers: 0
# Maximum number of Ray workers of this Pod type. Takes precedence over min_workers.
max_workers: 2
node_config:
apiVersion: v1
kind: Pod
@ -136,6 +134,12 @@ available_node_types:
# cause problems for other pods.
memory: 512Mi
head_node:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
node_config:
apiVersion: v1
kind: Pod

View file

@ -139,6 +139,12 @@ available_node_types:
# cause problems for other pods.
memory: 512Mi
head_node:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
node_config:
apiVersion: v1
kind: Pod

View file

@ -17,6 +17,10 @@ spec:
# Specify the allowed pod types for this ray cluster and the resources they provide.
podTypes:
- name: head-node
# Minimum number of Ray workers of this Pod type.
minWorkers: 0
# Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
maxWorkers: 0
podConfig:
apiVersion: v1
kind: Pod

View file

@ -17,6 +17,10 @@ spec:
# Specify the allowed pod types for this ray cluster and the resources they provide.
podTypes:
- name: head-node
# Minimum number of Ray workers of this Pod type.
minWorkers: 0
# Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
maxWorkers: 0
podConfig:
apiVersion: v1
kind: Pod

View file

@ -1,16 +1,8 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default
## NOTE: Typically for local clusters, min_workers == max_workers == len(worker_ips).
## NOTE: Typically for local clusters, max_workers == len(worker_ips).
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
# Typically, min_workers == max_workers == len(worker_ips).
min_workers: 0
# The maximum number of worker nodes to launch in addition to the head node.
# This takes precedence over min_workers.
# Typically, min_workers == max_workers == len(worker_ips).
max_workers: 0
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@ -42,11 +34,20 @@ auth:
# Optional if an ssh private key is necessary to ssh to the cluster.
# ssh_private_key: ~/.ssh/id_rsa
# Leave this empty.
head_node: {}
# Leave this empty.
worker_nodes: {}
available_node_types:
ray.head.default:
resources: {}
min_workers: 0
max_workers: 0
# Leave this empty
node_config: {}
ray.worker.default:
resources: {}
## NOTE: Typically for local clusters, max_workers == len(worker_ips).
min_workers: 0
# Leave this empty
node_config: {}
head_node_type: ray.head.default
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -97,3 +98,6 @@ head_start_ray_commands:
worker_start_ray_commands:
- ray stop
- ray start --address=$RAY_HEAD_IP:6379
head_node: {}
worker_nodes: {}

View file

@ -2,10 +2,6 @@
# A namespace will be automatically created for each cluster_name in SKE.
cluster_name: default
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
@ -85,174 +81,184 @@ provider:
# Exposing external IP addresses for ray pods isn't currently supported.
use_internal_ips: true
# Kubernetes pod config for the head node pod.
head_node:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-head-
head_node_type: ray.head.default
# Must match the head node service selector above if a head node
# service is required.
labels:
component: ray-head
available_node_types:
ray.head.default:
resources: {"CPU": 1}
min_workers: 0
max_workers: 0
# Kubernetes pod config for the head node pod.
node_config:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-head-
# https://docs.staroid.com/ske/pod.html#pod
pod.staroid.com/spot: "false" # use on-demand instance for head.
# Must match the head node service selector above if a head node
# service is required.
labels:
component: ray-head
# Uncomment to locate the ray head pod on a dedicated Kubernetes node
# (GPU instance is only available for 'dedicated' isolation)
#pod.staroid.com/isolation: dedicated
#pod.staroid.com/instance-type: gpu-1
spec:
automountServiceAccountToken: true
# https://docs.staroid.com/ske/pod.html#pod
pod.staroid.com/spot: "false" # use on-demand instance for head.
# Restarting the head node automatically is not currently supported.
# If the head node goes down, `ray up` must be run again.
restartPolicy: Never
# Uncomment to locate the ray head pod on a dedicated Kubernetes node
# (GPU instance is only available for 'dedicated' isolation)
#pod.staroid.com/isolation: dedicated
#pod.staroid.com/instance-type: gpu-1
spec:
automountServiceAccountToken: true
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
# nfs volume provides a shared volume across all ray-nodes.
- name: nfs-volume
persistentVolumeClaim:
claimName: nfs
# Restarting the head node automatically is not currently supported.
# If the head node goes down, `ray up` must be run again.
restartPolicy: Never
containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
# - screen (used for `ray attach`)
# - kubectl (used by the autoscaler to manage worker pods)
# Image will be overridden when 'image_from_project' is true.
image: rayproject/ray
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 6379 # Redis port.
- containerPort: 6380 # Redis port.
- containerPort: 6381 # Redis port.
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
# nfs volume provides a shared volume across all ray-nodes.
- name: nfs-volume
persistentVolumeClaim:
claimName: nfs
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /nfs
name: nfs-volume
resources:
requests:
cpu: 1000m
memory: 2Gi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu
- name: RAY_ADDRESS
value: "auto"
containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
# - screen (used for `ray attach`)
# - kubectl (used by the autoscaler to manage worker pods)
# Image will be overridden when 'image_from_project' is true.
image: rayproject/ray
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 6379 # Redis port.
- containerPort: 6380 # Redis port.
- containerPort: 6381 # Redis port.
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# Kubernetes pod config for worker node pods.
worker_nodes:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-worker-
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /nfs
name: nfs-volume
resources:
requests:
cpu: 1000m
memory: 2Gi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu
- name: RAY_ADDRESS
value: "auto"
# Must match the worker node service selector above if a worker node
# service is required.
labels:
component: ray-worker
ray.worker.default:
min_workers: 0
resources: {"CPU": 1}
# Kubernetes pod config for worker node pods.
node_config:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-worker-
# https://docs.staroid.com/ske/pod.html#pod
pod.staroid.com/spot: "true" # use spot instance for workers.
# Must match the worker node service selector above if a worker node
# service is required.
labels:
component: ray-worker
# Uncomment to locate ray worker pods on a dedicated Kubernetes node
# (GPU instance is only available for 'dedicated' isolation)
#pod.staroid.com/isolation: dedicated
#pod.staroid.com/instance-type: gpu-1
spec:
serviceAccountName: default
# https://docs.staroid.com/ske/pod.html#pod
pod.staroid.com/spot: "true" # use spot instance for workers.
# Worker nodes will be managed automatically by the head node, so
# do not change the restart policy.
restartPolicy: Never
# Uncomment to locate ray worker pods on a dedicated Kubernetes node
# (GPU instance is only available for 'dedicated' isolation)
#pod.staroid.com/isolation: dedicated
#pod.staroid.com/instance-type: gpu-1
spec:
serviceAccountName: default
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
- name: nfs-volume
persistentVolumeClaim:
claimName: nfs
containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
image: rayproject/autoscaler
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# Worker nodes will be managed automatically by the head node, so
# do not change the restart policy.
restartPolicy: Never
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /nfs
name: nfs-volume
resources:
requests:
cpu: 1000m
memory: 2Gi
limits:
# This memory limit will be detected by ray and split into
# 30% for plasma, and 70% for workers.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
- name: nfs-volume
persistentVolumeClaim:
claimName: nfs
containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
image: rayproject/autoscaler
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /nfs
name: nfs-volume
resources:
requests:
cpu: 1000m
memory: 2Gi
limits:
# This memory limit will be detected by ray and split into
# 30% for plasma, and 70% for workers.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@ -307,3 +313,6 @@ head_start_ray_commands:
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
head_node: {}
worker_nodes: {}

View file

@ -1,8 +1,8 @@
import pytest
from ray.autoscaler._private.aws.config import _get_vpc_id_or_die, \
bootstrap_aws, \
DEFAULT_AMI
bootstrap_aws, \
DEFAULT_AMI
import ray.tests.aws.utils.stubs as stubs
import ray.tests.aws.utils.helpers as helpers
from ray.tests.aws.utils.constants import AUX_SUBNET, DEFAULT_SUBNET, \
@ -143,8 +143,10 @@ def test_fills_out_amis(iam_client_stub, ec2_client_stub):
stubs.configure_subnet_default(ec2_client_stub)
config = helpers.load_aws_example_config_file("example-full.yaml")
del config["head_node"]["ImageId"]
del config["worker_nodes"]["ImageId"]
del config["available_node_types"]["ray.head.default"]["node_config"][
"ImageId"]
del config["available_node_types"]["ray.worker.default"]["node_config"][
"ImageId"]
# Pass in SG for stub to work
config["head_node"]["SecurityGroupIds"] = ["sg-1234abcd"]

View file

@ -1,4 +1,5 @@
import json
import jsonschema
import os
import shutil
from subprocess import CalledProcessError
@ -264,6 +265,55 @@ SMALL_CLUSTER = {
"worker_start_ray_commands": ["start_ray_worker"],
}
MOCK_DEFAULT_CONFIG = {
"cluster_name": "default",
"max_workers": 2,
"upscaling_speed": 1.0,
"idle_timeout_minutes": 5,
"provider": {
"type": "mock",
"region": "us-east-1",
"availability_zone": "us-east-1a",
},
"docker": {
"image": "example",
"container_name": "mock",
},
"auth": {
"ssh_user": "ubuntu",
"ssh_private_key": os.devnull,
},
"available_node_types": {
"ray.head.default": {
"min_workers": 0,
"max_workers": 0,
"resources": {},
"node_config": {
"head_default_prop": 4
}
},
"ray.worker.default": {
"min_workers": 0,
"max_workers": 2,
"resources": {},
"node_config": {
"worker_default_prop": 7
}
}
},
"head_node_type": "ray.head.default",
"head_node": {},
"worker_nodes": {},
"file_mounts": {},
"cluster_synced_files": [],
"initialization_commands": [],
"setup_commands": [],
"head_setup_commands": [],
"worker_setup_commands": [],
"head_start_ray_commands": [],
"worker_start_ray_commands": [],
}
class LoadMetricsTest(unittest.TestCase):
def testHeartbeat(self):
@ -1645,6 +1695,28 @@ class AutoscalingTest(unittest.TestCase):
config_path, LoadMetrics(), max_failures=0, update_interval_s=0)
assert isinstance(autoscaler.provider, NodeProvider)
def testLegacyExternalNodeScalerMissingFields(self):
"""Should fail to validate legacy external config with missing
head_node, worker_nodes, or both."""
external_config = copy.deepcopy(SMALL_CLUSTER)
external_config["provider"] = {
"type": "external",
"module": "ray.autoscaler.node_provider.NodeProvider",
}
missing_workers, missing_head, missing_both = [
copy.deepcopy(external_config) for _ in range(3)
]
del missing_workers["worker_nodes"]
del missing_head["head_node"]
del missing_both["worker_nodes"]
del missing_both["head_node"]
for faulty_config in missing_workers, missing_head, missing_both:
faulty_config = prepare_config(faulty_config)
with pytest.raises(jsonschema.ValidationError):
validate_config(faulty_config)
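# --- Illustrative aside (a rough sketch, not Ray's actual validator) ---
# The rule the test above relies on, stated as a standalone check: a
# legacy-style external-provider config that carries no available_node_types
# must supply both head_node and worker_nodes, or validation rejects it.
def legacy_external_config_ok(config):
    is_external = config.get("provider", {}).get("type") == "external"
    has_node_types = "available_node_types" in config
    if is_external and not has_node_types:
        return "head_node" in config and "worker_nodes" in config
    return True

assert not legacy_external_config_ok(
    {"provider": {"type": "external"}, "head_node": {}})
assert legacy_external_config_ok(
    {"provider": {"type": "external"}, "head_node": {}, "worker_nodes": {}})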
def testExternalNodeScalerWrongImport(self):
config = SMALL_CLUSTER.copy()
config["provider"] = {

View file

@ -1,4 +1,5 @@
import jsonschema
import logging
import os
import sys
import tempfile
@ -9,10 +10,12 @@ import copy
from unittest.mock import MagicMock, Mock, patch
import pytest
from ray.autoscaler._private.util import prepare_config, validate_config
from ray.autoscaler._private.util import prepare_config, validate_config,\
_get_default_config, merge_setup_commands
from ray.autoscaler._private.providers import _NODE_PROVIDERS
from ray.autoscaler._private.kubernetes.node_provider import\
KubernetesNodeProvider
from ray.autoscaler.tags import NODE_TYPE_LEGACY_HEAD, NODE_TYPE_LEGACY_WORKER
from ray.test_utils import load_test_config, recursive_fnmatch
@ -37,18 +40,19 @@ CONFIG_PATHS = ignore_k8s_operator_configs(CONFIG_PATHS)
class AutoscalingConfigTest(unittest.TestCase):
def testValidateDefaultConfig(self):
for config_path in CONFIG_PATHS:
if "aws/example-multi-node-type.yaml" in config_path:
# aws is tested in testValidateDefaultConfigAWSMultiNodeTypes.
continue
with open(config_path) as f:
config = yaml.safe_load(f)
config = prepare_config(config)
if config["provider"]["type"] == "kubernetes":
KubernetesNodeProvider.fillout_available_node_types_resources(
config)
try:
if "aws/example-multi-node-type.yaml" in config_path:
# aws tested in testValidateDefaultConfigAWSMultiNodeTypes.
continue
with open(config_path) as f:
config = yaml.safe_load(f)
config = prepare_config(config)
if config["provider"]["type"] == "kubernetes":
KubernetesNodeProvider.\
fillout_available_node_types_resources(config)
validate_config(config)
except Exception:
logging.exception("")
self.fail(
f"Config {config_path} did not pass validation test!")
@ -232,7 +236,6 @@ class AutoscalingConfigTest(unittest.TestCase):
self.fail("Failed to validate config with security group name!")
def testMaxWorkerDefault(self):
# Load config, call prepare config, check that default max_workers
# is filled correctly for node types that don't specify it.
# Check that max_workers is untouched for node types
@ -254,7 +257,7 @@ class AutoscalingConfigTest(unittest.TestCase):
# Max workers auto-filled with specified cluster-wide value of 5.
assert config["max_workers"] ==\
prepared_node_types["worker_node_max_unspecified"]["max_workers"]\
== config["max_workers"] == 5
== 5
# Repeat with a config that doesn't specify global max workers.
# Default value of 2 should be pulled in for global max workers.
@ -275,8 +278,87 @@ class AutoscalingConfigTest(unittest.TestCase):
prepared_node_types["worker_node_max_specified"][
"max_workers"] == 3
# Max workers auto-filled with default cluster-wide value of 2.
assert prepared_node_types["worker_node_max_unspecified"][
"max_workers"] == 2
assert prepared_config["max_workers"] ==\
prepared_node_types["worker_node_max_unspecified"]["max_workers"]\
== 2
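# --- Illustrative aside (not Ray's implementation) ---
# A minimal standalone sketch of the fallback behavior asserted above: a node
# type that specifies max_workers keeps its value, while one that omits it
# inherits the cluster-wide max_workers (assumed to default to 2).
def fill_node_type_max_workers(config):
    global_max = config.setdefault("max_workers", 2)
    for node_type in config.get("available_node_types", {}).values():
        node_type.setdefault("max_workers", global_max)
    return config

example = {
    "max_workers": 5,
    "available_node_types": {
        "worker_node_max_specified": {"max_workers": 3},
        "worker_node_max_unspecified": {},
    },
}
filled = fill_node_type_max_workers(example)
assert filled["available_node_types"]["worker_node_max_specified"]["max_workers"] == 3
assert filled["available_node_types"]["worker_node_max_unspecified"]["max_workers"] == 5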
def testFillEdgeLegacyConfigs(self):
# Test edge cases: legacy configs which specify workers but not head
# or vice-versa.
no_head = load_test_config("test_no_head.yaml")
aws_defaults = _get_default_config(no_head["provider"])
head_prepared = prepare_config(no_head)
assert head_prepared["available_node_types"][
"ray-legacy-head-node-type"]["node_config"] ==\
aws_defaults["available_node_types"][
"ray.head.default"]["node_config"]
assert head_prepared["head_node"] == {}
# Custom worker config preserved
node_types = head_prepared["available_node_types"]
worker_type = node_types["ray-legacy-worker-node-type"]
assert worker_type["node_config"] == head_prepared["worker_nodes"] == {
"foo": "bar"
}
no_workers = load_test_config("test_no_workers.yaml")
workers_prepared = prepare_config(no_workers)
assert workers_prepared["available_node_types"][
"ray-legacy-worker-node-type"]["node_config"] ==\
aws_defaults["available_node_types"][
"ray.worker.default"]["node_config"]
assert workers_prepared["worker_nodes"] == {}
# Custom head config preserved
node_types = workers_prepared["available_node_types"]
head_type = node_types["ray-legacy-head-node-type"]
assert head_type["node_config"] == workers_prepared["head_node"] == {
"baz": "qux"
}
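# --- Illustrative aside (not Ray's implementation) ---
# Rough sketch of the edge-case merge checked above: a legacy config that omits
# head_node borrows the provider-default head node_config while its custom
# worker_nodes survives, and symmetrically for a config that omits worker_nodes.
# The default dicts below are hypothetical placeholders.
def fill_legacy_edge_cases(config, default_head, default_workers):
    head = config.get("head_node") or default_head
    workers = config.get("worker_nodes") or default_workers
    return {"head": head, "workers": workers}

default_head = {"InstanceType": "m5.large"}     # hypothetical provider default
default_workers = {"InstanceType": "m5.large"}  # hypothetical provider default
no_head = {"worker_nodes": {"foo": "bar"}}      # mirrors test_no_head.yaml
merged = fill_legacy_edge_cases(no_head, default_head, default_workers)
assert merged["head"] == default_head
assert merged["workers"] == {"foo": "bar"}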
@pytest.mark.skipif(
sys.platform.startswith("win"), reason="Fails on Windows.")
def testExampleFull(self):
"""
Test that example-full yamls are unmodified by prepared_config,
except possibly by having setup_commands merged.
"""
providers = ["aws", "gcp", "azure"]
for provider in providers:
path = os.path.join(RAY_PATH, "autoscaler", provider,
"example-full.yaml")
config = yaml.safe_load(open(path).read())
config_copy = copy.deepcopy(config)
merge_setup_commands(config_copy)
assert config_copy == prepare_config(config)
@pytest.mark.skipif(
sys.platform.startswith("win"), reason="Fails on Windows.")
def testLegacyYaml(self):
# Test correct default-merging behavior for legacy yamls.
providers = ["aws", "gcp", "azure"]
for provider in providers:
path = os.path.join(RAY_PATH, "autoscaler", provider,
"example-full-legacy.yaml")
legacy_config = yaml.safe_load(open(path).read())
# custom head and workers
legacy_config["head_node"] = {"blahblah": 0}
legacy_config["worker_nodes"] = {"halbhalhb": 0}
legacy_config_copy = copy.deepcopy(legacy_config)
prepared_legacy = prepare_config(legacy_config_copy)
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_HEAD]["max_workers"] == 0
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_HEAD]["min_workers"] == 0
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_HEAD]["node_config"] == legacy_config[
"head_node"]
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_WORKER]["max_workers"] == 2
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_WORKER]["min_workers"] == 0
assert prepared_legacy["available_node_types"][
NODE_TYPE_LEGACY_WORKER]["node_config"] == legacy_config[
"worker_nodes"]
if __name__ == "__main__":

View file

@ -0,0 +1,123 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes, then the autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: True
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
# Availability zone(s), comma-separated, that nodes may be launched in.
# Nodes are currently spread between zones by a round-robin approach,
# however this implementation detail should not be relied upon.
availability_zone: us-west-2a,us-west-2b
# Whether to allow node reuse. If set to False, nodes will be terminated
# instead of stopped.
cache_stopped_nodes: True # If not present, the default is True.
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
foo: bar
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076

View file

@ -0,0 +1,124 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes, then the autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: True
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
# Availability zone(s), comma-separated, that nodes may be launched in.
# Nodes are currently spread between zones by a round-robin approach,
# however this implementation detail should not be relied upon.
availability_zone: us-west-2a,us-west-2b
# Whether to allow node reuse. If set to False, nodes will be terminated
# instead of stopped.
cache_stopped_nodes: True # If not present, the default is True.
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
baz: qux
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
- "**/.git"
- "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
- ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076

View file

@ -195,4 +195,4 @@ class KubernetesOperatorTest(unittest.TestCase):
if __name__ == "__main__":
kubernetes.config.load_kube_config()
sys.exit(pytest.main(["-v", __file__]))
sys.exit(pytest.main(["-sv", __file__]))

View file

@ -5,15 +5,16 @@ import yaml
import tempfile
import shutil
import unittest
from unittest import mock
import copy
import ray
import ray.ray_constants
from ray.autoscaler._private.util import \
rewrite_legacy_yaml_to_available_node_types, format_info_string, \
prepare_config, format_info_string, \
format_info_string_no_node_types
from ray.tests.test_autoscaler import SMALL_CLUSTER, MockProvider, \
MockProcessRunner
from ray.tests.test_autoscaler import SMALL_CLUSTER, MOCK_DEFAULT_CONFIG, \
MockProvider, MockProcessRunner
from ray.autoscaler._private.providers import (_NODE_PROVIDERS,
_clear_provider_cache)
from ray.autoscaler._private.autoscaler import StandardAutoscaler, \
@ -38,6 +39,8 @@ from ray.autoscaler._private.constants import \
from time import sleep
GET_DEFAULT_METHOD = "ray.autoscaler._private.util._get_default_config"
TYPES_A = {
"empty_node": {
"node_config": {
@ -1042,131 +1045,135 @@ def test_get_nodes_to_launch_max_launch_concurrency():
def test_rewrite_legacy_yaml_to_available_node_types():
cluster_config = copy.deepcopy(SMALL_CLUSTER) # Legacy cluster_config.
cluster_config = rewrite_legacy_yaml_to_available_node_types(
cluster_config)
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
"max_workers"] == 0
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
"min_workers"] == 0
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
"node_config"] == SMALL_CLUSTER["head_node"]
with mock.patch(GET_DEFAULT_METHOD, return_value=MOCK_DEFAULT_CONFIG):
cluster_config = copy.deepcopy(SMALL_CLUSTER) # Legacy cluster_config.
cluster_config = prepare_config(cluster_config)
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
"max_workers"] == 0
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
"min_workers"] == 0
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_HEAD][
"node_config"] == SMALL_CLUSTER["head_node"]
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
"node_config"] == SMALL_CLUSTER["worker_nodes"]
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
"max_workers"] == SMALL_CLUSTER["max_workers"]
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
"min_workers"] == SMALL_CLUSTER["min_workers"]
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
"node_config"] == SMALL_CLUSTER["worker_nodes"]
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
"max_workers"] == SMALL_CLUSTER["max_workers"]
assert cluster_config["available_node_types"][NODE_TYPE_LEGACY_WORKER][
"min_workers"] == SMALL_CLUSTER["min_workers"]
def test_handle_legacy_cluster_config_yaml():
provider = MockProvider()
head_resources = {"CPU": 8, "GPU": 1}
worker_resources = {"CPU": 32, "GPU": 8}
cluster_config = copy.deepcopy(SMALL_CLUSTER) # Legacy cluster_config.
cluster_config = rewrite_legacy_yaml_to_available_node_types(
cluster_config)
scheduler = ResourceDemandScheduler(
provider,
cluster_config["available_node_types"],
0,
head_node_type=NODE_TYPE_LEGACY_HEAD)
provider.create_node({}, {
TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_HEAD
}, 1)
head_ip = provider.non_terminated_node_ips({})[0]
head_node_id = provider.non_terminated_nodes({})[0]
to_launch = scheduler.get_nodes_to_launch([], {}, [], {}, [],
{head_ip: head_resources})
assert to_launch == {} # Should always be empty with max_workers = 0.
with mock.patch(GET_DEFAULT_METHOD, return_value=MOCK_DEFAULT_CONFIG):
provider = MockProvider()
head_resources = {"CPU": 8, "GPU": 1}
worker_resources = {"CPU": 32, "GPU": 8}
cluster_config = copy.deepcopy(SMALL_CLUSTER) # Legacy cluster_config.
cluster_config = prepare_config(cluster_config)
scheduler = ResourceDemandScheduler(
provider,
cluster_config["available_node_types"],
0,
head_node_type=NODE_TYPE_LEGACY_HEAD)
provider.create_node({}, {
TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_HEAD
}, 1)
head_ip = provider.non_terminated_node_ips({})[0]
head_node_id = provider.non_terminated_nodes({})[0]
to_launch = scheduler.get_nodes_to_launch([], {}, [], {}, [],
{head_ip: head_resources})
assert to_launch == {} # Should always be empty with max_workers = 0.
scheduler.max_workers = 30
min_workers = scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"]
scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"] = 0
to_launch = scheduler.get_nodes_to_launch([head_node_id], {}, [], {}, [],
{head_ip: head_resources})
assert to_launch == {
} # Since the resource demand does not require adding nodes.
to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
[head_resources], {}, [],
{head_ip: head_resources})
assert to_launch == {
} # Since the resource demand does not require adding nodes.
scheduler.max_workers = 30
min_workers = scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
"min_workers"]
scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"] = 0
to_launch = scheduler.get_nodes_to_launch(
[head_node_id], {}, [], {}, [], {head_ip: head_resources})
assert to_launch == {
} # Since the resource demand does not require adding nodes.
to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
[head_resources], {}, [],
{head_ip: head_resources})
assert to_launch == {
} # Since the resource demand does not require adding nodes.
scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["min_workers"] = min_workers
# Returns min_workers when min_workers>0.
to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
[head_resources], {}, [],
{head_ip: head_resources})
assert to_launch == {NODE_TYPE_LEGACY_WORKER: min_workers}
scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
"min_workers"] = min_workers
# Returns min_workers when min_workers>0.
to_launch = scheduler.get_nodes_to_launch([head_node_id], {},
[head_resources], {}, [],
{head_ip: head_resources})
assert to_launch == {NODE_TYPE_LEGACY_WORKER: min_workers}
provider.create_node({}, {
TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED,
TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_WORKER
}, min_workers)
nodes = provider.non_terminated_nodes({})
to_launch = scheduler.get_nodes_to_launch(nodes, {}, [head_resources], {},
[], {head_ip: head_resources})
assert to_launch == {} # A node is running, at some point it'll connect.
pending_launches = {NODE_TYPE_LEGACY_WORKER: 4}
to_launch = scheduler.get_nodes_to_launch([], pending_launches,
[head_resources], {}, [],
{head_ip: head_resources})
assert to_launch == {} # A node is launching, at some point it'll connect.
provider.create_node({}, {
TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED,
TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_WORKER
}, min_workers)
nodes = provider.non_terminated_nodes({})
to_launch = scheduler.get_nodes_to_launch(
nodes, {}, [head_resources], {}, [], {head_ip: head_resources})
# A node is running, at some point it'll connect.
assert to_launch == {}
pending_launches = {NODE_TYPE_LEGACY_WORKER: 4}
to_launch = scheduler.get_nodes_to_launch([], pending_launches,
[head_resources], {}, [],
{head_ip: head_resources})
# A node is launching, at some point it'll connect.
assert to_launch == {}
# Now assume that we already launched/connected the nodes.
ips = provider.non_terminated_node_ips({})
lm = LoadMetrics()
worker_ips = []
for ip in ips:
if ip == head_ip:
lm.update(ip, head_resources, head_resources, {})
else:
lm.update(ip, worker_resources, worker_resources, {})
worker_ips.append(ip)
# Now assume that we already launched/connected the nodes.
ips = provider.non_terminated_node_ips({})
lm = LoadMetrics()
worker_ips = []
for ip in ips:
if ip == head_ip:
lm.update(ip, head_resources, head_resources, {})
else:
lm.update(ip, worker_resources, worker_resources, {})
worker_ips.append(ip)
assert not scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["resources"]
to_launch = scheduler.get_nodes_to_launch(
nodes, {}, [], {}, [], lm.get_static_node_resources_by_ip())
assert scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
"resources"] == worker_resources
assert to_launch == {}
utilizations = {ip: worker_resources for ip in worker_ips}
utilizations[head_ip] = head_resources
# Requires 4 nodes since worker resources are bigger than head resources.
demands = [worker_resources] * (len(utilizations) + 3)
to_launch = scheduler.get_nodes_to_launch(
nodes, {}, demands, utilizations, [],
lm.get_static_node_resources_by_ip())
# 4 nodes are necessary to meet resource demand, but we never exceed
# max_workers.
assert to_launch == {}
scheduler.max_workers = 10
to_launch = scheduler.get_nodes_to_launch(
nodes, {}, demands, utilizations, [],
lm.get_static_node_resources_by_ip())
# 4 nodes are necessary to meet resource demand, but we never exceed
# max_workers.
assert to_launch == {}
scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["max_workers"] = 10
to_launch = scheduler.get_nodes_to_launch(
nodes, {}, demands, utilizations, [],
lm.get_static_node_resources_by_ip())
# 4 nodes are necessary to meet resource demand.
assert to_launch == {NODE_TYPE_LEGACY_WORKER: 4}
to_launch = scheduler.get_nodes_to_launch(nodes, pending_launches, demands,
utilizations, [],
lm.get_node_resources())
# 0 because there are 4 pending launches and we only need 4.
assert to_launch == {}
to_launch = scheduler.get_nodes_to_launch(nodes, pending_launches,
demands * 2, utilizations, [],
lm.get_node_resources())
# 1 because there are 4 pending launches and we only allow a max of 5.
assert to_launch == {NODE_TYPE_LEGACY_WORKER: 1}
assert not scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["resources"]
to_launch = scheduler.get_nodes_to_launch(
nodes, {}, [], {}, [], lm.get_static_node_resources_by_ip())
assert scheduler.node_types[NODE_TYPE_LEGACY_WORKER][
"resources"] == worker_resources
assert to_launch == {}
utilizations = {ip: worker_resources for ip in worker_ips}
utilizations[head_ip] = head_resources
# Needs 4 nodes since worker resources are bigger than head resources.
demands = [worker_resources] * (len(utilizations) + 3)
to_launch = scheduler.get_nodes_to_launch(
nodes, {}, demands, utilizations, [],
lm.get_static_node_resources_by_ip())
# 4 nodes are necessary to meet resource demand, but we never exceed
# max_workers.
assert to_launch == {}
scheduler.max_workers = 10
to_launch = scheduler.get_nodes_to_launch(
nodes, {}, demands, utilizations, [],
lm.get_static_node_resources_by_ip())
# 4 nodes are necessary to meet resource demand, but we never exceed
# max_workers.
assert to_launch == {}
scheduler.node_types[NODE_TYPE_LEGACY_WORKER]["max_workers"] = 10
to_launch = scheduler.get_nodes_to_launch(
nodes, {}, demands, utilizations, [],
lm.get_static_node_resources_by_ip())
# 4 nodes are necessary to meet resource demand.
assert to_launch == {NODE_TYPE_LEGACY_WORKER: 4}
to_launch = scheduler.get_nodes_to_launch(nodes, pending_launches,
demands, utilizations, [],
lm.get_node_resources())
# 0 because there are 4 pending launches and we only need 4.
assert to_launch == {}
to_launch = scheduler.get_nodes_to_launch(
nodes, pending_launches, demands * 2, utilizations, [],
lm.get_node_resources())
# 1 because there are 4 pending launches and we only allow a max of 5.
assert to_launch == {NODE_TYPE_LEGACY_WORKER: 1}
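# --- Illustrative aside (a rough sketch, not Ray's scheduler) ---
# The arithmetic behind the last two assertions: new launches per node type are
# limited by a launch-concurrency cap (assumed to be 5 here), and launches that
# are already pending count against both the demand and the cap.
def new_launches(needed, pending, launch_cap=5):
    return max(min(needed, launch_cap) - pending, 0)

assert new_launches(needed=4, pending=4) == 0  # pending launches already cover the demand
assert new_launches(needed=8, pending=4) == 1  # cap of 5 minus the 4 already pending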
class LoadMetricsTest(unittest.TestCase):