diff --git a/python/ray/autoscaler/aws/example-ml.yaml b/python/ray/autoscaler/aws/example-ml.yaml
new file mode 100644
index 000000000..9732006fa
--- /dev/null
+++ b/python/ray/autoscaler/aws/example-ml.yaml
@@ -0,0 +1,158 @@
+# A cluster setup for ML / RLlib workloads. Note that this uses PyTorch by default.
+# If you want to use TensorFlow, change pytorch_p36 to tensorflow_p36 below.
+#
+# Important: Make sure to run "source activate pytorch_p36" in your sessions to
+# activate the right conda environment. Otherwise you won't be able to import ray.
+#
+cluster_name: ml
+
+# The minimum number of worker nodes to launch in addition to the head
+# node. This number should be >= 0.
+min_workers: 0
+
+# The maximum number of worker nodes to launch in addition to the head
+# node. This takes precedence over min_workers.
+max_workers: 0
+
+# The initial number of worker nodes to launch in addition to the head
+# node. When the cluster is first brought up (or when it is refreshed with a
+# subsequent `ray up`) this number of nodes will be started.
+initial_workers: 0
+
+# Whether or not to autoscale aggressively. If this is enabled, then whenever
+# the autoscaler would start more workers, it starts at least enough of them
+# to bring the cluster up to initial_workers.
+autoscaling_mode: default
+
+# This executes all commands on all nodes in the docker container,
+# and opens all the necessary ports to support the Ray cluster.
+# Empty string means disabled.
+docker:
+    image: "" # e.g., rayproject/ray:0.8.7
+    container_name: "" # e.g., ray_docker
+    # If true, pulls the latest version of the image. Otherwise, `docker run`
+    # will only pull the image if no cached version is present.
+    pull_before_run: True
+    run_options: []  # Extra options to pass into "docker run"
+
+    # Example of running a GPU head with CPU workers
+    # head_image: "rayproject/ray:0.8.7-gpu"
+    # head_run_options:
+    #     - --runtime=nvidia
+
+    # worker_image: "rayproject/ray:0.8.7"
+    # worker_run_options: []
+
+# The autoscaler will scale up the cluster to this target fraction of resource
+# usage. For example, if a cluster of 10 nodes is 100% busy and
+# target_utilization is 0.8, it would resize the cluster to 13. This fraction
+# can be decreased to increase the aggressiveness of upscaling.
+# The max value allowed is 1.0, which is the most conservative setting.
+target_utilization_fraction: 0.8
+
+# If a node is idle for this many minutes, it will be removed.
+idle_timeout_minutes: 5
+
+# Cloud-provider specific configuration.
+provider:
+    type: aws
+    region: us-west-2
+    # Availability zone(s), comma-separated, that nodes may be launched in.
+    # Nodes are currently spread between zones by a round-robin approach,
+    # however this implementation detail should not be relied upon.
+    availability_zone: us-west-2a,us-west-2b
+    # Whether to allow node reuse. If set to False, nodes will be terminated
+    # instead of stopped.
+    cache_stopped_nodes: True # If not present, the default is True.
+
+# How Ray will authenticate with newly launched nodes.
+auth:
+    ssh_user: ubuntu
+# By default Ray creates a new private keypair, but you can also use your own.
+# If you do so, make sure to also set "KeyName" in the head and worker node
+# configurations below.
+# ssh_private_key: /path/to/your/key.pem
+
+# Provider-specific config for the head node, e.g. instance type. By default
+# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+# For more documentation on available fields, see:
+# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+head_node:
+    InstanceType: m4.16xlarge
+    ImageId: latest_dlami
+
+    # You can provision additional disk space with a config like the following.
+    BlockDeviceMappings:
+        - DeviceName: /dev/sda1
+          Ebs:
+              VolumeSize: 100
+
+    # Additional options in the boto docs.
+
+# Provider-specific config for worker nodes, e.g. instance type. By default
+# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+# For more documentation on available fields, see:
+# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+worker_nodes:
+    InstanceType: m4.16xlarge
+    ImageId: latest_dlami
+
+    # Uncomment this to use spot nodes.
+    # InstanceMarketOptions:
+    #     MarketType: spot
+    #     # Additional options can be found in the boto docs, e.g.
+    #     #   SpotOptions:
+    #     #       MaxPrice: MAX_HOURLY_PRICE
+    #
+    # Additional options in the boto docs.
+
+# Files or directories to copy to the head and worker nodes. The format is a
+# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
+file_mounts: {
+#    "/path1/on/remote/machine": "/path1/on/local/machine",
+#    "/path2/on/remote/machine": "/path2/on/local/machine",
+}
+
+# Files or directories to copy from the head node to the worker nodes. The format is a
+# list of paths. The same path on the head node will be copied to the worker node.
+# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
+# you should just use file_mounts. Only use this if you know what you're doing!
+cluster_synced_files: []
+
+# Whether changes to directories in file_mounts or cluster_synced_files on the head node
+# should sync to the worker nodes continuously.
+file_mounts_sync_continuously: False
+
+# List of commands that will be run before `setup_commands`. If docker is
+# enabled, these commands will run outside the container and before docker
+# is set up.
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+setup_commands:
+    # Note: if you're developing Ray, you probably want to create an AMI that
+    # has your Ray repo pre-cloned. Then, you can replace the pip installs
+    # below with a git checkout (and possibly a recompile).
+    - source activate pytorch_p36 && pip install -U ray
+    - source activate pytorch_p36 && pip install -U ray[rllib] ray[tune] ray[debug]
+    # Consider uncommenting these if you also want to run apt-get commands during setup
+    # - sudo pkill -9 apt-get || true
+    # - sudo pkill -9 dpkg || true
+    # - sudo dpkg --configure -a
+
+# Custom commands that will be run on the head node after common setup.
+head_setup_commands:
+    - pip install "boto3>=1.4.8"  # 1.4.8 adds InstanceMarketOptions; quoted so ">" is not a shell redirect
+
+# Custom commands that will be run on worker nodes after common setup.
+worker_setup_commands: []
+
+# Command to start ray on the head node. You don't need to change this.
+head_start_ray_commands:
+    - source activate pytorch_p36 && ray stop
+    - ulimit -n 65536; source activate pytorch_p36 && ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
+
+# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands:
+    - source activate pytorch_p36 && ray stop
+    - ulimit -n 65536; source activate pytorch_p36 && ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
diff --git a/python/ray/autoscaler/cli_logger.py b/python/ray/autoscaler/cli_logger.py
index 4461dffda..7d1cb4370 100644
--- a/python/ray/autoscaler/cli_logger.py
+++ b/python/ray/autoscaler/cli_logger.py
@@ -698,9 +698,9 @@ class _CliLogger():
             raise ValueError("Non-interactive confirm without --yes.")
 
         if default:
-            yn_str = cf.limeGreen("Y") + "/" + cf.red("n")
+            yn_str = "Y/n"
         else:
-            yn_str = cf.limeGreen("y") + "/" + cf.red("N")
+            yn_str = "y/N"
 
         confirm_str = cf.underlined("Confirm [" + yn_str + "]:") + " "
 
diff --git a/python/ray/autoscaler/commands.py b/python/ray/autoscaler/commands.py
index c49408456..9be3d849b 100644
--- a/python/ray/autoscaler/commands.py
+++ b/python/ray/autoscaler/commands.py
@@ -154,8 +154,6 @@ def create_or_update_cluster(config_file: str,
         raise NotImplementedError("Unsupported provider {}".format(
             config["provider"]))
 
-    cli_logger.success("Cluster configuration valid")
-
     printed_overrides = False
 
     def handle_cli_override(key, override):
@@ -244,8 +242,8 @@ def _bootstrap_config(config: Dict[str, Any],
 
     provider_cls = importer(config["provider"])
 
-    with cli_logger.timed(  # todo: better message
-            "Bootstrapping {} config",
+    with cli_logger.timed(
+            "Checking {} environment settings",
             PROVIDER_PRETTY_NAMES.get(config["provider"]["type"])):
         resolved_config = provider_cls.bootstrap_config(config)
 
diff --git a/python/ray/tests/test_cli_patterns/test_ray_attach.txt b/python/ray/tests/test_cli_patterns/test_ray_attach.txt
index cc648a61f..e743b90cd 100644
--- a/python/ray/tests/test_cli_patterns/test_ray_attach.txt
+++ b/python/ray/tests/test_cli_patterns/test_ray_attach.txt
@@ -1,3 +1,3 @@
-Bootstrapping AWS config
+Checking AWS environment settings
 Fetched IP: .+
 ubuntu@ip-.+:~\$ exit
diff --git a/python/ray/tests/test_cli_patterns/test_ray_exec.txt b/python/ray/tests/test_cli_patterns/test_ray_exec.txt
index 5496ac187..975ba3b52 100644
--- a/python/ray/tests/test_cli_patterns/test_ray_exec.txt
+++ b/python/ray/tests/test_cli_patterns/test_ray_exec.txt
@@ -1,3 +1,3 @@
-Bootstrapping AWS config
+Checking AWS environment settings
 Fetched IP: .+
 This is a test!
diff --git a/python/ray/tests/test_cli_patterns/test_ray_submit.txt b/python/ray/tests/test_cli_patterns/test_ray_submit.txt
index ef94f13c0..efc900092 100644
--- a/python/ray/tests/test_cli_patterns/test_ray_submit.txt
+++ b/python/ray/tests/test_cli_patterns/test_ray_submit.txt
@@ -1,5 +1,5 @@
-Bootstrapping AWS config
+Checking AWS environment settings
 Fetched IP: .+
-Bootstrapping AWS config
+Checking AWS environment settings
 Fetched IP: .+
 This is a test!
diff --git a/python/ray/tests/test_cli_patterns/test_ray_up.txt b/python/ray/tests/test_cli_patterns/test_ray_up.txt
index 3ae4349bf..56dbc4fe4 100644
--- a/python/ray/tests/test_cli_patterns/test_ray_up.txt
+++ b/python/ray/tests/test_cli_patterns/test_ray_up.txt
@@ -1,7 +1,6 @@
-Cluster configuration valid
 Cluster: test-cli
-Bootstrapping AWS config
+Checking AWS environment settings
 AWS config
   IAM Profile: .+ \[default\]
   EC2 Key pair \(head & workers\): .+ \[default\]
diff --git a/python/ray/tests/test_cli_patterns/test_ray_up_record.txt b/python/ray/tests/test_cli_patterns/test_ray_up_record.txt
index 0aa137248..bf47fccc6 100644
--- a/python/ray/tests/test_cli_patterns/test_ray_up_record.txt
+++ b/python/ray/tests/test_cli_patterns/test_ray_up_record.txt
@@ -1,6 +1,5 @@
-.+\.py.*Cluster configuration valid
 .+\.py.*Cluster: test-cli
-.+\.py.*Bootstrapping AWS config
+.+\.py.*Checking AWS environment settings
 .+\.py.*Creating new IAM instance profile ray-autoscaler-v1 for use as the default\.
 .+\.py.*Creating new IAM role ray-autoscaler-v1 for use as the default instance role\.
 .+\.py.*Creating new key pair __test-cli_key-1 for use as the default\.
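For reference, a minimal sketch of how the files touched above are exercised from the CLI, assuming the new YAML is saved at the path added by this patch and AWS credentials are configured locally. Only standard `ray` subcommands are used; `ray up` is where the renamed "Checking AWS environment settings" message (matched by the test patterns above) is printed:

    # Launch or update the cluster defined by the example config.
    ray up python/ray/autoscaler/aws/example-ml.yaml -y

    # SSH into the head node; remember to activate the conda environment there.
    ray attach python/ray/autoscaler/aws/example-ml.yaml

    # Run a one-off command on the head node.
    ray exec python/ray/autoscaler/aws/example-ml.yaml "source activate pytorch_p36 && python -c 'import ray'"

    # Tear the cluster down when finished.
    ray down python/ray/autoscaler/aws/example-ml.yaml -y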