From cd95569b014cf06479b96875c63eba171e92ec97 Mon Sep 17 00:00:00 2001
From: Kai Fricke
Date: Wed, 13 Jul 2022 22:57:24 +0100
Subject: [PATCH] [tune/release] Add up/down scaling release test (#25392)

This adds a nightly release test that asserts that autoscaling a cluster up and down in a Ray Tune run works.

Signed-off-by: Kai Fricke
---
 release/cluster_tests/app_config.yaml         | 15 +++++
 .../cluster_tests/cpt_autoscaling_1-3.yaml    | 13 +++++
 .../workloads/tune_scale_up_down.py           | 57 +++++++++++++++++++
 release/release_tests.yaml                    | 25 ++++++++
 4 files changed, 110 insertions(+)
 create mode 100755 release/cluster_tests/app_config.yaml
 create mode 100644 release/cluster_tests/cpt_autoscaling_1-3.yaml
 create mode 100644 release/cluster_tests/workloads/tune_scale_up_down.py

diff --git a/release/cluster_tests/app_config.yaml b/release/cluster_tests/app_config.yaml
new file mode 100755
index 000000000..e552178aa
--- /dev/null
+++ b/release/cluster_tests/app_config.yaml
@@ -0,0 +1,15 @@
+base_image: {{ env["RAY_IMAGE_NIGHTLY_CPU"] | default("anyscale/ray:nightly-py37") }}
+env_vars: {}
+debian_packages:
+  - curl
+
+python:
+  pip_packages:
+    - pytest
+    - awscli
+    - pyarrow>=6.0.1,<7.0.0
+  conda_packages: []
+
+post_build_cmds:  # reinstall Ray from RAY_WHEELS (default: "ray"), then run RAY_WHEELS_SANITY_CHECK (default: an echo)
+  - pip3 uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
+  - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}
diff --git a/release/cluster_tests/cpt_autoscaling_1-3.yaml b/release/cluster_tests/cpt_autoscaling_1-3.yaml
new file mode 100644
index 000000000..18a66b69a
--- /dev/null
+++ b/release/cluster_tests/cpt_autoscaling_1-3.yaml
@@ -0,0 +1,13 @@
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west-2
+
+head_node_type:
+    name: head_node
+    instance_type: m5.xlarge  # 4 CPUs
+
+worker_node_types:
+    - name: worker_node
+      instance_type: m5.xlarge
+      min_workers: 0  # workloads/tune_scale_up_down.py asserts exactly 1 alive node at start
+      max_workers: 2  # head + 2 workers = the 3 nodes the workload asserts were reached
+      use_spot: false
diff --git a/release/cluster_tests/workloads/tune_scale_up_down.py
b/release/cluster_tests/workloads/tune_scale_up_down.py
new file mode 100644
index 000000000..f99ebcfc4
--- /dev/null
+++ b/release/cluster_tests/workloads/tune_scale_up_down.py
@@ -0,0 +1,57 @@
+"""Release test: a Ray Tune run must scale the cluster up and back down."""
+from collections import Counter
+import time
+
+import ray
+
+from ray import tune
+
+
+def _alive_node_count():
+    """Return how many entries of ``ray.nodes()`` are flagged ``Alive``."""
+    return sum(1 for node in ray.nodes() if node["Alive"])
+
+
+def train(config):
+    """Report one metric per minute: 30 times on the head node, 3 elsewhere."""
+    on_head = ray.util.get_node_ip_address() == config["head_node_ip"]
+    for step in range(30 if on_head else 3):
+        tune.report(metric=step)
+        time.sleep(60)
+
+
+class NodeCountCallback(tune.Callback):
+    """Tune callback that samples the alive-node count in ``on_step_begin``."""
+
+    def __init__(self):
+        self.node_counts = []  # one sample per ``on_step_begin`` call, in order
+
+    def on_step_begin(self, iteration, trials, **info):
+        self.node_counts.append(_alive_node_count())
+
+
+def main():
+    """Run three 4-CPU trials and check the sampled node counts.
+
+    Asserts one alive node at start, a 3-node sample, and a final sample of 1.
+    """
+    ray.init()
+    head_ip = ray.util.get_node_ip_address()
+    assert _alive_node_count() == 1, "Too many nodes available at start of script"
+
+    recorder = NodeCountCallback()
+    tune.run(
+        train,
+        num_samples=3,
+        config={"head_node_ip": head_ip},
+        callbacks=[recorder],
+        resources_per_trial={"cpu": 4},
+    )
+
+    tally = Counter(recorder.node_counts)
+    assert tally[3] > 0, "Cluster never scaled to 3 nodes"
+    assert recorder.node_counts[-1] == 1, "Cluster didn't scale down to 1 node."
+
+
+if __name__ == "__main__":
+    main()
diff --git a/release/release_tests.yaml b/release/release_tests.yaml
index 5006e13ac..bc3a96531 100644
--- a/release/release_tests.yaml
+++ b/release/release_tests.yaml
@@ -94,6 +94,31 @@
 # # It can then let the test fail, e.g. if a metric regression is observed.
# alert: default +####################### +# Cluster scaling tests +####################### +- name: cluster_tune_scale_up_down + group: Cluster tests + working_dir: cluster_tests + + frequency: nightly + team: ml + env: staging + + cluster: + cluster_env: app_config.yaml + cluster_compute: cpt_autoscaling_1-3.yaml + + run: + timeout: 3600 + script: python workloads/tune_scale_up_down.py + wait_for_nodes: + num_nodes: 0 + + type: sdk_command + + alert: default + ######################### # AIR release tests