[tune/release] Add up/down scaling release test (#25392)

This adds a nightly release test that asserts that autoscaling a cluster up and down in a Ray Tune run works.

Signed-off-by: Kai Fricke <kai@anyscale.com>
This commit is contained in:
Kai Fricke 2022-07-13 22:57:24 +01:00 committed by GitHub
parent 6ef26cd8ff
commit cd95569b01
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 110 additions and 0 deletions

View file

@ -0,0 +1,15 @@
# Cluster environment: base image plus the extra packages installed on top.
# The image comes from RAY_IMAGE_NIGHTLY_CPU, defaulting to the nightly py37 image.
base_image: {{ env["RAY_IMAGE_NIGHTLY_CPU"] | default("anyscale/ray:nightly-py37") }}
env_vars: {}
debian_packages:
- curl
python:
pip_packages:
- pytest
- awscli
- pyarrow>=6.0.1,<7.0.0
conda_packages: []
post_build_cmds:
# Remove any Ray that is already installed (ignoring failure), then install
# the wheels given by RAY_WHEELS, or the released `ray` package if unset.
- pip3 uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
# Optional sanity check command from RAY_WHEELS_SANITY_CHECK; echoes a notice if unset.
- {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}

View file

@ -0,0 +1,13 @@
# Autoscaling compute config: one m5.xlarge head node plus 0 to 2 m5.xlarge
# workers (on-demand, since use_spot is false).
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2
head_node_type:
name: head_node
instance_type: m5.xlarge # 4 CPUs
worker_node_types:
- name: worker_node
instance_type: m5.xlarge
# No workers are required up front; at most two may be added.
min_workers: 0
max_workers: 2
use_spot: false

View file

@ -0,0 +1,57 @@
from collections import Counter
import time
import ray
from ray import tune
def train(config):
    """Tune trainable whose runtime depends on the node it runs on.

    Reports ``metric`` and then sleeps for 60 seconds on every iteration.
    When the local node IP equals ``config["head_node_ip"]`` it runs 30
    iterations (about 30 minutes); on any other node it runs 3 iterations
    (about 3 minutes).

    Args:
        config: Tune config dict; must contain the key ``"head_node_ip"``.
    """
    local_ip = ray.util.get_node_ip_address()
    on_head_node = config["head_node_ip"] == local_ip

    # Head node: 30 one-minute steps; worker nodes: 3 one-minute steps.
    num_minutes = 30 if on_head_node else 3

    for step in range(num_minutes):
        tune.report(metric=step)
        time.sleep(60)
class NodeCountCallback(tune.Callback):
    """Tune callback that records how many Ray nodes are alive at each step.

    Attributes are documented inline; the recorded counts are read by the
    caller after the Tune run finishes.
    """

    def __init__(self):
        # One entry per ``on_step_begin`` call, in call order.
        self.node_counts = []

    def on_step_begin(self, iteration, trials, **info):
        """Append the current number of alive nodes to ``node_counts``."""
        alive_nodes = sum(1 for node in ray.nodes() if node["Alive"])
        self.node_counts.append(alive_nodes)
def main():
    """Run the scale up/down check.

    Connects to Ray, verifies exactly one node is alive, runs three Tune
    trials of ``train`` while recording alive-node counts, and then asserts
    that three nodes were observed at some point and that the last recorded
    count was one.

    Raises:
        AssertionError: If the cluster does not start with exactly one alive
            node, never reaches three nodes, or does not end at one node.
    """
    ray.init()

    head_node_ip = ray.util.get_node_ip_address()

    alive_at_start = sum(1 for node in ray.nodes() if node["Alive"])
    assert alive_at_start == 1, "Too many nodes available at start of script"

    node_counter = NodeCountCallback()

    tune.run(
        train,
        num_samples=3,
        config={"head_node_ip": head_node_ip},
        callbacks=[node_counter],
        # NOTE(review): 4 CPUs per trial presumably fills one whole node,
        # so three trials need three nodes -- confirm against cluster config.
        resources_per_trial={"cpu": 4},
    )

    observed_counts = node_counter.node_counts
    count_histogram = Counter(observed_counts)

    assert count_histogram[3] > 0, "Cluster never scaled to 3 nodes"
    assert observed_counts[-1] == 1, "Cluster didn't scale down to 1 node."


if __name__ == "__main__":
    main()

View file

@ -94,6 +94,31 @@
# # It can then let the test fail, e.g. if a metric regression is observed.
# alert: default
#######################
# Cluster scaling tests
#######################
# Nightly test that runs the Tune scale up/down workload script on the
# autoscaling compute config referenced below.
- name: cluster_tune_scale_up_down
group: Cluster tests
working_dir: cluster_tests
frequency: nightly
team: ml
env: staging
cluster:
cluster_env: app_config.yaml
cluster_compute: cpt_autoscaling_1-3.yaml
run:
# NOTE(review): presumably seconds (one hour); the workload's longest trial
# sleeps for about 30 minutes in total -- confirm the unit.
timeout: 3600
script: python workloads/tune_scale_up_down.py
wait_for_nodes:
# NOTE(review): presumably 0 so the script starts without waiting for
# workers; it asserts exactly one alive node at startup -- confirm.
num_nodes: 0
type: sdk_command
alert: default
#########################
# AIR release tests