[tune/release] Add up/down scaling release test (#25392)
This adds a nightly release test that asserts that autoscaling a cluster up and down in a Ray Tune run works.

Signed-off-by: Kai Fricke <kai@anyscale.com>
parent 6ef26cd8ff
commit cd95569b01
4 changed files with 110 additions and 0 deletions
release/cluster_tests/app_config.yaml (new executable file, 15 lines)
@@ -0,0 +1,15 @@
base_image: {{ env["RAY_IMAGE_NIGHTLY_CPU"] | default("anyscale/ray:nightly-py37") }}
env_vars: {}
debian_packages:
  - curl

python:
  pip_packages:
    - pytest
    - awscli
    - pyarrow>=6.0.1,<7.0.0
  conda_packages: []

post_build_cmds:
  - pip3 uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
  - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}
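The {{ env[...] | default(...) }} expressions are Jinja2 template syntax: the release tooling renders this file against the CI environment before using it, so unset variables fall back to their defaults (the nightly CPU image, plain "ray" from PyPI). A minimal sketch of that resolution, assuming a plain jinja2 render with the environment passed in as "env" (the release tooling's actual rendering code may differ):

from jinja2 import Template

# One line of app_config.yaml, used here as a stand-in.
line = 'base_image: {{ env["RAY_IMAGE_NIGHTLY_CPU"] | default("anyscale/ray:nightly-py37") }}'

# A missing key yields Jinja's Undefined, so the default filter kicks in.
print(Template(line).render(env={}))
# base_image: anyscale/ray:nightly-py37

# A set variable wins over the default (value here is hypothetical).
print(Template(line).render(env={"RAY_IMAGE_NIGHTLY_CPU": "anyscale/ray:nightly-py38"}))
# base_image: anyscale/ray:nightly-py38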
release/cluster_tests/cpt_autoscaling_1-3.yaml (new file, 13 lines)
@@ -0,0 +1,13 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

head_node_type:
  name: head_node
  instance_type: m5.xlarge # 4 CPUs

worker_node_types:
  - name: worker_node
    instance_type: m5.xlarge
    min_workers: 0
    max_workers: 2
    use_spot: false
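One m5.xlarge head (4 CPUs) plus zero to two m5.xlarge workers means the cluster can range from 1 to 3 nodes, hence the "1-3" in the file name. The sizing lines up with the workload: three trials at 4 CPUs each cannot fit on the head alone, so the autoscaler must bring up both workers. A back-of-the-envelope check with the numbers from this config and the test script below:

import math

head_cpus = 4         # m5.xlarge head node
worker_cpus = 4       # m5.xlarge worker node
cpus_per_trial = 4    # resources_per_trial={"cpu": 4} in the test script
num_samples = 3       # num_samples=3 in the test script

demand = num_samples * cpus_per_trial  # 12 CPUs requested in total
workers_needed = math.ceil(max(0, demand - head_cpus) / worker_cpus)
assert workers_needed == 2  # equals max_workers, so the cluster peaks at 3 nodes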
release/cluster_tests/workloads/tune_scale_up_down.py (new file, 57 lines)
@@ -0,0 +1,57 @@
from collections import Counter
import time

import ray

from ray import tune


def train(config):
    this_node_ip = ray.util.get_node_ip_address()
    if config["head_node_ip"] == this_node_ip:
        # On the head node, run for 30 minutes
        for i in range(30):
            tune.report(metric=i)
            time.sleep(60)
    else:
        # On worker nodes, run for 3 minutes
        for i in range(3):
            tune.report(metric=i)
            time.sleep(60)


class NodeCountCallback(tune.Callback):
    def __init__(self):
        self.node_counts = []

    def on_step_begin(self, iteration, trials, **info):
        node_count = len([n for n in ray.nodes() if n["Alive"]])
        self.node_counts.append(node_count)


def main():
    ray.init()

    head_node_ip = ray.util.get_node_ip_address()

    assert (
        len([n for n in ray.nodes() if n["Alive"]]) == 1
    ), "Too many nodes available at start of script"

    node_counter = NodeCountCallback()

    tune.run(
        train,
        num_samples=3,
        config={"head_node_ip": head_node_ip},
        callbacks=[node_counter],
        resources_per_trial={"cpu": 4},
    )

    node_counts = Counter(node_counter.node_counts)
    assert node_counts[3] > 0, "Cluster never scaled to 3 nodes"
    assert node_counter.node_counts[-1] == 1, "Cluster didn't scale down to 1 node."


if __name__ == "__main__":
    main()
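The pass/fail logic is concentrated in the two assertions at the end: NodeCountCallback samples the number of alive nodes once per Tune step, and the run passes only if some sample saw 3 nodes (full scale-up) and the final sample is back to 1 node (scale-down while only the long head-node trial is still running, since the worker trials finish after 3 minutes). The same check in isolation, on a made-up trace of node counts (the trace values are illustrative, not from a real run):

from collections import Counter

# Hypothetical per-step samples of alive nodes: start on the head only,
# scale up to 3 nodes, then back down to 1 after the worker trials finish.
trace = [1, 1, 2, 3, 3, 3, 2, 1]

counts = Counter(trace)
assert counts[3] > 0, "Cluster never scaled to 3 nodes"
assert trace[-1] == 1, "Cluster didn't scale down to 1 node."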
Fourth changed file (the release test definitions; 25 lines added):
@@ -94,6 +94,31 @@
  # # It can then let the test fail, e.g. if a metric regression is observed.
  # alert: default

#######################
# Cluster scaling tests
#######################
- name: cluster_tune_scale_up_down
  group: Cluster tests
  working_dir: cluster_tests

  frequency: nightly
  team: ml
  env: staging

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: cpt_autoscaling_1-3.yaml

  run:
    timeout: 3600
    script: python workloads/tune_scale_up_down.py
    wait_for_nodes:
      num_nodes: 0

  type: sdk_command

  alert: default


#########################
# AIR release tests
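Setting wait_for_nodes to num_nodes: 0 makes the harness start the script without waiting for any worker nodes, so the script's "exactly one alive node" startup assertion can hold and scale-up is exercised from a cold cluster. The 3600 s timeout leaves headroom over the longest trial; a rough sanity check (the slack figure is an assumed allowance for illustration, not a number from the test):

head_trial_s = 30 * 60    # longest trial: 30 reports with a 60 s sleep each
slack_s = 20 * 60         # assumed allowance for node startup, scale-down, teardown
assert head_trial_s + slack_s <= 3600, "run.timeout would be too tight"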