ray/release/ray_release/buildkite/concurrency.py
Kai Fricke 6c5229295e
[ci/release] Support running tests with different python versions (#24843)
OSS release tests currently run on a hardcoded Python 3.7 base. In the future, we will want to run tests on different Python versions.
This PR adds support for a new `python` field in the test configuration. The `python` field determines both the base image used in the Buildkite runner Docker container (for Ray client compatibility) and the base image for the Anyscale cluster environments.

Note that in Buildkite we will still only wait for the Python 3.7 base image before kicking off tests. That is acceptable: we can assume that most wheels finish at around the same time, so even if we wait for the 3.7 image and then kick off a 3.8 test, that runner will only wait for perhaps another 5-10 minutes.
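
For illustration, a test definition using the new field might look like the following (a minimal sketch as a plain dict; only the `python` key is introduced by this PR, the other key and both values are placeholders):

    test = {
        "name": "example_release_test",  # placeholder test name
        "python": "3.8",  # selects the Buildkite runner image and the cluster env base image
    }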
2022-05-17 17:03:12 +01:00


import csv
import os
from collections import namedtuple
from typing import Tuple, Optional, Dict

from ray_release.config import Test, RELEASE_PACKAGE_DIR
from ray_release.template import load_test_cluster_compute
from ray_release.logger import logger

# Keep 10% as a buffer.
limit = int(15784 * 0.9)

CONCURRENY_GROUPS = {
    "tiny": 32,  # <= 1k vCPU
    "small": 16,  # <= 2k vCPU
    "medium": 6,  # <= 3k vCPU
    "large": 8,  # <= 8k vCPU
    "enormous": 1,  # <= 4k vCPU (?)
    "small-gpu": 8,
    "large-gpu": 4,
}

Condition = namedtuple(
    "Condition", ["min_gpu", "max_gpu", "min_cpu", "max_cpu", "group"]
)

# A value of -1 means "no limit" (see parse_condition below).
gpu_cpu_to_concurrency_groups = [
    Condition(min_gpu=9, max_gpu=-1, min_cpu=0, max_cpu=-1, group="large-gpu"),
    Condition(min_gpu=1, max_gpu=9, min_cpu=0, max_cpu=-1, group="small-gpu"),
    Condition(min_gpu=0, max_gpu=0, min_cpu=1025, max_cpu=-1, group="enormous"),
    Condition(min_gpu=0, max_gpu=0, min_cpu=513, max_cpu=1024, group="large"),
    Condition(min_gpu=0, max_gpu=0, min_cpu=129, max_cpu=512, group="medium"),
    Condition(min_gpu=0, max_gpu=0, min_cpu=0, max_cpu=32, group="tiny"),
    # Make sure "small" is the last in the list, because it is the fallback.
    Condition(min_gpu=0, max_gpu=0, min_cpu=0, max_cpu=128, group="small"),
]

# Obtained from https://cloud.google.com/compute/docs/accelerator-optimized-machines
# Maps instance type to (num_cpus, num_gpus).
gcp_gpu_instances = {
    "a2-highgpu-1g": (12, 1),
    "a2-highgpu-2g": (24, 2),
    "a2-highgpu-4g": (48, 4),
    "a2-highgpu-8g": (96, 8),
    "a2-megagpu-16g": (96, 16),
}


def load_instance_types(path: Optional[str] = None) -> Dict[str, Tuple[int, int]]:
    """Load AWS instance types from the bundled CSV as {instance: (cpus, gpus)}."""
    path = path or os.path.join(
        RELEASE_PACKAGE_DIR, "ray_release", "buildkite", "aws_instance_types.csv"
    )
    instance_to_resources = {}
    with open(path, "rt") as fp:
        reader = csv.DictReader(fp)
        for row in reader:
            instance_to_resources[row["instance"]] = (
                int(row["cpus"]),
                int(row["gpus"]),
            )
    return instance_to_resources


def parse_instance_resources(instance: str) -> Tuple[int, int]:
    """Parse (GCP) instance strings to resources."""
    # Assumes that GPU instances have already been parsed
    num_cpus = int(instance.split("-")[-1])
    num_gpus = 0
    return num_cpus, num_gpus


def parse_condition(cond: int, limit: float = float("inf")) -> float:
    # A condition value of -1 means "no limit", so substitute the passed limit.
    return cond if cond > -1 else limit


def get_concurrency_group(test: Test) -> Tuple[str, int]:
    """Return the (group name, concurrency limit) for a test based on its resources."""
    try:
        test_cpus, test_gpus = get_test_resources(test)
    except Exception as e:
        logger.warning(f"Couldn't get test resources for test {test['name']}: {e}")
        return "small", CONCURRENY_GROUPS["small"]

    for condition in gpu_cpu_to_concurrency_groups:
        min_gpu = parse_condition(condition.min_gpu, float("-inf"))
        max_gpu = parse_condition(condition.max_gpu, float("inf"))
        min_cpu = parse_condition(condition.min_cpu, float("-inf"))
        max_cpu = parse_condition(condition.max_cpu, float("inf"))

        if min_cpu <= test_cpus <= max_cpu and min_gpu <= test_gpus <= max_gpu:
            group = condition.group
            return group, CONCURRENY_GROUPS[group]

    # Return default
    logger.warning(
        f"Could not find concurrency group for test {test['name']} "
        f"based on used resources."
    )
    return "small", CONCURRENY_GROUPS["small"]


def get_test_resources(test: Test) -> Tuple[int, int]:
    cluster_compute = load_test_cluster_compute(test)
    return get_test_resources_from_cluster_compute(cluster_compute)


def get_test_resources_from_cluster_compute(cluster_compute: Dict) -> Tuple[int, int]:
    instances = []

    # Add head node instance
    instances.append((cluster_compute["head_node_type"]["instance_type"], 1))

    # Add worker node instances
    instances.extend(
        (w["instance_type"], w.get("max_workers", w.get("min_workers", 1)))
        for w in cluster_compute["worker_node_types"]
    )

    aws_instance_types = load_instance_types()

    total_cpus = 0
    total_gpus = 0
    for instance, count in instances:
        if instance in aws_instance_types:
            instance_cpus, instance_gpus = aws_instance_types[instance]
        elif instance in gcp_gpu_instances:
            instance_cpus, instance_gpus = gcp_gpu_instances[instance]
        else:
            instance_cpus, instance_gpus = parse_instance_resources(instance)

        total_cpus += instance_cpus * count
        total_gpus += instance_gpus * count

    return total_cpus, total_gpus
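

# Usage sketch (illustrative only): a minimal example of the resource-to-group
# mapping. It assumes this runs from within the release package so the bundled
# aws_instance_types.csv can be loaded. The cluster compute below is made up
# and uses GCP-style instance names so the fallback parser applies.
if __name__ == "__main__":
    example_cluster_compute = {
        "head_node_type": {"instance_type": "n2-standard-16"},
        "worker_node_types": [
            {"instance_type": "n2-standard-64", "max_workers": 8},
        ],
    }
    # 16 + 8 * 64 = 528 CPUs and 0 GPUs, which matches the 513-1024 CPU
    # condition, i.e. the "large" concurrency group (limit 8).
    cpus, gpus = get_test_resources_from_cluster_compute(example_cluster_compute)
    print(cpus, gpus)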