ray/release/ray_release/buildkite/concurrency.py

import csv
import os
from collections import namedtuple
from typing import Tuple, Optional, Dict

from ray_release.config import Test, RELEASE_PACKAGE_DIR
from ray_release.template import load_test_cluster_compute
from ray_release.logger import logger

# Overall budget: 90% of 15784, i.e. 10% is kept back as a buffer.
# NOTE(review): 15784 is presumably the cloud vCPU service quota - confirm.
limit = int(15784 * 0.9)


# Concurrency group name -> concurrency limit of that group (the second value
# returned by `get_concurrency_group`).
# The vCPU figures in the trailing comments are rough aggregate budgets: the
# limit multiplied by the per-test CPU ceiling that the group has in
# `gpu_cpu_to_concurrency_groups` (e.g. tiny: 32 x 32 CPUs = 1024 vCPU).
# NOTE: the name is misspelled ("CONCURRENY"), but it is kept as-is because
# other code looks the mapping up under this name.
CONCURRENY_GROUPS = {
    "tiny": 32,  # 32 x 32 CPUs -> <= 1k vCPU
    "small": 16,  # 16 x 128 CPUs -> <= 2k vCPU
    "medium": 6,  # 6 x 512 CPUs -> <= 3k vCPU
    "large": 8,  # 8 x 1024 CPUs -> <= 8k vCPU
    "enormous": 1,  # no per-test CPU ceiling; "<= 4k vCPU" is an unverified guess
    "small-gpu": 8,
    "large-gpu": 4,
}


# One row of the resource -> concurrency group lookup table.
# A test matches a row when its total GPU count lies in [min_gpu, max_gpu] and
# its total CPU count lies in [min_cpu, max_cpu] (both ranges inclusive).
# A bound of -1 means "unbounded" (see `parse_condition`).
Condition = namedtuple(
    "Condition", ["min_gpu", "max_gpu", "min_cpu", "max_cpu", "group"]
)

# Rows are evaluated top to bottom and the first match wins, so the order
# matters: e.g. a test with exactly 9 GPUs is "large-gpu" because that row
# comes before "small-gpu", and a test with <= 32 CPUs is "tiny" because that
# row comes before "small".
gpu_cpu_to_concurrency_groups = [
    Condition(min_gpu=9, max_gpu=-1, min_cpu=0, max_cpu=-1, group="large-gpu"),
    # max_cpu used to be spelled -128 here; every negative bound is treated as
    # "unbounded", so -1 is the same behavior written with the usual sentinel.
    Condition(min_gpu=1, max_gpu=9, min_cpu=0, max_cpu=-1, group="small-gpu"),
    Condition(min_gpu=0, max_gpu=0, min_cpu=1025, max_cpu=-1, group="enormous"),
    Condition(min_gpu=0, max_gpu=0, min_cpu=513, max_cpu=1024, group="large"),
    Condition(min_gpu=0, max_gpu=0, min_cpu=129, max_cpu=512, group="medium"),
    Condition(min_gpu=0, max_gpu=0, min_cpu=0, max_cpu=32, group="tiny"),
    # Make sure "small" is the last in the list, because it is the fallback.
    Condition(min_gpu=0, max_gpu=0, min_cpu=0, max_cpu=128, group="small"),
]


# GCP GPU machine types, mapped to a (cpus, gpus) tuple per instance.
# These names end in "<n>g" rather than a CPU count, so they cannot be handled
# by `parse_instance_resources` and are listed explicitly.
# Obtained from https://cloud.google.com/compute/docs/accelerator-optimized-machines
gcp_gpu_instances = {
    "a2-highgpu-1g": (12, 1),
    "a2-highgpu-2g": (24, 2),
    "a2-highgpu-4g": (48, 4),
    "a2-highgpu-8g": (96, 8),
    "a2-megagpu-16g": (96, 16),
}


def load_instance_types(path: Optional[str] = None) -> Dict[str, Tuple[int, int]]:
    """Load the instance type -> resources table from a CSV file.

    Args:
        path: CSV file to read. It must start with a header row that contains
            the columns ``instance``, ``cpus`` and ``gpus``. When omitted (or
            empty), ``ray_release/buildkite/aws_instance_types.csv`` below
            ``RELEASE_PACKAGE_DIR`` is used.

    Returns:
        Mapping of instance type name to a ``(cpus, gpus)`` tuple. If an
        instance is listed more than once, the last row wins.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the header lacks one of the required columns.
        ValueError: If a ``cpus`` or ``gpus`` value is not an integer.
    """
    path = path or os.path.join(
        RELEASE_PACKAGE_DIR, "ray_release", "buildkite", "aws_instance_types.csv"
    )

    # The csv module does its own newline handling and asks for files to be
    # opened with newline="" so that line endings reach it untranslated.
    with open(path, "rt", newline="") as fp:
        return {
            row["instance"]: (int(row["cpus"]), int(row["gpus"]))
            for row in csv.DictReader(fp)
        }


def parse_instance_resources(instance: str) -> Tuple[int, int]:
    """Parse a (GCP) instance type string into ``(num_cpus, num_gpus)``.

    The CPU count is taken from the last dash-separated token of the name,
    e.g. ``"n2-standard-16"`` yields 16 CPUs. GPU instance types are assumed
    to have been resolved by the caller already, so the GPU count is always 0.

    Args:
        instance: Instance type name whose last ``-`` separated token is the
            number of CPUs.

    Returns:
        ``(num_cpus, 0)``.

    Raises:
        ValueError: If the last token of ``instance`` is not an integer. The
            message names the offending instance type so that it is useful
            when it ends up in a log line.
    """
    cpu_token = instance.rpartition("-")[2]
    try:
        num_cpus = int(cpu_token)
    except ValueError as e:
        raise ValueError(
            f"Cannot infer the CPU count from instance type {instance!r}: "
            f"trailing token {cpu_token!r} is not an integer."
        ) from e

    # Assumes that GPU instances have already been parsed
    return num_cpus, 0


def parse_condition(cond: int, limit: float = float("inf")) -> float:
    """Resolve a range bound, replacing the "unbounded" marker by ``limit``.

    Args:
        cond: Bound as written in a condition. Values greater than -1 are
            real bounds; anything else (i.e. any negative integer) means
            "no bound".
        limit: Value to use when ``cond`` means "no bound".

    Returns:
        ``cond`` itself if it is a real bound, otherwise ``limit``.
    """
    if cond > -1:
        return cond
    return limit


def get_concurrency_group(test: Test) -> Tuple[str, int]:
    """Determine the concurrency group of a release test.

    The total CPU and GPU counts of the test's cluster are checked against
    the rows of ``gpu_cpu_to_concurrency_groups`` in order; the first row
    whose inclusive CPU and GPU ranges both contain the test decides the group.

    Args:
        test: Release test to classify.

    Returns:
        ``(group_name, concurrency_limit)``. The ``"small"`` group is returned
        (after logging a warning) if the resources cannot be determined or if
        no row matches.
    """
    try:
        cpus, gpus = get_test_resources(test)
    except Exception as e:
        # Best effort: a test whose resources can't be derived is still
        # scheduled, just in the fallback group.
        logger.warning(f"Couldn't get test resources for test {test['name']}: {e}")
        return "small", CONCURRENY_GROUPS["small"]

    unbounded_low = float("-inf")
    unbounded_high = float("inf")

    def _contains_test(row) -> bool:
        # Translate the "unbounded" markers into infinities before comparing.
        gpu_low = parse_condition(row.min_gpu, unbounded_low)
        gpu_high = parse_condition(row.max_gpu, unbounded_high)
        cpu_low = parse_condition(row.min_cpu, unbounded_low)
        cpu_high = parse_condition(row.max_cpu, unbounded_high)
        return cpu_low <= cpus <= cpu_high and gpu_low <= gpus <= gpu_high

    matching_row = next(
        (row for row in gpu_cpu_to_concurrency_groups if _contains_test(row)),
        None,
    )
    if matching_row is not None:
        return matching_row.group, CONCURRENY_GROUPS[matching_row.group]

    # Return default
    logger.warning(
        f"Could not find concurrency group for test {test['name']} "
        f"based on used resources."
    )
    return "small", CONCURRENY_GROUPS["small"]


def get_test_resources(test: Test) -> Tuple[int, int]:
    """Return the total ``(cpus, gpus)`` of the cluster a test runs on.

    Loads the test's cluster compute config and sums up the resources of all
    of its nodes.
    """
    return get_test_resources_from_cluster_compute(load_test_cluster_compute(test))


def get_test_resources_from_cluster_compute(cluster_compute: Dict) -> Tuple[int, int]:
    """Sum up the CPUs and GPUs of all nodes of a cluster compute config.

    The head node counts once. Each worker node type counts ``max_workers``
    times, falling back to ``min_workers`` and then to 1 if the key is absent.
    Per-instance resources are looked up in the AWS instance table first, then
    in ``gcp_gpu_instances``, and are otherwise parsed from the instance name.

    Args:
        cluster_compute: Cluster compute config. Must contain
            ``head_node_type.instance_type``; ``worker_node_types`` is an
            optional list of dicts that each contain ``instance_type``.

    Returns:
        ``(total_cpus, total_gpus)`` over the head node and all workers.

    Raises:
        KeyError: If the head node or a worker entry has no ``instance_type``.
        ValueError: If an unknown instance type cannot be parsed.
    """
    # Add head node instance
    instances = [(cluster_compute["head_node_type"]["instance_type"], 1)]

    # Add worker node instances. Head-only clusters may omit
    # "worker_node_types" (or set it to null); treat that as "no workers"
    # rather than failing, which would push the test into the fallback group.
    for worker in cluster_compute.get("worker_node_types") or []:
        instances.append(
            (
                worker["instance_type"],
                worker.get("max_workers", worker.get("min_workers", 1)),
            )
        )

    aws_instance_types = load_instance_types()
    total_cpus = 0
    total_gpus = 0

    for instance, count in instances:
        if instance in aws_instance_types:
            instance_cpus, instance_gpus = aws_instance_types[instance]
        elif instance in gcp_gpu_instances:
            instance_cpus, instance_gpus = gcp_gpu_instances[instance]
        else:
            instance_cpus, instance_gpus = parse_instance_resources(instance)

        total_cpus += instance_cpus * count
        total_gpus += instance_gpus * count

    return total_cpus, total_gpus
[ci/release] Add support for concurrency groups (#22728) This PR adds concurrency groups to Buildkite release test runs with new release test package. Five concurrency groups are defined (large-gpu, small-gpu, large, medium, small). If not specified manually, concurrency groups are inferred from used cluster resources. Example pipeline: https://buildkite.com/ray-project/release-tests-branch/builds/55#09109eac-d22e-43bc-889e-078cfb037373 (click on Artifacts --> pipeline.json) 2022-03-02 16:35:54 +01:00			`import csv`
			`import os`
			`from collections import namedtuple`
			`from typing import Tuple, Optional, Dict`

[ci/release] Support running tests with different python versions (#24843) OSS release tests currently run with hardcoded Python 3.7 base. In the future we will want to run tests on different python versions. This PR adds support for a new `python` field in the test configuration. The python field will determine both the base image used in the Buildkite runner docker container (for Ray client compatibility) and the base image for the Anyscale cluster environments. Note that in Buildkite, we will still only wait for the python 3.7 base image before kicking off tests. That is acceptable, as we can assume that most wheels finish in a similar time, so even if we wait for the 3.7 image and kick off a 3.8 test, that runner will wait maybe for 5-10 more minutes. 2022-05-17 17:03:12 +01:00			`from ray_release.config import Test, RELEASE_PACKAGE_DIR`
			`from ray_release.template import load_test_cluster_compute`
[ci/release] Add support for concurrency groups (#22728) This PR adds concurrency groups to Buildkite release test runs with new release test package. Five concurrency groups are defined (large-gpu, small-gpu, large, medium, small). If not specified manually, concurrency groups are inferred from used cluster resources. Example pipeline: https://buildkite.com/ray-project/release-tests-branch/builds/55#09109eac-d22e-43bc-889e-078cfb037373 (click on Artifacts --> pipeline.json) 2022-03-02 16:35:54 +01:00			`from ray_release.logger import logger`

[Nightly Tests] Readjust the concurrency limit. (#23002) This PR reduces the concurrency limit. Based on the back of envelope calculation, the current concurrency limit can easily exceed the service quota. Given large == 2048 vCPUs, it will use about 20K vCPUs, which is slightly larger than the limit. 2022-03-11 00:19:38 +09:00			`# Keep 10% for the buffer.`
			`limit = int(15784 * 0.9)`

[ci/release] Add support for concurrency groups (#22728) This PR adds concurrency groups to Buildkite release test runs with new release test package. Five concurrency groups are defined (large-gpu, small-gpu, large, medium, small). If not specified manually, concurrency groups are inferred from used cluster resources. Example pipeline: https://buildkite.com/ray-project/release-tests-branch/builds/55#09109eac-d22e-43bc-889e-078cfb037373 (click on Artifacts --> pipeline.json) 2022-03-02 16:35:54 +01:00
			`CONCURRENY_GROUPS = {`
[CI] Re-balance concurrency groups to allow more quota for `large` tests (#24344) Currently nightly tests are unable to finish in a day because of concurrency group limit on `large` tests. This is an attempt to adjust the limits so buildkite can run / finish more tests. I will observe which tests fall into the `enormous` group and adjust the test resource / concurrency group limits again. 2022-04-29 14:26:16 -07:00			`"tiny": 32, # <= 1k vCPU`
			`"small": 16, # <= 2k vCPU`
			`"medium": 6, # <= 3k vCPU`
			`"large": 8, # <= 8k vCPU`
			`"enormous": 1, # <= 4k vCPU (?)`
[ci/release] Add "tiny" concurrency group, change limits (#23065) E.g. long running tests run on small clusters (often 8 CPUs) but block other jobs for a long time. We should thus add more granularity to the concurrency groups. Additionally, limits have been slightly adjusted to make more sense (e.g. 8 GPUs are now small-gpu, 9+ GPUs large-gpu, instead of 7 for small-gpu and 8 for large-gpu). 2022-03-11 18:19:38 +00:00			`"small-gpu": 8,`
			`"large-gpu": 4,`
[ci/release] Add support for concurrency groups (#22728) This PR adds concurrency groups to Buildkite release test runs with new release test package. Five concurrency groups are defined (large-gpu, small-gpu, large, medium, small). If not specified manually, concurrency groups are inferred from used cluster resources. Example pipeline: https://buildkite.com/ray-project/release-tests-branch/builds/55#09109eac-d22e-43bc-889e-078cfb037373 (click on Artifacts --> pipeline.json) 2022-03-02 16:35:54 +01:00			`}`

[Nightly Tests] Readjust the concurrency limit. (#23002) This PR reduces the concurrency limit. Based on the back of envelope calculation, the current concurrency limit can easily exceed the service quota. Given large == 2048 vCPUs, it will use about 20K vCPUs, which is slightly larger than the limit. 2022-03-11 00:19:38 +09:00
[ci/release] Add support for concurrency groups (#22728) This PR adds concurrency groups to Buildkite release test runs with new release test package. Five concurrency groups are defined (large-gpu, small-gpu, large, medium, small). If not specified manually, concurrency groups are inferred from used cluster resources. Example pipeline: https://buildkite.com/ray-project/release-tests-branch/builds/55#09109eac-d22e-43bc-889e-078cfb037373 (click on Artifacts --> pipeline.json) 2022-03-02 16:35:54 +01:00			`Condition = namedtuple(`
			`"Condition", ["min_gpu", "max_gpu", "min_cpu", "max_cpu", "group"]`
			`)`

			`gpu_cpu_to_concurrency_groups = [`
[ci/release] Add "tiny" concurrency group, change limits (#23065) E.g. long running tests run on small clusters (often 8 CPUs) but block other jobs for a long time. We should thus add more granularity to the concurrency groups. Additionally, limits have been slightly adjusted to make more sense (e.g. 8 GPUs are now small-gpu, 9+ GPUs large-gpu, instead of 7 for small-gpu and 8 for large-gpu). 2022-03-11 18:19:38 +00:00			`Condition(min_gpu=9, max_gpu=-1, min_cpu=0, max_cpu=-1, group="large-gpu"),`
			`Condition(min_gpu=1, max_gpu=9, min_cpu=0, max_cpu=-128, group="small-gpu"),`
[CI] Re-balance concurrency groups to allow more quota for `large` tests (#24344) Currently nightly tests are unable to finish in a day because of concurrency group limit on `large` tests. This is an attempt to adjust the limits so buildkite can run / finish more tests. I will observe which tests fall into the `enormous` group and adjust the test resource / concurrency group limits again. 2022-04-29 14:26:16 -07:00			`Condition(min_gpu=0, max_gpu=0, min_cpu=1025, max_cpu=-1, group="enormous"),`
			`Condition(min_gpu=0, max_gpu=0, min_cpu=513, max_cpu=1024, group="large"),`
[ci/release] Add "tiny" concurrency group, change limits (#23065) E.g. long running tests run on small clusters (often 8 CPUs) but block other jobs for a long time. We should thus add more granularity to the concurrency groups. Additionally, limits have been slightly adjusted to make more sense (e.g. 8 GPUs are now small-gpu, 9+ GPUs large-gpu, instead of 7 for small-gpu and 8 for large-gpu). 2022-03-11 18:19:38 +00:00			`Condition(min_gpu=0, max_gpu=0, min_cpu=129, max_cpu=512, group="medium"),`
			`Condition(min_gpu=0, max_gpu=0, min_cpu=0, max_cpu=32, group="tiny"),`
[CI] Re-balance concurrency groups to allow more quota for `large` tests (#24344) Currently nightly tests are unable to finish in a day because of concurrency group limit on `large` tests. This is an attempt to adjust the limits so buildkite can run / finish more tests. I will observe which tests fall into the `enormous` group and adjust the test resource / concurrency group limits again. 2022-04-29 14:26:16 -07:00			`# Make sure "small" is the last in the list, because it is the fallback.`
[ci/release] Add support for concurrency groups (#22728) This PR adds concurrency groups to Buildkite release test runs with new release test package. Five concurrency groups are defined (large-gpu, small-gpu, large, medium, small). If not specified manually, concurrency groups are inferred from used cluster resources. Example pipeline: https://buildkite.com/ray-project/release-tests-branch/builds/55#09109eac-d22e-43bc-889e-078cfb037373 (click on Artifacts --> pipeline.json) 2022-03-02 16:35:54 +01:00			`Condition(min_gpu=0, max_gpu=0, min_cpu=0, max_cpu=128, group="small"),`
			`]`


			`# Obtained from https://cloud.google.com/compute/docs/accelerator-optimized-machines`
			`gcp_gpu_instances = {`
			`"a2-highgpu-1g": (12, 1),`
			`"a2-highgpu-2g": (24, 2),`
			`"a2-highgpu-4g": (48, 4),`
			`"a2-highgpu-8g": (96, 8),`
			`"a2-megagpu-16g": (96, 16),`
			`}`


			`def load_instance_types(path: Optional[str] = None) -> Dict[str, Tuple[int, int]]:`
			`path = path or os.path.join(`
			`RELEASE_PACKAGE_DIR, "ray_release", "buildkite", "aws_instance_types.csv"`
			`)`

			`instance_to_resources = {}`
			`with open(path, "rt") as fp:`
			`reader = csv.DictReader(fp)`
			`for row in reader:`
			`instance_to_resources[row["instance"]] = (`
			`int(row["cpus"]),`
			`int(row["gpus"]),`
			`)`

			`return instance_to_resources`


			`def parse_instance_resources(instance: str) -> Tuple[int, int]:`
			`"""Parse (GCP) instance strings to resources"""`
			`# Assumes that GPU instances have already been parsed`
			`num_cpus = int(instance.split("-")[-1])`
			`num_gpus = 0`
			`return num_cpus, num_gpus`


			`def parse_condition(cond: int, limit: float = float("inf")) -> float:`
			`return cond if cond > -1 else limit`


			`def get_concurrency_group(test: Test) -> Tuple[str, int]:`
			`try:`
			`test_cpus, test_gpus = get_test_resources(test)`
			`except Exception as e:`
			`logger.warning(f"Couldn't get test resources for test {test['name']}: {e}")`
			`return "small", CONCURRENY_GROUPS["small"]`

			`for condition in gpu_cpu_to_concurrency_groups:`
			`min_gpu = parse_condition(condition.min_gpu, float("-inf"))`
			`max_gpu = parse_condition(condition.max_gpu, float("inf"))`
			`min_cpu = parse_condition(condition.min_cpu, float("-inf"))`
			`max_cpu = parse_condition(condition.max_cpu, float("inf"))`

			`if min_cpu <= test_cpus <= max_cpu and min_gpu <= test_gpus <= max_gpu:`
			`group = condition.group`
			`return group, CONCURRENY_GROUPS[group]`

			`# Return default`
			`logger.warning(`
			`f"Could not find concurrency group for test {test['name']} "`
			`f"based on used resources."`
			`)`
			`return "small", CONCURRENY_GROUPS["small"]`


			`def get_test_resources(test: Test) -> Tuple[int, int]:`
			`cluster_compute = load_test_cluster_compute(test)`
			`return get_test_resources_from_cluster_compute(cluster_compute)`


			`def get_test_resources_from_cluster_compute(cluster_compute: Dict) -> Tuple[int, int]:`
			`instances = []`

			`# Add head node instance`
			`instances.append((cluster_compute["head_node_type"]["instance_type"], 1))`

			`# Add worker node instances`
			`instances.extend(`
			`(w["instance_type"], w.get("max_workers", w.get("min_workers", 1)))`
			`for w in cluster_compute["worker_node_types"]`
			`)`

			`aws_instance_types = load_instance_types()`
			`total_cpus = 0`
			`total_gpus = 0`

			`for instance, count in instances:`
			`if instance in aws_instance_types:`
			`instance_cpus, instance_gpus = aws_instance_types[instance]`
			`elif instance in gcp_gpu_instances:`
			`instance_cpus, instance_gpus = gcp_gpu_instances[instance]`
			`else:`
			`instance_cpus, instance_gpus = parse_instance_resources(instance)`

			`total_cpus += instance_cpus * count`
			`total_gpus += instance_gpus * count`

			`return total_cpus, total_gpus`