mirror of
https://github.com/vale981/ray
synced 2025-03-05 10:01:43 -05:00
[ci/release] Add support for concurrency groups (#22728)
This PR adds concurrency groups to Buildkite release test runs that use the new release test package. Five concurrency groups are defined (large-gpu, small-gpu, large, medium, small). If a concurrency group is not specified manually, it is inferred from the cluster resources the test uses. Example pipeline: https://buildkite.com/ray-project/release-tests-branch/builds/55#09109eac-d22e-43bc-889e-078cfb037373 (click on Artifacts --> pipeline.json)
This commit is contained in:
parent
04a1a19f6b
commit
7425fa6212
7 changed files with 840 additions and 6 deletions
486
release/ray_release/buildkite/aws_instance_types.csv
Normal file
486
release/ray_release/buildkite/aws_instance_types.csv
Normal file
|
@ -0,0 +1,486 @@
|
|||
instance,cpus,gpus
|
||||
a1.2xlarge,8,0
|
||||
a1.4xlarge,16,0
|
||||
a1.large,2,0
|
||||
a1.medium,1,0
|
||||
a1.metal,16,0
|
||||
a1.xlarge,4,0
|
||||
c1.medium,2,0
|
||||
c1.xlarge,8,0
|
||||
c3.2xlarge,8,0
|
||||
c3.4xlarge,16,0
|
||||
c3.8xlarge,32,0
|
||||
c3.large,2,0
|
||||
c3.xlarge,4,0
|
||||
c4.2xlarge,8,0
|
||||
c4.4xlarge,16,0
|
||||
c4.8xlarge,36,0
|
||||
c4.large,2,0
|
||||
c4.xlarge,4,0
|
||||
c5.12xlarge,48,0
|
||||
c5.18xlarge,72,0
|
||||
c5.24xlarge,96,0
|
||||
c5.2xlarge,8,0
|
||||
c5.4xlarge,16,0
|
||||
c5.9xlarge,36,0
|
||||
c5.large,2,0
|
||||
c5.metal,96,0
|
||||
c5.xlarge,4,0
|
||||
c5a.12xlarge,48,0
|
||||
c5a.16xlarge,64,0
|
||||
c5a.24xlarge,96,0
|
||||
c5a.2xlarge,8,0
|
||||
c5a.4xlarge,16,0
|
||||
c5a.8xlarge,32,0
|
||||
c5a.large,2,0
|
||||
c5a.xlarge,4,0
|
||||
c5ad.12xlarge,48,0
|
||||
c5ad.16xlarge,64,0
|
||||
c5ad.24xlarge,96,0
|
||||
c5ad.2xlarge,8,0
|
||||
c5ad.4xlarge,16,0
|
||||
c5ad.8xlarge,32,0
|
||||
c5ad.large,2,0
|
||||
c5ad.xlarge,4,0
|
||||
c5d.12xlarge,48,0
|
||||
c5d.18xlarge,72,0
|
||||
c5d.24xlarge,96,0
|
||||
c5d.2xlarge,8,0
|
||||
c5d.4xlarge,16,0
|
||||
c5d.9xlarge,36,0
|
||||
c5d.large,2,0
|
||||
c5d.metal,96,0
|
||||
c5d.xlarge,4,0
|
||||
c5n.18xlarge,72,0
|
||||
c5n.2xlarge,8,0
|
||||
c5n.4xlarge,16,0
|
||||
c5n.9xlarge,36,0
|
||||
c5n.large,2,0
|
||||
c5n.metal,72,0
|
||||
c5n.xlarge,4,0
|
||||
c6a.12xlarge,48,0
|
||||
c6a.16xlarge,64,0
|
||||
c6a.24xlarge,96,0
|
||||
c6a.2xlarge,8,0
|
||||
c6a.32xlarge,128,0
|
||||
c6a.48xlarge,192,0
|
||||
c6a.4xlarge,16,0
|
||||
c6a.8xlarge,32,0
|
||||
c6a.large,2,0
|
||||
c6a.xlarge,4,0
|
||||
c6g.12xlarge,48,0
|
||||
c6g.16xlarge,64,0
|
||||
c6g.2xlarge,8,0
|
||||
c6g.4xlarge,16,0
|
||||
c6g.8xlarge,32,0
|
||||
c6g.large,2,0
|
||||
c6g.medium,1,0
|
||||
c6g.metal,64,0
|
||||
c6g.xlarge,4,0
|
||||
c6gd.12xlarge,48,0
|
||||
c6gd.16xlarge,64,0
|
||||
c6gd.2xlarge,8,0
|
||||
c6gd.4xlarge,16,0
|
||||
c6gd.8xlarge,32,0
|
||||
c6gd.large,2,0
|
||||
c6gd.medium,1,0
|
||||
c6gd.metal,64,0
|
||||
c6gd.xlarge,4,0
|
||||
c6gn.12xlarge,48,0
|
||||
c6gn.16xlarge,64,0
|
||||
c6gn.2xlarge,8,0
|
||||
c6gn.4xlarge,16,0
|
||||
c6gn.8xlarge,32,0
|
||||
c6gn.large,2,0
|
||||
c6gn.medium,1,0
|
||||
c6gn.xlarge,4,0
|
||||
c6i.12xlarge,48,0
|
||||
c6i.16xlarge,64,0
|
||||
c6i.24xlarge,96,0
|
||||
c6i.2xlarge,8,0
|
||||
c6i.32xlarge,128,0
|
||||
c6i.4xlarge,16,0
|
||||
c6i.8xlarge,32,0
|
||||
c6i.large,2,0
|
||||
c6i.metal,128,0
|
||||
c6i.xlarge,4,0
|
||||
cc2.8xlarge,32,0
|
||||
d2.2xlarge,8,0
|
||||
d2.4xlarge,16,0
|
||||
d2.8xlarge,36,0
|
||||
d2.xlarge,4,0
|
||||
d3.2xlarge,8,0
|
||||
d3.4xlarge,16,0
|
||||
d3.8xlarge,32,0
|
||||
d3.xlarge,4,0
|
||||
d3en.12xlarge,48,0
|
||||
d3en.2xlarge,8,0
|
||||
d3en.4xlarge,16,0
|
||||
d3en.6xlarge,24,0
|
||||
d3en.8xlarge,32,0
|
||||
d3en.xlarge,4,0
|
||||
dl1.24xlarge,96,8
|
||||
f1.16xlarge,64,0
|
||||
f1.2xlarge,8,0
|
||||
f1.4xlarge,16,0
|
||||
g2.2xlarge,8,1
|
||||
g2.8xlarge,32,4
|
||||
g3.16xlarge,64,4
|
||||
g3.4xlarge,16,1
|
||||
g3.8xlarge,32,2
|
||||
g3s.xlarge,4,1
|
||||
g4ad.16xlarge,64,4
|
||||
g4ad.2xlarge,8,1
|
||||
g4ad.4xlarge,16,1
|
||||
g4ad.8xlarge,32,2
|
||||
g4ad.xlarge,4,1
|
||||
g4dn.12xlarge,48,4
|
||||
g4dn.16xlarge,64,1
|
||||
g4dn.2xlarge,8,1
|
||||
g4dn.4xlarge,16,1
|
||||
g4dn.8xlarge,32,1
|
||||
g4dn.metal,96,8
|
||||
g4dn.xlarge,4,1
|
||||
g5.12xlarge,48,4
|
||||
g5.16xlarge,64,1
|
||||
g5.24xlarge,96,4
|
||||
g5.2xlarge,8,1
|
||||
g5.48xlarge,192,8
|
||||
g5.4xlarge,16,1
|
||||
g5.8xlarge,32,1
|
||||
g5.xlarge,4,1
|
||||
g5g.16xlarge,64,2
|
||||
g5g.2xlarge,8,1
|
||||
g5g.4xlarge,16,1
|
||||
g5g.8xlarge,32,1
|
||||
g5g.metal,64,2
|
||||
g5g.xlarge,4,1
|
||||
h1.16xlarge,64,0
|
||||
h1.2xlarge,8,0
|
||||
h1.4xlarge,16,0
|
||||
h1.8xlarge,32,0
|
||||
i2.2xlarge,8,0
|
||||
i2.4xlarge,16,0
|
||||
i2.8xlarge,32,0
|
||||
i2.xlarge,4,0
|
||||
i3.16xlarge,64,0
|
||||
i3.2xlarge,8,0
|
||||
i3.4xlarge,16,0
|
||||
i3.8xlarge,32,0
|
||||
i3.large,2,0
|
||||
i3.metal,72,0
|
||||
i3.xlarge,4,0
|
||||
i3en.12xlarge,48,0
|
||||
i3en.24xlarge,96,0
|
||||
i3en.2xlarge,8,0
|
||||
i3en.3xlarge,12,0
|
||||
i3en.6xlarge,24,0
|
||||
i3en.large,2,0
|
||||
i3en.metal,96,0
|
||||
i3en.xlarge,4,0
|
||||
im4gn.16xlarge,64,0
|
||||
im4gn.2xlarge,8,0
|
||||
im4gn.4xlarge,16,0
|
||||
im4gn.8xlarge,32,0
|
||||
im4gn.large,2,0
|
||||
im4gn.xlarge,4,0
|
||||
inf1.24xlarge,96,0
|
||||
inf1.2xlarge,8,0
|
||||
inf1.6xlarge,24,0
|
||||
inf1.xlarge,4,0
|
||||
is4gen.2xlarge,8,0
|
||||
is4gen.4xlarge,16,0
|
||||
is4gen.8xlarge,32,0
|
||||
is4gen.large,2,0
|
||||
is4gen.medium,1,0
|
||||
is4gen.xlarge,4,0
|
||||
m1.large,2,0
|
||||
m1.medium,1,0
|
||||
m1.small,1,0
|
||||
m1.xlarge,4,0
|
||||
m2.2xlarge,4,0
|
||||
m2.4xlarge,8,0
|
||||
m2.xlarge,2,0
|
||||
m3.2xlarge,8,0
|
||||
m3.large,2,0
|
||||
m3.medium,1,0
|
||||
m3.xlarge,4,0
|
||||
m4.10xlarge,40,0
|
||||
m4.16xlarge,64,0
|
||||
m4.2xlarge,8,0
|
||||
m4.4xlarge,16,0
|
||||
m4.large,2,0
|
||||
m4.xlarge,4,0
|
||||
m5.12xlarge,48,0
|
||||
m5.16xlarge,64,0
|
||||
m5.24xlarge,96,0
|
||||
m5.2xlarge,8,0
|
||||
m5.4xlarge,16,0
|
||||
m5.8xlarge,32,0
|
||||
m5.large,2,0
|
||||
m5.metal,96,0
|
||||
m5.xlarge,4,0
|
||||
m5a.12xlarge,48,0
|
||||
m5a.16xlarge,64,0
|
||||
m5a.24xlarge,96,0
|
||||
m5a.2xlarge,8,0
|
||||
m5a.4xlarge,16,0
|
||||
m5a.8xlarge,32,0
|
||||
m5a.large,2,0
|
||||
m5a.xlarge,4,0
|
||||
m5ad.12xlarge,48,0
|
||||
m5ad.16xlarge,64,0
|
||||
m5ad.24xlarge,96,0
|
||||
m5ad.2xlarge,8,0
|
||||
m5ad.4xlarge,16,0
|
||||
m5ad.8xlarge,32,0
|
||||
m5ad.large,2,0
|
||||
m5ad.xlarge,4,0
|
||||
m5d.12xlarge,48,0
|
||||
m5d.16xlarge,64,0
|
||||
m5d.24xlarge,96,0
|
||||
m5d.2xlarge,8,0
|
||||
m5d.4xlarge,16,0
|
||||
m5d.8xlarge,32,0
|
||||
m5d.large,2,0
|
||||
m5d.metal,96,0
|
||||
m5d.xlarge,4,0
|
||||
m5dn.12xlarge,48,0
|
||||
m5dn.16xlarge,64,0
|
||||
m5dn.24xlarge,96,0
|
||||
m5dn.2xlarge,8,0
|
||||
m5dn.4xlarge,16,0
|
||||
m5dn.8xlarge,32,0
|
||||
m5dn.large,2,0
|
||||
m5dn.metal,96,0
|
||||
m5dn.xlarge,4,0
|
||||
m5n.12xlarge,48,0
|
||||
m5n.16xlarge,64,0
|
||||
m5n.24xlarge,96,0
|
||||
m5n.2xlarge,8,0
|
||||
m5n.4xlarge,16,0
|
||||
m5n.8xlarge,32,0
|
||||
m5n.large,2,0
|
||||
m5n.metal,96,0
|
||||
m5n.xlarge,4,0
|
||||
m5zn.12xlarge,48,0
|
||||
m5zn.2xlarge,8,0
|
||||
m5zn.3xlarge,12,0
|
||||
m5zn.6xlarge,24,0
|
||||
m5zn.large,2,0
|
||||
m5zn.metal,48,0
|
||||
m5zn.xlarge,4,0
|
||||
m6a.12xlarge,48,0
|
||||
m6a.16xlarge,64,0
|
||||
m6a.24xlarge,96,0
|
||||
m6a.2xlarge,8,0
|
||||
m6a.32xlarge,128,0
|
||||
m6a.48xlarge,192,0
|
||||
m6a.4xlarge,16,0
|
||||
m6a.8xlarge,32,0
|
||||
m6a.large,2,0
|
||||
m6a.xlarge,4,0
|
||||
m6g.12xlarge,48,0
|
||||
m6g.16xlarge,64,0
|
||||
m6g.2xlarge,8,0
|
||||
m6g.4xlarge,16,0
|
||||
m6g.8xlarge,32,0
|
||||
m6g.large,2,0
|
||||
m6g.medium,1,0
|
||||
m6g.metal,64,0
|
||||
m6g.xlarge,4,0
|
||||
m6gd.12xlarge,48,0
|
||||
m6gd.16xlarge,64,0
|
||||
m6gd.2xlarge,8,0
|
||||
m6gd.4xlarge,16,0
|
||||
m6gd.8xlarge,32,0
|
||||
m6gd.large,2,0
|
||||
m6gd.medium,1,0
|
||||
m6gd.metal,64,0
|
||||
m6gd.xlarge,4,0
|
||||
m6i.12xlarge,48,0
|
||||
m6i.16xlarge,64,0
|
||||
m6i.24xlarge,96,0
|
||||
m6i.2xlarge,8,0
|
||||
m6i.32xlarge,128,0
|
||||
m6i.4xlarge,16,0
|
||||
m6i.8xlarge,32,0
|
||||
m6i.large,2,0
|
||||
m6i.metal,128,0
|
||||
m6i.xlarge,4,0
|
||||
mac1.metal,12,0
|
||||
p2.16xlarge,64,16
|
||||
p2.8xlarge,32,8
|
||||
p2.xlarge,4,1
|
||||
p3.16xlarge,64,8
|
||||
p3.2xlarge,8,1
|
||||
p3.8xlarge,32,4
|
||||
p3dn.24xlarge,96,8
|
||||
p4d.24xlarge,96,8
|
||||
r3.2xlarge,8,0
|
||||
r3.4xlarge,16,0
|
||||
r3.8xlarge,32,0
|
||||
r3.large,2,0
|
||||
r3.xlarge,4,0
|
||||
r4.16xlarge,64,0
|
||||
r4.2xlarge,8,0
|
||||
r4.4xlarge,16,0
|
||||
r4.8xlarge,32,0
|
||||
r4.large,2,0
|
||||
r4.xlarge,4,0
|
||||
r5.12xlarge,48,0
|
||||
r5.16xlarge,64,0
|
||||
r5.24xlarge,96,0
|
||||
r5.2xlarge,8,0
|
||||
r5.4xlarge,16,0
|
||||
r5.8xlarge,32,0
|
||||
r5.large,2,0
|
||||
r5.metal,96,0
|
||||
r5.xlarge,4,0
|
||||
r5a.12xlarge,48,0
|
||||
r5a.16xlarge,64,0
|
||||
r5a.24xlarge,96,0
|
||||
r5a.2xlarge,8,0
|
||||
r5a.4xlarge,16,0
|
||||
r5a.8xlarge,32,0
|
||||
r5a.large,2,0
|
||||
r5a.xlarge,4,0
|
||||
r5ad.12xlarge,48,0
|
||||
r5ad.16xlarge,64,0
|
||||
r5ad.24xlarge,96,0
|
||||
r5ad.2xlarge,8,0
|
||||
r5ad.4xlarge,16,0
|
||||
r5ad.8xlarge,32,0
|
||||
r5ad.large,2,0
|
||||
r5ad.xlarge,4,0
|
||||
r5b.12xlarge,48,0
|
||||
r5b.16xlarge,64,0
|
||||
r5b.24xlarge,96,0
|
||||
r5b.2xlarge,8,0
|
||||
r5b.4xlarge,16,0
|
||||
r5b.8xlarge,32,0
|
||||
r5b.large,2,0
|
||||
r5b.metal,96,0
|
||||
r5b.xlarge,4,0
|
||||
r5d.12xlarge,48,0
|
||||
r5d.16xlarge,64,0
|
||||
r5d.24xlarge,96,0
|
||||
r5d.2xlarge,8,0
|
||||
r5d.4xlarge,16,0
|
||||
r5d.8xlarge,32,0
|
||||
r5d.large,2,0
|
||||
r5d.metal,96,0
|
||||
r5d.xlarge,4,0
|
||||
r5dn.12xlarge,48,0
|
||||
r5dn.16xlarge,64,0
|
||||
r5dn.24xlarge,96,0
|
||||
r5dn.2xlarge,8,0
|
||||
r5dn.4xlarge,16,0
|
||||
r5dn.8xlarge,32,0
|
||||
r5dn.large,2,0
|
||||
r5dn.metal,96,0
|
||||
r5dn.xlarge,4,0
|
||||
r5n.12xlarge,48,0
|
||||
r5n.16xlarge,64,0
|
||||
r5n.24xlarge,96,0
|
||||
r5n.2xlarge,8,0
|
||||
r5n.4xlarge,16,0
|
||||
r5n.8xlarge,32,0
|
||||
r5n.large,2,0
|
||||
r5n.metal,96,0
|
||||
r5n.xlarge,4,0
|
||||
r6g.12xlarge,48,0
|
||||
r6g.16xlarge,64,0
|
||||
r6g.2xlarge,8,0
|
||||
r6g.4xlarge,16,0
|
||||
r6g.8xlarge,32,0
|
||||
r6g.large,2,0
|
||||
r6g.medium,1,0
|
||||
r6g.metal,64,0
|
||||
r6g.xlarge,4,0
|
||||
r6gd.12xlarge,48,0
|
||||
r6gd.16xlarge,64,0
|
||||
r6gd.2xlarge,8,0
|
||||
r6gd.4xlarge,16,0
|
||||
r6gd.8xlarge,32,0
|
||||
r6gd.large,2,0
|
||||
r6gd.medium,1,0
|
||||
r6gd.metal,64,0
|
||||
r6gd.xlarge,4,0
|
||||
r6i.12xlarge,48,0
|
||||
r6i.16xlarge,64,0
|
||||
r6i.24xlarge,96,0
|
||||
r6i.2xlarge,8,0
|
||||
r6i.32xlarge,128,0
|
||||
r6i.4xlarge,16,0
|
||||
r6i.8xlarge,32,0
|
||||
r6i.large,2,0
|
||||
r6i.metal,128,0
|
||||
r6i.xlarge,4,0
|
||||
t1.micro,1,0
|
||||
t2.2xlarge,8,0
|
||||
t2.large,2,0
|
||||
t2.medium,2,0
|
||||
t2.micro,1,0
|
||||
t2.nano,1,0
|
||||
t2.small,1,0
|
||||
t2.xlarge,4,0
|
||||
t3.2xlarge,8,0
|
||||
t3.large,2,0
|
||||
t3.medium,2,0
|
||||
t3.micro,2,0
|
||||
t3.nano,2,0
|
||||
t3.small,2,0
|
||||
t3.xlarge,4,0
|
||||
t3a.2xlarge,8,0
|
||||
t3a.large,2,0
|
||||
t3a.medium,2,0
|
||||
t3a.micro,2,0
|
||||
t3a.nano,2,0
|
||||
t3a.small,2,0
|
||||
t3a.xlarge,4,0
|
||||
t4g.2xlarge,8,0
|
||||
t4g.large,2,0
|
||||
t4g.medium,2,0
|
||||
t4g.micro,2,0
|
||||
t4g.nano,2,0
|
||||
t4g.small,2,0
|
||||
t4g.xlarge,4,0
|
||||
u-12tb1.112xlarge,448,0
|
||||
u-3tb1.56xlarge,224,0
|
||||
u-6tb1.112xlarge,448,0
|
||||
u-6tb1.56xlarge,224,0
|
||||
u-9tb1.112xlarge,448,0
|
||||
vt1.24xlarge,96,0
|
||||
vt1.3xlarge,12,0
|
||||
vt1.6xlarge,24,0
|
||||
x1.16xlarge,64,0
|
||||
x1.32xlarge,128,0
|
||||
x1e.16xlarge,64,0
|
||||
x1e.2xlarge,8,0
|
||||
x1e.32xlarge,128,0
|
||||
x1e.4xlarge,16,0
|
||||
x1e.8xlarge,32,0
|
||||
x1e.xlarge,4,0
|
||||
x2gd.12xlarge,48,0
|
||||
x2gd.16xlarge,64,0
|
||||
x2gd.2xlarge,8,0
|
||||
x2gd.4xlarge,16,0
|
||||
x2gd.8xlarge,32,0
|
||||
x2gd.large,2,0
|
||||
x2gd.medium,1,0
|
||||
x2gd.metal,64,0
|
||||
x2gd.xlarge,4,0
|
||||
x2iezn.12xlarge,48,0
|
||||
x2iezn.2xlarge,8,0
|
||||
x2iezn.4xlarge,16,0
|
||||
x2iezn.6xlarge,24,0
|
||||
x2iezn.8xlarge,32,0
|
||||
x2iezn.metal,48,0
|
||||
z1d.12xlarge,48,0
|
||||
z1d.2xlarge,8,0
|
||||
z1d.3xlarge,12,0
|
||||
z1d.6xlarge,24,0
|
||||
z1d.large,2,0
|
||||
z1d.metal,48,0
|
||||
z1d.xlarge,4,0
|
|
127
release/ray_release/buildkite/concurrency.py
Normal file
127
release/ray_release/buildkite/concurrency.py
Normal file
|
@ -0,0 +1,127 @@
|
|||
import csv
|
||||
import os
|
||||
from collections import namedtuple
|
||||
from typing import Tuple, Optional, Dict
|
||||
|
||||
from ray_release.config import Test, RELEASE_PACKAGE_DIR, load_test_cluster_compute
|
||||
from ray_release.logger import logger
|
||||
|
||||
|
||||
# Concurrency limit for each concurrency group, keyed by group name.
# NOTE: the name is misspelled ("CONCURRENY"), but other modules import it
# under this spelling, so it has to stay.
CONCURRENY_GROUPS = {
    "small": 64,
    "medium": 16,
    "large": 8,
    "small-gpu": 8,
    "large-gpu": 4,
}

# Correctly spelled alias for new code; refers to the very same dict object.
CONCURRENCY_GROUPS = CONCURRENY_GROUPS

# One row of the resource -> group lookup table: inclusive GPU and CPU bounds
# plus the name of the concurrency group they select.
Condition = namedtuple(
    "Condition", ["min_gpu", "max_gpu", "min_cpu", "max_cpu", "group"]
)

# Scanned top to bottom by get_concurrency_group(); the first matching row
# wins. Bounds are inclusive and neighbouring rows share their boundary value
# (e.g. 8 GPUs, 512 CPUs), so the order of the rows matters. A bound of -1 is
# treated as "unbounded" (see parse_condition()).
gpu_cpu_to_concurrency_groups = [
    Condition(min_gpu=8, max_gpu=-1, min_cpu=0, max_cpu=-1, group="large-gpu"),
    Condition(min_gpu=1, max_gpu=8, min_cpu=0, max_cpu=-1, group="small-gpu"),
    Condition(min_gpu=0, max_gpu=0, min_cpu=512, max_cpu=-1, group="large"),
    Condition(min_gpu=0, max_gpu=0, min_cpu=128, max_cpu=512, group="medium"),
    Condition(min_gpu=0, max_gpu=0, min_cpu=0, max_cpu=128, group="small"),
]


# GCP GPU instance type -> (number of CPUs, number of GPUs).
# Obtained from https://cloud.google.com/compute/docs/accelerator-optimized-machines
gcp_gpu_instances = {
    "a2-highgpu-1g": (12, 1),
    "a2-highgpu-2g": (24, 2),
    "a2-highgpu-4g": (48, 4),
    "a2-highgpu-8g": (96, 8),
    "a2-megagpu-16g": (96, 16),
}
|
||||
|
||||
|
||||
def load_instance_types(path: Optional[str] = None) -> Dict[str, Tuple[int, int]]:
    """Load the instance type table from a CSV file.

    The CSV file must have the header columns ``instance``, ``cpus`` and
    ``gpus``.

    Args:
        path: Path to the CSV file. If not given (or empty), the
            ``aws_instance_types.csv`` file inside the release package's
            ``ray_release/buildkite`` directory is used.

    Returns:
        Dict mapping each instance type name to a ``(cpus, gpus)`` tuple.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If one of the expected columns is missing.
        ValueError: If a ``cpus`` or ``gpus`` cell is not an integer.
    """
    path = path or os.path.join(
        RELEASE_PACKAGE_DIR, "ray_release", "buildkite", "aws_instance_types.csv"
    )

    instance_to_resources = {}
    # The csv module wants files opened with newline="" so that it can do its
    # own newline handling.
    with open(path, "rt", newline="", encoding="utf-8") as fp:
        reader = csv.DictReader(fp)
        for row in reader:
            instance_to_resources[row["instance"]] = (
                int(row["cpus"]),
                int(row["gpus"]),
            )

    return instance_to_resources
|
||||
|
||||
|
||||
def parse_instance_resources(instance: str) -> Tuple[int, int]:
    """Parse (GCP) instance strings to resources.

    The CPU count is read from the last dash-separated component of the
    instance name (e.g. ``n1-standard-16`` -> 16 CPUs). The GPU count is
    always reported as 0.

    Args:
        instance: Instance type name ending in ``-<number of CPUs>``.

    Returns:
        Tuple of ``(num_cpus, num_gpus)``.

    Raises:
        ValueError: If the last component of the name is not an integer.
    """
    # Assumes that GPU instances have already been parsed
    cpu_suffix = instance.rsplit("-", 1)[-1]
    try:
        num_cpus = int(cpu_suffix)
    except ValueError:
        # Re-raise with the full instance name; the bare int() message only
        # shows the suffix and does not say which instance type was unknown.
        raise ValueError(
            f"Cannot infer the number of CPUs from instance type "
            f"{instance!r}: expected a name ending in '-<num_cpus>'."
        ) from None
    num_gpus = 0
    return num_cpus, num_gpus
|
||||
|
||||
|
||||
def parse_condition(cond: int, limit: float = float("inf")) -> float:
    """Resolve a condition bound, mapping "unbounded" markers to ``limit``.

    Args:
        cond: The configured bound. Any value of -1 or below means that the
            bound is not set.
        limit: Value to use when the bound is not set.

    Returns:
        ``cond`` if it is greater than -1, otherwise ``limit``.
    """
    if cond > -1:
        return cond
    return limit
|
||||
|
||||
|
||||
def get_concurrency_group(test: Test) -> Tuple[str, int]:
    """Pick the concurrency group for a test from the resources it uses.

    The rows of ``gpu_cpu_to_concurrency_groups`` are checked in order and the
    first row whose CPU and GPU ranges both contain the test's totals is used.

    Args:
        test: The release test to classify.

    Returns:
        Tuple of ``(group name, concurrency limit of that group)``. Falls back
        to the ``"small"`` group (with a logged warning) if the resources
        cannot be determined or if no row matches.
    """
    try:
        test_cpus, test_gpus = get_test_resources(test)
    except Exception as e:
        # Best effort: a test with unknown resources is still scheduled.
        logger.warning(f"Couldn't get test resources for test {test['name']}: {e}")
        return "small", CONCURRENY_GROUPS["small"]

    for row in gpu_cpu_to_concurrency_groups:
        # Unset bounds (-1) become -inf / +inf so they never exclude a test.
        gpu_lower = parse_condition(row.min_gpu, float("-inf"))
        gpu_upper = parse_condition(row.max_gpu, float("inf"))
        cpu_lower = parse_condition(row.min_cpu, float("-inf"))
        cpu_upper = parse_condition(row.max_cpu, float("inf"))

        if not (
            cpu_lower <= test_cpus <= cpu_upper
            and gpu_lower <= test_gpus <= gpu_upper
        ):
            continue

        return row.group, CONCURRENY_GROUPS[row.group]

    # Return default
    logger.warning(
        f"Could not find concurrency group for test {test['name']} "
        f"based on used resources."
    )
    return "small", CONCURRENY_GROUPS["small"]
|
||||
|
||||
|
||||
def get_test_resources(test: Test) -> Tuple[int, int]:
    """Return the total ``(cpus, gpus)`` of the cluster a test runs on.

    Loads the test's cluster compute configuration and sums up the resources
    of all of its nodes.
    """
    return get_test_resources_from_cluster_compute(load_test_cluster_compute(test))
|
||||
|
||||
|
||||
def get_test_resources_from_cluster_compute(cluster_compute: Dict) -> Tuple[int, int]:
    """Sum up the CPUs and GPUs of all nodes in a cluster compute config.

    The head node is counted once. Each worker node type is counted
    ``max_workers`` times, falling back to ``min_workers`` and then to 1 if
    the respective value is missing or null.

    Instance types are looked up in the AWS instance type table first, then in
    ``gcp_gpu_instances``, and are otherwise parsed with
    ``parse_instance_resources()``.

    Args:
        cluster_compute: Cluster compute config with a ``head_node_type``
            entry and an optional ``worker_node_types`` list.

    Returns:
        Tuple of ``(total_cpus, total_gpus)``.

    Raises:
        KeyError: If the head node or a worker node has no ``instance_type``.
        ValueError: If an instance type is unknown and cannot be parsed.
    """
    # Add head node instance
    instances = [(cluster_compute["head_node_type"]["instance_type"], 1)]

    # Add worker node instances. The key may be missing or null (head-only
    # cluster), which must not abort the resource calculation.
    for worker in cluster_compute.get("worker_node_types") or []:
        count = worker.get("max_workers")
        if count is None:
            count = worker.get("min_workers")
        if count is None:
            count = 1
        instances.append((worker["instance_type"], count))

    aws_instance_types = load_instance_types()
    total_cpus = 0
    total_gpus = 0

    for instance, count in instances:
        if instance in aws_instance_types:
            instance_cpus, instance_gpus = aws_instance_types[instance]
        elif instance in gcp_gpu_instances:
            instance_cpus, instance_gpus = gcp_gpu_instances[instance]
        else:
            instance_cpus, instance_gpus = parse_instance_resources(instance)

        total_cpus += instance_cpus * count
        total_gpus += instance_gpus * count

    return total_cpus, total_gpus
|
|
@ -25,6 +25,21 @@ frequency_str_to_enum = {
|
|||
}
|
||||
|
||||
|
||||
class Priority(enum.Enum):
    """Priority levels that can be selected for a release test run.

    Each member's value is a plain integer.
    """

    DEFAULT = 0
    MANUAL = 10
    HIGH = 50
    HIGHEST = 100


# Lookup table from the lower-case member name ("default", "manual", "high",
# "highest") to the corresponding Priority member.
priority_str_to_enum = {member.name.lower(): member for member in Priority}
|
||||
|
||||
|
||||
def get_frequency(frequency_str: str) -> Frequency:
|
||||
frequency_str = frequency_str.lower()
|
||||
if frequency_str not in frequency_str_to_enum:
|
||||
|
@ -35,6 +50,16 @@ def get_frequency(frequency_str: str) -> Frequency:
|
|||
return frequency_str_to_enum[frequency_str]
|
||||
|
||||
|
||||
def get_priority(priority_str: str) -> Priority:
    """Translate a priority name into its ``Priority`` member.

    The lookup is case-insensitive.

    Args:
        priority_str: Name of the priority, e.g. ``"manual"``.

    Returns:
        The matching ``Priority`` member.

    Raises:
        ReleaseTestConfigError: If the name is not a known priority.
    """
    priority_str = priority_str.lower()
    priority = priority_str_to_enum.get(priority_str)
    if priority is None:
        raise ReleaseTestConfigError(
            f"Priority not found: {priority_str}. Must be one of "
            f"{list(priority_str_to_enum.keys())}."
        )
    return priority
|
||||
|
||||
|
||||
def split_ray_repo_str(repo_str: str) -> Tuple[str, str]:
|
||||
if "https://" in repo_str:
|
||||
if "/tree/" in repo_str:
|
||||
|
@ -83,6 +108,8 @@ def get_default_settings() -> Dict:
|
|||
"ray_wheels": None,
|
||||
"ray_test_repo": None,
|
||||
"ray_test_branch": None,
|
||||
"priority": Priority.DEFAULT,
|
||||
"no_concurrency_limit": False,
|
||||
}
|
||||
return settings
|
||||
|
||||
|
@ -104,6 +131,12 @@ def update_settings_from_environment(settings: Dict) -> Dict:
|
|||
if "TEST_NAME" in os.environ:
|
||||
settings["test_name_filter"] = os.environ["TEST_NAME"]
|
||||
|
||||
if "RELEASE_PRIORITY" in os.environ:
|
||||
settings["priority"] = get_priority(os.environ["RELEASE_PRIORITY"])
|
||||
|
||||
if "NO_CONCURRENCY_LIMIT" in os.environ:
|
||||
settings["no_concurrency_limit"] = bool(int(os.environ["NO_CONCURRENCY_LIMIT"]))
|
||||
|
||||
return settings
|
||||
|
||||
|
||||
|
@ -126,4 +159,12 @@ def update_settings_from_buildkite(settings: Dict):
|
|||
if ray_wheels:
|
||||
settings["test_name_filter"] = test_name_filter
|
||||
|
||||
test_priority = get_buildkite_prompt_value("release-priority")
|
||||
if test_priority:
|
||||
settings["priority"] = get_priority(test_priority)
|
||||
|
||||
no_concurrency_limit = get_buildkite_prompt_value("release-no-concurrency-limit")
|
||||
if no_concurrency_limit == "yes":
|
||||
settings["no_concurrency_limit"] = True
|
||||
|
||||
return settings
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
import copy
|
||||
from typing import Optional, Dict
|
||||
|
||||
from ray_release.buildkite.concurrency import CONCURRENY_GROUPS, get_concurrency_group
|
||||
from ray_release.config import Test, get_test_env_var
|
||||
from ray_release.exception import ReleaseTestConfigError
|
||||
|
||||
DEFAULT_STEP_TEMPLATE = {
|
||||
"env": {
|
||||
|
@ -30,6 +32,7 @@ DEFAULT_STEP_TEMPLATE = {
|
|||
}
|
||||
],
|
||||
"artifact_paths": ["/tmp/ray_release_test_artifacts/**/*"],
|
||||
"priority": 0,
|
||||
}
|
||||
|
||||
|
||||
|
@ -38,6 +41,7 @@ def get_step(
|
|||
smoke_test: bool = False,
|
||||
ray_wheels: Optional[str] = None,
|
||||
env: Optional[Dict] = None,
|
||||
priority_val: int = 0,
|
||||
):
|
||||
env = env or {}
|
||||
|
||||
|
@ -57,6 +61,21 @@ def get_step(
|
|||
branch = get_test_env_var("RAY_BRANCH")
|
||||
label = commit[:7] if commit else branch
|
||||
|
||||
concurrency_group = test.get("concurrency_group", None)
|
||||
if concurrency_group:
|
||||
if concurrency_group not in CONCURRENY_GROUPS:
|
||||
raise ReleaseTestConfigError(
|
||||
f"Unknown concurrency group: {concurrency_group}"
|
||||
)
|
||||
concurrency_limit = CONCURRENY_GROUPS[concurrency_group]
|
||||
else:
|
||||
concurrency_group, concurrency_limit = get_concurrency_group(test)
|
||||
|
||||
step["concurrency_group"] = concurrency_group
|
||||
step["concurrency"] = concurrency_limit
|
||||
|
||||
step["priority"] = priority_val
|
||||
|
||||
step["label"] = test["name"]
|
||||
if smoke_test:
|
||||
step["label"] += " [smoke test] "
|
||||
|
|
|
@ -69,14 +69,17 @@ def main(test_collection_file: Optional[str] = None):
|
|||
frequency = settings["frequency"]
|
||||
test_name_filter = settings["test_name_filter"]
|
||||
ray_wheels = settings["ray_wheels"]
|
||||
priority = settings["priority"]
|
||||
|
||||
logger.info(
|
||||
f"Found the following buildkite pipeline settings:\n\n"
|
||||
f" frequency = {settings['frequency']}\n"
|
||||
f" test_name_filter = {settings['test_name_filter']}\n"
|
||||
f" ray_wheels = {settings['ray_wheels']}\n"
|
||||
f" ray_test_repo = {settings['ray_test_repo']}\n"
|
||||
f" ray_test_branch = {settings['ray_test_branch']}\n"
|
||||
f" frequency = {settings['frequency']}\n"
|
||||
f" test_name_filter = {settings['test_name_filter']}\n"
|
||||
f" ray_wheels = {settings['ray_wheels']}\n"
|
||||
f" ray_test_repo = {settings['ray_test_repo']}\n"
|
||||
f" ray_test_branch = {settings['ray_test_branch']}\n"
|
||||
f" priority = {settings['priority']}\n"
|
||||
f" no_concurrency_limit = {settings['no_concurrency_limit']}\n"
|
||||
)
|
||||
|
||||
filtered_tests = filter_tests(
|
||||
|
@ -108,14 +111,27 @@ def main(test_collection_file: Optional[str] = None):
|
|||
)
|
||||
logger.info(f"Starting pipeline for Ray wheel: {ray_wheels_url}")
|
||||
|
||||
no_concurrency_limit = settings["no_concurrency_limit"]
|
||||
if no_concurrency_limit:
|
||||
logger.warning("Concurrency is not limited for this run!")
|
||||
|
||||
steps = []
|
||||
for group in sorted(grouped_tests):
|
||||
tests = grouped_tests[group]
|
||||
group_steps = []
|
||||
for test, smoke_test in tests:
|
||||
step = get_step(
|
||||
test, smoke_test=smoke_test, ray_wheels=ray_wheels_url, env=env
|
||||
test,
|
||||
smoke_test=smoke_test,
|
||||
ray_wheels=ray_wheels_url,
|
||||
env=env,
|
||||
priority_val=priority.value,
|
||||
)
|
||||
|
||||
if no_concurrency_limit:
|
||||
step.pop("concurrency", None)
|
||||
step.pop("concurrency_group", None)
|
||||
|
||||
group_steps.append(step)
|
||||
|
||||
group_step = {"group": group, "steps": group_steps}
|
||||
|
@ -131,6 +147,7 @@ def main(test_collection_file: Optional[str] = None):
|
|||
json.dump(steps, fp)
|
||||
|
||||
settings["frequency"] = settings["frequency"].value
|
||||
settings["priority"] = settings["priority"].value
|
||||
with open(os.path.join(PIPELINE_ARTIFACT_PATH, "settings.json"), "wt") as fp:
|
||||
json.dump(settings, fp)
|
||||
|
||||
|
|
45
release/ray_release/scripts/get_aws_instance_information.py
Normal file
45
release/ray_release/scripts/get_aws_instance_information.py
Normal file
|
@ -0,0 +1,45 @@
|
|||
import csv
import sys
from typing import Dict, List, Tuple, Union

import boto3
|
||||
|
||||
|
||||
def get_aws_instance_information() -> List[Dict[str, Union[str, int]]]:
    """Fetch the CPU and GPU count of every EC2 instance type.

    Calls ``describe_instance_types`` on a boto3 EC2 client repeatedly,
    following ``NextToken`` until the response no longer contains one.

    Returns:
        One dict per instance type with the keys ``instance`` (the instance
        type name), ``cpus`` (the default vCPU count) and ``gpus`` (the sum of
        the counts of all listed GPUs, 0 if there is no GPU information).
    """
    rows = []
    client = boto3.client("ec2")

    # Request arguments; only carries the pagination token between calls.
    args = {}
    while True:
        result = client.describe_instance_types(**args)

        for instance in result["InstanceTypes"]:
            num_cpus = instance["VCpuInfo"]["DefaultVCpus"]
            # "GpuInfo" may be absent from an entry; count that as zero GPUs.
            num_gpus = sum(
                gpu["Count"] for gpu in instance.get("GpuInfo", {"Gpus": []})["Gpus"]
            )
            rows.append(
                {
                    "instance": instance["InstanceType"],
                    "cpus": num_cpus,
                    "gpus": num_gpus,
                }
            )

        if "NextToken" not in result:
            break

        args["NextToken"] = result["NextToken"]

    return rows
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Print the instance table as CSV on stdout, sorted by instance type name.
    rows = get_aws_instance_information()

    writer = csv.DictWriter(sys.stdout, fieldnames=["instance", "cpus", "gpus"])
    writer.writeheader()
    writer.writerows(sorted(rows, key=lambda item: item["instance"]))
|
|
@ -3,6 +3,11 @@ import unittest
|
|||
from typing import Dict
|
||||
from unittest.mock import patch
|
||||
|
||||
from ray_release.buildkite.concurrency import (
|
||||
get_test_resources_from_cluster_compute,
|
||||
get_concurrency_group,
|
||||
CONCURRENY_GROUPS,
|
||||
)
|
||||
from ray_release.buildkite.filter import filter_tests, group_tests
|
||||
from ray_release.buildkite.settings import (
|
||||
split_ray_repo_str,
|
||||
|
@ -10,6 +15,7 @@ from ray_release.buildkite.settings import (
|
|||
update_settings_from_environment,
|
||||
Frequency,
|
||||
update_settings_from_buildkite,
|
||||
Priority,
|
||||
)
|
||||
from ray_release.buildkite.step import get_step
|
||||
from ray_release.config import Test
|
||||
|
@ -74,11 +80,18 @@ class BuildkiteSettingsTest(unittest.TestCase):
|
|||
with self.assertRaises(ReleaseTestConfigError):
|
||||
update_settings_from_environment(updated_settings)
|
||||
|
||||
# Invalid priority
|
||||
os.environ["RELEASE_PRIORITY"] = "invalid"
|
||||
updated_settings = settings.copy()
|
||||
with self.assertRaises(ReleaseTestConfigError):
|
||||
update_settings_from_environment(updated_settings)
|
||||
|
||||
os.environ["RELEASE_FREQUENCY"] = "nightly"
|
||||
os.environ["RAY_TEST_REPO"] = "https://github.com/user/ray.git"
|
||||
os.environ["RAY_TEST_BRANCH"] = "sub/branch"
|
||||
os.environ["RAY_WHEELS"] = "custom-wheels"
|
||||
os.environ["TEST_NAME"] = "name_filter"
|
||||
os.environ["RELEASE_PRIORITY"] = "manual"
|
||||
updated_settings = settings.copy()
|
||||
update_settings_from_environment(updated_settings)
|
||||
|
||||
|
@ -90,6 +103,8 @@ class BuildkiteSettingsTest(unittest.TestCase):
|
|||
"ray_wheels": "custom-wheels",
|
||||
"ray_test_repo": "https://github.com/user/ray.git",
|
||||
"ray_test_branch": "sub/branch",
|
||||
"priority": Priority.MANUAL,
|
||||
"no_concurrency_limit": False,
|
||||
},
|
||||
)
|
||||
|
||||
|
@ -113,10 +128,17 @@ class BuildkiteSettingsTest(unittest.TestCase):
|
|||
with self.assertRaises(ReleaseTestConfigError):
|
||||
update_settings_from_buildkite(updated_settings)
|
||||
|
||||
# Invalid priority
|
||||
self.buildkite["release-priority"] = "invalid"
|
||||
updated_settings = settings.copy()
|
||||
with self.assertRaises(ReleaseTestConfigError):
|
||||
update_settings_from_buildkite(updated_settings)
|
||||
|
||||
self.buildkite["release-frequency"] = "nightly"
|
||||
self.buildkite["release-ray-test-repo-branch"] = "user:sub/branch"
|
||||
self.buildkite["release-ray-wheels"] = "custom-wheels"
|
||||
self.buildkite["release-test-name"] = "name_filter"
|
||||
self.buildkite["release-priority"] = "manual"
|
||||
updated_settings = settings.copy()
|
||||
update_settings_from_buildkite(updated_settings)
|
||||
|
||||
|
@ -128,6 +150,8 @@ class BuildkiteSettingsTest(unittest.TestCase):
|
|||
"ray_wheels": "custom-wheels",
|
||||
"ray_test_repo": "https://github.com/user/ray.git",
|
||||
"ray_test_branch": "sub/branch",
|
||||
"priority": Priority.MANUAL,
|
||||
"no_concurrency_limit": False,
|
||||
},
|
||||
)
|
||||
|
||||
|
@ -238,3 +262,78 @@ class BuildkiteSettingsTest(unittest.TestCase):
|
|||
|
||||
step = get_step(test, smoke_test=True)
|
||||
self.assertIn("--smoke-test", step["command"])
|
||||
|
||||
step = get_step(test, priority_val=20)
|
||||
self.assertEqual(step["priority"], 20)
|
||||
|
||||
def testInstanceResources(self):
    """Total cluster resources are computed for AWS and GCP style configs."""
    # AWS instances
    aws_cluster_compute = {
        "head_node_type": {"instance_type": "m5.4xlarge"},  # 16 CPUs, 0 GPUs
        "worker_node_types": [
            # 32 CPUs, 0 GPUs each
            {"instance_type": "m5.8xlarge", "max_workers": 4},
            # 32 CPUs, 2 GPUs each
            {"instance_type": "g3.8xlarge", "min_workers": 8},
        ],
    }
    cpus, gpus = get_test_resources_from_cluster_compute(aws_cluster_compute)
    self.assertEqual(cpus, 16 + 32 * 4 + 32 * 8)
    self.assertEqual(gpus, 2 * 8)

    # Instance names that are not in the AWS table
    other_cluster_compute = {
        "head_node_type": {"instance_type": "n1-standard-16"},  # 16 CPUs, 0 GPUs
        "worker_node_types": [
            # 32 CPUs, 0 GPUs each
            {"instance_type": "random-str-xxx-32", "max_workers": 4},
            # 24 CPUs, 2 GPUs each
            {"instance_type": "a2-highgpu-2g", "min_workers": 8},
        ],
    }
    cpus, gpus = get_test_resources_from_cluster_compute(other_cluster_compute)
    self.assertEqual(cpus, 16 + 32 * 4 + 24 * 8)
    self.assertEqual(gpus, 2 * 8)
|
||||
|
||||
def testConcurrencyGroups(self):
    """Tests are mapped to the expected concurrency group and limit."""
    test = Test({"name": "test_1"})

    # (total CPUs, total GPUs, expected concurrency group)
    cases = [
        (12800, 8, "large-gpu"),
        (12800, 7, "small-gpu"),
        (12800, 1, "small-gpu"),
        (12800, 0, "large"),
        (512, 0, "large"),
        (511, 0, "medium"),
        (128, 0, "medium"),
        (127, 0, "small"),
        (1, 0, "small"),
    ]

    for cpu, gpu, group in cases:
        # Replace the resource lookup so that no cluster compute is loaded.
        with patch(
            "ray_release.buildkite.concurrency.get_test_resources",
            return_value=(cpu, gpu),
        ):
            group_name, limit = get_concurrency_group(test)

        self.assertEqual(group_name, group)
        self.assertEqual(limit, CONCURRENY_GROUPS[group_name])
|
||||
|
|
Loading…
Add table
Reference in a new issue