oom release test

Signed-off-by: Clarence Ng <clarence.wyng@gmail.com>
Clarence Ng 2022-08-31 19:05:42 -07:00
parent 5cec2492bb
commit 670c7da148
4 changed files with 89 additions and 0 deletions

app_config_cpu_nightly.yaml

@@ -0,0 +1,13 @@
base_image: {{ env["RAY_IMAGE_NIGHTLY_CPU"] | default("anyscale/ray:nightly-py37") }}
env_vars: {}
debian_packages:
  - curl

python:
  pip_packages:
    - pytest
  conda_packages: []

post_build_cmds:
  - pip3 uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
  - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}
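The {{ ... }} placeholders imply the config is templated before the cluster environment is built. A minimal sketch of how the default filter resolves, assuming (not confirmed by this commit) that the release tooling renders these templates with Jinja2 against the process environment:

# Sketch only: assumes Jinja2 rendering over os.environ, as the
# template syntax above suggests.
import os

from jinja2 import Template

template = Template(
    '{{ env["RAY_IMAGE_NIGHTLY_CPU"] | default("anyscale/ray:nightly-py37") }}'
)
# Prints the image from the environment when RAY_IMAGE_NIGHTLY_CPU is
# set, otherwise the hard-coded fallback.
print(template.render(env=os.environ))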

compute_cpu_2.yaml

@@ -0,0 +1,15 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 1

head_node_type:
  name: head_node
  instance_type: m5.2xlarge

worker_node_types:
  - name: worker_node
    instance_type: m5.2xlarge
    max_workers: 1
    min_workers: 1
    use_spot: false
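This compute config produces exactly two m5.2xlarge nodes (one head plus one worker, since min_workers = max_workers = 1), matching the wait_for_nodes: num_nodes: 2 entry in the release-test definition below. A hypothetical sketch, not part of this commit, of asserting that shape from inside the test script:

# Sketch only: verify the expected two-node cluster before starting the
# OOM workload. Assumes this runs on the head node of the cluster above.
import ray

ray.init(address="auto")
alive_nodes = [node for node in ray.nodes() if node["Alive"]]
assert len(alive_nodes) == 2, f"expected 2 nodes, got {len(alive_nodes)}"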

oom_actor_test.py

@@ -0,0 +1,43 @@
"""Job submission test
This test runs a basic Tune job on a remote cluster.
Test owner: architkulkarni
Acceptance criteria: Should run through and print "PASSED"
"""
from math import ceil
import time
import ray
import psutil
def get_additional_bytes_to_reach_memory_usage_pct(pct: float) -> None:
node_mem = psutil.virtual_memory()
used = node_mem.total - node_mem.available
bytes_needed = node_mem.total * pct - used
assert bytes_needed > 0, "node has less memory than what is requested"
return bytes_needed
@ray.remote(max_retries=-1)
def inf_retry(
allocate_bytes: int, num_chunks: int = 10, allocate_interval_s: float = 0
):
start = time.time()
chunks = []
# divide by 8 as each element in the array occupies 8 bytes
bytes_per_chunk = allocate_bytes / 8 / num_chunks
for _ in range(num_chunks):
chunks.append([0] * ceil(bytes_per_chunk))
time.sleep(allocate_interval_s)
end = time.time()
return end - start
if __name__ == "__main__":
bytes_to_alloc = get_additional_bytes_to_reach_memory_usage_pct(1)
ray.get(inf_retry.remote(bytes_to_alloc))
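Since the script targets 100% of node memory, running it as-is will invoke the OOM killer on the local machine. A hypothetical local smoke run with a non-destructive target, assuming the file is saved as oom_actor_test.py (the name the release-test entry below invokes) and that current memory usage is below 50%:

# Sketch only: exercise the allocation path at 50% of total memory so
# the task completes without being OOM-killed.
import ray

from oom_actor_test import (
    get_additional_bytes_to_reach_memory_usage_pct,
    inf_retry,
)

ray.init()
extra_bytes = get_additional_bytes_to_reach_memory_usage_pct(0.5)
elapsed_s = ray.get(inf_retry.remote(extra_bytes, num_chunks=4))
print(f"allocation finished in {elapsed_s:.2f}s")
ray.shutdown()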

release_tests.yaml

@@ -4629,6 +4629,24 @@
  type: sdk_command
  file_manager: sdk

- name: oom_actor_tests
  group: core-daily-test
  working_dir: core_tests

  frequency: nightly
  team: core
  cluster:
    cluster_env: app_config_cpu_nightly.yaml
    cluster_compute: compute_cpu_2.yaml

  run:
    timeout: 600
    script: python oom_actor_test.py
    wait_for_nodes:
      num_nodes: 2

  type: sdk_command
  file_manager: sdk

- name: k8s_serve_ha_test
  group: k8s-test
  working_dir: k8s_tests
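Because inf_retry is retried indefinitely, the run.timeout of 600 seconds is what ultimately bounds the oom_actor_tests entry. A hypothetical sanity check, not part of this commit, that the entry's file references resolve; it assumes the definitions file is named release_tests.yaml and that the referenced paths are relative to working_dir:

# Sketch only: parse the release-test definitions and check that the
# files the oom_actor_tests entry points at exist on disk.
import os

import yaml

with open("release_tests.yaml") as f:
    tests = yaml.safe_load(f)

entry = next(t for t in tests if t["name"] == "oom_actor_tests")
for ref in (
    entry["cluster"]["cluster_env"],
    entry["cluster"]["cluster_compute"],
    entry["run"]["script"].split()[-1],  # "python oom_actor_test.py"
):
    path = os.path.join(entry["working_dir"], ref)
    assert os.path.exists(path), f"missing file: {path}"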