oom release test

Signed-off-by: Clarence Ng <clarence.wyng@gmail.com>
2025-03-06 10:31:39 -05:00 · 2022-08-31 19:05:42 -07:00 · 2022-08-31 19:05:42 -07:00 · 670c7da148
commit 670c7da148
parent 5cec2492bb
4 changed files with 89 additions and 0 deletions
--- a/release/core_tests/app_cpu_nightly.yaml
+++ b/release/core_tests/app_cpu_nightly.yaml
@ -0,0 +1,13 @@
+base_image: {{ env["RAY_IMAGE_NIGHTLY_CPU"] | default("anyscale/ray:nightly-py37") }}
+env_vars: {}
+debian_packages:
+  - curl
+
+python:
+  pip_packages:
+    - pytest
+  conda_packages: []
+
+post_build_cmds:
+  - pip3 uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
+  - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}
--- a/release/core_tests/compute_2_cpu.yaml
+++ b/release/core_tests/compute_2_cpu.yaml
@ -0,0 +1,15 @@
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west-2
+
+max_workers: 1
+
+head_node_type:
+    name: head_node
+    instance_type: m5.2xlarge
+
+worker_node_types:
+    - name: worker_node
+      instance_type: m5.2xlarge
+      max_workers: 1
+      min_workers: 1
+      use_spot: false
--- a/release/core_tests/oom_actor_test.py
+++ b/release/core_tests/oom_actor_test.py
@ -0,0 +1,43 @@
+
+"""OOM release test
+
+This test runs an infinite-retry Ray task that allocates the node's free memory.
+
+Test owner: architkulkarni
+
+Acceptance criteria: Should run through and print "PASSED"
+"""
+
+from math import ceil
+import time
+import ray
+import psutil
+
+
+def get_additional_bytes_to_reach_memory_usage_pct(pct: float) -> float:
+    """Return bytes still needed for node memory usage to reach fraction pct."""
+    node_mem = psutil.virtual_memory()
+    bytes_needed = node_mem.total * pct - (node_mem.total - node_mem.available)
+    assert bytes_needed > 0, "node already uses more memory than requested"
+    return bytes_needed
+
+
+@ray.remote(max_retries=-1)
+def inf_retry(
+    allocate_bytes: int, num_chunks: int = 10, allocate_interval_s: float = 0
+):
+    """Allocate roughly allocate_bytes in num_chunks steps; return seconds taken."""
+    began = time.time()
+    held = []  # keeps every chunk referenced so it is not garbage collected
+    # each list element occupies 8 bytes, hence the division by 8
+    elements_per_chunk = allocate_bytes / 8 / num_chunks
+    for _ in range(num_chunks):
+        held.append([0] * ceil(elements_per_chunk))
+        time.sleep(allocate_interval_s)
+    return time.time() - began
+
+
+if __name__ == "__main__":
+    bytes_to_alloc = get_additional_bytes_to_reach_memory_usage_pct(1)  # 100%
+    ray.get(inf_retry.remote(bytes_to_alloc))  # block until the task returns
+    print("PASSED")
--- a/release/release_tests.yaml
+++ b/release/release_tests.yaml
@ -4629,6 +4629,24 @@
    type: sdk_command
    file_manager: sdk

+- name: oom_actor_tests  # runs oom_actor_test.py on a two-node CPU cluster
+  group: core-daily-test
+  working_dir: core_tests
+
+  frequency: nightly
+  team: core
+  cluster:
+    cluster_env: app_cpu_nightly.yaml
+    cluster_compute: compute_2_cpu.yaml
+
+  run:
+    timeout: 600
+    script: python oom_actor_test.py
+    wait_for_nodes:
+      num_nodes: 2
+    type: sdk_command
+    file_manager: sdk
+
 - name: k8s_serve_ha_test
  group: k8s-test
  working_dir: k8s_tests