[TEST] Additional data processing nightly test (#16078)
* in progress
* in progress
* almost done
* Lint
* almost done
* All tests are available now
* Make the test a little more stressful
* Modify parameter to make tests a little more stressful
This commit is contained in:
parent d390344a8f
commit c8a5d7ba85
12 changed files with 139 additions and 75 deletions
.gitignore (vendored) | 4 ++++

@@ -190,3 +190,7 @@ project-id
 # gitpod cache related
 .pip-cache/
 .bazel-cache/
+
+# release test related
+.anyscale.yaml
+test_state.json

@@ -1,7 +1,8 @@
-#!/usr/bin/env bash
+#!/bin/bash

 # NOTE: Only working for Python 3.7 on MacOS.
 # NOTE: Please modify the wheel URL.
-DASK_VERSION=("2021.4.0" "2021.3.1" "2021.2.0" "2021.1.1" "2020.12.0")
+DASK_VERSION=("2021.5.0" "2021.4.1" "2021.4.0" "2021.3.1" "2021.2.0" "2021.1.1" "2020.12.0")

 unset RAY_ADDRESS

@@ -12,21 +13,21 @@ echo "Please run vi dask-on-ray-test.sh and modify the ray wheel properly."
 echo "Also make sure that you are in the right branch on your repo."
 echo "For example, if you are using releases/1.3.0 wheel, you should checkout to that repo."
 echo "Example: git checkout -b releases/1.3.0 upstream/releases/1.3.0"
 exit 1
-# pip uninstall -y ray
-# pip install -U "ray[full] @ https://s3-us-west-2.amazonaws.com/ray-wheels/releases/1.3.0/cb3661e547662f309a0cc55c5495b3adb779a309/ray-1.3.0-cp37-cp37m-macosx_10_13_intel.whl"
-#exit 1
+pip uninstall -y ray
+pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/1.4.0/0e95428a5975b774d266893102c39c8e137da5d8/ray-1.4.0-cp38-cp38-manylinux2014_x86_64.whl

 for dask_version in "${DASK_VERSION[@]}"
 do # The quotes are necessary here
     echo "=================================================="
     echo "Downloading Dask of version '${dask_version}'"
     pip uninstall -y dask
-    pip install -U dask=="$dask_version"
+    pip install -U "dask[complete]==${dask_version}"
     printf "==================================================\n\n\n"
     echo "=================================================="
     echo "Running tests against dask version ${dask_version}"
-    pytest -v ../../python/ray/tests/test_dask_scheduler.py
-    pytest -v ../../python/ray/tests/test_dask_callback.py
-    pytest -v ../../python/ray/tests/test_dask_optimization.py
+    python -m pytest -v ../../python/ray/tests/test_dask_scheduler.py
+    python -m pytest -v ../../python/ray/tests/test_dask_callback.py
+    python -m pytest -v ../../python/ray/tests/test_dask_optimization.py
     printf "==================================================\n\n\n"
 done
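
For context, the scheduler this script exercises is enabled from Python by handing Dask the ray_dask_get scheduler, roughly as below (a minimal sketch of the documented Dask-on-Ray entry point; compatibility across the pinned Dask versions is exactly what the loop above verifies):

    import ray
    import dask
    import dask.array as da
    from ray.util.dask import ray_dask_get

    ray.init()
    # Route every Dask .compute() through the Ray scheduler under test.
    dask.config.set(scheduler=ray_dask_get)

    arr = da.random.random((1000, 1000), chunks=(100, 100))
    print(arr.sum().compute())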

@@ -23,7 +23,7 @@ docker:
     container_name: ""
     # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
     # if no cached version is present.
-    pull_before_run: True
+    pull_before_run: False
     run_options: []  # Extra options to pass into "docker run"

 # Example of running a GPU head with CPU workers

@@ -43,7 +43,7 @@ provider:
    # Availability zone(s), comma-separated, that nodes may be launched in.
    # Nodes are currently spread between zones by a round-robin approach,
    # however this implementation detail should not be relied upon.
-   availability_zone: us-west-2a,us-west-2b
+   availability_zone: us-west-2a
    # Whether to allow node reuse. If set to False, nodes will be terminated
    # instead of stopped.
    cache_stopped_nodes: True  # If not present, the default is True.

@@ -62,13 +62,12 @@ available_node_types:
            InstanceType: r5n.16xlarge
        # For AWS instances, autoscaler will automatically add the available
        # CPUs/GPUs/accelerator_type ({"CPU": 4} for m4.xlarge) in "resources".
-       # resources: {"CPU": 4}
+       resources: {"CPU": 0}
    worker_node_t:
        node_config:
            InstanceType: r5.4xlarge
        # Autoscaler will auto fill the CPU resources below.
-       resources: {"CPU": 96}
+       resources: {"CPU": 16}
        min_workers: 250
        max_workers: 250

@@ -140,9 +139,9 @@ setup_commands:
 # Command to start ray on the head node. You don't need to change this.
 head_start_ray_commands:
     - ray stop
-    - ray start --num-cpus=0 --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --system-config='{"asio_event_loop_stats_collection_enabled":true,"scheduler_loadbalance_spillback":true}'
+    - ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --system-config='{"asio_event_loop_stats_collection_enabled":true,"locality_aware_leasing_enabled":true,"scheduler_hybrid_scheduling":false,"scheduler_loadbalance_spillback":true,"overcommit_plasma_memory":false}'

 # Command to start ray on worker nodes. You don't need to change this.
 worker_start_ray_commands:
     - ray stop
-    - ray start --address=$RAY_HEAD_IP:6379 --num-cpus=96 --object-manager-port=8076
+    - ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076

@@ -37,12 +37,13 @@ we determine the batch_size based on script parameters.

 """

-MINUTES_IN_A_MONTH = 43800
+# MINUTES_IN_A_MONTH = 43800
+MINUTES_IN_A_MONTH = 10950
 NUM_MINS_PER_OUTPUT_FILE = 30
-SAMPLING_RATE = 2000000
-SECONDS_IN_A_MIN = 60
+SAMPLING_RATE = 200000
+SECONDS_IN_A_MIN = 20
 INPUT_SHAPE = (3, SAMPLING_RATE * SECONDS_IN_A_MIN)
-PEAK_MEMORY_CONSUMPTION_IN_GB = 60
+PEAK_MEMORY_CONSUMPTION_IN_GB = 20

 logging.basicConfig(
     format="%(asctime)s %(levelname)-8s %(message)s",

@@ -0,0 +1,14 @@
+base_image: "anyscale/ray-ml:pinned-nightly-py37"
+env_vars: {"RAY_scheduler_hybrid_threshold": "0"}
+debian_packages: []
+
+python:
+  pip_packages: ["dask[complete]", tqdm, scipy, xarray, zarr, boto, s3fs, pyarrow]
+  conda_packages: []
+
+post_build_cmds:
+  # - pip install fastparquet
+  - pip3 uninstall -y ray
+  - pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
+  - pip3 install -U ray[default]
+  - echo {{env["DATESTAMP"]}}

@@ -1,16 +1,15 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2

-max_workers: 20

 # TODO(ekl/sang) switch to i2.8xl and mount the NVMe disks
 head_node_type:
     name: head_node
     instance_type: i3.8xlarge
     resources: {"object_store_memory": 53687091200}

 worker_node_types:
     - name: worker_node
       instance_type: i3.8xlarge
-      min_workers: 20
-      max_workers: 20
+      min_workers: 4
+      max_workers: 4
       use_spot: false
       resources: {"object_store_memory": 53687091200}

@@ -15,6 +15,7 @@ import dask.array
 import xarray
 from ray.util.dask import ray_dask_get
 import math
+import json
 """
 We simulate a real-life usecase where we process a time-series
 data of 1 month, using Dask/Xarray on a Ray cluster.

@@ -27,22 +28,22 @@ Perform decimation to reduce data size.
 Perform decimation to reduce data size.

 (3) Segment the Xarray from (2) into 30-minute Xarrays;
-    at this point, we have 43800 / 30 = 1460 Xarrays.
+    at this point, we have 4380 / 30 = 146 Xarrays.

 (4) Trigger save to disk for each of the 30-minute Xarrays.
-    This triggers Dask computations; there will be 1460 graphs.
+    This triggers Dask computations; there will be 146 graphs.
     Since 1460 graphs is too much to process at once,
     we determine the batch_size based on script parameters.
     (e.g. if batch_size is 100, we'll have 15 batches).

 """

-MINUTES_IN_A_MONTH = 43800
+MINUTES_IN_A_MONTH = 500
 NUM_MINS_PER_OUTPUT_FILE = 30
-SAMPLING_RATE = 2000000
+SAMPLING_RATE = 200000
 SECONDS_IN_A_MIN = 60
 INPUT_SHAPE = (3, SAMPLING_RATE * SECONDS_IN_A_MIN)
-PEAK_MEMORY_CONSUMPTION_IN_GB = 60
+PEAK_MEMORY_CONSUMPTION_IN_GB = 6

 logging.basicConfig(
     format="%(asctime)s %(levelname)-8s %(message)s",
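
The batch_size mechanism the docstring describes could be implemented along these lines (a hypothetical sketch only; save_in_batches and save_tasks are illustrative names, not helpers this file actually defines):

    import dask
    from ray.util.dask import ray_dask_get

    def save_in_batches(save_tasks, batch_size):
        # Submit the lazy save graphs batch_size at a time, so at most one
        # batch of Dask graphs is in flight on the Ray cluster at once.
        for i in range(0, len(save_tasks), batch_size):
            dask.compute(*save_tasks[i:i + batch_size], scheduler=ray_dask_get)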

@@ -141,7 +142,7 @@ class LoadRoutines:
 def load_array_one_minute(test_spec: TestSpec) -> np.ndarray:
     """
     Load an array representing 1 minute of data. Each load consumes
-    ~1.44GB of memory (3 * 2000000 * 60 * 4 (bytes in a float)) = ~1.44GB
+    ~0.144GB of memory (3 * 200000 * 60 * 4 (bytes in a float)) = ~0.14GB

     In real life, this is loaded from cloud storage or disk.
     """
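
The corrected figure in the docstring checks out (3 channels, float32, i.e. 4 bytes per sample):

    # 3 channels x 200000 samples/s x 60 s x 4 bytes = 144 MB per 1-minute load
    print(3 * 200000 * 60 * 4 / 1e9)  # 0.144 (GB)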

@@ -420,36 +421,44 @@ def parse_script_args():
 def main():
     args, unknown = parse_script_args()
     logging.info("Received arguments: {}".format(args))
+    success = 1
+    try:
-    # Create test spec
-    test_spec = TestSpec(
-        num_workers=args.num_workers,
-        worker_obj_store_size_in_gb=args.worker_obj_store_size_in_gb,
-        error_rate=args.error_rate,
-        trigger_object_spill=args.trigger_object_spill,
-    )
-    logging.info("Created test spec: {}".format(test_spec))
+        # Create test spec
+        test_spec = TestSpec(
+            num_workers=args.num_workers,
+            worker_obj_store_size_in_gb=args.worker_obj_store_size_in_gb,
+            error_rate=args.error_rate,
+            trigger_object_spill=args.trigger_object_spill,
+        )
+        logging.info("Created test spec: {}".format(test_spec))

-    # Create the data save path if it doesn't exist.
-    data_save_path = args.data_save_path
-    if not os.path.exists(data_save_path):
-        os.makedirs(data_save_path, mode=0o777, exist_ok=True)
-        os.chmod(data_save_path, mode=0o777)
+        # Create the data save path if it doesn't exist.
+        data_save_path = args.data_save_path
+        if not os.path.exists(data_save_path):
+            os.makedirs(data_save_path, mode=0o777, exist_ok=True)
+            os.chmod(data_save_path, mode=0o777)

-    # Lazily construct Xarrays
-    xarray_filename_pairs = lazy_create_xarray_filename_pairs(test_spec)
+        # Lazily construct Xarrays
+        xarray_filename_pairs = lazy_create_xarray_filename_pairs(test_spec)

-    # Connect to the Ray cluster
-    ray.init(address="auto")
+        # Connect to the Ray cluster
+        ray.init(address="auto")

-    # Save all the Xarrays to disk; this will trigger Dask computations on Ray.
-    logging.info("Saving {} xarrays..".format(len(xarray_filename_pairs)))
-    SaveRoutines.save_all_xarrays(
-        xarray_filename_pairs=xarray_filename_pairs,
-        dirpath=data_save_path,
-        batch_size=test_spec.batch_size,
-        ray_scheduler=ray_dask_get,
-    )
+        # Save all the Xarrays to disk; this will trigger
+        # Dask computations on Ray.
+        logging.info("Saving {} xarrays..".format(len(xarray_filename_pairs)))
+        SaveRoutines.save_all_xarrays(
+            xarray_filename_pairs=xarray_filename_pairs,
+            dirpath=data_save_path,
+            batch_size=test_spec.batch_size,
+            ray_scheduler=ray_dask_get,
+        )
+        print(ray.internal.internal_api.memory_summary(stats_only=True))
+    except Exception as e:
+        logging.exception(e)
+        success = 0
+    with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
+        f.write(json.dumps({"success": success}))


 if __name__ == "__main__":
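
Isolated from the test body, the reporting pattern this refactor introduces looks like the following (a sketch; run_workload is a placeholder, and TEST_OUTPUT_JSON is assumed to be set, as it is on the release-test runners):

    import json
    import logging
    import os

    def run_workload():
        pass  # placeholder for the real test body

    success = 1
    try:
        run_workload()
    except Exception as e:
        logging.exception(e)
        success = 0
    # The release harness reads this JSON to decide pass/fail.
    with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
        f.write(json.dumps({"success": success}))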

@@ -154,21 +154,39 @@
     compute_template: shuffle/shuffle_compute_smoke.yaml # Does not exist yet

 # Test multi nodes 100GB shuffle with a large number of partitions.
 # TODO(sang): Not working due to a bug https://github.com/ray-project/ray/issues/16025.
-# - name: shuffle_100gb_large_partition
-#   owner:
-#     mail: "sangcho@anyscale.com"
-#     slack: "@proj-data-processing"
+- name: shuffle_1tb_large_partition
+  owner:
+    mail: "sangcho@anyscale.com"
+    slack: "@proj-data-processing"

-#   cluster:
-#     app_config: shuffle/shuffle_app_config.yaml
-#     compute_template: shuffle/shuffle_compute_multi.yaml
+  cluster:
+    app_config: shuffle/shuffle_app_config.yaml
+    compute_template: shuffle/shuffle_compute_large_scale.yaml

-#   run:
-#     timeout: 3000
-#     prepare: python wait_cluster.py 4 600
-#     script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=100e6
+  run:
+    timeout: 3000
+    prepare: python wait_cluster.py 20 600
+    script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9

-#   smoke_test:
-#     cluster:
-#       compute_template: shuffle/shuffle_compute_smoke.yaml # Does not exist yet
+  smoke_test:
+    cluster:
+      compute_template: shuffle/shuffle_compute_smoke.yaml # Does not exist yet
+
+# Test large scale dask on ray test without spilling.
+- name: dask_on_ray_large_scale_test_no_spilling
+  owner:
+    mail: "sangcho@anyscale.com"
+    slack: "@proj-data-processing"
+
+  cluster:
+    app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
+    compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
+
+  run:
+    timeout: 7200
+    prepare: python wait_cluster.py 5 600
+    script: python dask_on_ray/large_scale_test.py --num_workers 16 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
+
+  smoke_test:
+    cluster:
+      compute_template: shuffle/shuffle_compute_smoke.yaml # Does not exist yet
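
The prepare steps ("python wait_cluster.py 20 600") block until the cluster reaches the requested size before the test script runs. A plausible reconstruction, assuming the helper polls ray.nodes() and takes a target node count plus a timeout in seconds as arguments (the real script may differ):

    import sys
    import time

    import ray

    ray.init(address="auto")
    expected_nodes, timeout_s = int(sys.argv[1]), int(sys.argv[2])
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        alive = [node for node in ray.nodes() if node["Alive"]]
        if len(alive) >= expected_nodes:
            print("Cluster is ready with {} nodes".format(len(alive)))
            break
        time.sleep(5)
    else:
        raise TimeoutError("cluster did not reach target size within timeout")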

@@ -0,0 +1,15 @@
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west-2
+
+head_node_type:
+    name: head_node
+    instance_type: i3.4xlarge
+    resources: {"object_store_memory": 21474836480}
+
+worker_node_types:
+    - name: worker_node
+      instance_type: i3.4xlarge
+      min_workers: 19
+      max_workers: 19
+      use_spot: false
+      resources: {"object_store_memory": 21474836480}
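
The object_store_memory values in these compute templates are raw byte counts, chosen as whole GiB per node:

    print(21474836480 == 20 * 1024**3)  # True: 20 GiB on each i3.4xlarge
    print(53687091200 == 50 * 1024**3)  # True: 50 GiB on each i3.8xlarge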

@@ -3,10 +3,10 @@ region: us-west-2

 max_workers: 3

 # TODO(ekl/sang) switch to i2.8xl and mount the NVMe disks
 head_node_type:
     name: head_node
     instance_type: i3.4xlarge
+    resources: {"object_store_memory": 21474836480}

 worker_node_types:
     - name: worker_node

@@ -14,3 +14,4 @@ worker_node_types:
       min_workers: 3
       max_workers: 3
       use_spot: false
+      resources: {"object_store_memory": 21474836480}

@@ -3,13 +3,13 @@ region: us-west-2

 max_workers: 0

 # TODO(ekl/sang) switch to i2.8xl and mount the NVMe disks
 head_node_type:
-    name: head_node2
+    name: head_node
     instance_type: i3.4xlarge
     resources: {"object_store_memory": 21474836480}

 worker_node_types:
-    - name: worker_node2
+    - name: worker_node
       instance_type: i3.4xlarge
       min_workers: 0
       max_workers: 0

@@ -19,6 +19,7 @@ if __name__ == "__main__":

     start = time.time()
+    success = 1

     commands = [
         "python", "-m", "ray.experimental.shuffle", "--ray-address={}".format(
             os.environ["RAY_ADDRESS"]),

@@ -35,6 +36,8 @@ if __name__ == "__main__":
         success = 0
     delta = time.time() - start

+    # Report the running time as 0 if it fails so that
+    # it is easy to be discovered from the graph.
+    if not success:
+        delta = 0
     with open(os.environ["TEST_OUTPUT_JSON"], "w") as f: