[core][tests] Add nightly test for datasets random_shuffle and sort (#23784)

Adding a large-scale nightly test for Datasets random_shuffle and sort. The test script generates random blocks and reports total run time and peak driver memory.
2025-03-06 02:21:39 -05:00 · 2022-04-08 11:31:10 -07:00 · 2022-04-08 11:31:10 -07:00 · ba484feac0
commit ba484feac0
parent c82f6c62c8
2 changed files with 220 additions and 0 deletions
--- a/release/nightly_tests/dataset/sort.py
+++ b/release/nightly_tests/dataset/sort.py
@ -0,0 +1,168 @@
 import ray
 import pandas as pd
 import numpy as np
 import time
 import builtins
 from typing import Any, Generic, List, Callable, Union, Tuple, Iterable
 import os
 import psutil
 import resource
 import json
 import numpy as np
 import ray
 from ray.types import ObjectRef
 from ray.data.block import (
    Block,
    BlockAccessor,
    BlockMetadata,
    T,
    BlockPartition,
    BlockPartitionMetadata,
    MaybeBlockPartition,
 )
 from ray.data.context import DatasetContext
 from ray.data.impl.arrow_block import ArrowRow
 from ray.data.impl.delegating_block_builder import DelegatingBlockBuilder
 from ray.data.impl.util import _check_pyarrow_version
 from ray.util.annotations import DeveloperAPI
 from ray.data.datasource import Datasource, ReadTask
 from ray.internal.internal_api import memory_summary
 class RandomIntRowDatasource(Datasource[ArrowRow]):
     """Datasource whose blocks are Arrow tables of random int64 values.

     Every generated table has ``num_columns`` columns named ``c_0``,
     ``c_1``, and so on.

     Examples:
         >>> source = RandomIntRowDatasource()
         >>> ray.data.read_datasource(source, n=10, num_columns=2).take()
         ... {'c_0': 1717767200176864416, 'c_1': 999657309586757214}
         ... {'c_0': 4983608804013926748, 'c_1': 1160140066899844087}
     """

     def prepare_read(
         self, parallelism: int, n: int, num_columns: int
     ) -> List[ReadTask]:
         """Build the read tasks that together produce ``n`` random rows.

         Rows are split into chunks of ``n // parallelism`` rows (at least
         one row per chunk); the final chunk holds whatever remains.

         Args:
             parallelism: Requested number of read tasks.
             n: Total number of rows to generate.
             num_columns: Number of int64 columns per row.

         Returns:
             One ``ReadTask`` per chunk; each task yields a single block.
         """
         _check_pyarrow_version()
         import pyarrow

         def make_block(count: int, num_columns: int) -> Block:
             # One random int64 array per column, ``count`` values each.
             values = np.random.randint(
                 np.iinfo(np.int64).max, size=(num_columns, count), dtype=np.int64
             )
             column_names = [f"c_{i}" for i in range(num_columns)]
             return pyarrow.Table.from_arrays(values, names=column_names)

         # A one-row placeholder table is enough to derive the schema.
         placeholder = {f"c_{i}": [0] for i in range(num_columns)}
         schema = pyarrow.Table.from_pydict(placeholder).schema

         rows_per_block = max(1, n // parallelism)
         tasks: List[ReadTask] = []
         for offset in range(0, n, rows_per_block):
             count = min(rows_per_block, n - offset)
             metadata = BlockMetadata(
                 num_rows=count,
                 # Each value is an int64, i.e. 8 bytes.
                 size_bytes=8 * count * num_columns,
                 schema=schema,
                 input_files=None,
                 exec_stats=None,
             )
             # Bind count/num_columns as defaults so each task keeps the
             # values from its own loop iteration.
             tasks.append(
                 ReadTask(
                     lambda count=count, num_columns=num_columns: [
                         make_block(count, num_columns)
                     ],
                     metadata,
                 )
             )
         return tasks
 if __name__ == "__main__":
     # Benchmark driver: generate a random single-column dataset, run either
     # random_shuffle() or sort() on it, then report the wall-clock time and
     # the driver's memory usage. When TEST_OUTPUT_JSON is set, the results
     # are also written to that path as JSON.
     import argparse

     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--num-partitions", help="number of partitions", default="50", type=str
     )
     parser.add_argument(
         "--partition-size",
         help="partition size (bytes)",
         default="200e6",
         type=str,
     )
     parser.add_argument(
         "--shuffle", help="shuffle instead of sort", action="store_true"
     )
     args = parser.parse_args()

     num_partitions = int(args.num_partitions)
     # Parsed through float so scientific notation such as "1e9" is accepted.
     partition_size = int(float(args.partition_size))
     print(
         f"Dataset size: {num_partitions} partitions, "
         f"{partition_size / 1e9}GB partition size, "
         f"{num_partitions * partition_size / 1e9}GB total"
     )

     # The timed region covers dataset creation as well as the shuffle/sort.
     start_time = time.time()
     source = RandomIntRowDatasource()
     # One int64 column, so each row accounts for 8 bytes.
     num_rows_per_partition = partition_size // 8
     ds = ray.data.read_datasource(
         source,
         parallelism=num_partitions,
         n=num_rows_per_partition * num_partitions,
         num_columns=1,
     )

     # Hold on to any failure so the timing and memory reports below are
     # still produced; the exception is re-raised at the very end.
     exc = None
     try:
         if args.shuffle:
             ds = ds.random_shuffle()
         else:
             ds = ds.sort(key="c_0")
     except Exception as e:
         exc = e
     end_time = time.time()

     duration = end_time - start_time
     print("Finished in", duration)
     print("")

     print("==== Driver memory summary ====")
     # ru_maxrss is scaled by 1e3, i.e. treated as kilobytes.
     # NOTE(review): the unit is platform-dependent (kilobytes on Linux per
     # getrusage(2)); confirm for the platform this test runs on.
     maxrss = int(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1e3)
     print(f"max: {maxrss / 1e9}/GB")
     process = psutil.Process(os.getpid())
     rss = int(process.memory_info().rss)
     print(f"rss: {rss / 1e9}/GB")

     print(memory_summary(stats_only=True))
     print("")

     print(ds.stats())

     if "TEST_OUTPUT_JSON" in os.environ:
         results = {
             "time": duration,
             "success": "1" if exc is None else "0",
             "num_partitions": num_partitions,
             "partition_size": partition_size,
             "perf_metrics": [
                 {
                     "perf_metric_name": "peak_driver_memory",
                     "perf_metric_value": maxrss,
                     "perf_metric_type": "MEMORY",
                 },
                 {
                     "perf_metric_name": "runtime",
                     "perf_metric_value": duration,
                     "perf_metric_type": "LATENCY",
                 },
             ],
         }
         # Use a context manager so the file is flushed and closed even when
         # the saved exception is re-raised right after.
         with open(os.environ["TEST_OUTPUT_JSON"], "w") as out_file:
             json.dump(results, out_file)

     if exc is not None:
         raise exc
--- a/release/release_tests.yaml
+++ b/release/release_tests.yaml
@ -3411,6 +3411,58 @@
    type: sdk_command
    file_manager: sdk
 # Nightly test: runs dataset/sort.py with --shuffle (random_shuffle instead
 # of sort) on 1000 partitions of 1e9 bytes each, i.e. about 1 TB in total.
 # Waits for 20 nodes before starting the script.
 - name: dataset_shuffle_random_shuffle_1tb
  group: core-multi-test
  working_dir: nightly_tests
  legacy:
    test_name: dataset_shuffle_random_shuffle_1tb
    test_suite: dataset_test
  stable: false
  frequency: nightly
  team: core
  cluster:
    cluster_env: shuffle/shuffle_app_config.yaml
    cluster_compute: shuffle/shuffle_compute_large_scale.yaml
  run:
    timeout: 7200
    script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle
    wait_for_nodes:
      num_nodes: 20
      timeout: 900
    type: sdk_command
    file_manager: sdk
 # Nightly test: runs dataset/sort.py in its default sort mode (no --shuffle
 # flag) on 1000 partitions of 1e9 bytes each, i.e. about 1 TB in total.
 # Waits for 20 nodes before starting the script.
 - name: dataset_shuffle_sort_1tb
  group: core-multi-test
  working_dir: nightly_tests
  legacy:
    test_name: dataset_shuffle_sort_1tb
    test_suite: dataset_test
  stable: false
  frequency: nightly
  team: core
  cluster:
    cluster_env: shuffle/shuffle_app_config.yaml
    cluster_compute: shuffle/shuffle_compute_large_scale.yaml
  run:
    timeout: 7200
    script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9
    wait_for_nodes:
      num_nodes: 20
      timeout: 900
    type: sdk_command
    file_manager: sdk
 ################
 # Core K8s tests
 ################