ray/release/nightly_tests/shuffle_data_loader/benchmark.py


import argparse
import asyncio
import collections
import contextlib
import glob
import json
import os
import timeit
from typing import List

import numpy as np
import ray
from ray_shuffling_data_loader.shuffle import (
    shuffle, BatchConsumer as BatchConsumerBase)
from ray_shuffling_data_loader.stats import (
    TrialStatsCollector, ObjectStoreStatsCollector, process_stats,
    human_readable_size)
from ray_shuffling_data_loader.data_generation import generate_data

BATCHQUEUE_ACTOR_NAME = "BatchQueue"
DEFAULT_DATA_DIR = "/mnt/disk0/benchmark_scratch"
DEFAULT_STATS_DIR = "./results"
DEFAULT_UTILIZATION_SAMPLE_PERIOD = 5.0
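
# Benchmark flow: generate (or reuse) Parquet input files, then run one or
# more shuffle trials in which the from-memory shuffler feeds shuffled
# batches to one Consumer actor per trainer, with up to
# max_concurrent_epochs epochs pipelined at a time.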


@ray.remote(num_cpus=0)
class Consumer:
    """Actor that consumes shuffled batches for a single trainer rank."""

    def __init__(self,
                 rank,
                 num_epochs,
                 max_concurrent_epochs,
                 stats_collector=None):
        self._rank = rank
        self._num_epochs = num_epochs
        self._max_epochs = max_concurrent_epochs
        self._curr_epochs = collections.deque()
        self._epoch_done_evs = [asyncio.Event() for _ in range(num_epochs)]
        self._stats_collector = stats_collector

    async def new_epoch(self, epoch):
        # Throttle pipelining: if the maximum number of concurrent epochs is
        # already in flight, wait for the oldest one to finish first.
        if len(self._curr_epochs) == self._max_epochs:
            first_epoch = self._curr_epochs.popleft()
            await self._epoch_done_evs[first_epoch].wait()
        self._curr_epochs.append(epoch)
        print(f"Starting epoch {epoch} on consumer {self._rank}.")

    def consume(self, epoch, batch):
        print(f"Consuming batch on consumer {self._rank} for epoch {epoch}.")
        if self._stats_collector is not None:
            self._stats_collector.consume_batch.remote(epoch, len(batch))

    def producer_done(self, epoch):
        if self._stats_collector is not None:
            self._stats_collector.consume_done.remote(epoch)
        self._epoch_done_evs[epoch].set()
        print(f"Epoch {epoch} done on consumer {self._rank}.")

    async def wait_until_all_epochs_done(self):
        # Wait for the final epoch to be marked done.
        await self._epoch_done_evs[self._num_epochs - 1].wait()

    def ready(self):
        pass
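
    # NOTE: BatchConsumer.get_stats() below calls Consumer.get_stats(),
    # which this actor does not define. A minimal sketch of the shape it
    # would need to return (per-epoch lists; attribute names hypothetical):
    #
    # def get_stats(self):
    #     return (self._consume_times, self._time_to_consumes,
    #             self._consume_start_times, self._consume_end_times)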


class BatchConsumer(BatchConsumerBase):
    """Routes shuffled batches to one per-rank Consumer actor."""

    def __init__(self,
                 num_trainers,
                 num_epochs,
                 pg,
                 max_concurrent_epochs,
                 stats_collector=None):
        self._consumers = [
            Consumer.options(placement_group=pg).remote(
                rank, num_epochs, max_concurrent_epochs, stats_collector)
            for rank in range(num_trainers)
        ]

    def consume(self, rank: int, epoch: int, batches: List[ray.ObjectRef]):
        if batches is not None:
            for batch in batches:
                self._consumers[rank].consume.remote(epoch, batch)

    def producer_done(self, rank: int, epoch: int):
        self._consumers[rank].producer_done.remote(epoch)

    def wait_until_ready(self, epoch: int):
        # Signal the new epoch to all consumers; this blocks while any
        # consumer is still at its concurrent-epoch limit.
        ray.get(
            [consumer.new_epoch.remote(epoch) for consumer in self._consumers])

    def wait_until_all_epochs_done(self):
        ray.get([
            consumer.wait_until_all_epochs_done.remote()
            for consumer in self._consumers
        ])

    def actors_ready(self):
        ray.get([consumer.ready.remote() for consumer in self._consumers])

    def get_stats(self):
        # Aggregate per-epoch consume stats across all consumers. Note that
        # this calls Consumer.get_stats(), which the Consumer actor above
        # does not define; see the note in Consumer.
        (consume_times, time_to_consumes, consume_start_times,
         consume_end_times) = tuple(
             list(zip(*stats_)) for stats_ in zip(*ray.get(
                 [consumer.get_stats.remote()
                  for consumer in self._consumers])))
        consume_stage_durations = []
        for start_times, end_times in zip(consume_start_times,
                                          consume_end_times):
            consume_stage_durations.append(max(end_times) - min(start_times))
        return consume_times, time_to_consumes, consume_stage_durations


def run_trials(num_epochs,
               filenames,
               num_reducers,
               num_trainers,
               max_concurrent_epochs,
               utilization_sample_period,
               collect_stats=True,
               num_trials=None,
               trials_timeout=None):
    """Run shuffle trials with the from-memory shuffler.

    Runs either ``num_trials`` trials or as many trials as fit within
    ``trials_timeout`` seconds, and returns a list of per-trial
    (trial stats, object store stats) pairs.
    """
    print("Using from-memory shuffler.")
    all_stats = []
    # Spread one lightweight consumer actor per trainer across the cluster.
    pg = ray.util.placement_group(
        [{"CPU": 0.1} for _ in range(num_trainers)], strategy="SPREAD")
    ray.get(pg.ready())
    if collect_stats:
        stats_collector = TrialStatsCollector.remote(
            num_epochs, len(filenames), num_reducers, num_trainers)
        object_store_stats_collector = ObjectStoreStatsCollector(
            utilization_sample_period)
    else:
        stats_collector = None
        try:
            object_store_stats_collector = contextlib.nullcontext()
        except AttributeError:
            # Python 3.6 doesn't have contextlib.nullcontext(); an empty
            # contextlib.suppress() is an equivalent no-op context manager.
            object_store_stats_collector = contextlib.suppress()
    batch_consumer = BatchConsumer(num_trainers, num_epochs, pg,
                                   max_concurrent_epochs, stats_collector)
    # Wait until the batch consumer actors have been created.
    batch_consumer.actors_ready()
    if num_trials is not None:
        for trial in range(num_trials):
            print(f"Starting trial {trial}.")
            with object_store_stats_collector:
                duration = shuffle(filenames, batch_consumer, num_epochs,
                                   num_reducers, num_trainers,
                                   stats_collector)
            print(f"Trial {trial} done after {duration} seconds.")
            if collect_stats:
                stats = ray.get(stats_collector.get_stats.remote())
                store_stats = object_store_stats_collector.get_stats()
            else:
                stats = duration
                store_stats = None
            all_stats.append((stats, store_stats))
    elif trials_timeout is not None:
        start = timeit.default_timer()
        trial = 0
        while timeit.default_timer() - start < trials_timeout:
            print(f"Starting trial {trial}.")
            with object_store_stats_collector:
                duration = shuffle(filenames, batch_consumer, num_epochs,
                                   num_reducers, num_trainers,
                                   stats_collector)
            print(f"Trial {trial} done after {duration} seconds.")
            if collect_stats:
                stats = ray.get(stats_collector.get_stats.remote())
                store_stats = object_store_stats_collector.get_stats()
            else:
                stats = duration
                store_stats = None
            all_stats.append((stats, store_stats))
            trial += 1
    else:
        raise ValueError(
            "One of num_trials and trials_timeout must be specified.")
    return all_stats


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Shuffling data loader")
    parser.add_argument("--num-rows", type=int, default=4 * (10**11))
    parser.add_argument("--num-files", type=int, default=100)
    parser.add_argument("--max-row-group-skew", type=float, default=0.0)
    parser.add_argument("--num-row-groups-per-file", type=int, default=1)
    parser.add_argument("--num-reducers", type=int, default=5)
    parser.add_argument("--num-trainers", type=int, default=5)
    parser.add_argument("--num-epochs", type=int, default=10)
    parser.add_argument("--max-concurrent-epochs", type=int, default=None)
    parser.add_argument("--batch-size", type=int, default=100)
    parser.add_argument("--num-trials", type=int, default=None)
    parser.add_argument("--trials-timeout", type=int, default=None)
    parser.add_argument(
        "--utilization-sample-period",
        type=float,
        default=DEFAULT_UTILIZATION_SAMPLE_PERIOD)
    parser.add_argument("--cluster", action="store_true")
    parser.add_argument("--data-dir", type=str, default=DEFAULT_DATA_DIR)
    parser.add_argument("--stats-dir", type=str, default=DEFAULT_STATS_DIR)
    parser.add_argument("--clear-old-data", action="store_true")
    parser.add_argument("--use-old-data", action="store_true")
    parser.add_argument("--no-stats", action="store_true")
    parser.add_argument("--no-epoch-stats", action="store_true")
    parser.add_argument("--no-consumer-stats", action="store_true")
    parser.add_argument("--overwrite-stats", action="store_true")
    parser.add_argument("--unique-stats", action="store_true")
    args = parser.parse_args()
    if args.num_row_groups_per_file < 1:
        raise ValueError("Must have at least one row group per file.")
    num_trials = args.num_trials
    trials_timeout = args.trials_timeout
    if num_trials is not None and trials_timeout is not None:
        raise ValueError(
            "Only one of --num-trials and --trials-timeout should be "
            "specified.")
    if num_trials is None and trials_timeout is None:
        # Default to a fixed number of trials.
        num_trials = 3
    if args.clear_old_data and args.use_old_data:
        raise ValueError(
            "Only one of --clear-old-data and --use-old-data should be "
            "specified.")
    data_dir = args.data_dir
    if args.clear_old_data:
        print(f"Clearing old data from {data_dir}.")
        files = glob.glob(os.path.join(data_dir, "*.parquet.snappy"))
        for f in files:
            os.remove(f)
    if args.cluster:
        print("Connecting to an existing Ray cluster.")
        ray.init(address="auto")
    else:
        print("Starting a new local Ray cluster.")
        # Registers a custom resource named "resources" with capacity 100.
        ray.init(resources={"resources": 100})
    num_rows = args.num_rows
    num_row_groups_per_file = args.num_row_groups_per_file
    num_files = args.num_files
    max_row_group_skew = args.max_row_group_skew
    if not args.use_old_data:
        print(f"Generating {num_rows} rows over {num_files} files, with "
              f"{num_row_groups_per_file} row groups per file and at most "
              f"{100 * max_row_group_skew:.1f}% row group skew.")
        filenames, num_bytes = generate_data(num_rows, num_files,
                                             num_row_groups_per_file,
                                             max_row_group_skew, data_dir)
        print(f"Generated {len(filenames)} files containing {num_rows} rows "
              f"with {num_row_groups_per_file} row groups per file, "
              f"totalling {human_readable_size(num_bytes)}.")
    else:
        filenames = [
            os.path.join(data_dir, f"input_data_{file_index}.parquet.snappy")
            for file_index in range(num_files)
        ]
        print("Not generating input data, using existing data instead.")
    num_reducers = args.num_reducers
    num_trainers = args.num_trainers
    batch_size = args.batch_size
    num_epochs = args.num_epochs
    max_concurrent_epochs = args.max_concurrent_epochs
    if max_concurrent_epochs is None or max_concurrent_epochs > num_epochs:
        # Without an explicit cap, allow all epochs to be pipelined at once.
        max_concurrent_epochs = num_epochs
    assert max_concurrent_epochs > 0
    utilization_sample_period = args.utilization_sample_period
    # TODO(Clark): Add warmup trials.
    print("\nRunning real trials.")
    if num_trials is not None:
        print(f"Running {num_trials} shuffle trials with {num_epochs} "
              f"epochs, {num_reducers} reducers, {num_trainers} trainers, "
              f"and a batch size of {batch_size} over {num_rows} rows.")
    else:
        print(f"Running {trials_timeout} seconds of shuffle trials with "
              f"{num_epochs} epochs, {num_reducers} reducers, {num_trainers} "
              f"trainers, and a batch size of {batch_size} over {num_rows} "
              "rows.")
    print(f"Shuffling will be pipelined with at most "
          f"{max_concurrent_epochs} concurrent epochs.")
    collect_stats = not args.no_stats
    all_stats = run_trials(num_epochs, filenames, num_reducers, num_trainers,
                           max_concurrent_epochs, utilization_sample_period,
                           collect_stats, num_trials, trials_timeout)
    if collect_stats:
        process_stats(all_stats, args.overwrite_stats, args.stats_dir,
                      args.no_epoch_stats, args.no_consumer_stats,
                      args.unique_stats, num_rows, num_files,
                      num_row_groups_per_file, batch_size, num_reducers,
                      num_trainers, num_epochs, max_concurrent_epochs)
    else:
        print("Shuffle trials done, no detailed stats collected.")
        times, _ = zip(*all_stats)
        mean = np.mean(times)
        std = np.std(times)
        throughput_std = np.std(
            [num_epochs * num_rows / time for time in times])
        batch_throughput_std = np.std(
            [(num_epochs * num_rows / batch_size) / time for time in times])
        print(f"\nMean over {len(times)} trials: {mean:.3f}s +- {std:.3f}s")
        print(f"Mean throughput over {len(times)} trials: "
              f"{num_epochs * num_rows / mean:.2f} rows/s +- "
              f"{throughput_std:.2f}")
        print(f"Mean batch throughput over {len(times)} trials: "
              f"{(num_epochs * num_rows / batch_size) / mean:.2f} batches/s "
              f"+- {batch_throughput_std:.2f}")
    # Signal success to the release-test harness.
    with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
        f.write(json.dumps({"success": 1}))
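
# Example invocations (flag values are illustrative, not recommendations):
#
#   # Against an existing cluster, 3 trials with freshly generated data:
#   TEST_OUTPUT_JSON=/tmp/out.json python benchmark.py --cluster \
#       --num-rows 100000000 --num-files 100 --num-reducers 5 \
#       --num-trainers 5 --num-epochs 10 --num-trials 3
#
#   # Locally, reusing previously generated files, running for 10 minutes:
#   TEST_OUTPUT_JSON=/tmp/out.json python benchmark.py --use-old-data \
#       --data-dir ./data --trials-timeout 600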