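"""Benchmark bulk dataset ingest for Ray AIR trainers.

Builds a synthetic tensor dataset of a configurable size, runs it through a
DummyTrainer with a trivial BatchMapper preprocessor, and writes the wall-clock
time taken to a JSON results file.
"""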
import argparse
import json
import os
import time

import ray
from ray.air.config import DatasetConfig, ScalingConfig
from ray.air.util.check_ingest import DummyTrainer
from ray.data.preprocessors import BatchMapper

GiB = 1024 * 1024 * 1024
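

# With the default --dataset-size-gb=200, make_ds() below produces roughly
# 200 GiB / 10 KiB = ~21 million records, each a tensor of shape (1280,)
# with 8-byte elements.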
def make_ds(size_gb: int):
    # Dataset of 10KiB tensor records (1280 elements x 8 bytes each).
    total_size = GiB * size_gb
    record_dim = 1280
    record_size = record_dim * 8
    num_records = int(total_size / record_size)
    dataset = ray.data.range_tensor(num_records, shape=(record_dim,))
    print("Created dataset", dataset, "of size", dataset.size_bytes())
    return dataset
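

# Note: DummyTrainer (from ray.air.util.check_ingest) only iterates over its
# dataset shard without doing real training work, so the fit() call below
# mostly measures ingest and preprocessing throughput.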
def run_ingest_bulk(dataset, num_workers, num_cpus_per_worker):
    # Dummy preprocessor that just doubles each batch.
    dummy_prep = BatchMapper(lambda df: df * 2)
    trainer = DummyTrainer(
        scaling_config=ScalingConfig(
            num_workers=num_workers,
            # Reserve no CPUs for the Trainer actor itself.
            trainer_resources={"CPU": 0},
            resources_per_worker={"CPU": num_cpus_per_worker},
            # Experimental: cap the fraction of each node's CPUs that the
            # trainer's placement group may reserve, leaving the rest free
            # for Dataset tasks.
            _max_cpu_fraction_per_node=0.1,
        ),
        datasets={"train": dataset},
        preprocessor=dummy_prep,
        num_epochs=1,
        prefetch_blocks=1,
        # Split the "train" dataset across the training workers.
        dataset_config={"train": DatasetConfig(split=True)},
    )
    trainer.fit()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-workers", type=int, default=4)
    parser.add_argument(
        "--num-cpus-per-worker",
        type=int,
        default=1,
        help="Number of CPUs for each training worker.",
    )
    parser.add_argument("--dataset-size-gb", type=int, default=200)
    args = parser.parse_args()

    ds = make_ds(args.dataset_size_gb)

    # Time a single bulk ingest pass over the dataset.
    start = time.time()
    run_ingest_bulk(ds, args.num_workers, args.num_cpus_per_worker)
    end = time.time()
    time_taken = end - start
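
    # If a throughput figure is wanted, it could be derived as
    # args.dataset_size_gb / time_taken (GiB/s); only the raw time is reported here.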
    result = {"time_taken_s": time_taken}

    print("Results:", result)

    # Write the result JSON to the path given by TEST_OUTPUT_JSON
    # (default: /tmp/result.json).
    test_output_json = os.environ.get("TEST_OUTPUT_JSON", "/tmp/result.json")
    with open(test_output_json, "wt") as f:
        json.dump(result, f)
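
# Example invocation (script filename is illustrative; use the actual file path):
#   python ingest_benchmark.py --num-workers 4 --num-cpus-per-worker 1 --dataset-size-gb 200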