Mirror of https://github.com/vale981/ray, synced 2025-03-06 02:21:39 -05:00
[nightly-test] update cuj2 to reflect latest change #20889
We fixed the groupby issue in cuj2; this syncs the change into the nightly test. The test doesn't need to use GPUs at all, and it returns soon after data ingestion finishes.
This commit is contained in:
parent b8c64480d8
commit a628182cf5

3 changed files with 55 additions and 18 deletions
File 1/3: nightly test suite config
@@ -86,11 +86,11 @@
   cluster:
     app_config: ray_sgd_training_app.yaml
-    compute_template: ray_sgd_training_compute.yaml
+    compute_template: ray_sgd_training_compute_no_gpu.yaml

   run:
-    timeout: 14400
-    prepare: python wait_cluster.py 66 2400
+    timeout: 7200
+    prepare: python wait_cluster.py 21 2400
     script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu --large-dataset --debug

   stable: false
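The prepare step now waits for 21 nodes instead of 66, matching the smaller no-GPU cluster. Below is a minimal sketch of what a wait_cluster.py-style helper does, assuming (as the call above suggests) that the first argument is the expected node count and the second a timeout in seconds; the actual script in the repo may differ.

import sys
import time

import ray


def wait_for_nodes(expected: int, timeout_s: float) -> None:
    # Poll the cluster until `expected` nodes report alive or we time out.
    ray.init(address="auto")
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        alive = sum(1 for node in ray.nodes() if node["Alive"])
        if alive >= expected:
            print(f"Cluster ready: {alive}/{expected} nodes alive.")
            return
        print(f"Waiting for nodes: {alive}/{expected} alive.")
        time.sleep(10)
    sys.exit(f"Timed out after {timeout_s}s waiting for {expected} nodes.")


if __name__ == "__main__":
    wait_for_nodes(int(sys.argv[1]), float(sys.argv[2]))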
File 2/3: ray_sgd_training.py (the training script invoked by the test above)
@@ -3,7 +3,7 @@ import collections
 import json
 import os
 import sys
-import time
+import timeit
 from typing import Tuple

 import boto3
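The script switches from time.time() to timeit.default_timer() for its end-to-end timing. default_timer is a monotonic, high-resolution clock, so elapsed-time measurements are unaffected by system clock adjustments. A minimal sketch of the pattern (the summed range is just a stand-in for the measured work):

import timeit

start = timeit.default_timer()
total = sum(range(1_000_000))  # stand-in for the work being timed
elapsed = timeit.default_timer() - start
print("elapsed (s):", elapsed)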
@@ -50,6 +50,7 @@ class MLflowCallback(TrainingCallback):
 def read_dataset(path: str) -> ray.data.Dataset:
     print(f"reading data from {path}")
     return ray.data.read_parquet(path, _spread_resource_prefix="node:") \
+        .repartition(400) \
         .random_shuffle(_spread_resource_prefix="node:")

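read_dataset() now repartitions the data before the global shuffle, fixing the number of blocks the shuffle operates on at 400. A self-contained sketch of the same read, repartition, shuffle chain, using a locally generated range dataset instead of the S3 parquet data and omitting the private `_spread_resource_prefix` argument:

import ray

ray.init()

ds = (
    ray.data.range(100_000)   # stand-in for ray.data.read_parquet(path)
    .repartition(400)         # split into 400 blocks before shuffling
    .random_shuffle()         # global random shuffle across all blocks
)
print("blocks:", ds.num_blocks())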
@@ -101,13 +102,12 @@ class DataPreprocessor:

         print("\nStep 2: Precalculating fruit-grouped mean for new column and "
               "for one-hot encoding (latter only uses fruit groups)\n")
-        agg_ds = ds.groupby("fruit").mean("feature_1")
         fruit_means = {
             r["fruit"]: r["mean(feature_1)"]
-            for r in agg_ds.take_all()
+            for r in ds.groupby("fruit").mean("feature_1").take_all()
         }

-        print("\nStep 3: create mean_by_fruit as mean of feature_1 groupby "
+        print("\nStep 3: Create mean_by_fruit as mean of feature_1 groupby "
               "fruit; one-hot encode fruit column\n")

         if inferencing:
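The fix inlines the groupby aggregation so its result is consumed directly via take_all(). A small self-contained sketch of the same groupby("fruit").mean("feature_1") pattern on toy in-memory data (the rows here are made up for illustration):

import ray

ray.init()

ds = ray.data.from_items([
    {"fruit": "apple", "feature_1": 1.0},
    {"fruit": "apple", "feature_1": 3.0},
    {"fruit": "banana", "feature_1": 2.0},
])

# Aggregated rows look like {"fruit": ..., "mean(feature_1)": ...}.
fruit_means = {
    r["fruit"]: r["mean(feature_1)"]
    for r in ds.groupby("fruit").mean("feature_1").take_all()
}
print(fruit_means)  # {'apple': 2.0, 'banana': 2.0}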
@@ -407,7 +407,8 @@ class TrainingWorker:
                     training_dataset.to_torch(
                         batch_size=self.batch_size, label_column="label")):
                 if i % 10000 == 0:
-                    print(f"epoch: {epoch}, worker: {self.rank}, batch: {i}")
+                    print(f"epoch: {epoch}, worker: {self.rank},"
+                          f" processing batch: {i}")


 if __name__ == "__main__":
@@ -437,7 +438,7 @@ if __name__ == "__main__":
         "--large-dataset",
         action="store_true",
         default=False,
-        help="Use 500GB dataset")
+        help="Use 100GB dataset")
     parser.add_argument(
         "--use-gpu",
         action="store_true",
@@ -465,7 +466,7 @@ if __name__ == "__main__":
     if large_dataset:
         assert use_s3, "--large-dataset requires --use-s3 to be set."

-    start_time = time.time()
+    e2e_start_time = timeit.default_timer()

     ray.init(address=address)

@@ -487,15 +488,18 @@ if __name__ == "__main__":
     if use_s3:
         # Check if s3 data is populated.
         BUCKET_NAME = "cuj-big-data"
-        FOLDER_NAME = "big-data/" if large_dataset else "data/"
+        FOLDER_NAME = "100GB/" if large_dataset else "data/"
         s3_resource = boto3.resource("s3")
         bucket = s3_resource.Bucket(BUCKET_NAME)
         count = bucket.objects.filter(Prefix=FOLDER_NAME)
         if len(list(count)) == 0:
             print("please run `python make_and_upload_dataset.py` first")
             sys.exit(1)
+        # cuj-big-data/big-data stats
         # 156 files, 3_120_000_000 rows and 501_748_803_387 bytes
-        data_path = ("s3://cuj-big-data/big-data/"
+        # cuj-big-data/100GB stats
+        # 33 files, 660_000_000 rows and 106_139_169_947 bytes
+        data_path = ("s3://cuj-big-data/100GB/"
                      if large_dataset else "s3://cuj-big-data/data/")
         inference_path = "s3://cuj-big-data/inference/"
         inference_output_path = "s3://cuj-big-data/output/"
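The S3 pre-check simply lists objects under the expected prefix and aborts if none exist. A standalone sketch of that check; the bucket and prefix come from the diff above, while AWS credentials and region are assumed to be configured in the environment:

import boto3

s3 = boto3.resource("s3")
bucket = s3.Bucket("cuj-big-data")

# bucket.objects.filter() is lazy; materialize it to count the matches.
objects = list(bucket.objects.filter(Prefix="100GB/"))
if not objects:
    print("please run `python make_and_upload_dataset.py` first")
else:
    print(f"found {len(objects)} objects under the prefix")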
@@ -521,6 +525,9 @@ if __name__ == "__main__":
     train_dataset, test_dataset = preprocessor.preprocess_train_data(
         read_dataset(data_path))

+    preprocessing_end_time = timeit.default_timer()
+    print("Preprocessing time (s): ", preprocessing_end_time - e2e_start_time)
+
     num_columns = len(train_dataset.schema().names)
     # remove label column and internal Arrow column.
     num_features = num_columns - 2
@@ -533,22 +540,21 @@ if __name__ == "__main__":
     DROPOUT_PROB = 0.2

     if args.debug:
+        num_gpus = 1 if use_gpu else 0
         shards = train_dataset.repeat(NUM_EPOCHS) \
             .random_shuffle_each_window(_spread_resource_prefix="node:") \
             .split(num_workers)
         del train_dataset

-        num_gpus = 1 if use_gpu else 0
         training_workers = [
             TrainingWorker.options(num_gpus=num_gpus, num_cpus=0).remote(
                 rank, shard, BATCH_SIZE) for rank, shard in enumerate(shards)
         ]
         ray.get([worker.train.remote() for worker in training_workers])

-        total_time = time.time() - start_time
+        e2e_end_time = timeit.default_timer()
+        total_time = e2e_end_time - e2e_start_time
         print(f"Job finished in {total_time} seconds.")
-        with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
-            f.write(json.dumps({"time": total_time, "success": 1}))
         exit()

     # Random global shuffle
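In the debug path, the training dataset is turned into a pipeline (repeat(NUM_EPOCHS) plus a per-window shuffle) and split into one shard per TrainingWorker actor. The sketch below shows just the split-across-actors part of that pattern on a plain in-memory dataset; the Worker actor and the toy data are illustrative stand-ins, not the real TrainingWorker:

import ray

ray.init()


@ray.remote
class Worker:
    def __init__(self, rank, shard):
        self.rank = rank
        self.shard = shard

    def train(self):
        # Consume this worker's shard batch by batch.
        batches = sum(1 for _ in self.shard.iter_batches(batch_size=1024))
        return self.rank, batches


num_workers = 4
shards = ray.data.range(10_000).split(num_workers)
workers = [Worker.remote(rank, s) for rank, s in enumerate(shards)]
print(ray.get([w.train.remote() for w in workers]))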
@@ -596,6 +602,9 @@ if __name__ == "__main__":
     model = results[0]
     trainer.shutdown()

+    training_end_time = timeit.default_timer()
+    print("Training time (s): ", training_end_time - preprocessing_end_time)
+
     if args.mlflow_register_model:
         mlflow.pytorch.log_model(
             model, artifact_path="models", registered_model_name="torch_model")
@@ -648,9 +657,11 @@ if __name__ == "__main__":
     inference(inference_dataset, BatchInferModel(load_model_func), 100,
               inference_output_path, use_gpu)

-    end_time = time.time()
+    e2e_end_time = timeit.default_timer()
+    print("Inference time (s): ", e2e_end_time - training_end_time)

-    total_time = end_time - start_time
+    total_time = e2e_end_time - e2e_start_time
+    print("Total time (s): ", e2e_end_time - e2e_start_time)

     print(f"Job finished in {total_time} seconds.")
     with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
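Both the debug and the full path report results by writing a small JSON blob to the file named by the TEST_OUTPUT_JSON environment variable, which the release-test harness is assumed to read. A minimal sketch (the fallback path and the time value are placeholders for illustration):

import json
import os

total_time = 123.4  # stand-in for e2e_end_time - e2e_start_time
out_path = os.environ.get("TEST_OUTPUT_JSON", "/tmp/release_test_out.json")
with open(out_path, "w") as f:
    f.write(json.dumps({"time": total_time, "success": 1}))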
File 3/3: new compute template (referenced above as ray_sgd_training_compute_no_gpu.yaml)
@@ -0,0 +1,26 @@
+cloud_id: cld_17WvYIBBkdgLwEUNcLeRAE
+region: us-west-2
+allowed_azs:
+  - us-west-2a
+
+max_workers: 20
+
+aws:
+  IamInstanceProfile: {"Name": "ray-autoscaler-v1"}
+  BlockDeviceMappings:
+    - DeviceName: /dev/sda1
+      Ebs:
+        VolumeSize: 500
+
+head_node_type:
+  name: head_node
+  instance_type: i3.8xlarge
+
+worker_node_types:
+  - name: memory_node
+    instance_type: i3.8xlarge
+    min_workers: 20
+    max_workers: 20
+    use_spot: false
+    resources:
+      gpu: 1
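The new compute template uses CPU-only i3.8xlarge nodes yet still lists gpu: 1 under resources, i.e. a logical resource entry rather than real GPU hardware, presumably so that code asking for a GPU-like resource can still be scheduled. How that field maps onto Ray's scheduler depends on the compute-template format, but the general mechanism is Ray's logical/custom resources, sketched below (the resource name and amount mirror the template; everything else is illustrative):

import ray

# Declare a custom resource named "gpu" on this (local) node, analogous to
# what the compute template advertises on each worker node.
ray.init(resources={"gpu": 1})


@ray.remote(resources={"gpu": 1})
def needs_logical_gpu():
    return "scheduled on a node that advertises the custom 'gpu' resource"


print(ray.get(needs_logical_gpu.remote()))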