[Tests] Remove app level error from nightly tests (#16968)

* Completed

* Fix tests

* Increase the node wait timeout

Signed-off-by: SangBin Cho <rkooo567@gmail.com>
SangBin Cho 2021-07-09 12:20:42 -07:00 committed by GitHub
parent 66ea099897
commit 33e319e9d7
7 changed files with 205 additions and 257 deletions
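Every file below follows the same pattern: the app-level try/except that converted failures into {"success": 0} is deleted, so an exception now crashes the test script and the release harness detects the failure from the exit code rather than from the result JSON. A minimal before/after sketch of that pattern (condensed and hypothetical; run_test stands in for each script's actual workload):

import json
import os

def run_test():
    # Stand-in for each script's actual workload (hypothetical).
    pass

# Before: app-level handling converted any failure into a result file
# that still existed, with success set to 0.
#
#     success = 1
#     try:
#         run_test()
#     except Exception as e:
#         logging.exception(e)
#         success = 0
#     with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
#         f.write(json.dumps({"success": success}))

# After: exceptions propagate, the script exits nonzero on failure, and
# success is only ever written as 1.
run_test()
with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
    f.write(json.dumps({"success": 1}))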


@@ -185,9 +185,7 @@ if __name__ == "__main__":
if args.nbytes // npartitions > args.max_partition_size:
npartitions = args.nbytes // args.max_partition_size
success = 1
duration = []
try:
output = trial(
client,
args.data_dir,
@@ -198,18 +196,12 @@
file_path=args.file_path)
print("mean over {} trials: {} +- {}".format(
len(output), np.mean(output), np.std(output)))
except Exception as e:
import traceback
print(traceback.format_exc())
print(e)
success = 0
duration = []
print(ray.internal.internal_api.memory_summary(stats_only=True))
duration = np.mean(output)
with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
f.write(json.dumps({"duration": duration, "success": success}))
f.write(json.dumps({"duration": duration, "success": 1}))
write_header = not os.path.exists("output.csv") or os.path.getsize(
"output.csv") == 0


@@ -421,8 +421,7 @@ def parse_script_args():
def main():
args, unknown = parse_script_args()
logging.info("Received arguments: {}".format(args))
success = 1
try:
# Create test spec
test_spec = TestSpec(
num_workers=args.num_workers,
@@ -454,11 +453,8 @@ def main():
ray_scheduler=ray_dask_get,
)
print(ray.internal.internal_api.memory_summary(stats_only=True))
except Exception as e:
logging.exception(e)
success = 0
with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
f.write(json.dumps({"success": success}))
f.write(json.dumps({"success": 1}))
if __name__ == "__main__":


@@ -99,26 +99,6 @@
prepare: python wait_cluster.py 4 600
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6 --no-streaming
# Test streaming shuffle in a single node with a large partition size.
- name: streaming_shuffle_1tb_100_partitions
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_multi.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=100 --partition-size=10e9
# Test non streaming shuffle in a single node with a large partition size.
- name: non_streaming_shuffle_1tb_100_partitions
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_multi.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=100 --partition-size=10e9 --no-streaming
# Test multi nodes 1TB streaming shuffle with a large number of partitions.
- name: shuffle_1tb_1000_partition
cluster:
@@ -127,7 +107,7 @@
run:
timeout: 3000
prepare: python wait_cluster.py 20 600
prepare: python wait_cluster.py 20 900
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9
# Test multi nodes 1TB non streaming shuffle with a large number of partitions.
@@ -138,7 +118,7 @@
run:
timeout: 3000
prepare: python wait_cluster.py 20 600
prepare: python wait_cluster.py 20 900
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
# Stress test for 1TB multi node streaming shuffle.
@@ -149,7 +129,7 @@
run:
timeout: 3000
prepare: python wait_cluster.py 20 600
prepare: python wait_cluster.py 20 900
script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6
# Stress test for 1TB multi node non-streaming shuffle.
@@ -160,7 +140,7 @@
run:
timeout: 3000
prepare: python wait_cluster.py 20 600
prepare: python wait_cluster.py 20 900
script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6 --no-streaming
# Test large scale dask on ray test without spilling.
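The YAML changes drop the two single-node 1TB shuffle tests and give the remaining multi-node tests 900 seconds instead of 600 for all 20 nodes to join. A sketch of what a wait_cluster.py invoked as "python wait_cluster.py 20 900" presumably does (an assumption inferred from its arguments; the real script may differ):

import sys
import time

import ray

# Assumed CLI: python wait_cluster.py <num_nodes> <timeout_seconds>
expected, timeout = int(sys.argv[1]), float(sys.argv[2])

ray.init(address="auto")
deadline = time.time() + timeout
while True:
    alive = [n for n in ray.nodes() if n["Alive"]]
    if len(alive) >= expected:
        break
    if time.time() > deadline:
        raise TimeoutError("only {}/{} nodes joined within {}s".format(
            len(alive), expected, timeout))
    time.sleep(5)
print("Cluster ready: {} nodes".format(len(alive)))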


@@ -18,7 +18,6 @@ if __name__ == "__main__":
args = parser.parse_args()
start = time.time()
success = 1
commands = [
"python", "-m", "ray.experimental.shuffle", "--ray-address={}".format(
@@ -29,16 +28,8 @@ if __name__ == "__main__":
if args.no_streaming:
commands.append("--no-streaming")
try:
subprocess.check_call(commands)
except Exception as e:
print(f"The test failed with {e}")
success = 0
delta = time.time() - start
# Report the running time as 0 if it fails so that
# it is easy to be discovered from the graph.
if not success:
delta = 0
with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
f.write(json.dumps({"shuffle_time": delta, "success": success}))
f.write(json.dumps({"shuffle_time": delta, "success": 1}))


@@ -73,12 +73,11 @@ if __name__ == "__main__":
num_children = args.num_children
death_probability = args.death_probability
try:
# Wait until the expected number of nodes have joined the cluster.
while True:
num_nodes = len(ray.nodes())
logger.info("Waiting for nodes {}/{}".format(
num_nodes, num_remote_nodes + 1))
logger.info("Waiting for nodes {}/{}".format(num_nodes,
num_remote_nodes + 1))
if num_nodes >= num_remote_nodes + 1:
break
time.sleep(5)
@@ -118,8 +117,6 @@ if __name__ == "__main__":
result["min_iteration_time"] = min(loop_times)
result["success"] = 1
print("PASSED.")
except Exception as e:
logging.exception(e)
print("FAILED.")
with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
f.write(json.dumps(result))


@@ -179,12 +179,11 @@ if __name__ == "__main__":
is_smoke_test = args.smoke_test
result = {"success": 0}
try:
# Wait until the expected number of nodes have joined the cluster.
while True:
num_nodes = len(ray.nodes())
logger.info("Waiting for nodes {}/{}".format(
num_nodes, num_remote_nodes + 1))
logger.info("Waiting for nodes {}/{}".format(num_nodes,
num_remote_nodes + 1))
if num_nodes >= num_remote_nodes + 1:
break
time.sleep(5)
@@ -236,8 +235,6 @@ if __name__ == "__main__":
# in actors]
# ray.get(x_ids)
# logger.info("Finished after %s seconds.", time.time() - start_time)
except Exception as e:
logging.exception(e)
print("FAILED.")
with open(os.environ["TEST_OUTPUT_JSON"], "w") as out_put:
out_put.write(json.dumps(result))


@@ -87,13 +87,12 @@ def pg_launcher(pre_created_pgs, num_pgs_to_create):
if __name__ == "__main__":
result = {"success": 0}
try:
# Wait until the expected number of nodes have joined the cluster.
ray.init(address="auto")
while True:
num_nodes = len(ray.nodes())
logger.info("Waiting for nodes {}/{}".format(
num_nodes, NUM_NODES + 1))
logger.info("Waiting for nodes {}/{}".format(num_nodes, NUM_NODES + 1))
if num_nodes >= NUM_NODES + 1:
break
time.sleep(5)
@@ -155,18 +154,14 @@ if __name__ == "__main__":
assert ray.cluster_resources()[
"pg_custom"] == NUM_NODES * RESOURCE_QUANTITY
result[
"avg_pg_create_time_ms"] = total_creating_time / total_trial * 1000
result[
"avg_pg_remove_time_ms"] = total_removing_time / total_trial * 1000
result["avg_pg_create_time_ms"] = total_creating_time / total_trial * 1000
result["avg_pg_remove_time_ms"] = total_removing_time / total_trial * 1000
result["success"] = 1
print("Avg placement group creating time: "
f"{total_creating_time / total_trial * 1000} ms")
print("Avg placement group removing time: "
f"{total_removing_time / total_trial* 1000} ms")
print("PASSED.")
except Exception as e:
logger.exception(e)
print("FAILED.")
with open(os.environ["TEST_OUTPUT_JSON"], "w") as out_put:
out_put.write(json.dumps(result))
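The last three stress tests share one result contract: result starts as {"success": 0}, is flipped to 1 only after every step completes, and with the except blocks gone an exception now crashes the run before the file is written, so the harness sees a nonzero exit instead of a success-0 record. A condensed sketch of the resulting control flow (run_stress_test is a hypothetical stand-in for the workloads above):

import json
import os

def run_stress_test():
    # Stand-in for the actor-death / placement-group workloads above.
    pass

result = {"success": 0}
run_stress_test()       # any exception propagates and fails the run here
result["success"] = 1   # reached only if the workload completed
print("PASSED.")
with open(os.environ["TEST_OUTPUT_JSON"], "w") as out:
    out.write(json.dumps(result))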