mirror of
https://github.com/vale981/ray
synced 2025-03-05 18:11:42 -05:00
[Tests] Remove app level error from nightly tests (#16968)
* Completed * Fix tests * increase the node wait timeout Signed-off-by: SangBin Cho <rkooo567@gmail.com>
This commit is contained in:
parent
66ea099897
commit
33e319e9d7
7 changed files with 205 additions and 257 deletions
|
@ -185,9 +185,7 @@ if __name__ == "__main__":
|
|||
if args.nbytes // npartitions > args.max_partition_size:
|
||||
npartitions = args.nbytes // args.max_partition_size
|
||||
|
||||
success = 1
|
||||
duration = []
|
||||
try:
|
||||
output = trial(
|
||||
client,
|
||||
args.data_dir,
|
||||
|
@ -198,18 +196,12 @@ if __name__ == "__main__":
|
|||
file_path=args.file_path)
|
||||
print("mean over {} trials: {} +- {}".format(
|
||||
len(output), np.mean(output), np.std(output)))
|
||||
except Exception as e:
|
||||
import traceback
|
||||
print(traceback.format_exc())
|
||||
print(e)
|
||||
success = 0
|
||||
duration = []
|
||||
|
||||
print(ray.internal.internal_api.memory_summary(stats_only=True))
|
||||
duration = np.mean(output)
|
||||
|
||||
with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
|
||||
f.write(json.dumps({"duration": duration, "success": success}))
|
||||
f.write(json.dumps({"duration": duration, "success": 1}))
|
||||
|
||||
write_header = not os.path.exists("output.csv") or os.path.getsize(
|
||||
"output.csv") == 0
|
||||
|
|
|
@ -421,8 +421,7 @@ def parse_script_args():
|
|||
def main():
|
||||
args, unknown = parse_script_args()
|
||||
logging.info("Received arguments: {}".format(args))
|
||||
success = 1
|
||||
try:
|
||||
|
||||
# Create test spec
|
||||
test_spec = TestSpec(
|
||||
num_workers=args.num_workers,
|
||||
|
@ -454,11 +453,8 @@ def main():
|
|||
ray_scheduler=ray_dask_get,
|
||||
)
|
||||
print(ray.internal.internal_api.memory_summary(stats_only=True))
|
||||
except Exception as e:
|
||||
logging.exception(e)
|
||||
success = 0
|
||||
with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
|
||||
f.write(json.dumps({"success": success}))
|
||||
f.write(json.dumps({"success": 1}))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -99,26 +99,6 @@
|
|||
prepare: python wait_cluster.py 4 600
|
||||
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6 --no-streaming
|
||||
|
||||
# Test streaming shuffle in a single node with a large partition size.
|
||||
- name: streaming_shuffle_1tb_100_partitions
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_multi.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
script: python shuffle/shuffle_test.py --num-partitions=100 --partition-size=10e9
|
||||
|
||||
# Test non streaming shuffle in a single node with a large partition size.
|
||||
- name: non_streaming_shuffle_1tb_100_partitions
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_multi.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
script: python shuffle/shuffle_test.py --num-partitions=100 --partition-size=10e9 --no-streaming
|
||||
|
||||
# Test multi nodes 1TB streaming shuffle with a large number of partitions.
|
||||
- name: shuffle_1tb_1000_partition
|
||||
cluster:
|
||||
|
@ -127,7 +107,7 @@
|
|||
|
||||
run:
|
||||
timeout: 3000
|
||||
prepare: python wait_cluster.py 20 600
|
||||
prepare: python wait_cluster.py 20 900
|
||||
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9
|
||||
|
||||
# Test multi nodes 1TB non streaming shuffle with a large number of partitions.
|
||||
|
@ -138,7 +118,7 @@
|
|||
|
||||
run:
|
||||
timeout: 3000
|
||||
prepare: python wait_cluster.py 20 600
|
||||
prepare: python wait_cluster.py 20 900
|
||||
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
|
||||
|
||||
# Stress test for 1TB multi node streaming shuffle.
|
||||
|
@ -149,7 +129,7 @@
|
|||
|
||||
run:
|
||||
timeout: 3000
|
||||
prepare: python wait_cluster.py 20 600
|
||||
prepare: python wait_cluster.py 20 900
|
||||
script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6
|
||||
|
||||
# Stress test for 1TB multi node non-streaming shuffle.
|
||||
|
@ -160,7 +140,7 @@
|
|||
|
||||
run:
|
||||
timeout: 3000
|
||||
prepare: python wait_cluster.py 20 600
|
||||
prepare: python wait_cluster.py 20 900
|
||||
script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6 --no-streaming
|
||||
|
||||
# Test large scale dask on ray test without spilling.
|
||||
|
|
|
@ -18,7 +18,6 @@ if __name__ == "__main__":
|
|||
args = parser.parse_args()
|
||||
|
||||
start = time.time()
|
||||
success = 1
|
||||
|
||||
commands = [
|
||||
"python", "-m", "ray.experimental.shuffle", "--ray-address={}".format(
|
||||
|
@ -29,16 +28,8 @@ if __name__ == "__main__":
|
|||
if args.no_streaming:
|
||||
commands.append("--no-streaming")
|
||||
|
||||
try:
|
||||
subprocess.check_call(commands)
|
||||
except Exception as e:
|
||||
print(f"The test failed with {e}")
|
||||
success = 0
|
||||
delta = time.time() - start
|
||||
|
||||
# Report the running time as 0 if it fails so that
|
||||
# it is easy to be discovered from the graph.
|
||||
if not success:
|
||||
delta = 0
|
||||
with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
|
||||
f.write(json.dumps({"shuffle_time": delta, "success": success}))
|
||||
f.write(json.dumps({"shuffle_time": delta, "success": 1}))
|
||||
|
|
|
@ -73,12 +73,11 @@ if __name__ == "__main__":
|
|||
num_children = args.num_children
|
||||
death_probability = args.death_probability
|
||||
|
||||
try:
|
||||
# Wait until the expected number of nodes have joined the cluster.
|
||||
while True:
|
||||
num_nodes = len(ray.nodes())
|
||||
logger.info("Waiting for nodes {}/{}".format(
|
||||
num_nodes, num_remote_nodes + 1))
|
||||
logger.info("Waiting for nodes {}/{}".format(num_nodes,
|
||||
num_remote_nodes + 1))
|
||||
if num_nodes >= num_remote_nodes + 1:
|
||||
break
|
||||
time.sleep(5)
|
||||
|
@ -118,8 +117,6 @@ if __name__ == "__main__":
|
|||
result["min_iteration_time"] = min(loop_times)
|
||||
result["success"] = 1
|
||||
print("PASSED.")
|
||||
except Exception as e:
|
||||
logging.exception(e)
|
||||
print("FAILED.")
|
||||
|
||||
with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
|
||||
f.write(json.dumps(result))
|
||||
|
|
|
@ -179,12 +179,11 @@ if __name__ == "__main__":
|
|||
is_smoke_test = args.smoke_test
|
||||
|
||||
result = {"success": 0}
|
||||
try:
|
||||
# Wait until the expected number of nodes have joined the cluster.
|
||||
while True:
|
||||
num_nodes = len(ray.nodes())
|
||||
logger.info("Waiting for nodes {}/{}".format(
|
||||
num_nodes, num_remote_nodes + 1))
|
||||
logger.info("Waiting for nodes {}/{}".format(num_nodes,
|
||||
num_remote_nodes + 1))
|
||||
if num_nodes >= num_remote_nodes + 1:
|
||||
break
|
||||
time.sleep(5)
|
||||
|
@ -236,8 +235,6 @@ if __name__ == "__main__":
|
|||
# in actors]
|
||||
# ray.get(x_ids)
|
||||
# logger.info("Finished after %s seconds.", time.time() - start_time)
|
||||
except Exception as e:
|
||||
logging.exception(e)
|
||||
print("FAILED.")
|
||||
|
||||
with open(os.environ["TEST_OUTPUT_JSON"], "w") as out_put:
|
||||
out_put.write(json.dumps(result))
|
||||
|
|
|
@ -87,13 +87,12 @@ def pg_launcher(pre_created_pgs, num_pgs_to_create):
|
|||
|
||||
if __name__ == "__main__":
|
||||
result = {"success": 0}
|
||||
try:
|
||||
|
||||
# Wait until the expected number of nodes have joined the cluster.
|
||||
ray.init(address="auto")
|
||||
while True:
|
||||
num_nodes = len(ray.nodes())
|
||||
logger.info("Waiting for nodes {}/{}".format(
|
||||
num_nodes, NUM_NODES + 1))
|
||||
logger.info("Waiting for nodes {}/{}".format(num_nodes, NUM_NODES + 1))
|
||||
if num_nodes >= NUM_NODES + 1:
|
||||
break
|
||||
time.sleep(5)
|
||||
|
@ -155,18 +154,14 @@ if __name__ == "__main__":
|
|||
assert ray.cluster_resources()[
|
||||
"pg_custom"] == NUM_NODES * RESOURCE_QUANTITY
|
||||
|
||||
result[
|
||||
"avg_pg_create_time_ms"] = total_creating_time / total_trial * 1000
|
||||
result[
|
||||
"avg_pg_remove_time_ms"] = total_removing_time / total_trial * 1000
|
||||
result["avg_pg_create_time_ms"] = total_creating_time / total_trial * 1000
|
||||
result["avg_pg_remove_time_ms"] = total_removing_time / total_trial * 1000
|
||||
result["success"] = 1
|
||||
print("Avg placement group creating time: "
|
||||
f"{total_creating_time / total_trial * 1000} ms")
|
||||
print("Avg placement group removing time: "
|
||||
f"{total_removing_time / total_trial* 1000} ms")
|
||||
print("PASSED.")
|
||||
except Exception as e:
|
||||
logger.exception(e)
|
||||
print("FAILED.")
|
||||
|
||||
with open(os.environ["TEST_OUTPUT_JSON"], "w") as out_put:
|
||||
out_put.write(json.dumps(result))
|
||||
|
|
Loading…
Add table
Reference in a new issue