[Tests] Remove app level error from nightly tests (#16968)

* Completed

* Fix tests

* Increase the node wait timeout

Signed-off-by: SangBin Cho <rkooo567@gmail.com>
SangBin Cho 2021-07-09 12:20:42 -07:00 committed by GitHub
parent 66ea099897
commit 33e319e9d7
7 changed files with 205 additions and 257 deletions
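Every file below follows the same pattern: the app-level try/except that converted failures into {"success": 0} is deleted, so an exception now crashes the test script and the release harness detects the failure from the exit code rather than from the result JSON. A minimal before/after sketch of that pattern (condensed and hypothetical; run_test stands in for each script's actual workload):

import json
import os

def run_test():
    # Stand-in for each script's actual workload (hypothetical).
    pass

# Before: app-level handling converted any failure into a result file
# that still existed, with success set to 0.
#
#     success = 1
#     try:
#         run_test()
#     except Exception as e:
#         logging.exception(e)
#         success = 0
#     with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
#         f.write(json.dumps({"success": success}))

# After: exceptions propagate, the script exits nonzero on failure, and
# success is only ever written as 1.
run_test()
with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
    f.write(json.dumps({"success": 1}))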


@@ -185,9 +185,7 @@ if __name__ == "__main__":
if args.nbytes // npartitions > args.max_partition_size:
npartitions = args.nbytes // args.max_partition_size
success = 1
duration = []
try:
output = trial(
client,
args.data_dir,
@@ -198,18 +196,12 @@
file_path=args.file_path)
print("mean over {} trials: {} +- {}".format(
len(output), np.mean(output), np.std(output)))
except Exception as e:
import traceback
print(traceback.format_exc())
print(e)
success = 0
duration = []
print(ray.internal.internal_api.memory_summary(stats_only=True))
duration = np.mean(output)
with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
f.write(json.dumps({"duration": duration, "success": success}))
f.write(json.dumps({"duration": duration, "success": 1}))
write_header = not os.path.exists("output.csv") or os.path.getsize(
"output.csv") == 0


@@ -421,8 +421,7 @@ def parse_script_args():
def main():
args, unknown = parse_script_args()
logging.info("Received arguments: {}".format(args))
success = 1
try:
# Create test spec
test_spec = TestSpec(
num_workers=args.num_workers,
@@ -454,11 +453,8 @@ def main():
ray_scheduler=ray_dask_get,
)
print(ray.internal.internal_api.memory_summary(stats_only=True))
except Exception as e:
logging.exception(e)
success = 0
with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
f.write(json.dumps({"success": success}))
f.write(json.dumps({"success": 1}))
if __name__ == "__main__":


@@ -99,26 +99,6 @@
prepare: python wait_cluster.py 4 600
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6 --no-streaming
# Test streaming shuffle in a single node with a large partition size.
- name: streaming_shuffle_1tb_100_partitions
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_multi.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=100 --partition-size=10e9
# Test non streaming shuffle in a single node with a large partition size.
- name: non_streaming_shuffle_1tb_100_partitions
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_multi.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=100 --partition-size=10e9 --no-streaming
# Test multi nodes 1TB streaming shuffle with a large number of partitions.
- name: shuffle_1tb_1000_partition
cluster:
@@ -127,7 +107,7 @@
run:
timeout: 3000
prepare: python wait_cluster.py 20 600
prepare: python wait_cluster.py 20 900
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9
# Test multi nodes 1TB non streaming shuffle with a large number of partitions.
@@ -138,7 +118,7 @@
run:
timeout: 3000
prepare: python wait_cluster.py 20 600
prepare: python wait_cluster.py 20 900
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
# Stress test for 1TB multi node streaming shuffle.
@@ -149,7 +129,7 @@
run:
timeout: 3000
prepare: python wait_cluster.py 20 600
prepare: python wait_cluster.py 20 900
script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6
# Stress test for 1TB multi node non-streaming shuffle.
@@ -160,7 +140,7 @@
run:
timeout: 3000
prepare: python wait_cluster.py 20 600
prepare: python wait_cluster.py 20 900
script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6 --no-streaming
# Test large scale dask on ray test without spilling.
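The YAML changes drop the two single-node 1TB shuffle tests and give the remaining multi-node tests 900 seconds instead of 600 for all 20 nodes to join. A sketch of what a wait_cluster.py invoked as "python wait_cluster.py 20 900" presumably does (an assumption inferred from its arguments; the real script may differ):

import sys
import time

import ray

# Assumed CLI: python wait_cluster.py <num_nodes> <timeout_seconds>
expected, timeout = int(sys.argv[1]), float(sys.argv[2])

ray.init(address="auto")
deadline = time.time() + timeout
while True:
    alive = [n for n in ray.nodes() if n["Alive"]]
    if len(alive) >= expected:
        break
    if time.time() > deadline:
        raise TimeoutError("only {}/{} nodes joined within {}s".format(
            len(alive), expected, timeout))
    time.sleep(5)
print("Cluster ready: {} nodes".format(len(alive)))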


@@ -18,7 +18,6 @@ if __name__ == "__main__":
args = parser.parse_args()
start = time.time()
success = 1
commands = [
"python", "-m", "ray.experimental.shuffle", "--ray-address={}".format(
@@ -29,16 +28,8 @@ if __name__ == "__main__":
if args.no_streaming:
commands.append("--no-streaming")
try:
subprocess.check_call(commands)
except Exception as e:
print(f"The test failed with {e}")
success = 0
delta = time.time() - start
# Report the running time as 0 if it fails so that
# it is easy to be discovered from the graph.
if not success:
delta = 0
with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
f.write(json.dumps({"shuffle_time": delta, "success": success}))
f.write(json.dumps({"shuffle_time": delta, "success": 1}))


@@ -73,12 +73,11 @@ if __name__ == "__main__":
num_children = args.num_children
death_probability = args.death_probability
try:
# Wait until the expected number of nodes have joined the cluster.
while True:
num_nodes = len(ray.nodes())
logger.info("Waiting for nodes {}/{}".format(
num_nodes, num_remote_nodes + 1))
logger.info("Waiting for nodes {}/{}".format(num_nodes,
num_remote_nodes + 1))
if num_nodes >= num_remote_nodes + 1:
break
time.sleep(5)
@@ -118,8 +117,6 @@ if __name__ == "__main__":
result["min_iteration_time"] = min(loop_times)
result["success"] = 1
print("PASSED.")
except Exception as e:
logging.exception(e)
print("FAILED.")
with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
f.write(json.dumps(result))


@@ -179,12 +179,11 @@ if __name__ == "__main__":
is_smoke_test = args.smoke_test
result = {"success": 0}
try:
# Wait until the expected number of nodes have joined the cluster.
while True:
num_nodes = len(ray.nodes())
logger.info("Waiting for nodes {}/{}".format(
num_nodes, num_remote_nodes + 1))
logger.info("Waiting for nodes {}/{}".format(num_nodes,
num_remote_nodes + 1))
if num_nodes >= num_remote_nodes + 1:
break
time.sleep(5)
@@ -236,8 +235,6 @@ if __name__ == "__main__":
# in actors]
# ray.get(x_ids)
# logger.info("Finished after %s seconds.", time.time() - start_time)
except Exception as e:
logging.exception(e)
print("FAILED.")
with open(os.environ["TEST_OUTPUT_JSON"], "w") as out_put:
out_put.write(json.dumps(result))


@@ -87,13 +87,12 @@ def pg_launcher(pre_created_pgs, num_pgs_to_create):
if __name__ == "__main__":
result = {"success": 0}
try:
# Wait until the expected number of nodes have joined the cluster.
ray.init(address="auto")
while True:
num_nodes = len(ray.nodes())
logger.info("Waiting for nodes {}/{}".format(
num_nodes, NUM_NODES + 1))
logger.info("Waiting for nodes {}/{}".format(num_nodes, NUM_NODES + 1))
if num_nodes >= NUM_NODES + 1:
break
time.sleep(5)
@@ -155,18 +154,14 @@ if __name__ == "__main__":
assert ray.cluster_resources()[
"pg_custom"] == NUM_NODES * RESOURCE_QUANTITY
result[
"avg_pg_create_time_ms"] = total_creating_time / total_trial * 1000
result[
"avg_pg_remove_time_ms"] = total_removing_time / total_trial * 1000
result["avg_pg_create_time_ms"] = total_creating_time / total_trial * 1000
result["avg_pg_remove_time_ms"] = total_removing_time / total_trial * 1000
result["success"] = 1
print("Avg placement group creating time: "
f"{total_creating_time / total_trial * 1000} ms")
print("Avg placement group removing time: "
f"{total_removing_time / total_trial* 1000} ms")
print("PASSED.")
except Exception as e:
logger.exception(e)
print("FAILED.")
with open(os.environ["TEST_OUTPUT_JSON"], "w") as out_put:
out_put.write(json.dumps(result))
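The last three stress tests share one result contract: result starts as {"success": 0}, is flipped to 1 only after every step completes, and with the except blocks gone an exception now crashes the run before the file is written, so the harness sees a nonzero exit instead of a success-0 record. A condensed sketch of the resulting control flow (run_stress_test is a hypothetical stand-in for the workloads above):

import json
import os

def run_stress_test():
    # Stand-in for the actor-death / placement-group workloads above.
    pass

result = {"success": 0}
run_stress_test()       # any exception propagates and fails the run here
result["success"] = 1   # reached only if the workload completed
print("PASSED.")
with open(os.environ["TEST_OUTPUT_JSON"], "w") as out:
    out.write(json.dumps(result))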