Support better infra failure detection + stable flag (#18202)

This commit is contained in:
SangBin Cho 2021-08-30 10:51:03 -07:00 committed by GitHub
parent 1adce7da4e
commit dfbad8668a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 15 additions and 3 deletions

View file

@ -277,6 +277,10 @@ def maybe_fetch_api_token():
"anyscale-token20210505220406333800000001-BcUuKB")["SecretString"]
class PrepareCommandRuntimeError(RuntimeError):
    """Raised when a prepare command returns a non-success status code."""
class ReleaseTestTimeoutError(RuntimeError):
    """RuntimeError subclass for release-test timeouts.

    NOTE(review): presumably the base of the more specific ``*TimeoutError``
    types; confirm against their definitions.
    """
@ -876,8 +880,12 @@ def wait_for_session_command_to_complete(create_session_command_result,
runtime = time.time() - start_wait
if status_code != 0:
raise RuntimeError(
f"Command returned non-success status: {status_code}")
if state_str == "CMD_RUN":
raise RuntimeError(
f"Command returned non-success status: {status_code}")
elif state_str == "CMD_PREPARE":
raise PrepareCommandRuntimeError(
f"Prepare command returned non-success status: {status_code}")
return status_code, runtime
@ -1193,6 +1201,7 @@ def run_test_config(
results["_runtime"] = runtime
results["_session_url"] = session_url
results["_commit_url"] = commit_url
results["_stable"] = test_config.get("stable", True)
result_queue.put(
State(
"END",
@ -1452,7 +1461,8 @@ def run_test_config(
runtime = 0
elif (isinstance(e, PrepareCommandTimeoutError)
or isinstance(e, FileSyncTimeoutError)
or isinstance(e, SessionTimeoutError)):
or isinstance(e, SessionTimeoutError)
or isinstance(e, PrepareCommandRuntimeError)):
timeout_type = "infra_timeout"
runtime = None
elif isinstance(e, RuntimeError):
@ -1467,6 +1477,7 @@ def run_test_config(
results["_runtime"] = runtime
results["_session_url"] = session_url
results["_commit_url"] = commit_url
results["_stable"] = test_config.get("stable", True)
result_queue.put(
State(
"END", time.time(), {

View file

@ -142,6 +142,7 @@
# Stress test for 1TB multi-node non-streaming shuffle.
- name: non_streaming_shuffle_1tb_5000_partitions
stable: False
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_large_scale.yaml