mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
Support better infra failure detection + stable flag (#18202)
This commit is contained in:
parent
1adce7da4e
commit
dfbad8668a
2 changed files with 15 additions and 3 deletions
|
@ -277,6 +277,10 @@ def maybe_fetch_api_token():
|
|||
"anyscale-token20210505220406333800000001-BcUuKB")["SecretString"]
|
||||
|
||||
|
||||
class PrepareCommandRuntimeError(RuntimeError):
|
||||
pass
|
||||
|
||||
|
||||
class ReleaseTestTimeoutError(RuntimeError):
|
||||
pass
|
||||
|
||||
|
@ -876,8 +880,12 @@ def wait_for_session_command_to_complete(create_session_command_result,
|
|||
runtime = time.time() - start_wait
|
||||
|
||||
if status_code != 0:
|
||||
raise RuntimeError(
|
||||
f"Command returned non-success status: {status_code}")
|
||||
if state_str == "CMD_RUN":
|
||||
raise RuntimeError(
|
||||
f"Command returned non-success status: {status_code}")
|
||||
elif state_str == "CMD_PREPARE":
|
||||
raise PrepareCommandRuntimeError(
|
||||
f"Prepare command returned non-success status: {status_code}")
|
||||
|
||||
return status_code, runtime
|
||||
|
||||
|
@ -1193,6 +1201,7 @@ def run_test_config(
|
|||
results["_runtime"] = runtime
|
||||
results["_session_url"] = session_url
|
||||
results["_commit_url"] = commit_url
|
||||
results["_stable"] = test_config.get("stable", True)
|
||||
result_queue.put(
|
||||
State(
|
||||
"END",
|
||||
|
@ -1452,7 +1461,8 @@ def run_test_config(
|
|||
runtime = 0
|
||||
elif (isinstance(e, PrepareCommandTimeoutError)
|
||||
or isinstance(e, FileSyncTimeoutError)
|
||||
or isinstance(e, SessionTimeoutError)):
|
||||
or isinstance(e, SessionTimeoutError)
|
||||
or isinstance(e, PrepareCommandRuntimeError)):
|
||||
timeout_type = "infra_timeout"
|
||||
runtime = None
|
||||
elif isinstance(e, RuntimeError):
|
||||
|
@ -1467,6 +1477,7 @@ def run_test_config(
|
|||
results["_runtime"] = runtime
|
||||
results["_session_url"] = session_url
|
||||
results["_commit_url"] = commit_url
|
||||
results["_stable"] = test_config.get("stable", True)
|
||||
result_queue.put(
|
||||
State(
|
||||
"END", time.time(), {
|
||||
|
|
|
@ -142,6 +142,7 @@
|
|||
|
||||
# Stress test for 1TB multi node non-streaming shuffle.
|
||||
- name: non_streaming_shuffle_1tb_5000_partitions
|
||||
stable: False
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
||||
|
|
Loading…
Add table
Reference in a new issue