mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
[tune] Fix cloud tests, mark as stable (#25583)
#25063 broke the release tests, which had been consistently stable before that change. This PR fixes the tests and marks the Tune cloud tests as stable.
This commit is contained in:
parent
3296345557
commit
c3b608f757
2 changed files with 18 additions and 10 deletions
|
@ -774,7 +774,7 @@
|
|||
group: Tune cloud tests
|
||||
working_dir: tune_tests/cloud_tests
|
||||
|
||||
stable: false
|
||||
stable: true
|
||||
|
||||
legacy:
|
||||
test_name: aws_no_sync_down
|
||||
|
@ -803,7 +803,7 @@
|
|||
group: Tune cloud tests
|
||||
working_dir: tune_tests/cloud_tests
|
||||
|
||||
stable: false
|
||||
stable: true
|
||||
|
||||
legacy:
|
||||
test_name: aws_ssh_sync
|
||||
|
@ -832,7 +832,7 @@
|
|||
group: Tune cloud tests
|
||||
working_dir: tune_tests/cloud_tests
|
||||
|
||||
stable: false
|
||||
stable: true
|
||||
|
||||
legacy:
|
||||
test_name: aws_durable_upload
|
||||
|
@ -922,7 +922,7 @@
|
|||
group: Tune cloud tests
|
||||
working_dir: tune_tests/cloud_tests
|
||||
|
||||
stable: false
|
||||
stable: true
|
||||
|
||||
legacy:
|
||||
test_name: gcp_k8s_no_sync_down
|
||||
|
@ -950,7 +950,7 @@
|
|||
group: Tune cloud tests
|
||||
working_dir: tune_tests/cloud_tests
|
||||
|
||||
stable: false
|
||||
stable: true
|
||||
|
||||
legacy:
|
||||
test_name: gcp_k8s_ssh_sync
|
||||
|
@ -978,7 +978,7 @@
|
|||
group: Tune cloud tests
|
||||
working_dir: tune_tests/cloud_tests
|
||||
|
||||
stable: false
|
||||
stable: true
|
||||
|
||||
legacy:
|
||||
test_name: gcp_k8s_durable_upload
|
||||
|
|
|
@ -80,7 +80,7 @@ class TrialStub:
|
|||
local_dir: str,
|
||||
experiment_tag: str,
|
||||
_last_result: Dict[str, Any],
|
||||
logdir: str,
|
||||
relative_logdir: str,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
|
@ -91,7 +91,7 @@ class TrialStub:
|
|||
self.local_dir = local_dir
|
||||
self.experiment_tag = experiment_tag
|
||||
self.last_result = _last_result
|
||||
self.logdir = logdir
|
||||
self.relative_logdir = relative_logdir
|
||||
|
||||
self.local_experiment_dir = None
|
||||
|
||||
|
@ -107,7 +107,7 @@ class TrialStub:
|
|||
|
||||
@property
|
||||
def dirname(self):
|
||||
return os.path.basename(self.logdir)
|
||||
return os.path.basename(self.relative_logdir)
|
||||
|
||||
@property
|
||||
def was_on_driver_node(self):
|
||||
|
@ -308,6 +308,7 @@ def run_tune_script_for_time(
|
|||
indicator_file: str,
|
||||
no_syncer: bool,
|
||||
upload_dir: Optional[str],
|
||||
run_start_timeout: int = 30,
|
||||
):
|
||||
# Start run
|
||||
process = start_run(
|
||||
|
@ -318,7 +319,9 @@ def run_tune_script_for_time(
|
|||
)
|
||||
try:
|
||||
# Wait until indicator file exists
|
||||
wait_for_run_or_raise(process, indicator_file=indicator_file, timeout=30)
|
||||
wait_for_run_or_raise(
|
||||
process, indicator_file=indicator_file, timeout=run_start_timeout
|
||||
)
|
||||
# Stop experiment (with checkpoint) after some time
|
||||
send_signal_after_wait(process, signal=signal.SIGUSR1, wait=run_time)
|
||||
# Wait until process gracefully terminated
|
||||
|
@ -337,6 +340,7 @@ def run_resume_flow(
|
|||
upload_dir: Optional[str],
|
||||
first_run_time: int = 33,
|
||||
second_run_time: int = 33,
|
||||
run_start_timeout: int = 30,
|
||||
before_experiments_callback: Optional[Callable[[], None]] = None,
|
||||
between_experiments_callback: Optional[Callable[[], None]] = None,
|
||||
after_experiments_callback: Optional[Callable[[], None]] = None,
|
||||
|
@ -372,6 +376,7 @@ def run_resume_flow(
|
|||
indicator_file=indicator_file,
|
||||
no_syncer=no_syncer,
|
||||
upload_dir=upload_dir,
|
||||
run_start_timeout=run_start_timeout,
|
||||
)
|
||||
|
||||
# Before we restart, run a couple of checks
|
||||
|
@ -1152,6 +1157,8 @@ def test_durable_upload(bucket: str):
|
|||
|
||||
run_time = int(os.getenv("TUNE_RUN_TIME", "180")) or 180
|
||||
|
||||
run_start_timeout = 600 if "rllib" in os.environ["TUNE_TRAINABLE"] else 30
|
||||
|
||||
run_resume_flow(
|
||||
experiment_name=experiment_name,
|
||||
indicator_file=indicator_file,
|
||||
|
@ -1159,6 +1166,7 @@ def test_durable_upload(bucket: str):
|
|||
upload_dir=bucket,
|
||||
first_run_time=run_time,
|
||||
second_run_time=run_time,
|
||||
run_start_timeout=run_start_timeout,
|
||||
before_experiments_callback=before_experiments,
|
||||
between_experiments_callback=between_experiments,
|
||||
after_experiments_callback=after_experiments,
|
||||
|
|
Loading…
Add table
Reference in a new issue