[tune] Fix cloud tests, mark as stable (#25583)

#25063 broke the release tests, which had been consistently stable before that change. This PR fixes the tests and marks the Tune cloud tests as stable.
This commit is contained in:
Kai Fricke 2022-06-08 17:47:54 +01:00 committed by GitHub
parent 3296345557
commit c3b608f757
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 18 additions and 10 deletions

View file

@ -774,7 +774,7 @@
group: Tune cloud tests
working_dir: tune_tests/cloud_tests
stable: false
stable: true
legacy:
test_name: aws_no_sync_down
@ -803,7 +803,7 @@
group: Tune cloud tests
working_dir: tune_tests/cloud_tests
stable: false
stable: true
legacy:
test_name: aws_ssh_sync
@ -832,7 +832,7 @@
group: Tune cloud tests
working_dir: tune_tests/cloud_tests
stable: false
stable: true
legacy:
test_name: aws_durable_upload
@ -922,7 +922,7 @@
group: Tune cloud tests
working_dir: tune_tests/cloud_tests
stable: false
stable: true
legacy:
test_name: gcp_k8s_no_sync_down
@ -950,7 +950,7 @@
group: Tune cloud tests
working_dir: tune_tests/cloud_tests
stable: false
stable: true
legacy:
test_name: gcp_k8s_ssh_sync
@ -978,7 +978,7 @@
group: Tune cloud tests
working_dir: tune_tests/cloud_tests
stable: false
stable: true
legacy:
test_name: gcp_k8s_durable_upload

View file

@ -80,7 +80,7 @@ class TrialStub:
local_dir: str,
experiment_tag: str,
_last_result: Dict[str, Any],
logdir: str,
relative_logdir: str,
*args,
**kwargs,
):
@ -91,7 +91,7 @@ class TrialStub:
self.local_dir = local_dir
self.experiment_tag = experiment_tag
self.last_result = _last_result
self.logdir = logdir
self.relative_logdir = relative_logdir
self.local_experiment_dir = None
@ -107,7 +107,7 @@ class TrialStub:
@property
def dirname(self):
return os.path.basename(self.logdir)
return os.path.basename(self.relative_logdir)
@property
def was_on_driver_node(self):
@ -308,6 +308,7 @@ def run_tune_script_for_time(
indicator_file: str,
no_syncer: bool,
upload_dir: Optional[str],
run_start_timeout: int = 30,
):
# Start run
process = start_run(
@ -318,7 +319,9 @@ def run_tune_script_for_time(
)
try:
# Wait until indicator file exists
wait_for_run_or_raise(process, indicator_file=indicator_file, timeout=30)
wait_for_run_or_raise(
process, indicator_file=indicator_file, timeout=run_start_timeout
)
# Stop experiment (with checkpoint) after some time
send_signal_after_wait(process, signal=signal.SIGUSR1, wait=run_time)
# Wait until process gracefully terminated
@ -337,6 +340,7 @@ def run_resume_flow(
upload_dir: Optional[str],
first_run_time: int = 33,
second_run_time: int = 33,
run_start_timeout: int = 30,
before_experiments_callback: Optional[Callable[[], None]] = None,
between_experiments_callback: Optional[Callable[[], None]] = None,
after_experiments_callback: Optional[Callable[[], None]] = None,
@ -372,6 +376,7 @@ def run_resume_flow(
indicator_file=indicator_file,
no_syncer=no_syncer,
upload_dir=upload_dir,
run_start_timeout=run_start_timeout,
)
# Before we restart, run a couple of checks
@ -1152,6 +1157,8 @@ def test_durable_upload(bucket: str):
run_time = int(os.getenv("TUNE_RUN_TIME", "180")) or 180
run_start_timeout = 600 if "rllib" in os.environ["TUNE_TRAINABLE"] else 30
run_resume_flow(
experiment_name=experiment_name,
indicator_file=indicator_file,
@ -1159,6 +1166,7 @@ def test_durable_upload(bucket: str):
upload_dir=bucket,
first_run_time=run_time,
second_run_time=run_time,
run_start_timeout=run_start_timeout,
before_experiments_callback=before_experiments,
between_experiments_callback=between_experiments,
after_experiments_callback=after_experiments,