[tune] Fix cloud tests, mark as stable (#25583)

#25063 broke release tests that had previously been consistently stable. This PR fixes the tests and marks the Tune cloud tests as stable.
2025-03-06 02:21:39 -05:00 · 2022-06-08 17:47:54 +01:00 · 2022-06-08 17:47:54 +01:00 · c3b608f757
commit c3b608f757
parent 3296345557
2 changed files with 18 additions and 10 deletions
--- a/release/release_tests.yaml
+++ b/release/release_tests.yaml
@ -774,7 +774,7 @@
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests

-  stable: false
+  stable: true

  legacy:
    test_name: aws_no_sync_down
@ -803,7 +803,7 @@
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests

-  stable: false
+  stable: true

  legacy:
    test_name: aws_ssh_sync
@ -832,7 +832,7 @@
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests

-  stable: false
+  stable: true

  legacy:
    test_name: aws_durable_upload
@ -922,7 +922,7 @@
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests

-  stable: false
+  stable: true

  legacy:
    test_name: gcp_k8s_no_sync_down
@ -950,7 +950,7 @@
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests

-  stable: false
+  stable: true

  legacy:
    test_name: gcp_k8s_ssh_sync
@ -978,7 +978,7 @@
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests

-  stable: false
+  stable: true

  legacy:
    test_name: gcp_k8s_durable_upload
--- a/release/tune_tests/cloud_tests/workloads/run_cloud_test.py
+++ b/release/tune_tests/cloud_tests/workloads/run_cloud_test.py
@ -80,7 +80,7 @@ class TrialStub:
        local_dir: str,
        experiment_tag: str,
        _last_result: Dict[str, Any],
-        logdir: str,
+        relative_logdir: str,
        *args,
        **kwargs,
    ):
@ -91,7 +91,7 @@ class TrialStub:
        self.local_dir = local_dir
        self.experiment_tag = experiment_tag
        self.last_result = _last_result
-        self.logdir = logdir
+        self.relative_logdir = relative_logdir

        self.local_experiment_dir = None

@ -107,7 +107,7 @@ class TrialStub:

    @property
    def dirname(self):
-        return os.path.basename(self.logdir)
+        return os.path.basename(self.relative_logdir)

    @property
    def was_on_driver_node(self):
@ -308,6 +308,7 @@ def run_tune_script_for_time(
    indicator_file: str,
    no_syncer: bool,
    upload_dir: Optional[str],
+    run_start_timeout: int = 30,
 ):
    # Start run
    process = start_run(
@ -318,7 +319,9 @@ def run_tune_script_for_time(
    )
    try:
        # Wait until indicator file exists
-        wait_for_run_or_raise(process, indicator_file=indicator_file, timeout=30)
+        wait_for_run_or_raise(
+            process, indicator_file=indicator_file, timeout=run_start_timeout
+        )
        # Stop experiment (with checkpoint) after some time
        send_signal_after_wait(process, signal=signal.SIGUSR1, wait=run_time)
        # Wait until process gracefully terminated
@ -337,6 +340,7 @@ def run_resume_flow(
    upload_dir: Optional[str],
    first_run_time: int = 33,
    second_run_time: int = 33,
+    run_start_timeout: int = 30,
    before_experiments_callback: Optional[Callable[[], None]] = None,
    between_experiments_callback: Optional[Callable[[], None]] = None,
    after_experiments_callback: Optional[Callable[[], None]] = None,
@ -372,6 +376,7 @@ def run_resume_flow(
        indicator_file=indicator_file,
        no_syncer=no_syncer,
        upload_dir=upload_dir,
+        run_start_timeout=run_start_timeout,
    )

    # Before we restart, run a couple of checks
@ -1152,6 +1157,8 @@ def test_durable_upload(bucket: str):

    run_time = int(os.getenv("TUNE_RUN_TIME", "180")) or 180

+    run_start_timeout = 600 if "rllib" in os.environ["TUNE_TRAINABLE"] else 30
+
    run_resume_flow(
        experiment_name=experiment_name,
        indicator_file=indicator_file,
@ -1159,6 +1166,7 @@ def test_durable_upload(bucket: str):
        upload_dir=bucket,
        first_run_time=run_time,
        second_run_time=run_time,
+        run_start_timeout=run_start_timeout,
        before_experiments_callback=before_experiments,
        between_experiments_callback=between_experiments,
        after_experiments_callback=after_experiments,