[release] update torch_tune_serve_test to use anyscale connect (#16754)

* [release] update torch_tune_serve_test to use anyscale connect * use download_results to download model checkpoint * clean up code to support both OSS and Anyscale
2025-03-06 02:21:39 -05:00 · 2021-07-06 19:02:50 -07:00 · 2021-07-06 19:02:50 -07:00 · 23088bd7ea
commit 23088bd7ea
parent 7318a212fb
2 changed files with 42 additions and 4 deletions
--- a/release/golden_notebook_tests/golden_notebook_tests.yaml
+++ b/release/golden_notebook_tests/golden_notebook_tests.yaml
@ -26,6 +26,7 @@
    compute_template: gpu_tpl.yaml
  run:
    use_connect: True
    timeout: 1800
    script: python workloads/torch_tune_serve_test.py
--- a/release/golden_notebook_tests/workloads/torch_tune_serve_test.py
+++ b/release/golden_notebook_tests/workloads/torch_tune_serve_test.py
@ -17,6 +17,13 @@ from torch.utils.data import DataLoader, Subset
 from torchvision.datasets import MNIST
 def _is_anyscale_connect():
    address = os.environ.get("RAY_ADDRESS")
    is_anyscale_connect = address is not None and address.startswith(
        "anyscale://")
    return is_anyscale_connect
 def load_mnist_data(train: bool, download: bool):
    transform = transforms.Compose(
        [transforms.ToTensor(),
@ -85,8 +92,33 @@ def train_mnist(test_mode=False, num_workers=1, use_gpu=False):
        checkpoint_at_end=True)
-def get_best_model(best_model_checkpoint_path):
+def get_remote_model(remote_model_checkpoint_path):
-    model_state = torch.load(best_model_checkpoint_path)
+    if _is_anyscale_connect():
        # Download training results to local client.
        local_dir = "~/ray_results"
        # TODO(matt): remove the following line when Anyscale Connect
        # supports tilde expansion.
        local_dir = os.path.expanduser(local_dir)
        remote_dir = "/home/ray/ray_results/"
        ray.client().download_results(
            local_dir=local_dir, remote_dir=remote_dir)
        # Compute local path.
        rel_model_checkpoint_path = os.path.relpath(
            remote_model_checkpoint_path, remote_dir)
        local_model_checkpoint_path = os.path.join(local_dir,
                                                   rel_model_checkpoint_path)
        # Load model reference.
        return get_model(local_model_checkpoint_path)
    else:
        get_best_model_remote = ray.remote(get_model)
        return ray.get(
            get_best_model_remote.remote(remote_model_checkpoint_path))
 def get_model(model_checkpoint_path):
    model_state = torch.load(model_checkpoint_path)
    model = ResNet18(None)
    model.conv1 = nn.Conv2d(
@ -184,7 +216,12 @@ if __name__ == "__main__":
    start = time.time()
-    ray.client("anyscale://").connect()
+    client_builder = ray.client()
    if (_is_anyscale_connect()):
        job_name = os.environ.get("RAY_JOB_NAME", "torch_tune_serve_test")
        client_builder.job_name(job_name)
    client_builder.connect()
    num_workers = 2
    use_gpu = True
@ -193,7 +230,7 @@ if __name__ == "__main__":
    print("Retrieving best model.")
    best_checkpoint = analysis.best_checkpoint
-    model_id = get_best_model(best_checkpoint)
+    model_id = get_remote_model(best_checkpoint)
    print("Setting up Serve.")
    setup_serve(model_id)