[tune] Increase volume size for long running pbt failure (#27163)

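The long-running PBT failure release test currently fails at cluster startup with the following error: RuntimeError: botocore.exceptions.ClientError: An error occurred (InvalidBlockDeviceMapping) when calling the RunInstances operation: Volume of size 202GB is smaller than snapshot 'snap-02c4e6a0ad06cf3d6', expect size >= 400GB. This change raises the EBS VolumeSize in compute_tpl.yaml from 202 to 400 so the volume is no smaller than the snapshot, and passes FailureConfig(max_failures=-1) in the workload's RunConfig.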
2025-03-05 10:01:43 -05:00 · 2022-07-29 06:57:26 +01:00 · 2022-07-29 06:57:26 +01:00 · ee05fc94fe
commit ee05fc94fe
parent c1ac2bb80f
2 changed files with 3 additions and 2 deletions
--- a/release/long_running_distributed_tests/compute_tpl.yaml
+++ b/release/long_running_distributed_tests/compute_tpl.yaml
@@ -26,4 +26,4 @@ aws:
  BlockDeviceMappings:
    - DeviceName: /dev/sda1
      Ebs:
-        VolumeSize: 202
+        VolumeSize: 400
--- a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py
+++ b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py
@@ -5,7 +5,7 @@ import numpy as np

 import ray
 from ray import tune
-from ray.air.config import RunConfig, ScalingConfig
+from ray.air.config import RunConfig, ScalingConfig, FailureConfig
 from ray.train.examples.tune_cifar_torch_pbt_example import train_func
 from ray.train.torch import TorchConfig, TorchTrainer
 from ray.tune.schedulers import PopulationBasedTraining
@@ -69,6 +69,7 @@ tuner = Tuner(
    ),
    run_config=RunConfig(
        stop={"training_iteration": 1} if args.smoke_test else None,
+        failure_config=FailureConfig(max_failures=-1),
        callbacks=[FailureInjectorCallback(time_between_checks=90), ProgressCallback()],
    ),
 )