mirror of
https://github.com/vale981/ray
synced 2025-03-04 17:41:43 -05:00
[tune] Increase volume size for long running pbt failure (#27163)
Cluster startup currently fails with the following error: RuntimeError: botocore.exceptions.ClientError: An error occurred (InvalidBlockDeviceMapping) when calling the RunInstances operation: Volume of size 202GB is smaller than snapshot 'snap-02c4e6a0ad06cf3d6', expect size >= 400GB. This change raises the configured VolumeSize from 202 to 400 so that it meets the snapshot's minimum size.
This commit is contained in:
parent
c1ac2bb80f
commit
ee05fc94fe
2 changed files with 3 additions and 2 deletions
|
@@ -26,4 +26,4 @@ aws:
|
|||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 202
|
||||
VolumeSize: 400
|
|
@@ -5,7 +5,7 @@ import numpy as np
|
|||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.air.config import RunConfig, ScalingConfig
|
||||
from ray.air.config import RunConfig, ScalingConfig, FailureConfig
|
||||
from ray.train.examples.tune_cifar_torch_pbt_example import train_func
|
||||
from ray.train.torch import TorchConfig, TorchTrainer
|
||||
from ray.tune.schedulers import PopulationBasedTraining
|
||||
|
@@ -69,6 +69,7 @@ tuner = Tuner(
|
|||
),
|
||||
run_config=RunConfig(
|
||||
stop={"training_iteration": 1} if args.smoke_test else None,
|
||||
failure_config=FailureConfig(max_failures=-1),
|
||||
callbacks=[FailureInjectorCallback(time_between_checks=90), ProgressCallback()],
|
||||
),
|
||||
)
|
||||
|
|
Loading…
Add table
Reference in a new issue