mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00
[tune] Better error msg for grpc resource exhausted error. (#22806)
This commit is contained in:
parent
bf1bd293f4
commit
99d5288bbd
2 changed files with 48 additions and 4 deletions
|
@ -1,11 +1,12 @@
|
||||||
from functools import partial
|
|
||||||
from typing import Dict, Sequence, Any
|
|
||||||
import copy
|
import copy
|
||||||
|
from functools import partial
|
||||||
|
import grpc
|
||||||
import inspect
|
import inspect
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from pickle import PicklingError
|
from pickle import PicklingError
|
||||||
|
import traceback
|
||||||
|
from typing import Dict, Sequence, Any
|
||||||
|
|
||||||
from ray.tune.error import TuneError
|
from ray.tune.error import TuneError
|
||||||
from ray.tune.registry import register_trainable
|
from ray.tune.registry import register_trainable
|
||||||
|
@ -121,7 +122,21 @@ class Experiment:
|
||||||
"checkpointable function. You can specify checkpoints "
|
"checkpointable function. You can specify checkpoints "
|
||||||
"within your trainable function."
|
"within your trainable function."
|
||||||
)
|
)
|
||||||
self._run_identifier = Experiment.register_if_needed(run)
|
try:
|
||||||
|
self._run_identifier = Experiment.register_if_needed(run)
|
||||||
|
except grpc.RpcError as e:
|
||||||
|
if e.code() == grpc.StatusCode.RESOURCE_EXHAUSTED:
|
||||||
|
raise TuneError(
|
||||||
|
f"The Trainable/training function is too large for grpc resource "
|
||||||
|
f"limit. Check that its definition is not implicitly capturing a "
|
||||||
|
f"large array or other object in scope. "
|
||||||
|
f"Tip: use tune.with_parameters() to put large objects "
|
||||||
|
f"in the Ray object store. \n"
|
||||||
|
f"Original exception: {traceback.format_exc()}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise e
|
||||||
|
|
||||||
self.name = name or self._run_identifier
|
self.name = name or self._run_identifier
|
||||||
|
|
||||||
# If the name has been set explicitly, we don't want to create
|
# If the name has been set explicitly, we don't want to create
|
||||||
|
|
|
@ -15,6 +15,7 @@ from ray import tune
|
||||||
from ray._private.test_utils import recursive_fnmatch
|
from ray._private.test_utils import recursive_fnmatch
|
||||||
from ray.exceptions import RayTaskError
|
from ray.exceptions import RayTaskError
|
||||||
from ray.rllib import _register_all
|
from ray.rllib import _register_all
|
||||||
|
from ray.tune import TuneError
|
||||||
from ray.tune.callback import Callback
|
from ray.tune.callback import Callback
|
||||||
from ray.tune.suggest.basic_variant import BasicVariantGenerator
|
from ray.tune.suggest.basic_variant import BasicVariantGenerator
|
||||||
from ray.tune.suggest import Searcher
|
from ray.tune.suggest import Searcher
|
||||||
|
@ -541,6 +542,34 @@ class TrainableCrashWithFailFast(unittest.TestCase):
|
||||||
tune.run(f, fail_fast=TrialRunner.RAISE)
|
tune.run(f, fail_fast=TrialRunner.RAISE)
|
||||||
|
|
||||||
|
|
||||||
|
# For some reason, different tests are coupled through tune.registry.
|
||||||
|
# After running `ResourceExhaustedTest`, there is always a very large `training_func` left to
|
||||||
|
# be put through GCS, which causes subsequent tests to fail.
|
||||||
|
# TL;DR: make sure that this test is the last test in the file.
|
||||||
|
class ResourceExhaustedTest(unittest.TestCase):
|
||||||
|
def test_resource_exhausted_info(self):
|
||||||
|
"""This is to test if helpful information is displayed when
|
||||||
|
the objects captured in trainable/training function are too
|
||||||
|
large and the RESOURCE_EXHAUSTED error of gRPC is triggered."""
|
||||||
|
|
||||||
|
# Build a large object (several copies of the Olivetti faces dataset) to be captured implicitly in training func.
|
||||||
|
from sklearn.datasets import fetch_olivetti_faces
|
||||||
|
|
||||||
|
a_large_array = []
|
||||||
|
for i in range(10):
|
||||||
|
a_large_array.append(fetch_olivetti_faces())
|
||||||
|
|
||||||
|
def training_func(config):
|
||||||
|
for item in a_large_array:
|
||||||
|
assert item
|
||||||
|
|
||||||
|
with self.assertRaisesRegex(
|
||||||
|
TuneError,
|
||||||
|
"The Trainable/training function is too large for grpc resource limit.",
|
||||||
|
):
|
||||||
|
tune.run(training_func)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import pytest
|
import pytest
|
||||||
import sys
|
import sys
|
||||||
|
|
Loading…
Add table
Reference in a new issue