mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00
[tune] Better error msg for grpc resource exhausted error. (#22806)
This commit is contained in:
parent
bf1bd293f4
commit
99d5288bbd
2 changed files with 48 additions and 4 deletions
|
@@ -1,11 +1,12 @@
|
|||
from functools import partial
|
||||
from typing import Dict, Sequence, Any
|
||||
import copy
|
||||
from functools import partial
|
||||
import grpc
|
||||
import inspect
|
||||
import logging
|
||||
import os
|
||||
|
||||
from pickle import PicklingError
|
||||
import traceback
|
||||
from typing import Dict, Sequence, Any
|
||||
|
||||
from ray.tune.error import TuneError
|
||||
from ray.tune.registry import register_trainable
|
||||
|
@@ -121,7 +122,21 @@ class Experiment:
|
|||
"checkpointable function. You can specify checkpoints "
|
||||
"within your trainable function."
|
||||
)
|
||||
try:
|
||||
self._run_identifier = Experiment.register_if_needed(run)
|
||||
except grpc.RpcError as e:
|
||||
if e.code() == grpc.StatusCode.RESOURCE_EXHAUSTED:
|
||||
raise TuneError(
|
||||
f"The Trainable/training function is too large for grpc resource "
|
||||
f"limit. Check that its definition is not implicitly capturing a "
|
||||
f"large array or other object in scope. "
|
||||
f"Tip: use tune.with_parameters() to put large objects "
|
||||
f"in the Ray object store. \n"
|
||||
f"Original exception: {traceback.format_exc()}"
|
||||
)
|
||||
else:
|
||||
raise e
|
||||
|
||||
self.name = name or self._run_identifier
|
||||
|
||||
# If the name has been set explicitly, we don't want to create
|
||||
|
|
|
@@ -15,6 +15,7 @@ from ray import tune
|
|||
from ray._private.test_utils import recursive_fnmatch
|
||||
from ray.exceptions import RayTaskError
|
||||
from ray.rllib import _register_all
|
||||
from ray.tune import TuneError
|
||||
from ray.tune.callback import Callback
|
||||
from ray.tune.suggest.basic_variant import BasicVariantGenerator
|
||||
from ray.tune.suggest import Searcher
|
||||
|
@@ -541,6 +542,34 @@ class TrainableCrashWithFailFast(unittest.TestCase):
|
|||
tune.run(f, fail_fast=TrialRunner.RAISE)
|
||||
|
||||
|
||||
# NOTE: tests in this file are coupled through tune.registry — after this
# test runs, a very large `training_func` lingers in the registry and gets
# pushed through GCS, which breaks any tests that follow.
# Keep this class as the last test in the file.
class ResourceExhaustedTest(unittest.TestCase):
    def test_resource_exhausted_info(self):
        """Check that a helpful TuneError is surfaced when the objects
        captured in the trainable/training function are too large and
        gRPC raises its RESOURCES_EXHAUSTED error."""

        # Build a large payload that the training function captures
        # implicitly via its closure.
        from sklearn.datasets import fetch_olivetti_faces

        a_large_array = [fetch_olivetti_faces() for _ in range(10)]

        def training_func(config):
            for item in a_large_array:
                assert item

        expected_msg = (
            "The Trainable/training function is too large for grpc resource limit."
        )
        with self.assertRaisesRegex(TuneError, expected_msg):
            tune.run(training_func)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
|
|
Loading…
Add table
Reference in a new issue