[tune] Better error msg for grpc resource exhausted error. (#22806)

This commit is contained in:
xwjiang2010 2022-03-15 09:01:40 -07:00 committed by GitHub
parent bf1bd293f4
commit 99d5288bbd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 48 additions and 4 deletions

View file

@ -1,11 +1,12 @@
from functools import partial
from typing import Dict, Sequence, Any
import copy
from functools import partial
import grpc
import inspect
import logging
import os
from pickle import PicklingError
import traceback
from typing import Dict, Sequence, Any
from ray.tune.error import TuneError
from ray.tune.registry import register_trainable
@ -121,7 +122,21 @@ class Experiment:
"checkpointable function. You can specify checkpoints "
"within your trainable function."
)
try:
self._run_identifier = Experiment.register_if_needed(run)
except grpc.RpcError as e:
if e.code() == grpc.StatusCode.RESOURCE_EXHAUSTED:
raise TuneError(
f"The Trainable/training function is too large for grpc resource "
f"limit. Check that its definition is not implicitly capturing a "
f"large array or other object in scope. "
f"Tip: use tune.with_parameters() to put large objects "
f"in the Ray object store. \n"
f"Original exception: {traceback.format_exc()}"
)
else:
raise e
self.name = name or self._run_identifier
# If the name has been set explicitly, we don't want to create

View file

@ -15,6 +15,7 @@ from ray import tune
from ray._private.test_utils import recursive_fnmatch
from ray.exceptions import RayTaskError
from ray.rllib import _register_all
from ray.tune import TuneError
from ray.tune.callback import Callback
from ray.tune.suggest.basic_variant import BasicVariantGenerator
from ray.tune.suggest import Searcher
@ -541,6 +542,34 @@ class TrainableCrashWithFailFast(unittest.TestCase):
tune.run(f, fail_fast=TrialRunner.RAISE)
# For some reason, different tests are coupled through tune.registry.
# After `ResourceExhaustedTest` has run, there is always a very large
# `training_func` left to be put through GCS, which causes subsequent tests to fail.
# TL;DR: make sure that this test is the last test in the file.
class ResourceExhaustedTest(unittest.TestCase):
    """Checks the error surfaced when a trainable is too large for gRPC."""

    def test_resource_exhausted_info(self):
        """Expect a helpful TuneError for an oversized training function.

        The training function below implicitly captures a large list via its
        closure; running it through ``tune.run`` is expected to raise a
        ``TuneError`` whose message explains the gRPC resource limit.
        """
        # Imported locally so the dataset dependency is only needed by this test.
        from sklearn.datasets import fetch_olivetti_faces

        # Build a large payload that the training function captures implicitly.
        captured_datasets = [fetch_olivetti_faces() for _ in range(10)]

        def training_func(config):
            # Touch every captured element so the closure really references them.
            for dataset in captured_datasets:
                assert dataset

        expected_message = (
            "The Trainable/training function is too large for grpc resource limit."
        )
        with self.assertRaisesRegex(TuneError, expected_message):
            tune.run(training_func)
if __name__ == "__main__":
import pytest
import sys