mirror of
https://github.com/vale981/ray
synced 2025-03-05 18:11:42 -05:00
[core] Allow user to override global default for max_retries (#25189)
This PR allows the user to override the global default for max_retries for non-actor tasks. It adds an OS env called RAY_task_max_retries which can be passed to the driver or set with runtime envs. Any future tasks submitted by that worker will default to this value instead of 3, the hard-coded default. It would be nicer if we could have a standard way of setting these defaults, but I think this is fine as a one-off for now (not a clear need for overriding defaults of other @ray.remote options yet). Related issue number Closes #24854.
This commit is contained in:
parent
71717e59c4
commit
961b875ab8
5 changed files with 81 additions and 2 deletions
|
@ -8,7 +8,10 @@ because the process crashed or because the machine failed, Ray will rerun
|
|||
the task until either the task succeeds or the maximum number of retries is
|
||||
exceeded. The default number of retries is 3 and can be overridden by
|
||||
specifying ``max_retries`` in the ``@ray.remote`` decorator. Specifying -1
|
||||
allows infinite retries, and 0 disables retries.
|
||||
allows infinite retries, and 0 disables retries. To override the default number
|
||||
of retries for all tasks submitted, set the OS environment variable
|
||||
``RAY_TASK_MAX_RETRIES``. e.g., by passing this to your driver script or by
|
||||
using :ref:`runtime environments<runtime-environments>`.
|
||||
|
||||
You can experiment with this behavior by running the following code.
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@ from ray.util.scheduling_strategies import (
|
|||
PlacementGroupSchedulingStrategy,
|
||||
NodeAffinitySchedulingStrategy,
|
||||
)
|
||||
import ray.ray_constants as ray_constants
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@ -104,7 +105,9 @@ _task_only_options = {
|
|||
"max_calls": _counting_option("max_calls", False, default_value=0),
|
||||
# Normal tasks may be retried on failure this many times.
|
||||
# TODO(swang): Allow this to be set globally for an application.
|
||||
"max_retries": _counting_option("max_retries", default_value=3),
|
||||
"max_retries": _counting_option(
|
||||
"max_retries", default_value=ray_constants.DEFAULT_TASK_MAX_RETRIES
|
||||
),
|
||||
# override "_common_options"
|
||||
"num_cpus": _resource_option("num_cpus", default_value=1),
|
||||
"num_returns": _counting_option("num_returns", False, default_value=1),
|
||||
|
|
|
@ -366,3 +366,7 @@ KV_NAMESPACE_FUNCTION_TABLE = b"fun"
|
|||
LANGUAGE_WORKER_TYPES = ["python", "java", "cpp"]
|
||||
|
||||
NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"
|
||||
|
||||
# Default max_retries option in @ray.remote for non-actor
|
||||
# tasks.
|
||||
DEFAULT_TASK_MAX_RETRIES = 3
|
||||
|
|
|
@ -2,6 +2,7 @@ from functools import wraps
|
|||
import inspect
|
||||
import logging
|
||||
import uuid
|
||||
import os
|
||||
|
||||
from ray import cloudpickle as pickle
|
||||
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
|
||||
|
@ -217,6 +218,14 @@ class RemoteFunction:
|
|||
|
||||
# fill task required options
|
||||
for k, v in ray_option_utils.task_options.items():
|
||||
if k == "max_retries":
|
||||
# TODO(swang): We need to override max_retries here because the default
|
||||
# value gets set at Ray import time. Ideally, we should allow setting
|
||||
# default values from env vars for other options too.
|
||||
v.default_value = os.environ.get(
|
||||
"RAY_TASK_MAX_RETRIES", v.default_value
|
||||
)
|
||||
v.default_value = int(v.default_value)
|
||||
task_options[k] = task_options.get(k, v.default_value)
|
||||
# "max_calls" already takes effects and should not apply again.
|
||||
# Remove the default value here.
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import sys
|
||||
import time
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
@ -11,6 +12,7 @@ from ray._private.test_utils import (
|
|||
Semaphore,
|
||||
)
|
||||
from ray.internal.internal_api import memory_summary
|
||||
import ray.ray_constants as ray_constants
|
||||
|
||||
# Task status.
|
||||
WAITING_FOR_DEPENDENCIES = "WAITING_FOR_DEPENDENCIES"
|
||||
|
@ -420,6 +422,64 @@ def test_memory_util(ray_start_cluster):
|
|||
wait_for_condition(lambda: stats() == (0, 0, 2))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("override_max_retries", [False, True])
|
||||
def test_override_max_retries(ray_start_cluster, override_max_retries):
|
||||
cluster = ray_start_cluster
|
||||
cluster.add_node(num_cpus=1)
|
||||
max_retries = ray_constants.DEFAULT_TASK_MAX_RETRIES
|
||||
runtime_env = {}
|
||||
if override_max_retries:
|
||||
max_retries = 1
|
||||
runtime_env["env_vars"] = {"RAY_TASK_MAX_RETRIES": str(max_retries)}
|
||||
os.environ["RAY_TASK_MAX_RETRIES"] = str(max_retries)
|
||||
# Since we're setting the OS environment variable after the driver process
|
||||
# is already started, we need to set it a second time for the workers with
|
||||
# runtime_env.
|
||||
ray.init(cluster.address, runtime_env=runtime_env)
|
||||
|
||||
try:
|
||||
|
||||
@ray.remote
|
||||
class ExecutionCounter:
|
||||
def __init__(self):
|
||||
self.count = 0
|
||||
|
||||
def inc(self):
|
||||
self.count += 1
|
||||
|
||||
def pop(self):
|
||||
count = self.count
|
||||
self.count = 0
|
||||
return count
|
||||
|
||||
@ray.remote
|
||||
def f(counter):
|
||||
ray.get(counter.inc.remote())
|
||||
sys.exit(-1)
|
||||
|
||||
counter = ExecutionCounter.remote()
|
||||
with pytest.raises(ray.exceptions.WorkerCrashedError):
|
||||
ray.get(f.remote(counter))
|
||||
assert ray.get(counter.pop.remote()) == max_retries + 1
|
||||
|
||||
# Check max_retries override still works.
|
||||
with pytest.raises(ray.exceptions.WorkerCrashedError):
|
||||
ray.get(f.options(max_retries=0).remote(counter))
|
||||
assert ray.get(counter.pop.remote()) == 1
|
||||
|
||||
@ray.remote
|
||||
def nested(counter):
|
||||
ray.get(f.remote(counter))
|
||||
|
||||
# Check override works through nested tasks.
|
||||
with pytest.raises(ray.exceptions.RayTaskError):
|
||||
ray.get(nested.remote(counter))
|
||||
assert ray.get(counter.pop.remote()) == max_retries + 1
|
||||
finally:
|
||||
if override_max_retries:
|
||||
del os.environ["RAY_TASK_MAX_RETRIES"]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue