mirror of
https://github.com/vale981/ray
synced 2025-03-05 18:11:42 -05:00
improve redis connection backoff (#24168)
This commit is contained in:
parent
8fdde12e9e
commit
2c0f9d7e8f
2 changed files with 36 additions and 3 deletions
|
@ -556,7 +556,9 @@ def create_redis_client(redis_address, password=None):
|
|||
if not hasattr(create_redis_client, "instances"):
|
||||
create_redis_client.instances = {}
|
||||
|
||||
for _ in range(ray_constants.START_REDIS_WAIT_RETRIES):
|
||||
num_retries = ray_constants.START_REDIS_WAIT_RETRIES
|
||||
delay = 0.001
|
||||
for i in range(num_retries):
|
||||
cli = create_redis_client.instances.get(redis_address)
|
||||
if cli is None:
|
||||
redis_ip_address, redis_port = extract_ip_port(
|
||||
|
@ -571,7 +573,12 @@ def create_redis_client(redis_address, password=None):
|
|||
return cli
|
||||
except Exception:
|
||||
create_redis_client.instances.pop(redis_address)
|
||||
time.sleep(2)
|
||||
if i >= num_retries - 1:
|
||||
break
|
||||
# Wait a little bit.
|
||||
time.sleep(delay)
|
||||
# Make sure the retry interval doesn't increase too large.
|
||||
delay = min(1, delay * 2)
|
||||
|
||||
raise RuntimeError(f"Unable to connect to Redis at {redis_address}")
|
||||
|
||||
|
@ -863,7 +870,7 @@ def wait_for_redis_to_start(redis_ip_address, redis_port, password=None):
|
|||
time.sleep(delay)
|
||||
# Make sure the retry interval doesn't increase too large, which will
|
||||
# affect the delivery time of the Ray cluster.
|
||||
delay = 1000 if i >= 10 else delay * 2
|
||||
delay = min(1, delay * 2)
|
||||
else:
|
||||
break
|
||||
else:
|
||||
|
|
|
@ -379,6 +379,32 @@ def test_ray_init_using_hostname(ray_start_cluster):
|
|||
assert node_table[0].get("NodeManagerHostname", "") == hostname
|
||||
|
||||
|
||||
def test_redis_connect_backoff():
|
||||
from ray import ray_constants
|
||||
import time
|
||||
|
||||
unreachable_address = "127.0.0.1:65535"
|
||||
redis_ip, redis_port = unreachable_address.split(":")
|
||||
wait_retries = ray_constants.START_REDIS_WAIT_RETRIES
|
||||
ray_constants.START_REDIS_WAIT_RETRIES = 12
|
||||
try:
|
||||
start = time.time()
|
||||
with pytest.raises(RuntimeError):
|
||||
ray._private.services.wait_for_redis_to_start(redis_ip, int(redis_port))
|
||||
end = time.time()
|
||||
duration = end - start
|
||||
assert duration > 2
|
||||
|
||||
start = time.time()
|
||||
with pytest.raises(RuntimeError):
|
||||
ray._private.services.create_redis_client(redis_address=unreachable_address)
|
||||
end = time.time()
|
||||
duration = end - start
|
||||
assert duration > 2
|
||||
finally:
|
||||
ray_constants.START_REDIS_WAIT_RETRIES = wait_retries
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
|
|
Loading…
Add table
Reference in a new issue