improve redis connection backoff (#24168)

This commit is contained in:
ZhuSenlin 2022-04-29 14:36:13 +08:00 committed by GitHub
parent 8fdde12e9e
commit 2c0f9d7e8f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 36 additions and 3 deletions

View file

@ -556,7 +556,9 @@ def create_redis_client(redis_address, password=None):
if not hasattr(create_redis_client, "instances"):
create_redis_client.instances = {}
for _ in range(ray_constants.START_REDIS_WAIT_RETRIES):
num_retries = ray_constants.START_REDIS_WAIT_RETRIES
delay = 0.001
for i in range(num_retries):
cli = create_redis_client.instances.get(redis_address)
if cli is None:
redis_ip_address, redis_port = extract_ip_port(
@ -571,7 +573,12 @@ def create_redis_client(redis_address, password=None):
return cli
except Exception:
create_redis_client.instances.pop(redis_address)
time.sleep(2)
if i >= num_retries - 1:
break
# Wait a little bit.
time.sleep(delay)
# Make sure the retry interval doesn't increase too large.
delay = min(1, delay * 2)
raise RuntimeError(f"Unable to connect to Redis at {redis_address}")
@ -863,7 +870,7 @@ def wait_for_redis_to_start(redis_ip_address, redis_port, password=None):
time.sleep(delay)
# Make sure the retry interval doesn't increase too large, which will
# affect the delivery time of the Ray cluster.
delay = 1000 if i >= 10 else delay * 2
delay = min(1, delay * 2)
else:
break
else:

View file

@ -379,6 +379,32 @@ def test_ray_init_using_hostname(ray_start_cluster):
assert node_table[0].get("NodeManagerHostname", "") == hostname
def test_redis_connect_backoff():
from ray import ray_constants
import time
unreachable_address = "127.0.0.1:65535"
redis_ip, redis_port = unreachable_address.split(":")
wait_retries = ray_constants.START_REDIS_WAIT_RETRIES
ray_constants.START_REDIS_WAIT_RETRIES = 12
try:
start = time.time()
with pytest.raises(RuntimeError):
ray._private.services.wait_for_redis_to_start(redis_ip, int(redis_port))
end = time.time()
duration = end - start
assert duration > 2
start = time.time()
with pytest.raises(RuntimeError):
ray._private.services.create_redis_client(redis_address=unreachable_address)
end = time.time()
duration = end - start
assert duration > 2
finally:
ray_constants.START_REDIS_WAIT_RETRIES = wait_retries
if __name__ == "__main__":
import pytest
import sys