Mirror of https://github.com/vale981/ray (synced 2025-03-06 02:21:39 -05:00)
[client] Fix ray client object ref releasing in wrong context. (#22025)
This commit is contained in: parent 54fe2f80bb, commit 588d540b68.
5 changed files with 52 additions and 22 deletions.
@@ -93,6 +93,7 @@ cdef class ObjectRef(BaseID):

 cdef class ClientObjectRef(ObjectRef):
     cdef object _mutex
     cdef object _id_future
+    cdef object _client_worker_ref

     cdef _set_id(self, id)
     cdef inline _wait_for_id(self, timeout=None)
@@ -107,6 +108,7 @@ cdef class ActorID(BaseID):

 cdef class ClientActorRef(ActorID):
     cdef object _mutex
     cdef object _id_future
+    cdef object _client_worker_ref

     cdef _set_id(self, id)
     cdef inline _wait_for_id(self, timeout=None)
@@ -5,6 +5,7 @@ import concurrent.futures
 import functools
 import logging
 import threading
+import weakref
 from typing import Callable, Any, Union

 import ray
@@ -154,6 +155,9 @@ cdef class ClientObjectRef(ObjectRef):

     def __init__(self, id: Union[bytes, concurrent.futures.Future]):
         self.in_core_worker = False
         self._mutex = threading.Lock()
+        # client worker might be cleaned up before __dealloc__ is called.
+        # so use a weakref to check whether it's alive or not.
+        self._client_worker_ref = weakref.ref(client.ray.get_context().client_worker)
         if isinstance(id, bytes):
             self._set_id(id)
         elif isinstance(id, concurrent.futures.Future):
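What this hunk adds is the central mechanism of the fix: at construction time the ref captures a weak reference to the client worker that created it, instead of resolving the worker through the global client.ray later. Below is a minimal Python sketch of that pattern, using hypothetical Worker and ClientRef classes rather than Ray's actual Cython types, and __del__ in place of __dealloc__.

import weakref


class Worker:
    """Stand-in for Ray's client worker (hypothetical)."""

    def __init__(self):
        self.connected = True

    def is_connected(self):
        return self.connected

    def call_release(self, ref_id):
        print(f"release {ref_id!r} on worker {id(self):#x}")


class ClientRef:
    """Stand-in for ClientObjectRef: remembers which worker created it."""

    def __init__(self, ref_id, worker):
        self.ref_id = ref_id
        # Weak reference: the ref must not keep the worker alive, it only
        # needs to know whether that worker still exists at destruction time.
        self._worker_ref = weakref.ref(worker)

    def __del__(self):
        worker = self._worker_ref()  # None if the worker was collected
        if worker is not None and worker.is_connected():
            worker.call_release(self.ref_id)


w1 = Worker()
ref1 = ClientRef(b"obj-1", w1)
del w1          # simulate the original session going away
del ref1        # no release is sent: the weakref is now dead

w2 = Worker()
ref2 = ClientRef(b"obj-2", w2)
del ref2        # worker still alive and connected: release goes to w2

The weak reference is deliberate: a stale ref should not keep an already-disconnected worker alive; it only needs to find out, when it is destroyed, whether the worker that owns its ID still exists.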
@@ -162,14 +166,8 @@ cdef class ClientObjectRef(ObjectRef):
             raise TypeError("Unexpected type for id {}".format(id))

     def __dealloc__(self):
-        if client is None or client.ray is None:
-            # Similar issue as mentioned in ObjectRef.__dealloc__ above. The
-            # client package or client.ray object might be set
-            # to None when the script exits. Should be safe to skip
-            # call_release in this case, since the client should have already
-            # disconnected at this point.
-            return
-        if client.ray.is_connected():
+        client_worker = self._client_worker_ref()
+        if client_worker is not None and client_worker.is_connected():
             try:
                 self._wait_for_id()
                 # cython would suppress this exception as well, but it tries to
@@ -182,7 +180,7 @@ cdef class ClientObjectRef(ObjectRef):
                     "a method on the actor reference before its destructor "
                     "is run.")
             if not self.data.IsNil():
-                client.ray.call_release(self.id)
+                client_worker.call_release(self.id)

     cdef CObjectID native(self):
         self._wait_for_id()
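The switch from client.ray.call_release(self.id) to client_worker.call_release(self.id) is what routes the release to the session that actually owns the ID. Each client worker keeps its own reference counts (the regression test below inspects client_worker.reference_count), so sending the release through whatever worker is currently installed globally can decrement counts in a session that never created the object. A toy illustration of that failure mode, with hypothetical classes rather than Ray's implementation:

from collections import defaultdict


class Worker:
    """Hypothetical worker with its own per-object reference counts."""

    def __init__(self, name):
        self.name = name
        self.reference_count = defaultdict(int)

    def call_retain(self, ref_id):
        self.reference_count[ref_id] += 1

    def call_release(self, ref_id):
        self.reference_count[ref_id] -= 1


old_session = Worker("old")
new_session = Worker("new")

old_session.call_retain(b"obj-1")   # the ref was created in the old session

# Buggy behaviour: release goes through the "current" global worker.
current = new_session
current.call_release(b"obj-1")
print(dict(new_session.reference_count))  # {b'obj-1': -1}: wrong session touched

# Fixed behaviour: release goes through the worker captured at creation time.
creator = old_session
creator.call_release(b"obj-1")
print(dict(old_session.reference_count))  # {b'obj-1': 0}: balanced in the right session

The new session's counter going negative is exactly the situation the test added below guards against with its all(v > 0 ...) check.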
@@ -251,13 +249,16 @@ cdef class ClientObjectRef(ObjectRef):
                     data = loads_from_server(resp.get.data)

             py_callback(data)

-        client.ray._register_callback(self, deserialize_obj)
+        client_worker = self._client_worker_ref()
+        assert client_worker is not None
+        client_worker.register_callback(self, deserialize_obj)

     cdef _set_id(self, id):
         check_id(id)
         self.data = CObjectID.FromBinary(<c_string>id)
-        client.ray.call_retain(id)
+        client_worker = self._client_worker_ref()
+        assert client_worker is not None
+        client_worker.call_retain(id)

     cdef inline _wait_for_id(self, timeout=None):
         if self._id_future:
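The retain and callback paths get the same treatment as the release path: call_retain and register_callback are resolved through the weakref captured in __init__ instead of the global client.ray. The assert client_worker is not None lines encode the expectation that a ref can only be constructed, retained, or given a completion callback while the worker that owns it is still alive; only at destruction time can the worker already be gone, which is why __dealloc__ checks the weakref instead of asserting on it.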
@@ -325,6 +325,9 @@ cdef class ClientActorRef(ActorID):

     def __init__(self, id: Union[bytes, concurrent.futures.Future]):
         self._mutex = threading.Lock()
+        # client worker might be cleaned up before __dealloc__ is called.
+        # so use a weakref to check whether it's alive or not.
+        self._client_worker_ref = weakref.ref(client.ray.get_context().client_worker)
         if isinstance(id, bytes):
             self._set_id(id)
         elif isinstance(id, Future):
@@ -333,13 +336,8 @@ cdef class ClientActorRef(ActorID):
             raise TypeError("Unexpected type for id {}".format(id))

     def __dealloc__(self):
-        if client is None or client.ray is None:
-            # The client package or client.ray object might be set
-            # to None when the script exits. Should be safe to skip
-            # call_release in this case, since the client should have already
-            # disconnected at this point.
-            return
-        if client.ray.is_connected():
+        client_worker = self._client_worker_ref()
+        if client_worker is not None and client_worker.is_connected():
             try:
                 self._wait_for_id()
                 # cython would suppress this exception as well, but it tries to
@@ -352,7 +350,7 @@ cdef class ClientActorRef(ActorID):
                     "a method on the actor reference before its destructor "
                     "is run.")
             if not self.data.IsNil():
-                client.ray.call_release(self.id)
+                client_worker.call_release(self.id)

     def binary(self):
         self._wait_for_id()
@@ -381,7 +379,9 @@ cdef class ClientActorRef(ActorID):
     cdef _set_id(self, id):
         check_id(id, CActorID.Size())
         self.data = CActorID.FromBinary(<c_string>id)
-        client.ray.call_retain(id)
+        client_worker = self._client_worker_ref()
+        assert client_worker is not None
+        client_worker.call_retain(id)

     cdef _wait_for_id(self, timeout=None):
         if self._id_future:
@@ -390,7 +390,6 @@ cdef class ClientActorRef(ActorID):
             self._set_id(self._id_future.result(timeout=timeout))
             self._id_future = None
-

 cdef class FunctionID(UniqueID):

     def __init__(self, id):
@@ -758,5 +758,31 @@ def test_init_requires_no_resources(call_ray_start, use_client):
         ray.get(f.remote())


+@pytest.mark.parametrize(
+    "call_ray_start",
+    ["ray start --head --ray-client-server-port 25553 --num-cpus 1"],
+    indirect=True,
+)
+def test_object_ref_release(call_ray_start):
+    """This is to test the release of an object in previous session is
+    handled correctly.
+    """
+    import ray
+
+    ray.init("ray://localhost:25553")
+
+    a = ray.put("Hello")
+
+    ray.shutdown()
+    ray.init("ray://localhost:25553")
+    # a is release in the session which doesn't create it.
+    del a
+
+    with disable_client_hook():
+        # Make sure a doesn't generate a release request.
+        ref_cnt = ray.util.client.ray.get_context().client_worker.reference_count
+        assert all(v > 0 for v in ref_cnt.values())
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main(["-v", __file__]))
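The new test reproduces the wrong-context scenario end to end: an object ref a is created in one client session, that session is shut down, a second session is opened against the same client server port (25553), and only then is a deleted. With the fix, deleting a must not send a release request through the new session's worker, so the test checks, with the client hook disabled, that every entry in the new worker's reference_count is still positive.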
@@ -138,8 +138,10 @@ class _ClientContext:

     def disconnect(self):
         """Disconnect the Ray Client."""
         if self.client_worker is not None:
             self.client_worker.close()
+        self.api.worker = None
         self.client_worker = None

     # remote can be called outside of a connection, which is why it
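For the weakref-based fix to work, disconnect() has to close the worker and drop the context's strong references to it, so that a stale ClientObjectRef from the old session sees either a dead weakref or a worker that reports it is no longer connected. A small sketch of that interaction, with a hypothetical Context class standing in for _ClientContext:

import weakref


class Worker:
    """Hypothetical stand-in for the client worker."""

    def __init__(self):
        self._connected = True

    def close(self):
        self._connected = False

    def is_connected(self):
        return self._connected


class Context:
    """Hypothetical stand-in for _ClientContext."""

    def __init__(self):
        self.client_worker = Worker()

    def disconnect(self):
        if self.client_worker is not None:
            self.client_worker.close()
        self.client_worker = None  # drop the strong reference


ctx = Context()
stale = weakref.ref(ctx.client_worker)   # what an old ClientObjectRef holds
ctx.disconnect()

worker = stale()
# Either the worker has already been collected, or it reports that it is
# disconnected; in both cases the old ref skips call_release.
assert worker is None or not worker.is_connected()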