[serve] Handle None in ReplicaConfig's resource_dict (#23851)

This change sets `"memory"`'s default to `0` in the `resource_dict` but keeps the default as `None` in `ray_actor_options`. It adds logic to both problematic lines to handle `None` in case of future settings updates. It also adds unit tests to prevent regressions.
This commit is contained in:
shrekris-anyscale 2022-04-18 10:34:20 -07:00 committed by GitHub
parent a940e643db
commit fab33fa730
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 51 additions and 8 deletions

View file

@ -252,7 +252,7 @@ py_test(
py_test(
name = "test_standalone2",
size = "medium",
size = "large",
srcs = serve_tests_srcs,
tags = ["exclusive", "team:serve"],
deps = [":serve_lib"],

View file

@ -297,7 +297,7 @@ class ReplicaConfig:
# TODO(suquark): reuse options validation of remote function/actor.
# Ray defaults to zero CPUs for placement, we default to one here.
if self.ray_actor_options.get("num_cpus", None) is None:
if self.ray_actor_options.get("num_cpus") is None:
self.ray_actor_options["num_cpus"] = 1
num_cpus = self.ray_actor_options["num_cpus"]
if not isinstance(num_cpus, (int, float)):
@ -306,7 +306,7 @@ class ReplicaConfig:
raise ValueError("num_cpus in ray_actor_options must be >= 0.")
self.resource_dict["CPU"] = num_cpus
if self.ray_actor_options.get("num_gpus", None) is None:
if self.ray_actor_options.get("num_gpus") is None:
self.ray_actor_options["num_gpus"] = 0
num_gpus = self.ray_actor_options["num_gpus"]
if not isinstance(num_gpus, (int, float)):
@ -315,6 +315,7 @@ class ReplicaConfig:
raise ValueError("num_gpus in ray_actor_options must be >= 0.")
self.resource_dict["GPU"] = num_gpus
# Serve deployments use Ray's default for actor memory.
self.ray_actor_options.setdefault("memory", None)
memory = self.ray_actor_options["memory"]
if memory is not None and not isinstance(memory, (int, float)):
@ -335,7 +336,7 @@ class ReplicaConfig:
raise ValueError("object_store_memory in ray_actor_options must be >= 0.")
self.resource_dict["object_store_memory"] = object_store_memory
if self.ray_actor_options.get("resources", None) is None:
if self.ray_actor_options.get("resources") is None:
self.ray_actor_options["resources"] = {}
custom_resources = self.ray_actor_options["resources"]
if not isinstance(custom_resources, dict):

View file

@ -267,9 +267,11 @@ class ActorReplicaWrapper:
)
self._actor_resources = deployment_info.replica_config.resource_dict
# it is currently not possiible to create a placement group
# it is currently not possible to create a placement group
# with no resources (https://github.com/ray-project/ray/issues/20401)
has_resources_assigned = all((r > 0 for r in self._actor_resources.values()))
has_resources_assigned = all(
(r is not None and r > 0 for r in self._actor_resources.values())
)
if USE_PLACEMENT_GROUP and has_resources_assigned:
self._placement_group = self.create_placement_group(
self._placement_group_name, self._actor_resources
@ -752,7 +754,11 @@ class DeploymentReplica(VersionedReplica):
if self._actor.actor_resources is None:
return "UNKNOWN", "UNKNOWN"
required = {k: v for k, v in self._actor.actor_resources.items() if v > 0}
required = {
k: v
for k, v in self._actor.actor_resources.items()
if v is not None and v > 0
}
available = {
k: v for k, v in self._actor.available_resources.items() if k in required
}

View file

@ -184,7 +184,12 @@ def test_replica_config_validation():
@pytest.mark.parametrize(
"memory_omitted_options",
[None, {}, {"CPU": 1, "GPU": 3}, {"CPU": 1, "GPU": 3, "memory": None}],
[
None,
{},
{"num_cpus": 1, "num_gpus": 3},
{"num_cpus": 1, "num_gpus": 3, "memory": None},
],
)
def test_replica_config_default_memory_none(memory_omitted_options):
"""Checks that ReplicaConfig's default memory is None."""

View file

@ -21,6 +21,7 @@ from ray.serve.deployment_state import (
DeploymentState,
DeploymentStateManager,
DeploymentVersion,
DeploymentReplica,
ReplicaStartupStatus,
ReplicaState,
ReplicaStateContainer,
@ -2086,5 +2087,21 @@ def test_stopping_replicas_ranking():
compare([2, 2, 3, 3], [2, 2, 3, 3]) # if equal, ordering should be kept
def test_resource_requirements_none(mock_deployment_state):
"""Ensure resource_requirements doesn't break if a requirement is None"""
class FakeActor:
actor_resources = {"num_cpus": 2, "fake": None}
available_resources = {}
# Make a DeploymentReplica just to accesss its resource_requirement function
replica = DeploymentReplica(None, None, None, None, None)
replica._actor = FakeActor()
# resource_requirements() should not error
replica.resource_requirements()
if __name__ == "__main__":
sys.exit(pytest.main(["-v", "-s", __file__]))

View file

@ -52,6 +52,20 @@ def test_standalone_actor_outside_serve():
ray.shutdown()
def test_memory_omitted_option(ray_shutdown):
"""Ensure that omitting memory doesn't break the deployment."""
@serve.deployment(ray_actor_options={"num_cpus": 1, "num_gpus": 1})
def hello(*args, **kwargs):
return "world"
ray.init(num_gpus=3, namespace="serve")
serve.start()
hello.deploy()
assert ray.get(hello.get_handle().remote()) == "world"
@pytest.mark.parametrize("detached", [True, False])
def test_override_namespace(shutdown_ray, detached):
"""Test the _override_controller_namespace flag in serve.start()."""