From 20c0cdee4f1d1e9ff0721fc405da81ada92d78f7 Mon Sep 17 00:00:00 2001
From: Richard Liaw
Date: Mon, 14 Oct 2019 10:37:50 -0700
Subject: [PATCH] [autoscaler] Worker-Head termination + Better Scale-up message (#5909)

---
 python/ray/autoscaler/aws/node_provider.py | 27 ++++++++++++++--------
 src/ray/raylet/node_manager.cc             |  3 ++-
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/python/ray/autoscaler/aws/node_provider.py b/python/ray/autoscaler/aws/node_provider.py
index c5f5438f1..3b33750b2 100644
--- a/python/ray/autoscaler/aws/node_provider.py
+++ b/python/ray/autoscaler/aws/node_provider.py
@@ -321,20 +321,27 @@ class AWSNodeProvider(NodeProvider):
     def terminate_nodes(self, node_ids):
         if not node_ids:
             return
-
-        node0 = self._get_cached_node(node_ids[0])
         if self.cache_stopped_nodes:
-            if node0.spot_instance_request_id:
-                logger.info(
-                    "AWSNodeProvider: terminating nodes {} (spot nodes cannot "
-                    "be stopped, only terminated)".format(node_ids))
-                self.ec2.meta.client.terminate_instances(InstanceIds=node_ids)
-            else:
+            spot_ids = []
+            on_demand_ids = []
+
+            for node_id in node_ids:
+                if self._get_cached_node(node_id).spot_instance_request_id:
+                    spot_ids += [node_id]
+                else:
+                    on_demand_ids += [node_id]
+
+            if on_demand_ids:
                 logger.info(
                     "AWSNodeProvider: stopping nodes {}. To terminate nodes "
                     "on stop, set 'cache_stopped_nodes: False' in the "
-                    "provider config.".format(node_ids))
-                self.ec2.meta.client.stop_instances(InstanceIds=node_ids)
+                    "provider config.".format(on_demand_ids))
+                self.ec2.meta.client.stop_instances(InstanceIds=on_demand_ids)
+            if spot_ids:
+                logger.info(
+                    "AWSNodeProvider: terminating nodes {} (spot nodes cannot "
+                    "be stopped, only terminated)".format(spot_ids))
+                self.ec2.meta.client.terminate_instances(InstanceIds=spot_ids)
         else:
             self.ec2.meta.client.terminate_instances(InstanceIds=node_ids)
 
diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc
index 85b315682..5cf4c1019 100644
--- a/src/ray/raylet/node_manager.cc
+++ b/src/ray/raylet/node_manager.cc
@@ -410,7 +410,8 @@ void NodeManager::WarnResourceDeadlock() {
       << " pending actors on this node. "
       << "This is likely due to all cluster resources being claimed by actors. "
       << "To resolve the issue, consider creating fewer actors or increase the "
-      << "resources available to this Ray cluster.";
+      << "resources available to this Ray cluster. You can ignore this message "
+      << "if this Ray cluster is expected to auto-scale.";
   RAY_CHECK_OK(gcs_client_->error_table().PushErrorToDriver(
       exemplar.GetTaskSpecification().JobId(), "resource_deadlock",
       error_message.str(), current_time_ms()));
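
Reviewer note (not part of the patch): the old code inspected only node_ids[0],
so a mixed batch headed by a spot node would terminate every node in the batch,
on-demand workers included. The patch instead partitions the batch into spot
and on-demand groups and issues one batched EC2 call per group. The stand-alone
sketch below approximates that logic for illustration only; FakeNode,
FakeEC2Client, and the `cached` dict are hypothetical stand-ins for
AWSNodeProvider's node cache and its boto3 EC2 client, not the real classes.

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("sketch")


    class FakeNode:
        """Stand-in for a cached EC2 node; spot nodes carry a request id."""

        def __init__(self, spot_instance_request_id=None):
            self.spot_instance_request_id = spot_instance_request_id


    class FakeEC2Client:
        """Stand-in for self.ec2.meta.client; just logs the batched calls."""

        def stop_instances(self, InstanceIds):
            logger.info("stop_instances(%s)", InstanceIds)

        def terminate_instances(self, InstanceIds):
            logger.info("terminate_instances(%s)", InstanceIds)


    def terminate_nodes(client, cached, node_ids, cache_stopped_nodes=True):
        """Partition node_ids into spot vs. on-demand and issue one
        batched EC2 call per group, mirroring the patched logic."""
        if not node_ids:
            return
        if cache_stopped_nodes:
            spot_ids = [n for n in node_ids
                        if cached[n].spot_instance_request_id]
            on_demand_ids = [n for n in node_ids
                             if not cached[n].spot_instance_request_id]
            if on_demand_ids:
                # On-demand nodes are stopped so they can be reused later.
                client.stop_instances(InstanceIds=on_demand_ids)
            if spot_ids:
                # Spot nodes cannot be stopped, only terminated.
                client.terminate_instances(InstanceIds=spot_ids)
        else:
            client.terminate_instances(InstanceIds=node_ids)


    cached = {"i-1": FakeNode("sir-123"), "i-2": FakeNode(), "i-3": FakeNode()}
    terminate_nodes(FakeEC2Client(), cached, ["i-1", "i-2", "i-3"])
    # -> stop_instances(['i-2', 'i-3']) then terminate_instances(['i-1'])

Partitioning this way keeps the efficiency of batched EC2 API calls while
ensuring each node's spot/on-demand status, not the first node's, decides
whether it is stopped or terminated.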