[autoscaler] Worker-Head termination + Better Scale-up message (#5909)

This commit is contained in:
Richard Liaw 2019-10-14 10:37:50 -07:00 committed by GitHub
parent abbfe7392f
commit 20c0cdee4f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 19 additions and 11 deletions

View file

@ -321,20 +321,27 @@ class AWSNodeProvider(NodeProvider):
def terminate_nodes(self, node_ids):
    """Stop or terminate the given nodes in as few EC2 calls as possible.

    When ``self.cache_stopped_nodes`` is falsy, every node is terminated
    with a single ``terminate_instances`` call.

    When it is truthy, each node is classified individually: nodes whose
    cached instance has a ``spot_instance_request_id`` are terminated
    (spot nodes cannot be stopped, only terminated), and all remaining
    nodes are stopped. The decision is made per node rather than from the
    first node only, so a batch mixing both kinds is handled correctly.

    Args:
        node_ids: IDs of the nodes to shut down; forwarded to the EC2
            client as ``InstanceIds``. An empty or ``None`` value is a
            no-op.
    """
    if not node_ids:
        return

    if not self.cache_stopped_nodes:
        self.ec2.meta.client.terminate_instances(InstanceIds=node_ids)
        return

    # Partition the batch; order within each group follows node_ids.
    spot_ids = []
    on_demand_ids = []
    for node_id in node_ids:
        if self._get_cached_node(node_id).spot_instance_request_id:
            spot_ids.append(node_id)
        else:
            on_demand_ids.append(node_id)

    # Skip the API call (and the log line) for an empty group.
    if on_demand_ids:
        logger.info(
            "AWSNodeProvider: stopping nodes {}. To terminate nodes "
            "on stop, set 'cache_stopped_nodes: False' in the "
            "provider config.".format(on_demand_ids))
        self.ec2.meta.client.stop_instances(InstanceIds=on_demand_ids)
    if spot_ids:
        logger.info(
            "AWSNodeProvider: terminating nodes {} (spot nodes cannot "
            "be stopped, only terminated)".format(spot_ids))
        self.ec2.meta.client.terminate_instances(InstanceIds=spot_ids)

View file

@ -410,7 +410,8 @@ void NodeManager::WarnResourceDeadlock() {
<< " pending actors on this node. "
<< "This is likely due to all cluster resources being claimed by actors. "
<< "To resolve the issue, consider creating fewer actors or increase the "
<< "resources available to this Ray cluster.";
<< "resources available to this Ray cluster. You can ignore this message "
<< "if this Ray cluster is expected to auto-scale.";
RAY_CHECK_OK(gcs_client_->error_table().PushErrorToDriver(
exemplar.GetTaskSpecification().JobId(), "resource_deadlock", error_message.str(),
current_time_ms()));