[autoscaler] Worker-Head termination + Better Scale-up message (#5909)
This commit is contained in:
parent abbfe7392f
commit 20c0cdee4f
2 changed files with 19 additions and 11 deletions
@@ -321,20 +321,27 @@ class AWSNodeProvider(NodeProvider):
     def terminate_nodes(self, node_ids):
         if not node_ids:
             return
 
-        node0 = self._get_cached_node(node_ids[0])
         if self.cache_stopped_nodes:
-            if node0.spot_instance_request_id:
-                logger.info(
-                    "AWSNodeProvider: terminating nodes {} (spot nodes cannot "
-                    "be stopped, only terminated)".format(node_ids))
-                self.ec2.meta.client.terminate_instances(InstanceIds=node_ids)
-            else:
+            spot_ids = []
+            on_demand_ids = []
+
+            for node_id in node_ids:
+                if self._get_cached_node(node_id).spot_instance_request_id:
+                    spot_ids += [node_id]
+                else:
+                    on_demand_ids += [node_id]
+
+            if on_demand_ids:
                 logger.info(
                     "AWSNodeProvider: stopping nodes {}. To terminate nodes "
                     "on stop, set 'cache_stopped_nodes: False' in the "
-                    "provider config.".format(node_ids))
-                self.ec2.meta.client.stop_instances(InstanceIds=node_ids)
+                    "provider config.".format(on_demand_ids))
+                self.ec2.meta.client.stop_instances(InstanceIds=on_demand_ids)
+            if spot_ids:
+                logger.info(
+                    "AWSNodeProvider: terminating nodes {} (spot nodes cannot "
+                    "be stopped, only terminated)".format(spot_ids))
+                self.ec2.meta.client.terminate_instances(InstanceIds=spot_ids)
         else:
             self.ec2.meta.client.terminate_instances(InstanceIds=node_ids)
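For context, the old code decided how to handle the whole batch by inspecting only node_ids[0], so a mixed batch of spot and on-demand instances (for example, a head node billed differently from its workers) was stopped or terminated incorrectly. The patch classifies each node individually. Below is a minimal, self-contained sketch of that grouping behavior; FakeNode, split_spot_and_on_demand, and the example instance ids are hypothetical names used only for illustration and are not part of the Ray codebase.

# Minimal sketch (not Ray code) of the per-node grouping introduced by this patch.

class FakeNode:
    """Hypothetical stand-in for a cached EC2 instance object."""

    def __init__(self, spot_instance_request_id=None):
        self.spot_instance_request_id = spot_instance_request_id


def split_spot_and_on_demand(node_ids, get_cached_node):
    """Partition node ids the way the patched terminate_nodes does."""
    spot_ids, on_demand_ids = [], []
    for node_id in node_ids:
        if get_cached_node(node_id).spot_instance_request_id:
            spot_ids.append(node_id)
        else:
            on_demand_ids.append(node_id)
    return spot_ids, on_demand_ids


if __name__ == "__main__":
    cache = {
        "i-head": FakeNode(),                      # on-demand head node
        "i-worker-1": FakeNode("sir-0123456789"),  # spot worker
        "i-worker-2": FakeNode(),                  # on-demand worker
    }
    spot, on_demand = split_spot_and_on_demand(
        ["i-head", "i-worker-1", "i-worker-2"], cache.__getitem__)
    print("terminate:", spot)   # ['i-worker-1']  (spot: can only be terminated)
    print("stop:", on_demand)   # ['i-head', 'i-worker-2']  (stopped when cached)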
@@ -410,7 +410,8 @@ void NodeManager::WarnResourceDeadlock() {
                 << " pending actors on this node. "
                 << "This is likely due to all cluster resources being claimed by actors. "
                 << "To resolve the issue, consider creating fewer actors or increase the "
-                << "resources available to this Ray cluster.";
+                << "resources available to this Ray cluster. You can ignore this message "
+                << "if this Ray cluster is expected to auto-scale.";
   RAY_CHECK_OK(gcs_client_->error_table().PushErrorToDriver(
       exemplar.GetTaskSpecification().JobId(), "resource_deadlock", error_message.str(),
       current_time_ms()));
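As a hedged illustration (not part of the patch), the kind of workload this warning targets is one that requests more actor resources than the cluster currently has, so some actors stay pending. On a fixed-size cluster that usually means the user should create fewer actors or add resources; on an auto-scaling cluster the pending actors are expected to be placed once new nodes join, which is why the message now says it can be ignored in that case. The numbers below are arbitrary example values.

# Sketch of a workload shape that leaves actors pending on a small cluster.
import ray

ray.init(num_cpus=2)  # small local "cluster" for illustration


@ray.remote(num_cpus=1)
class Worker:
    def ping(self):
        return "ok"


# Four actors requesting one CPU each on a two-CPU cluster: two of them
# remain pending until more resources become available.
workers = [Worker.remote() for _ in range(4)]
print(ray.get(workers[0].ping.remote()))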