mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00
[autoscaler] Worker-Head termination + Better Scale-up message (#5909)
This commit is contained in:
parent
abbfe7392f
commit
20c0cdee4f
2 changed files with 19 additions and 11 deletions
|
@ -321,20 +321,27 @@ class AWSNodeProvider(NodeProvider):
|
||||||
def terminate_nodes(self, node_ids):
|
def terminate_nodes(self, node_ids):
|
||||||
if not node_ids:
|
if not node_ids:
|
||||||
return
|
return
|
||||||
|
|
||||||
node0 = self._get_cached_node(node_ids[0])
|
|
||||||
if self.cache_stopped_nodes:
|
if self.cache_stopped_nodes:
|
||||||
if node0.spot_instance_request_id:
|
spot_ids = []
|
||||||
logger.info(
|
on_demand_ids = []
|
||||||
"AWSNodeProvider: terminating nodes {} (spot nodes cannot "
|
|
||||||
"be stopped, only terminated)".format(node_ids))
|
for node_id in node_ids:
|
||||||
self.ec2.meta.client.terminate_instances(InstanceIds=node_ids)
|
if self._get_cached_node(node_id).spot_instance_request_id:
|
||||||
else:
|
spot_ids += [node_id]
|
||||||
|
else:
|
||||||
|
on_demand_ids += [node_id]
|
||||||
|
|
||||||
|
if on_demand_ids:
|
||||||
logger.info(
|
logger.info(
|
||||||
"AWSNodeProvider: stopping nodes {}. To terminate nodes "
|
"AWSNodeProvider: stopping nodes {}. To terminate nodes "
|
||||||
"on stop, set 'cache_stopped_nodes: False' in the "
|
"on stop, set 'cache_stopped_nodes: False' in the "
|
||||||
"provider config.".format(node_ids))
|
"provider config.".format(on_demand_ids))
|
||||||
self.ec2.meta.client.stop_instances(InstanceIds=node_ids)
|
self.ec2.meta.client.stop_instances(InstanceIds=on_demand_ids)
|
||||||
|
if spot_ids:
|
||||||
|
logger.info(
|
||||||
|
"AWSNodeProvider: terminating nodes {} (spot nodes cannot "
|
||||||
|
"be stopped, only terminated)".format(spot_ids))
|
||||||
|
self.ec2.meta.client.terminate_instances(InstanceIds=spot_ids)
|
||||||
else:
|
else:
|
||||||
self.ec2.meta.client.terminate_instances(InstanceIds=node_ids)
|
self.ec2.meta.client.terminate_instances(InstanceIds=node_ids)
|
||||||
|
|
||||||
|
|
|
@ -410,7 +410,8 @@ void NodeManager::WarnResourceDeadlock() {
|
||||||
<< " pending actors on this node. "
|
<< " pending actors on this node. "
|
||||||
<< "This is likely due to all cluster resources being claimed by actors. "
|
<< "This is likely due to all cluster resources being claimed by actors. "
|
||||||
<< "To resolve the issue, consider creating fewer actors or increase the "
|
<< "To resolve the issue, consider creating fewer actors or increase the "
|
||||||
<< "resources available to this Ray cluster.";
|
<< "resources available to this Ray cluster. You can ignore this message "
|
||||||
|
<< "if this Ray cluster is expected to auto-scale.";
|
||||||
RAY_CHECK_OK(gcs_client_->error_table().PushErrorToDriver(
|
RAY_CHECK_OK(gcs_client_->error_table().PushErrorToDriver(
|
||||||
exemplar.GetTaskSpecification().JobId(), "resource_deadlock", error_message.str(),
|
exemplar.GetTaskSpecification().JobId(), "resource_deadlock", error_message.str(),
|
||||||
current_time_ms()));
|
current_time_ms()));
|
||||||
|
|
Loading…
Add table
Reference in a new issue