Spread even if nodes are not available (#23445)

Several changes to make spread scheduling work better under load:

* When no nodes are available, spread among feasible nodes.
* If grant_or_reject is true, don't spill back when the selected node is not available.
* For spread tasks, don't spill back because the task is waiting for dependencies.
2025-03-06 10:31:39 -05:00 · 2022-04-20 07:35:15 -07:00 · 2022-04-20 07:35:15 -07:00 · 6cfec51d1e
commit 6cfec51d1e
parent 261a8a7470
9 changed files with 110 additions and 47 deletions
--- a/src/ray/common/task/task_spec.cc
+++ b/src/ray/common/task/task_spec.cc
@ -319,6 +319,11 @@ bool TaskSpecification::IsActorTask() const {
  return message_->type() == TaskType::ACTOR_TASK;
 }

+bool TaskSpecification::IsSpreadSchedulingStrategy() const {
+  return message_->scheduling_strategy().scheduling_strategy_case() ==
+         rpc::SchedulingStrategy::SchedulingStrategyCase::kSpreadSchedulingStrategy;
+}
+
 // === Below are getter methods specific to actor creation tasks.

 ActorID TaskSpecification::ActorCreationId() const {
--- a/src/ray/common/task/task_spec.h
+++ b/src/ray/common/task/task_spec.h
@ -360,6 +360,8 @@ class TaskSpecification : public MessageWrapper<rpc::TaskSpec> {

  bool ExecuteOutOfOrder() const;

+  bool IsSpreadSchedulingStrategy() const;
+
 private:
  void ComputeResources();

--- a/src/ray/raylet/local_task_manager.cc
+++ b/src/ray/raylet/local_task_manager.cc
@ -315,19 +315,30 @@ void LocalTaskManager::SpillWaitingTasks() {
    // pulled).  If this is true, then we should force the task onto a remote
    // feasible node, even if we have enough resources available locally for
    // placement.
-    bool force_spillback = task_dependency_manager_.TaskDependenciesBlocked(task_id);
+    bool task_dependencies_blocked =
+        task_dependency_manager_.TaskDependenciesBlocked(task_id);
    RAY_LOG(DEBUG) << "Attempting to spill back waiting task " << task_id
-                   << " to remote node. Force spillback? " << force_spillback;
+                   << " to remote node. Dependencies blocked? "
+                   << task_dependencies_blocked;
    bool is_infeasible;
    // TODO(swang): The policy currently does not account for the amount of
    // object store memory availability. Ideally, we should pick the node with
    // the most memory availability.
-    auto scheduling_node_id = cluster_resource_scheduler_->GetBestSchedulableNode(
-        (*it)->task.GetTaskSpecification(),
-        /*prioritize_local_node*/ true,
-        /*exclude_local_node*/ force_spillback,
-        /*requires_object_store_memory*/ true,
-        &is_infeasible);
+    scheduling::NodeID scheduling_node_id;
+    if (!task.GetTaskSpecification().IsSpreadSchedulingStrategy()) {
+      scheduling_node_id = cluster_resource_scheduler_->GetBestSchedulableNode(
+          task.GetTaskSpecification(),
+          /*prioritize_local_node*/ true,
+          /*exclude_local_node*/ task_dependencies_blocked,
+          /*requires_object_store_memory*/ true,
+          &is_infeasible);
+    } else {
+      // If scheduling strategy is spread, we prefer honoring the spread decision
+      // and waiting for task dependencies to be pulled locally over spilling
+      // back and causing uneven spread.
+      scheduling_node_id = scheduling::NodeID(self_node_id_.Binary());
+    }
+
    if (!scheduling_node_id.IsNil() &&
        scheduling_node_id.Binary() != self_node_id_.Binary()) {
      NodeID node_id = NodeID::FromBinary(scheduling_node_id.Binary());
--- a/src/ray/raylet/scheduling/cluster_resource_scheduler.cc
+++ b/src/ray/raylet/scheduling/cluster_resource_scheduler.cc
@ -210,9 +210,11 @@ bool ClusterResourceScheduler::AllocateRemoteTaskResources(
 }

 bool ClusterResourceScheduler::IsSchedulableOnNode(
-    scheduling::NodeID node_id, const absl::flat_hash_map<std::string, double> &shape) {
+    scheduling::NodeID node_id,
+    const absl::flat_hash_map<std::string, double> &shape,
+    bool requires_object_store_memory) {
  auto resource_request =
-      ResourceMapToResourceRequest(shape, /*requires_object_store_memory=*/false);
+      ResourceMapToResourceRequest(shape, requires_object_store_memory);
  return IsSchedulable(resource_request, node_id);
 }

@ -226,21 +228,34 @@ scheduling::NodeID ClusterResourceScheduler::GetBestSchedulableNode(
  // going through the full hybrid policy since we don't want spillback.
  if (prioritize_local_node && !exclude_local_node &&
      IsSchedulableOnNode(local_node_id_,
-                          task_spec.GetRequiredResources().GetResourceMap())) {
+                          task_spec.GetRequiredResources().GetResourceMap(),
+                          requires_object_store_memory)) {
    *is_infeasible = false;
    return local_node_id_;
  }

  // This argument is used to set violation, which is an unsupported feature now.
  int64_t _unused;
-  return GetBestSchedulableNode(
-      task_spec.GetRequiredPlacementResources().GetResourceMap(),
-      task_spec.GetMessage().scheduling_strategy(),
-      requires_object_store_memory,
-      task_spec.IsActorCreationTask(),
-      exclude_local_node,
-      &_unused,
-      is_infeasible);
+  scheduling::NodeID best_node =
+      GetBestSchedulableNode(task_spec.GetRequiredPlacementResources().GetResourceMap(),
+                             task_spec.GetMessage().scheduling_strategy(),
+                             requires_object_store_memory,
+                             task_spec.IsActorCreationTask(),
+                             exclude_local_node,
+                             &_unused,
+                             is_infeasible);
+
+  // If there are no other available nodes, prefer waiting on the local node
+  // since the local node is chosen for a reason (e.g. spread).
+  if (prioritize_local_node && !best_node.IsNil() &&
+      !IsSchedulableOnNode(best_node,
+                           task_spec.GetRequiredResources().GetResourceMap(),
+                           requires_object_store_memory)) {
+    *is_infeasible = false;
+    return local_node_id_;
+  }
+
+  return best_node;
 }

 SchedulingResult ClusterResourceScheduler::Schedule(
--- a/src/ray/raylet/scheduling/cluster_resource_scheduler.h
+++ b/src/ray/raylet/scheduling/cluster_resource_scheduler.h
@ -114,7 +114,8 @@ class ClusterResourceScheduler {
  /// \param node_name Name of the node.
  /// \param shape The resource demand's shape.
  bool IsSchedulableOnNode(scheduling::NodeID node_id,
-                           const absl::flat_hash_map<std::string, double> &shape);
+                           const absl::flat_hash_map<std::string, double> &shape,
+                           bool requires_object_store_memory);

  LocalResourceManager &GetLocalResourceManager() { return *local_resource_manager_; }
  ClusterResourceManager &GetClusterResourceManager() {
--- a/src/ray/raylet/scheduling/cluster_task_manager_test.cc
+++ b/src/ray/raylet/scheduling/cluster_task_manager_test.cc
@ -1646,6 +1646,12 @@ TEST_F(ClusterTaskManagerTest, TestSpillWaitingTasks) {
      auto missing_arg = task.GetTaskSpecification().GetDependencyIds()[0];
      missing_objects_.insert(missing_arg);
    }
+    if (i == 0) {
+      const_cast<TaskSpecification &>(task.GetTaskSpecification())
+          .GetMutableMessage()
+          .mutable_scheduling_strategy()
+          ->mutable_spread_scheduling_strategy();
+    }
    task_manager_.QueueAndScheduleTask(task, false, false, replies[i].get(), callback);
    pool_.TriggerCallbacks();
  }
@ -1696,6 +1702,12 @@ TEST_F(ClusterTaskManagerTest, TestSpillWaitingTasks) {
  // One task dispatched.
  ASSERT_EQ(replies[4]->worker_address().port(), 1234);

+  // Spread task won't be spilled due to waiting for dependencies.
+  AddNode(remote_node_id, 8);
+  task_manager_.ScheduleAndDispatchTasks();
+  ASSERT_EQ(num_callbacks, 4);
+  ASSERT_EQ(replies[0]->retry_at_raylet_address().raylet_id(), "");
+
  RayTask finished_task;
  local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task);
  leased_workers_.clear();
--- a/src/ray/raylet/scheduling/policy/scheduling_policy_test.cc
+++ b/src/ray/raylet/scheduling/policy/scheduling_policy_test.cc
@ -119,9 +119,9 @@ TEST_F(SchedulingPolicyTest, NodeAffinityPolicyTest) {
 TEST_F(SchedulingPolicyTest, SpreadPolicyTest) {
  ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}}, false);

-  nodes.emplace(local_node, CreateNodeResources(20, 20, 0, 0, 0, 0));
+  nodes.emplace(local_node, CreateNodeResources(20, 20, 0, 0, 0, 1));
  // Unavailable node
-  nodes.emplace(remote_node, CreateNodeResources(0, 20, 0, 0, 0, 0));
+  nodes.emplace(remote_node, CreateNodeResources(0, 20, 0, 0, 0, 1));
  // Infeasible node
  nodes.emplace(remote_node_2, CreateNodeResources(0, 0, 0, 0, 0, 0));
  nodes.emplace(remote_node_3, CreateNodeResources(20, 20, 0, 0, 0, 0));
@ -137,8 +137,20 @@ TEST_F(SchedulingPolicyTest, SpreadPolicyTest) {
  ASSERT_EQ(to_schedule, remote_node_3);

  to_schedule = scheduling_policy.Schedule(
-      req, SchedulingOptions::Spread(/*force_spillback=*/true, false));
+      req, SchedulingOptions::Spread(/*avoid_local_node=*/true, false));
  ASSERT_EQ(to_schedule, remote_node_3);
+
+  // Spread across feasible nodes if there are no available nodes
+  req = ResourceMapToResourceRequest({{"GPU", 1}}, false);
+  to_schedule = scheduling_policy.Schedule(req, SchedulingOptions::Spread(false, false));
+  ASSERT_EQ(to_schedule, local_node);
+
+  to_schedule = scheduling_policy.Schedule(req, SchedulingOptions::Spread(false, false));
+  ASSERT_EQ(to_schedule, remote_node);
+
+  to_schedule = scheduling_policy.Schedule(
+      req, SchedulingOptions::Spread(false, /*require_node_available=*/true));
+  ASSERT_TRUE(to_schedule.IsNil());
 }

 TEST_F(SchedulingPolicyTest, RandomPolicyTest) {
--- a/src/ray/raylet/scheduling/policy/spread_scheduling_policy.cc
+++ b/src/ray/raylet/scheduling/policy/spread_scheduling_policy.cc
@ -35,24 +35,34 @@ scheduling::NodeID SpreadSchedulingPolicy::Schedule(
  }
  std::sort(round.begin(), round.end());

-  size_t round_index = spread_scheduling_next_index_;
-  for (size_t i = 0; i < round.size(); ++i, ++round_index) {
-    const auto &node_id = round[round_index % round.size()];
-    const auto &node = map_find_or_die(nodes_, node_id);
-    if (node_id == local_node_id_ && options.avoid_local_node) {
-      continue;
-    }
-    if (!is_node_available_(node_id) ||
-        !node.GetLocalView().IsFeasible(resource_request) ||
-        !node.GetLocalView().IsAvailable(resource_request, true)) {
-      continue;
-    }
+  // Spread among available nodes first.
+  // If there are no available nodes, we spread among feasible nodes.
+  for (bool available_nodes_only :
+       (options.require_node_available ? std::vector<bool>{true}
+                                       : std::vector<bool>{true, false})) {
+    size_t round_index = spread_scheduling_next_index_;
+    for (size_t i = 0; i < round.size(); ++i, ++round_index) {
+      const auto &node_id = round[round_index % round.size()];
+      const auto &node = map_find_or_die(nodes_, node_id);
+      if (node_id == local_node_id_ && options.avoid_local_node) {
+        continue;
+      }
+      if (!is_node_alive_(node_id) || !node.GetLocalView().IsFeasible(resource_request)) {
+        continue;
+      }

-    spread_scheduling_next_index_ = ((round_index + 1) % round.size());
-    return node_id;
+      if (available_nodes_only &&
+          !node.GetLocalView().IsAvailable(resource_request,
+                                           /*ignore_pull_manager_at_capacity=*/false)) {
+        continue;
+      }
+
+      spread_scheduling_next_index_ = ((round_index + 1) % round.size());
+      return node_id;
+    }
  }
-  options.scheduling_type = SchedulingType::HYBRID;
-  return hybrid_policy_.Schedule(resource_request, options);
+
+  return scheduling::NodeID::Nil();
 }

 }  // namespace raylet_scheduling_policy
--- a/src/ray/raylet/scheduling/policy/spread_scheduling_policy.h
+++ b/src/ray/raylet/scheduling/policy/spread_scheduling_policy.h
@ -28,11 +28,8 @@ class SpreadSchedulingPolicy : public ISchedulingPolicy {
 public:
  SpreadSchedulingPolicy(scheduling::NodeID local_node_id,
                         const absl::flat_hash_map<scheduling::NodeID, Node> &nodes,
-                         std::function<bool(scheduling::NodeID)> is_node_available)
-      : local_node_id_(local_node_id),
-        nodes_(nodes),
-        is_node_available_(is_node_available),
-        hybrid_policy_(local_node_id_, nodes_, is_node_available_) {}
+                         std::function<bool(scheduling::NodeID)> is_node_alive)
+      : local_node_id_(local_node_id), nodes_(nodes), is_node_alive_(is_node_alive) {}

  scheduling::NodeID Schedule(const ResourceRequest &resource_request,
                              SchedulingOptions options) override;
@ -47,9 +44,7 @@ class SpreadSchedulingPolicy : public ISchedulingPolicy {
  // but it should still be better than always scanning from 0 for spread scheduling.
  size_t spread_scheduling_next_index_ = 0;
  /// Function Checks if node is alive.
-  std::function<bool(scheduling::NodeID)> is_node_available_;
-  /// Instance of hybrid policy;
-  HybridSchedulingPolicy hybrid_policy_;
+  std::function<bool(scheduling::NodeID)> is_node_alive_;
 };
 }  // namespace raylet_scheduling_policy
 }  // namespace ray