Revert "[GCS] refactor the resource related data structures on the GCS (#22817)" (#22863)

This reverts commit 549466a42f.
Author: SangBin Cho (committed by GitHub)
Date: 2022-03-08 01:48:17 +09:00
Commit: 79e8405fda (parent: 15d97a1021)
17 changed files with 228 additions and 503 deletions
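The refactor being reverted had moved GCS resource bookkeeping off the map-based ResourceSet/SchedulingResources classes (scheduling_resources.h) onto the raylet scheduler's ResourceRequest/NodeResources structures (cluster_resource_data.h), which split resources into a dense predefined-resource array plus an integer-keyed custom-resource map. Most hunks below simply swap one shape back for the other. For orientation, a simplified sketch of the two layouts; this is illustrative only, with plain doubles standing in for Ray's FixedPoint amounts and all methods elided:

// Illustrative stand-ins for the two data layouts this revert swaps between.
// Field names mirror the diff; everything else is an assumption for brevity.
#include <array>
#include <cstdint>
#include <string>
#include <unordered_map>

// Restored by the revert: one flat name -> amount map.
struct ResourceSetSketch {
  std::unordered_map<std::string, double> resources;  // e.g. {"CPU": 4, "GPU": 1}
};

// Removed by the revert: dense slots for the predefined resources plus a map
// keyed by integer resource ids for custom resources.
struct ResourceRequestSketch {
  std::array<double, 4> predefined_resources{};          // indexed by resource id
  std::unordered_map<int64_t, double> custom_resources;  // custom id -> amount
};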


@@ -674,7 +674,6 @@ def test_task_arguments_inline_bytes_limit(ray_start_cluster):

 # This case tests whether gcs-based actor scheduler works properly with
 # a normal task co-existed.
-@pytest.mark.skip(reason="The resource update of normal task has been broken.")
 def test_schedule_actor_and_normal_task(ray_start_cluster):
     cluster = ray_start_cluster
     cluster.add_node(
@@ -721,7 +720,6 @@ def test_schedule_actor_and_normal_task(ray_start_cluster):

 # This case tests whether gcs-based actor scheduler works properly
 # in a large scale.
-@pytest.mark.skip(reason="The resource update of normal task has been broken.")
 def test_schedule_many_actors_and_normal_tasks(ray_start_cluster):
     cluster = ray_start_cluster
@@ -765,7 +763,6 @@ def test_schedule_many_actors_and_normal_tasks(ray_start_cluster):

 # This case tests whether gcs-based actor scheduler distributes actors
 # in a balanced way. By default, it uses the `SPREAD` strategy of
 # gcs resource scheduler.
-@pytest.mark.skip(reason="The resource update of normal task has been broken.")
 @pytest.mark.parametrize("args", [[5, 20], [5, 3]])
 def test_actor_distribution_balance(ray_start_cluster, args):
     cluster = ray_start_cluster
@@ -806,7 +803,6 @@ def test_actor_distribution_balance(ray_start_cluster, args):

 # This case tests whether RequestWorkerLeaseReply carries normal task resources
 # when the request is rejected (due to resource preemption by normal tasks).
-@pytest.mark.skip(reason="The resource update of normal task has been broken.")
 def test_worker_lease_reply_with_resources(ray_start_cluster):
     cluster = ray_start_cluster
     cluster.add_node(


@@ -21,12 +21,9 @@ void BundleSpecification::ComputeResources() {
   if (unit_resource.empty()) {
     // A static nil object is used here to avoid allocating the empty object every time.
-    static std::shared_ptr<ResourceRequest> nil_unit_resource =
-        std::make_shared<ResourceRequest>();
-    unit_resource_ = nil_unit_resource;
+    unit_resource_ = ResourceSet::Nil();
   } else {
-    unit_resource_ = std::make_shared<ResourceRequest>(ResourceMapToResourceRequest(
-        unit_resource, /*requires_object_store_memory=*/false));
+    unit_resource_.reset(new ResourceSet(unit_resource));
   }

   // Generate placement group bundle labels.
@@ -36,40 +33,18 @@ void BundleSpecification::ComputeBundleResourceLabels() {
 void BundleSpecification::ComputeBundleResourceLabels() {
   RAY_CHECK(unit_resource_);

-  for (size_t i = 0; i < unit_resource_->predefined_resources.size(); ++i) {
-    auto resource_name = scheduling::ResourceID(i).Binary();
-    const auto &resource_value = unit_resource_->predefined_resources[i];
-    if (resource_value <= 0.) {
-      continue;
-    }
-    /// With bundle index (e.g., CPU_group_i_zzz).
-    const std::string &resource_label =
-        FormatPlacementGroupResource(resource_name, PlacementGroupId(), Index());
-    bundle_resource_labels_[resource_label] = resource_value.Double();
-    /// Without bundle index (e.g., CPU_group_zzz).
-    const std::string &wildcard_label =
-        FormatPlacementGroupResource(resource_name, PlacementGroupId(), -1);
-    bundle_resource_labels_[wildcard_label] = resource_value.Double();
-  }
-
-  for (const auto &resource_pair : unit_resource_->custom_resources) {
-    auto resource_name = scheduling::ResourceID(resource_pair.first).Binary();
-    const auto &resource_value = resource_pair.second;
-    if (resource_value <= 0.) {
-      continue;
-    }
-    /// With bundle index (e.g., CPU_group_i_zzz).
-    const std::string &resource_label =
-        FormatPlacementGroupResource(resource_name, PlacementGroupId(), Index());
-    bundle_resource_labels_[resource_label] = resource_value.Double();
-    /// Without bundle index (e.g., CPU_group_zzz).
-    const std::string &wildcard_label =
-        FormatPlacementGroupResource(resource_name, PlacementGroupId(), -1);
-    bundle_resource_labels_[wildcard_label] = resource_value.Double();
+  for (const auto &resource_pair : unit_resource_->GetResourceMap()) {
+    double resource_value = resource_pair.second;
+    /// With bundle index (e.g., CPU_group_i_zzz).
+    const std::string &resource_label =
+        FormatPlacementGroupResource(resource_pair.first, PlacementGroupId(), Index());
+    bundle_resource_labels_[resource_label] = resource_value;
+    /// Without bundle index (e.g., CPU_group_zzz).
+    const std::string &wildcard_label =
+        FormatPlacementGroupResource(resource_pair.first, PlacementGroupId(), -1);
+    bundle_resource_labels_[wildcard_label] = resource_value;
   }

   auto bundle_label =
       FormatPlacementGroupResource(kBundle_ResourceLabel, PlacementGroupId(), -1);
@@ -79,7 +54,7 @@ void BundleSpecification::ComputeBundleResourceLabels() {
       1000;
 }

-const ResourceRequest &BundleSpecification::GetRequiredResources() const {
+const ResourceSet &BundleSpecification::GetRequiredResources() const {
   return *unit_resource_;
 }

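The restored loop above publishes every bundle resource twice: under an index-qualified label such as CPU_group_0_<pg_id>, and under a wildcard label such as CPU_group_<pg_id>, so work can target either one specific bundle or any bundle of the placement group. A self-contained sketch of that labeling scheme; FormatLabel and the sample ids here are hypothetical stand-ins, not Ray's actual FormatPlacementGroupResource format:

// Emits the two label variants per resource that the diff above restores.
#include <iostream>
#include <map>
#include <string>

std::string FormatLabel(const std::string &resource, const std::string &pg_id,
                        int bundle_index) {
  // With an index: CPU_group_0_<pg_id>; without: CPU_group_<pg_id>.
  if (bundle_index >= 0) {
    return resource + "_group_" + std::to_string(bundle_index) + "_" + pg_id;
  }
  return resource + "_group_" + pg_id;
}

int main() {
  std::map<std::string, double> unit_resource{{"CPU", 4}, {"GPU", 1}};
  std::map<std::string, double> labels;
  for (const auto &[name, value] : unit_resource) {
    labels[FormatLabel(name, "zzz", /*bundle_index=*/0)] = value;   // indexed
    labels[FormatLabel(name, "zzz", /*bundle_index=*/-1)] = value;  // wildcard
  }
  for (const auto &[label, value] : labels) {
    std::cout << label << " = " << value << "\n";
  }
}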

@@ -65,7 +65,7 @@ class BundleSpecification : public MessageWrapper<rpc::Bundle> {
   /// Return the resources that are to be acquired by this bundle.
   ///
   /// \return The resources that will be acquired by this bundle.
-  const ResourceRequest &GetRequiredResources() const;
+  const ResourceSet &GetRequiredResources() const;

   /// Get all placement group bundle resource labels.
   const absl::flat_hash_map<std::string, double> &GetFormattedResources() const {
@@ -81,7 +81,7 @@ class BundleSpecification : public MessageWrapper<rpc::Bundle> {
   /// Field storing unit resources. Initialized in constructor.
   /// TODO(ekl) consider optimizing the representation of ResourceSet for fast copies
   /// instead of keeping shared pointers here.
-  std::shared_ptr<ResourceRequest> unit_resource_;
+  std::shared_ptr<ResourceSet> unit_resource_;
   /// When a bundle is assigned on a node, we'll add the following special resources on
   /// that node:


@@ -20,13 +20,14 @@ namespace ray {
 namespace gcs {

-GcsActorWorkerAssignment::GcsActorWorkerAssignment(
-    const NodeID &node_id, const ResourceRequest &acquired_resources, bool is_shared)
+GcsActorWorkerAssignment::GcsActorWorkerAssignment(const NodeID &node_id,
+                                                   const ResourceSet &acquired_resources,
+                                                   bool is_shared)
     : node_id_(node_id), acquired_resources_(acquired_resources), is_shared_(is_shared) {}

 const NodeID &GcsActorWorkerAssignment::GetNodeID() const { return node_id_; }

-const ResourceRequest &GcsActorWorkerAssignment::GetResources() const {
+const ResourceSet &GcsActorWorkerAssignment::GetResources() const {
   return acquired_resources_;
 }
@@ -66,9 +67,7 @@ std::unique_ptr<GcsActorWorkerAssignment>
 GcsBasedActorScheduler::SelectOrAllocateActorWorkerAssignment(
     std::shared_ptr<GcsActor> actor, bool need_sole_actor_worker_assignment) {
   const auto &task_spec = actor->GetCreationTaskSpecification();
-  auto required_resources = ResourceMapToResourceRequest(
-      task_spec.GetRequiredPlacementResources().GetResourceMap(),
-      /*requires_object_store_memory=*/false);
+  auto required_resources = task_spec.GetRequiredPlacementResources();

   // If the task needs a sole actor worker assignment then allocate a new one.
   return AllocateNewActorWorkerAssignment(required_resources, /*is_shared=*/false,
@@ -79,7 +78,7 @@ GcsBasedActorScheduler::SelectOrAllocateActorWorkerAssignment(

 std::unique_ptr<GcsActorWorkerAssignment>
 GcsBasedActorScheduler::AllocateNewActorWorkerAssignment(
-    const ResourceRequest &required_resources, bool is_shared,
+    const ResourceSet &required_resources, bool is_shared,
     const TaskSpecification &task_spec) {
   // Allocate resources from cluster.
   auto selected_node_id = AllocateResources(required_resources);
@@ -95,8 +94,7 @@ GcsBasedActorScheduler::AllocateNewActorWorkerAssignment(
   return gcs_actor_worker_assignment;
 }

-NodeID GcsBasedActorScheduler::AllocateResources(
-    const ResourceRequest &required_resources) {
+NodeID GcsBasedActorScheduler::AllocateResources(const ResourceSet &required_resources) {
   auto selected_nodes =
       gcs_resource_scheduler_->Schedule({required_resources}, SchedulingType::SPREAD)
           .second;
@@ -120,7 +118,7 @@ NodeID GcsBasedActorScheduler::AllocateResources(
 }

 NodeID GcsBasedActorScheduler::GetHighestScoreNodeResource(
-    const ResourceRequest &required_resources) const {
+    const ResourceSet &required_resources) const {
   const auto &cluster_map = gcs_resource_manager_->GetClusterResources();

   /// Get the highest score node
@@ -129,8 +127,7 @@ NodeID GcsBasedActorScheduler::GetHighestScoreNodeResource(
   double highest_score = std::numeric_limits<double>::lowest();
   auto highest_score_node = NodeID::Nil();
   for (const auto &pair : cluster_map) {
-    double least_resource_val =
-        scorer.Score(required_resources, pair.second->GetLocalView());
+    double least_resource_val = scorer.Score(required_resources, *pair.second);
     if (least_resource_val > highest_score) {
       highest_score = least_resource_val;
       highest_score_node = pair.first;
@@ -141,12 +138,12 @@ NodeID GcsBasedActorScheduler::GetHighestScoreNodeResource(
 }

 void GcsBasedActorScheduler::WarnResourceAllocationFailure(
-    const TaskSpecification &task_spec, const ResourceRequest &required_resources) const {
+    const TaskSpecification &task_spec, const ResourceSet &required_resources) const {
   auto scheduling_node_id = GetHighestScoreNodeResource(required_resources);
-  const NodeResources *scheduling_resource = nullptr;
+  const SchedulingResources *scheduling_resource = nullptr;
   auto iter = gcs_resource_manager_->GetClusterResources().find(scheduling_node_id);
   if (iter != gcs_resource_manager_->GetClusterResources().end()) {
-    scheduling_resource = iter->second->GetMutableLocalView();
+    scheduling_resource = iter->second.get();
   }
   std::string scheduling_resource_str =
       scheduling_resource ? scheduling_resource->DebugString() : "None";
@@ -154,7 +151,7 @@ void GcsBasedActorScheduler::WarnResourceAllocationFailure(
   RAY_LOG(WARNING) << "No enough resources for creating actor "
                    << task_spec.ActorCreationId()
                    << "\nActor class: " << task_spec.FunctionDescriptor()->ToString()
-                   << "\nRequired resources: " << required_resources.DebugString()
+                   << "\nRequired resources: " << required_resources.ToString()
                    << "\nThe node with the most resources is:"
                    << "\n Node id: " << scheduling_node_id
                    << "\n Node resources: " << scheduling_resource_str;

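GetHighestScoreNodeResource above scans the whole cluster and keeps the node with the highest score, where LeastResourceScorer (see the gcs_resource_scheduler.cc hunks further down) awards (available - requested) / available per resource and returns -1 for an infeasible node. A toy, self-contained re-creation of that rule, with plain doubles and std::map standing in for Ray's FixedPoint and ResourceSet:

// Per resource: score = (available - requested) / available, or -1 if the node
// lacks the resource; a node's score is the sum over all requested resources.
#include <iostream>
#include <map>
#include <string>

using ResourceMap = std::map<std::string, double>;

double Score(const ResourceMap &required, const ResourceMap &available) {
  double node_score = 0.0;
  for (const auto &[name, requested] : required) {
    auto it = available.find(name);
    if (it == available.end() || requested > it->second) {
      return -1.0;  // Missing resource, or not enough of it: infeasible.
    }
    node_score += (it->second - requested) / it->second;
  }
  return node_score;
}

int main() {
  ResourceMap required{{"CPU", 2}};
  ResourceMap node_a{{"CPU", 8}};              // (8 - 2) / 8 = 0.75
  ResourceMap node_b{{"CPU", 2}, {"GPU", 1}};  // (2 - 2) / 2 = 0.0
  std::cout << "node_a: " << Score(required, node_a) << "\n";
  std::cout << "node_b: " << Score(required, node_b) << "\n";
  // Picking the maximum favors node_a, the emptier node, which is what the
  // SPREAD-style selection above relies on.
}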

@@ -18,6 +18,7 @@
 #include "ray/common/id.h"
 #include "ray/common/status.h"
+#include "ray/common/task/scheduling_resources.h"
 #include "ray/common/task/task_spec.h"
 #include "ray/gcs/gcs_server/gcs_actor_manager.h"
 #include "ray/gcs/gcs_server/gcs_actor_scheduler.h"
@@ -41,12 +42,12 @@ class GcsActorWorkerAssignment
   /// \param node_id ID of node on which this gcs actor worker assignment is allocated.
   /// \param acquired_resources Resources owned by this gcs actor worker assignment.
   /// \param is_shared A flag to represent that whether the worker process can be shared.
-  GcsActorWorkerAssignment(const NodeID &node_id,
-                           const ResourceRequest &acquired_resources, bool is_shared);
+  GcsActorWorkerAssignment(const NodeID &node_id, const ResourceSet &acquired_resources,
+                           bool is_shared);

   const NodeID &GetNodeID() const;

-  const ResourceRequest &GetResources() const;
+  const ResourceSet &GetResources() const;

   bool IsShared() const;
@@ -54,7 +55,7 @@ class GcsActorWorkerAssignment
   /// ID of node on which this actor worker assignment is allocated.
   const NodeID node_id_;
   /// Resources owned by this actor worker assignment.
-  const ResourceRequest acquired_resources_;
+  const ResourceSet acquired_resources_;
   /// A flag to represent that whether the worker process can be shared.
   const bool is_shared_;
 };
@@ -130,19 +131,19 @@ class GcsBasedActorScheduler : public GcsActorScheduler {
   /// \param is_shared If the worker is shared by multiple actors or not.
   /// \param task_spec The specification of the task.
   std::unique_ptr<GcsActorWorkerAssignment> AllocateNewActorWorkerAssignment(
-      const ResourceRequest &required_resources, bool is_shared,
+      const ResourceSet &required_resources, bool is_shared,
       const TaskSpecification &task_spec);

   /// Allocate resources for the actor.
   ///
   /// \param required_resources The resources to be allocated.
   /// \return ID of the node from which the resources are allocated.
-  NodeID AllocateResources(const ResourceRequest &required_resources);
+  NodeID AllocateResources(const ResourceSet &required_resources);

-  NodeID GetHighestScoreNodeResource(const ResourceRequest &required_resources) const;
+  NodeID GetHighestScoreNodeResource(const ResourceSet &required_resources) const;

   void WarnResourceAllocationFailure(const TaskSpecification &task_spec,
-                                     const ResourceRequest &required_resources) const;
+                                     const ResourceSet &required_resources) const;

   /// A rejected reply means resources were preempted by normal tasks. Then
   /// update the cluster resource view and reschedule immediately.


@@ -63,9 +63,9 @@ GcsPlacementGroupScheduler::GcsPlacementGroupScheduler(
   scheduler_strategies_.push_back(std::make_shared<GcsStrictSpreadStrategy>());
 }

-std::vector<ResourceRequest> GcsScheduleStrategy::GetRequiredResourcesFromBundles(
+std::vector<ResourceSet> GcsScheduleStrategy::GetRequiredResourcesFromBundles(
     const std::vector<std::shared_ptr<const ray::BundleSpecification>> &bundles) {
-  std::vector<ResourceRequest> required_resources;
+  std::vector<ResourceSet> required_resources;
   for (const auto &bundle : bundles) {
     required_resources.push_back(bundle->GetRequiredResources());
   }


@@ -128,7 +128,7 @@ class GcsScheduleStrategy {
   ///
   /// \param bundles Bundles to be scheduled.
   /// \return Required resources.
-  std::vector<ResourceRequest> GetRequiredResourcesFromBundles(
+  std::vector<ResourceSet> GetRequiredResourcesFromBundles(
       const std::vector<std::shared_ptr<const ray::BundleSpecification>> &bundles);

   /// Generate `ScheduleResult` from bundles and nodes.


@@ -36,23 +36,11 @@ void GcsResourceManager::HandleGetResources(const rpc::GetResourcesRequest &request,
   NodeID node_id = NodeID::FromBinary(request.node_id());
   auto iter = cluster_scheduling_resources_.find(node_id);
   if (iter != cluster_scheduling_resources_.end()) {
+    const auto &resource_map = iter->second->GetTotalResources().GetResourceMap();
     rpc::ResourceTableData resource_table_data;
-    const auto &node_resources = iter->second->GetLocalView();
-    for (size_t i = 0; i < node_resources.predefined_resources.size(); ++i) {
-      const auto &resource_value = node_resources.predefined_resources[i].total;
-      if (resource_value <= 0) {
-        continue;
-      }
-      const auto &resource_name = scheduling::ResourceID(i).Binary();
-      resource_table_data.set_resource_capacity(resource_value.Double());
-      (*reply->mutable_resources()).insert({resource_name, resource_table_data});
-    }
-    for (const auto &entry : node_resources.custom_resources) {
-      const auto &resource_name = scheduling::ResourceID(entry.first).Binary();
-      const auto &resource_value = entry.second.total;
-      resource_table_data.set_resource_capacity(resource_value.Double());
-      (*reply->mutable_resources()).insert({resource_name, resource_table_data});
+    for (const auto &resource : resource_map) {
+      resource_table_data.set_resource_capacity(resource.second);
+      (*reply->mutable_resources())[resource.first] = resource_table_data;
     }
   }
   GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK());
@@ -72,28 +60,18 @@ void GcsResourceManager::HandleUpdateResources(
   auto iter = cluster_scheduling_resources_.find(node_id);
   if (iter != cluster_scheduling_resources_.end()) {
     // Update `cluster_scheduling_resources_`.
-    auto node_resources = iter->second->GetMutableLocalView();
+    SchedulingResources &scheduling_resources = *iter->second;
     for (const auto &entry : *changed_resources) {
-      UpdateResourceCapacity(node_resources, entry.first, entry.second);
+      scheduling_resources.UpdateResourceCapacity(entry.first, entry.second);
     }

     // Update gcs storage.
     rpc::ResourceMap resource_map;
-    for (size_t i = 0; i < node_resources->predefined_resources.size(); ++i) {
-      const auto &resource_value = node_resources->predefined_resources[i].total;
-      if (resource_value <= 0) {
-        continue;
-      }
-      const auto &resource_name = scheduling::ResourceID(i).Binary();
-      (*resource_map.mutable_items())[resource_name].set_resource_capacity(
-          resource_value.Double());
+    for (const auto &entry : scheduling_resources.GetTotalResources().GetResourceMap()) {
+      (*resource_map.mutable_items())[entry.first].set_resource_capacity(entry.second);
     }
-    for (const auto &entry : node_resources->custom_resources) {
-      const auto &resource_name = scheduling::ResourceID(entry.first).Binary();
-      const auto &resource_value = entry.second.total;
-      (*resource_map.mutable_items())[resource_name].set_resource_capacity(
-          resource_value.Double());
+    for (const auto &entry : *changed_resources) {
+      (*resource_map.mutable_items())[entry.first].set_resource_capacity(entry.second);
     }

     auto start = absl::GetCurrentTimeNanos();
@@ -131,41 +109,19 @@ void GcsResourceManager::HandleDeleteResources(
   auto resource_names = VectorFromProtobuf(request.resource_name_list());
   auto iter = cluster_scheduling_resources_.find(node_id);
   if (iter != cluster_scheduling_resources_.end()) {
-    auto node_resources = iter->second->GetMutableLocalView();
     // Update `cluster_scheduling_resources_`.
-    DeleteResources(node_resources, resource_names);
+    for (const auto &resource_name : resource_names) {
+      iter->second->DeleteResource(resource_name);
+    }

     // Update gcs storage.
     rpc::ResourceMap resource_map;
-    for (size_t i = 0; i < node_resources->predefined_resources.size(); ++i) {
-      const auto &resource_name = scheduling::ResourceID(i).Binary();
-      if (std::find(resource_names.begin(), resource_names.end(), resource_name) !=
-          resource_names.end()) {
-        continue;
-      }
-      const auto &resource_value = node_resources->predefined_resources[i].total;
-      if (resource_value <= 0) {
-        continue;
-      }
-      (*resource_map.mutable_items())[resource_name].set_resource_capacity(
-          resource_value.Double());
+    auto resources = iter->second->GetTotalResources().GetResourceMap();
+    for (const auto &resource_name : resource_names) {
+      resources.erase(resource_name);
     }
-    for (const auto &entry : node_resources->custom_resources) {
-      const auto &resource_name = scheduling::ResourceID(entry.first).Binary();
-      if (std::find(resource_names.begin(), resource_names.end(), resource_name) !=
-          resource_names.end()) {
-        continue;
-      }
-      const auto &resource_value = entry.second.total;
-      if (resource_value <= 0) {
-        continue;
-      }
-      (*resource_map.mutable_items())[resource_name].set_resource_capacity(
-          resource_value.Double());
+    for (const auto &entry : resources) {
+      (*resource_map.mutable_items())[entry.first].set_resource_capacity(entry.second);
     }

     auto on_done = [this, node_id, resource_names, reply,
@@ -193,31 +149,11 @@ void GcsResourceManager::HandleGetAllAvailableResources(
     const rpc::GetAllAvailableResourcesRequest &request,
     rpc::GetAllAvailableResourcesReply *reply,
     rpc::SendReplyCallback send_reply_callback) {
-  for (const auto &node_resources_entry : cluster_scheduling_resources_) {
-    const auto &node_id = node_resources_entry.first;
-    const auto &node_resources = node_resources_entry.second->GetLocalView();
+  for (const auto &iter : cluster_scheduling_resources_) {
     rpc::AvailableResources resource;
-    resource.set_node_id(node_id.Binary());
-
-    for (size_t i = 0; i < node_resources.predefined_resources.size(); ++i) {
-      const auto &resource_value = node_resources.predefined_resources[i].available;
-      if (resource_value <= 0) {
-        continue;
-      }
-      const auto &resource_name = scheduling::ResourceID(i).Binary();
-      resource.mutable_resources_available()->insert(
-          {resource_name, resource_value.Double()});
-    }
-    for (const auto &entry : node_resources.custom_resources) {
-      const auto &resource_value = entry.second.available;
-      if (resource_value <= 0) {
-        continue;
-      }
-      const auto &resource_name = scheduling::ResourceID(entry.first).Binary();
-      resource.mutable_resources_available()->insert(
-          {resource_name, resource_value.Double()});
+    resource.set_node_id(iter.first.Binary());
+    for (const auto &res : iter.second->GetAvailableResources().GetResourceAmountMap()) {
+      (*resource.mutable_resources_available())[res.first] = res.second.Double();
     }
     reply->add_resources_list()->CopyFrom(resource);
   }
@@ -231,7 +167,8 @@ void GcsResourceManager::UpdateFromResourceReport(const rpc::ResourcesData &data) {
     UpdateNodeNormalTaskResources(node_id, data);
   } else {
     if (node_resource_usages_.count(node_id) == 0 || data.resources_available_changed()) {
-      SetAvailableResources(node_id, MapFromProtobuf(data.resources_available()));
+      const auto &resource_changed = MapFromProtobuf(data.resources_available());
+      SetAvailableResources(node_id, ResourceSet(resource_changed));
     }
   }
@@ -258,13 +195,13 @@ void GcsResourceManager::HandleGetAllResourceUsage(
     rpc::SendReplyCallback send_reply_callback) {
   if (!node_resource_usages_.empty()) {
     auto batch = std::make_shared<rpc::ResourceUsageBatchData>();
-    std::unordered_map<google::protobuf::Map<std::string, double>, rpc::ResourceDemand>
-        aggregate_load;
+    absl::flat_hash_map<ResourceSet, rpc::ResourceDemand> aggregate_load;
     for (const auto &usage : node_resource_usages_) {
       // Aggregate the load reported by each raylet.
       auto load = usage.second.resource_load_by_shape();
       for (const auto &demand : load.resource_demands()) {
-        auto &aggregate_demand = aggregate_load[demand.shape()];
+        auto scheduling_key = ResourceSet(MapFromProtobuf(demand.shape()));
+        auto &aggregate_demand = aggregate_load[scheduling_key];
         aggregate_demand.set_num_ready_requests_queued(
             aggregate_demand.num_ready_requests_queued() +
             demand.num_ready_requests_queued());
@@ -281,7 +218,7 @@ void GcsResourceManager::HandleGetAllResourceUsage(
     for (const auto &demand : aggregate_load) {
       auto demand_proto = batch->mutable_resource_load_by_shape()->add_resource_demands();
       demand_proto->CopyFrom(demand.second);
-      for (const auto &resource_pair : demand.first) {
+      for (const auto &resource_pair : demand.first.GetResourceMap()) {
        (*demand_proto->mutable_shape())[resource_pair.first] = resource_pair.second;
       }
     }
@@ -338,39 +275,24 @@ void GcsResourceManager::Initialize(const GcsInitData &gcs_init_data) {
   for (const auto &entry : cluster_resources) {
     const auto &iter = cluster_scheduling_resources_.find(entry.first);
     if (iter != cluster_scheduling_resources_.end()) {
-      auto node_resources = iter->second->GetMutableLocalView();
       for (const auto &resource : entry.second.items()) {
-        UpdateResourceCapacity(node_resources, resource.first,
-                               resource.second.resource_capacity());
+        iter->second->UpdateResourceCapacity(resource.first,
+                                             resource.second.resource_capacity());
       }
     }
   }
 }

-const absl::flat_hash_map<NodeID, std::shared_ptr<Node>>
+const absl::flat_hash_map<NodeID, std::shared_ptr<SchedulingResources>>
     &GcsResourceManager::GetClusterResources() const {
   return cluster_scheduling_resources_;
 }

-void GcsResourceManager::SetAvailableResources(
-    const NodeID &node_id, const absl::flat_hash_map<std::string, double> &resource_map) {
+void GcsResourceManager::SetAvailableResources(const NodeID &node_id,
+                                               const ResourceSet &resources) {
   auto iter = cluster_scheduling_resources_.find(node_id);
   if (iter != cluster_scheduling_resources_.end()) {
-    auto resources = ResourceMapToResourceRequest(resource_map,
-                                                  /*requires_object_store_memory=*/false);
-    auto node_resources = iter->second->GetMutableLocalView();
-    for (size_t i = 0; i < node_resources->predefined_resources.size(); ++i) {
-      node_resources->predefined_resources[i].available =
-          resources.predefined_resources[i];
-    }
-    for (auto &entry : node_resources->custom_resources) {
-      auto it = resources.custom_resources.find(entry.first);
-      if (it != resources.custom_resources.end()) {
-        entry.second.available = it->second;
-      } else {
-        entry.second.available = 0.;
-      }
-    }
+    iter->second->SetAvailableResources(ResourceSet(resources));
   } else {
     RAY_LOG(WARNING)
         << "Skip the setting of available resources of node " << node_id
@@ -378,19 +300,12 @@ void GcsResourceManager::SetAvailableResources(
   }
 }

-void GcsResourceManager::DeleteResources(NodeResources *node_resources,
-                                         const std::vector<std::string> &resource_names) {
-  for (const auto &resource_name : resource_names) {
-    auto resource_id = scheduling::ResourceID(resource_name).ToInt();
-    if (resource_id == -1) {
-      continue;
-    }
-    if (resource_id >= 0 && resource_id < PredefinedResources_MAX) {
-      node_resources->predefined_resources[resource_id].total = 0;
-      node_resources->predefined_resources[resource_id].available = 0;
-    } else {
-      node_resources->custom_resources.erase(resource_id);
+void GcsResourceManager::DeleteResources(
+    const NodeID &node_id, const std::vector<std::string> &deleted_resources) {
+  auto iter = cluster_scheduling_resources_.find(node_id);
+  if (iter != cluster_scheduling_resources_.end()) {
+    for (const auto &resource_name : deleted_resources) {
+      iter->second->DeleteResource(resource_name);
     }
   }
 }
@@ -401,9 +316,9 @@ void GcsResourceManager::OnNodeAdd(const rpc::GcsNodeInfo &node) {
     absl::flat_hash_map<std::string, double> resource_mapping(
         node.resources_total().begin(), node.resources_total().end());
     // Update the cluster scheduling resources as new node is added.
+    ResourceSet node_resources(resource_mapping);
     cluster_scheduling_resources_.emplace(
-        node_id, std::make_shared<Node>(
-                     ResourceMapToNodeResources(resource_mapping, resource_mapping)));
+        node_id, std::make_shared<SchedulingResources>(node_resources));
   }
 }
@@ -414,21 +329,13 @@ void GcsResourceManager::OnNodeDead(const NodeID &node_id) {
 }

 bool GcsResourceManager::AcquireResources(const NodeID &node_id,
-                                          const ResourceRequest &required_resources) {
+                                          const ResourceSet &required_resources) {
   auto iter = cluster_scheduling_resources_.find(node_id);
   if (iter != cluster_scheduling_resources_.end()) {
-    auto node_resources = iter->second->GetMutableLocalView();
-    if (!node_resources->IsAvailable(required_resources)) {
+    if (!required_resources.IsSubset(iter->second->GetAvailableResources())) {
       return false;
     }
-    for (size_t i = 0; i < required_resources.predefined_resources.size(); ++i) {
-      node_resources->predefined_resources[i].available -=
-          required_resources.predefined_resources[i];
-    }
-    for (auto &entry : required_resources.custom_resources) {
-      node_resources->custom_resources[entry.first].available -= entry.second;
-    }
+    iter->second->Acquire(required_resources);
   }
   // If node dead, we will not find the node. This is a normal scenario, so it returns
   // true.
@@ -436,27 +343,10 @@ bool GcsResourceManager::AcquireResources(const NodeID &node_id,
 }

 bool GcsResourceManager::ReleaseResources(const NodeID &node_id,
-                                          const ResourceRequest &acquired_resources) {
+                                          const ResourceSet &acquired_resources) {
   auto iter = cluster_scheduling_resources_.find(node_id);
   if (iter != cluster_scheduling_resources_.end()) {
-    auto node_resources = iter->second->GetMutableLocalView();
-    RAY_CHECK(acquired_resources.predefined_resources.size() <=
-              node_resources->predefined_resources.size());
-    for (size_t i = 0; i < acquired_resources.predefined_resources.size(); ++i) {
-      node_resources->predefined_resources[i].available +=
-          acquired_resources.predefined_resources[i];
-      node_resources->predefined_resources[i].available =
-          std::min(node_resources->predefined_resources[i].available,
-                   node_resources->predefined_resources[i].total);
-    }
-    for (auto &entry : acquired_resources.custom_resources) {
-      auto it = node_resources->custom_resources.find(entry.first);
-      if (it != node_resources->custom_resources.end()) {
-        it->second.available += entry.second;
-        it->second.available = std::min(it->second.available, it->second.total);
-      }
-    }
+    iter->second->Release(acquired_resources);
   }
   // If node dead, we will not find the node. This is a normal scenario, so it returns
   // true.
@@ -493,9 +383,24 @@ void GcsResourceManager::AddResourcesChangedListener(std::function<void()> listener) {
 void GcsResourceManager::UpdateNodeNormalTaskResources(
     const NodeID &node_id, const rpc::ResourcesData &heartbeat) {
-  // TODO(Shanly): To be implemented.
-  // This method is breaked by the refactoring of new resource structure, just remove the
-  // implementation for the time being.
+  auto iter = cluster_scheduling_resources_.find(node_id);
+  if (iter == cluster_scheduling_resources_.end()) {
+    return;
+  }
+
+  auto &scheduling_resoruces = iter->second;
+  ResourceSet resources_normal_task(MapFromProtobuf(heartbeat.resources_normal_task()));
+  if (heartbeat.resources_normal_task_changed() &&
+      heartbeat.resources_normal_task_timestamp() >
+          latest_resources_normal_task_timestamp_[node_id] &&
+      !resources_normal_task.IsEqual(scheduling_resoruces->GetNormalTaskResources())) {
+    scheduling_resoruces->SetNormalTaskResources(resources_normal_task);
+    latest_resources_normal_task_timestamp_[node_id] =
+        heartbeat.resources_normal_task_timestamp();
+    for (const auto &listener : resources_changed_listeners_) {
+      listener();
+    }
+  }
 }

 std::string GcsResourceManager::ToString() const {
@@ -505,52 +410,11 @@ std::string GcsResourceManager::ToString() const {
   std::string indent_1(indent + 1 * 2, ' ');
   ostr << "{\n";
   for (const auto &entry : cluster_scheduling_resources_) {
-    ostr << indent_1 << entry.first << " : " << entry.second->GetLocalView().DebugString()
-         << ",\n";
+    ostr << indent_1 << entry.first << " : " << entry.second->DebugString() << ",\n";
   }
   ostr << indent_0 << "}\n";
   return ostr.str();
 }
-void GcsResourceManager::UpdateResourceCapacity(NodeResources *node_resources,
-                                                const std::string &resource_name,
-                                                double capacity) {
-  auto idx = scheduling::ResourceID(resource_name).ToInt();
-  if (idx == -1) {
-    return;
-  }
-  FixedPoint resource_total_fp(capacity);
-  if (idx >= 0 && idx < PredefinedResources_MAX) {
-    auto diff_capacity =
-        resource_total_fp - node_resources->predefined_resources[idx].total;
-    node_resources->predefined_resources[idx].total += diff_capacity;
-    node_resources->predefined_resources[idx].available += diff_capacity;
-    if (node_resources->predefined_resources[idx].available < 0) {
-      node_resources->predefined_resources[idx].available = 0;
-    }
-    if (node_resources->predefined_resources[idx].total < 0) {
-      node_resources->predefined_resources[idx].total = 0;
-    }
-  } else {
-    auto itr = node_resources->custom_resources.find(idx);
-    if (itr != node_resources->custom_resources.end()) {
-      auto diff_capacity = resource_total_fp - itr->second.total;
-      itr->second.total += diff_capacity;
-      itr->second.available += diff_capacity;
-      if (itr->second.available < 0) {
-        itr->second.available = 0;
-      }
-      if (itr->second.total < 0) {
-        itr->second.total = 0;
-      }
-    } else {
-      ResourceCapacity resource_capacity;
-      resource_capacity.total = resource_capacity.available = resource_total_fp;
-      node_resources->custom_resources.emplace(idx, resource_capacity);
-    }
-  }
-}

 }  // namespace gcs
 }  // namespace ray

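AcquireResources and ReleaseResources above return to plain set arithmetic on SchedulingResources: reject a request unless it is a subset of the currently available resources, deduct on acquire, and add back (capped at the node's total) on release. A stripped-down sketch of that bookkeeping under assumed names, with std::map standing in for ResourceSet:

// Minimal sketch of the restored GCS-side bookkeeping; names are illustrative.
#include <algorithm>
#include <cassert>
#include <map>
#include <string>

using ResourceMap = std::map<std::string, double>;

struct NodeResourcesSketch {
  ResourceMap total;
  ResourceMap available;
};

// True iff every requested amount fits into what is currently available.
bool IsSubset(const ResourceMap &request, const ResourceMap &available) {
  for (const auto &[name, amount] : request) {
    auto it = available.find(name);
    if (it == available.end() || it->second < amount) return false;
  }
  return true;
}

bool Acquire(NodeResourcesSketch &node, const ResourceMap &request) {
  if (!IsSubset(request, node.available)) return false;
  for (const auto &[name, amount] : request) node.available[name] -= amount;
  return true;
}

void Release(NodeResourcesSketch &node, const ResourceMap &request) {
  for (const auto &[name, amount] : request) {
    auto &avail = node.available[name];
    avail = std::min(avail + amount, node.total[name]);  // never exceed capacity
  }
}

int main() {
  NodeResourcesSketch node{{{"CPU", 8}}, {{"CPU", 8}}};
  assert(Acquire(node, {{"CPU", 2}}));   // 6 CPUs left
  assert(!Acquire(node, {{"CPU", 7}}));  // rejected: only 6 available
  Release(node, {{"CPU", 2}});           // back to 8, capped at total
}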

@@ -18,10 +18,11 @@
 #include "ray/common/asio/instrumented_io_context.h"
 #include "ray/common/asio/periodical_runner.h"
 #include "ray/common/id.h"
+#include "ray/common/task/scheduling_resources.h"
 #include "ray/gcs/gcs_server/gcs_init_data.h"
+#include "ray/gcs/gcs_server/gcs_resource_manager.h"
 #include "ray/gcs/gcs_server/gcs_table_storage.h"
 #include "ray/gcs/pubsub/gcs_pub_sub.h"
-#include "ray/raylet/scheduling/cluster_resource_data.h"
 #include "ray/rpc/client_call.h"
 #include "ray/rpc/gcs_server/gcs_rpc_server.h"
 #include "src/ray/protobuf/gcs.pb.h"
@@ -84,7 +85,8 @@ class GcsResourceManager : public rpc::NodeResourceInfoHandler {
   /// Get the resources of all nodes in the cluster.
   ///
   /// \return The resources of all nodes in the cluster.
-  const absl::flat_hash_map<NodeID, std::shared_ptr<Node>> &GetClusterResources() const;
+  const absl::flat_hash_map<NodeID, std::shared_ptr<SchedulingResources>>
+      &GetClusterResources() const;

   /// Handle a node registration.
   ///
@@ -100,9 +102,7 @@ class GcsResourceManager : public rpc::NodeResourceInfoHandler {
   ///
   /// \param node_id Id of a node.
   /// \param resources Available resources of a node.
-  void SetAvailableResources(
-      const NodeID &node_id,
-      const absl::flat_hash_map<std::string, double> &resource_map);
+  void SetAvailableResources(const NodeID &node_id, const ResourceSet &resources);

   /// Acquire resources from the specified node. It will deduct directly from the node
   /// resources.
@@ -110,7 +110,7 @@ class GcsResourceManager : public rpc::NodeResourceInfoHandler {
   /// \param node_id Id of a node.
   /// \param required_resources Resources to apply for.
   /// \return True if acquire resources successfully. False otherwise.
-  bool AcquireResources(const NodeID &node_id, const ResourceRequest &required_resources);
+  bool AcquireResources(const NodeID &node_id, const ResourceSet &required_resources);

   /// Release the resources of the specified node. It will be added directly to the node
   /// resources.
@@ -118,7 +118,7 @@ class GcsResourceManager : public rpc::NodeResourceInfoHandler {
   /// \param node_id Id of a node.
   /// \param acquired_resources Resources to release.
   /// \return True if release resources successfully. False otherwise.
-  bool ReleaseResources(const NodeID &node_id, const ResourceRequest &acquired_resources);
+  bool ReleaseResources(const NodeID &node_id, const ResourceSet &acquired_resources);

   /// Initialize with the gcs tables data synchronously.
   /// This should be called when GCS server restarts after a failure.
@@ -158,13 +158,10 @@ class GcsResourceManager : public rpc::NodeResourceInfoHandler {
  private:
   /// Delete the scheduling resources of the specified node.
   ///
-  /// \param node_resources Id of a node.
-  /// \param resource_names Deleted resources of a node.
-  void DeleteResources(NodeResources *node_resources,
-                       const std::vector<std::string> &resource_names);
-
-  void UpdateResourceCapacity(NodeResources *node_resources,
-                              const std::string &resource_name, double capacity);
+  /// \param node_id Id of a node.
+  /// \param deleted_resources Deleted resources of a node.
+  void DeleteResources(const NodeID &node_id,
+                       const std::vector<std::string> &deleted_resources);

   /// The runner to run function periodically.
   PeriodicalRunner periodical_runner_;
@@ -176,7 +173,8 @@ class GcsResourceManager : public rpc::NodeResourceInfoHandler {
   /// Storage for GCS tables.
   std::shared_ptr<gcs::GcsTableStorage> gcs_table_storage_;
   /// Map from node id to the scheduling resources of the node.
-  absl::flat_hash_map<NodeID, std::shared_ptr<Node>> cluster_scheduling_resources_;
+  absl::flat_hash_map<NodeID, std::shared_ptr<SchedulingResources>>
+      cluster_scheduling_resources_;
   /// Placement group load information that is used for autoscaler.
   absl::optional<std::shared_ptr<rpc::PlacementGroupLoad>> placement_group_load_;
   /// Normal task resources could be uploaded by 1) Raylets' periodical reporters; 2)
@@ -206,34 +204,3 @@ class GcsResourceManager : public rpc::NodeResourceInfoHandler {

 }  // namespace gcs
 }  // namespace ray
-
-namespace std {
-template <>
-struct hash<google::protobuf::Map<std::string, double>> {
-  size_t operator()(google::protobuf::Map<std::string, double> const &k) const {
-    size_t seed = k.size();
-    for (auto &elem : k) {
-      seed ^= std::hash<std::string>()(elem.first);
-      seed ^= std::hash<double>()(elem.second);
-    }
-    return seed;
-  }
-};
-
-template <>
-struct equal_to<google::protobuf::Map<std::string, double>> {
-  bool operator()(const google::protobuf::Map<std::string, double> &left,
-                  const google::protobuf::Map<std::string, double> &right) const {
-    if (left.size() != right.size()) {
-      return false;
-    }
-    for (const auto &entry : left) {
-      auto iter = right.find(entry.first);
-      if (iter == right.end() || iter->second != entry.second) {
-        return false;
-      }
-    }
-    return true;
-  }
-};
-}  // namespace std

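The deleted std namespace block above existed so HandleGetAllResourceUsage could key an unordered_map directly by a protobuf demand shape; after the revert, the aggregation keys by ResourceSet, which hashes via absl::flat_hash_map instead. For illustration, a self-contained sketch of the same XOR-combining idea over an ordinary map key. One side observation (ours, not from the diff): XOR-ing key and value hashes independently is order-insensitive, but it also makes shapes that merely swap amounts between resources collide:

#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <unordered_map>

struct ShapeHash {
  size_t operator()(const std::map<std::string, double> &shape) const {
    size_t seed = shape.size();
    for (const auto &[name, amount] : shape) {
      seed ^= std::hash<std::string>()(name);  // order-independent combine
      seed ^= std::hash<double>()(amount);
    }
    return seed;
  }
};

int main() {
  // Aggregate demand counts keyed by resource shape, as the GCS does for
  // resource_load_by_shape; std::map's operator== supplies key equality.
  std::unordered_map<std::map<std::string, double>, int, ShapeHash> aggregate_load;
  aggregate_load[{{"CPU", 1}}] += 4;
  aggregate_load[{{"CPU", 1}}] += 2;  // merges with the previous entry
  aggregate_load[{{"CPU", 4}, {"GPU", 1}}] += 1;
  for (const auto &[shape, count] : aggregate_load) {
    std::cout << shape.size() << "-resource shape -> " << count << "\n";
  }
}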

@@ -17,46 +17,40 @@
 namespace ray {
 namespace gcs {

-double LeastResourceScorer::Score(const ResourceRequest &required_resources,
-                                  const NodeResources &node_resources) {
-  // TODO(Shanly): Take normal task resources into account later for GCS-based actor
-  // scheduling.
-
-  double node_score = 0.;
-
-  if (required_resources.predefined_resources.size() >
-      node_resources.predefined_resources.size()) {
-    return -1.;
-  }
-
-  for (size_t i = 0; i < required_resources.predefined_resources.size(); ++i) {
-    const auto &request_resource = required_resources.predefined_resources[i];
-    const auto &node_available_resource =
-        node_resources.predefined_resources[i].available;
-    auto score = Calculate(request_resource, node_available_resource);
-    if (score < 0.) {
-      return -1.;
-    }
-    node_score += score;
-  }
-
-  for (const auto &request_resource_entry : required_resources.custom_resources) {
-    auto iter = node_resources.custom_resources.find(request_resource_entry.first);
-    if (iter == node_resources.custom_resources.end()) {
-      return -1.;
-    }
-    const auto &request_resource = request_resource_entry.second;
-    const auto &node_available_resource = iter->second.available;
-    auto score = Calculate(request_resource, node_available_resource);
-    if (score < 0.) {
-      return -1.;
-    }
-    node_score += score;
-  }
-
+double LeastResourceScorer::Score(const ResourceSet &required_resources,
+                                  const SchedulingResources &node_resources) {
+  // In GCS-based actor scheduling, the `resources_available_` (of class
+  // `SchedulingResources`) is only acquired or released by actor scheduling, instead of
+  // being updated by resource reports from raylets. So the 'actual' available resources
+  // (if there exist normal tasks) are equal to `resources_available_` -
+  // `resources_normal_tasks_`.
+  ResourceSet new_available_resource_set;
+  const ResourceSet *available_resource_set = &node_resources.GetAvailableResources();
+  if (!node_resources.GetNormalTaskResources().IsEmpty()) {
+    new_available_resource_set = node_resources.GetAvailableResources();
+    new_available_resource_set.SubtractResources(node_resources.GetNormalTaskResources());
+    available_resource_set = &new_available_resource_set;
+  }
+  const auto &available_resource_amount_map =
+      available_resource_set->GetResourceAmountMap();
+
+  double node_score = 0.0;
+  for (const auto &entry : required_resources.GetResourceAmountMap()) {
+    auto available_resource_amount_iter = available_resource_amount_map.find(entry.first);
+    if (available_resource_amount_iter == available_resource_amount_map.end()) {
+      return -1;
+    }
+    auto calculated_score =
+        Calculate(entry.second, available_resource_amount_iter->second);
+    if (calculated_score < 0) {
+      return -1;
+    }
+    node_score += calculated_score;
+  }
+
+  // TODO(ffbin): We always want to choose the node with the least matching resources. We
+  // will solve it in next pr.
   return node_score;
 }
@@ -67,22 +61,20 @@ double LeastResourceScorer::Calculate(const FixedPoint &requested,
   if (requested > available) {
     return -1;
   }
-  if (available == 0) {
-    return 0;
-  }

   return (available - requested).Double() / available.Double();
 }

 /////////////////////////////////////////////////////////////////////////////////////////

 SchedulingResult GcsResourceScheduler::Schedule(
-    const std::vector<ResourceRequest> &required_resources_list,
+    const std::vector<ResourceSet> &required_resources_list,
     const SchedulingType &scheduling_type,
     const std::function<bool(const NodeID &)> &node_filter_func) {
+  const auto &cluster_resources = gcs_resource_manager_.GetClusterResources();
   // Filter candidate nodes.
-  auto candidate_nodes = FilterCandidateNodes(node_filter_func);
+  absl::flat_hash_set<NodeID> candidate_nodes =
+      FilterCandidateNodes(cluster_resources, node_filter_func);
   if (candidate_nodes.empty()) {
     RAY_LOG(DEBUG) << "The candidate nodes is empty, return directly.";
     return std::make_pair(SchedulingResultStatus::INFEASIBLE, std::vector<NodeID>());
@@ -115,11 +107,12 @@ SchedulingResult GcsResourceScheduler::Schedule(
 }

 absl::flat_hash_set<NodeID> GcsResourceScheduler::FilterCandidateNodes(
+    const absl::flat_hash_map<NodeID, std::shared_ptr<SchedulingResources>>
+        &cluster_resources,
     const std::function<bool(const NodeID &)> &node_filter_func) {
   absl::flat_hash_set<NodeID> result;
-  const auto &resource_view = GetResourceView();
-  result.reserve(resource_view.size());
-  for (const auto &iter : resource_view) {
+  result.reserve(cluster_resources.size());
+  for (const auto &iter : cluster_resources) {
     const auto &node_id = iter.first;
     if (node_filter_func == nullptr || node_filter_func(node_id)) {
       result.emplace(node_id);
@@ -128,8 +121,8 @@ absl::flat_hash_set<NodeID> GcsResourceScheduler::FilterCandidateNodes(
   return result;
 }

-const std::vector<ResourceRequest> &GcsResourceScheduler::SortRequiredResources(
-    const std::vector<ResourceRequest> &required_resources) {
+const std::vector<ResourceSet> &GcsResourceScheduler::SortRequiredResources(
+    const std::vector<ResourceSet> &required_resources) {
   // TODO(ffbin): A bundle may require special resources, such as GPU. We need to
   // schedule bundles with special resource requirements first, which will be implemented
   // in the next pr.
@@ -137,7 +130,7 @@ const std::vector<ResourceRequest> &GcsResourceScheduler::SortRequiredResources(
 }

 SchedulingResult GcsResourceScheduler::StrictSpreadSchedule(
-    const std::vector<ResourceRequest> &required_resources_list,
+    const std::vector<ResourceSet> &required_resources_list,
     const absl::flat_hash_set<NodeID> &candidate_nodes) {
   if (required_resources_list.size() > candidate_nodes.size()) {
     RAY_LOG(DEBUG) << "The number of required resources "
@@ -171,7 +164,7 @@ SchedulingResult GcsResourceScheduler::StrictSpreadSchedule(
 }

 SchedulingResult GcsResourceScheduler::SpreadSchedule(
-    const std::vector<ResourceRequest> &required_resources_list,
+    const std::vector<ResourceSet> &required_resources_list,
     const absl::flat_hash_set<NodeID> &candidate_nodes) {
   std::vector<NodeID> result_nodes;
   absl::flat_hash_set<NodeID> candidate_nodes_copy(candidate_nodes);
@@ -183,7 +176,7 @@ SchedulingResult GcsResourceScheduler::SpreadSchedule(
     // There are nodes to meet the scheduling requirements.
     if (best_node) {
       result_nodes.emplace_back(std::move(*best_node));
-      RAY_CHECK(AllocateRemoteTaskResources(result_nodes.back(), iter));
+      RAY_CHECK(gcs_resource_manager_.AcquireResources(result_nodes.back(), iter));
       candidate_nodes_copy.erase(result_nodes.back());
       selected_nodes.insert(result_nodes.back());
     } else {
@@ -191,7 +184,7 @@ SchedulingResult GcsResourceScheduler::SpreadSchedule(
       auto best_node = GetBestNode(iter, selected_nodes);
       if (best_node) {
         result_nodes.push_back(std::move(*best_node));
-        RAY_CHECK(AllocateRemoteTaskResources(result_nodes.back(), iter));
+        RAY_CHECK(gcs_resource_manager_.AcquireResources(result_nodes.back(), iter));
       } else {
         break;
       }
@@ -209,30 +202,20 @@ SchedulingResult GcsResourceScheduler::SpreadSchedule(
 }
 
 SchedulingResult GcsResourceScheduler::StrictPackSchedule(
-    const std::vector<ResourceRequest> &required_resources_list,
+    const std::vector<ResourceSet> &required_resources_list,
     const absl::flat_hash_set<NodeID> &candidate_nodes) {
   // Aggregate required resources.
-  ResourceRequest aggregated_resource_request;
-  for (const auto &resource_request : required_resources_list) {
-    if (aggregated_resource_request.predefined_resources.size() <
-        resource_request.predefined_resources.size()) {
-      aggregated_resource_request.predefined_resources.resize(
-          resource_request.predefined_resources.size());
-    }
-    for (size_t i = 0; i < resource_request.predefined_resources.size(); ++i) {
-      aggregated_resource_request.predefined_resources[i] +=
-          resource_request.predefined_resources[i];
-    }
-    for (const auto &entry : resource_request.custom_resources) {
-      aggregated_resource_request.custom_resources[entry.first] += entry.second;
-    }
+  ResourceSet required_resources;
+  for (const auto &iter : required_resources_list) {
+    required_resources.AddResources(iter);
   }
 
-  const auto &cluster_resource = GetResourceView();
+  const auto &cluster_resource = gcs_resource_manager_.GetClusterResources();
   const auto &right_node_it = std::find_if(
       cluster_resource.begin(), cluster_resource.end(),
-      [&aggregated_resource_request](const auto &entry) {
-        return entry.second->GetLocalView().IsAvailable(aggregated_resource_request);
+      [required_resources](const auto &node_resource) {
+        return required_resources.IsSubset(node_resource.second->GetTotalResources());
       });
 
   if (right_node_it == cluster_resource.end()) {
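(Illustrative aside, not part of the diff.) The deleted block above aggregates a ResourceRequest by hand: it grows the predefined-resource vector to the widest request, sums it element-wise, and accumulates custom resources by key — the same work the restored single call to ResourceSet::AddResources performs. A self-contained sketch of that aggregation idea, using simplified stand-in types rather than Ray's real classes:

    #include <cstddef>
    #include <iostream>
    #include <unordered_map>
    #include <vector>

    // Simplified stand-ins for Ray's resource types; the real classes carry
    // FixedPoint capacities and richer bookkeeping.
    struct Request {
      std::vector<double> predefined;          // indexed by resource kind
      std::unordered_map<int, double> custom;  // custom resource id -> amount
    };

    // Aggregate a list of requests into one, the way the removed block did:
    // widen the predefined vector to the largest request, sum element-wise,
    // then accumulate custom resources by key.
    Request Aggregate(const std::vector<Request> &requests) {
      Request total;
      for (const auto &r : requests) {
        if (total.predefined.size() < r.predefined.size()) {
          total.predefined.resize(r.predefined.size(), 0.0);
        }
        for (std::size_t i = 0; i < r.predefined.size(); ++i) {
          total.predefined[i] += r.predefined[i];
        }
        for (const auto &[id, amount] : r.custom) {
          total.custom[id] += amount;
        }
      }
      return total;
    }

    int main() {
      std::vector<Request> bundle = {{{1.0, 0.5}, {{1, 2.0}}},
                                     {{2.0}, {{1, 1.0}, {2, 4.0}}}};
      Request total = Aggregate(bundle);
      std::cout << total.predefined[0] << " " << total.custom[1] << "\n";  // 3 3
      return 0;
    }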
@@ -243,7 +226,7 @@ SchedulingResult GcsResourceScheduler::StrictPackSchedule(
   std::vector<NodeID> result_nodes;
-  auto best_node = GetBestNode(aggregated_resource_request, candidate_nodes);
+  auto best_node = GetBestNode(required_resources, candidate_nodes);
 
   // Select the node with the highest score.
   // `StrictPackSchedule` does not need to consider the scheduling context, because it
@@ -262,12 +245,12 @@ SchedulingResult GcsResourceScheduler::StrictPackSchedule(
 }
 
 SchedulingResult GcsResourceScheduler::PackSchedule(
-    const std::vector<ResourceRequest> &required_resources_list,
+    const std::vector<ResourceSet> &required_resources_list,
     const absl::flat_hash_set<NodeID> &candidate_nodes) {
   std::vector<NodeID> result_nodes;
   result_nodes.resize(required_resources_list.size());
   absl::flat_hash_set<NodeID> candidate_nodes_copy(candidate_nodes);
-  std::list<std::pair<int, ResourceRequest>> required_resources_list_copy;
+  std::list<std::pair<int, ResourceSet>> required_resources_list_copy;
   int index = 0;
   for (const auto &iter : required_resources_list) {
     required_resources_list_copy.emplace_back(index++, iter);
@@ -282,14 +265,14 @@ SchedulingResult GcsResourceScheduler::PackSchedule(
       break;
     }
-    RAY_CHECK(AllocateRemoteTaskResources(*best_node, required_resources));
+    RAY_CHECK(gcs_resource_manager_.AcquireResources(*best_node, required_resources));
     result_nodes[required_resources_index] = *best_node;
     required_resources_list_copy.pop_front();
 
     // We try to schedule more resources on one node.
     for (auto iter = required_resources_list_copy.begin();
          iter != required_resources_list_copy.end();) {
-      if (AllocateRemoteTaskResources(*best_node, iter->second)) {
+      if (gcs_resource_manager_.AcquireResources(*best_node, iter->second)) {
        result_nodes[iter->first] = *best_node;
        required_resources_list_copy.erase(iter++);
      } else {
@@ -310,15 +293,17 @@ SchedulingResult GcsResourceScheduler::PackSchedule(
 }
 
 std::optional<NodeID> GcsResourceScheduler::GetBestNode(
-    const ResourceRequest &required_resources,
+    const ResourceSet &required_resources,
     const absl::flat_hash_set<NodeID> &candidate_nodes) {
   double best_node_score = -1;
   const NodeID *best_node_id = nullptr;
+  const auto &cluster_resources = gcs_resource_manager_.GetClusterResources();
 
   // Score the nodes.
   for (const auto &node_id : candidate_nodes) {
-    const auto &node_resources = GetNodeResources(node_id);
-    double node_score = node_scorer_->Score(required_resources, node_resources);
+    const auto &iter = cluster_resources.find(node_id);
+    RAY_CHECK(iter != cluster_resources.end());
+    double node_score = node_scorer_->Score(required_resources, *iter->second);
     if (best_node_id == nullptr || best_node_score < node_score) {
       best_node_id = &node_id;
       best_node_score = node_score;
@@ -329,41 +314,19 @@ std::optional<NodeID> GcsResourceScheduler::GetBestNode(
   } else {
     return std::nullopt;
   }
-  return std::nullopt;
 }
 
 void GcsResourceScheduler::ReleaseTemporarilyDeductedResources(
-    const std::vector<ResourceRequest> &required_resources_list,
+    const std::vector<ResourceSet> &required_resources_list,
     const std::vector<NodeID> &nodes) {
   for (int index = 0; index < (int)nodes.size(); index++) {
     // If `PackSchedule` fails, the id of some nodes may be nil.
     if (!nodes[index].IsNil()) {
-      RAY_CHECK(ReleaseRemoteTaskResources(nodes[index], required_resources_list[index]));
+      RAY_CHECK(gcs_resource_manager_.ReleaseResources(nodes[index],
+                                                       required_resources_list[index]));
     }
   }
 }
 
-const NodeResources &GcsResourceScheduler::GetNodeResources(const NodeID &node_id) const {
-  const auto &resource_view = GetResourceView();
-  auto iter = resource_view.find(node_id);
-  RAY_CHECK(iter != resource_view.end());
-  return iter->second->GetLocalView();
-}
-
-bool GcsResourceScheduler::AllocateRemoteTaskResources(
-    const NodeID &node_id, const ResourceRequest &resource_request) {
-  return gcs_resource_manager_.AcquireResources(node_id, resource_request);
-}
-
-bool GcsResourceScheduler::ReleaseRemoteTaskResources(
-    const NodeID &node_id, const ResourceRequest &resource_request) {
-  return gcs_resource_manager_.ReleaseResources(node_id, resource_request);
-}
-
-const absl::flat_hash_map<NodeID, std::shared_ptr<Node>>
-    &GcsResourceScheduler::GetResourceView() const {
-  return gcs_resource_manager_.GetClusterResources();
-}
-
 } // namespace gcs
 } // namespace ray
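(Illustrative aside, not part of the diff.) A pattern worth noting in this file: every strategy deducts resources from the GCS view as it picks nodes (AcquireResources), and ReleaseTemporarilyDeductedResources undoes those deductions when the overall placement fails. A self-contained sketch of that acquire-then-roll-back idea, with simplified single-resource bookkeeping rather than Ray's types:

    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    // Toy version of the acquire/rollback pattern: each selected node has
    // resources deducted up front; if the overall placement fails, every
    // deduction is returned.
    struct Cluster {
      std::unordered_map<std::string, double> available;  // node id -> free CPUs

      bool Acquire(const std::string &node, double amount) {
        auto it = available.find(node);
        if (it == available.end() || it->second < amount) return false;
        it->second -= amount;
        return true;
      }
      void Release(const std::string &node, double amount) { available[node] += amount; }
    };

    // Try to place every request; on the first failure, roll back what was taken.
    bool PlaceAll(Cluster &cluster,
                  const std::vector<std::pair<std::string, double>> &plan) {
      std::vector<std::size_t> done;
      for (std::size_t i = 0; i < plan.size(); ++i) {
        if (!cluster.Acquire(plan[i].first, plan[i].second)) {
          for (std::size_t j : done) cluster.Release(plan[j].first, plan[j].second);
          return false;
        }
        done.push_back(i);
      }
      return true;
    }

    int main() {
      Cluster cluster{{{"node-a", 2.0}, {"node-b", 1.0}}};
      std::cout << PlaceAll(cluster, {{"node-a", 2.0}, {"node-b", 4.0}}) << "\n";  // 0
      std::cout << cluster.available["node-a"] << "\n";  // 2 (rolled back)
    }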
@@ -16,9 +16,8 @@
 #include <optional>
 
 #include "absl/container/flat_hash_set.h"
-#include "ray/common/id.h"
+#include "ray/common/task/scheduling_resources.h"
 #include "ray/gcs/gcs_server/gcs_resource_manager.h"
-#include "ray/raylet/scheduling/cluster_resource_data.h"
 
 namespace ray {
 namespace gcs {
@@ -62,16 +61,16 @@ class NodeScorer {
   /// \param node_resources The node resources which contains available and total
   /// resources.
   /// \return Score of the node.
-  virtual double Score(const ResourceRequest &required_resources,
-                       const NodeResources &node_resources) = 0;
+  virtual double Score(const ResourceSet &required_resources,
+                       const SchedulingResources &node_resources) = 0;
 };
 
 /// LeastResourceScorer is a score plugin that favors nodes with fewer allocation
 /// requested resources based on requested resources.
 class LeastResourceScorer : public NodeScorer {
  public:
-  double Score(const ResourceRequest &required_resources,
-               const NodeResources &node_resources) override;
+  double Score(const ResourceSet &required_resources,
+               const SchedulingResources &node_resources) override;
 
  private:
   /// \brief Calculate one of the resource scores.
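(Illustrative aside, not part of the diff.) The NodeScorer contract above takes a request plus a node's resources and returns a score; higher means a better fit, and GetBestNode in the .cc hunk earlier keeps the arg-max. A minimal illustration of a least-resource heuristic, assuming simplified map-based types and a made-up formula — Ray's actual LeastResourceScorer differs in detail:

    #include <iostream>
    #include <string>
    #include <unordered_map>

    // Illustrative least-resource scoring: the more of a node's capacity is
    // left over after placing the request, the higher the score.
    using ResourceMap = std::unordered_map<std::string, double>;

    double Score(const ResourceMap &required, const ResourceMap &available,
                 const ResourceMap &total) {
      double score = 0.0;
      int terms = 0;
      for (const auto &[name, need] : required) {
        auto avail_it = available.find(name);
        auto total_it = total.find(name);
        if (avail_it == available.end() || total_it == total.end() ||
            avail_it->second < need || total_it->second <= 0.0) {
          return -1.0;  // infeasible node: cannot hold the request at all
        }
        // Fraction of capacity still free after the hypothetical allocation.
        score += (avail_it->second - need) / total_it->second;
        ++terms;
      }
      return terms == 0 ? 1.0 : score / terms;
    }

    int main() {
      ResourceMap req = {{"CPU", 2.0}};
      ResourceMap avail = {{"CPU", 6.0}}, total = {{"CPU", 8.0}};
      std::cout << Score(req, avail, total) << "\n";  // 0.5
    }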
@@ -103,7 +102,7 @@ class GcsResourceScheduler {
   /// otherwise, it will return an empty vector and a flag to indicate whether this
   /// request can be retry or not.
   SchedulingResult Schedule(
-      const std::vector<ResourceRequest> &required_resources_list,
+      const std::vector<ResourceSet> &required_resources_list,
       const SchedulingType &scheduling_type,
       const std::function<bool(const NodeID &)> &node_filter_func = nullptr);
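(Illustrative aside, not part of the diff.) Schedule() above is the scheduler's only public entry point: callers pass a list of resource sets plus a SchedulingType, and the call fans out to one of the four strategy routines. A toy, compilable mirror of that dispatch shape, with stand-in types and placeholder strategy bodies rather than Ray's real logic:

    #include <iostream>
    #include <string>
    #include <vector>

    // Toy mirror of the strategy dispatch inside GcsResourceScheduler::Schedule().
    enum class SchedulingType { SPREAD, STRICT_SPREAD, PACK, STRICT_PACK };

    using NodeID = std::string;

    // Placeholder strategies: spread keeps every candidate, pack collapses
    // onto the first one. The real routines score nodes and deduct resources.
    std::vector<NodeID> SpreadSchedule(const std::vector<NodeID> &c) { return c; }
    std::vector<NodeID> StrictSpreadSchedule(const std::vector<NodeID> &c) { return c; }
    std::vector<NodeID> PackSchedule(const std::vector<NodeID> &c) {
      return c.empty() ? c : std::vector<NodeID>{c.front()};
    }
    std::vector<NodeID> StrictPackSchedule(const std::vector<NodeID> &c) {
      return PackSchedule(c);
    }

    std::vector<NodeID> Schedule(SchedulingType type,
                                 const std::vector<NodeID> &candidates) {
      switch (type) {
        case SchedulingType::SPREAD: return SpreadSchedule(candidates);
        case SchedulingType::STRICT_SPREAD: return StrictSpreadSchedule(candidates);
        case SchedulingType::PACK: return PackSchedule(candidates);
        case SchedulingType::STRICT_PACK: return StrictPackSchedule(candidates);
      }
      return {};
    }

    int main() {
      std::vector<NodeID> nodes = {"node-a", "node-b"};
      for (const auto &id : Schedule(SchedulingType::PACK, nodes)) std::cout << id << "\n";
    }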
@@ -116,6 +115,8 @@ class GcsResourceScheduler {
   /// can be used for scheduling.
   /// \return The candidate nodes which can be used for scheduling.
   absl::flat_hash_set<NodeID> FilterCandidateNodes(
+      const absl::flat_hash_map<NodeID, std::shared_ptr<SchedulingResources>>
+          &cluster_resources,
       const std::function<bool(const NodeID &)> &node_filter_func);
 
   /// Sort required resources according to the scarcity and capacity of resources.
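(Illustrative aside, not part of the diff.) FilterCandidateNodes, restored above to take the cluster view explicitly, simply walks the known nodes and keeps the ids accepted by the predicate. A sketch of that pattern with simplified stand-ins for NodeID and SchedulingResources:

    #include <functional>
    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <unordered_set>

    using NodeID = std::string;

    // Walk the cluster view and keep only node ids accepted by the predicate;
    // a null predicate accepts everything.
    std::unordered_set<NodeID> FilterCandidateNodes(
        const std::unordered_map<NodeID, double> &cluster_resources,
        const std::function<bool(const NodeID &)> &node_filter_func) {
      std::unordered_set<NodeID> candidates;
      for (const auto &[node_id, unused] : cluster_resources) {
        if (!node_filter_func || node_filter_func(node_id)) {
          candidates.insert(node_id);
        }
      }
      return candidates;
    }

    int main() {
      std::unordered_map<NodeID, double> cluster = {{"node-a", 4.0}, {"node-b", 8.0}};
      auto candidates =
          FilterCandidateNodes(cluster, [](const NodeID &id) { return id != "node-a"; });
      for (const auto &id : candidates) std::cout << id << "\n";  // node-b
    }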
@@ -124,8 +125,8 @@ class GcsResourceScheduler {
   ///
   /// \param required_resources The resources to be scheduled.
   /// \return The Sorted resources.
-  const std::vector<ResourceRequest> &SortRequiredResources(
-      const std::vector<ResourceRequest> &required_resources);
+  const std::vector<ResourceSet> &SortRequiredResources(
+      const std::vector<ResourceSet> &required_resources);
 
   /// Schedule resources according to `STRICT_SPREAD` strategy.
   ///
@@ -135,7 +136,7 @@ class GcsResourceScheduler {
   /// otherwise, it will return an empty vector and a flag to indicate whether this
   /// request can be retry or not.
   SchedulingResult StrictSpreadSchedule(
-      const std::vector<ResourceRequest> &required_resources_list,
+      const std::vector<ResourceSet> &required_resources_list,
       const absl::flat_hash_set<NodeID> &candidate_nodes);
 
   /// Schedule resources according to `SPREAD` strategy.
@@ -145,9 +146,8 @@ class GcsResourceScheduler {
   /// \return `SchedulingResult`, including the selected nodes if schedule successful,
   /// otherwise, it will return an empty vector and a flag to indicate whether this
   /// request can be retry or not.
-  SchedulingResult SpreadSchedule(
-      const std::vector<ResourceRequest> &required_resources_list,
-      const absl::flat_hash_set<NodeID> &candidate_nodes);
+  SchedulingResult SpreadSchedule(const std::vector<ResourceSet> &required_resources_list,
+                                  const absl::flat_hash_set<NodeID> &candidate_nodes);
 
   /// Schedule resources according to `STRICT_PACK` strategy.
   ///
@@ -157,7 +157,7 @@ class GcsResourceScheduler {
   /// otherwise, it will return an empty vector and a flag to indicate whether this
   /// request can be retry or not.
   SchedulingResult StrictPackSchedule(
-      const std::vector<ResourceRequest> &required_resources_list,
+      const std::vector<ResourceSet> &required_resources_list,
       const absl::flat_hash_set<NodeID> &candidate_nodes);
 
   /// Schedule resources according to `PACK` strategy.
@@ -167,45 +167,26 @@ class GcsResourceScheduler {
   /// \return `SchedulingResult`, including the selected nodes if schedule successful,
   /// otherwise, it will return an empty vector and a flag to indicate whether this
   /// request can be retry or not.
-  SchedulingResult PackSchedule(
-      const std::vector<ResourceRequest> &required_resources_list,
-      const absl::flat_hash_set<NodeID> &candidate_nodes);
+  SchedulingResult PackSchedule(const std::vector<ResourceSet> &required_resources_list,
+                                const absl::flat_hash_set<NodeID> &candidate_nodes);
 
   /// Score all nodes according to the specified resources.
   ///
   /// \param required_resources The resources to be scheduled.
   /// \param candidate_nodes The nodes can be used for scheduling.
   /// \return Score of all nodes.
-  std::optional<NodeID> GetBestNode(const ResourceRequest &required_resources,
+  std::optional<NodeID> GetBestNode(const ResourceSet &required_resources,
                                     const absl::flat_hash_set<NodeID> &candidate_nodes);
 
-  /// Get node resources.
-  const NodeResources &GetNodeResources(const NodeID &node_id) const;
-
   /// Return the resources temporarily deducted from gcs resource manager.
   ///
   /// \param required_resources_list The resources to be scheduled.
   /// \param nodes Scheduling selected nodes, it corresponds to `required_resources_list`
   /// one by one.
   void ReleaseTemporarilyDeductedResources(
-      const std::vector<ResourceRequest> &required_resources_list,
+      const std::vector<ResourceSet> &required_resources_list,
       const std::vector<NodeID> &nodes);
 
-  /// Subtract the resources required by a given resource request (resource_request) from
-  /// a given remote node.
-  ///
-  /// \param node_id Remote node whose resources we allocate.
-  /// \param resource_request Task for which we allocate resources.
-  /// \return True if remote node has enough resources to satisfy the resource request.
-  /// False otherwise.
-  bool AllocateRemoteTaskResources(const NodeID &node_id,
-                                   const ResourceRequest &resource_request);
-
-  bool ReleaseRemoteTaskResources(const NodeID &node_id,
-                                  const ResourceRequest &resource_request);
-
-  const absl::flat_hash_map<NodeID, std::shared_ptr<Node>> &GetResourceView() const;
-
   /// Reference of GcsResourceManager.
   GcsResourceManager &gcs_resource_manager_;
@@ -157,10 +157,11 @@ TEST_F(GcsBasedActorSchedulerTest, TestScheduleAndDestroyOneActor) {
   auto node = AddNewNode(node_resources);
   auto node_id = NodeID::FromBinary(node->node_id());
   ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size());
-  absl::flat_hash_map<NodeID, std::shared_ptr<Node>> cluster_resources_before_scheduling;
+  absl::flat_hash_map<NodeID, std::shared_ptr<SchedulingResources>>
+      cluster_resources_before_scheduling;
   for (auto &entry : gcs_resource_manager_->GetClusterResources()) {
-    cluster_resources_before_scheduling.emplace(entry.first,
-                                                std::make_shared<Node>(*entry.second));
+    cluster_resources_before_scheduling.emplace(
+        entry.first, std::make_shared<SchedulingResources>(*entry.second));
   }
   ASSERT_TRUE(cluster_resources_before_scheduling.contains(node_id));
@@ -194,15 +195,17 @@ TEST_F(GcsBasedActorSchedulerTest, TestScheduleAndDestroyOneActor) {
   auto cluster_resources_after_scheduling = gcs_resource_manager_->GetClusterResources();
   ASSERT_TRUE(cluster_resources_after_scheduling.contains(node_id));
-  ASSERT_NE(cluster_resources_before_scheduling[node_id]->GetLocalView(),
-            cluster_resources_after_scheduling[node_id]->GetLocalView());
+  ASSERT_FALSE(
+      cluster_resources_before_scheduling[node_id]->GetAvailableResources().IsEqual(
+          cluster_resources_after_scheduling[node_id]->GetAvailableResources()));
 
   // When destroying an actor, its acquired resources have to be returned.
   gcs_actor_scheduler_->OnActorDestruction(actor);
   auto cluster_resources_after_destruction = gcs_resource_manager_->GetClusterResources();
   ASSERT_TRUE(cluster_resources_after_destruction.contains(node_id));
-  ASSERT_EQ(cluster_resources_before_scheduling[node_id]->GetLocalView(),
-            cluster_resources_after_scheduling[node_id]->GetLocalView());
+  ASSERT_TRUE(
+      cluster_resources_before_scheduling[node_id]->GetAvailableResources().IsEqual(
+          cluster_resources_after_destruction[node_id]->GetAvailableResources()));
 }
 
 TEST_F(GcsBasedActorSchedulerTest, TestBalancedSchedule) {
@@ -50,18 +50,17 @@ TEST_F(GcsResourceManagerTest, TestBasic) {
   ASSERT_EQ(1, cluster_resource.size());
   const auto &node_id = NodeID::FromBinary(node->node_id());
-  auto resource_request =
-      ResourceMapToResourceRequest(resource_map, /*requires_object_store_memory=*/false);
+  ResourceSet resource_set(resource_map);
 
   // Test `AcquireResources`.
-  ASSERT_TRUE(gcs_resource_manager_->AcquireResources(node_id, resource_request));
-  ASSERT_FALSE(gcs_resource_manager_->AcquireResources(node_id, resource_request));
+  ASSERT_TRUE(gcs_resource_manager_->AcquireResources(node_id, resource_set));
+  ASSERT_FALSE(gcs_resource_manager_->AcquireResources(node_id, resource_set));
 
   // Test `ReleaseResources`.
   ASSERT_TRUE(
-      gcs_resource_manager_->ReleaseResources(NodeID::FromRandom(), resource_request));
-  ASSERT_TRUE(gcs_resource_manager_->ReleaseResources(node_id, resource_request));
-  ASSERT_TRUE(gcs_resource_manager_->AcquireResources(node_id, resource_request));
+      gcs_resource_manager_->ReleaseResources(NodeID::FromRandom(), resource_set));
+  ASSERT_TRUE(gcs_resource_manager_->ReleaseResources(node_id, resource_set));
+  ASSERT_TRUE(gcs_resource_manager_->AcquireResources(node_id, resource_set));
 }
 
 TEST_F(GcsResourceManagerTest, TestResourceUsageAPI) {
@@ -52,21 +52,8 @@ class GcsResourceSchedulerTest : public ::testing::Test {
     const auto &cluster_resource = gcs_resource_manager_->GetClusterResources();
     auto iter = cluster_resource.find(node_id);
     ASSERT_TRUE(iter != cluster_resource.end());
-    const auto &node_resources = iter->second->GetLocalView();
-    auto resource_id = scheduling::ResourceID(resource_name).ToInt();
-    ASSERT_NE(resource_id, -1);
-    const ResourceCapacity *resource_capacity = nullptr;
-    if (resource_id >= 0 && resource_id < PredefinedResources_MAX) {
-      resource_capacity = &node_resources.predefined_resources[resource_id];
-    } else {
-      auto iter = node_resources.custom_resources.find(resource_id);
-      if (iter != node_resources.custom_resources.end()) {
-        resource_capacity = &iter->second;
-      }
-    }
-    ASSERT_TRUE(resource_capacity != nullptr);
-    ASSERT_EQ(resource_capacity->available.Double(), resource_value);
+    ASSERT_EQ(iter->second->GetAvailableResources().GetResource(resource_name).Double(),
+              resource_value);
   }
 
   void TestResourceLeaks(const gcs::SchedulingType &scheduling_type) {
@@ -77,12 +64,11 @@ class GcsResourceSchedulerTest : public ::testing::Test {
     AddClusterResources(node_id, cpu_resource, node_cpu_num);
 
     // Scheduling succeeded and node resources are used up.
-    std::vector<ResourceRequest> required_resources_list;
+    std::vector<ResourceSet> required_resources_list;
     absl::flat_hash_map<std::string, double> resource_map;
     for (int bundle_cpu_num = 1; bundle_cpu_num <= 3; ++bundle_cpu_num) {
       resource_map[cpu_resource] = bundle_cpu_num;
-      required_resources_list.emplace_back(ResourceMapToResourceRequest(
-          resource_map, /*requires_object_store_memory=*/false));
+      required_resources_list.emplace_back(resource_map);
     }
     const auto &result1 =
         gcs_resource_scheduler_->Schedule(required_resources_list, scheduling_type);
@@ -94,9 +80,7 @@ class GcsResourceSchedulerTest : public ::testing::Test {
     // Scheduling failure.
     resource_map[cpu_resource] = 5;
-    required_resources_list.emplace_back(
-        ResourceMapToResourceRequest(resource_map,
-                                     /*requires_object_store_memory=*/false));
+    required_resources_list.emplace_back(resource_map);
     const auto &result2 =
         gcs_resource_scheduler_->Schedule(required_resources_list, scheduling_type);
     ASSERT_TRUE(result2.first == gcs::SchedulingResultStatus::FAILED);
@@ -129,11 +113,10 @@ TEST_F(GcsResourceSchedulerTest, TestNodeFilter) {
   AddClusterResources(node_id, cpu_resource, node_cpu_num);
 
   // Scheduling failure.
-  std::vector<ResourceRequest> required_resources_list;
+  std::vector<ResourceSet> required_resources_list;
   absl::flat_hash_map<std::string, double> resource_map;
   resource_map[cpu_resource] = 1;
-  required_resources_list.emplace_back(
-      ResourceMapToResourceRequest(resource_map, /*requires_object_store_memory=*/false));
+  required_resources_list.emplace_back(resource_map);
   const auto &result1 = gcs_resource_scheduler_->Schedule(
       required_resources_list, gcs::SchedulingType::STRICT_SPREAD,
       [](const NodeID &) { return false; });
@@ -158,12 +141,11 @@ TEST_F(GcsResourceSchedulerTest, TestSchedulingResultStatusForStrictStrategy) {
   AddClusterResources(node_tow_id, cpu_resource, node_cpu_num);
 
   // Mock a request that has three required resources.
-  std::vector<ResourceRequest> required_resources_list;
+  std::vector<ResourceSet> required_resources_list;
   absl::flat_hash_map<std::string, double> resource_map;
   resource_map[cpu_resource] = 1;
   for (int node_number = 0; node_number < 3; node_number++) {
-    required_resources_list.emplace_back(ResourceMapToResourceRequest(
-        resource_map, /*requires_object_store_memory=*/false));
+    required_resources_list.emplace_back(resource_map);
   }
 
   const auto &result1 = gcs_resource_scheduler_->Schedule(
@@ -180,8 +162,7 @@ TEST_F(GcsResourceSchedulerTest, TestSchedulingResultStatusForStrictStrategy) {
   required_resources_list.clear();
   resource_map.clear();
   resource_map[cpu_resource] = 50;
-  required_resources_list.emplace_back(
-      ResourceMapToResourceRequest(resource_map, /*requires_object_store_memory=*/false));
+  required_resources_list.emplace_back(resource_map);
   const auto &result2 = gcs_resource_scheduler_->Schedule(
       required_resources_list, gcs::SchedulingType::STRICT_PACK);
@@ -64,7 +64,7 @@ bool NewPlacementGroupResourceManager::PrepareBundle(
   auto resource_instances = std::make_shared<TaskResourceInstances>();
   bool allocated =
       cluster_resource_scheduler_->GetLocalResourceManager().AllocateLocalTaskResources(
-          bundle_spec.GetRequiredResources(), resource_instances);
+          bundle_spec.GetRequiredResources().GetResourceMap(), resource_instances);
 
   if (!allocated) {
     return false;
@@ -294,7 +294,7 @@ bool NodeResources::IsFeasible(const ResourceRequest &resource_request) const {
   return true;
 }
 
-bool NodeResources::operator==(const NodeResources &other) const {
+bool NodeResources::operator==(const NodeResources &other) {
   for (size_t i = 0; i < PredefinedResources_MAX; i++) {
     if (this->predefined_resources[i].total != other.predefined_resources[i].total) {
       return false;
@@ -325,9 +325,7 @@ bool NodeResources::operator==(const NodeResources &other) {
   return true;
 }
 
-bool NodeResources::operator!=(const NodeResources &other) const {
-  return !(*this == other);
-}
+bool NodeResources::operator!=(const NodeResources &other) { return !(*this == other); }
 
 std::string NodeResources::DebugString() const {
   std::stringstream buffer;
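(Illustrative aside, not part of the diff.) A C++ detail visible in this hunk: the revert drops the const qualifiers from operator== and operator!=, so they can no longer be invoked through a const NodeResources reference. A small standalone example of why the const-qualified form (the pre-revert signature) is the more permissive one:

    #include <iostream>

    struct Point {
      int x = 0;
      // Const-qualified comparison: callable on const objects and references.
      bool operator==(const Point &other) const { return x == other.x; }
      bool operator!=(const Point &other) const { return !(*this == other); }
    };

    bool SamePlace(const Point &a, const Point &b) {
      // Without `const` on operator==, this call would fail to compile,
      // because `a` is a const reference.
      return a == b;
    }

    int main() {
      Point p{1}, q{1};
      std::cout << SamePlace(p, q) << "\n";  // 1
    }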
@@ -165,8 +165,8 @@ class NodeResources {
   /// Note: This doesn't account for the binpacking of unit resources.
   bool IsFeasible(const ResourceRequest &resource_request) const;
   /// Returns if this equals another node resources.
-  bool operator==(const NodeResources &other) const;
-  bool operator!=(const NodeResources &other) const;
+  bool operator==(const NodeResources &other);
+  bool operator!=(const NodeResources &other);
   /// Returns human-readable string for these resources.
   std::string DebugString() const;
   /// Returns compact dict-like string.