Mirror of https://github.com/vale981/ray, synced 2025-03-06 10:31:39 -05:00

This reverts commit 549466a42f.

This commit is contained in:
  parent 15d97a1021
  commit 79e8405fda

17 changed files with 228 additions and 503 deletions

@@ -674,7 +674,6 @@ def test_task_arguments_inline_bytes_limit(ray_start_cluster):

 # This case tests whether gcs-based actor scheduler works properly with
 # a normal task co-existed.
-@pytest.mark.skip(reason="The resource update of normal task has been broken.")
 def test_schedule_actor_and_normal_task(ray_start_cluster):
     cluster = ray_start_cluster
     cluster.add_node(
@@ -721,7 +720,6 @@ def test_schedule_actor_and_normal_task(ray_start_cluster):

 # This case tests whether gcs-based actor scheduler works properly
 # in a large scale.
-@pytest.mark.skip(reason="The resource update of normal task has been broken.")
 def test_schedule_many_actors_and_normal_tasks(ray_start_cluster):
     cluster = ray_start_cluster

@@ -765,7 +763,6 @@ def test_schedule_many_actors_and_normal_tasks(ray_start_cluster):
 # This case tests whether gcs-based actor scheduler distributes actors
 # in a balanced way. By default, it uses the `SPREAD` strategy of
 # gcs resource scheduler.
-@pytest.mark.skip(reason="The resource update of normal task has been broken.")
 @pytest.mark.parametrize("args", [[5, 20], [5, 3]])
 def test_actor_distribution_balance(ray_start_cluster, args):
     cluster = ray_start_cluster
@@ -806,7 +803,6 @@ def test_actor_distribution_balance(ray_start_cluster, args):

 # This case tests whether RequestWorkerLeaseReply carries normal task resources
 # when the request is rejected (due to resource preemption by normal tasks).
-@pytest.mark.skip(reason="The resource update of normal task has been broken.")
 def test_worker_lease_reply_with_resources(ray_start_cluster):
     cluster = ray_start_cluster
     cluster.add_node(

@@ -21,12 +21,9 @@ void BundleSpecification::ComputeResources() {

   if (unit_resource.empty()) {
-    // A static nil object is used here to avoid allocating the empty object every time.
-    static std::shared_ptr<ResourceRequest> nil_unit_resource =
-        std::make_shared<ResourceRequest>();
-    unit_resource_ = nil_unit_resource;
+    unit_resource_ = ResourceSet::Nil();
   } else {
-    unit_resource_ = std::make_shared<ResourceRequest>(ResourceMapToResourceRequest(
-        unit_resource, /*requires_object_store_memory=*/false));
+    unit_resource_.reset(new ResourceSet(unit_resource));
   }

   // Generate placement group bundle labels.
@@ -36,40 +33,18 @@ void BundleSpecification::ComputeResources() {
 void BundleSpecification::ComputeBundleResourceLabels() {
   RAY_CHECK(unit_resource_);

-  for (size_t i = 0; i < unit_resource_->predefined_resources.size(); ++i) {
-    auto resource_name = scheduling::ResourceID(i).Binary();
-    const auto &resource_value = unit_resource_->predefined_resources[i];
-    if (resource_value <= 0.) {
-      continue;
-    }
+  for (const auto &resource_pair : unit_resource_->GetResourceMap()) {
+    double resource_value = resource_pair.second;

     /// With bundle index (e.g., CPU_group_i_zzz).
     const std::string &resource_label =
-        FormatPlacementGroupResource(resource_name, PlacementGroupId(), Index());
-    bundle_resource_labels_[resource_label] = resource_value.Double();
+        FormatPlacementGroupResource(resource_pair.first, PlacementGroupId(), Index());
+    bundle_resource_labels_[resource_label] = resource_value;

     /// Without bundle index (e.g., CPU_group_zzz).
     const std::string &wildcard_label =
-        FormatPlacementGroupResource(resource_name, PlacementGroupId(), -1);
-    bundle_resource_labels_[wildcard_label] = resource_value.Double();
-  }
-
-  for (const auto &resource_pair : unit_resource_->custom_resources) {
-    auto resource_name = scheduling::ResourceID(resource_pair.first).Binary();
-    const auto &resource_value = resource_pair.second;
-    if (resource_value <= 0.) {
-      continue;
-    }
-
-    /// With bundle index (e.g., CPU_group_i_zzz).
-    const std::string &resource_label =
-        FormatPlacementGroupResource(resource_name, PlacementGroupId(), Index());
-    bundle_resource_labels_[resource_label] = resource_value.Double();
-
-    /// Without bundle index (e.g., CPU_group_zzz).
-    const std::string &wildcard_label =
-        FormatPlacementGroupResource(resource_name, PlacementGroupId(), -1);
-    bundle_resource_labels_[wildcard_label] = resource_value.Double();
+        FormatPlacementGroupResource(resource_pair.first, PlacementGroupId(), -1);
+    bundle_resource_labels_[wildcard_label] = resource_value;
   }
   auto bundle_label =
       FormatPlacementGroupResource(kBundle_ResourceLabel, PlacementGroupId(), -1);
@@ -79,7 +54,7 @@ void BundleSpecification::ComputeBundleResourceLabels() {
       1000;
 }

-const ResourceRequest &BundleSpecification::GetRequiredResources() const {
+const ResourceSet &BundleSpecification::GetRequiredResources() const {
   return *unit_resource_;
 }

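Reviewer note: the label fan-out in the hunk above is easiest to see with concrete values. The sketch below is a simplified stand-in, not Ray's actual FormatPlacementGroupResource helper; the "name_group_index_pgid" shape, the "zzz" id, and the FormatLabel name are illustrative assumptions.

    // Simplified sketch of the bundle-label fan-out described above: each
    // bundle entry yields one indexed label and one wildcard label.
    #include <iostream>
    #include <map>
    #include <string>

    std::string FormatLabel(const std::string &name, const std::string &pg_id, int index) {
      // index == -1 stands for the wildcard label without a bundle index.
      if (index < 0) {
        return name + "_group_" + pg_id;
      }
      return name + "_group_" + std::to_string(index) + "_" + pg_id;
    }

    int main() {
      // One bundle of a placement group "zzz" at bundle index 0.
      std::map<std::string, double> unit_resource = {{"CPU", 4.0}, {"GPU", 1.0}};
      std::map<std::string, double> bundle_resource_labels;
      for (const auto &pair : unit_resource) {
        // With bundle index (e.g., CPU_group_0_zzz) and without (CPU_group_zzz).
        bundle_resource_labels[FormatLabel(pair.first, "zzz", 0)] = pair.second;
        bundle_resource_labels[FormatLabel(pair.first, "zzz", -1)] = pair.second;
      }
      for (const auto &pair : bundle_resource_labels) {
        std::cout << pair.second << " x " << pair.first << "\n";
      }
    }
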
@@ -65,7 +65,7 @@ class BundleSpecification : public MessageWrapper<rpc::Bundle> {
   /// Return the resources that are to be acquired by this bundle.
   ///
   /// \return The resources that will be acquired by this bundle.
-  const ResourceRequest &GetRequiredResources() const;
+  const ResourceSet &GetRequiredResources() const;

   /// Get all placement group bundle resource labels.
   const absl::flat_hash_map<std::string, double> &GetFormattedResources() const {
@@ -81,7 +81,7 @@ class BundleSpecification : public MessageWrapper<rpc::Bundle> {
   /// Field storing unit resources. Initialized in constructor.
   /// TODO(ekl) consider optimizing the representation of ResourceSet for fast copies
   /// instead of keeping shared pointers here.
-  std::shared_ptr<ResourceRequest> unit_resource_;
+  std::shared_ptr<ResourceSet> unit_resource_;

   /// When a bundle is assigned on a node, we'll add the following special resources on
   /// that node:

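Reviewer note: the TODO(ekl) kept in this hunk is about the trade-off the field encodes: a shared_ptr member makes copying a BundleSpecification cheap (a pointer copy) at the price of indirection and shared ownership. A tiny sketch of that trade-off, with a std::map standing in for ResourceSet (the Bundle type here is illustrative, not Ray's class):

    // Copying the wrapper shares the resource set instead of deep-copying it.
    #include <iostream>
    #include <map>
    #include <memory>
    #include <string>

    struct Bundle {
      std::shared_ptr<std::map<std::string, double>> unit_resource;
    };

    int main() {
      Bundle a{std::make_shared<std::map<std::string, double>>(
          std::map<std::string, double>{{"CPU", 4.0}})};
      Bundle b = a;  // cheap: copies the pointer, not the map
      std::cout << a.unit_resource.use_count() << "\n";  // 2 owners, one map
    }
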
@@ -20,13 +20,14 @@ namespace ray {

 namespace gcs {

-GcsActorWorkerAssignment::GcsActorWorkerAssignment(
-    const NodeID &node_id, const ResourceRequest &acquired_resources, bool is_shared)
+GcsActorWorkerAssignment::GcsActorWorkerAssignment(const NodeID &node_id,
+                                                   const ResourceSet &acquired_resources,
+                                                   bool is_shared)
     : node_id_(node_id), acquired_resources_(acquired_resources), is_shared_(is_shared) {}

 const NodeID &GcsActorWorkerAssignment::GetNodeID() const { return node_id_; }

-const ResourceRequest &GcsActorWorkerAssignment::GetResources() const {
+const ResourceSet &GcsActorWorkerAssignment::GetResources() const {
   return acquired_resources_;
 }

@@ -66,9 +67,7 @@ std::unique_ptr<GcsActorWorkerAssignment>
 GcsBasedActorScheduler::SelectOrAllocateActorWorkerAssignment(
     std::shared_ptr<GcsActor> actor, bool need_sole_actor_worker_assignment) {
   const auto &task_spec = actor->GetCreationTaskSpecification();
-  auto required_resources = ResourceMapToResourceRequest(
-      task_spec.GetRequiredPlacementResources().GetResourceMap(),
-      /*requires_object_store_memory=*/false);
+  auto required_resources = task_spec.GetRequiredPlacementResources();

   // If the task needs a sole actor worker assignment then allocate a new one.
   return AllocateNewActorWorkerAssignment(required_resources, /*is_shared=*/false,
@@ -79,7 +78,7 @@ GcsBasedActorScheduler::SelectOrAllocateActorWorkerAssignment(

 std::unique_ptr<GcsActorWorkerAssignment>
 GcsBasedActorScheduler::AllocateNewActorWorkerAssignment(
-    const ResourceRequest &required_resources, bool is_shared,
+    const ResourceSet &required_resources, bool is_shared,
     const TaskSpecification &task_spec) {
   // Allocate resources from cluster.
   auto selected_node_id = AllocateResources(required_resources);
@@ -95,8 +94,7 @@ GcsBasedActorScheduler::AllocateNewActorWorkerAssignment(
   return gcs_actor_worker_assignment;
 }

-NodeID GcsBasedActorScheduler::AllocateResources(
-    const ResourceRequest &required_resources) {
+NodeID GcsBasedActorScheduler::AllocateResources(const ResourceSet &required_resources) {
   auto selected_nodes =
       gcs_resource_scheduler_->Schedule({required_resources}, SchedulingType::SPREAD)
           .second;
@@ -120,7 +118,7 @@ NodeID GcsBasedActorScheduler::AllocateResources(
 }

 NodeID GcsBasedActorScheduler::GetHighestScoreNodeResource(
-    const ResourceRequest &required_resources) const {
+    const ResourceSet &required_resources) const {
   const auto &cluster_map = gcs_resource_manager_->GetClusterResources();

   /// Get the highest score node
@@ -129,8 +127,7 @@ NodeID GcsBasedActorScheduler::GetHighestScoreNodeResource(
   double highest_score = std::numeric_limits<double>::lowest();
   auto highest_score_node = NodeID::Nil();
   for (const auto &pair : cluster_map) {
-    double least_resource_val =
-        scorer.Score(required_resources, pair.second->GetLocalView());
+    double least_resource_val = scorer.Score(required_resources, *pair.second);
     if (least_resource_val > highest_score) {
       highest_score = least_resource_val;
       highest_score_node = pair.first;
@@ -141,12 +138,12 @@ NodeID GcsBasedActorScheduler::GetHighestScoreNodeResource(
 }

 void GcsBasedActorScheduler::WarnResourceAllocationFailure(
-    const TaskSpecification &task_spec, const ResourceRequest &required_resources) const {
+    const TaskSpecification &task_spec, const ResourceSet &required_resources) const {
   auto scheduling_node_id = GetHighestScoreNodeResource(required_resources);
-  const NodeResources *scheduling_resource = nullptr;
+  const SchedulingResources *scheduling_resource = nullptr;
   auto iter = gcs_resource_manager_->GetClusterResources().find(scheduling_node_id);
   if (iter != gcs_resource_manager_->GetClusterResources().end()) {
-    scheduling_resource = iter->second->GetMutableLocalView();
+    scheduling_resource = iter->second.get();
   }
   std::string scheduling_resource_str =
       scheduling_resource ? scheduling_resource->DebugString() : "None";
@@ -154,7 +151,7 @@ void GcsBasedActorScheduler::WarnResourceAllocationFailure(
   RAY_LOG(WARNING) << "No enough resources for creating actor "
                    << task_spec.ActorCreationId()
                    << "\nActor class: " << task_spec.FunctionDescriptor()->ToString()
-                   << "\nRequired resources: " << required_resources.DebugString()
+                   << "\nRequired resources: " << required_resources.ToString()
                    << "\nThe node with the most resources is:"
                    << "\n   Node id: " << scheduling_node_id
                    << "\n   Node resources: " << scheduling_resource_str;

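Reviewer note on the GetHighestScoreNodeResource hunk above: only the scorer's second argument changes; the selection loop itself is untouched. A minimal self-contained sketch of that loop, using plain std types instead of Ray's NodeID and SchedulingResources (all names here are illustrative):

    // "Pick the highest-scoring node" loop, with precomputed scores standing
    // in for calls to a LeastResourceScorer-style Score function.
    #include <iostream>
    #include <limits>
    #include <string>
    #include <unordered_map>

    int main() {
      // node name -> score; -1 marks a node that cannot fit the request.
      std::unordered_map<std::string, double> scores = {
          {"node-a", 0.25}, {"node-b", -1.0}, {"node-c", 0.75}};

      double highest_score = std::numeric_limits<double>::lowest();
      std::string highest_score_node;
      for (const auto &pair : scores) {
        if (pair.second > highest_score) {
          highest_score = pair.second;
          highest_score_node = pair.first;
        }
      }
      // node-c wins; note that an infeasible node (score -1) can still be
      // selected by this loop when every node is infeasible.
      std::cout << highest_score_node << " : " << highest_score << "\n";
    }
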
@@ -18,6 +18,7 @@

 #include "ray/common/id.h"
 #include "ray/common/status.h"
+#include "ray/common/task/scheduling_resources.h"
 #include "ray/common/task/task_spec.h"
 #include "ray/gcs/gcs_server/gcs_actor_manager.h"
 #include "ray/gcs/gcs_server/gcs_actor_scheduler.h"
@@ -41,12 +42,12 @@ class GcsActorWorkerAssignment
   /// \param node_id ID of node on which this gcs actor worker assignment is allocated.
   /// \param acquired_resources Resources owned by this gcs actor worker assignment.
   /// \param is_shared A flag to represent that whether the worker process can be shared.
-  GcsActorWorkerAssignment(const NodeID &node_id,
-                           const ResourceRequest &acquired_resources, bool is_shared);
+  GcsActorWorkerAssignment(const NodeID &node_id, const ResourceSet &acquired_resources,
+                           bool is_shared);

   const NodeID &GetNodeID() const;

-  const ResourceRequest &GetResources() const;
+  const ResourceSet &GetResources() const;

   bool IsShared() const;

@@ -54,7 +55,7 @@ class GcsActorWorkerAssignment
   /// ID of node on which this actor worker assignment is allocated.
   const NodeID node_id_;
   /// Resources owned by this actor worker assignment.
-  const ResourceRequest acquired_resources_;
+  const ResourceSet acquired_resources_;
   /// A flag to represent that whether the worker process can be shared.
   const bool is_shared_;
 };
@@ -130,19 +131,19 @@ class GcsBasedActorScheduler : public GcsActorScheduler {
   /// \param is_shared If the worker is shared by multiple actors or not.
   /// \param task_spec The specification of the task.
   std::unique_ptr<GcsActorWorkerAssignment> AllocateNewActorWorkerAssignment(
-      const ResourceRequest &required_resources, bool is_shared,
+      const ResourceSet &required_resources, bool is_shared,
       const TaskSpecification &task_spec);

   /// Allocate resources for the actor.
   ///
   /// \param required_resources The resources to be allocated.
   /// \return ID of the node from which the resources are allocated.
-  NodeID AllocateResources(const ResourceRequest &required_resources);
+  NodeID AllocateResources(const ResourceSet &required_resources);

-  NodeID GetHighestScoreNodeResource(const ResourceRequest &required_resources) const;
+  NodeID GetHighestScoreNodeResource(const ResourceSet &required_resources) const;

   void WarnResourceAllocationFailure(const TaskSpecification &task_spec,
-                                     const ResourceRequest &required_resources) const;
+                                     const ResourceSet &required_resources) const;

   /// A rejected rely means resources were preempted by normal tasks. Then
   /// update the the cluster resource view and reschedule immediately.

@@ -63,9 +63,9 @@ GcsPlacementGroupScheduler::GcsPlacementGroupScheduler(
   scheduler_strategies_.push_back(std::make_shared<GcsStrictSpreadStrategy>());
 }

-std::vector<ResourceRequest> GcsScheduleStrategy::GetRequiredResourcesFromBundles(
+std::vector<ResourceSet> GcsScheduleStrategy::GetRequiredResourcesFromBundles(
     const std::vector<std::shared_ptr<const ray::BundleSpecification>> &bundles) {
-  std::vector<ResourceRequest> required_resources;
+  std::vector<ResourceSet> required_resources;
   for (const auto &bundle : bundles) {
     required_resources.push_back(bundle->GetRequiredResources());
   }
@@ -128,7 +128,7 @@ class GcsScheduleStrategy {
   ///
   /// \param bundles Bundles to be scheduled.
   /// \return Required resources.
-  std::vector<ResourceRequest> GetRequiredResourcesFromBundles(
+  std::vector<ResourceSet> GetRequiredResourcesFromBundles(
       const std::vector<std::shared_ptr<const ray::BundleSpecification>> &bundles);

   /// Generate `ScheduleResult` from bundles and nodes .

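Reviewer note: GetRequiredResourcesFromBundles only collects one resource set per bundle; strategies such as STRICT_PACK later fold those sets into a single demand (see the AddResources loop in the scheduler hunks below). A minimal stand-in with std::map instead of Ray's ResourceSet; the names are illustrative:

    // Collect per-bundle demands, then fold them for a STRICT_PACK-style
    // placement onto a single node.
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    using ResourceMap = std::map<std::string, double>;

    int main() {
      // One entry per bundle, as GetRequiredResourcesFromBundles would build.
      std::vector<ResourceMap> required_resources = {
          {{"CPU", 2.0}}, {{"CPU", 1.0}, {"GPU", 1.0}}};

      // STRICT_PACK folds all bundles into a single aggregate demand.
      ResourceMap aggregate;
      for (const auto &bundle : required_resources) {
        for (const auto &pair : bundle) {
          aggregate[pair.first] += pair.second;
        }
      }
      for (const auto &pair : aggregate) {
        std::cout << pair.first << " = " << pair.second << "\n";  // CPU = 3, GPU = 1
      }
    }
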
@@ -36,23 +36,11 @@ void GcsResourceManager::HandleGetResources(const rpc::GetResourcesRequest &requ
   NodeID node_id = NodeID::FromBinary(request.node_id());
   auto iter = cluster_scheduling_resources_.find(node_id);
   if (iter != cluster_scheduling_resources_.end()) {
+    const auto &resource_map = iter->second->GetTotalResources().GetResourceMap();
     rpc::ResourceTableData resource_table_data;
-    const auto &node_resources = iter->second->GetLocalView();
-    for (size_t i = 0; i < node_resources.predefined_resources.size(); ++i) {
-      const auto &resource_value = node_resources.predefined_resources[i].total;
-      if (resource_value <= 0) {
-        continue;
-      }
-
-      const auto &resource_name = scheduling::ResourceID(i).Binary();
-      resource_table_data.set_resource_capacity(resource_value.Double());
-      (*reply->mutable_resources()).insert({resource_name, resource_table_data});
-    }
-    for (const auto &entry : node_resources.custom_resources) {
-      const auto &resource_name = scheduling::ResourceID(entry.first).Binary();
-      const auto &resource_value = entry.second.total;
-      resource_table_data.set_resource_capacity(resource_value.Double());
-      (*reply->mutable_resources()).insert({resource_name, resource_table_data});
+    for (const auto &resource : resource_map) {
+      resource_table_data.set_resource_capacity(resource.second);
+      (*reply->mutable_resources())[resource.first] = resource_table_data;
     }
   }
   GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK());
@@ -72,28 +60,18 @@ void GcsResourceManager::HandleUpdateResources(
   auto iter = cluster_scheduling_resources_.find(node_id);
   if (iter != cluster_scheduling_resources_.end()) {
     // Update `cluster_scheduling_resources_`.
-    auto node_resources = iter->second->GetMutableLocalView();
+    SchedulingResources &scheduling_resources = *iter->second;
     for (const auto &entry : *changed_resources) {
-      UpdateResourceCapacity(node_resources, entry.first, entry.second);
+      scheduling_resources.UpdateResourceCapacity(entry.first, entry.second);
     }

     // Update gcs storage.
     rpc::ResourceMap resource_map;
-    for (size_t i = 0; i < node_resources->predefined_resources.size(); ++i) {
-      const auto &resource_value = node_resources->predefined_resources[i].total;
-      if (resource_value <= 0) {
-        continue;
-      }
-
-      const auto &resource_name = scheduling::ResourceID(i).Binary();
-      (*resource_map.mutable_items())[resource_name].set_resource_capacity(
-          resource_value.Double());
+    for (const auto &entry : scheduling_resources.GetTotalResources().GetResourceMap()) {
+      (*resource_map.mutable_items())[entry.first].set_resource_capacity(entry.second);
     }
-    for (const auto &entry : node_resources->custom_resources) {
-      const auto &resource_name = scheduling::ResourceID(entry.first).Binary();
-      const auto &resource_value = entry.second.total;
-      (*resource_map.mutable_items())[resource_name].set_resource_capacity(
-          resource_value.Double());
+    for (const auto &entry : *changed_resources) {
+      (*resource_map.mutable_items())[entry.first].set_resource_capacity(entry.second);
     }

     auto start = absl::GetCurrentTimeNanos();
@@ -131,41 +109,19 @@ void GcsResourceManager::HandleDeleteResources(
   auto resource_names = VectorFromProtobuf(request.resource_name_list());
   auto iter = cluster_scheduling_resources_.find(node_id);
   if (iter != cluster_scheduling_resources_.end()) {
-    auto node_resources = iter->second->GetMutableLocalView();
     // Update `cluster_scheduling_resources_`.
-    DeleteResources(node_resources, resource_names);
+    for (const auto &resource_name : resource_names) {
+      iter->second->DeleteResource(resource_name);
+    }

     // Update gcs storage.
     rpc::ResourceMap resource_map;
-    for (size_t i = 0; i < node_resources->predefined_resources.size(); ++i) {
-      const auto &resource_name = scheduling::ResourceID(i).Binary();
-      if (std::find(resource_names.begin(), resource_names.end(), resource_name) !=
-          resource_names.end()) {
-        continue;
-      }
-
-      const auto &resource_value = node_resources->predefined_resources[i].total;
-      if (resource_value <= 0) {
-        continue;
-      }
-
-      (*resource_map.mutable_items())[resource_name].set_resource_capacity(
-          resource_value.Double());
-    }
-    for (const auto &entry : node_resources->custom_resources) {
-      const auto &resource_name = scheduling::ResourceID(entry.first).Binary();
-      if (std::find(resource_names.begin(), resource_names.end(), resource_name) !=
-          resource_names.end()) {
-        continue;
-      }
-
-      const auto &resource_value = entry.second.total;
-      if (resource_value <= 0) {
-        continue;
-      }
-
-      (*resource_map.mutable_items())[resource_name].set_resource_capacity(
-          resource_value.Double());
+    auto resources = iter->second->GetTotalResources().GetResourceMap();
+    for (const auto &resource_name : resource_names) {
+      resources.erase(resource_name);
+    }
+    for (const auto &entry : resources) {
+      (*resource_map.mutable_items())[entry.first].set_resource_capacity(entry.second);
     }

     auto on_done = [this, node_id, resource_names, reply,
@@ -193,31 +149,11 @@ void GcsResourceManager::HandleGetAllAvailableResources(
     const rpc::GetAllAvailableResourcesRequest &request,
     rpc::GetAllAvailableResourcesReply *reply,
     rpc::SendReplyCallback send_reply_callback) {
-  for (const auto &node_resources_entry : cluster_scheduling_resources_) {
-    const auto &node_id = node_resources_entry.first;
-    const auto &node_resources = node_resources_entry.second->GetLocalView();
+  for (const auto &iter : cluster_scheduling_resources_) {
     rpc::AvailableResources resource;
-    resource.set_node_id(node_id.Binary());
-
-    for (size_t i = 0; i < node_resources.predefined_resources.size(); ++i) {
-      const auto &resource_value = node_resources.predefined_resources[i].available;
-      if (resource_value <= 0) {
-        continue;
-      }
-
-      const auto &resource_name = scheduling::ResourceID(i).Binary();
-      resource.mutable_resources_available()->insert(
-          {resource_name, resource_value.Double()});
-    }
-    for (const auto &entry : node_resources.custom_resources) {
-      const auto &resource_value = entry.second.available;
-      if (resource_value <= 0) {
-        continue;
-      }
-
-      const auto &resource_name = scheduling::ResourceID(entry.first).Binary();
-      resource.mutable_resources_available()->insert(
-          {resource_name, resource_value.Double()});
+    resource.set_node_id(iter.first.Binary());
+    for (const auto &res : iter.second->GetAvailableResources().GetResourceAmountMap()) {
+      (*resource.mutable_resources_available())[res.first] = res.second.Double();
     }
     reply->add_resources_list()->CopyFrom(resource);
   }
@@ -231,7 +167,8 @@ void GcsResourceManager::UpdateFromResourceReport(const rpc::ResourcesData &data
     UpdateNodeNormalTaskResources(node_id, data);
   } else {
     if (node_resource_usages_.count(node_id) == 0 || data.resources_available_changed()) {
-      SetAvailableResources(node_id, MapFromProtobuf(data.resources_available()));
+      const auto &resource_changed = MapFromProtobuf(data.resources_available());
+      SetAvailableResources(node_id, ResourceSet(resource_changed));
     }
   }
@@ -258,13 +195,13 @@ void GcsResourceManager::HandleGetAllResourceUsage(
     rpc::SendReplyCallback send_reply_callback) {
   if (!node_resource_usages_.empty()) {
     auto batch = std::make_shared<rpc::ResourceUsageBatchData>();
-    std::unordered_map<google::protobuf::Map<std::string, double>, rpc::ResourceDemand>
-        aggregate_load;
+    absl::flat_hash_map<ResourceSet, rpc::ResourceDemand> aggregate_load;
     for (const auto &usage : node_resource_usages_) {
       // Aggregate the load reported by each raylet.
       auto load = usage.second.resource_load_by_shape();
       for (const auto &demand : load.resource_demands()) {
-        auto &aggregate_demand = aggregate_load[demand.shape()];
+        auto scheduling_key = ResourceSet(MapFromProtobuf(demand.shape()));
+        auto &aggregate_demand = aggregate_load[scheduling_key];
         aggregate_demand.set_num_ready_requests_queued(
             aggregate_demand.num_ready_requests_queued() +
             demand.num_ready_requests_queued());
@@ -281,7 +218,7 @@ void GcsResourceManager::HandleGetAllResourceUsage(
     for (const auto &demand : aggregate_load) {
       auto demand_proto = batch->mutable_resource_load_by_shape()->add_resource_demands();
       demand_proto->CopyFrom(demand.second);
-      for (const auto &resource_pair : demand.first) {
+      for (const auto &resource_pair : demand.first.GetResourceMap()) {
        (*demand_proto->mutable_shape())[resource_pair.first] = resource_pair.second;
       }
     }
@@ -338,39 +275,24 @@ void GcsResourceManager::Initialize(const GcsInitData &gcs_init_data) {
   for (const auto &entry : cluster_resources) {
     const auto &iter = cluster_scheduling_resources_.find(entry.first);
     if (iter != cluster_scheduling_resources_.end()) {
-      auto node_resources = iter->second->GetMutableLocalView();
       for (const auto &resource : entry.second.items()) {
-        UpdateResourceCapacity(node_resources, resource.first,
-                               resource.second.resource_capacity());
+        iter->second->UpdateResourceCapacity(resource.first,
+                                             resource.second.resource_capacity());
       }
     }
   }
 }

-const absl::flat_hash_map<NodeID, std::shared_ptr<Node>>
+const absl::flat_hash_map<NodeID, std::shared_ptr<SchedulingResources>>
     &GcsResourceManager::GetClusterResources() const {
   return cluster_scheduling_resources_;
 }

-void GcsResourceManager::SetAvailableResources(
-    const NodeID &node_id, const absl::flat_hash_map<std::string, double> &resource_map) {
+void GcsResourceManager::SetAvailableResources(const NodeID &node_id,
+                                               const ResourceSet &resources) {
   auto iter = cluster_scheduling_resources_.find(node_id);
   if (iter != cluster_scheduling_resources_.end()) {
-    auto resources = ResourceMapToResourceRequest(resource_map,
-                                                  /*requires_object_store_memory=*/false);
-    auto node_resources = iter->second->GetMutableLocalView();
-    for (size_t i = 0; i < node_resources->predefined_resources.size(); ++i) {
-      node_resources->predefined_resources[i].available =
-          resources.predefined_resources[i];
-    }
-    for (auto &entry : node_resources->custom_resources) {
-      auto it = resources.custom_resources.find(entry.first);
-      if (it != resources.custom_resources.end()) {
-        entry.second.available = it->second;
-      } else {
-        entry.second.available = 0.;
-      }
-    }
+    iter->second->SetAvailableResources(ResourceSet(resources));
   } else {
     RAY_LOG(WARNING)
         << "Skip the setting of available resources of node " << node_id
@@ -378,19 +300,12 @@ void GcsResourceManager::SetAvailableResources(
   }
 }

-void GcsResourceManager::DeleteResources(NodeResources *node_resources,
-                                         const std::vector<std::string> &resource_names) {
-  for (const auto &resource_name : resource_names) {
-    auto resource_id = scheduling::ResourceID(resource_name).ToInt();
-    if (resource_id == -1) {
-      continue;
-    }
-
-    if (resource_id >= 0 && resource_id < PredefinedResources_MAX) {
-      node_resources->predefined_resources[resource_id].total = 0;
-      node_resources->predefined_resources[resource_id].available = 0;
-    } else {
-      node_resources->custom_resources.erase(resource_id);
+void GcsResourceManager::DeleteResources(
+    const NodeID &node_id, const std::vector<std::string> &deleted_resources) {
+  auto iter = cluster_scheduling_resources_.find(node_id);
+  if (iter != cluster_scheduling_resources_.end()) {
+    for (const auto &resource_name : deleted_resources) {
+      iter->second->DeleteResource(resource_name);
     }
   }
 }
@@ -401,9 +316,9 @@ void GcsResourceManager::OnNodeAdd(const rpc::GcsNodeInfo &node) {
     absl::flat_hash_map<std::string, double> resource_mapping(
         node.resources_total().begin(), node.resources_total().end());
     // Update the cluster scheduling resources as new node is added.
+    ResourceSet node_resources(resource_mapping);
     cluster_scheduling_resources_.emplace(
-        node_id, std::make_shared<Node>(
-                     ResourceMapToNodeResources(resource_mapping, resource_mapping)));
+        node_id, std::make_shared<SchedulingResources>(node_resources));
   }
 }

@@ -414,21 +329,13 @@ void GcsResourceManager::OnNodeDead(const NodeID &node_id) {
 }

 bool GcsResourceManager::AcquireResources(const NodeID &node_id,
-                                          const ResourceRequest &required_resources) {
+                                          const ResourceSet &required_resources) {
   auto iter = cluster_scheduling_resources_.find(node_id);
   if (iter != cluster_scheduling_resources_.end()) {
-    auto node_resources = iter->second->GetMutableLocalView();
-    if (!node_resources->IsAvailable(required_resources)) {
+    if (!required_resources.IsSubset(iter->second->GetAvailableResources())) {
      return false;
     }
-
-    for (size_t i = 0; i < required_resources.predefined_resources.size(); ++i) {
-      node_resources->predefined_resources[i].available -=
-          required_resources.predefined_resources[i];
-    }
-    for (auto &entry : required_resources.custom_resources) {
-      node_resources->custom_resources[entry.first].available -= entry.second;
-    }
+    iter->second->Acquire(required_resources);
   }
   // If node dead, we will not find the node. This is a normal scenario, so it returns
   // true.
@@ -436,27 +343,10 @@ bool GcsResourceManager::AcquireResources(const NodeID &node_id,
 }

 bool GcsResourceManager::ReleaseResources(const NodeID &node_id,
-                                          const ResourceRequest &acquired_resources) {
+                                          const ResourceSet &acquired_resources) {
   auto iter = cluster_scheduling_resources_.find(node_id);
   if (iter != cluster_scheduling_resources_.end()) {
-    auto node_resources = iter->second->GetMutableLocalView();
-    RAY_CHECK(acquired_resources.predefined_resources.size() <=
-              node_resources->predefined_resources.size());
-
-    for (size_t i = 0; i < acquired_resources.predefined_resources.size(); ++i) {
-      node_resources->predefined_resources[i].available +=
-          acquired_resources.predefined_resources[i];
-      node_resources->predefined_resources[i].available =
-          std::min(node_resources->predefined_resources[i].available,
-                   node_resources->predefined_resources[i].total);
-    }
-    for (auto &entry : acquired_resources.custom_resources) {
-      auto it = node_resources->custom_resources.find(entry.first);
-      if (it != node_resources->custom_resources.end()) {
-        it->second.available += entry.second;
-        it->second.available = std::min(it->second.available, it->second.total);
-      }
-    }
+    iter->second->Release(acquired_resources);
   }
   // If node dead, we will not find the node. This is a normal scenario, so it returns
   // true.
@@ -493,9 +383,24 @@ void GcsResourceManager::AddResourcesChangedListener(std::function<void()> liste

 void GcsResourceManager::UpdateNodeNormalTaskResources(
     const NodeID &node_id, const rpc::ResourcesData &heartbeat) {
-  // TODO(Shanly): To be implemented.
-  // This method is breaked by the refactoring of new resource structure, just remove the
-  // implementation for the time being.
+  auto iter = cluster_scheduling_resources_.find(node_id);
+  if (iter == cluster_scheduling_resources_.end()) {
+    return;
+  }
+
+  auto &scheduling_resoruces = iter->second;
+  ResourceSet resources_normal_task(MapFromProtobuf(heartbeat.resources_normal_task()));
+  if (heartbeat.resources_normal_task_changed() &&
+      heartbeat.resources_normal_task_timestamp() >
+          latest_resources_normal_task_timestamp_[node_id] &&
+      !resources_normal_task.IsEqual(scheduling_resoruces->GetNormalTaskResources())) {
+    scheduling_resoruces->SetNormalTaskResources(resources_normal_task);
+    latest_resources_normal_task_timestamp_[node_id] =
+        heartbeat.resources_normal_task_timestamp();
+    for (const auto &listener : resources_changed_listeners_) {
+      listener();
+    }
+  }
 }

 std::string GcsResourceManager::ToString() const {
@@ -505,52 +410,11 @@ std::string GcsResourceManager::ToString() const {
   std::string indent_1(indent + 1 * 2, ' ');
   ostr << "{\n";
   for (const auto &entry : cluster_scheduling_resources_) {
-    ostr << indent_1 << entry.first << " : " << entry.second->GetLocalView().DebugString()
-         << ",\n";
+    ostr << indent_1 << entry.first << " : " << entry.second->DebugString() << ",\n";
   }
   ostr << indent_0 << "}\n";
   return ostr.str();
 }

-void GcsResourceManager::UpdateResourceCapacity(NodeResources *node_resources,
-                                                const std::string &resource_name,
-                                                double capacity) {
-  auto idx = scheduling::ResourceID(resource_name).ToInt();
-  if (idx == -1) {
-    return;
-  }
-
-  FixedPoint resource_total_fp(capacity);
-  if (idx >= 0 && idx < PredefinedResources_MAX) {
-    auto diff_capacity =
-        resource_total_fp - node_resources->predefined_resources[idx].total;
-    node_resources->predefined_resources[idx].total += diff_capacity;
-    node_resources->predefined_resources[idx].available += diff_capacity;
-    if (node_resources->predefined_resources[idx].available < 0) {
-      node_resources->predefined_resources[idx].available = 0;
-    }
-    if (node_resources->predefined_resources[idx].total < 0) {
-      node_resources->predefined_resources[idx].total = 0;
-    }
-  } else {
-    auto itr = node_resources->custom_resources.find(idx);
-    if (itr != node_resources->custom_resources.end()) {
-      auto diff_capacity = resource_total_fp - itr->second.total;
-      itr->second.total += diff_capacity;
-      itr->second.available += diff_capacity;
-      if (itr->second.available < 0) {
-        itr->second.available = 0;
-      }
-      if (itr->second.total < 0) {
-        itr->second.total = 0;
-      }
-    } else {
-      ResourceCapacity resource_capacity;
-      resource_capacity.total = resource_capacity.available = resource_total_fp;
-      node_resources->custom_resources.emplace(idx, resource_capacity);
-    }
-  }
-}
-
 } // namespace gcs
 } // namespace ray

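Reviewer note on the AcquireResources/ReleaseResources hunks above: the revert moves the bookkeeping from hand-rolled loops over predefined and custom resources back into SchedulingResources' Acquire/Release. The clamping detail in the removed Release loop (available never exceeds total) is easy to miss; here is a tiny sketch with plain doubles standing in for Ray's FixedPoint (the Capacity type is illustrative):

    // Acquire deducts from availability; Release gives it back, clamped at
    // the configured total so a double release cannot exceed capacity.
    #include <algorithm>
    #include <iostream>

    struct Capacity {
      double total;
      double available;
    };

    int main() {
      Capacity cpu{8.0, 8.0};

      // Acquire: deduct from what is available.
      double requested = 3.0;
      if (requested <= cpu.available) {
        cpu.available -= requested;
      }

      // Release: give the amount back, but clamp at total.
      cpu.available = std::min(cpu.available + requested, cpu.total);
      std::cout << cpu.available << "/" << cpu.total << "\n";  // 8/8
    }
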
@@ -18,10 +18,11 @@

 #include "ray/common/asio/instrumented_io_context.h"
 #include "ray/common/asio/periodical_runner.h"
 #include "ray/common/id.h"
+#include "ray/common/task/scheduling_resources.h"
 #include "ray/gcs/gcs_server/gcs_init_data.h"
 #include "ray/gcs/gcs_server/gcs_resource_manager.h"
 #include "ray/gcs/gcs_server/gcs_table_storage.h"
 #include "ray/gcs/pubsub/gcs_pub_sub.h"
-#include "ray/raylet/scheduling/cluster_resource_data.h"
 #include "ray/rpc/client_call.h"
 #include "ray/rpc/gcs_server/gcs_rpc_server.h"
 #include "src/ray/protobuf/gcs.pb.h"
@@ -84,7 +85,8 @@ class GcsResourceManager : public rpc::NodeResourceInfoHandler {
   /// Get the resources of all nodes in the cluster.
   ///
   /// \return The resources of all nodes in the cluster.
-  const absl::flat_hash_map<NodeID, std::shared_ptr<Node>> &GetClusterResources() const;
+  const absl::flat_hash_map<NodeID, std::shared_ptr<SchedulingResources>>
+      &GetClusterResources() const;

   /// Handle a node registration.
   ///
@@ -100,9 +102,7 @@ class GcsResourceManager : public rpc::NodeResourceInfoHandler {
   ///
   /// \param node_id Id of a node.
   /// \param resources Available resources of a node.
-  void SetAvailableResources(
-      const NodeID &node_id,
-      const absl::flat_hash_map<std::string, double> &resource_map);
+  void SetAvailableResources(const NodeID &node_id, const ResourceSet &resources);

   /// Acquire resources from the specified node. It will deduct directly from the node
   /// resources.
@@ -110,7 +110,7 @@ class GcsResourceManager : public rpc::NodeResourceInfoHandler {
   /// \param node_id Id of a node.
   /// \param required_resources Resources to apply for.
   /// \return True if acquire resources successfully. False otherwise.
-  bool AcquireResources(const NodeID &node_id, const ResourceRequest &required_resources);
+  bool AcquireResources(const NodeID &node_id, const ResourceSet &required_resources);

   /// Release the resources of the specified node. It will be added directly to the node
   /// resources.
@@ -118,7 +118,7 @@ class GcsResourceManager : public rpc::NodeResourceInfoHandler {
   /// \param node_id Id of a node.
   /// \param acquired_resources Resources to release.
   /// \return True if release resources successfully. False otherwise.
-  bool ReleaseResources(const NodeID &node_id, const ResourceRequest &acquired_resources);
+  bool ReleaseResources(const NodeID &node_id, const ResourceSet &acquired_resources);

   /// Initialize with the gcs tables data synchronously.
   /// This should be called when GCS server restarts after a failure.
@@ -158,13 +158,10 @@ class GcsResourceManager : public rpc::NodeResourceInfoHandler {
  private:
   /// Delete the scheduling resources of the specified node.
   ///
-  /// \param node_resources Id of a node.
-  /// \param resource_names Deleted resources of a node.
-  void DeleteResources(NodeResources *node_resources,
-                       const std::vector<std::string> &resource_names);
-
-  void UpdateResourceCapacity(NodeResources *node_resources,
-                              const std::string &resource_name, double capacity);
+  /// \param node_id Id of a node.
+  /// \param deleted_resources Deleted resources of a node.
+  void DeleteResources(const NodeID &node_id,
+                       const std::vector<std::string> &deleted_resources);

   /// The runner to run function periodically.
   PeriodicalRunner periodical_runner_;
@@ -176,7 +173,8 @@ class GcsResourceManager : public rpc::NodeResourceInfoHandler {
   /// Storage for GCS tables.
   std::shared_ptr<gcs::GcsTableStorage> gcs_table_storage_;
   /// Map from node id to the scheduling resources of the node.
-  absl::flat_hash_map<NodeID, std::shared_ptr<Node>> cluster_scheduling_resources_;
+  absl::flat_hash_map<NodeID, std::shared_ptr<SchedulingResources>>
+      cluster_scheduling_resources_;
   /// Placement group load information that is used for autoscaler.
   absl::optional<std::shared_ptr<rpc::PlacementGroupLoad>> placement_group_load_;
   /// Normal task resources could be uploaded by 1) Raylets' periodical reporters; 2)
@@ -206,34 +204,3 @@ class GcsResourceManager : public rpc::NodeResourceInfoHandler {

 } // namespace gcs
 } // namespace ray
-
-namespace std {
-template <>
-struct hash<google::protobuf::Map<std::string, double>> {
-  size_t operator()(google::protobuf::Map<std::string, double> const &k) const {
-    size_t seed = k.size();
-    for (auto &elem : k) {
-      seed ^= std::hash<std::string>()(elem.first);
-      seed ^= std::hash<double>()(elem.second);
-    }
-    return seed;
-  }
-};
-
-template <>
-struct equal_to<google::protobuf::Map<std::string, double>> {
-  bool operator()(const google::protobuf::Map<std::string, double> &left,
-                  const google::protobuf::Map<std::string, double> &right) const {
-    if (left.size() != right.size()) {
-      return false;
-    }
-    for (const auto &entry : left) {
-      auto iter = right.find(entry.first);
-      if (iter == right.end() || iter->second != entry.second) {
-        return false;
-      }
-    }
-    return true;
-  }
-};
-} // namespace std

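Reviewer note: the std::hash and std::equal_to specializations deleted in the last hunk existed only so aggregate_load could key an unordered map on a protobuf map; after the revert the key is a ResourceSet, which already supports hashing. A compilable illustration of the same idea for a plain std::map key (the Shape/ShapeHash names are illustrative; note the XOR combine is order-insensitive, which is what makes it usable for an unordered container):

    // Make a map usable as an unordered_map key by XOR-combining element
    // hashes; XOR is commutative, so iteration order does not matter.
    #include <functional>
    #include <iostream>
    #include <map>
    #include <string>
    #include <unordered_map>

    using Shape = std::map<std::string, double>;

    struct ShapeHash {
      size_t operator()(const Shape &shape) const {
        size_t seed = shape.size();
        for (const auto &elem : shape) {
          seed ^= std::hash<std::string>()(elem.first);
          seed ^= std::hash<double>()(elem.second);
        }
        return seed;
      }
    };

    int main() {
      std::unordered_map<Shape, int, ShapeHash> aggregate_load;
      aggregate_load[Shape{{"CPU", 1.0}}] += 1;
      aggregate_load[Shape{{"CPU", 1.0}}] += 1;  // same shape aggregates into one entry
      std::cout << aggregate_load.size() << " shape(s), count "
                << aggregate_load.begin()->second << "\n";  // 1 shape(s), count 2
    }
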
@@ -17,46 +17,40 @@
 namespace ray {
 namespace gcs {

-double LeastResourceScorer::Score(const ResourceRequest &required_resources,
-                                  const NodeResources &node_resources) {
-  // TODO(Shanly): Take normal task resources into account later for GCS-based actor
-  // scheduling.
-
-  double node_score = 0.;
-
-  if (required_resources.predefined_resources.size() >
-      node_resources.predefined_resources.size()) {
-    return -1.;
+double LeastResourceScorer::Score(const ResourceSet &required_resources,
+                                  const SchedulingResources &node_resources) {
+  // In GCS-based actor scheduling, the `resources_available_` (of class
+  // `SchedulingResources`) is only acquired or released by actor scheduling, instead of
+  // being updated by resource reports from raylets. So the 'actual' available resources
+  // (if there exist normal tasks) are equal to `resources_available_` -
+  // `resources_normal_tasks_`.
+  ResourceSet new_available_resource_set;
+  const ResourceSet *available_resource_set = &node_resources.GetAvailableResources();
+  if (!node_resources.GetNormalTaskResources().IsEmpty()) {
+    new_available_resource_set = node_resources.GetAvailableResources();
+    new_available_resource_set.SubtractResources(node_resources.GetNormalTaskResources());
+    available_resource_set = &new_available_resource_set;
   }
+  const auto &available_resource_amount_map =
+      available_resource_set->GetResourceAmountMap();

-  for (size_t i = 0; i < required_resources.predefined_resources.size(); ++i) {
-    const auto &request_resource = required_resources.predefined_resources[i];
-    const auto &node_available_resource =
-        node_resources.predefined_resources[i].available;
-    auto score = Calculate(request_resource, node_available_resource);
-    if (score < 0.) {
-      return -1.;
-    }
-
-    node_score += score;
-  }
-
-  for (const auto &request_resource_entry : required_resources.custom_resources) {
-    auto iter = node_resources.custom_resources.find(request_resource_entry.first);
-    if (iter == node_resources.custom_resources.end()) {
-      return -1.;
-    }
-
-    const auto &request_resource = request_resource_entry.second;
-    const auto &node_available_resource = iter->second.available;
-    auto score = Calculate(request_resource, node_available_resource);
-    if (score < 0.) {
-      return -1.;
+  double node_score = 0.0;
+  for (const auto &entry : required_resources.GetResourceAmountMap()) {
+    auto available_resource_amount_iter = available_resource_amount_map.find(entry.first);
+    if (available_resource_amount_iter == available_resource_amount_map.end()) {
+      return -1;
     }

-    node_score += score;
+    auto calculated_score =
+        Calculate(entry.second, available_resource_amount_iter->second);
+    if (calculated_score < 0) {
+      return -1;
+    }
+    node_score += calculated_score;
   }

   // TODO(ffbin): We always want to choose the node with the least matching resources. We
   // will solve it in next pr.
   return node_score;
 }

@@ -67,22 +61,20 @@ double LeastResourceScorer::Calculate(const FixedPoint &requested,
   if (requested > available) {
     return -1;
   }

   if (available == 0) {
     return 0;
   }

   return (available - requested).Double() / available.Double();
 }

 /////////////////////////////////////////////////////////////////////////////////////////

 SchedulingResult GcsResourceScheduler::Schedule(
-    const std::vector<ResourceRequest> &required_resources_list,
+    const std::vector<ResourceSet> &required_resources_list,
     const SchedulingType &scheduling_type,
     const std::function<bool(const NodeID &)> &node_filter_func) {
+  const auto &cluster_resources = gcs_resource_manager_.GetClusterResources();
+
   // Filter candidate nodes.
-  auto candidate_nodes = FilterCandidateNodes(node_filter_func);
+  absl::flat_hash_set<NodeID> candidate_nodes =
+      FilterCandidateNodes(cluster_resources, node_filter_func);
   if (candidate_nodes.empty()) {
     RAY_LOG(DEBUG) << "The candidate nodes is empty, return directly.";
     return std::make_pair(SchedulingResultStatus::INFEASIBLE, std::vector<NodeID>());
@@ -115,11 +107,12 @@ SchedulingResult GcsResourceScheduler::Schedule(
 }

 absl::flat_hash_set<NodeID> GcsResourceScheduler::FilterCandidateNodes(
+    const absl::flat_hash_map<NodeID, std::shared_ptr<SchedulingResources>>
+        &cluster_resources,
     const std::function<bool(const NodeID &)> &node_filter_func) {
   absl::flat_hash_set<NodeID> result;
-  const auto &resource_view = GetResourceView();
-  result.reserve(resource_view.size());
-  for (const auto &iter : resource_view) {
+  result.reserve(cluster_resources.size());
+  for (const auto &iter : cluster_resources) {
     const auto &node_id = iter.first;
     if (node_filter_func == nullptr || node_filter_func(node_id)) {
       result.emplace(node_id);
@@ -128,8 +121,8 @@ absl::flat_hash_set<NodeID> GcsResourceScheduler::FilterCandidateNodes(
   return result;
 }

-const std::vector<ResourceRequest> &GcsResourceScheduler::SortRequiredResources(
-    const std::vector<ResourceRequest> &required_resources) {
+const std::vector<ResourceSet> &GcsResourceScheduler::SortRequiredResources(
+    const std::vector<ResourceSet> &required_resources) {
   // TODO(ffbin): A bundle may require special resources, such as GPU. We need to
   // schedule bundles with special resource requirements first, which will be implemented
   // in the next pr.
@@ -137,7 +130,7 @@ const std::vector<ResourceRequest> &GcsResourceScheduler::SortRequiredResources(
 }

 SchedulingResult GcsResourceScheduler::StrictSpreadSchedule(
-    const std::vector<ResourceRequest> &required_resources_list,
+    const std::vector<ResourceSet> &required_resources_list,
     const absl::flat_hash_set<NodeID> &candidate_nodes) {
   if (required_resources_list.size() > candidate_nodes.size()) {
     RAY_LOG(DEBUG) << "The number of required resources "
@@ -171,7 +164,7 @@ SchedulingResult GcsResourceScheduler::StrictSpreadSchedule(
 }

 SchedulingResult GcsResourceScheduler::SpreadSchedule(
-    const std::vector<ResourceRequest> &required_resources_list,
+    const std::vector<ResourceSet> &required_resources_list,
     const absl::flat_hash_set<NodeID> &candidate_nodes) {
   std::vector<NodeID> result_nodes;
   absl::flat_hash_set<NodeID> candidate_nodes_copy(candidate_nodes);
@@ -183,7 +176,7 @@ SchedulingResult GcsResourceScheduler::SpreadSchedule(
     // There are nodes to meet the scheduling requirements.
     if (best_node) {
       result_nodes.emplace_back(std::move(*best_node));
-      RAY_CHECK(AllocateRemoteTaskResources(result_nodes.back(), iter));
+      RAY_CHECK(gcs_resource_manager_.AcquireResources(result_nodes.back(), iter));
       candidate_nodes_copy.erase(result_nodes.back());
       selected_nodes.insert(result_nodes.back());
     } else {
@@ -191,7 +184,7 @@ SchedulingResult GcsResourceScheduler::SpreadSchedule(
       auto best_node = GetBestNode(iter, selected_nodes);
       if (best_node) {
         result_nodes.push_back(std::move(*best_node));
-        RAY_CHECK(AllocateRemoteTaskResources(result_nodes.back(), iter));
+        RAY_CHECK(gcs_resource_manager_.AcquireResources(result_nodes.back(), iter));
       } else {
         break;
       }
@@ -209,30 +202,20 @@ SchedulingResult GcsResourceScheduler::SpreadSchedule(
 }

 SchedulingResult GcsResourceScheduler::StrictPackSchedule(
-    const std::vector<ResourceRequest> &required_resources_list,
+    const std::vector<ResourceSet> &required_resources_list,
     const absl::flat_hash_set<NodeID> &candidate_nodes) {
   // Aggregate required resources.
-  ResourceRequest aggregated_resource_request;
-  for (const auto &resource_request : required_resources_list) {
-    if (aggregated_resource_request.predefined_resources.size() <
-        resource_request.predefined_resources.size()) {
-      aggregated_resource_request.predefined_resources.resize(
-          resource_request.predefined_resources.size());
-    }
-    for (size_t i = 0; i < resource_request.predefined_resources.size(); ++i) {
-      aggregated_resource_request.predefined_resources[i] +=
-          resource_request.predefined_resources[i];
-    }
-    for (const auto &entry : resource_request.custom_resources) {
-      aggregated_resource_request.custom_resources[entry.first] += entry.second;
-    }
+  ResourceSet required_resources;
+  for (const auto &iter : required_resources_list) {
+    required_resources.AddResources(iter);
   }

-  const auto &cluster_resource = GetResourceView();
+  const auto &cluster_resource = gcs_resource_manager_.GetClusterResources();

   const auto &right_node_it = std::find_if(
       cluster_resource.begin(), cluster_resource.end(),
-      [&aggregated_resource_request](const auto &entry) {
-        return entry.second->GetLocalView().IsAvailable(aggregated_resource_request);
+      [required_resources](const auto &node_resource) {
+        return required_resources.IsSubset(node_resource.second->GetTotalResources());
       });

   if (right_node_it == cluster_resource.end()) {
@@ -243,7 +226,7 @@ SchedulingResult GcsResourceScheduler::StrictPackSchedule(

   std::vector<NodeID> result_nodes;

-  auto best_node = GetBestNode(aggregated_resource_request, candidate_nodes);
+  auto best_node = GetBestNode(required_resources, candidate_nodes);

   // Select the node with the highest score.
   // `StrictPackSchedule` does not need to consider the scheduling context, because it
@@ -262,12 +245,12 @@ SchedulingResult GcsResourceScheduler::StrictPackSchedule(
 }

 SchedulingResult GcsResourceScheduler::PackSchedule(
-    const std::vector<ResourceRequest> &required_resources_list,
+    const std::vector<ResourceSet> &required_resources_list,
     const absl::flat_hash_set<NodeID> &candidate_nodes) {
   std::vector<NodeID> result_nodes;
   result_nodes.resize(required_resources_list.size());
   absl::flat_hash_set<NodeID> candidate_nodes_copy(candidate_nodes);
-  std::list<std::pair<int, ResourceRequest>> required_resources_list_copy;
+  std::list<std::pair<int, ResourceSet>> required_resources_list_copy;
   int index = 0;
   for (const auto &iter : required_resources_list) {
     required_resources_list_copy.emplace_back(index++, iter);
@@ -282,14 +265,14 @@ SchedulingResult GcsResourceScheduler::PackSchedule(
       break;
     }

-    RAY_CHECK(AllocateRemoteTaskResources(*best_node, required_resources));
+    RAY_CHECK(gcs_resource_manager_.AcquireResources(*best_node, required_resources));
     result_nodes[required_resources_index] = *best_node;
     required_resources_list_copy.pop_front();

     // We try to schedule more resources on one node.
     for (auto iter = required_resources_list_copy.begin();
          iter != required_resources_list_copy.end();) {
-      if (AllocateRemoteTaskResources(*best_node, iter->second)) {
+      if (gcs_resource_manager_.AcquireResources(*best_node, iter->second)) {
         result_nodes[iter->first] = *best_node;
         required_resources_list_copy.erase(iter++);
       } else {
@@ -310,15 +293,17 @@ SchedulingResult GcsResourceScheduler::PackSchedule(
 }

 std::optional<NodeID> GcsResourceScheduler::GetBestNode(
-    const ResourceRequest &required_resources,
+    const ResourceSet &required_resources,
     const absl::flat_hash_set<NodeID> &candidate_nodes) {
   double best_node_score = -1;
   const NodeID *best_node_id = nullptr;
+  const auto &cluster_resources = gcs_resource_manager_.GetClusterResources();

   // Score the nodes.
   for (const auto &node_id : candidate_nodes) {
-    const auto &node_resources = GetNodeResources(node_id);
-    double node_score = node_scorer_->Score(required_resources, node_resources);
+    const auto &iter = cluster_resources.find(node_id);
+    RAY_CHECK(iter != cluster_resources.end());
+    double node_score = node_scorer_->Score(required_resources, *iter->second);
     if (best_node_id == nullptr || best_node_score < node_score) {
       best_node_id = &node_id;
       best_node_score = node_score;
@@ -329,41 +314,19 @@ std::optional<NodeID> GcsResourceScheduler::GetBestNode(
   } else {
     return std::nullopt;
   }
-  return std::nullopt;
 }

 void GcsResourceScheduler::ReleaseTemporarilyDeductedResources(
-    const std::vector<ResourceRequest> &required_resources_list,
+    const std::vector<ResourceSet> &required_resources_list,
     const std::vector<NodeID> &nodes) {
   for (int index = 0; index < (int)nodes.size(); index++) {
     // If `PackSchedule` fails, the id of some nodes may be nil.
     if (!nodes[index].IsNil()) {
-      RAY_CHECK(ReleaseRemoteTaskResources(nodes[index], required_resources_list[index]));
+      RAY_CHECK(gcs_resource_manager_.ReleaseResources(nodes[index],
+                                                       required_resources_list[index]));
     }
   }
 }

-const NodeResources &GcsResourceScheduler::GetNodeResources(const NodeID &node_id) const {
-  const auto &resource_view = GetResourceView();
-  auto iter = resource_view.find(node_id);
-  RAY_CHECK(iter != resource_view.end());
-  return iter->second->GetLocalView();
-}
-
-bool GcsResourceScheduler::AllocateRemoteTaskResources(
-    const NodeID &node_id, const ResourceRequest &resource_request) {
-  return gcs_resource_manager_.AcquireResources(node_id, resource_request);
-}
-
-bool GcsResourceScheduler::ReleaseRemoteTaskResources(
-    const NodeID &node_id, const ResourceRequest &resource_request) {
-  return gcs_resource_manager_.ReleaseResources(node_id, resource_request);
-}
-
-const absl::flat_hash_map<NodeID, std::shared_ptr<Node>>
-    &GcsResourceScheduler::GetResourceView() const {
-  return gcs_resource_manager_.GetClusterResources();
-}
-
 } // namespace gcs
 } // namespace ray

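Reviewer note on LeastResourceScorer::Calculate (context in the hunks above): each resource is scored as (available - requested) / available, with -1 for an infeasible request and 0 when nothing is available, and the per-resource scores are summed into the node score. A worked example with plain doubles standing in for FixedPoint:

    // score = (available - requested) / available; -1 if the request cannot fit.
    #include <iostream>

    double Calculate(double requested, double available) {
      if (requested > available) return -1;  // cannot fit
      if (available == 0) return 0;
      return (available - requested) / available;
    }

    int main() {
      // Request 2 CPUs: the current scorer favors the emptier node (the
      // TODO(ffbin) in the source notes this is not yet the desired
      // least-matching behavior).
      std::cout << Calculate(2, 16) << "\n";  // 0.875
      std::cout << Calculate(2, 4) << "\n";   // 0.5
      std::cout << Calculate(2, 1) << "\n";   // -1 (infeasible)
    }
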
@ -16,9 +16,8 @@
|
|||
#include <optional>
|
||||
|
||||
#include "absl/container/flat_hash_set.h"
|
||||
#include "ray/common/id.h"
|
||||
#include "ray/common/task/scheduling_resources.h"
|
||||
#include "ray/gcs/gcs_server/gcs_resource_manager.h"
|
||||
#include "ray/raylet/scheduling/cluster_resource_data.h"
|
||||
|
||||
namespace ray {
|
||||
namespace gcs {
|
||||
|
@ -62,16 +61,16 @@ class NodeScorer {
|
|||
/// \param node_resources The node resources which contains available and total
|
||||
/// resources.
|
||||
/// \return Score of the node.
|
||||
virtual double Score(const ResourceRequest &required_resources,
|
||||
const NodeResources &node_resources) = 0;
|
||||
virtual double Score(const ResourceSet &required_resources,
|
||||
const SchedulingResources &node_resources) = 0;
|
||||
};
|
||||
|
||||
/// LeastResourceScorer is a score plugin that favors nodes with fewer allocation
|
||||
/// requested resources based on requested resources.
|
||||
class LeastResourceScorer : public NodeScorer {
|
||||
public:
|
||||
double Score(const ResourceRequest &required_resources,
|
||||
const NodeResources &node_resources) override;
|
||||
double Score(const ResourceSet &required_resources,
|
||||
const SchedulingResources &node_resources) override;
|
||||
|
||||
private:
|
||||
/// \brief Calculate one of the resource scores.
|
||||
|
@@ -103,7 +102,7 @@ class GcsResourceScheduler {
  /// otherwise, it will return an empty vector and a flag to indicate whether this
  /// request can be retried or not.
  SchedulingResult Schedule(
      const std::vector<ResourceRequest> &required_resources_list,
      const std::vector<ResourceSet> &required_resources_list,
      const SchedulingType &scheduling_type,
      const std::function<bool(const NodeID &)> &node_filter_func = nullptr);
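The `SchedulingResult` contract described in the comment — the selected nodes on success, an empty vector plus a retryable flag on failure — can be modelled in a few lines. This is a hedged toy with hypothetical types (`Status`, `Result`, a single-node capacity model), not the GCS implementation:

#include <cassert>
#include <utility>
#include <vector>

// Toy model of the result contract: status plus one node id per request.
enum class Status { SUCCESS, FAILED, INFEASIBLE };
using Result = std::pair<Status, std::vector<int>>;

Result ScheduleAll(const std::vector<double> &requests, double node_capacity) {
  std::vector<int> nodes;
  double used = 0.0;
  for (std::size_t i = 0; i < requests.size(); ++i) {
    if (used + requests[i] > node_capacity) {
      // On failure the selected-node list is empty; the status tells the
      // caller whether retrying could ever succeed.
      bool retryable = requests[i] <= node_capacity;
      return {retryable ? Status::FAILED : Status::INFEASIBLE, {}};
    }
    used += requests[i];
    nodes.push_back(0);  // single-node toy: everything lands on node 0
  }
  return {Status::SUCCESS, std::move(nodes)};
}

int main() {
  auto ok = ScheduleAll({1, 2}, 4);
  assert(ok.first == Status::SUCCESS && ok.second.size() == 2);
  auto fail = ScheduleAll({3, 3}, 4);
  assert(fail.first == Status::FAILED && fail.second.empty());
}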
@@ -116,6 +115,8 @@ class GcsResourceScheduler {
  /// can be used for scheduling.
  /// \return The candidate nodes that can be used for scheduling.
  absl::flat_hash_set<NodeID> FilterCandidateNodes(
      const absl::flat_hash_map<NodeID, std::shared_ptr<SchedulingResources>>
          &cluster_resources,
      const std::function<bool(const NodeID &)> &node_filter_func);

  /// Sort required resources according to the scarcity and capacity of resources.

@@ -124,8 +125,8 @@ class GcsResourceScheduler {
  ///
  /// \param required_resources The resources to be scheduled.
  /// \return The sorted resources.
  const std::vector<ResourceRequest> &SortRequiredResources(
      const std::vector<ResourceRequest> &required_resources);
  const std::vector<ResourceSet> &SortRequiredResources(
      const std::vector<ResourceSet> &required_resources);

  /// Schedule resources according to `STRICT_SPREAD` strategy.
  ///
@@ -135,7 +136,7 @@ class GcsResourceScheduler {
  /// otherwise, it will return an empty vector and a flag to indicate whether this
  /// request can be retried or not.
  SchedulingResult StrictSpreadSchedule(
      const std::vector<ResourceRequest> &required_resources_list,
      const std::vector<ResourceSet> &required_resources_list,
      const absl::flat_hash_set<NodeID> &candidate_nodes);

  /// Schedule resources according to `SPREAD` strategy.

@@ -145,9 +146,8 @@ class GcsResourceScheduler {
  /// \return `SchedulingResult`, including the selected nodes if schedule successful,
  /// otherwise, it will return an empty vector and a flag to indicate whether this
  /// request can be retried or not.
  SchedulingResult SpreadSchedule(
      const std::vector<ResourceRequest> &required_resources_list,
      const absl::flat_hash_set<NodeID> &candidate_nodes);
  SchedulingResult SpreadSchedule(const std::vector<ResourceSet> &required_resources_list,
                                  const absl::flat_hash_set<NodeID> &candidate_nodes);

  /// Schedule resources according to `STRICT_PACK` strategy.
  ///

@@ -157,7 +157,7 @@ class GcsResourceScheduler {
  /// otherwise, it will return an empty vector and a flag to indicate whether this
  /// request can be retried or not.
  SchedulingResult StrictPackSchedule(
      const std::vector<ResourceRequest> &required_resources_list,
      const std::vector<ResourceSet> &required_resources_list,
      const absl::flat_hash_set<NodeID> &candidate_nodes);

  /// Schedule resources according to `PACK` strategy.
@@ -167,45 +167,26 @@ class GcsResourceScheduler {
  /// \return `SchedulingResult`, including the selected nodes if schedule successful,
  /// otherwise, it will return an empty vector and a flag to indicate whether this
  /// request can be retried or not.
  SchedulingResult PackSchedule(
      const std::vector<ResourceRequest> &required_resources_list,
      const absl::flat_hash_set<NodeID> &candidate_nodes);
  SchedulingResult PackSchedule(const std::vector<ResourceSet> &required_resources_list,
                                const absl::flat_hash_set<NodeID> &candidate_nodes);

  /// Score all nodes according to the specified resources.
  ///
  /// \param required_resources The resources to be scheduled.
  /// \param candidate_nodes The nodes that can be used for scheduling.
  /// \return The node with the best score, if one exists.
  std::optional<NodeID> GetBestNode(const ResourceRequest &required_resources,
  std::optional<NodeID> GetBestNode(const ResourceSet &required_resources,
                                    const absl::flat_hash_set<NodeID> &candidate_nodes);

  /// Get node resources.
  const NodeResources &GetNodeResources(const NodeID &node_id) const;

  /// Return the resources temporarily deducted from the gcs resource manager.
  ///
  /// \param required_resources_list The resources to be scheduled.
  /// \param nodes The nodes selected by scheduling; they correspond one-to-one with
  /// `required_resources_list`.
  void ReleaseTemporarilyDeductedResources(
      const std::vector<ResourceRequest> &required_resources_list,
      const std::vector<ResourceSet> &required_resources_list,
      const std::vector<NodeID> &nodes);

  /// Subtract the resources required by a given resource request (resource_request) from
  /// a given remote node.
  ///
  /// \param node_id Remote node whose resources we allocate.
  /// \param resource_request Task for which we allocate resources.
  /// \return True if the remote node has enough resources to satisfy the resource
  /// request. False otherwise.
  bool AllocateRemoteTaskResources(const NodeID &node_id,
                                   const ResourceRequest &resource_request);

  bool ReleaseRemoteTaskResources(const NodeID &node_id,
                                  const ResourceRequest &resource_request);

  const absl::flat_hash_map<NodeID, std::shared_ptr<Node>> &GetResourceView() const;

  /// Reference of GcsResourceManager.
  GcsResourceManager &gcs_resource_manager_;
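The four strategies declared above differ mainly in whether a node may be reused across requests. As a rough illustration of the strictest case (a toy under assumed semantics, not the actual StrictSpreadSchedule), STRICT_SPREAD can be read as "each request lands on a distinct feasible node, or the whole batch fails":

#include <optional>
#include <set>
#include <vector>

// Toy types: a node is an id plus a single CPU capacity.
struct ToyNode { int id; double available; };

// Hypothetical STRICT_SPREAD sketch: never reuse a node; if any request
// cannot get its own node, the whole placement is rejected.
std::optional<std::vector<int>> StrictSpread(std::vector<ToyNode> nodes,
                                             const std::vector<double> &reqs) {
  std::vector<int> picked;
  std::set<int> used;
  for (double r : reqs) {
    bool placed = false;
    for (auto &n : nodes) {
      if (!used.count(n.id) && n.available >= r) {
        n.available -= r;
        used.insert(n.id);       // each request must land on a distinct node
        picked.push_back(n.id);
        placed = true;
        break;
      }
    }
    if (!placed) return std::nullopt;  // fewer feasible nodes than requests
  }
  return picked;
}

A PACK-style loop would be the mirror image: keep placing on the current best node until it no longer fits, then move on to the next.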
@@ -157,10 +157,11 @@ TEST_F(GcsBasedActorSchedulerTest, TestScheduleAndDestroyOneActor) {
  auto node = AddNewNode(node_resources);
  auto node_id = NodeID::FromBinary(node->node_id());
  ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size());
  absl::flat_hash_map<NodeID, std::shared_ptr<Node>> cluster_resources_before_scheduling;
  absl::flat_hash_map<NodeID, std::shared_ptr<SchedulingResources>>
      cluster_resources_before_scheduling;
  for (auto &entry : gcs_resource_manager_->GetClusterResources()) {
    cluster_resources_before_scheduling.emplace(entry.first,
                                                std::make_shared<Node>(*entry.second));
    cluster_resources_before_scheduling.emplace(
        entry.first, std::make_shared<SchedulingResources>(*entry.second));
  }
  ASSERT_TRUE(cluster_resources_before_scheduling.contains(node_id));

@@ -194,15 +195,17 @@ TEST_F(GcsBasedActorSchedulerTest, TestScheduleAndDestroyOneActor) {

  auto cluster_resources_after_scheduling = gcs_resource_manager_->GetClusterResources();
  ASSERT_TRUE(cluster_resources_after_scheduling.contains(node_id));
  ASSERT_NE(cluster_resources_before_scheduling[node_id]->GetLocalView(),
            cluster_resources_after_scheduling[node_id]->GetLocalView());
  ASSERT_FALSE(
      cluster_resources_before_scheduling[node_id]->GetAvailableResources().IsEqual(
          cluster_resources_after_scheduling[node_id]->GetAvailableResources()));

  // When destroying an actor, its acquired resources have to be returned.
  gcs_actor_scheduler_->OnActorDestruction(actor);
  auto cluster_resources_after_destruction = gcs_resource_manager_->GetClusterResources();
  ASSERT_TRUE(cluster_resources_after_destruction.contains(node_id));
  ASSERT_EQ(cluster_resources_before_scheduling[node_id]->GetLocalView(),
            cluster_resources_after_scheduling[node_id]->GetLocalView());
  ASSERT_TRUE(
      cluster_resources_before_scheduling[node_id]->GetAvailableResources().IsEqual(
          cluster_resources_after_destruction[node_id]->GetAvailableResources()));
}

TEST_F(GcsBasedActorSchedulerTest, TestBalancedSchedule) {
@@ -50,18 +50,17 @@ TEST_F(GcsResourceManagerTest, TestBasic) {
  ASSERT_EQ(1, cluster_resource.size());

  const auto &node_id = NodeID::FromBinary(node->node_id());
  auto resource_request =
      ResourceMapToResourceRequest(resource_map, /*requires_object_store_memory=*/false);
  ResourceSet resource_set(resource_map);

  // Test `AcquireResources`.
  ASSERT_TRUE(gcs_resource_manager_->AcquireResources(node_id, resource_request));
  ASSERT_FALSE(gcs_resource_manager_->AcquireResources(node_id, resource_request));
  ASSERT_TRUE(gcs_resource_manager_->AcquireResources(node_id, resource_set));
  ASSERT_FALSE(gcs_resource_manager_->AcquireResources(node_id, resource_set));

  // Test `ReleaseResources`.
  ASSERT_TRUE(
      gcs_resource_manager_->ReleaseResources(NodeID::FromRandom(), resource_request));
  ASSERT_TRUE(gcs_resource_manager_->ReleaseResources(node_id, resource_request));
  ASSERT_TRUE(gcs_resource_manager_->AcquireResources(node_id, resource_request));
      gcs_resource_manager_->ReleaseResources(NodeID::FromRandom(), resource_set));
  ASSERT_TRUE(gcs_resource_manager_->ReleaseResources(node_id, resource_set));
  ASSERT_TRUE(gcs_resource_manager_->AcquireResources(node_id, resource_set));
}

TEST_F(GcsResourceManagerTest, TestResourceUsageAPI) {
@@ -52,21 +52,8 @@ class GcsResourceSchedulerTest : public ::testing::Test {
    const auto &cluster_resource = gcs_resource_manager_->GetClusterResources();
    auto iter = cluster_resource.find(node_id);
    ASSERT_TRUE(iter != cluster_resource.end());
    const auto &node_resources = iter->second->GetLocalView();
    auto resource_id = scheduling::ResourceID(resource_name).ToInt();
    ASSERT_NE(resource_id, -1);

    const ResourceCapacity *resource_capacity = nullptr;
    if (resource_id >= 0 && resource_id < PredefinedResources_MAX) {
      resource_capacity = &node_resources.predefined_resources[resource_id];
    } else {
      auto iter = node_resources.custom_resources.find(resource_id);
      if (iter != node_resources.custom_resources.end()) {
        resource_capacity = &iter->second;
      }
    }
    ASSERT_TRUE(resource_capacity != nullptr);
    ASSERT_EQ(resource_capacity->available.Double(), resource_value);
    ASSERT_EQ(iter->second->GetAvailableResources().GetResource(resource_name).Double(),
              resource_value);
  }

  void TestResourceLeaks(const gcs::SchedulingType &scheduling_type) {
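The hunk above collapses a two-path lookup (a fixed array for predefined resources, a map for custom ones) into a single name-based query. A toy contrast of the two styles, under assumed types that are not the real Ray classes:

#include <array>
#include <string>
#include <unordered_map>

// Hypothetical sketch of the two lookup styles seen in this hunk.
constexpr int kPredefinedMax = 4;  // e.g. CPU, GPU, memory, object store (assumed)

// Old style: predefined resources live in an indexed array, custom ones in a
// map keyed by integer id; callers must branch on the id range.
struct OldView {
  std::array<double, kPredefinedMax> predefined{};
  std::unordered_map<int, double> custom;
  double Available(int id) const {
    if (id >= 0 && id < kPredefinedMax) return predefined[id];
    auto it = custom.find(id);
    return it == custom.end() ? 0.0 : it->second;
  }
};

// New style: one uniform name -> quantity query, no branching at call sites.
struct NewView {
  std::unordered_map<std::string, double> available;
  double Available(const std::string &name) const {
    auto it = available.find(name);
    return it == available.end() ? 0.0 : it->second;
  }
};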
@@ -77,12 +64,11 @@ class GcsResourceSchedulerTest : public ::testing::Test {
    AddClusterResources(node_id, cpu_resource, node_cpu_num);

    // Scheduling succeeded and node resources are used up.
    std::vector<ResourceRequest> required_resources_list;
    std::vector<ResourceSet> required_resources_list;
    absl::flat_hash_map<std::string, double> resource_map;
    for (int bundle_cpu_num = 1; bundle_cpu_num <= 3; ++bundle_cpu_num) {
      resource_map[cpu_resource] = bundle_cpu_num;
      required_resources_list.emplace_back(ResourceMapToResourceRequest(
          resource_map, /*requires_object_store_memory=*/false));
      required_resources_list.emplace_back(resource_map);
    }
    const auto &result1 =
        gcs_resource_scheduler_->Schedule(required_resources_list, scheduling_type);
@@ -94,9 +80,7 @@ class GcsResourceSchedulerTest : public ::testing::Test {

    // Scheduling failure.
    resource_map[cpu_resource] = 5;
    required_resources_list.emplace_back(
        ResourceMapToResourceRequest(resource_map,
                                     /*requires_object_store_memory=*/false));
    required_resources_list.emplace_back(resource_map);
    const auto &result2 =
        gcs_resource_scheduler_->Schedule(required_resources_list, scheduling_type);
    ASSERT_TRUE(result2.first == gcs::SchedulingResultStatus::FAILED);
@@ -129,11 +113,10 @@ TEST_F(GcsResourceSchedulerTest, TestNodeFilter) {
  AddClusterResources(node_id, cpu_resource, node_cpu_num);

  // Scheduling failure.
  std::vector<ResourceRequest> required_resources_list;
  std::vector<ResourceSet> required_resources_list;
  absl::flat_hash_map<std::string, double> resource_map;
  resource_map[cpu_resource] = 1;
  required_resources_list.emplace_back(
      ResourceMapToResourceRequest(resource_map, /*requires_object_store_memory=*/false));
  required_resources_list.emplace_back(resource_map);
  const auto &result1 = gcs_resource_scheduler_->Schedule(
      required_resources_list, gcs::SchedulingType::STRICT_SPREAD,
      [](const NodeID &) { return false; });
@@ -158,12 +141,11 @@ TEST_F(GcsResourceSchedulerTest, TestSchedulingResultStatusForStrictStrategy) {
  AddClusterResources(node_tow_id, cpu_resource, node_cpu_num);

  // Mock a request that has three required resources.
  std::vector<ResourceRequest> required_resources_list;
  std::vector<ResourceSet> required_resources_list;
  absl::flat_hash_map<std::string, double> resource_map;
  resource_map[cpu_resource] = 1;
  for (int node_number = 0; node_number < 3; node_number++) {
    required_resources_list.emplace_back(ResourceMapToResourceRequest(
        resource_map, /*requires_object_store_memory=*/false));
    required_resources_list.emplace_back(resource_map);
  }

  const auto &result1 = gcs_resource_scheduler_->Schedule(
@@ -180,8 +162,7 @@ TEST_F(GcsResourceSchedulerTest, TestSchedulingResultStatusForStrictStrategy) {
  required_resources_list.clear();
  resource_map.clear();
  resource_map[cpu_resource] = 50;
  required_resources_list.emplace_back(
      ResourceMapToResourceRequest(resource_map, /*requires_object_store_memory=*/false));
  required_resources_list.emplace_back(resource_map);

  const auto &result2 = gcs_resource_scheduler_->Schedule(
      required_resources_list, gcs::SchedulingType::STRICT_PACK);
@@ -64,7 +64,7 @@ bool NewPlacementGroupResourceManager::PrepareBundle(
  auto resource_instances = std::make_shared<TaskResourceInstances>();
  bool allocated =
      cluster_resource_scheduler_->GetLocalResourceManager().AllocateLocalTaskResources(
          bundle_spec.GetRequiredResources(), resource_instances);
          bundle_spec.GetRequiredResources().GetResourceMap(), resource_instances);

  if (!allocated) {
    return false;
@@ -294,7 +294,7 @@ bool NodeResources::IsFeasible(const ResourceRequest &resource_request) const {
  return true;
}

bool NodeResources::operator==(const NodeResources &other) const {
bool NodeResources::operator==(const NodeResources &other) {
  for (size_t i = 0; i < PredefinedResources_MAX; i++) {
    if (this->predefined_resources[i].total != other.predefined_resources[i].total) {
      return false;

@@ -325,9 +325,7 @@ bool NodeResources::operator==(const NodeResources &other) const {
  return true;
}

bool NodeResources::operator!=(const NodeResources &other) const {
  return !(*this == other);
}
bool NodeResources::operator!=(const NodeResources &other) { return !(*this == other); }

std::string NodeResources::DebugString() const {
  std::stringstream buffer;
@@ -165,8 +165,8 @@ class NodeResources {
  /// Note: This doesn't account for the binpacking of unit resources.
  bool IsFeasible(const ResourceRequest &resource_request) const;
  /// Returns whether this equals another NodeResources instance.
  bool operator==(const NodeResources &other) const;
  bool operator!=(const NodeResources &other) const;
  bool operator==(const NodeResources &other);
  bool operator!=(const NodeResources &other);
  /// Returns a human-readable string for these resources.
  std::string DebugString() const;
  /// Returns a compact dict-like string.
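Worth noting about this hunk: the revert drops the `const` qualifiers from the comparison operators, which means they can no longer be invoked through const objects or references. A minimal standalone illustration (a toy struct, not NodeResources):

// Why the const qualifier matters: a non-const operator== cannot be
// called on a const left-hand side.
struct Resources {
  double cpus;
  bool operator==(const Resources &other) { return cpus == other.cpus; }
};

bool SameCapacity(const Resources &a, const Resources &b) {
  // return a == b;       // would not compile: a is const, operator== is not
  Resources copy = a;     // workaround: compare through a mutable copy
  return copy == b;
}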