[scheduler] hide StringIDMap under BaseSchedulingID (#22722)
* add

* address comments
parent 271ed44143
commit 3e3db8e9cd
24 changed files with 695 additions and 665 deletions
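In short: every raw `std::string` / `int64_t` identifier threaded through the scheduler is replaced by the typed wrappers `scheduling::NodeID` and `scheduling::ResourceID`, so the `StringIdMap` interning table no longer needs to be passed to `ClusterResourceManager`, `NodeResources`, and their callers. The surface visible in this diff is small: construction from a string or an `int64_t`, `ToInt()`, `Binary()`, equality, and use as a hash-map key. Below is a minimal sketch of the pattern the commit title names; the template name `BaseSchedulingID` comes from the title, and the body is an illustrative assumption, not Ray's actual implementation (which is presumably thread-safe and pre-registers the predefined resource labels):

// Sketch only: the shape of the typed-ID pattern this commit introduces.
// Everything here is an assumption except the observable surface
// (NodeID/ResourceID, ToInt(), Binary(), construction from string/int64_t).
#include <cstdint>
#include <string>
#include <unordered_map>

// String <-> int interning table. After this commit it is an implementation
// detail of the ID types instead of a parameter passed through every call.
class StringIdMap {
 public:
  int64_t Insert(const std::string &name) {
    auto it = ids_.find(name);
    if (it != ids_.end()) return it->second;
    int64_t id = static_cast<int64_t>(names_.size());
    ids_.emplace(name, id);
    names_.emplace(id, name);
    return id;
  }
  const std::string &Get(int64_t id) const { return names_.at(id); }

 private:
  std::unordered_map<std::string, int64_t> ids_;
  std::unordered_map<int64_t, std::string> names_;
};

// One strongly typed ID per tag, so a NodeID can never be passed where a
// ResourceID is expected, even though both are interned integers underneath.
template <typename Tag>
class BaseSchedulingID {
 public:
  explicit BaseSchedulingID(const std::string &name) : id_(map().Insert(name)) {}
  explicit BaseSchedulingID(int64_t id) : id_(id) {}
  int64_t ToInt() const { return id_; }
  std::string Binary() const { return map().Get(id_); }
  bool operator==(const BaseSchedulingID &other) const { return id_ == other.id_; }

 private:
  // Hidden per-type singleton. The real class would also need a hash
  // specialization so the ID can key an absl::flat_hash_map, as the
  // GetResourceView() signature below requires.
  static StringIdMap &map() {
    static StringIdMap m;
    return m;
  }
  int64_t id_;
};

struct NodeTag {};
struct ResourceTag {};
using NodeID = BaseSchedulingID<NodeTag>;
using ResourceID = BaseSchedulingID<ResourceTag>;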
BUILD.bazel (14 changes)

@@ -1066,6 +1066,20 @@ cc_test(
     ],
 )
 
+cc_test(
+    name = "scheduling_ids_test",
+    size = "small",
+    srcs = [
+        "src/ray/raylet/scheduling/scheduling_ids_test.cc",
+    ],
+    copts = COPTS,
+    tags = ["team:core"],
+    deps = [
+        ":raylet_lib",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_test(
     name = "local_object_manager_test",
     size = "small",
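The new target gives the ID layer its own unit test. Its body is not shown in this excerpt; the following is only a plausible round-trip check built from the surface visible elsewhere in the diff (the test name, include path, and assertions are all hypothetical):

#include "gtest/gtest.h"
// Include path inferred from the srcs entry above; hypothetical.
#include "ray/raylet/scheduling/scheduling_ids.h"

TEST(SchedulingIDsTest, StringRoundTrip) {
  // The same string should intern to the same integer ID...
  ray::scheduling::NodeID a("node-1");
  ray::scheduling::NodeID b("node-1");
  ASSERT_EQ(a.ToInt(), b.ToInt());
  // ...and Binary() should recover the original string.
  ASSERT_EQ(a.Binary(), "node-1");
  // ResourceIDs intern in their own ID space, independent of NodeIDs.
  ray::scheduling::ResourceID r("custom_resource");
  ASSERT_EQ(r.Binary(), "custom_resource");
}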
@@ -30,12 +30,6 @@ namespace ray {
 /// divide to convert from internal to actual.
 constexpr double kResourceConversionFactor = 10000;
 
-const std::string kCPU_ResourceLabel = "CPU";
-const std::string kGPU_ResourceLabel = "GPU";
-const std::string kObjectStoreMemory_ResourceLabel = "object_store_memory";
-const std::string kMemory_ResourceLabel = "memory";
-const std::string kBundle_ResourceLabel = "bundle";
-
 /// \class ResourceSet
 /// \brief Encapsulates and operates on a set of resources, including CPUs,
 /// GPUs, and custom labels.
@@ -313,8 +313,8 @@ NodeManager::NodeManager(instrumented_io_context &io_service, const NodeID &self
   SchedulingResources local_resources(config.resource_config);
   cluster_resource_scheduler_ =
       std::shared_ptr<ClusterResourceScheduler>(new ClusterResourceScheduler(
-          self_node_id_.Binary(), local_resources.GetTotalResources().GetResourceMap(),
-          *gcs_client_,
+          scheduling::NodeID(self_node_id_.Binary()),
+          local_resources.GetTotalResources().GetResourceMap(), *gcs_client_,
           [this]() {
             if (RayConfig::instance().scheduler_report_pinned_bytes_only()) {
               return local_object_manager_.GetPinnedBytes();

@@ -771,7 +771,7 @@ void NodeManager::WarnResourceDeadlock() {
         << "\n"
         << "Available resources on this node: "
         << cluster_resource_scheduler_->GetClusterResourceManager()
-               .GetNodeResourceViewString(self_node_id_.Binary())
+               .GetNodeResourceViewString(scheduling::NodeID(self_node_id_.Binary()))
         << " In total there are " << pending_tasks << " pending tasks and "
         << pending_actor_creations << " pending actors on this node.";
 
@@ -847,7 +847,7 @@ void NodeManager::NodeRemoved(const NodeID &node_id) {
 
   // Remove the node from the resource map.
   if (!cluster_resource_scheduler_->GetClusterResourceManager().RemoveNode(
-          node_id.Binary())) {
+          scheduling::NodeID(node_id.Binary()))) {
     RAY_LOG(DEBUG) << "Received NodeRemoved callback for an unknown node: " << node_id
                    << ".";
     return;

@@ -933,7 +933,8 @@ void NodeManager::ResourceCreateUpdated(const NodeID &node_id,
     const std::string &resource_label = resource_pair.first;
     const double &new_resource_capacity = resource_pair.second;
     cluster_resource_scheduler_->GetClusterResourceManager().UpdateResourceCapacity(
-        node_id.Binary(), resource_label, new_resource_capacity);
+        scheduling::NodeID(node_id.Binary()), scheduling::ResourceID(resource_label),
+        new_resource_capacity);
   }
   RAY_LOG(DEBUG) << "[ResourceCreateUpdated] Updated cluster_resource_map.";
   cluster_task_manager_->ScheduleAndDispatchTasks();

@@ -961,7 +962,7 @@ void NodeManager::ResourceDeleted(const NodeID &node_id,
     // Update local_available_resources_ and SchedulingResources
     for (const auto &resource_label : resource_names) {
       cluster_resource_scheduler_->GetClusterResourceManager().DeleteResource(
-          node_id.Binary(), resource_label);
+          scheduling::NodeID(node_id.Binary()), scheduling::ResourceID(resource_label));
     }
     return;
   }

@@ -969,7 +970,7 @@ void NodeManager::ResourceDeleted(const NodeID &node_id,
 void NodeManager::UpdateResourceUsage(const NodeID &node_id,
                                       const rpc::ResourcesData &resource_data) {
   if (!cluster_resource_scheduler_->GetClusterResourceManager().UpdateNode(
-          node_id.Binary(), resource_data)) {
+          scheduling::NodeID(node_id.Binary()), resource_data)) {
     RAY_LOG(INFO)
         << "[UpdateResourceUsage]: received resource usage from unknown node id "
         << node_id;

@@ -1605,7 +1606,8 @@ void NodeManager::HandleRequestWorkerLease(const rpc::RequestWorkerLeaseRequest
                    << ", normal_task_resources = " << normal_task_resources.ToString()
                    << ", local_resoruce_view = "
                    << cluster_resource_scheduler_->GetClusterResourceManager()
-                          .GetNodeResourceViewString(self_node_id_.Binary());
+                          .GetNodeResourceViewString(
+                              scheduling::NodeID(self_node_id_.Binary()));
     auto resources_data = reply->mutable_resources_data();
     resources_data->set_node_id(self_node_id_.Binary());
     resources_data->set_resources_normal_task_changed(true);
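All seven node manager hunks above follow one idiom: the raw binary node-ID string is wrapped into a typed ID exactly once, at the call boundary, e.g.

    cluster_resource_scheduler_->GetClusterResourceManager().RemoveNode(
        scheduling::NodeID(node_id.Binary()));

The string-typed overloads (`RemoveNode`, `UpdateNode`, `UpdateResourceCapacity`, `DeleteResource`, `GetNodeResourceViewString`) disappear from `ClusterResourceManager` further down in this commit.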
@@ -128,7 +128,6 @@ void NewPlacementGroupResourceManager::CommitBundle(
   const auto &bundle_state = it->second;
   bundle_state->state_ = CommitState::COMMITTED;
 
-  const auto &string_id_map = cluster_resource_scheduler_->GetStringIdMap();
   const auto &task_resource_instances = *bundle_state->resources_;
 
   const auto &resources = bundle_spec.GetFormattedResources();

@@ -136,13 +135,12 @@ void NewPlacementGroupResourceManager::CommitBundle(
     const auto &resource_name = resource.first;
     const auto &original_resource_name = GetOriginalResourceName(resource_name);
     if (original_resource_name != kBundle_ResourceLabel) {
-      const auto &instances =
-          task_resource_instances.Get(original_resource_name, string_id_map);
+      const auto &instances = task_resource_instances.Get(original_resource_name);
       cluster_resource_scheduler_->GetLocalResourceManager().AddLocalResourceInstances(
-          resource_name, instances);
+          scheduling::ResourceID{resource_name}, instances);
     } else {
       cluster_resource_scheduler_->GetLocalResourceManager().AddLocalResourceInstances(
-          resource_name, {resource.second});
+          scheduling::ResourceID{resource_name}, {resource.second});
     }
   }
   update_resources_(

@@ -185,14 +183,15 @@ void NewPlacementGroupResourceManager::ReturnBundle(
 
   std::vector<std::string> deleted;
   for (const auto &resource : placement_group_resources) {
+    auto resource_id = scheduling::ResourceID{resource.first};
     if (cluster_resource_scheduler_->GetLocalResourceManager().IsAvailableResourceEmpty(
-            resource.first)) {
+            resource_id)) {
       RAY_LOG(DEBUG) << "Available bundle resource:[" << resource.first
                      << "] is empty, Will delete it from local resource";
       // Delete local resource if available resource is empty when return bundle, or there
       // will be resource leak.
       cluster_resource_scheduler_->GetLocalResourceManager().DeleteLocalResource(
-          resource.first);
+          resource_id);
       deleted.push_back(resource.first);
     } else {
       RAY_LOG(DEBUG) << "Available bundle resource:[" << resource.first
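A small cleanup rides along in `ReturnBundle`: the `scheduling::ResourceID` is now constructed once per loop iteration and reused for both the emptiness check and the delete, instead of converting `resource.first` separately at each call site.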
@@ -41,8 +41,8 @@ class NewPlacementGroupResourceManagerTest : public ::testing::Test {
   }
   void InitLocalAvailableResource(
       absl::flat_hash_map<std::string, double> &unit_resource) {
-    cluster_resource_scheduler_ =
-        std::make_shared<ClusterResourceScheduler>("local", unit_resource, *gcs_client_);
+    cluster_resource_scheduler_ = std::make_shared<ClusterResourceScheduler>(
+        scheduling::NodeID("local"), unit_resource, *gcs_client_);
     new_placement_group_resource_manager_.reset(
         new raylet::NewPlacementGroupResourceManager(
             cluster_resource_scheduler_,

@@ -57,13 +57,13 @@ class NewPlacementGroupResourceManagerTest : public ::testing::Test {
   void CheckAvailableResoueceEmpty(const std::string &resource) {
     ASSERT_TRUE(
         cluster_resource_scheduler_->GetLocalResourceManager().IsAvailableResourceEmpty(
-            resource));
+            scheduling::ResourceID(resource)));
   }
 
   void CheckRemainingResourceCorrect(NodeResources &node_resources) {
     auto local_node_resource =
         cluster_resource_scheduler_->GetClusterResourceManager().GetNodeResources(
-            "local");
+            scheduling::NodeID("local"));
     ASSERT_TRUE(local_node_resource == node_resources);
   }
 
@@ -130,7 +130,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewCommitBundleResource) {
       {"bundle_group_1_" + group_id.Hex(), 1000},
       {"bundle_group_" + group_id.Hex(), 1000}};
   auto remaining_resource_scheduler = std::make_shared<ClusterResourceScheduler>(
-      "remaining", remaining_resources, *gcs_client_);
+      scheduling::NodeID("remaining"), remaining_resources, *gcs_client_);
   std::shared_ptr<TaskResourceInstances> resource_instances =
       std::make_shared<TaskResourceInstances>();
   ASSERT_TRUE(

@@ -138,7 +138,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewCommitBundleResource) {
           unit_resource, resource_instances));
   auto remaining_resource_instance =
       remaining_resource_scheduler->GetClusterResourceManager().GetNodeResources(
-          "remaining");
+          scheduling::NodeID("remaining"));
   CheckRemainingResourceCorrect(remaining_resource_instance);
 }
 
@@ -163,10 +163,10 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewReturnBundleResource) {
   ASSERT_TRUE(delete_called_);
   /// 5. check remaining resources is correct.
   auto remaining_resource_scheduler = std::make_shared<ClusterResourceScheduler>(
-      "remaining", unit_resource, *gcs_client_);
+      scheduling::NodeID("remaining"), unit_resource, *gcs_client_);
   auto remaining_resource_instance =
       remaining_resource_scheduler->GetClusterResourceManager().GetNodeResources(
-          "remaining");
+          scheduling::NodeID("remaining"));
   CheckRemainingResourceCorrect(remaining_resource_instance);
 }
 
@@ -204,7 +204,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewMultipleBundlesCommitAndRetu
       {"bundle_group_2_" + group_id.Hex(), 1000},
       {"bundle_group_" + group_id.Hex(), 2000}};
   auto remaining_resource_scheduler = std::make_shared<ClusterResourceScheduler>(
-      "remaining", remaining_resources, *gcs_client_);
+      scheduling::NodeID("remaining"), remaining_resources, *gcs_client_);
   std::shared_ptr<TaskResourceInstances> resource_instances =
       std::make_shared<TaskResourceInstances>();
   ASSERT_TRUE(

@@ -212,7 +212,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewMultipleBundlesCommitAndRetu
           init_unit_resource, resource_instances));
   auto remaining_resource_instance =
       remaining_resource_scheduler->GetClusterResourceManager().GetNodeResources(
-          "remaining");
+          scheduling::NodeID("remaining"));
 
   CheckRemainingResourceCorrect(remaining_resource_instance);
   /// 5. return second bundle.

@@ -228,7 +228,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewMultipleBundlesCommitAndRetu
       {"bundle_group_1_" + group_id.Hex(), 1000},
       {"bundle_group_" + group_id.Hex(), 2000}};
   remaining_resource_scheduler = std::make_shared<ClusterResourceScheduler>(
-      "remaining", remaining_resources, *gcs_client_);
+      scheduling::NodeID("remaining"), remaining_resources, *gcs_client_);
   ASSERT_TRUE(
       remaining_resource_scheduler->GetLocalResourceManager().AllocateLocalTaskResources(
           {{"CPU_group_" + group_id.Hex(), 1.0},

@@ -237,17 +237,17 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewMultipleBundlesCommitAndRetu
           resource_instances));
   remaining_resource_instance =
       remaining_resource_scheduler->GetClusterResourceManager().GetNodeResources(
-          "remaining");
+          scheduling::NodeID("remaining"));
   CheckRemainingResourceCorrect(remaining_resource_instance);
   /// 7. return first bundle.
   new_placement_group_resource_manager_->ReturnBundle(first_bundle_spec);
   /// 8. check remaining resources is correct after all bundle returned.
   remaining_resources = {{"CPU", 2.0}};
   remaining_resource_scheduler = std::make_shared<ClusterResourceScheduler>(
-      "remaining", remaining_resources, *gcs_client_);
+      scheduling::NodeID("remaining"), remaining_resources, *gcs_client_);
   remaining_resource_instance =
       remaining_resource_scheduler->GetClusterResourceManager().GetNodeResources(
-          "remaining");
+          scheduling::NodeID("remaining"));
   ASSERT_TRUE(update_called_);
   ASSERT_TRUE(delete_called_);
   CheckRemainingResourceCorrect(remaining_resource_instance);

@@ -270,7 +270,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewIdempotencyWithMultiPrepare)
   /// 4. check remaining resources is correct.
   absl::flat_hash_map<std::string, double> remaining_resources = {{"CPU", 3.0}};
   auto remaining_resource_scheduler = std::make_shared<ClusterResourceScheduler>(
-      "remaining", remaining_resources, *gcs_client_);
+      scheduling::NodeID("remaining"), remaining_resources, *gcs_client_);
   std::shared_ptr<TaskResourceInstances> resource_instances =
       std::make_shared<TaskResourceInstances>();
   ASSERT_TRUE(

@@ -278,7 +278,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewIdempotencyWithMultiPrepare)
           unit_resource, resource_instances));
   auto remaining_resource_instance =
       remaining_resource_scheduler->GetClusterResourceManager().GetNodeResources(
-          "remaining");
+          scheduling::NodeID("remaining"));
   CheckRemainingResourceCorrect(remaining_resource_instance);
 }
 
@@ -307,7 +307,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewIdempotencyWithRandomOrder)
       {"bundle_group_1_" + group_id.Hex(), 1000},
       {"bundle_group_" + group_id.Hex(), 1000}};
   auto remaining_resource_scheduler = std::make_shared<ClusterResourceScheduler>(
-      "remaining", remaining_resources, *gcs_client_);
+      scheduling::NodeID("remaining"), remaining_resources, *gcs_client_);
   std::shared_ptr<TaskResourceInstances> resource_instances =
       std::make_shared<TaskResourceInstances>();
   ASSERT_TRUE(

@@ -315,7 +315,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewIdempotencyWithRandomOrder)
           unit_resource, resource_instances));
   auto remaining_resource_instance =
       remaining_resource_scheduler->GetClusterResourceManager().GetNodeResources(
-          "remaining");
+          scheduling::NodeID("remaining"));
   CheckRemainingResourceCorrect(remaining_resource_instance);
   new_placement_group_resource_manager_->ReturnBundle(bundle_spec);
   // 5. prepare bundle -> commit bundle -> commit bundle.

@@ -336,10 +336,10 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewIdempotencyWithRandomOrder)
       ConvertSingleSpecToVectorPtrs(bundle_spec));
   // 8. check remaining resources is correct.
   remaining_resource_scheduler = std::make_shared<ClusterResourceScheduler>(
-      "remaining", available_resource, *gcs_client_);
+      scheduling::NodeID("remaining"), available_resource, *gcs_client_);
   remaining_resource_instance =
       remaining_resource_scheduler->GetClusterResourceManager().GetNodeResources(
-          "remaining");
+          scheduling::NodeID("remaining"));
   CheckRemainingResourceCorrect(remaining_resource_instance);
 }
 
@@ -360,10 +360,10 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestPreparedResourceBatched) {
   // 4. check remaining resources is correct.
   absl::flat_hash_map<std::string, double> remaining_resources = {{"CPU", 3.0}};
   auto remaining_resource_scheduler = std::make_shared<ClusterResourceScheduler>(
-      "remaining", remaining_resources, *gcs_client_);
+      scheduling::NodeID("remaining"), remaining_resources, *gcs_client_);
   auto remaining_resource_instance =
       remaining_resource_scheduler->GetClusterResourceManager().GetNodeResources(
-          "remaining");
+          scheduling::NodeID("remaining"));
   CheckRemainingResourceCorrect(remaining_resource_instance);
   // 5. re-init the local available resource with 4 CPUs.
   available_resource = {std::make_pair("CPU", 4.0)};

@@ -386,7 +386,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestPreparedResourceBatched) {
       {"bundle_group_4_" + group_id.Hex(), 1000},
      {"bundle_group_" + group_id.Hex(), 4000}};
   remaining_resource_scheduler = std::make_shared<ClusterResourceScheduler>(
-      "remaining", remaining_resources, *gcs_client_);
+      scheduling::NodeID("remaining"), remaining_resources, *gcs_client_);
   std::shared_ptr<TaskResourceInstances> resource_instances =
       std::make_shared<TaskResourceInstances>();
   absl::flat_hash_map<std::string, double> allocating_resource;

@@ -396,7 +396,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestPreparedResourceBatched) {
           allocating_resource, resource_instances));
   remaining_resource_instance =
       remaining_resource_scheduler->GetClusterResourceManager().GetNodeResources(
-          "remaining");
+          scheduling::NodeID("remaining"));
   RAY_LOG(INFO) << "The current local resource view: "
                 << cluster_resource_scheduler_->DebugString();
   CheckRemainingResourceCorrect(remaining_resource_instance);

@@ -432,7 +432,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestCommiteResourceBatched) {
       {"bundle_group_4_" + group_id.Hex(), 1000},
       {"bundle_group_" + group_id.Hex(), 4000}};
   auto remaining_resource_scheduler = std::make_shared<ClusterResourceScheduler>(
-      "remaining", remaining_resources, *gcs_client_);
+      scheduling::NodeID("remaining"), remaining_resources, *gcs_client_);
   std::shared_ptr<TaskResourceInstances> resource_instances =
       std::make_shared<TaskResourceInstances>();
   absl::flat_hash_map<std::string, double> allocating_resource;

@@ -442,7 +442,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestCommiteResourceBatched) {
           allocating_resource, resource_instances));
   auto remaining_resource_instance =
       remaining_resource_scheduler->GetClusterResourceManager().GetNodeResources(
-          "remaining");
+          scheduling::NodeID("remaining"));
   RAY_LOG(INFO) << "The current local resource view: "
                 << cluster_resource_scheduler_->DebugString();
   CheckRemainingResourceCorrect(remaining_resource_instance);
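The test-file hunks above are mechanical: every `ClusterResourceScheduler(...)` constructor call and every `GetNodeResources` / `IsAvailableResourceEmpty` lookup now wraps its raw string in `scheduling::NodeID(...)` or `scheduling::ResourceID(...)`; no assertions change.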
@@ -18,6 +18,7 @@
 #include "ray/common/task/scheduling_resources.h"
 
 namespace ray {
+using namespace ::ray::scheduling;
 
 const std::string resource_labels[] = {
     ray::kCPU_ResourceLabel, ray::kMemory_ResourceLabel, ray::kGPU_ResourceLabel,

@@ -89,7 +90,6 @@ std::vector<double> VectorFixedPointToVectorDouble(
 
 /// Convert a map of resources to a ResourceRequest data structure.
 ResourceRequest ResourceMapToResourceRequest(
-    StringIdMap &string_to_int_map,
     const absl::flat_hash_map<std::string, double> &resource_map,
     bool requires_object_store_memory) {
   ResourceRequest resource_request;

@@ -107,8 +107,8 @@ ResourceRequest ResourceMapToResourceRequest(
     } else if (resource.first == ray::kMemory_ResourceLabel) {
       resource_request.predefined_resources[MEM] = resource.second;
     } else {
-      int64_t id = string_to_int_map.Insert(resource.first);
-      resource_request.custom_resources[id] = resource.second;
+      resource_request.custom_resources[ResourceID(resource.first).ToInt()] =
+          resource.second;
     }
   }
 
@@ -116,7 +116,7 @@ ResourceRequest ResourceMapToResourceRequest(
 }
 
 const std::vector<FixedPoint> &TaskResourceInstances::Get(
-    const std::string &resource_name, const StringIdMap &string_id_map) const {
+    const std::string &resource_name) const {
   if (ray::kCPU_ResourceLabel == resource_name) {
     return predefined_resources[CPU];
   } else if (ray::kGPU_ResourceLabel == resource_name) {

@@ -126,8 +126,7 @@ const std::vector<FixedPoint> &TaskResourceInstances::Get(
   } else if (ray::kMemory_ResourceLabel == resource_name) {
     return predefined_resources[MEM];
   } else {
-    int64_t resource_id = string_id_map.Get(resource_name);
-    auto it = custom_resources.find(resource_id);
+    auto it = custom_resources.find(ResourceID(resource_name).ToInt());
     RAY_CHECK(it != custom_resources.end());
     return it->second;
   }

@@ -162,7 +161,6 @@ ResourceRequest TaskResourceInstances::ToResourceRequest() const {
 ///
 /// \request Conversion result to a ResourceRequest data structure.
 NodeResources ResourceMapToNodeResources(
-    StringIdMap &string_to_int_map,
     const absl::flat_hash_map<std::string, double> &resource_map_total,
     const absl::flat_hash_map<std::string, double> &resource_map_available) {
   NodeResources node_resources;

@@ -191,7 +189,7 @@ NodeResources ResourceMapToNodeResources(
       node_resources.predefined_resources[MEM] = resource_capacity;
     } else {
       // This is a custom resource.
-      node_resources.custom_resources.emplace(string_to_int_map.Insert(resource.first),
+      node_resources.custom_resources.emplace(ResourceID(resource.first).ToInt(),
                                               resource_capacity);
     }
   }

@@ -325,7 +323,7 @@ bool NodeResources::operator==(const NodeResources &other) {
 
 bool NodeResources::operator!=(const NodeResources &other) { return !(*this == other); }
 
-std::string NodeResources::DebugString(StringIdMap string_to_in_map) const {
+std::string NodeResources::DebugString() const {
   std::stringstream buffer;
   buffer << " {\n";
   for (size_t i = 0; i < this->predefined_resources.size(); i++) {

@@ -352,14 +350,14 @@ std::string NodeResources::DebugString(StringIdMap string_to_in_map) const {
   }
   for (auto it = this->custom_resources.begin(); it != this->custom_resources.end();
        ++it) {
-    buffer << "\t" << string_to_in_map.Get(it->first) << ":(" << it->second.total << ":"
+    buffer << "\t" << ResourceID(it->first).Binary() << ":(" << it->second.total << ":"
           << it->second.available << ")\n";
   }
   buffer << "}" << std::endl;
   return buffer.str();
 }
 
-std::string NodeResources::DictString(StringIdMap string_to_in_map) const {
+std::string NodeResources::DictString() const {
   std::stringstream buffer;
   bool first = true;
   buffer << "{";

@@ -397,7 +395,7 @@ std::string NodeResources::DictString(StringIdMap string_to_in_map) const {
   }
   for (auto it = this->custom_resources.begin(); it != this->custom_resources.end();
        ++it) {
-    auto name = string_to_in_map.Get(it->first);
+    auto name = ResourceID(it->first).Binary();
     buffer << ", " << format_resource(name, it->second.available.Double()) << "/"
            << format_resource(name, it->second.total.Double());
     buffer << " " << name;

@@ -445,7 +443,7 @@ void TaskResourceInstances::ClearCPUInstances() {
   }
 }
 
-std::string NodeResourceInstances::DebugString(StringIdMap string_to_int_map) const {
+std::string NodeResourceInstances::DebugString() const {
   std::stringstream buffer;
   buffer << "{\n";
   for (size_t i = 0; i < this->predefined_resources.size(); i++) {

@@ -472,7 +470,7 @@ std::string NodeResourceInstances::DebugString(StringIdMap string_to_int_map) co
   }
   for (auto it = this->custom_resources.begin(); it != this->custom_resources.end();
        ++it) {
-    buffer << "\t" << string_to_int_map.Get(it->first) << ":("
+    buffer << "\t" << ResourceID(it->first).Binary() << ":("
           << VectorToString(it->second.total) << ":"
           << VectorToString(it->second.available) << ")\n";
   }

@@ -545,7 +543,7 @@ bool TaskResourceInstances::IsEmpty() const {
   return true;
 }
 
-std::string TaskResourceInstances::DebugString(const StringIdMap &string_id_map) const {
+std::string TaskResourceInstances::DebugString() const {
   std::stringstream buffer;
   buffer << std::endl << " Allocation: {";
   for (size_t i = 0; i < this->predefined_resources.size(); i++) {

@@ -556,7 +554,7 @@ std::string TaskResourceInstances::DebugString(const StringIdMap &string_id_map)
   buffer << " [";
   for (auto it = this->custom_resources.begin(); it != this->custom_resources.end();
        ++it) {
-    buffer << string_id_map.Get(it->first) << ":" << VectorToString(it->second) << ", ";
+    buffer << ResourceID(it->first).Binary() << ":" << VectorToString(it->second) << ", ";
   }
 
   buffer << "]" << std::endl;
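All of the hunks above make the same substitution: explicit interning through a threaded-in `StringIdMap` becomes implicit inside `ResourceID`. Schematically (a paraphrase of the change, not code from the tree):

    // before: the caller supplies the interning table
    int64_t id = string_to_int_map.Insert(resource.first);

    // after: construction interns; the table is hidden behind the ID type
    int64_t id = ResourceID(resource.first).ToInt();

The `DebugString` / `DictString` methods go the other direction, recovering the label with `ResourceID(it->first).Binary()` instead of `string_id_map.Get(it->first)`, which is what lets them drop their `StringIdMap` parameters entirely.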
@@ -27,9 +27,6 @@
 
 namespace ray {
 
-/// List of predefined resources.
-enum PredefinedResources { CPU, MEM, GPU, OBJECT_STORE_MEM, PredefinedResources_MAX };
-
 const std::string ResourceEnumToString(PredefinedResources resource);
 
 const PredefinedResources ResourceStringToEnum(const std::string &resource);

@@ -84,8 +81,7 @@ class TaskResourceInstances {
   absl::flat_hash_map<int64_t, std::vector<FixedPoint>> custom_resources;
   bool operator==(const TaskResourceInstances &other);
   /// Get instances based on the string.
-  const std::vector<FixedPoint> &Get(const std::string &resource_name,
-                                     const StringIdMap &string_id_map) const;
+  const std::vector<FixedPoint> &Get(const std::string &resource_name) const;
   /// For each resource of this request aggregate its instances.
   ResourceRequest ToResourceRequest() const;
   /// Get CPU instances only.

@@ -138,7 +134,7 @@ class TaskResourceInstances {
   /// Check whether there are no resource instances.
   bool IsEmpty() const;
   /// Returns human-readable string for these resources.
-  [[nodiscard]] std::string DebugString(const StringIdMap &string_id_map) const;
+  [[nodiscard]] std::string DebugString() const;
 };
 
 /// Total and available capacities of each resource of a node.

@@ -170,9 +166,9 @@ class NodeResources {
   bool operator==(const NodeResources &other);
   bool operator!=(const NodeResources &other);
   /// Returns human-readable string for these resources.
-  std::string DebugString(StringIdMap string_to_int_map) const;
+  std::string DebugString() const;
   /// Returns compact dict-like string.
-  std::string DictString(StringIdMap string_to_int_map) const;
+  std::string DictString() const;
 };
 
 /// Total and available capacities of each resource instance.

@@ -189,7 +185,7 @@ class NodeResourceInstances {
   /// Returns if this equals another node resources.
   bool operator==(const NodeResourceInstances &other);
   /// Returns human-readable string for these resources.
-  [[nodiscard]] std::string DebugString(StringIdMap string_to_int_map) const;
+  [[nodiscard]] std::string DebugString() const;
 };
 
 struct Node {

@@ -211,13 +207,11 @@ struct Node {
 
 /// \request Conversion result to a ResourceRequest data structure.
 NodeResources ResourceMapToNodeResources(
-    StringIdMap &string_to_int_map,
    const absl::flat_hash_map<std::string, double> &resource_map_total,
    const absl::flat_hash_map<std::string, double> &resource_map_available);
 
 /// Convert a map of resources to a ResourceRequest data structure.
 ResourceRequest ResourceMapToResourceRequest(
-    StringIdMap &string_to_int_map,
     const absl::flat_hash_map<std::string, double> &resource_map,
     bool requires_object_store_memory);
 
@@ -22,22 +22,21 @@
 
 namespace ray {
 
-ClusterResourceManager::ClusterResourceManager(StringIdMap &string_to_int_map)
-    : nodes_{}, string_to_int_map_{string_to_int_map} {}
+ClusterResourceManager::ClusterResourceManager() : nodes_{} {}
 
 void ClusterResourceManager::AddOrUpdateNode(
-    const std::string &node_id,
+    scheduling::NodeID node_id,
     const absl::flat_hash_map<std::string, double> &resources_total,
     const absl::flat_hash_map<std::string, double> &resources_available) {
-  NodeResources node_resources = ResourceMapToNodeResources(
-      string_to_int_map_, resources_total, resources_available);
-  AddOrUpdateNode(string_to_int_map_.Insert(node_id), node_resources);
+  NodeResources node_resources =
+      ResourceMapToNodeResources(resources_total, resources_available);
+  AddOrUpdateNode(node_id, node_resources);
 }
 
-void ClusterResourceManager::AddOrUpdateNode(int64_t node_id,
+void ClusterResourceManager::AddOrUpdateNode(scheduling::NodeID node_id,
                                              const NodeResources &node_resources) {
-  RAY_LOG(DEBUG) << "Update node info, node_id: " << node_id << ", node_resources: "
-                 << node_resources.DebugString(string_to_int_map_);
+  RAY_LOG(DEBUG) << "Update node info, node_id: " << node_id.ToInt()
+                 << ", node_resources: " << node_resources.DebugString();
   auto it = nodes_.find(node_id);
   if (it == nodes_.end()) {
     // This node is new, so add it to the map.

@@ -48,17 +47,16 @@ void ClusterResourceManager::AddOrUpdateNode(int64_t node_id,
   }
 }
 
-bool ClusterResourceManager::UpdateNode(const std::string &node_id_string,
+bool ClusterResourceManager::UpdateNode(scheduling::NodeID node_id,
                                         const rpc::ResourcesData &resource_data) {
-  auto node_id = string_to_int_map_.Insert(node_id_string);
   if (!nodes_.contains(node_id)) {
     return false;
   }
 
   auto resources_total = MapFromProtobuf(resource_data.resources_total());
   auto resources_available = MapFromProtobuf(resource_data.resources_available());
-  NodeResources node_resources = ResourceMapToNodeResources(
-      string_to_int_map_, resources_total, resources_available);
+  NodeResources node_resources =
+      ResourceMapToNodeResources(resources_total, resources_available);
   NodeResources local_view;
   RAY_CHECK(GetNodeResources(node_id, &local_view));
 
@@ -88,7 +86,7 @@ bool ClusterResourceManager::UpdateNode(const std::string &node_id_string,
   return true;
 }
 
-bool ClusterResourceManager::RemoveNode(int64_t node_id) {
+bool ClusterResourceManager::RemoveNode(scheduling::NodeID node_id) {
   auto it = nodes_.find(node_id);
   if (it == nodes_.end()) {
     // Node not found.

@@ -99,16 +97,7 @@ bool ClusterResourceManager::RemoveNode(int64_t node_id) {
   }
 }
 
-bool ClusterResourceManager::RemoveNode(const std::string &node_id_string) {
-  auto node_id = string_to_int_map_.Get(node_id_string);
-  if (node_id == -1) {
-    return false;
-  }
-
-  return RemoveNode(node_id);
-}
-
-bool ClusterResourceManager::GetNodeResources(int64_t node_id,
+bool ClusterResourceManager::GetNodeResources(scheduling::NodeID node_id,
                                               NodeResources *ret_resources) const {
   auto it = nodes_.find(node_id);
   if (it != nodes_.end()) {

@@ -120,28 +109,24 @@ bool ClusterResourceManager::GetNodeResources(int64_t node_id,
 }
 
 const NodeResources &ClusterResourceManager::GetNodeResources(
-    const std::string &node_name) const {
-  int64_t node_id = string_to_int_map_.Get(node_name);
+    scheduling::NodeID node_id) const {
   const auto &node = map_find_or_die(nodes_, node_id);
   return node.GetLocalView();
 }
 
 int64_t ClusterResourceManager::NumNodes() const { return nodes_.size(); }
 
-void ClusterResourceManager::UpdateResourceCapacity(const std::string &node_id_string,
-                                                    const std::string &resource_name,
+void ClusterResourceManager::UpdateResourceCapacity(scheduling::NodeID node_id,
+                                                    scheduling::ResourceID resource_id,
                                                     double resource_total) {
-  int64_t node_id = string_to_int_map_.Get(node_id_string);
-
   auto it = nodes_.find(node_id);
   if (it == nodes_.end()) {
     NodeResources node_resources;
     node_resources.predefined_resources.resize(PredefinedResources_MAX);
-    node_id = string_to_int_map_.Insert(node_id_string);
     it = nodes_.emplace(node_id, node_resources).first;
   }
 
-  int idx = GetPredefinedResourceIndex(resource_name);
+  int idx = GetPredefinedResourceIndex(resource_id);
 
   auto local_view = it->second.GetMutableLocalView();
   FixedPoint resource_total_fp(resource_total);

@@ -156,9 +141,7 @@ void ClusterResourceManager::UpdateResourceCapacity(const std::string &node_id_s
       local_view->predefined_resources[idx].total = 0;
     }
   } else {
-    string_to_int_map_.Insert(resource_name);
-    int64_t resource_id = string_to_int_map_.Get(resource_name);
-    auto itr = local_view->custom_resources.find(resource_id);
+    auto itr = local_view->custom_resources.find(resource_id.ToInt());
     if (itr != local_view->custom_resources.end()) {
       auto diff_capacity = resource_total_fp - itr->second.total;
       itr->second.total += diff_capacity;

@@ -172,29 +155,26 @@ void ClusterResourceManager::UpdateResourceCapacity(const std::string &node_id_s
     } else {
       ResourceCapacity resource_capacity;
       resource_capacity.total = resource_capacity.available = resource_total_fp;
-      local_view->custom_resources.emplace(resource_id, resource_capacity);
+      local_view->custom_resources.emplace(resource_id.ToInt(), resource_capacity);
     }
   }
 }
 
-void ClusterResourceManager::DeleteResource(const std::string &node_id_string,
-                                            const std::string &resource_name) {
-  int64_t node_id = string_to_int_map_.Get(node_id_string);
-
+void ClusterResourceManager::DeleteResource(scheduling::NodeID node_id,
+                                            scheduling::ResourceID resource_id) {
   auto it = nodes_.find(node_id);
   if (it == nodes_.end()) {
     return;
   }
 
-  int idx = GetPredefinedResourceIndex(resource_name);
+  int idx = GetPredefinedResourceIndex(resource_id);
   auto local_view = it->second.GetMutableLocalView();
   if (idx != -1) {
     local_view->predefined_resources[idx].available = 0;
     local_view->predefined_resources[idx].total = 0;
 
   } else {
-    int64_t resource_id = string_to_int_map_.Get(resource_name);
-    auto itr = local_view->custom_resources.find(resource_id);
+    auto itr = local_view->custom_resources.find(resource_id.ToInt());
     if (itr != local_view->custom_resources.end()) {
       local_view->custom_resources.erase(itr);
     }

@@ -202,33 +182,22 @@ void ClusterResourceManager::DeleteResource(const std::string &node_id_string,
 }
 
 std::string ClusterResourceManager::GetNodeResourceViewString(
-    const std::string &node_name) const {
-  int64_t node_id = string_to_int_map_.Get(node_name);
+    scheduling::NodeID node_id) const {
   const auto &node = map_find_or_die(nodes_, node_id);
-  return node.GetLocalView().DictString(string_to_int_map_);
+  return node.GetLocalView().DictString();
 }
 
 std::string ClusterResourceManager::GetResourceNameFromIndex(int64_t res_idx) {
-  if (res_idx == CPU) {
-    return ray::kCPU_ResourceLabel;
-  } else if (res_idx == GPU) {
-    return ray::kGPU_ResourceLabel;
-  } else if (res_idx == OBJECT_STORE_MEM) {
-    return ray::kObjectStoreMemory_ResourceLabel;
-  } else if (res_idx == MEM) {
-    return ray::kMemory_ResourceLabel;
-  } else {
-    return string_to_int_map_.Get((uint64_t)res_idx);
-  }
+  return scheduling::ResourceID(res_idx).Binary();
 }
 
-const absl::flat_hash_map<int64_t, Node> &ClusterResourceManager::GetResourceView()
-    const {
+const absl::flat_hash_map<scheduling::NodeID, Node>
+    &ClusterResourceManager::GetResourceView() const {
   return nodes_;
 }
 
 bool ClusterResourceManager::SubtractNodeAvailableResources(
-    int64_t node_id, const ResourceRequest &resource_request) {
+    scheduling::NodeID node_id, const ResourceRequest &resource_request) {
   auto it = nodes_.find(node_id);
   if (it == nodes_.end()) {
     return false;

@@ -260,7 +229,7 @@ bool ClusterResourceManager::SubtractNodeAvailableResources(
 }
 
 bool ClusterResourceManager::HasSufficientResource(
-    int64_t node_id, const ResourceRequest &resource_request,
+    scheduling::NodeID node_id, const ResourceRequest &resource_request,
     bool ignore_object_store_memory_requirement) const {
   auto it = nodes_.find(node_id);
   if (it == nodes_.end()) {

@@ -305,8 +274,8 @@ bool ClusterResourceManager::HasSufficientResource(
 
 void ClusterResourceManager::DebugString(std::stringstream &buffer) const {
   for (auto &node : GetResourceView()) {
-    buffer << "node id: " << node.first;
-    buffer << node.second.GetLocalView().DebugString(string_to_int_map_);
+    buffer << "node id: " << node.first.ToInt();
+    buffer << node.second.GetLocalView().DebugString();
   }
 }
 
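Note the collapse of `GetResourceNameFromIndex`: the old if/else chain over the CPU/GPU/memory/object-store labels becomes a single `scheduling::ResourceID(res_idx).Binary()`. That only works if the predefined resources are pre-registered at fixed integer IDs inside the hidden map, presumably part of the `BaseSchedulingID` setup this commit introduces.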
@ -37,10 +37,10 @@ class ClusterTaskManagerTest;
/// This class is not thread safe.
class ClusterResourceManager {
 public:
  explicit ClusterResourceManager(StringIdMap &string_to_int_map);
  explicit ClusterResourceManager();

  /// Get the resource view of the cluster.
  const absl::flat_hash_map<int64_t, Node> &GetResourceView() const;
  const absl::flat_hash_map<scheduling::NodeID, Node> &GetResourceView() const;

  // Mapping from predefined resource indexes to resource strings
  std::string GetResourceNameFromIndex(int64_t res_idx);

@ -49,14 +49,13 @@ class ClusterResourceManager {
  ///
  /// \param node_id_string ID of the node whose resources need to be updated.
  /// \param resource_data The node resource data.
  bool UpdateNode(const std::string &node_id_string,
                  const rpc::ResourcesData &resource_data);
  bool UpdateNode(scheduling::NodeID node_id, const rpc::ResourcesData &resource_data);

  /// Remove node from the cluster data structure. This happens
  /// when a node fails or it is removed from the cluster.
  ///
  /// \param node_id_string ID of the node to be removed.
  bool RemoveNode(const std::string &node_id_string);
  bool RemoveNode(scheduling::NodeID node_id);

  /// Get number of nodes in the cluster.
  int64_t NumNodes() const;
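Worth noting why hunks like this are pure call-site changes: once the typed ID can be constructed from either the binary string or the raw integer, one signature replaces the string/int overload pairs that used to coexist. A hypothetical sketch of that shape (the hash-based string constructor is a stand-in for real interning, not Ray's code):

#include <cassert>
#include <cstdint>
#include <functional>
#include <set>
#include <string>

// Stand-in typed ID, constructible from either representation.
class NodeID {
 public:
  explicit NodeID(int64_t id) : id_(id) {}
  explicit NodeID(const std::string &binary)
      : id_(static_cast<int64_t>(std::hash<std::string>{}(binary))) {}
  bool operator<(const NodeID &o) const { return id_ < o.id_; }

 private:
  int64_t id_;
};

class ClusterView {
 public:
  // One signature serves call sites that previously needed both
  // RemoveNode(const std::string &) and RemoveNode(int64_t).
  bool RemoveNode(NodeID node_id) { return nodes_.erase(node_id) > 0; }
  void AddNode(NodeID node_id) { nodes_.insert(node_id); }

 private:
  std::set<NodeID> nodes_;
};

int main() {
  ClusterView view;
  view.AddNode(NodeID(int64_t{7}));
  assert(view.RemoveNode(NodeID(int64_t{7})));
  assert(!view.RemoveNode(NodeID(std::string("some-binary-id"))));
}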
@ -66,24 +65,24 @@ class ClusterResourceManager {
  /// \param node_name: Node whose resource we want to update.
  /// \param resource_name: Resource which we want to update.
  /// \param resource_total: New capacity of the resource.
  void UpdateResourceCapacity(const std::string &node_name,
                              const std::string &resource_name, double resource_total);
  void UpdateResourceCapacity(scheduling::NodeID node_id,
                              scheduling::ResourceID resource_id, double resource_total);

  /// Delete a given resource from a given node.
  ///
  /// \param node_name: Node whose resource we want to delete.
  /// \param resource_name: Resource we want to delete
  void DeleteResource(const std::string &node_name, const std::string &resource_name);
  void DeleteResource(scheduling::NodeID node_id, scheduling::ResourceID resource_id);

  /// Return local resources in human-readable string form.
  std::string GetNodeResourceViewString(const std::string &node_name) const;
  std::string GetNodeResourceViewString(scheduling::NodeID node_id) const;

  /// Get local resource.
  const NodeResources &GetNodeResources(const std::string &node_name) const;
  const NodeResources &GetNodeResources(scheduling::NodeID node_id) const;

  /// Subtract available resource from a given node.
  /// Return false if such node doesn't exist.
  bool SubtractNodeAvailableResources(int64_t node_id,
  bool SubtractNodeAvailableResources(scheduling::NodeID node_id,
                                      const ResourceRequest &resource_request);

  /// Check if we have sufficient resource to fulfill a resource request for a given node.

@ -92,7 +91,8 @@ class ClusterResourceManager {
  /// \param resource_request: the request we want to check.
  /// \param ignore_object_store_memory_requirement: if true, we will ignore the
  /// require_object_store_memory in the resource_request.
  bool HasSufficientResource(int64_t node_id, const ResourceRequest &resource_request,
  bool HasSufficientResource(scheduling::NodeID node_id,
                             const ResourceRequest &resource_request,
                             bool ignore_object_store_memory_requirement) const;

  void DebugString(std::stringstream &buffer) const;

@ -104,29 +104,20 @@ class ClusterResourceManager {
  ///
  /// \param node_id: Node ID.
  /// \param node_resources: Up to date total and available resources of the node.
  void AddOrUpdateNode(int64_t node_id, const NodeResources &node_resources);
  void AddOrUpdateNode(scheduling::NodeID node_id, const NodeResources &node_resources);

  void AddOrUpdateNode(
      const std::string &node_id,
      scheduling::NodeID node_id,
      const absl::flat_hash_map<std::string, double> &resource_map_total,
      const absl::flat_hash_map<std::string, double> &resource_map_available);

  /// Remove node from the cluster data structure. This happens
  /// when a node fails or it is removed from the cluster.
  ///
  /// \param node_id ID of the node to be removed.
  bool RemoveNode(int64_t node_id);

  /// Return resources associated to the given node_id in ret_resources.
  /// If node_id not found, return false; otherwise return true.
  bool GetNodeResources(int64_t node_id, NodeResources *ret_resources) const;
  bool GetNodeResources(scheduling::NodeID node_id, NodeResources *ret_resources) const;

  /// List of nodes in the clusters and their resources organized as a map.
  /// The key of the map is the node ID.
  absl::flat_hash_map<int64_t, Node> nodes_;
  absl::flat_hash_map<scheduling::NodeID, Node> nodes_;
  /// Keep the mapping between node and resource IDs in string representation
  /// to integer representation. Used for improving map performance.
  StringIdMap &string_to_int_map_;

  friend class ClusterResourceSchedulerTest;
  friend struct ClusterResourceManagerTest;
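The nodes_ map's key changes from a bare int64_t to the typed ID, which only requires that the type be equality-comparable and hashable (absl::flat_hash_map would pick the hash up via an AbslHashValue overload; a std::hash specialization with std::unordered_map is shown here to keep the sketch dependency-free). An illustrative minimum, not Ray's implementation:

#include <cassert>
#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>

class NodeID {
 public:
  explicit NodeID(int64_t id) : id_(id) {}
  static NodeID Nil() { return NodeID(-1); }
  bool IsNil() const { return id_ == -1; }
  int64_t ToInt() const { return id_; }
  bool operator==(const NodeID &o) const { return id_ == o.id_; }

 private:
  int64_t id_;
};

namespace std {
template <>
struct hash<NodeID> {
  size_t operator()(const NodeID &id) const {
    return std::hash<int64_t>()(id.ToInt());
  }
};
}  // namespace std

int main() {
  std::unordered_map<NodeID, std::string> nodes;
  nodes.emplace(NodeID(0), "head");
  nodes.emplace(NodeID(1), "worker");
  assert(nodes.count(NodeID(1)) == 1);
  assert(nodes.count(NodeID::Nil()) == 0);  // the sentinel never aliases a real node
}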
@ -18,14 +18,14 @@

namespace ray {

NodeResources CreateNodeResources(StringIdMap &map, double available_cpu,
                                  double total_cpu, double available_custom_resource = 0,
NodeResources CreateNodeResources(double available_cpu, double total_cpu,
                                  double available_custom_resource = 0,
                                  double total_custom_resource = 0,
                                  bool object_pulls_queued = false) {
  NodeResources resources;
  resources.predefined_resources = {{available_cpu, total_cpu}, {0, 0}, {0, 0}, {0, 0}};
  resources.custom_resources[map.Insert("CUSTOM")] = {available_custom_resource,
                                                      total_custom_resource};
  resources.custom_resources[scheduling::ResourceID("CUSTOM").ToInt()] = {
      available_custom_resource, total_custom_resource};
  resources.object_pulls_queued = object_pulls_queued;
  return resources;
}
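The helper fills the NodeResources shape visible in this hunk: a fixed-size vector of {available, total} pairs indexed by the predefined enum, plus a map for custom resources keyed by integer ID. A reduced model (field shapes inferred from the diff, with a hypothetical CUSTOM id):

#include <cassert>
#include <cstdint>
#include <unordered_map>
#include <vector>

struct Capacity {
  double available;
  double total;
};

struct NodeResources {
  std::vector<Capacity> predefined_resources;  // indexed by CPU, GPU, ...
  std::unordered_map<int64_t, Capacity> custom_resources;
  bool object_pulls_queued = false;
};

NodeResources CreateNodeResources(double available_cpu, double total_cpu,
                                  double available_custom = 0,
                                  double total_custom = 0,
                                  bool object_pulls_queued = false) {
  NodeResources r;
  r.predefined_resources = {{available_cpu, total_cpu}, {0, 0}, {0, 0}, {0, 0}};
  r.custom_resources[4] = {available_custom, total_custom};  // hypothetical CUSTOM id
  r.object_pulls_queued = object_pulls_queued;
  return r;
}

int main() {
  auto r = CreateNodeResources(1, 1, /*available_custom*/ 1, /*total_custom*/ 1);
  assert(r.predefined_resources[0].total == 1);
  assert(r.custom_resources.at(4).total == 1);
}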
@ -33,22 +33,21 @@ NodeResources CreateNodeResources(StringIdMap &map, double available_cpu,
struct ClusterResourceManagerTest : public ::testing::Test {
  void SetUp() {
    ::testing::Test::SetUp();
    manager = std::make_unique<ClusterResourceManager>(map);
    manager = std::make_unique<ClusterResourceManager>();
    manager->AddOrUpdateNode(
        node0, CreateNodeResources(map, /*available_cpu*/ 1, /*total_cpu*/ 1));
    manager->AddOrUpdateNode(node0,
                             CreateNodeResources(/*available_cpu*/ 1, /*total_cpu*/ 1));
    manager->AddOrUpdateNode(
        node1, CreateNodeResources(map, /*available_cpu*/ 0, /*total_cpu*/ 0,
        node1, CreateNodeResources(/*available_cpu*/ 0, /*total_cpu*/ 0,
                                   /*available_custom*/ 1, /*total_custom*/ 1));
    manager->AddOrUpdateNode(
        node2, CreateNodeResources(map, /*available_cpu*/ 1, /*total_cpu*/ 1,
        node2, CreateNodeResources(/*available_cpu*/ 1, /*total_cpu*/ 1,
                                   /*available_custom*/ 1, /*total_custom*/ 1,
                                   /*object_pulls_queued*/ true));
  }
  StringIdMap map;
  int64_t node0 = 0;
  int64_t node1 = 1;
  int64_t node2 = 2;
  int64_t node3 = 3;
  scheduling::NodeID node0 = scheduling::NodeID(0);
  scheduling::NodeID node1 = scheduling::NodeID(1);
  scheduling::NodeID node2 = scheduling::NodeID(2);
  scheduling::NodeID node3 = scheduling::NodeID(3);
  std::unique_ptr<ClusterResourceManager> manager;
};
@ -57,32 +56,32 @@ TEST_F(ClusterResourceManagerTest, HasSufficientResourceTest) {
      node3, {}, /*ignore_object_store_memory_requirement*/ false));
  ASSERT_TRUE(manager->HasSufficientResource(
      node0,
      ResourceMapToResourceRequest(map, {{"CPU", 1}},
      ResourceMapToResourceRequest({{"CPU", 1}},
                                   /*requires_object_store_memory=*/true),
      /*ignore_object_store_memory_requirement*/ false));
  ASSERT_FALSE(manager->HasSufficientResource(
      node0,
      ResourceMapToResourceRequest(map, {{"CUSTOM", 1}},
      ResourceMapToResourceRequest({{"CUSTOM", 1}},
                                   /*requires_object_store_memory=*/true),
      /*ignore_object_store_memory_requirement*/ false));
  ASSERT_TRUE(manager->HasSufficientResource(
      node1,
      ResourceMapToResourceRequest(map, {{"CUSTOM", 1}},
      ResourceMapToResourceRequest({{"CUSTOM", 1}},
                                   /*requires_object_store_memory=*/true),
      /*ignore_object_store_memory_requirement*/ false));
  ASSERT_TRUE(manager->HasSufficientResource(
      node2,
      ResourceMapToResourceRequest(map, {{"CPU", 1}},
      ResourceMapToResourceRequest({{"CPU", 1}},
                                   /*requires_object_store_memory=*/false),
      /*ignore_object_store_memory_requirement*/ false));
  ASSERT_FALSE(manager->HasSufficientResource(
      node2,
      ResourceMapToResourceRequest(map, {{"CPU", 1}},
      ResourceMapToResourceRequest({{"CPU", 1}},
                                   /*requires_object_store_memory=*/true),
      /*ignore_object_store_memory_requirement*/ false));
  ASSERT_TRUE(manager->HasSufficientResource(
      node2,
      ResourceMapToResourceRequest(map, {{"CPU", 1}},
      ResourceMapToResourceRequest({{"CPU", 1}},
                                   /*requires_object_store_memory=*/true),
      /*ignore_object_store_memory_requirement*/ true));
}
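A compact model of what these assertions exercise (shapes assumed, not Ray's real types): a request fits if every demanded resource has enough available capacity, and object-store pressure on a node (node2 above has object_pulls_queued set) can veto feasibility unless the caller asks to ignore it:

#include <cassert>
#include <string>
#include <unordered_map>

bool HasSufficientResource(
    const std::unordered_map<std::string, double> &available,
    const std::unordered_map<std::string, double> &request,
    bool object_pulls_queued, bool requires_object_store_memory,
    bool ignore_object_store_memory_requirement) {
  if (requires_object_store_memory && object_pulls_queued &&
      !ignore_object_store_memory_requirement) {
    return false;  // the node is backed up pulling objects
  }
  for (const auto &[name, demand] : request) {
    auto it = available.find(name);
    if (it == available.end() || it->second < demand) return false;
  }
  return true;
}

int main() {
  std::unordered_map<std::string, double> node2 = {{"CPU", 1}, {"CUSTOM", 1}};
  // Mirrors the node2 cases above: feasible without the object-store check,
  // infeasible when pulls are queued and the check is enforced.
  assert(HasSufficientResource(node2, {{"CPU", 1}}, true, false, false));
  assert(!HasSufficientResource(node2, {{"CPU", 1}}, true, true, false));
  assert(HasSufficientResource(node2, {{"CPU", 1}}, true, true, true));
}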
@ -21,25 +21,13 @@

namespace ray {

namespace {
// Add predefined resource in string_to_int_map to
// avoid conflict.
void PopulatePredefinedResources(StringIdMap &string_to_int_map) {
  string_to_int_map.InsertOrDie(ray::kCPU_ResourceLabel, CPU)
      .InsertOrDie(ray::kGPU_ResourceLabel, GPU)
      .InsertOrDie(ray::kObjectStoreMemory_ResourceLabel, OBJECT_STORE_MEM)
      .InsertOrDie(ray::kMemory_ResourceLabel, MEM);
}
}  // namespace

ClusterResourceScheduler::ClusterResourceScheduler() {
  PopulatePredefinedResources(string_to_int_map_);
  cluster_resource_manager_ =
      std::make_unique<ClusterResourceManager>(string_to_int_map_);
ClusterResourceScheduler::ClusterResourceScheduler()
    : local_node_id_(scheduling::NodeID::Nil()) {
  cluster_resource_manager_ = std::make_unique<ClusterResourceManager>();
  NodeResources node_resources;
  node_resources.predefined_resources.resize(PredefinedResources_MAX);
  local_resource_manager_ = std::make_unique<LocalResourceManager>(
      local_node_id_, string_to_int_map_, node_resources,
      local_node_id_, node_resources,
      /*get_used_object_store_memory*/ nullptr, /*get_pull_manager_at_capacity*/ nullptr,
      [&](const NodeResources &local_resource_update) {
        cluster_resource_manager_->AddOrUpdateNode(local_node_id_, local_resource_update);
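PopulatePredefinedResources() can disappear because no constructor has to pre-seed a shared map any more. A sketch of the assumed behavior (not Ray's exact code): if the ID type reserves the low indexes for CPU/GPU/object_store_memory/memory at compile time and interns custom names above them, a freshly constructed scheduler already agrees with every other component on the predefined IDs:

#include <cassert>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

enum PredefinedResources { CPU, GPU, OBJECT_STORE_MEM, MEM, PredefinedResources_MAX };

class ResourceID {
 public:
  explicit ResourceID(const std::string &name) : id_(Lookup(name)) {}
  int64_t ToInt() const { return id_; }

 private:
  static int64_t Lookup(const std::string &name) {
    static const std::vector<std::string> kPredefined = {
        "CPU", "GPU", "object_store_memory", "memory"};
    for (int64_t i = 0; i < static_cast<int64_t>(kPredefined.size()); ++i) {
      if (kPredefined[i] == name) return i;
    }
    // Custom names intern lazily above the reserved range.
    static std::unordered_map<std::string, int64_t> custom;
    auto [it, inserted] = custom.emplace(
        name, PredefinedResources_MAX + static_cast<int64_t>(custom.size()));
    (void)inserted;
    return it->second;
  }
  int64_t id_;
};

int main() {
  assert(ResourceID("CPU").ToInt() == CPU);  // no seeding step required
  assert(ResourceID("custom").ToInt() >= PredefinedResources_MAX);
}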
@ -49,14 +37,12 @@ ClusterResourceScheduler::ClusterResourceScheduler() {
}

ClusterResourceScheduler::ClusterResourceScheduler(
    int64_t local_node_id, const NodeResources &local_node_resources,
    scheduling::NodeID local_node_id, const NodeResources &local_node_resources,
    gcs::GcsClient &gcs_client)
    : string_to_int_map_(), local_node_id_(local_node_id), gcs_client_(&gcs_client) {
    : local_node_id_(local_node_id), gcs_client_(&gcs_client) {
  PopulatePredefinedResources(string_to_int_map_);
  cluster_resource_manager_ =
      std::make_unique<ClusterResourceManager>(string_to_int_map_);
  cluster_resource_manager_ = std::make_unique<ClusterResourceManager>();
  local_resource_manager_ = std::make_unique<LocalResourceManager>(
      local_node_id, string_to_int_map_, local_node_resources,
      local_node_id, local_node_resources,
      /*get_used_object_store_memory*/ nullptr, /*get_pull_manager_at_capacity*/ nullptr,
      [&](const NodeResources &local_resource_update) {
        cluster_resource_manager_->AddOrUpdateNode(local_node_id_, local_resource_update);
@ -67,19 +53,16 @@ ClusterResourceScheduler::ClusterResourceScheduler(
}

ClusterResourceScheduler::ClusterResourceScheduler(
    const std::string &local_node_id,
    scheduling::NodeID local_node_id,
    const absl::flat_hash_map<std::string, double> &local_node_resources,
    gcs::GcsClient &gcs_client, std::function<int64_t(void)> get_used_object_store_memory,
    std::function<bool(void)> get_pull_manager_at_capacity)
    : string_to_int_map_(), local_node_id_(), gcs_client_(&gcs_client) {
    : local_node_id_(local_node_id), gcs_client_(&gcs_client) {
  PopulatePredefinedResources(string_to_int_map_);
  local_node_id_ = string_to_int_map_.Insert(local_node_id);
  NodeResources node_resources = ResourceMapToNodeResources(
      string_to_int_map_, local_node_resources, local_node_resources);
  cluster_resource_manager_ =
      std::make_unique<ClusterResourceManager>(string_to_int_map_);
  NodeResources node_resources =
      ResourceMapToNodeResources(local_node_resources, local_node_resources);
  cluster_resource_manager_ = std::make_unique<ClusterResourceManager>();
  local_resource_manager_ = std::make_unique<LocalResourceManager>(
      local_node_id_, string_to_int_map_, node_resources, get_used_object_store_memory,
      local_node_id_, node_resources, get_used_object_store_memory,
      get_pull_manager_at_capacity, [&](const NodeResources &local_resource_update) {
        cluster_resource_manager_->AddOrUpdateNode(local_node_id_, local_resource_update);
      });
@ -88,19 +71,18 @@ ClusterResourceScheduler::ClusterResourceScheduler(
      local_node_id_, cluster_resource_manager_->GetResourceView());
}

bool ClusterResourceScheduler::NodeAlive(int64_t node_id) const {
bool ClusterResourceScheduler::NodeAlive(scheduling::NodeID node_id) const {
  if (node_id == local_node_id_) {
    return true;
  }
  if (node_id == -1) {
  if (node_id.IsNil()) {
    return false;
  }
  auto node_id_binary = string_to_int_map_.Get(node_id);
  return gcs_client_->Nodes().Get(NodeID::FromBinary(node_id_binary)) != nullptr;
  return gcs_client_->Nodes().Get(NodeID::FromBinary(node_id.Binary())) != nullptr;
}
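This is the commit's core idea in miniature: the string-to-int interning table still exists, but it hides behind the ID type, so NodeAlive can convert with node_id.Binary() instead of consulting a StringIdMap member. A hypothetical minimal NodeID that interns through a static table (illustrative only, not Ray's scheduling::NodeID):

#include <cassert>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

class NodeID {
 public:
  explicit NodeID(const std::string &binary) : id_(Intern(binary)) {}
  int64_t ToInt() const { return id_; }
  std::string Binary() const { return Table().names[id_]; }
  bool operator==(const NodeID &o) const { return id_ == o.id_; }

 private:
  struct Interner {
    std::unordered_map<std::string, int64_t> ids;
    std::vector<std::string> names;
  };
  static Interner &Table() {
    static Interner table;
    return table;
  }
  static int64_t Intern(const std::string &binary) {
    auto &t = Table();
    auto [it, inserted] = t.ids.emplace(binary, static_cast<int64_t>(t.names.size()));
    if (inserted) t.names.push_back(binary);
    return it->second;
  }
  int64_t id_;
};

int main() {
  NodeID a("raylet-1"), b("raylet-1");
  assert(a == b);                    // equal strings intern to the same integer
  assert(a.Binary() == "raylet-1");  // and round-trip back, as NodeAlive relies on
}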
bool ClusterResourceScheduler::IsSchedulable(const ResourceRequest &resource_request,
                                             int64_t node_id) const {
                                             scheduling::NodeID node_id) const {
  // It's okay if the local node's pull manager is at capacity because we
  // will eventually spill the task back from the waiting queue if its args
  // cannot be pulled.
@ -109,7 +91,7 @@ bool ClusterResourceScheduler::IsSchedulable(const ResourceRequest &resource_req
      /*ignore_object_store_memory_requirement*/ node_id == local_node_id_);
}

int64_t ClusterResourceScheduler::GetBestSchedulableNode(
scheduling::NodeID ClusterResourceScheduler::GetBestSchedulableNode(
    const ResourceRequest &resource_request,
    const rpc::SchedulingStrategy &scheduling_strategy, bool actor_creation,
    bool force_spillback, int64_t *total_violations, bool *is_infeasible) {

@ -120,7 +102,7 @@ int64_t ClusterResourceScheduler::GetBestSchedulableNode(
        resource_request, [this](auto node_id) { return this->NodeAlive(node_id); });
  }

  int64_t best_node_id = -1;
  auto best_node_id = scheduling::NodeID::Nil();
  if (scheduling_strategy.scheduling_strategy_case() ==
      rpc::SchedulingStrategy::SchedulingStrategyCase::kSpreadSchedulingStrategy) {
    best_node_id = scheduling_policy_->SpreadPolicy(

@ -135,7 +117,7 @@ int64_t ClusterResourceScheduler::GetBestSchedulableNode(
        [this](auto node_id) { return this->NodeAlive(node_id); });
  }

  *is_infeasible = best_node_id == -1 ? true : false;
  *is_infeasible = best_node_id.IsNil();
  if (!*is_infeasible) {
    // TODO (Alex): Support soft constraints if needed later.
    *total_violations = 0;

@ -143,35 +125,26 @@ int64_t ClusterResourceScheduler::GetBestSchedulableNode(

  RAY_LOG(DEBUG) << "Scheduling decision. "
                 << "forcing spillback: " << force_spillback
                 << ". Best node: " << best_node_id << " "
                 << (string_to_int_map_.Get(best_node_id) == "-1"
                         ? NodeID::Nil()
                         : NodeID::FromBinary(string_to_int_map_.Get(best_node_id)))
                 << ". Best node: " << best_node_id.ToInt() << " "
                 << (best_node_id.IsNil() ? NodeID::Nil()
                                          : NodeID::FromBinary(best_node_id.Binary()))
                 << ", is infeasible: " << *is_infeasible;
  return best_node_id;
}

std::string ClusterResourceScheduler::GetBestSchedulableNode(
scheduling::NodeID ClusterResourceScheduler::GetBestSchedulableNode(
    const absl::flat_hash_map<std::string, double> &task_resources,
    const rpc::SchedulingStrategy &scheduling_strategy, bool requires_object_store_memory,
    bool actor_creation, bool force_spillback, int64_t *total_violations,
    bool *is_infeasible) {
  ResourceRequest resource_request = ResourceMapToResourceRequest(
      string_to_int_map_, task_resources, requires_object_store_memory);
  int64_t node_id =
      GetBestSchedulableNode(resource_request, scheduling_strategy, actor_creation,
                             force_spillback, total_violations, is_infeasible);

  if (node_id == -1) {
    // This is not a schedulable node, so return empty string.
    return "";
  }
  // Return the string name of the node.
  return string_to_int_map_.Get(node_id);
  ResourceRequest resource_request =
      ResourceMapToResourceRequest(task_resources, requires_object_store_memory);
  return GetBestSchedulableNode(resource_request, scheduling_strategy, actor_creation,
                                force_spillback, total_violations, is_infeasible);
}
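Both overloads previously signaled "no feasible node" with different magic values (-1 in one, "" in the other); the typed Nil() sentinel lets them share one return type and one failure value. A minimal sketch of the pattern (hypothetical NodeID, shown only for the sentinel):

#include <cassert>
#include <cstdint>

class NodeID {
 public:
  static NodeID Nil() { return NodeID(-1); }
  explicit NodeID(int64_t id) : id_(id) {}
  bool IsNil() const { return id_ == -1; }

 private:
  int64_t id_;
};

// One return type and one failure value for every scheduling path.
NodeID GetBestSchedulableNode(bool feasible) {
  return feasible ? NodeID(42) : NodeID::Nil();
}

int main() {
  bool is_infeasible = GetBestSchedulableNode(false).IsNil();
  assert(is_infeasible);
  assert(!GetBestSchedulableNode(true).IsNil());
}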
bool ClusterResourceScheduler::SubtractRemoteNodeAvailableResources(
    int64_t node_id, const ResourceRequest &resource_request) {
    scheduling::NodeID node_id, const ResourceRequest &resource_request) {
  RAY_CHECK(node_id != local_node_id_);

  // Just double check this node can still schedule the resource request.

@ -182,46 +155,40 @@ bool ClusterResourceScheduler::SubtractRemoteNodeAvailableResources(
      resource_request);
}

const StringIdMap &ClusterResourceScheduler::GetStringIdMap() const {
  return string_to_int_map_;
}

std::string ClusterResourceScheduler::DebugString(void) const {
  std::stringstream buffer;
  buffer << "\nLocal id: " << local_node_id_;
  buffer << "\nLocal id: " << local_node_id_.ToInt();
  buffer << " Local resources: " << local_resource_manager_->DebugString();
  cluster_resource_manager_->DebugString(buffer);
  return buffer.str();
}

bool ClusterResourceScheduler::AllocateRemoteTaskResources(
    const std::string &node_string,
    scheduling::NodeID node_id,
    const absl::flat_hash_map<std::string, double> &task_resources) {
  ResourceRequest resource_request = ResourceMapToResourceRequest(
      string_to_int_map_, task_resources, /*requires_object_store_memory=*/false);
  auto node_id = string_to_int_map_.Insert(node_string);
      task_resources, /*requires_object_store_memory=*/false);
  RAY_CHECK(node_id != local_node_id_);
  return SubtractRemoteNodeAvailableResources(node_id, resource_request);
}

bool ClusterResourceScheduler::IsSchedulableOnNode(
    const std::string &node_name, const absl::flat_hash_map<std::string, double> &shape) {
    scheduling::NodeID node_id, const absl::flat_hash_map<std::string, double> &shape) {
  int64_t node_id = string_to_int_map_.Get(node_name);
  auto resource_request = ResourceMapToResourceRequest(
      string_to_int_map_, shape, /*requires_object_store_memory=*/false);
  auto resource_request =
      ResourceMapToResourceRequest(shape, /*requires_object_store_memory=*/false);
  return IsSchedulable(resource_request, node_id);
}

std::string ClusterResourceScheduler::GetBestSchedulableNode(
scheduling::NodeID ClusterResourceScheduler::GetBestSchedulableNode(
    const TaskSpecification &task_spec, bool prioritize_local_node,
    bool exclude_local_node, bool requires_object_store_memory, bool *is_infeasible) {
  // If the local node is available, we should directly return it instead of
  // going through the full hybrid policy since we don't want spillback.
  if (prioritize_local_node && !exclude_local_node &&
      IsSchedulableOnNode(string_to_int_map_.Get(local_node_id_),
      IsSchedulableOnNode(local_node_id_,
                          task_spec.GetRequiredResources().GetResourceMap())) {
    *is_infeasible = false;
    return string_to_int_map_.Get(local_node_id_);
    return local_node_id_;
  }

  // This argument is used to set violation, which is an unsupported feature now.
|
@ -50,18 +50,17 @@ class ClusterResourceScheduler {
|
||||||
/// \param local_node_id: ID of local node,
|
/// \param local_node_id: ID of local node,
|
||||||
/// \param local_node_resources: The total and the available resources associated
|
/// \param local_node_resources: The total and the available resources associated
|
||||||
/// with the local node.
|
/// with the local node.
|
||||||
ClusterResourceScheduler(int64_t local_node_id,
|
ClusterResourceScheduler(scheduling::NodeID local_node_id,
|
||||||
const NodeResources &local_node_resources,
|
const NodeResources &local_node_resources,
|
||||||
gcs::GcsClient &gcs_client);
|
gcs::GcsClient &gcs_client);
|
||||||
|
|
||||||
ClusterResourceScheduler(
|
ClusterResourceScheduler(
|
||||||
const std::string &local_node_id,
|
scheduling::NodeID local_node_id,
|
||||||
const absl::flat_hash_map<std::string, double> &local_node_resources,
|
const absl::flat_hash_map<std::string, double> &local_node_resources,
|
||||||
gcs::GcsClient &gcs_client,
|
gcs::GcsClient &gcs_client,
|
||||||
std::function<int64_t(void)> get_used_object_store_memory = nullptr,
|
std::function<int64_t(void)> get_used_object_store_memory = nullptr,
|
||||||
std::function<bool(void)> get_pull_manager_at_capacity = nullptr);
|
std::function<bool(void)> get_pull_manager_at_capacity = nullptr);
|
||||||
|
|
||||||
const StringIdMap &GetStringIdMap() const;
|
|
||||||
|
|
||||||
/// Find a node in the cluster on which we can schedule a given resource request.
|
/// Find a node in the cluster on which we can schedule a given resource request.
|
||||||
/// In hybrid mode, see `scheduling_policy.h` for a description of the policy.
|
/// In hybrid mode, see `scheduling_policy.h` for a description of the policy.
|
||||||
///
|
///
|
||||||
|
@ -76,10 +75,11 @@ class ClusterResourceScheduler {
|
||||||
///
|
///
|
||||||
/// \return emptry string, if no node can schedule the current request; otherwise,
|
/// \return emptry string, if no node can schedule the current request; otherwise,
|
||||||
/// return the string name of a node that can schedule the resource request.
|
/// return the string name of a node that can schedule the resource request.
|
||||||
std::string GetBestSchedulableNode(const TaskSpecification &task_spec,
|
scheduling::NodeID GetBestSchedulableNode(const TaskSpecification &task_spec,
|
||||||
bool prioritize_local_node, bool exclude_local_node,
|
bool prioritize_local_node,
|
||||||
bool requires_object_store_memory,
|
bool exclude_local_node,
|
||||||
bool *is_infeasible);
|
bool requires_object_store_memory,
|
||||||
|
bool *is_infeasible);
|
||||||
|
|
||||||
/// Subtract the resources required by a given resource request (resource_request) from
|
/// Subtract the resources required by a given resource request (resource_request) from
|
||||||
/// a given remote node.
|
/// a given remote node.
|
||||||
|
@ -89,7 +89,7 @@ class ClusterResourceScheduler {
|
||||||
/// \return True if remote node has enough resources to satisfy the resource request.
|
/// \return True if remote node has enough resources to satisfy the resource request.
|
||||||
/// False otherwise.
|
/// False otherwise.
|
||||||
bool AllocateRemoteTaskResources(
|
bool AllocateRemoteTaskResources(
|
||||||
const std::string &node_id,
|
scheduling::NodeID node_id,
|
||||||
const absl::flat_hash_map<std::string, double> &task_resources);
|
const absl::flat_hash_map<std::string, double> &task_resources);
|
||||||
|
|
||||||
/// Return human-readable string for this scheduler state.
|
/// Return human-readable string for this scheduler state.
|
||||||
|
@ -100,7 +100,7 @@ class ClusterResourceScheduler {
|
||||||
///
|
///
|
||||||
/// \param node_name Name of the node.
|
/// \param node_name Name of the node.
|
||||||
/// \param shape The resource demand's shape.
|
/// \param shape The resource demand's shape.
|
||||||
bool IsSchedulableOnNode(const std::string &node_name,
|
bool IsSchedulableOnNode(scheduling::NodeID node_id,
|
||||||
const absl::flat_hash_map<std::string, double> &shape);
|
const absl::flat_hash_map<std::string, double> &shape);
|
||||||
|
|
||||||
LocalResourceManager &GetLocalResourceManager() { return *local_resource_manager_; }
|
LocalResourceManager &GetLocalResourceManager() { return *local_resource_manager_; }
|
||||||
|
@ -109,7 +109,7 @@ class ClusterResourceScheduler {
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool NodeAlive(int64_t node_id) const;
|
bool NodeAlive(scheduling::NodeID node_id) const;
|
||||||
|
|
||||||
/// Decrease the available resources of a node when a resource request is
|
/// Decrease the available resources of a node when a resource request is
|
||||||
/// scheduled on the given node.
|
/// scheduled on the given node.
|
||||||
|
@ -119,7 +119,7 @@ class ClusterResourceScheduler {
|
||||||
///
|
///
|
||||||
/// \return true, if resource_request can be indeed scheduled on the node,
|
/// \return true, if resource_request can be indeed scheduled on the node,
|
||||||
/// and false otherwise.
|
/// and false otherwise.
|
||||||
bool SubtractRemoteNodeAvailableResources(int64_t node_id,
|
bool SubtractRemoteNodeAvailableResources(scheduling::NodeID node_id,
|
||||||
const ResourceRequest &resource_request);
|
const ResourceRequest &resource_request);
|
||||||
|
|
||||||
/// Check whether a resource request can be scheduled given a node.
|
/// Check whether a resource request can be scheduled given a node.
|
||||||
|
@ -128,7 +128,8 @@ class ClusterResourceScheduler {
|
||||||
/// \param node_id: ID of the node.
|
/// \param node_id: ID of the node.
|
||||||
///
|
///
|
||||||
/// \return: Whether the request can be scheduled.
|
/// \return: Whether the request can be scheduled.
|
||||||
bool IsSchedulable(const ResourceRequest &resource_request, int64_t node_id) const;
|
bool IsSchedulable(const ResourceRequest &resource_request,
|
||||||
|
scheduling::NodeID node_id) const;
|
||||||
|
|
||||||
/// Find a node in the cluster on which we can schedule a given resource request.
|
/// Find a node in the cluster on which we can schedule a given resource request.
|
||||||
/// In hybrid mode, see `scheduling_policy.h` for a description of the policy.
|
/// In hybrid mode, see `scheduling_policy.h` for a description of the policy.
|
||||||
|
@ -145,10 +146,10 @@ class ClusterResourceScheduler {
|
||||||
///
|
///
|
||||||
/// \return -1, if no node can schedule the current request; otherwise,
|
/// \return -1, if no node can schedule the current request; otherwise,
|
||||||
/// return the ID of a node that can schedule the resource request.
|
/// return the ID of a node that can schedule the resource request.
|
||||||
int64_t GetBestSchedulableNode(const ResourceRequest &resource_request,
|
scheduling::NodeID GetBestSchedulableNode(
|
||||||
const rpc::SchedulingStrategy &scheduling_strategy,
|
const ResourceRequest &resource_request,
|
||||||
bool actor_creation, bool force_spillback,
|
const rpc::SchedulingStrategy &scheduling_strategy, bool actor_creation,
|
||||||
int64_t *violations, bool *is_infeasible);
|
bool force_spillback, int64_t *violations, bool *is_infeasible);
|
||||||
|
|
||||||
/// Similar to
|
/// Similar to
|
||||||
/// int64_t GetBestSchedulableNode(...)
|
/// int64_t GetBestSchedulableNode(...)
|
||||||
|
@ -156,17 +157,14 @@ class ClusterResourceScheduler {
|
||||||
/// \return "", if no node can schedule the current request; otherwise,
|
/// \return "", if no node can schedule the current request; otherwise,
|
||||||
/// return the ID in string format of a node that can schedule the
|
/// return the ID in string format of a node that can schedule the
|
||||||
// resource request.
|
// resource request.
|
||||||
std::string GetBestSchedulableNode(
|
scheduling::NodeID GetBestSchedulableNode(
|
||||||
const absl::flat_hash_map<std::string, double> &resource_request,
|
const absl::flat_hash_map<std::string, double> &resource_request,
|
||||||
const rpc::SchedulingStrategy &scheduling_strategy,
|
const rpc::SchedulingStrategy &scheduling_strategy,
|
||||||
bool requires_object_store_memory, bool actor_creation, bool force_spillback,
|
bool requires_object_store_memory, bool actor_creation, bool force_spillback,
|
||||||
int64_t *violations, bool *is_infeasible);
|
int64_t *violations, bool *is_infeasible);
|
||||||
|
|
||||||
/// Keep the mapping between node and resource IDs in string representation
|
|
||||||
/// to integer representation. Used for improving map performance.
|
|
||||||
StringIdMap string_to_int_map_;
|
|
||||||
/// Identifier of local node.
|
/// Identifier of local node.
|
||||||
int64_t local_node_id_;
|
scheduling::NodeID local_node_id_;
|
||||||
/// Gcs client. It's not owned by this class.
|
/// Gcs client. It's not owned by this class.
|
||||||
gcs::GcsClient *gcs_client_;
|
gcs::GcsClient *gcs_client_;
|
||||||
/// Resources of local node.
|
/// Resources of local node.
|
||||||
|
|
|
@ -185,19 +185,19 @@ class ClusterResourceSchedulerTest : public ::testing::Test {

      initNodeResources(node_resources, pred_capacities, cust_ids, cust_capacities);

      resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(i, node_resources);
      resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(
          scheduling::NodeID(i), node_resources);

      node_resources.custom_resources.clear();
    }
  }

  void AssertPredefinedNodeResources(const ClusterResourceScheduler &resource_scheduler) {
    ASSERT_EQ(resource_scheduler.string_to_int_map_.Get(ray::kCPU_ResourceLabel), CPU);
    ASSERT_EQ(resource_scheduler.string_to_int_map_.Get(ray::kGPU_ResourceLabel), GPU);
    ASSERT_EQ(
        resource_scheduler.string_to_int_map_.Get(ray::kObjectStoreMemory_ResourceLabel),
        OBJECT_STORE_MEM);
    ASSERT_EQ(resource_scheduler.string_to_int_map_.Get(ray::kMemory_ResourceLabel), MEM);
  void AssertPredefinedNodeResources() {
    ASSERT_EQ(ray::kCPU_ResourceLabel, scheduling::ResourceID(CPU).Binary());
    ASSERT_EQ(ray::kGPU_ResourceLabel, scheduling::ResourceID(GPU).Binary());
    ASSERT_EQ(ray::kObjectStoreMemory_ResourceLabel,
              scheduling::ResourceID(OBJECT_STORE_MEM).Binary());
    ASSERT_EQ(ray::kMemory_ResourceLabel, scheduling::ResourceID(MEM).Binary());
  }
  std::unique_ptr<gcs::MockGcsClient> gcs_client_;
  std::string node_name;
@ -313,7 +313,7 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingIdInsertOrDieTest) {
TEST_F(ClusterResourceSchedulerTest, SchedulingInitClusterTest) {
  int num_nodes = 10;
  ClusterResourceScheduler resource_scheduler;
  AssertPredefinedNodeResources(resource_scheduler);
  AssertPredefinedNodeResources();

  initCluster(resource_scheduler, num_nodes);

@ -327,7 +327,8 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingDeleteClusterNodeTest) {
  ClusterResourceScheduler resource_scheduler;

  initCluster(resource_scheduler, num_nodes);
  resource_scheduler.GetClusterResourceManager().RemoveNode(remove_id);
  resource_scheduler.GetClusterResourceManager().RemoveNode(
      scheduling::NodeID(remove_id));

  ASSERT_TRUE(num_nodes - 1 == resource_scheduler.GetClusterResourceManager().NumNodes());
}
@ -361,19 +362,19 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingModifyClusterNodeTest) {
    cust_capacities.push_back(rand() % 10);

    initNodeResources(node_resources, pred_capacities, cust_ids, cust_capacities);
    resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(update_id,
                                                                   node_resources);
    resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(
        scheduling::NodeID(update_id), node_resources);
  }
  ASSERT_TRUE(num_nodes == resource_scheduler.GetClusterResourceManager().NumNodes());
}

TEST_F(ClusterResourceSchedulerTest, SpreadSchedulingStrategyTest) {
  absl::flat_hash_map<std::string, double> resource_total({{"CPU", 10}});
  auto local_node_id = NodeID::FromRandom().Binary();
  auto local_node_id = scheduling::NodeID(NodeID::FromRandom().Binary());
  ClusterResourceScheduler resource_scheduler(local_node_id, resource_total,
                                              *gcs_client_);
  AssertPredefinedNodeResources(resource_scheduler);
  AssertPredefinedNodeResources();
  auto remote_node_id = NodeID::FromRandom().Binary();
  auto remote_node_id = scheduling::NodeID(NodeID::FromRandom().Binary());
  resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(
      remote_node_id, resource_total, resource_total);

@ -382,17 +383,17 @@ TEST_F(ClusterResourceSchedulerTest, SpreadSchedulingStrategyTest) {
  bool is_infeasible;
  rpc::SchedulingStrategy scheduling_strategy;
  scheduling_strategy.mutable_spread_scheduling_strategy();
  std::string node_id_1 = resource_scheduler.GetBestSchedulableNode(
  auto node_id_1 = resource_scheduler.GetBestSchedulableNode(
      resource_request, scheduling_strategy, false, false, false, &violations,
      &is_infeasible);
  absl::flat_hash_map<std::string, double> resource_available({{"CPU", 9}});
  resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(
      node_id_1, resource_total, resource_available);
  std::string node_id_2 = resource_scheduler.GetBestSchedulableNode(
  auto node_id_2 = resource_scheduler.GetBestSchedulableNode(
      resource_request, scheduling_strategy, false, false, false, &violations,
      &is_infeasible);
  ASSERT_EQ((std::set<std::string>{node_id_1, node_id_2}),
            (std::set<std::string>{local_node_id, remote_node_id}));
  ASSERT_EQ((std::set<scheduling::NodeID>{node_id_1, node_id_2}),
            (std::set<scheduling::NodeID>{local_node_id, remote_node_id}));
}

TEST_F(ClusterResourceSchedulerTest, SchedulingUpdateAvailableResourcesTest) {
@ -402,8 +403,9 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingUpdateAvailableResourcesTest) {
  vector<int64_t> cust_ids{1, 2};
  vector<FixedPoint> cust_capacities{5, 5};
  initNodeResources(node_resources, pred_capacities, cust_ids, cust_capacities);
  ClusterResourceScheduler resource_scheduler(1, node_resources, *gcs_client_);
  AssertPredefinedNodeResources(resource_scheduler);
  ClusterResourceScheduler resource_scheduler(scheduling::NodeID(1), node_resources,
                                              *gcs_client_);
  AssertPredefinedNodeResources();

  {
    ResourceRequest resource_request;

@ -416,9 +418,9 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingUpdateAvailableResourcesTest) {
    bool is_infeasible;
    rpc::SchedulingStrategy scheduling_strategy;
    scheduling_strategy.mutable_default_scheduling_strategy();
    int64_t node_id = resource_scheduler.GetBestSchedulableNode(
    auto node_id = resource_scheduler.GetBestSchedulableNode(
        resource_request, scheduling_strategy, false, false, &violations, &is_infeasible);
    ASSERT_EQ(node_id, 1);
    ASSERT_EQ(node_id.ToInt(), 1);
    ASSERT_TRUE(violations == 0);

    NodeResources nr1, nr2;
@ -455,23 +457,23 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingUpdateTotalResourcesTest) {
  absl::flat_hash_map<std::string, double> initial_resources = {
      {ray::kCPU_ResourceLabel, 1}, {"custom", 1}};
  std::string name = NodeID::FromRandom().Binary();
  ClusterResourceScheduler resource_scheduler(name, initial_resources, *gcs_client_,
                                              nullptr, nullptr);
  ClusterResourceScheduler resource_scheduler(scheduling::NodeID(name), initial_resources,
                                              *gcs_client_, nullptr, nullptr);

  resource_scheduler.GetLocalResourceManager().AddLocalResourceInstances(
      ray::kCPU_ResourceLabel, {0, 1, 1});
      scheduling::ResourceID(ray::kCPU_ResourceLabel), {0, 1, 1});
  resource_scheduler.GetLocalResourceManager().AddLocalResourceInstances("custom",
                                                                         {0, 1, 1});
  resource_scheduler.GetLocalResourceManager().AddLocalResourceInstances(
      scheduling::ResourceID("custom"), {0, 1, 1});

  const auto &predefined_resources = resource_scheduler.GetClusterResourceManager()
                                         .GetNodeResources(name)
                                         .GetNodeResources(scheduling::NodeID(name))
                                         .predefined_resources;
  ASSERT_EQ(predefined_resources[CPU].total.Double(), 3);

  const auto &custom_resources = resource_scheduler.GetClusterResourceManager()
                                     .GetNodeResources(name)
                                     .GetNodeResources(scheduling::NodeID(name))
                                     .custom_resources;
  auto resource_id = resource_scheduler.string_to_int_map_.Get("custom");
  auto resource_id = scheduling::ResourceID("custom").ToInt();
  ASSERT_EQ(custom_resources.find(resource_id)->second.total.Double(), 3);
}
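The arithmetic behind the assertions above: the node starts with 1 CPU, then three more instance slots of sizes 0, 1, and 1 are added, so the total becomes 3. A tiny model of that accumulation (hypothetical helper with assumed semantics, not Ray's implementation):

#include <cassert>
#include <numeric>
#include <vector>

double AddInstances(double total, const std::vector<double> &instances) {
  return std::accumulate(instances.begin(), instances.end(), total);
}

int main() {
  // 1 initial CPU plus instance capacities {0, 1, 1} yields a total of 3.
  assert(AddInstances(1.0, {0, 1, 1}) == 3.0);
}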
@ -487,13 +489,14 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingAddOrUpdateNodeTest) {
    vector<int64_t> cust_ids{1, 2};
    vector<FixedPoint> cust_capacities{5, 5};
    initNodeResources(node_resources, pred_capacities, cust_ids, cust_capacities);
    resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(node_id,
                                                                   node_resources);
    resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(
        scheduling::NodeID(node_id), node_resources);
    nr = node_resources;
  }

  // Check whether node resources were correctly added.
  if (resource_scheduler.GetClusterResourceManager().GetNodeResources(node_id, &nr_out)) {
  if (resource_scheduler.GetClusterResourceManager().GetNodeResources(
          scheduling::NodeID(node_id), &nr_out)) {
    ASSERT_TRUE(nodeResourcesEqual(nr, nr_out));
  } else {
    ASSERT_TRUE(false);

@ -506,11 +509,12 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingAddOrUpdateNodeTest) {
    vector<int64_t> cust_ids{2, 3};
    vector<FixedPoint> cust_capacities{6, 6};
    initNodeResources(node_resources, pred_capacities, cust_ids, cust_capacities);
    resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(node_id,
                                                                   node_resources);
    resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(
        scheduling::NodeID(node_id), node_resources);
    nr = node_resources;
  }
  if (resource_scheduler.GetClusterResourceManager().GetNodeResources(node_id, &nr_out)) {
  if (resource_scheduler.GetClusterResourceManager().GetNodeResources(
          scheduling::NodeID(node_id), &nr_out)) {
    ASSERT_TRUE(nodeResourcesEqual(nr, nr_out));
  } else {
    ASSERT_TRUE(false);
@ -524,9 +528,9 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingResourceRequestTest) {
|
||||||
vector<int64_t> cust_ids{1};
|
vector<int64_t> cust_ids{1};
|
||||||
vector<FixedPoint> cust_capacities{10};
|
vector<FixedPoint> cust_capacities{10};
|
||||||
initNodeResources(node_resources, pred_capacities, cust_ids, cust_capacities);
|
initNodeResources(node_resources, pred_capacities, cust_ids, cust_capacities);
|
||||||
ClusterResourceScheduler resource_scheduler(0, node_resources, *gcs_client_);
|
ClusterResourceScheduler resource_scheduler(scheduling::NodeID(0), node_resources,
|
||||||
|
*gcs_client_);
|
||||||
auto node_id = NodeID::FromRandom();
|
auto node_id = NodeID::FromRandom();
|
||||||
auto node_internal_id = resource_scheduler.string_to_int_map_.Insert(node_id.Binary());
|
|
||||||
rpc::SchedulingStrategy scheduling_strategy;
|
rpc::SchedulingStrategy scheduling_strategy;
|
||||||
scheduling_strategy.mutable_default_scheduling_strategy();
|
scheduling_strategy.mutable_default_scheduling_strategy();
|
||||||
{
|
{
|
@@ -535,8 +539,8 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingResourceRequestTest) {
     vector<int64_t> cust_ids{1, 2};
     vector<FixedPoint> cust_capacities{5, 5};
     initNodeResources(node_resources, pred_capacities, cust_ids, cust_capacities);
-    resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(node_internal_id,
-                                                                   node_resources);
+    resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(
+        scheduling::NodeID(node_id.Binary()), node_resources);
   }
   // Predefined resources, hard constraint violation
   {
@@ -546,9 +550,9 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingResourceRequestTest) {
                       EmptyFixedPointVector);
     int64_t violations;
     bool is_infeasible;
-    int64_t node_id = resource_scheduler.GetBestSchedulableNode(
+    auto node_id = resource_scheduler.GetBestSchedulableNode(
         resource_request, scheduling_strategy, false, false, &violations, &is_infeasible);
-    ASSERT_EQ(node_id, -1);
+    ASSERT_TRUE(node_id.IsNil());
   }

   // Predefined resources, no constraint violation.
@@ -559,9 +563,9 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingResourceRequestTest) {
                       EmptyFixedPointVector);
     int64_t violations;
     bool is_infeasible;
-    int64_t node_id = resource_scheduler.GetBestSchedulableNode(
+    auto node_id = resource_scheduler.GetBestSchedulableNode(
         resource_request, scheduling_strategy, false, false, &violations, &is_infeasible);
-    ASSERT_TRUE(node_id != -1);
+    ASSERT_TRUE(!node_id.IsNil());
     ASSERT_TRUE(violations == 0);
   }
   // Custom resources, hard constraint violation.
@@ -573,9 +577,9 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingResourceRequestTest) {
     initResourceRequest(resource_request, pred_demands, cust_ids, cust_demands);
     int64_t violations;
     bool is_infeasible;
-    int64_t node_id = resource_scheduler.GetBestSchedulableNode(
+    auto node_id = resource_scheduler.GetBestSchedulableNode(
         resource_request, scheduling_strategy, false, false, &violations, &is_infeasible);
-    ASSERT_TRUE(node_id == -1);
+    ASSERT_TRUE(node_id.IsNil());
   }
   // Custom resources, no constraint violation.
   {
@@ -586,9 +590,9 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingResourceRequestTest) {
     initResourceRequest(resource_request, pred_demands, cust_ids, cust_demands);
     int64_t violations;
     bool is_infeasible;
-    int64_t node_id = resource_scheduler.GetBestSchedulableNode(
+    auto node_id = resource_scheduler.GetBestSchedulableNode(
         resource_request, scheduling_strategy, false, false, &violations, &is_infeasible);
-    ASSERT_TRUE(node_id != -1);
+    ASSERT_TRUE(!node_id.IsNil());
     ASSERT_TRUE(violations == 0);
   }
   // Custom resource missing, hard constraint violation.
@@ -600,9 +604,9 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingResourceRequestTest) {
     initResourceRequest(resource_request, pred_demands, cust_ids, cust_demands);
     int64_t violations;
     bool is_infeasible;
-    int64_t node_id = resource_scheduler.GetBestSchedulableNode(
+    auto node_id = resource_scheduler.GetBestSchedulableNode(
         resource_request, scheduling_strategy, false, false, &violations, &is_infeasible);
-    ASSERT_TRUE(node_id == -1);
+    ASSERT_TRUE(node_id.IsNil());
   }
   // Placement hints, no constraint violation.
   {
@@ -613,9 +617,9 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingResourceRequestTest) {
     initResourceRequest(resource_request, pred_demands, cust_ids, cust_demands);
     int64_t violations;
     bool is_infeasible;
-    int64_t node_id = resource_scheduler.GetBestSchedulableNode(
+    auto node_id = resource_scheduler.GetBestSchedulableNode(
         resource_request, scheduling_strategy, false, false, &violations, &is_infeasible);
-    ASSERT_TRUE(node_id != -1);
+    ASSERT_TRUE(!node_id.IsNil());
     ASSERT_TRUE(violations == 0);
   }
 }
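The assertion changes above are the visible surface of the PR: GetBestSchedulableNode now returns a typed scheduling ID instead of a raw int64_t, so the -1 sentinel checks become IsNil() checks. A minimal self-contained sketch of that pattern (the class and function names here are illustrative stand-ins, not Ray's actual definitions):

// Sketch only: a typed scheduling ID whose "no node found" state is an
// explicit nil value instead of an overloaded -1 or "" sentinel.
#include <cassert>
#include <cstdint>

class SchedulingNodeID {
 public:
  static SchedulingNodeID Nil() { return SchedulingNodeID(-1); }
  explicit SchedulingNodeID(int64_t id) : id_(id) {}
  bool IsNil() const { return id_ == -1; }
  int64_t ToInt() const { return id_; }
  bool operator==(const SchedulingNodeID &other) const { return id_ == other.id_; }

 private:
  int64_t id_;
};

// Stand-in scheduler: returns nil when no node fits, mirroring the
// ASSERT_TRUE(node_id.IsNil()) pattern the tests migrate to.
SchedulingNodeID GetBestSchedulableNode(bool feasible) {
  return feasible ? SchedulingNodeID(42) : SchedulingNodeID::Nil();
}

int main() {
  assert(GetBestSchedulableNode(false).IsNil());   // was: ASSERT_EQ(node_id, -1)
  assert(!GetBestSchedulableNode(true).IsNil());   // was: ASSERT_TRUE(node_id != -1)
  return 0;
}

The point of the wrapper is that "no node" becomes a distinct, named state rather than a magic value that also happens to be a valid integer.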
@@ -633,7 +637,8 @@ TEST_F(ClusterResourceSchedulerTest, GetLocalAvailableResourcesWithCpuUnitTest)
   vector<int64_t> cust_ids{1};
   vector<FixedPoint> cust_capacities{8};
   initNodeResources(node_resources, pred_capacities, cust_ids, cust_capacities);
-  ClusterResourceScheduler resource_scheduler(0, node_resources, *gcs_client_);
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID(0), node_resources,
+                                              *gcs_client_);

   TaskResourceInstances available_cluster_resources =
       resource_scheduler.GetLocalResourceManager()
@@ -665,7 +670,8 @@ TEST_F(ClusterResourceSchedulerTest, GetLocalAvailableResourcesTest) {
   vector<int64_t> cust_ids{1};
   vector<FixedPoint> cust_capacities{8};
   initNodeResources(node_resources, pred_capacities, cust_ids, cust_capacities);
-  ClusterResourceScheduler resource_scheduler(0, node_resources, *gcs_client_);
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID(0), node_resources,
+                                              *gcs_client_);

   TaskResourceInstances available_cluster_resources =
       resource_scheduler.GetLocalResourceManager()
@@ -701,7 +707,7 @@ TEST_F(ClusterResourceSchedulerTest, AvailableResourceInstancesOpsTest) {
   vector<FixedPoint> pred_capacities{3 /* CPU */};
   initNodeResources(node_resources, pred_capacities, EmptyIntVector,
                     EmptyFixedPointVector);
-  ClusterResourceScheduler cluster(0, node_resources, *gcs_client_);
+  ClusterResourceScheduler cluster(scheduling::NodeID(0), node_resources, *gcs_client_);

   ResourceInstanceCapacities instances;

@@ -734,7 +740,8 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesTest) {
   vector<FixedPoint> pred_capacities{3. /* CPU */, 4. /* MEM */, 5. /* GPU */};
   initNodeResources(node_resources, pred_capacities, EmptyIntVector,
                     EmptyFixedPointVector);
-  ClusterResourceScheduler resource_scheduler(0, node_resources, *gcs_client_);
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID(0), node_resources,
+                                              *gcs_client_);

   ResourceRequest resource_request;
   vector<FixedPoint> pred_demands = {3. /* CPU */, 2. /* MEM */, 1.5 /* GPU */};
@@ -766,7 +773,8 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesTest) {
   vector<FixedPoint> pred_capacities{3 /* CPU */, 4 /* MEM */, 5 /* GPU */};
   initNodeResources(node_resources, pred_capacities, EmptyIntVector,
                     EmptyFixedPointVector);
-  ClusterResourceScheduler resource_scheduler(0, node_resources, *gcs_client_);
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID(0), node_resources,
+                                              *gcs_client_);

   ResourceRequest resource_request;
   vector<FixedPoint> pred_demands = {4. /* CPU */, 2. /* MEM */, 1.5 /* GPU */};
@@ -794,7 +802,8 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesTest) {
   vector<int64_t> cust_ids{1, 2};
   vector<FixedPoint> cust_capacities{4, 4};
   initNodeResources(node_resources, pred_capacities, cust_ids, cust_capacities);
-  ClusterResourceScheduler resource_scheduler(0, node_resources, *gcs_client_);
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID(0), node_resources,
+                                              *gcs_client_);

   ResourceRequest resource_request;
   vector<FixedPoint> pred_demands = {3. /* CPU */, 2. /* MEM */, 1.5 /* GPU */};
@@ -826,7 +835,8 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesTest) {
   vector<int64_t> cust_ids{1, 2};
   vector<FixedPoint> cust_capacities{4, 4};
   initNodeResources(node_resources, pred_capacities, cust_ids, cust_capacities);
-  ClusterResourceScheduler resource_scheduler(0, node_resources, *gcs_client_);
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID(0), node_resources,
+                                              *gcs_client_);

   ResourceRequest resource_request;
   vector<FixedPoint> pred_demands = {3. /* CPU */, 2. /* MEM */, 1.5 /* GPU */};
@@ -855,7 +865,8 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesAllocationFailureTest)
   vector<int64_t> cust_ids{1, 2, 3};
   vector<FixedPoint> cust_capacities{4, 4, 4};
   initNodeResources(node_resources, pred_capacities, cust_ids, cust_capacities);
-  ClusterResourceScheduler resource_scheduler(0, node_resources, *gcs_client_);
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID(0), node_resources,
+                                              *gcs_client_);

   ResourceRequest resource_request;
   vector<FixedPoint> pred_demands = {0. /* CPU */, 0. /* MEM */, 0. /* GPU */};
@@ -885,7 +896,8 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesTest2) {
   vector<int64_t> cust_ids{1, 2};
   vector<FixedPoint> cust_capacities{4., 4.};
   initNodeResources(node_resources, pred_capacities, cust_ids, cust_capacities);
-  ClusterResourceScheduler resource_scheduler(0, node_resources, *gcs_client_);
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID(0), node_resources,
+                                              *gcs_client_);

   ResourceRequest resource_request;
   vector<FixedPoint> pred_demands = {2. /* CPU */, 2. /* MEM */, 1.5 /* GPU */};
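Every constructor call above now wraps its first argument in scheduling::NodeID, built from either an integer or a string. Per the commit title, the StringIdMap that used to be passed around for string-to-int interning now lives behind the ID type. A rough sketch of that interning idea, under the assumption of a process-wide map (GlobalMap and the member names are hypothetical, not Ray's real layout):

#include <cassert>
#include <cstdint>
#include <string>
#include <unordered_map>

// Bidirectional string <-> int64 interner, the StringIdMap idea.
class StringIdMap {
 public:
  int64_t Insert(const std::string &name) {
    auto it = string_to_int_.find(name);
    if (it != string_to_int_.end()) return it->second;
    int64_t id = next_id_++;
    string_to_int_[name] = id;
    int_to_string_[id] = name;
    return id;
  }
  const std::string &Get(int64_t id) const { return int_to_string_.at(id); }

 private:
  int64_t next_id_ = 0;
  std::unordered_map<std::string, int64_t> string_to_int_;
  std::unordered_map<int64_t, std::string> int_to_string_;
};

StringIdMap &GlobalMap() {
  static StringIdMap map;  // hidden inside the ID type, no longer passed around
  return map;
}

class NodeID {
 public:
  explicit NodeID(int64_t id) : id_(id) {}
  explicit NodeID(const std::string &name) : id_(GlobalMap().Insert(name)) {}
  std::string Binary() const { return GlobalMap().Get(id_); }
  int64_t ToInt() const { return id_; }

 private:
  int64_t id_;
};

int main() {
  NodeID a("local");
  NodeID b("local");
  assert(a.ToInt() == b.ToInt());  // same string interns to the same ID
  assert(a.Binary() == "local");   // and round-trips back to the string
  return 0;
}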
@@ -913,25 +925,29 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesTest2) {
 }

 TEST_F(ClusterResourceSchedulerTest, DeadNodeTest) {
-  ClusterResourceScheduler resource_scheduler("local", {}, *gcs_client_);
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID("local"),
+                                              absl::flat_hash_map<std::string, double>{},
+                                              *gcs_client_);
   absl::flat_hash_map<std::string, double> resource;
   resource["CPU"] = 10000.0;
   auto node_id = NodeID::FromRandom();
-  resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(node_id.Binary(),
-                                                                 resource, resource);
+  resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(
+      scheduling::NodeID(node_id.Binary()), resource, resource);
   int64_t violations = 0;
   bool is_infeasible = false;
   rpc::SchedulingStrategy scheduling_strategy;
   scheduling_strategy.mutable_default_scheduling_strategy();
-  ASSERT_EQ(node_id.Binary(), resource_scheduler.GetBestSchedulableNode(
-                                  resource, scheduling_strategy, false, false, false,
-                                  &violations, &is_infeasible));
+  ASSERT_EQ(scheduling::NodeID(node_id.Binary()),
+            resource_scheduler.GetBestSchedulableNode(resource, scheduling_strategy,
+                                                      false, false, false, &violations,
+                                                      &is_infeasible));
   EXPECT_CALL(*gcs_client_->mock_node_accessor, Get(node_id, ::testing::_))
       .WillOnce(::testing::Return(nullptr))
       .WillOnce(::testing::Return(nullptr));
-  ASSERT_EQ("", resource_scheduler.GetBestSchedulableNode(resource, scheduling_strategy,
-                                                          false, false, false,
-                                                          &violations, &is_infeasible));
+  ASSERT_TRUE(resource_scheduler
+                  .GetBestSchedulableNode(resource, scheduling_strategy, false, false,
+                                          false, &violations, &is_infeasible)
+                  .IsNil());
 }

 TEST_F(ClusterResourceSchedulerTest, TaskGPUResourceInstancesTest) {
@@ -941,7 +957,8 @@ TEST_F(ClusterResourceSchedulerTest, TaskGPUResourceInstancesTest) {
   vector<int64_t> cust_ids{1};
   vector<FixedPoint> cust_capacities{8};
   initNodeResources(node_resources, pred_capacities, cust_ids, cust_capacities);
-  ClusterResourceScheduler resource_scheduler(0, node_resources, *gcs_client_);
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID(0), node_resources,
+                                              *gcs_client_);

   std::vector<double> allocate_gpu_instances{0.5, 0.5, 0.5, 0.5};
   resource_scheduler.GetLocalResourceManager().SubtractGPUResourceInstances(
@@ -1004,7 +1021,8 @@ TEST_F(ClusterResourceSchedulerTest,
   vector<int64_t> cust_ids{1};
   vector<FixedPoint> cust_capacities{8};
   initNodeResources(node_resources, pred_capacities, cust_ids, cust_capacities);
-  ClusterResourceScheduler resource_scheduler(0, node_resources, *gcs_client_);
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID(0), node_resources,
+                                              *gcs_client_);

   {
     std::vector<double> allocate_gpu_instances{0.5, 0.5, 2, 0.5};
@@ -1023,7 +1041,8 @@ TEST_F(ClusterResourceSchedulerTest,
                            expected_available_gpu_instances.begin()));

     NodeResources nr;
-    resource_scheduler.GetClusterResourceManager().GetNodeResources(0, &nr);
+    resource_scheduler.GetClusterResourceManager().GetNodeResources(
+        scheduling::NodeID(0), &nr);
     ASSERT_TRUE(nr.predefined_resources[GPU].available == 1.5);
   }

@@ -1044,7 +1063,8 @@ TEST_F(ClusterResourceSchedulerTest,
                            expected_available_gpu_instances.begin()));

     NodeResources nr;
-    resource_scheduler.GetClusterResourceManager().GetNodeResources(0, &nr);
+    resource_scheduler.GetClusterResourceManager().GetNodeResources(
+        scheduling::NodeID(0), &nr);
     ASSERT_TRUE(nr.predefined_resources[GPU].available == 3.8);
   }
 }
@@ -1055,7 +1075,8 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstanceWithHardRequestTest) {
   vector<FixedPoint> pred_capacities{4. /* CPU */, 2. /* MEM */, 4. /* GPU */};
   initNodeResources(node_resources, pred_capacities, EmptyIntVector,
                     EmptyFixedPointVector);
-  ClusterResourceScheduler resource_scheduler(0, node_resources, *gcs_client_);
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID(0), node_resources,
+                                              *gcs_client_);

   ResourceRequest resource_request;
   vector<FixedPoint> pred_demands = {2. /* CPU */, 2. /* MEM */, 1.5 /* GPU */};
@@ -1081,7 +1102,8 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstanceWithoutCpuUnitTest) {
   vector<FixedPoint> pred_capacities{4. /* CPU */, 2. /* MEM */, 4. /* GPU */};
   initNodeResources(node_resources, pred_capacities, EmptyIntVector,
                     EmptyFixedPointVector);
-  ClusterResourceScheduler resource_scheduler(0, node_resources, *gcs_client_);
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID(0), node_resources,
+                                              *gcs_client_);

   ResourceRequest resource_request;
   vector<FixedPoint> pred_demands = {2. /* CPU */, 2. /* MEM */, 1.5 /* GPU */};
@@ -1104,10 +1126,12 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstanceWithoutCpuUnitTest) {

 TEST_F(ClusterResourceSchedulerTest, TestAlwaysSpillInfeasibleTask) {
   absl::flat_hash_map<std::string, double> resource_spec({{"CPU", 1}});
-  ClusterResourceScheduler resource_scheduler("local", {}, *gcs_client_);
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID("local"),
+                                              absl::flat_hash_map<std::string, double>{},
+                                              *gcs_client_);
   for (int i = 0; i < 100; i++) {
     resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(
-        NodeID::FromRandom().Binary(), {}, {});
+        scheduling::NodeID(NodeID::FromRandom().Binary()), {}, {});
   }

   // No feasible nodes.
@@ -1115,14 +1139,14 @@ TEST_F(ClusterResourceSchedulerTest, TestAlwaysSpillInfeasibleTask) {
   bool is_infeasible;
   rpc::SchedulingStrategy scheduling_strategy;
   scheduling_strategy.mutable_default_scheduling_strategy();
-  ASSERT_EQ(resource_scheduler.GetBestSchedulableNode(resource_spec, scheduling_strategy,
-                                                      false, false, false,
-                                                      &total_violations, &is_infeasible),
-            "");
+  ASSERT_TRUE(resource_scheduler
+                  .GetBestSchedulableNode(resource_spec, scheduling_strategy, false,
+                                          false, false, &total_violations, &is_infeasible)
+                  .IsNil());

   // Feasible remote node, but doesn't currently have resources available. We
   // should spill there.
-  auto remote_feasible = NodeID::FromRandom().Binary();
+  auto remote_feasible = scheduling::NodeID(NodeID::FromRandom().Binary());
   resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(
       remote_feasible, resource_spec, {{"CPU", 0.}});
   ASSERT_EQ(remote_feasible, resource_scheduler.GetBestSchedulableNode(
@@ -1131,7 +1155,7 @@ TEST_F(ClusterResourceSchedulerTest, TestAlwaysSpillInfeasibleTask) {

   // Feasible remote node, and it currently has resources available. We should
   // prefer to spill there.
-  auto remote_available = NodeID::FromRandom().Binary();
+  auto remote_available = scheduling::NodeID(NodeID::FromRandom().Binary());
   resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(
       remote_available, resource_spec, resource_spec);
   ASSERT_EQ(remote_available, resource_scheduler.GetBestSchedulableNode(
@@ -1146,14 +1170,15 @@ TEST_F(ClusterResourceSchedulerTest, ResourceUsageReportTest) {

   absl::flat_hash_map<std::string, double> initial_resources(
       {{"CPU", 1}, {"GPU", 2}, {"memory", 3}, {"1", 1}, {"2", 2}, {"3", 3}});
-  ClusterResourceScheduler resource_scheduler("0", initial_resources, *gcs_client_);
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID("0"), initial_resources,
+                                              *gcs_client_);
   NodeResources other_node_resources;
   vector<FixedPoint> other_pred_capacities{1. /* CPU */, 1. /* MEM */, 1. /* GPU */};
   vector<FixedPoint> other_cust_capacities{5., 4., 3., 2., 1.};
   initNodeResources(other_node_resources, other_pred_capacities, cust_ids,
                     other_cust_capacities);
-  resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(12345,
-                                                                 other_node_resources);
+  resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(
+      scheduling::NodeID(12345), other_node_resources);

   { // Cluster is idle.
     rpc::ResourcesData data;
@@ -1229,15 +1254,15 @@ TEST_F(ClusterResourceSchedulerTest, ObjectStoreMemoryUsageTest) {
       {"object_store_memory", 1000 * 1024 * 1024}});
   int64_t used_object_store_memory = 250 * 1024 * 1024;
   int64_t *ptr = &used_object_store_memory;
-  ClusterResourceScheduler resource_scheduler("0", initial_resources, *gcs_client_,
-                                              [&] { return *ptr; });
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID("0"), initial_resources,
+                                              *gcs_client_, [&] { return *ptr; });
   NodeResources other_node_resources;
   vector<FixedPoint> other_pred_capacities{1. /* CPU */, 1. /* MEM */, 1. /* GPU */};
   vector<FixedPoint> other_cust_capacities{10.};
   initNodeResources(other_node_resources, other_pred_capacities, cust_ids,
                     other_cust_capacities);
-  resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(12345,
-                                                                 other_node_resources);
+  resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(
+      scheduling::NodeID(12345), other_node_resources);

   {
     rpc::ResourcesData data;
@@ -1326,8 +1351,9 @@ TEST_F(ClusterResourceSchedulerTest, ObjectStoreMemoryUsageTest) {

 TEST_F(ClusterResourceSchedulerTest, DirtyLocalViewTest) {
   absl::flat_hash_map<std::string, double> initial_resources({{"CPU", 1}});
-  ClusterResourceScheduler resource_scheduler("local", initial_resources, *gcs_client_);
-  auto remote = NodeID::FromRandom().Binary();
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID("local"),
+                                              initial_resources, *gcs_client_);
+  auto remote = scheduling::NodeID(NodeID::FromRandom().Binary());
   resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(remote, {{"CPU", 2.}},
                                                                  {{"CPU", 2.}});
   const absl::flat_hash_map<std::string, double> task_spec = {{"CPU", 1.}};
@@ -1366,9 +1392,10 @@ TEST_F(ClusterResourceSchedulerTest, DirtyLocalViewTest) {
     }
     // Our local view says there are not enough resources on the remote node to
     // schedule another task.
-    ASSERT_EQ("", resource_scheduler.GetBestSchedulableNode(
-                      task_spec, scheduling_strategy, false, false, true, &t,
-                      &is_infeasible));
+    ASSERT_EQ(
+        resource_scheduler.GetBestSchedulableNode(task_spec, scheduling_strategy, false,
+                                                  false, true, &t, &is_infeasible),
+        scheduling::NodeID::Nil());
     ASSERT_FALSE(
         resource_scheduler.GetLocalResourceManager().AllocateLocalTaskResources(
             task_spec, task_allocation));
@@ -1378,7 +1405,8 @@ TEST_F(ClusterResourceSchedulerTest, DirtyLocalViewTest) {
 }

 TEST_F(ClusterResourceSchedulerTest, DynamicResourceTest) {
-  ClusterResourceScheduler resource_scheduler("local", {{"CPU", 2}}, *gcs_client_);
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID("local"), {{"CPU", 2}},
+                                              *gcs_client_);

   absl::flat_hash_map<std::string, double> resource_request = {{"CPU", 1},
                                                                {"custom123", 2}};
@@ -1387,36 +1415,38 @@ TEST_F(ClusterResourceSchedulerTest, DynamicResourceTest) {
   rpc::SchedulingStrategy scheduling_strategy;
   scheduling_strategy.mutable_default_scheduling_strategy();

-  std::string result = resource_scheduler.GetBestSchedulableNode(
+  auto result = resource_scheduler.GetBestSchedulableNode(
       resource_request, scheduling_strategy, false, false, false, &t, &is_infeasible);
-  ASSERT_TRUE(result.empty());
+  ASSERT_TRUE(result.IsNil());

-  resource_scheduler.GetLocalResourceManager().AddLocalResourceInstances("custom123",
-                                                                         {0., 1.0, 1.0});
+  resource_scheduler.GetLocalResourceManager().AddLocalResourceInstances(
+      scheduling::ResourceID("custom123"), {0., 1.0, 1.0});

   result = resource_scheduler.GetBestSchedulableNode(
       resource_request, scheduling_strategy, false, false, false, &t, &is_infeasible);
-  ASSERT_FALSE(result.empty()) << resource_scheduler.DebugString();
+  ASSERT_FALSE(result.IsNil()) << resource_scheduler.DebugString();

   resource_request["custom123"] = 3;
   result = resource_scheduler.GetBestSchedulableNode(
       resource_request, scheduling_strategy, false, false, false, &t, &is_infeasible);
-  ASSERT_TRUE(result.empty());
+  ASSERT_TRUE(result.IsNil());

-  resource_scheduler.GetLocalResourceManager().AddLocalResourceInstances("custom123",
-                                                                         {1.0});
+  resource_scheduler.GetLocalResourceManager().AddLocalResourceInstances(
+      scheduling::ResourceID("custom123"), {1.0});
   result = resource_scheduler.GetBestSchedulableNode(
       resource_request, scheduling_strategy, false, false, false, &t, &is_infeasible);
-  ASSERT_FALSE(result.empty());
+  ASSERT_FALSE(result.IsNil());

-  resource_scheduler.GetLocalResourceManager().DeleteLocalResource("custom123");
+  resource_scheduler.GetLocalResourceManager().DeleteLocalResource(
+      scheduling::ResourceID("custom123"));
   result = resource_scheduler.GetBestSchedulableNode(
       resource_request, scheduling_strategy, false, false, false, &t, &is_infeasible);
-  ASSERT_TRUE(result.empty());
+  ASSERT_TRUE(result.IsNil());
 }

 TEST_F(ClusterResourceSchedulerTest, AvailableResourceEmptyTest) {
-  ClusterResourceScheduler resource_scheduler("local", {{"custom123", 5}}, *gcs_client_);
+  ClusterResourceScheduler resource_scheduler(scheduling::NodeID("local"),
+                                              {{"custom123", 5}}, *gcs_client_);
   std::shared_ptr<TaskResourceInstances> resource_instances =
       std::make_shared<TaskResourceInstances>();
   absl::flat_hash_map<std::string, double> resource_request = {{"custom123", 5}};
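DynamicResourceTest above now names custom resources through scheduling::ResourceID("custom123"). The scheme this implies, predefined resources pinned to fixed low integers with custom names interned above that range, can be sketched as follows (the enum ordering and ID assignment here are illustrative assumptions, not Ray's exact code):

#include <cassert>
#include <cstdint>
#include <string>
#include <unordered_map>

// Illustrative ordering; the real PredefinedResources enum may differ.
enum PredefinedResources { CPU = 0, MEM = 1, GPU = 2, OBJECT_STORE_MEM = 3,
                           PredefinedResources_MAX = 4 };

class ResourceID {
 public:
  explicit ResourceID(const std::string &name) : id_(Intern(name)) {}
  int64_t ToInt() const { return id_; }

 private:
  static int64_t Intern(const std::string &name) {
    // Predefined labels are seeded with their fixed slots up front.
    static std::unordered_map<std::string, int64_t> ids = {
        {"CPU", CPU}, {"memory", MEM}, {"GPU", GPU},
        {"object_store_memory", OBJECT_STORE_MEM}};
    static int64_t next = PredefinedResources_MAX;  // custom IDs start here
    auto [it, inserted] = ids.emplace(name, next);
    if (inserted) ++next;
    return it->second;
  }
  int64_t id_;
};

int main() {
  assert(ResourceID("CPU").ToInt() == CPU);       // predefined: fixed slot
  int64_t c = ResourceID("custom123").ToInt();
  assert(c >= PredefinedResources_MAX);           // custom: interned above
  assert(ResourceID("custom123").ToInt() == c);   // stable on re-lookup
  return 0;
}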
|
@ -1424,16 +1454,17 @@ TEST_F(ClusterResourceSchedulerTest, AvailableResourceEmptyTest) {
|
||||||
resource_scheduler.GetLocalResourceManager().AllocateLocalTaskResources(
|
resource_scheduler.GetLocalResourceManager().AllocateLocalTaskResources(
|
||||||
resource_request, resource_instances);
|
resource_request, resource_instances);
|
||||||
ASSERT_TRUE(allocated);
|
ASSERT_TRUE(allocated);
|
||||||
ASSERT_TRUE(
|
ASSERT_TRUE(resource_scheduler.GetLocalResourceManager().IsAvailableResourceEmpty(
|
||||||
resource_scheduler.GetLocalResourceManager().IsAvailableResourceEmpty("custom123"));
|
scheduling::ResourceID("custom123")));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(ClusterResourceSchedulerTest, TestForceSpillback) {
|
TEST_F(ClusterResourceSchedulerTest, TestForceSpillback) {
|
||||||
absl::flat_hash_map<std::string, double> resource_spec({{"CPU", 1}});
|
absl::flat_hash_map<std::string, double> resource_spec({{"CPU", 1}});
|
||||||
ClusterResourceScheduler resource_scheduler("local", resource_spec, *gcs_client_);
|
ClusterResourceScheduler resource_scheduler(scheduling::NodeID("local"), resource_spec,
|
||||||
std::vector<string> node_ids;
|
*gcs_client_);
|
||||||
|
std::vector<scheduling::NodeID> node_ids;
|
||||||
for (int i = 0; i < 100; i++) {
|
for (int i = 0; i < 100; i++) {
|
||||||
node_ids.push_back(NodeID::FromRandom().Binary());
|
node_ids.emplace_back(NodeID::FromRandom().Binary());
|
||||||
resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(node_ids.back(), {},
|
resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(node_ids.back(), {},
|
||||||
{});
|
{});
|
||||||
}
|
}
|
||||||
|
@ -1447,20 +1478,20 @@ TEST_F(ClusterResourceSchedulerTest, TestForceSpillback) {
|
||||||
ASSERT_EQ(resource_scheduler.GetBestSchedulableNode(
|
ASSERT_EQ(resource_scheduler.GetBestSchedulableNode(
|
||||||
resource_spec, scheduling_strategy, false, false,
|
resource_spec, scheduling_strategy, false, false,
|
||||||
/*force_spillback=*/false, &total_violations, &is_infeasible),
|
/*force_spillback=*/false, &total_violations, &is_infeasible),
|
||||||
"local");
|
scheduling::NodeID("local"));
|
||||||
// If spillback is forced, we try to spill to remote, but only if there is a
|
// If spillback is forced, we try to spill to remote, but only if there is a
|
||||||
// schedulable node.
|
// schedulable node.
|
||||||
ASSERT_EQ(resource_scheduler.GetBestSchedulableNode(
|
ASSERT_EQ(resource_scheduler.GetBestSchedulableNode(
|
||||||
resource_spec, scheduling_strategy, false, false,
|
resource_spec, scheduling_strategy, false, false,
|
||||||
/*force_spillback=*/true, &total_violations, &is_infeasible),
|
/*force_spillback=*/true, &total_violations, &is_infeasible),
|
||||||
"");
|
scheduling::NodeID::Nil());
|
||||||
// Choose a remote node that has the resources available.
|
// Choose a remote node that has the resources available.
|
||||||
resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(node_ids[50],
|
resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(node_ids[50],
|
||||||
resource_spec, {});
|
resource_spec, {});
|
||||||
ASSERT_EQ(resource_scheduler.GetBestSchedulableNode(
|
ASSERT_EQ(resource_scheduler.GetBestSchedulableNode(
|
||||||
resource_spec, scheduling_strategy, false, false,
|
resource_spec, scheduling_strategy, false, false,
|
||||||
/*force_spillback=*/true, &total_violations, &is_infeasible),
|
/*force_spillback=*/true, &total_violations, &is_infeasible),
|
||||||
"");
|
scheduling::NodeID::Nil());
|
||||||
resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(
|
resource_scheduler.GetClusterResourceManager().AddOrUpdateNode(
|
||||||
node_ids[51], resource_spec, resource_spec);
|
node_ids[51], resource_spec, resource_spec);
|
||||||
ASSERT_EQ(resource_scheduler.GetBestSchedulableNode(
|
ASSERT_EQ(resource_scheduler.GetBestSchedulableNode(
|
||||||
|
@ -1476,8 +1507,8 @@ TEST_F(ClusterResourceSchedulerTest, CustomResourceInstanceTest) {
|
||||||
"custom_unit_instance_resources": "FPGA"
|
"custom_unit_instance_resources": "FPGA"
|
||||||
}
|
}
|
||||||
)");
|
)");
|
||||||
ClusterResourceScheduler resource_scheduler("local", {{"CPU", 4}, {"FPGA", 2}},
|
ClusterResourceScheduler resource_scheduler(scheduling::NodeID("local"),
|
||||||
*gcs_client_);
|
{{"CPU", 4}, {"FPGA", 2}}, *gcs_client_);
|
||||||
|
|
||||||
StringIdMap mock_string_to_int_map;
|
StringIdMap mock_string_to_int_map;
|
||||||
int64_t fpga_resource_id = mock_string_to_int_map.Insert("FPGA");
|
int64_t fpga_resource_id = mock_string_to_int_map.Insert("FPGA");
|
||||||
|
@ -1509,7 +1540,7 @@ TEST_F(ClusterResourceSchedulerTest, CustomResourceInstanceTest) {
|
||||||
|
|
||||||
TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesSerializedStringTest) {
|
TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesSerializedStringTest) {
|
||||||
ClusterResourceScheduler resource_scheduler(
|
ClusterResourceScheduler resource_scheduler(
|
||||||
"local", {{"CPU", 4}, {"memory", 4}, {"GPU", 2}}, *gcs_client_);
|
scheduling::NodeID("local"), {{"CPU", 4}, {"memory", 4}, {"GPU", 2}}, *gcs_client_);
|
||||||
std::shared_ptr<TaskResourceInstances> cluster_resources =
|
std::shared_ptr<TaskResourceInstances> cluster_resources =
|
||||||
std::make_shared<TaskResourceInstances>();
|
std::make_shared<TaskResourceInstances>();
|
||||||
addTaskResourceInstances(true, {2.}, 0, cluster_resources.get());
|
addTaskResourceInstances(true, {2.}, 0, cluster_resources.get());
|
||||||
|
@ -1534,7 +1565,7 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesSerializedStringTest)
|
||||||
addTaskResourceInstances(true, {4.}, 1, cluster_instance_resources.get());
|
addTaskResourceInstances(true, {4.}, 1, cluster_instance_resources.get());
|
||||||
addTaskResourceInstances(true, {1., 1.}, 2, cluster_instance_resources.get());
|
addTaskResourceInstances(true, {1., 1.}, 2, cluster_instance_resources.get());
|
||||||
ClusterResourceScheduler resource_scheduler_cpu_instance(
|
ClusterResourceScheduler resource_scheduler_cpu_instance(
|
||||||
"local", {{"CPU", 4}, {"memory", 4}, {"GPU", 2}}, *gcs_client_);
|
scheduling::NodeID("local"), {{"CPU", 4}, {"memory", 4}, {"GPU", 2}}, *gcs_client_);
|
||||||
std::string instance_serialized_string =
|
std::string instance_serialized_string =
|
||||||
resource_scheduler_cpu_instance.GetLocalResourceManager()
|
resource_scheduler_cpu_instance.GetLocalResourceManager()
|
||||||
.SerializedTaskResourceInstances(cluster_instance_resources);
|
.SerializedTaskResourceInstances(cluster_instance_resources);
|
||||||
|
|
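TestForceSpillback above pins down the observable contract: without force_spillback the feasible local node wins, and with it the local node is excluded and Nil comes back until some remote node actually has the resources. A simplified sketch of that selection rule (this is only the behavior the test asserts, not Ray's real scheduling policy):

#include <cassert>
#include <cstdint>
#include <vector>

struct Node {
  int64_t id;
  bool is_local;
  bool has_available_resources;
};

int64_t GetBestSchedulableNode(const std::vector<Node> &nodes, bool force_spillback) {
  for (const Node &n : nodes) {
    if (force_spillback && n.is_local) continue;  // exclude local when spilling
    if (n.has_available_resources) return n.id;
  }
  return -1;  // stands in for scheduling::NodeID::Nil()
}

int main() {
  std::vector<Node> nodes = {{0, /*is_local=*/true, true}, {50, false, false}};
  assert(GetBestSchedulableNode(nodes, false) == 0);  // local node wins
  assert(GetBestSchedulableNode(nodes, true) == -1);  // nothing remote fits
  nodes.push_back({51, false, true});                 // remote with resources
  assert(GetBestSchedulableNode(nodes, true) == 51);  // now spills there
  return 0;
}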
@@ -77,21 +77,21 @@ void ClusterTaskManager::ScheduleAndDispatchTasks() {
       RayTask task = work->task;
       RAY_LOG(DEBUG) << "Scheduling pending task "
                      << task.GetTaskSpecification().TaskId();
-      std::string node_id_string = cluster_resource_scheduler_->GetBestSchedulableNode(
+      auto scheduling_node_id = cluster_resource_scheduler_->GetBestSchedulableNode(
           task.GetTaskSpecification(), work->PrioritizeLocalNode(),
           /*exclude_local_node*/ false,
           /*requires_object_store_memory*/ false, &is_infeasible);

       // There is no node that has available resources to run the request.
       // Move on to the next shape.
-      if (node_id_string.empty()) {
+      if (scheduling_node_id.IsNil()) {
         RAY_LOG(DEBUG) << "No node found to schedule a task "
                        << task.GetTaskSpecification().TaskId() << " is infeasible?"
                        << is_infeasible;
         break;
       }

-      NodeID node_id = NodeID::FromBinary(node_id_string);
+      NodeID node_id = NodeID::FromBinary(scheduling_node_id.Binary());
       ScheduleOnNode(node_id, work);
       work_it = work_queue.erase(work_it);
     }
@@ -129,7 +129,7 @@ void ClusterTaskManager::TryScheduleInfeasibleTask() {
     RAY_LOG(DEBUG) << "Check if the infeasible task is schedulable in any node. task_id:"
                    << task.GetTaskSpecification().TaskId();
     bool is_infeasible;
-    std::string node_id_string = cluster_resource_scheduler_->GetBestSchedulableNode(
+    cluster_resource_scheduler_->GetBestSchedulableNode(
        task.GetTaskSpecification(), work->PrioritizeLocalNode(),
        /*exclude_local_node*/ false,
        /*requires_object_store_memory*/ false, &is_infeasible);
@@ -293,7 +293,8 @@ void ClusterTaskManager::ScheduleOnNode(const NodeID &spillback_to,
   RAY_LOG(DEBUG) << "Spilling task " << task_spec.TaskId() << " to node " << spillback_to;

   if (!cluster_resource_scheduler_->AllocateRemoteTaskResources(
-          spillback_to.Binary(), task_spec.GetRequiredResources().GetResourceMap())) {
+          scheduling::NodeID(spillback_to.Binary()),
+          task_spec.GetRequiredResources().GetResourceMap())) {
     RAY_LOG(DEBUG) << "Tried to allocate resources for request " << task_spec.TaskId()
                    << " on a remote node that are no longer available";
   }
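The task-manager hunks above show the conversion boundary: the scheduler speaks scheduling::NodeID, the rest of raylet speaks the binary NodeID, and the two round-trip through the string form (NodeID::FromBinary(scheduling_node_id.Binary())). A small sketch of that round trip with stand-in types (both structs below are mocks, not the real classes):

#include <cassert>
#include <string>

struct RayNodeID {  // stand-in for ray::NodeID
  std::string bytes;
  static RayNodeID FromBinary(const std::string &b) { return {b}; }
  std::string Binary() const { return bytes; }
};

struct SchedNodeID {  // stand-in for scheduling::NodeID
  std::string interned;
  explicit SchedNodeID(const std::string &b) : interned(b) {}
  std::string Binary() const { return interned; }
  bool IsNil() const { return interned.empty(); }
};

int main() {
  RayNodeID raw = RayNodeID::FromBinary("node-id-bytes");
  SchedNodeID sched(raw.Binary());                          // into the scheduler
  RayNodeID back = RayNodeID::FromBinary(sched.Binary());   // back out of it
  assert(back.Binary() == raw.Binary());                    // lossless round trip
  return 0;
}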
@@ -129,8 +129,8 @@ std::shared_ptr<ClusterResourceScheduler> CreateSingleNodeScheduler(
   local_node_resources[ray::kGPU_ResourceLabel] = num_gpus;
   local_node_resources[ray::kMemory_ResourceLabel] = 128;

-  auto scheduler =
-      std::make_shared<ClusterResourceScheduler>(id, local_node_resources, gcs_client);
+  auto scheduler = std::make_shared<ClusterResourceScheduler>(
+      scheduling::NodeID(id), local_node_resources, gcs_client);

   return scheduler;
 }
@@ -285,8 +285,8 @@ class ClusterTaskManagerTest : public ::testing::Test {
     node_resources[ray::kCPU_ResourceLabel] = num_cpus;
     node_resources[ray::kGPU_ResourceLabel] = num_gpus;
     node_resources[ray::kMemory_ResourceLabel] = memory;
-    scheduler_->GetClusterResourceManager().AddOrUpdateNode(id.Binary(), node_resources,
-                                                            node_resources);
+    scheduler_->GetClusterResourceManager().AddOrUpdateNode(
+        scheduling::NodeID(id.Binary()), node_resources, node_resources);

     rpc::GcsNodeInfo info;
     node_info_[id] = info;
@@ -1480,7 +1480,8 @@ TEST_F(ClusterTaskManagerTest, FeasibleToNonFeasible) {

   // Delete cpu resource of local node, then task 2 should be turned into
   // infeasible.
-  scheduler_->GetLocalResourceManager().DeleteLocalResource(ray::kCPU_ResourceLabel);
+  scheduler_->GetLocalResourceManager().DeleteLocalResource(
+      scheduling::ResourceID(ray::kCPU_ResourceLabel));

   RayTask task2 = CreateTask({{ray::kCPU_ResourceLabel, 4}});
   rpc::RequestWorkerLeaseReply reply2;
@@ -1506,7 +1507,8 @@ TEST_F(ClusterTaskManagerTest, FeasibleToNonFeasible) {

 TEST_F(ClusterTaskManagerTestWithGPUsAtHead, RleaseAndReturnWorkerCpuResources) {
   const NodeResources &node_resources =
-      scheduler_->GetClusterResourceManager().GetNodeResources(id_.Binary());
+      scheduler_->GetClusterResourceManager().GetNodeResources(
+          scheduling::NodeID(id_.Binary()));
   ASSERT_EQ(node_resources.predefined_resources[PredefinedResources::CPU].available, 8);
   ASSERT_EQ(node_resources.predefined_resources[PredefinedResources::GPU].available, 4);

@@ -1831,8 +1833,8 @@ TEST_F(ClusterTaskManagerTest, PopWorkerExactlyOnce) {
 }

 TEST_F(ClusterTaskManagerTest, CapRunningOnDispatchQueue) {
-  scheduler_->GetLocalResourceManager().AddLocalResourceInstances(ray::kGPU_ResourceLabel,
-                                                                  {1, 1, 1});
+  scheduler_->GetLocalResourceManager().AddLocalResourceInstances(
+      scheduling::ResourceID(ray::kGPU_ResourceLabel), {1, 1, 1});
   RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 4}, {ray::kGPU_ResourceLabel, 1}},
                             /*num_args=*/0, /*args=*/{});
   RayTask task2 = CreateTask({{ray::kCPU_ResourceLabel, 4}, {ray::kGPU_ResourceLabel, 1}},
@@ -1883,8 +1885,8 @@ TEST_F(ClusterTaskManagerTest, CapRunningOnDispatchQueue) {
 }

 TEST_F(ClusterTaskManagerTest, ZeroCPUTasks) {
-  scheduler_->GetLocalResourceManager().AddLocalResourceInstances(ray::kGPU_ResourceLabel,
-                                                                  {1, 1, 1});
+  scheduler_->GetLocalResourceManager().AddLocalResourceInstances(
+      scheduling::ResourceID(ray::kGPU_ResourceLabel), {1, 1, 1});
   RayTask task = CreateTask({{"GPU", 1}}, /*num_args=*/0, /*args=*/{});
   RayTask task2 = CreateTask({{"GPU", 1}}, /*num_args=*/0, /*args=*/{});
   RayTask task3 = CreateTask({{"GPU", 1}}, /*num_args=*/0, /*args=*/{});
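The AddLocalResourceInstances calls above pass per-instance capacity vectors, e.g. {1, 1, 1} for three unit GPUs, and the earlier scheduler tests subtract fractional amounts per instance. A simplified sketch of allocating a fractional demand against such a vector (illustrative greedy logic only; Ray's actual instance allocator is more involved):

#include <algorithm>
#include <cassert>
#include <vector>

// Try to take `demand` from the instance capacities, left to right.
bool Allocate(std::vector<double> &available, double demand) {
  for (double &inst : available) {
    double take = std::min(inst, demand);
    inst -= take;
    demand -= take;
    if (demand <= 0) return true;
  }
  return false;  // not enough capacity across all instances
}

int main() {
  std::vector<double> gpus = {1, 1, 1};  // three unit GPU instances
  assert(Allocate(gpus, 0.5));           // fits inside the first instance
  assert(gpus[0] == 0.5);
  assert(Allocate(gpus, 2.0));           // spans the remaining instances
  assert(!Allocate(gpus, 1.0));          // only 0.5 left in total
  return 0;
}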
@ -22,20 +22,17 @@
|
||||||
namespace ray {
|
namespace ray {
|
||||||
|
|
||||||
LocalResourceManager::LocalResourceManager(
|
LocalResourceManager::LocalResourceManager(
|
||||||
int64_t local_node_id, StringIdMap &resource_name_to_id,
|
scheduling::NodeID local_node_id, const NodeResources &node_resources,
|
||||||
const NodeResources &node_resources,
|
|
||||||
std::function<int64_t(void)> get_used_object_store_memory,
|
std::function<int64_t(void)> get_used_object_store_memory,
|
||||||
std::function<bool(void)> get_pull_manager_at_capacity,
|
std::function<bool(void)> get_pull_manager_at_capacity,
|
||||||
std::function<void(const NodeResources &)> resource_change_subscriber)
|
std::function<void(const NodeResources &)> resource_change_subscriber)
|
||||||
: local_node_id_(local_node_id),
|
: local_node_id_(local_node_id),
|
||||||
resource_name_to_id_(resource_name_to_id),
|
|
||||||
get_used_object_store_memory_(get_used_object_store_memory),
|
get_used_object_store_memory_(get_used_object_store_memory),
|
||||||
get_pull_manager_at_capacity_(get_pull_manager_at_capacity),
|
get_pull_manager_at_capacity_(get_pull_manager_at_capacity),
|
||||||
resource_change_subscriber_(resource_change_subscriber) {
|
resource_change_subscriber_(resource_change_subscriber) {
|
||||||
InitResourceUnitInstanceInfo();
|
InitResourceUnitInstanceInfo();
|
||||||
InitLocalResources(node_resources);
|
InitLocalResources(node_resources);
|
||||||
RAY_LOG(DEBUG) << "local resources: "
|
RAY_LOG(DEBUG) << "local resources: " << local_resources_.DebugString();
|
||||||
<< local_resources_.DebugString(resource_name_to_id);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void LocalResourceManager::InitResourceUnitInstanceInfo() {
|
void LocalResourceManager::InitResourceUnitInstanceInfo() {
|
||||||
|
@ -57,14 +54,15 @@ void LocalResourceManager::InitResourceUnitInstanceInfo() {
|
||||||
std::vector<std::string> results;
|
std::vector<std::string> results;
|
||||||
boost::split(results, custom_unit_instance_resources, boost::is_any_of(","));
|
boost::split(results, custom_unit_instance_resources, boost::is_any_of(","));
|
||||||
for (std::string &result : results) {
|
for (std::string &result : results) {
|
||||||
int64_t resource_id = resource_name_to_id_.Insert(result);
|
int64_t resource_id = scheduling::ResourceID(result).ToInt();
|
||||||
custom_unit_instance_resources_.emplace(resource_id);
|
custom_unit_instance_resources_.emplace(resource_id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void LocalResourceManager::AddLocalResourceInstances(
|
void LocalResourceManager::AddLocalResourceInstances(
|
||||||
const std::string &resource_name, const std::vector<FixedPoint> &instances) {
|
scheduling::ResourceID resource_id, const std::vector<FixedPoint> &instances) {
|
||||||
|
auto resource_name = resource_id.Binary();
|
||||||
ResourceInstanceCapacities *node_instances;
|
ResourceInstanceCapacities *node_instances;
|
||||||
local_resources_.predefined_resources.resize(PredefinedResources_MAX);
|
local_resources_.predefined_resources.resize(PredefinedResources_MAX);
|
||||||
if (kCPU_ResourceLabel == resource_name) {
|
if (kCPU_ResourceLabel == resource_name) {
|
||||||
|
@ -76,9 +74,7 @@ void LocalResourceManager::AddLocalResourceInstances(
|
||||||
} else if (kMemory_ResourceLabel == resource_name) {
|
} else if (kMemory_ResourceLabel == resource_name) {
|
||||||
node_instances = &local_resources_.predefined_resources[MEM];
|
node_instances = &local_resources_.predefined_resources[MEM];
|
||||||
} else {
|
} else {
|
||||||
resource_name_to_id_.Insert(resource_name);
|
node_instances = &local_resources_.custom_resources[resource_id.ToInt()];
|
||||||
int64_t resource_id = resource_name_to_id_.Get(resource_name);
|
|
||||||
node_instances = &local_resources_.custom_resources[resource_id];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (node_instances->total.size() < instances.size()) {
|
if (node_instances->total.size() < instances.size()) {
|
||||||
|
@ -93,8 +89,8 @@ void LocalResourceManager::AddLocalResourceInstances(
|
||||||
OnResourceChanged();
|
OnResourceChanged();
|
||||||
}
|
}
|
||||||
|
|
||||||
void LocalResourceManager::DeleteLocalResource(const std::string &resource_name) {
|
void LocalResourceManager::DeleteLocalResource(scheduling::ResourceID resource_id) {
|
||||||
int idx = GetPredefinedResourceIndex(resource_name);
|
int idx = GetPredefinedResourceIndex(resource_id);
|
||||||
if (idx != -1) {
|
if (idx != -1) {
|
||||||
for (auto &total : local_resources_.predefined_resources[idx].total) {
|
for (auto &total : local_resources_.predefined_resources[idx].total) {
|
||||||
total = 0;
|
total = 0;
|
||||||
|
@ -103,26 +99,23 @@ void LocalResourceManager::DeleteLocalResource(const std::string &resource_name)
|
||||||
available = 0;
|
available = 0;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
int64_t resource_id = resource_name_to_id_.Get(resource_name);
|
auto c_itr = local_resources_.custom_resources.find(resource_id.ToInt());
|
||||||
auto c_itr = local_resources_.custom_resources.find(resource_id);
|
|
||||||
if (c_itr != local_resources_.custom_resources.end()) {
|
if (c_itr != local_resources_.custom_resources.end()) {
|
||||||
local_resources_.custom_resources[resource_id].total.clear();
|
local_resources_.custom_resources[resource_id.ToInt()].total.clear();
|
||||||
local_resources_.custom_resources[resource_id].available.clear();
|
local_resources_.custom_resources[resource_id.ToInt()].available.clear();
|
||||||
local_resources_.custom_resources.erase(c_itr);
|
local_resources_.custom_resources.erase(c_itr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
OnResourceChanged();
|
OnResourceChanged();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool LocalResourceManager::IsAvailableResourceEmpty(const std::string &resource_name) {
|
bool LocalResourceManager::IsAvailableResourceEmpty(scheduling::ResourceID resource_id) {
|
||||||
int idx = GetPredefinedResourceIndex(resource_name);
|
int idx = GetPredefinedResourceIndex(resource_id);
|
||||||
|
|
||||||
if (idx != -1) {
|
if (idx != -1) {
|
||||||
return FixedPoint::Sum(local_resources_.predefined_resources[idx].available) <= 0;
|
return FixedPoint::Sum(local_resources_.predefined_resources[idx].available) <= 0;
|
||||||
}
|
}
|
||||||
resource_name_to_id_.Insert(resource_name);
|
auto itr = local_resources_.custom_resources.find(resource_id.ToInt());
|
||||||
int64_t resource_id = resource_name_to_id_.Get(resource_name);
|
|
||||||
auto itr = local_resources_.custom_resources.find(resource_id);
|
|
||||||
if (itr != local_resources_.custom_resources.end()) {
|
if (itr != local_resources_.custom_resources.end()) {
|
||||||
return FixedPoint::Sum(itr->second.available) <= 0;
|
return FixedPoint::Sum(itr->second.available) <= 0;
|
||||||
} else {
|
} else {
|
||||||
|
@ -132,7 +125,7 @@ bool LocalResourceManager::IsAvailableResourceEmpty(const std::string &resource_
|
||||||
|
|
||||||
std::string LocalResourceManager::DebugString(void) const {
|
std::string LocalResourceManager::DebugString(void) const {
|
||||||
std::stringstream buffer;
|
std::stringstream buffer;
|
||||||
buffer << local_resources_.DebugString(resource_name_to_id_);
|
buffer << local_resources_.DebugString();
|
||||||
return buffer.str();
|
return buffer.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -441,7 +434,7 @@ bool LocalResourceManager::AllocateLocalTaskResources(
|
||||||
   RAY_CHECK(task_allocation != nullptr);
   // We don't track object store memory demands so no need to allocate them.
   ResourceRequest resource_request = ResourceMapToResourceRequest(
-      resource_name_to_id_, task_resources, /*requires_object_store_memory=*/false);
+      task_resources, /*requires_object_store_memory=*/false);
   return AllocateLocalTaskResources(resource_request, task_allocation);
 }
@@ -510,8 +503,7 @@ void LocalResourceManager::FillResourceUsage(rpc::ResourcesData &resources_data)
   // Initialize if last report resources is empty.
   if (!last_report_resources_) {
-    NodeResources node_resources =
-        ResourceMapToNodeResources(resource_name_to_id_, {{}}, {{}});
+    NodeResources node_resources = ResourceMapToNodeResources({{}}, {{}});
     last_report_resources_.reset(new NodeResources(node_resources));
   }
@@ -533,7 +525,7 @@ void LocalResourceManager::FillResourceUsage(rpc::ResourcesData &resources_data)
     uint64_t custom_id = it.first;
     const auto &capacity = it.second;
     const auto &last_capacity = last_report_resources_->custom_resources[custom_id];
-    const auto &label = resource_name_to_id_.Get(custom_id);
+    auto label = scheduling::ResourceID(custom_id).Binary();
     // Note: available may be negative, but only report positive to GCS.
     if (capacity.available != last_capacity.available && capacity.available > 0) {
       resources_data.set_resources_available_changed(true);
@@ -541,7 +533,8 @@ void LocalResourceManager::FillResourceUsage(rpc::ResourcesData &resources_data)
           capacity.available.Double();
     }
     if (capacity.total != last_capacity.total) {
-      (*resources_data.mutable_resources_total())[label] = capacity.total.Double();
+      (*resources_data.mutable_resources_total())[std::move(label)] =
+          capacity.total.Double();
     }
   }
@@ -586,7 +579,7 @@ ray::gcs::NodeResourceInfoAccessor::ResourceMap LocalResourceManager::GetResourc
   }

   for (auto entry : local_resources_.custom_resources) {
-    std::string resource_name = resource_name_to_id_.Get(entry.first);
+    std::string resource_name = scheduling::ResourceID(entry.first).Binary();
     double resource_total = FixedPoint::Sum(entry.second.total).Double();
     if (!resource_map_filter.contains(resource_name)) {
       continue;
@@ -646,36 +639,28 @@ std::string LocalResourceManager::SerializedTaskResourceInstances(

 void LocalResourceManager::ResetLastReportResourceUsage(
     const SchedulingResources &replacement) {
-  last_report_resources_ = std::make_unique<NodeResources>(ResourceMapToNodeResources(
-      resource_name_to_id_, replacement.GetTotalResources().GetResourceMap(),
+  last_report_resources_ = std::make_unique<NodeResources>(
+      ResourceMapToNodeResources(replacement.GetTotalResources().GetResourceMap(),
       replacement.GetAvailableResources().GetResourceMap()));
 }

-bool LocalResourceManager::ResourcesExist(const std::string &resource_name) {
-  int idx = GetPredefinedResourceIndex(resource_name);
+bool LocalResourceManager::ResourcesExist(scheduling::ResourceID resource_id) {
+  int idx = GetPredefinedResourceIndex(resource_id);
   if (idx != -1) {
     // Return true directly for predefined resources as we always initialize this kind of
     // resources at the beginning.
     return true;
   } else {
-    int64_t resource_id = resource_name_to_id_.Get(resource_name);
-    const auto &it = local_resources_.custom_resources.find(resource_id);
+    const auto &it = local_resources_.custom_resources.find(resource_id.ToInt());
     return it != local_resources_.custom_resources.end();
   }
 }

-int GetPredefinedResourceIndex(const std::string &resource_name) {
-  int idx = -1;
-  if (resource_name == ray::kCPU_ResourceLabel) {
-    idx = (int)ray::CPU;
-  } else if (resource_name == ray::kGPU_ResourceLabel) {
-    idx = (int)ray::GPU;
-  } else if (resource_name == ray::kObjectStoreMemory_ResourceLabel) {
-    idx = (int)ray::OBJECT_STORE_MEM;
-  } else if (resource_name == ray::kMemory_ResourceLabel) {
-    idx = (int)ray::MEM;
-  };
-  return idx;
-}
+int GetPredefinedResourceIndex(scheduling::ResourceID resource_id) {
+  if (resource_id.ToInt() >= 0 && resource_id.ToInt() < PredefinedResources_MAX) {
+    return resource_id.ToInt();
+  }
+  return -1;
+}

 }  // namespace ray
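For illustration, a minimal sketch of how the rewritten GetPredefinedResourceIndex behaves. The driver function and header path are assumptions for the example, not part of the patch; it relies on the Resource singleton map being pre-populated so that "CPU" interns to ray::CPU, and on custom resources being interned outside the predefined range.

    #include <cassert>
    #include "ray/raylet/scheduling/local_resource_manager.h"  // assumed header path

    void PredefinedIndexExample() {
      using ray::scheduling::ResourceID;
      // "CPU" is pre-populated at enum value ray::CPU, so the predefined
      // index is simply the interned integer itself.
      assert(ray::GetPredefinedResourceIndex(ResourceID("CPU")) ==
             static_cast<int>(ray::CPU));
      // A custom resource interns outside [0, PredefinedResources_MAX),
      // so the lookup reports "not predefined".
      assert(ray::GetPredefinedResourceIndex(ResourceID("custom_accel")) == -1);
    }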
@@ -40,30 +40,29 @@ namespace ray {
 class LocalResourceManager {
  public:
   LocalResourceManager(
-      int64_t local_node_id, StringIdMap &resource_name_to_id,
-      const NodeResources &node_resources,
+      scheduling::NodeID local_node_id, const NodeResources &node_resources,
       std::function<int64_t(void)> get_used_object_store_memory,
       std::function<bool(void)> get_pull_manager_at_capacity,
       std::function<void(const NodeResources &)> resource_change_subscriber);

-  int64_t GetNodeId() const { return local_node_id_; }
+  scheduling::NodeID GetNodeId() const { return local_node_id_; }

   /// Add a local resource that is available.
   ///
   /// \param resource_name: Resource which we want to update.
   /// \param resource_total: New capacity of the resource.
-  void AddLocalResourceInstances(const std::string &resource_name,
+  void AddLocalResourceInstances(scheduling::ResourceID resource_id,
                                  const std::vector<FixedPoint> &instances);

   /// Delete a given resource from the local node.
   ///
   /// \param resource_name: Resource we want to delete
-  void DeleteLocalResource(const std::string &resource_name);
+  void DeleteLocalResource(scheduling::ResourceID resource_id);

   /// Check whether the available resources are empty.
   ///
   /// \param resource_name: Resource which we want to check.
-  bool IsAvailableResourceEmpty(const std::string &resource_name);
+  bool IsAvailableResourceEmpty(scheduling::ResourceID resource_id);

   /// Return local resources.
   NodeResourceInstances GetLocalResources() const { return local_resources_; }
@@ -164,7 +163,7 @@ class LocalResourceManager {
   /// \param resource_name: the specific resource name.
   ///
   /// \return true, if exist. otherwise, false.
-  bool ResourcesExist(const std::string &resource_name);
+  bool ResourcesExist(scheduling::ResourceID resource_id);

  private:
   /// Create instances for each resource associated with the local node, given
@@ -270,10 +269,7 @@ class LocalResourceManager {
   void UpdateAvailableObjectStoreMemResource();

   /// Identifier of local node.
-  int64_t local_node_id_;
-  /// Keep the mapping between node and resource IDs in string representation
-  /// to integer representation. Used for improving map performance.
-  StringIdMap &resource_name_to_id_;
+  scheduling::NodeID local_node_id_;
   /// Resources of local node.
   NodeResourceInstances local_resources_;
   /// Cached resources, used to compare with newest one in light heartbeat mode.
@@ -301,6 +297,6 @@ class LocalResourceManager {
   FRIEND_TEST(ClusterResourceSchedulerTest, CustomResourceInstanceTest);
 };

-int GetPredefinedResourceIndex(const std::string &resource_name);
+int GetPredefinedResourceIndex(scheduling::ResourceID resource_id);

 }  // end namespace ray
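As a usage sketch of the typed header API above; the call site, the resource name, and the header path are illustrative assumptions:

    #include <vector>
    #include "ray/raylet/scheduling/local_resource_manager.h"  // assumed header path

    void AddAndQueryCustomResource(ray::LocalResourceManager &manager) {
      // One resource instance of capacity 2, using Ray's FixedPoint type.
      std::vector<ray::FixedPoint> instances = {ray::FixedPoint(2.0)};
      manager.AddLocalResourceInstances(ray::scheduling::ResourceID("custom"),
                                        instances);
      // Queries now take the same typed ID instead of a raw string.
      bool exists = manager.ResourcesExist(ray::scheduling::ResourceID("custom"));
      (void)exists;
    }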
@@ -315,13 +315,14 @@ void LocalTaskManager::SpillWaitingTasks() {
       // TODO(swang): The policy currently does not account for the amount of
       // object store memory availability. Ideally, we should pick the node with
       // the most memory availability.
-      std::string node_id_string = cluster_resource_scheduler_->GetBestSchedulableNode(
+      auto scheduling_node_id = cluster_resource_scheduler_->GetBestSchedulableNode(
           (*it)->task.GetTaskSpecification(),
           /*prioritize_local_node*/ true,
           /*exclude_local_node*/ force_spillback,
           /*requires_object_store_memory*/ true, &is_infeasible);
-      if (!node_id_string.empty() && node_id_string != self_node_id_.Binary()) {
-        NodeID node_id = NodeID::FromBinary(node_id_string);
+      if (!scheduling_node_id.IsNil() &&
+          scheduling_node_id.Binary() != self_node_id_.Binary()) {
+        NodeID node_id = NodeID::FromBinary(scheduling_node_id.Binary());
         Spillback(node_id, *it);
         if (!task.GetTaskSpecification().GetDependencies().empty()) {
           task_dependency_manager_.RemoveTaskDependencies(
@@ -330,7 +331,7 @@ void LocalTaskManager::SpillWaitingTasks() {
         waiting_tasks_index_.erase(task_id);
         it = waiting_task_queue_.erase(it);
       } else {
-        if (node_id_string.empty()) {
+        if (scheduling_node_id.IsNil()) {
           RAY_LOG(DEBUG) << "RayTask " << task_id
                          << " has blocked dependencies, but no other node has resources, "
                             "keeping the task local";
@@ -347,17 +348,17 @@ void LocalTaskManager::SpillWaitingTasks() {

 bool LocalTaskManager::TrySpillback(const std::shared_ptr<internal::Work> &work,
                                     bool &is_infeasible) {
-  std::string node_id_string = cluster_resource_scheduler_->GetBestSchedulableNode(
+  auto scheduling_node_id = cluster_resource_scheduler_->GetBestSchedulableNode(
       work->task.GetTaskSpecification(), work->PrioritizeLocalNode(),
       /*exclude_local_node*/ false,
       /*requires_object_store_memory*/ false, &is_infeasible);

-  if (is_infeasible || node_id_string == self_node_id_.Binary() ||
-      node_id_string.empty()) {
+  if (is_infeasible || scheduling_node_id.IsNil() ||
+      scheduling_node_id.Binary() == self_node_id_.Binary()) {
     return false;
   }

-  NodeID node_id = NodeID::FromBinary(node_id_string);
+  NodeID node_id = NodeID::FromBinary(scheduling_node_id.Binary());
   Spillback(node_id, work);
   return true;
 }
@@ -387,7 +388,7 @@ bool LocalTaskManager::PoppedWorkerHandler(
       task.GetTaskSpecification().GetRequiredResources().GetResourceMap();
   for (auto &entry : required_resource) {
     if (!cluster_resource_scheduler_->GetLocalResourceManager().ResourcesExist(
-            entry.first)) {
+            scheduling::ResourceID(entry.first))) {
       RAY_CHECK(task.GetTaskSpecification().PlacementGroupBundleId().first !=
                 PlacementGroupID::Nil());
       RAY_LOG(DEBUG) << "The placement group: "
@@ -522,7 +523,8 @@ void LocalTaskManager::Spillback(const NodeID &spillback_to,
   RAY_LOG(DEBUG) << "Spilling task " << task_spec.TaskId() << " to node " << spillback_to;

   if (!cluster_resource_scheduler_->AllocateRemoteTaskResources(
-          spillback_to.Binary(), task_spec.GetRequiredResources().GetResourceMap())) {
+          scheduling::NodeID(spillback_to.Binary()),
+          task_spec.GetRequiredResources().GetResourceMap())) {
     RAY_LOG(DEBUG) << "Tried to allocate resources for request " << task_spec.TaskId()
                    << " on a remote node that are no longer available";
   }
@@ -984,7 +986,6 @@ bool LocalTaskManager::ReturnCpuResourcesToBlockedWorker(

 ResourceSet LocalTaskManager::CalcNormalTaskResources() const {
   absl::flat_hash_map<std::string, FixedPoint> total_normal_task_resources;
-  const auto &string_id_map = cluster_resource_scheduler_->GetStringIdMap();
   for (auto &entry : leased_workers_) {
     std::shared_ptr<WorkerInterface> worker = entry.second;
     auto &task_spec = worker->GetAssignedTask().GetTaskSpecification();
@@ -1009,7 +1010,8 @@ ResourceSet LocalTaskManager::CalcNormalTaskResources() const {
     }
     for (auto &entry : resource_request.custom_resources) {
       if (entry.second > 0) {
-        total_normal_task_resources[string_id_map.Get(entry.first)] += entry.second;
+        total_normal_task_resources[scheduling::ResourceID(entry.first).Binary()] +=
+            entry.second;
       }
     }
   }
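A condensed sketch of the new spillback idiom above: GetBestSchedulableNode now returns a typed scheduling::NodeID, so "no candidate" is signalled by IsNil() rather than an empty binary string. The helper below is hypothetical and simply mirrors the control flow of SpillWaitingTasks and TrySpillback:

    // Assumes a ClusterResourceScheduler exposing GetBestSchedulableNode
    // with the signature used in this patch.
    void MaybeSpill(ClusterResourceScheduler &scheduler, const RayTask &task,
                    const NodeID &self_node_id) {
      bool is_infeasible = false;
      auto target = scheduler.GetBestSchedulableNode(
          task.GetTaskSpecification(),
          /*prioritize_local_node*/ true,
          /*exclude_local_node*/ false,
          /*requires_object_store_memory*/ false, &is_infeasible);
      if (target.IsNil() || target.Binary() == self_node_id.Binary()) {
        return;  // No remote candidate; keep the task local.
      }
      // Convert back to a raylet NodeID for the spillback/RPC layer.
      NodeID node_id = NodeID::FromBinary(target.Binary());
      // Spillback(node_id, work);
    }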
@@ -14,6 +14,8 @@
 #include "ray/raylet/scheduling/scheduling_ids.h"

+namespace ray {
+
 int64_t StringIdMap::Get(const std::string &string_id) const {
   auto it = string_to_int_.find(string_id);
   if (it == string_to_int_.end()) {
@@ -68,3 +70,5 @@ StringIdMap &StringIdMap::InsertOrDie(const std::string &string_id, int64_t valu
 }

 int64_t StringIdMap::Count() { return string_to_int_.size(); }
+
+} // namespace ray
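The StringIdMap contract exercised above, as a small round-trip sketch; the assumption that Insert returns the interned id follows from its use in scheduling_ids.h below:

    #include <cassert>
    #include "ray/raylet/scheduling/scheduling_ids.h"

    void StringIdMapRoundTrip() {
      ray::StringIdMap map;
      int64_t id = map.Insert("custom_resource");
      assert(map.Get(id) == "custom_resource");  // int -> string
      assert(map.Get("custom_resource") == id);  // string -> int
      assert(map.Count() == 1);
    }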
@@ -14,6 +14,7 @@
 #pragma once

+#include <functional>
 #include <string>

 #include "absl/container/flat_hash_map.h"
@@ -22,6 +23,17 @@
 /// Limit the ID range to test for collisions.
 #define MAX_ID_TEST 8

+namespace ray {
+
+/// List of predefined resources.
+enum PredefinedResources { CPU, MEM, GPU, OBJECT_STORE_MEM, PredefinedResources_MAX };
+
+const std::string kCPU_ResourceLabel = "CPU";
+const std::string kGPU_ResourceLabel = "GPU";
+const std::string kObjectStoreMemory_ResourceLabel = "object_store_memory";
+const std::string kMemory_ResourceLabel = "memory";
+const std::string kBundle_ResourceLabel = "bundle";
+
 /// Class to map string IDs to unique integer IDs and back.
 class StringIdMap {
   absl::flat_hash_map<std::string, int64_t> string_to_int_;
@@ -62,3 +74,85 @@ class StringIdMap {
   /// Get number of identifiers.
   int64_t Count();
 };

+enum class SchedulingIDTag { Node, Resource };
+
+/// Represent a string scheduling id. It optimizes the storage by
+/// using a singleton StringIdMap, and only store the integer index as
+/// its only member.
+///
+/// Note: this class is not thread safe!
+template <SchedulingIDTag T>
+class BaseSchedulingID {
+ public:
+  explicit BaseSchedulingID(const std::string &name) : id_{GetMap().Insert(name)} {}
+
+  explicit BaseSchedulingID(int64_t id) : id_{id} {}
+
+  int64_t ToInt() const { return id_; }
+
+  std::string Binary() const { return GetMap().Get(id_); }
+
+  bool operator==(const BaseSchedulingID &rhs) const { return id_ == rhs.id_; }
+
+  bool operator!=(const BaseSchedulingID &rhs) const { return id_ != rhs.id_; }
+
+  bool operator<(const BaseSchedulingID &rhs) const { return id_ < rhs.id_; }
+
+  bool IsNil() const { return id_ == -1; }
+
+  static BaseSchedulingID Nil() { return BaseSchedulingID(-1); }
+
+ private:
+  /// Meyer's singleton to store the StringIdMap.
+  static StringIdMap &GetMap() {
+    static StringIdMap map;
+    return map;
+  }
+  int64_t id_ = -1;
+};
+
+template <ray::SchedulingIDTag T>
+std::ostream &operator<<(std::ostream &os, const ray::BaseSchedulingID<T> &id) {
+  os << id.ToInt();
+  return os;
+}
+
+template <>
+inline std::ostream &operator<<(
+    std::ostream &os, const ray::BaseSchedulingID<SchedulingIDTag::Resource> &id) {
+  os << id.Binary();
+  return os;
+}
+
+/// Specialization for SchedulingIDTag. Specifically, we populate
+/// the singleton map with PredefinedResources.
+template <>
+inline StringIdMap &BaseSchedulingID<SchedulingIDTag::Resource>::GetMap() {
+  static StringIdMap map = []() {
+    StringIdMap map;
+    return map.InsertOrDie(kCPU_ResourceLabel, CPU)
+        .InsertOrDie(kGPU_ResourceLabel, GPU)
+        .InsertOrDie(kObjectStoreMemory_ResourceLabel, OBJECT_STORE_MEM)
+        .InsertOrDie(kMemory_ResourceLabel, MEM);
+  }();
+  return map;
+}
+
+namespace scheduling {
+/// The actual scheduling id definitions which are used in scheduler.
+using ResourceID = BaseSchedulingID<SchedulingIDTag::Resource>;
+using NodeID = BaseSchedulingID<SchedulingIDTag::Node>;
+}  // namespace scheduling
+
+} // namespace ray
+
+/// implements hash function for BaseSchedulingID<T>
+namespace std {
+template <ray::SchedulingIDTag T>
+struct hash<ray::BaseSchedulingID<T>> {
+  std::size_t operator()(const ray::BaseSchedulingID<T> &id) const {
+    return std::hash<int64_t>()(id.ToInt());
+  }
+};
+} // namespace std
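To make the new abstraction concrete, a short sketch of the typed IDs in use; the driver is illustrative only, and the expected values follow from the class definition above and the unit tests below:

    #include <iostream>
    #include "ray/raylet/scheduling/scheduling_ids.h"

    int main() {
      using ray::scheduling::NodeID;
      using ray::scheduling::ResourceID;

      // Interning: equal strings yield equal ids, and Binary() round-trips.
      NodeID a("node-1");
      NodeID b("node-1");
      std::cout << (a == b) << " " << a.Binary() << "\n";  // prints: 1 node-1

      // The Resource specialization is pre-populated, so predefined labels
      // map onto the PredefinedResources enum values.
      std::cout << ResourceID("CPU").ToInt() << "\n";  // == ray::CPU
      return 0;
    }

Keeping the map behind a Meyer's singleton means call sites no longer have to thread a StringIdMap through every scheduler interface, which is exactly the resource_name_to_id_ plumbing this commit removes elsewhere.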
52
src/ray/raylet/scheduling/scheduling_ids_test.cc
Normal file
@@ -0,0 +1,52 @@
+// Copyright 2021 The Ray Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ray/raylet/scheduling/scheduling_ids.h"
+
+#include "gtest/gtest.h"
+
+namespace ray {
+using namespace ray::scheduling;
+
+struct SchedulingIDsTest : public ::testing::Test {};
+
+TEST_F(SchedulingIDsTest, BasicTest) {
+  std::vector<std::string> string_ids = {"hello", "whaaat", "yes"};
+  std::vector<NodeID> node_ids;
+  for (auto &string_id : string_ids) {
+    node_ids.emplace_back(NodeID(string_id));
+    ASSERT_EQ(node_ids.back().Binary(), string_id);
+  }
+  ASSERT_EQ(node_ids[0], NodeID(string_ids[0]));
+  ASSERT_EQ(node_ids[0], NodeID(node_ids[0].ToInt()));
+
+  ASSERT_TRUE(NodeID::Nil().IsNil());
+  ASSERT_EQ(NodeID::Nil().ToInt(), -1);
+  ASSERT_EQ(NodeID::Nil().Binary(), "-1");
+
+  ASSERT_EQ(NodeID(13), NodeID(13));
+  ASSERT_NE(NodeID(1), NodeID(2));
+  ASSERT_TRUE(NodeID(1) < NodeID(2));
+}
+
+TEST_F(SchedulingIDsTest, PrepopulateResourceIDTest) {
+  ASSERT_EQ(kCPU_ResourceLabel, ResourceID(CPU).Binary());
+  ASSERT_EQ(kGPU_ResourceLabel, ResourceID(GPU).Binary());
+  ASSERT_EQ(kObjectStoreMemory_ResourceLabel, ResourceID(OBJECT_STORE_MEM).Binary());
+  ASSERT_EQ(kMemory_ResourceLabel, ResourceID(MEM).Binary());
+
+  // mean while NodeID is not populated.
+  ASSERT_NE(kCPU_ResourceLabel, NodeID(CPU).Binary());
+}
+} // namespace ray
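Beyond hashing, the operator< defined on BaseSchedulingID also makes the typed IDs usable as keys of ordered containers, which the scheduling policy test further below relies on with its std::map of decisions. A trivial sketch:

    #include <cstddef>
    #include <map>
    #include "ray/raylet/scheduling/scheduling_ids.h"

    void OrderedKeysExample() {
      // std::map orders keys by the interned integer via operator<.
      std::map<ray::scheduling::NodeID, std::size_t> decisions;
      decisions[ray::scheduling::NodeID(0)] = 1;
      decisions[ray::scheduling::NodeID(1)] = 2;
    }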
@@ -38,10 +38,10 @@ bool DoesNodeHaveGPUs(const NodeResources &resources) {
 }
 }  // namespace

-int64_t SchedulingPolicy::SpreadPolicy(const ResourceRequest &resource_request,
-                                       bool force_spillback, bool require_available,
-                                       std::function<bool(int64_t)> is_node_available) {
-  std::vector<int64_t> round;
+scheduling::NodeID SchedulingPolicy::SpreadPolicy(
+    const ResourceRequest &resource_request, bool force_spillback, bool require_available,
+    std::function<bool(scheduling::NodeID)> is_node_available) {
+  std::vector<scheduling::NodeID> round;
   round.reserve(nodes_.size());
   for (const auto &pair : nodes_) {
     round.emplace_back(pair.first);
@@ -69,19 +69,19 @@ int64_t SchedulingPolicy::SpreadPolicy(const ResourceRequest &resource_request,
                       is_node_available);
 }

-int64_t SchedulingPolicy::HybridPolicyWithFilter(
+scheduling::NodeID SchedulingPolicy::HybridPolicyWithFilter(
     const ResourceRequest &resource_request, float spread_threshold, bool force_spillback,
-    bool require_available, std::function<bool(int64_t)> is_node_available,
+    bool require_available, std::function<bool(scheduling::NodeID)> is_node_available,
     NodeFilter node_filter) {
   // Step 1: Generate the traversal order. We guarantee that the first node is local, to
   // encourage local scheduling. The rest of the traversal order should be globally
   // consistent, to encourage using "warm" workers.
-  std::vector<int64_t> round;
+  std::vector<scheduling::NodeID> round;
   round.reserve(nodes_.size());
   const auto local_it = nodes_.find(local_node_id_);
   RAY_CHECK(local_it != nodes_.end());
   auto predicate = [node_filter, &is_node_available](
-                       int64_t node_id, const NodeResources &node_resources) {
+                       scheduling::NodeID node_id, const NodeResources &node_resources) {
     if (!is_node_available(node_id)) {
       return false;
     }
@@ -116,7 +116,7 @@ int64_t SchedulingPolicy::HybridPolicyWithFilter(
   // place.
   std::sort(round.begin() + start_index, round.end());

-  int64_t best_node_id = -1;
+  scheduling::NodeID best_node_id = scheduling::NodeID::Nil();
   float best_utilization_score = INFINITY;
   bool best_is_available = false;
@@ -143,12 +143,12 @@ int64_t SchedulingPolicy::HybridPolicyWithFilter(
                                              ignore_pull_manager_at_capacity);
     float critical_resource_utilization =
         node.GetLocalView().CalculateCriticalResourceUtilization();
-    RAY_LOG(DEBUG) << "Node " << node_id << " is "
+    RAY_LOG(DEBUG) << "Node " << node_id.ToInt() << " is "
                    << (is_available ? "available" : "not available") << " for request "
                    << resource_request.DebugString()
                    << " with critical resource utilization "
                    << critical_resource_utilization << " based on local view "
-                   << node.GetLocalView().DebugString(StringIdMap());
+                   << node.GetLocalView().DebugString();
     if (critical_resource_utilization < spread_threshold) {
       critical_resource_utilization = 0;
     }
@@ -180,11 +180,10 @@ int64_t SchedulingPolicy::HybridPolicyWithFilter(
   return best_node_id;
 }

-int64_t SchedulingPolicy::HybridPolicy(const ResourceRequest &resource_request,
-                                       float spread_threshold, bool force_spillback,
-                                       bool require_available,
-                                       std::function<bool(int64_t)> is_node_available,
-                                       bool scheduler_avoid_gpu_nodes) {
+scheduling::NodeID SchedulingPolicy::HybridPolicy(
+    const ResourceRequest &resource_request, float spread_threshold, bool force_spillback,
+    bool require_available, std::function<bool(scheduling::NodeID)> is_node_available,
+    bool scheduler_avoid_gpu_nodes) {
   if (!scheduler_avoid_gpu_nodes || IsGPURequest(resource_request)) {
     return HybridPolicyWithFilter(resource_request, spread_threshold, force_spillback,
                                   require_available, std::move(is_node_available));
@@ -194,7 +193,7 @@ int64_t SchedulingPolicy::HybridPolicy(const ResourceRequest &resource_request,
   auto best_node_id = HybridPolicyWithFilter(
       resource_request, spread_threshold, force_spillback,
       /*require_available*/ true, is_node_available, NodeFilter::kNonGpu);
-  if (best_node_id != -1) {
+  if (!best_node_id.IsNil()) {
     return best_node_id;
   }
@@ -204,9 +203,10 @@ int64_t SchedulingPolicy::HybridPolicy(const ResourceRequest &resource_request,
                                 require_available, is_node_available);
 }

-int64_t SchedulingPolicy::RandomPolicy(const ResourceRequest &resource_request,
-                                       std::function<bool(int64_t)> is_node_available) {
-  int64_t best_node = -1;
+scheduling::NodeID SchedulingPolicy::RandomPolicy(
+    const ResourceRequest &resource_request,
+    std::function<bool(scheduling::NodeID)> is_node_available) {
+  scheduling::NodeID best_node = scheduling::NodeID::Nil();
   if (nodes_.empty()) {
     return best_node;
   }
@@ -230,7 +230,7 @@ int64_t SchedulingPolicy::RandomPolicy(const ResourceRequest &resource_request,
       iter = nodes_.begin();
     }
   }
-  RAY_LOG(DEBUG) << "RandomPolicy, best_node = " << best_node
+  RAY_LOG(DEBUG) << "RandomPolicy, best_node = " << best_node.ToInt()
                  << ", # nodes = " << nodes_.size()
                  << ", resource_request = " << resource_request.DebugString();
   return best_node;
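A hedged usage sketch of the updated policy entry points, mirroring the unit tests in scheduling_policy_test.cc further below; the CreateNodeResources helper is assumed from that test file:

    absl::flat_hash_map<scheduling::NodeID, Node> nodes;
    nodes.emplace(scheduling::NodeID(0), CreateNodeResources(8, 8, 0, 0, 0, 0));
    nodes.emplace(scheduling::NodeID(1), CreateNodeResources(4, 8, 0, 0, 0, 0));

    raylet_scheduling_policy::SchedulingPolicy policy(scheduling::NodeID(0), nodes);
    ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}}, false);
    auto picked = policy.HybridPolicy(
        req, /*spread_threshold*/ 0.5, /*force_spillback*/ false,
        /*require_available*/ false, [](auto) { return true; });
    if (picked.IsNil()) {
      // The request is infeasible on every node in the cluster view.
    }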
@@ -25,7 +25,8 @@ namespace raylet_scheduling_policy {

 class SchedulingPolicy {
  public:
-  SchedulingPolicy(int64_t local_node_id, const absl::flat_hash_map<int64_t, Node> &nodes)
+  SchedulingPolicy(scheduling::NodeID local_node_id,
+                   const absl::flat_hash_map<scheduling::NodeID, Node> &nodes)
       : local_node_id_(local_node_id),
         nodes_(nodes),
         gen_(std::chrono::high_resolution_clock::now().time_since_epoch().count()) {}
@@ -62,30 +63,31 @@ class SchedulingPolicy {
   ///
   /// \return -1 if the task is unfeasible, otherwise the node id (key in `nodes`) to
   /// schedule on.
-  int64_t HybridPolicy(
+  scheduling::NodeID HybridPolicy(
       const ResourceRequest &resource_request, float spread_threshold,
       bool force_spillback, bool require_available,
-      std::function<bool(int64_t)> is_node_available,
+      std::function<bool(scheduling::NodeID)> is_node_available,
       bool scheduler_avoid_gpu_nodes = RayConfig::instance().scheduler_avoid_gpu_nodes());

   /// Round robin among available nodes.
   /// If there are no available nodes, fallback to hybrid policy.
-  int64_t SpreadPolicy(const ResourceRequest &resource_request, bool force_spillback,
-                       bool require_available,
-                       std::function<bool(int64_t)> is_node_available);
+  scheduling::NodeID SpreadPolicy(
+      const ResourceRequest &resource_request, bool force_spillback,
+      bool require_available, std::function<bool(scheduling::NodeID)> is_node_available);

   /// Policy that "randomly" picks a node that could fulfil the request.
   /// TODO(scv119): if there are a lot of nodes died or can't fulfill the resource
   /// requirement, the distribution might not be even.
-  int64_t RandomPolicy(const ResourceRequest &resource_request,
-                       std::function<bool(int64_t)> is_node_available);
+  scheduling::NodeID RandomPolicy(
+      const ResourceRequest &resource_request,
+      std::function<bool(scheduling::NodeID)> is_node_available);

  private:
   /// Identifier of local node.
-  const int64_t local_node_id_;
+  const scheduling::NodeID local_node_id_;
   /// List of nodes in the clusters and their resources organized as a map.
   /// The key of the map is the node ID.
-  const absl::flat_hash_map<int64_t, Node> &nodes_;
+  const absl::flat_hash_map<scheduling::NodeID, Node> &nodes_;
   // The node to start round robin if it's spread scheduling.
   // The index may be inaccurate when nodes are added or removed dynamically,
   // but it should still be better than always scanning from 0 for spread scheduling.
@@ -111,11 +113,11 @@ class SchedulingPolicy {
   ///
   /// \return -1 if the task is unfeasible, otherwise the node id (key in `nodes`) to
   /// schedule on.
-  int64_t HybridPolicyWithFilter(const ResourceRequest &resource_request,
-                                 float spread_threshold, bool force_spillback,
-                                 bool require_available,
-                                 std::function<bool(int64_t)> is_node_available,
-                                 NodeFilter node_filter = NodeFilter::kAny);
+  scheduling::NodeID HybridPolicyWithFilter(
+      const ResourceRequest &resource_request, float spread_threshold,
+      bool force_spillback, bool require_available,
+      std::function<bool(scheduling::NodeID)> is_node_available,
+      NodeFilter node_filter = NodeFilter::kAny);
 };
 } // namespace raylet_scheduling_policy
 } // namespace ray
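Note that nodes_ above is keyed directly by scheduling::NodeID; this works because scheduling_ids.h supplies the std::hash specialization shown earlier, which the hasher used by this codebase picks up. A minimal sketch:

    #include "absl/container/flat_hash_map.h"
    #include "ray/raylet/scheduling/scheduling_ids.h"

    void TypedKeyExample() {
      // The typed ID hashes through std::hash<int64_t> on its interned id.
      absl::flat_hash_map<ray::scheduling::NodeID, int> ranks;
      ranks.emplace(ray::scheduling::NodeID("head"), 0);
      ranks.emplace(ray::scheduling::NodeID("worker-1"), 1);
    }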
@ -33,27 +33,28 @@ NodeResources CreateNodeResources(double available_cpu, double total_cpu,
|
||||||
return resources;
|
return resources;
|
||||||
}
|
}
|
||||||
|
|
||||||
class SchedulingPolicyTest : public ::testing::Test {};
|
class SchedulingPolicyTest : public ::testing::Test {
|
||||||
|
public:
|
||||||
|
scheduling::NodeID local_node = scheduling::NodeID(0);
|
||||||
|
scheduling::NodeID remote_node = scheduling::NodeID(1);
|
||||||
|
scheduling::NodeID remote_node_2 = scheduling::NodeID(2);
|
||||||
|
scheduling::NodeID remote_node_3 = scheduling::NodeID(3);
|
||||||
|
absl::flat_hash_map<scheduling::NodeID, Node> nodes;
|
||||||
|
};
|
||||||
|
|
||||||
TEST_F(SchedulingPolicyTest, SpreadPolicyTest) {
|
TEST_F(SchedulingPolicyTest, SpreadPolicyTest) {
|
||||||
StringIdMap map;
|
ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}}, false);
|
||||||
ResourceRequest req = ResourceMapToResourceRequest(map, {{"CPU", 1}}, false);
|
|
||||||
int64_t local_node = 0;
|
|
||||||
int64_t remote_node_1 = 1;
|
|
||||||
int64_t remote_node_2 = 2;
|
|
||||||
int64_t remote_node_3 = 3;
|
|
||||||
|
|
||||||
absl::flat_hash_map<int64_t, Node> nodes;
|
|
||||||
nodes.emplace(local_node, CreateNodeResources(20, 20, 0, 0, 0, 0));
|
nodes.emplace(local_node, CreateNodeResources(20, 20, 0, 0, 0, 0));
|
||||||
// Unavailable node
|
// Unavailable node
|
||||||
nodes.emplace(remote_node_1, CreateNodeResources(0, 20, 0, 0, 0, 0));
|
nodes.emplace(remote_node, CreateNodeResources(0, 20, 0, 0, 0, 0));
|
||||||
// Infeasible node
|
// Infeasible node
|
||||||
nodes.emplace(remote_node_2, CreateNodeResources(0, 0, 0, 0, 0, 0));
|
nodes.emplace(remote_node_2, CreateNodeResources(0, 0, 0, 0, 0, 0));
|
||||||
nodes.emplace(remote_node_3, CreateNodeResources(20, 20, 0, 0, 0, 0));
|
nodes.emplace(remote_node_3, CreateNodeResources(20, 20, 0, 0, 0, 0));
|
||||||
|
|
||||||
raylet_scheduling_policy::SchedulingPolicy scheduling_policy(local_node, nodes);
|
raylet_scheduling_policy::SchedulingPolicy scheduling_policy(local_node, nodes);
|
||||||
|
|
||||||
int64_t to_schedule =
|
auto to_schedule =
|
||||||
scheduling_policy.SpreadPolicy(req, false, false, [](auto) { return true; });
|
scheduling_policy.SpreadPolicy(req, false, false, [](auto) { return true; });
|
||||||
ASSERT_EQ(to_schedule, local_node);
|
ASSERT_EQ(to_schedule, local_node);
|
||||||
|
|
||||||
|
@ -67,17 +68,10 @@ TEST_F(SchedulingPolicyTest, SpreadPolicyTest) {
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(SchedulingPolicyTest, RandomPolicyTest) {
|
TEST_F(SchedulingPolicyTest, RandomPolicyTest) {
|
||||||
StringIdMap map;
|
ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}}, false);
|
||||||
ResourceRequest req = ResourceMapToResourceRequest(map, {{"CPU", 1}}, false);
|
|
||||||
int64_t local_node = 0;
|
|
||||||
int64_t remote_node_1 = 1;
|
|
||||||
int64_t remote_node_2 = 2;
|
|
||||||
int64_t remote_node_3 = 3;
|
|
||||||
|
|
||||||
absl::flat_hash_map<int64_t, Node> nodes;
|
|
||||||
|
|
||||||
nodes.emplace(local_node, CreateNodeResources(20, 20, 0, 0, 0, 0));
|
nodes.emplace(local_node, CreateNodeResources(20, 20, 0, 0, 0, 0));
|
||||||
nodes.emplace(remote_node_1, CreateNodeResources(20, 20, 0, 0, 0, 0));
|
nodes.emplace(remote_node, CreateNodeResources(20, 20, 0, 0, 0, 0));
|
||||||
// Unavailable node
|
// Unavailable node
|
||||||
nodes.emplace(remote_node_2, CreateNodeResources(0, 20, 0, 0, 0, 0));
|
nodes.emplace(remote_node_2, CreateNodeResources(0, 20, 0, 0, 0, 0));
|
||||||
// Infeasible node
|
// Infeasible node
|
||||||
|
@ -85,14 +79,14 @@ TEST_F(SchedulingPolicyTest, RandomPolicyTest) {
|
||||||
|
|
||||||
raylet_scheduling_policy::SchedulingPolicy scheduling_policy(local_node, nodes);
|
raylet_scheduling_policy::SchedulingPolicy scheduling_policy(local_node, nodes);
|
||||||
|
|
||||||
std::map<int64_t, size_t> decisions;
|
std::map<scheduling::NodeID, size_t> decisions;
|
||||||
size_t num_node_0_picks = 0;
|
size_t num_node_0_picks = 0;
|
||||||
size_t num_node_1_picks = 0;
|
size_t num_node_1_picks = 0;
|
||||||
for (int i = 0; i < 1000; i++) {
|
for (int i = 0; i < 1000; i++) {
|
||||||
int64_t to_schedule = scheduling_policy.RandomPolicy(req, [](auto) { return true; });
|
auto to_schedule = scheduling_policy.RandomPolicy(req, [](auto) { return true; });
|
||||||
ASSERT_TRUE(to_schedule >= 0);
|
ASSERT_TRUE(to_schedule.ToInt() >= 0);
|
||||||
ASSERT_TRUE(to_schedule <= 1);
|
ASSERT_TRUE(to_schedule.ToInt() <= 1);
|
||||||
if (to_schedule == 0) {
|
if (to_schedule.ToInt() == 0) {
|
||||||
num_node_0_picks++;
|
num_node_0_picks++;
|
||||||
} else {
|
} else {
|
||||||
num_node_1_picks++;
|
num_node_1_picks++;
|
||||||
|
@ -104,10 +98,9 @@ TEST_F(SchedulingPolicyTest, RandomPolicyTest) {
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(SchedulingPolicyTest, FeasibleDefinitionTest) {
|
TEST_F(SchedulingPolicyTest, FeasibleDefinitionTest) {
|
||||||
StringIdMap map;
|
|
||||||
auto task_req1 =
|
auto task_req1 =
|
||||||
ResourceMapToResourceRequest(map, {{"CPU", 1}, {"object_store_memory", 1}}, false);
|
ResourceMapToResourceRequest({{"CPU", 1}, {"object_store_memory", 1}}, false);
|
||||||
auto task_req2 = ResourceMapToResourceRequest(map, {{"CPU", 1}}, false);
|
auto task_req2 = ResourceMapToResourceRequest({{"CPU", 1}}, false);
|
||||||
{
|
{
|
||||||
// Don't break with a non-resized predefined resources array.
|
// Don't break with a non-resized predefined resources array.
|
||||||
NodeResources resources;
|
NodeResources resources;
|
||||||
|
@ -127,10 +120,9 @@ TEST_F(SchedulingPolicyTest, FeasibleDefinitionTest) {
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(SchedulingPolicyTest, AvailableDefinitionTest) {
|
TEST_F(SchedulingPolicyTest, AvailableDefinitionTest) {
|
||||||
StringIdMap map;
|
|
||||||
auto task_req1 =
|
auto task_req1 =
|
||||||
ResourceMapToResourceRequest(map, {{"CPU", 1}, {"object_store_memory", 1}}, false);
|
ResourceMapToResourceRequest({{"CPU", 1}, {"object_store_memory", 1}}, false);
|
||||||
auto task_req2 = ResourceMapToResourceRequest(map, {{"CPU", 1}}, false);
|
auto task_req2 = ResourceMapToResourceRequest({{"CPU", 1}}, false);
|
||||||
{
|
{
|
||||||
// Don't break with a non-resized predefined resources array.
|
// Don't break with a non-resized predefined resources array.
|
||||||
NodeResources resources;
|
NodeResources resources;
|
||||||
|
@ -192,34 +184,28 @@ TEST_F(SchedulingPolicyTest, AvailableTruncationTest) {
|
||||||
// In this test, the local node and a remote node are both available. The remote node
|
// In this test, the local node and a remote node are both available. The remote node
|
||||||
// has a lower critical resource utilization, but they're both truncated to 0, so we
|
// has a lower critical resource utilization, but they're both truncated to 0, so we
|
||||||
// should still pick the local node (due to traversal order).
|
// should still pick the local node (due to traversal order).
|
||||||
StringIdMap map;
|
ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}}, false);
|
||||||
ResourceRequest req = ResourceMapToResourceRequest(map, {{"CPU", 1}}, false);
|
|
||||||
int64_t local_node = 0;
|
|
||||||
int64_t remote_node = 1;
|
|
||||||
|
|
||||||
absl::flat_hash_map<int64_t, Node> nodes;
|
|
||||||
nodes.emplace(local_node, CreateNodeResources(1, 2, 0, 0, 0, 0));
|
nodes.emplace(local_node, CreateNodeResources(1, 2, 0, 0, 0, 0));
|
||||||
nodes.emplace(remote_node, CreateNodeResources(0.75, 2, 0, 0, 0, 0));
|
nodes.emplace(remote_node, CreateNodeResources(0.75, 2, 0, 0, 0, 0));
|
||||||
|
|
||||||
int to_schedule = raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
auto to_schedule =
|
||||||
.HybridPolicy(req, 0.51, false, false, [](auto) { return true; });
|
raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
||||||
|
.HybridPolicy(req, 0.51, false, false, [](auto) { return true; });
|
||||||
ASSERT_EQ(to_schedule, local_node);
|
ASSERT_EQ(to_schedule, local_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(SchedulingPolicyTest, AvailableTieBreakTest) {
|
TEST_F(SchedulingPolicyTest, AvailableTieBreakTest) {
|
||||||
// In this test, the local node and a remote node are both available. The remote node
|
// In this test, the local node and a remote node are both available. The remote node
|
||||||
// has a lower critical resource utilization so we schedule on it.
|
// has a lower critical resource utilization so we schedule on it.
|
||||||
StringIdMap map;
|
ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}}, false);
|
||||||
ResourceRequest req = ResourceMapToResourceRequest(map, {{"CPU", 1}}, false);
|
|
||||||
int64_t local_node = 0;
|
|
||||||
int64_t remote_node = 1;
|
|
||||||
|
|
||||||
absl::flat_hash_map<int64_t, Node> nodes;
|
|
||||||
nodes.emplace(local_node, CreateNodeResources(1, 2, 0, 0, 0, 0));
|
nodes.emplace(local_node, CreateNodeResources(1, 2, 0, 0, 0, 0));
|
||||||
nodes.emplace(remote_node, CreateNodeResources(1.5, 2, 0, 0, 0, 0));
|
nodes.emplace(remote_node, CreateNodeResources(1.5, 2, 0, 0, 0, 0));
|
||||||
|
|
||||||
int to_schedule = raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
auto to_schedule =
|
||||||
.HybridPolicy(req, 0.50, false, false, [](auto) { return true; });
|
raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
||||||
|
.HybridPolicy(req, 0.50, false, false, [](auto) { return true; });
|
||||||
ASSERT_EQ(to_schedule, remote_node);
|
ASSERT_EQ(to_schedule, remote_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -227,96 +213,67 @@ TEST_F(SchedulingPolicyTest, AvailableOverFeasibleTest) {
|
||||||
// In this test, the local node is feasible and has a lower critical resource
|
// In this test, the local node is feasible and has a lower critical resource
|
||||||
// utilization, but the remote node can run the task immediately, so we pick the remote
|
// utilization, but the remote node can run the task immediately, so we pick the remote
|
||||||
// node.
|
// node.
|
||||||
StringIdMap map;
|
ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}, {"GPU", 1}}, false);
|
||||||
ResourceRequest req =
|
|
||||||
ResourceMapToResourceRequest(map, {{"CPU", 1}, {"GPU", 1}}, false);
|
|
||||||
int64_t local_node = 0;
|
|
||||||
int64_t remote_node = 1;
|
|
||||||
|
|
||||||
absl::flat_hash_map<int64_t, Node> nodes;
|
|
||||||
nodes.emplace(local_node, CreateNodeResources(10, 10, 0, 0, 0, 1));
|
nodes.emplace(local_node, CreateNodeResources(10, 10, 0, 0, 0, 1));
|
||||||
nodes.emplace(remote_node, CreateNodeResources(1, 10, 0, 0, 1, 1));
|
nodes.emplace(remote_node, CreateNodeResources(1, 10, 0, 0, 1, 1));
|
||||||
|
|
||||||
int to_schedule = raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
auto to_schedule =
|
||||||
.HybridPolicy(req, 0.50, false, false, [](auto) { return true; });
|
raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
||||||
|
.HybridPolicy(req, 0.50, false, false, [](auto) { return true; });
|
||||||
ASSERT_EQ(to_schedule, remote_node);
|
ASSERT_EQ(to_schedule, remote_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(SchedulingPolicyTest, InfeasibleTest) {
|
TEST_F(SchedulingPolicyTest, InfeasibleTest) {
|
||||||
// All the nodes are infeasible, so we return -1.
|
// All the nodes are infeasible, so we return -1.
|
||||||
StringIdMap map;
|
ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}, {"GPU", 1}}, false);
|
||||||
ResourceRequest req =
|
|
||||||
ResourceMapToResourceRequest(map, {{"CPU", 1}, {"GPU", 1}}, false);
|
|
||||||
int64_t local_node = 0;
|
|
||||||
int64_t remote_node = 1;
|
|
||||||
|
|
||||||
absl::flat_hash_map<int64_t, Node> nodes;
|
|
||||||
nodes.emplace(local_node, CreateNodeResources(10, 10, 0, 0, 0, 0));
|
nodes.emplace(local_node, CreateNodeResources(10, 10, 0, 0, 0, 0));
|
||||||
nodes.emplace(remote_node, CreateNodeResources(1, 10, 0, 0, 0, 0));
|
nodes.emplace(remote_node, CreateNodeResources(1, 10, 0, 0, 0, 0));
|
||||||
|
|
||||||
int to_schedule = raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
auto to_schedule =
|
||||||
.HybridPolicy(req, 0.50, false, false, [](auto) { return true; });
|
raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
||||||
ASSERT_EQ(to_schedule, -1);
|
.HybridPolicy(req, 0.50, false, false, [](auto) { return true; });
|
||||||
|
ASSERT_TRUE(to_schedule.IsNil());
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(SchedulingPolicyTest, BarelyFeasibleTest) {
|
TEST_F(SchedulingPolicyTest, BarelyFeasibleTest) {
|
||||||
// Test the edge case where a task requires all of a node's resources, and the node is
|
// Test the edge case where a task requires all of a node's resources, and the node is
|
||||||
// fully utilized.
|
// fully utilized.
|
||||||
StringIdMap map;
|
ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}, {"GPU", 1}}, false);
|
||||||
ResourceRequest req =
|
|
||||||
ResourceMapToResourceRequest(map, {{"CPU", 1}, {"GPU", 1}}, false);
|
|
||||||
int64_t local_node = 0;
|
|
||||||
|
|
||||||
absl::flat_hash_map<int64_t, Node> nodes;
|
|
||||||
nodes.emplace(local_node, CreateNodeResources(0, 1, 0, 0, 0, 1));
|
nodes.emplace(local_node, CreateNodeResources(0, 1, 0, 0, 0, 1));
|
||||||
|
|
||||||
int to_schedule = raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
auto to_schedule =
|
||||||
.HybridPolicy(req, 0.50, false, false, [](auto) { return true; });
|
raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
||||||
|
.HybridPolicy(req, 0.50, false, false, [](auto) { return true; });
|
||||||
ASSERT_EQ(to_schedule, local_node);
|
ASSERT_EQ(to_schedule, local_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(SchedulingPolicyTest, TruncationAcrossFeasibleNodesTest) {
|
TEST_F(SchedulingPolicyTest, TruncationAcrossFeasibleNodesTest) {
|
||||||
// Same as AvailableTruncationTest except now none of the nodes are available, but the
|
// Same as AvailableTruncationTest except now none of the nodes are available, but the
|
||||||
// tie break logic should apply to feasible nodes too.
|
// tie break logic should apply to feasible nodes too.
|
||||||
StringIdMap map;
|
ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}, {"GPU", 1}}, false);
|
||||||
ResourceRequest req =
|
|
||||||
ResourceMapToResourceRequest(map, {{"CPU", 1}, {"GPU", 1}}, false);
|
|
||||||
int64_t local_node = 0;
|
|
||||||
int64_t remote_node = 1;
|
|
||||||
|
|
||||||
absl::flat_hash_map<int64_t, Node> nodes;
|
|
||||||
nodes.emplace(local_node, CreateNodeResources(1, 2, 0, 0, 0, 1));
|
nodes.emplace(local_node, CreateNodeResources(1, 2, 0, 0, 0, 1));
|
||||||
nodes.emplace(remote_node, CreateNodeResources(0.75, 2, 0, 0, 0, 1));
|
nodes.emplace(remote_node, CreateNodeResources(0.75, 2, 0, 0, 0, 1));
|
||||||
|
|
||||||
int to_schedule = raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
auto to_schedule =
|
||||||
.HybridPolicy(req, 0.51, false, false, [](auto) { return true; });
|
raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
||||||
|
.HybridPolicy(req, 0.51, false, false, [](auto) { return true; });
|
||||||
ASSERT_EQ(to_schedule, local_node);
|
ASSERT_EQ(to_schedule, local_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(SchedulingPolicyTest, ForceSpillbackIfAvailableTest) {
|
TEST_F(SchedulingPolicyTest, ForceSpillbackIfAvailableTest) {
|
||||||
// The local node is better, but we force spillback, so we'll schedule on a non-local
|
// The local node is better, but we force spillback, so we'll schedule on a non-local
|
||||||
// node anyways.
|
// node anyways.
|
||||||
StringIdMap map;
|
ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}, {"GPU", 1}}, false);
|
||||||
ResourceRequest req =
|
|
||||||
ResourceMapToResourceRequest(map, {{"CPU", 1}, {"GPU", 1}}, false);
|
|
||||||
int64_t local_node = 0;
|
|
||||||
int64_t remote_node = 1;
|
|
||||||
|
|
||||||
absl::flat_hash_map<int64_t, Node> nodes;
|
|
||||||
nodes.emplace(local_node, CreateNodeResources(2, 2, 0, 0, 1, 1));
|
nodes.emplace(local_node, CreateNodeResources(2, 2, 0, 0, 1, 1));
|
||||||
nodes.emplace(remote_node, CreateNodeResources(1, 10, 0, 0, 1, 10));
|
nodes.emplace(remote_node, CreateNodeResources(1, 10, 0, 0, 1, 10));
|
||||||
|
|
||||||
int to_schedule = raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
auto to_schedule = raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
||||||
.HybridPolicy(req, 0.51, true, true, [](auto) { return true; });
|
.HybridPolicy(req, 0.51, true, true, [](auto) { return true; });
|
||||||
ASSERT_EQ(to_schedule, remote_node);
|
ASSERT_EQ(to_schedule, remote_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(SchedulingPolicyTest, AvoidSchedulingCPURequestsOnGPUNodes) {
|
TEST_F(SchedulingPolicyTest, AvoidSchedulingCPURequestsOnGPUNodes) {
|
||||||
StringIdMap map;
|
|
||||||
int64_t local_node = 0;
|
|
||||||
int64_t remote_node = 1;
|
|
||||||
|
|
||||||
absl::flat_hash_map<int64_t, Node> nodes;
|
|
||||||
nodes.emplace(local_node, CreateNodeResources(10, 10, 0, 0, 1, 1));
|
nodes.emplace(local_node, CreateNodeResources(10, 10, 0, 0, 1, 1));
|
||||||
nodes.emplace(remote_node, CreateNodeResources(1, 2, 0, 0, 0, 0));
|
nodes.emplace(remote_node, CreateNodeResources(1, 2, 0, 0, 0, 0));
|
||||||
|
|
||||||
|
@ -324,18 +281,17 @@ TEST_F(SchedulingPolicyTest, AvoidSchedulingCPURequestsOnGPUNodes) {
|
||||||
// The local node is better, but it has GPUs, the request is
|
// The local node is better, but it has GPUs, the request is
|
||||||
// non GPU, and the remote node does not have GPUs, thus
|
// non GPU, and the remote node does not have GPUs, thus
|
||||||
// we should schedule on remote node.
|
// we should schedule on remote node.
|
||||||
const ResourceRequest req = ResourceMapToResourceRequest(map, {{"CPU", 1}}, false);
|
const ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}}, false);
|
||||||
const int to_schedule =
|
const auto to_schedule = raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
||||||
raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
.HybridPolicy(
|
||||||
.HybridPolicy(
|
ResourceMapToResourceRequest({{"CPU", 1}}, false),
|
||||||
ResourceMapToResourceRequest(map, {{"CPU", 1}}, false), 0.51, false, true,
|
0.51, false, true, [](auto) { return true; }, true);
|
||||||
[](auto) { return true; }, true);
|
|
||||||
ASSERT_EQ(to_schedule, remote_node);
|
ASSERT_EQ(to_schedule, remote_node);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// A GPU request should be scheduled on a GPU node.
|
// A GPU request should be scheduled on a GPU node.
|
||||||
const ResourceRequest req = ResourceMapToResourceRequest(map, {{"GPU", 1}}, false);
|
const ResourceRequest req = ResourceMapToResourceRequest({{"GPU", 1}}, false);
|
||||||
const int to_schedule =
|
const auto to_schedule =
|
||||||
raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
||||||
.HybridPolicy(
|
.HybridPolicy(
|
||||||
req, 0.51, false, true, [](auto) { return true; }, true);
|
req, 0.51, false, true, [](auto) { return true; }, true);
|
||||||
|
@ -343,8 +299,8 @@ TEST_F(SchedulingPolicyTest, AvoidSchedulingCPURequestsOnGPUNodes) {
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// A CPU request can be be scheduled on a CPU node.
|
// A CPU request can be be scheduled on a CPU node.
|
||||||
const ResourceRequest req = ResourceMapToResourceRequest(map, {{"CPU", 1}}, false);
|
const ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}}, false);
|
||||||
const int to_schedule =
|
const auto to_schedule =
|
||||||
raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
||||||
.HybridPolicy(
|
.HybridPolicy(
|
||||||
req, 0.51, false, true, [](auto) { return true; }, true);
|
req, 0.51, false, true, [](auto) { return true; }, true);
|
||||||
|
@ -353,8 +309,8 @@ TEST_F(SchedulingPolicyTest, AvoidSchedulingCPURequestsOnGPUNodes) {
|
||||||
{
|
{
|
||||||
// A mixed CPU/GPU request should be scheduled on a GPU node.
|
// A mixed CPU/GPU request should be scheduled on a GPU node.
|
||||||
const ResourceRequest req =
|
const ResourceRequest req =
|
||||||
ResourceMapToResourceRequest(map, {{"CPU", 1}, {"GPU", 1}}, false);
|
ResourceMapToResourceRequest({{"CPU", 1}, {"GPU", 1}}, false);
|
||||||
const int to_schedule =
|
const auto to_schedule =
|
||||||
raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
|
||||||
.HybridPolicy(
|
.HybridPolicy(
|
||||||
req, 0.51, false, true, [](auto) { return true; }, true);
|
req, 0.51, false, true, [](auto) { return true; }, true);
|
||||||
|
@@ -365,16 +321,11 @@ TEST_F(SchedulingPolicyTest, AvoidSchedulingCPURequestsOnGPUNodes) {
 TEST_F(SchedulingPolicyTest, SchedulenCPURequestsOnGPUNodeAsALastResort) {
   // Schedule on remote node, even though the request is CPU only, because
   // we cannot schedule on CPU nodes.
-  StringIdMap map;
-  ResourceRequest req = ResourceMapToResourceRequest(map, {{"CPU", 1}}, false);
-  int64_t local_node = 0;
-  int64_t remote_node = 1;
-
-  absl::flat_hash_map<int64_t, Node> nodes;
+  ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}}, false);
   nodes.emplace(local_node, CreateNodeResources(0, 10, 0, 0, 0, 0));
   nodes.emplace(remote_node, CreateNodeResources(1, 1, 0, 0, 1, 1));

-  const int to_schedule =
+  const auto to_schedule =
       raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
           .HybridPolicy(
               req, 0.51, false, true, [](auto) { return true; }, true);
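Note that this hunk and the ones below delete the per-test declarations of local_node, remote_node, and the nodes map without adding replacements, so these presumably moved into the shared test fixture as typed scheduling IDs. A hedged sketch of that fixture shape; NodeIdSketch, NodeSketch, and the fixture body are inferred for illustration, not the actual SchedulingPolicyTest:

// Hypothetical fixture; member names match the tests, types are guesses.
#include <cstdint>
#include "absl/container/flat_hash_map.h"
#include "gtest/gtest.h"

class NodeIdSketch {
 public:
  explicit NodeIdSketch(int64_t id) : id_(id) {}
  bool operator==(const NodeIdSketch &other) const { return id_ == other.id_; }
  // absl::Hash extension point so the ID can key a flat_hash_map.
  template <typename H>
  friend H AbslHashValue(H h, const NodeIdSketch &id) {
    return H::combine(std::move(h), id.id_);
  }

 private:
  int64_t id_;
};

struct NodeSketch {};  // stand-in for the scheduler's per-node resource state

class SchedulingPolicyTestSketch : public ::testing::Test {
 public:
  // Shared across tests, which is why the per-test locals could be deleted.
  NodeIdSketch local_node{0};
  NodeIdSketch remote_node{1};
  NodeIdSketch remote_node_2{2};
  absl::flat_hash_map<NodeIdSketch, NodeSketch> nodes;
};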
@@ -383,81 +334,66 @@ TEST_F(SchedulingPolicyTest, SchedulenCPURequestsOnGPUNodeAsALastResort) {

 TEST_F(SchedulingPolicyTest, ForceSpillbackTest) {
   // The local node is available but disqualified.
-  StringIdMap map;
-  ResourceRequest req =
-      ResourceMapToResourceRequest(map, {{"CPU", 1}, {"GPU", 1}}, false);
-  int64_t local_node = 0;
-  int64_t remote_node = 1;
-
-  absl::flat_hash_map<int64_t, Node> nodes;
+  ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}, {"GPU", 1}}, false);
   nodes.emplace(local_node, CreateNodeResources(2, 2, 0, 0, 1, 1));
   nodes.emplace(remote_node, CreateNodeResources(0, 2, 0, 0, 0, 1));

-  int to_schedule = raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
+  auto to_schedule = raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
                         .HybridPolicy(req, 0.51, true, false, [](auto) { return true; });
   ASSERT_EQ(to_schedule, remote_node);
 }

 TEST_F(SchedulingPolicyTest, ForceSpillbackOnlyFeasibleLocallyTest) {
   // The local node is better, but we force spillback, so we'll schedule on a
   // non-local node anyway.
-  StringIdMap map;
-  ResourceRequest req =
-      ResourceMapToResourceRequest(map, {{"CPU", 1}, {"GPU", 1}}, false);
-  int64_t local_node = 0;
-  int64_t remote_node = 1;
-
-  absl::flat_hash_map<int64_t, Node> nodes;
+  ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}, {"GPU", 1}}, false);
   nodes.emplace(local_node, CreateNodeResources(2, 2, 0, 0, 1, 1));
   nodes.emplace(remote_node, CreateNodeResources(0, 2, 0, 0, 0, 0));

-  int to_schedule = raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
+  auto to_schedule = raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
                         .HybridPolicy(req, 0.51, true, false, [](auto) { return true; });
-  ASSERT_EQ(to_schedule, -1);
+  ASSERT_TRUE(to_schedule.IsNil());
 }

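The assertion change above is the visible payoff of the typed ID: scheduling failure is no longer the magic constant -1 but a nil ID that the caller can query. A minimal runnable sketch of that sentinel pattern, with an assumed name (SchedulingNodeIdSketch is not Ray's real class):

// Illustrative only: a typed ID carries its own "no node" sentinel, so the
// raw -1 from the old assertion disappears from call sites.
#include <cassert>
#include <cstdint>

class SchedulingNodeIdSketch {
 public:
  static SchedulingNodeIdSketch Nil() { return SchedulingNodeIdSketch(kNilId); }
  explicit SchedulingNodeIdSketch(int64_t id) : id_(id) {}
  bool IsNil() const { return id_ == kNilId; }

 private:
  static constexpr int64_t kNilId = -1;
  int64_t id_;
};

int main() {
  auto to_schedule = SchedulingNodeIdSketch::Nil();  // "no feasible node"
  assert(to_schedule.IsNil());
  return 0;
}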
 TEST_F(SchedulingPolicyTest, NonGpuNodePreferredSchedulingTest) {
   // Prefer to schedule on CPU nodes first.
   // GPU nodes should be used only as a last resort.
-  StringIdMap map;
-  int64_t local_node = 0;
-  int64_t remote_node_1 = 1;
-  int64_t remote_node_2 = 2;
-
   // local {CPU:2, GPU:1}
   // Remote {CPU: 2}
-  absl::flat_hash_map<int64_t, Node> nodes;
   nodes.emplace(local_node, CreateNodeResources(2, 2, 0, 0, 1, 1));
-  nodes.emplace(remote_node_1, CreateNodeResources(2, 2, 0, 0, 0, 0));
+  nodes.emplace(remote_node, CreateNodeResources(2, 2, 0, 0, 0, 0));
   nodes.emplace(remote_node_2, CreateNodeResources(3, 3, 0, 0, 0, 0));

-  ResourceRequest req = ResourceMapToResourceRequest(map, {{"CPU", 1}}, false);
-  int to_schedule = raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
+  ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}}, false);
+  auto to_schedule = raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
                         .HybridPolicy(
                             req, 0.51, false, true, [](auto) { return true; },
                             /*gpu_avoid_scheduling*/ true);
-  ASSERT_EQ(to_schedule, remote_node_1);
+  ASSERT_EQ(to_schedule, remote_node);

-  req = ResourceMapToResourceRequest(map, {{"CPU", 3}}, false);
+  req = ResourceMapToResourceRequest({{"CPU", 3}}, false);
   to_schedule = raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
                     .HybridPolicy(
                         req, 0.51, false, true, [](auto) { return true; },
                         /*gpu_avoid_scheduling*/ true);
   ASSERT_EQ(to_schedule, remote_node_2);

-  req = ResourceMapToResourceRequest(map, {{"CPU", 1}, {"GPU", 1}}, false);
+  req = ResourceMapToResourceRequest({{"CPU", 1}, {"GPU", 1}}, false);
   to_schedule = raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
                     .HybridPolicy(
                         req, 0.51, false, true, [](auto) { return true; },
                         /*gpu_avoid_scheduling*/ true);
   ASSERT_EQ(to_schedule, local_node);

-  req = ResourceMapToResourceRequest(map, {{"CPU", 2}}, false);
+  req = ResourceMapToResourceRequest({{"CPU", 2}}, false);
   to_schedule = raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
                     .HybridPolicy(
                         req, 0.51, false, true, [](auto) { return true; },
                         /*gpu_avoid_scheduling*/ true);
-  ASSERT_EQ(to_schedule, remote_node_1);
+  ASSERT_EQ(to_schedule, remote_node);
 }

 int main(int argc, char **argv) {
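For readers following the tests: HybridPolicy is called with six positional arguments throughout this file. Re-annotating the common call shape from above with inferred parameter names; only /*gpu_avoid_scheduling*/ is named in the source, so the other names are a best guess and this fragment assumes req, local_node, and nodes from the surrounding tests rather than standing alone:

// Hedged annotation of the call shape used in these tests; parameter names
// other than gpu_avoid_scheduling are assumptions, not Ray's signature.
auto to_schedule =
    raylet_scheduling_policy::SchedulingPolicy(local_node, nodes)
        .HybridPolicy(req,
                      /*spread_threshold=*/0.51,
                      /*force_spillback=*/false,
                      /*require_available=*/true,
                      /*is_node_available=*/[](auto) { return true; },
                      /*gpu_avoid_scheduling=*/true);

Read this way, ForceSpillbackTest flipping the third argument to true and the fourth to false is consistent with the forced-spillback behavior it asserts.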