[scheduler][monitoring] dump detailed spilling metrics (#23321)

Dump the detailed spilling metrics in scheduler.
This commit is contained in:
Chen Shen 2022-03-28 10:49:04 -07:00 committed by GitHub
parent aae144d7f9
commit 51bdefc2c8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 18 additions and 0 deletions

View file

@ -228,6 +228,7 @@ void LocalTaskManager::DispatchScheduledTasksToWorkers() {
internal::UnscheduledWorkCause::WAITING_FOR_RESOURCES_AVAILABLE); internal::UnscheduledWorkCause::WAITING_FOR_RESOURCES_AVAILABLE);
break; break;
} }
num_unschedulable_task_spilled_++;
if (!spec.GetDependencies().empty()) { if (!spec.GetDependencies().empty()) {
task_dependency_manager_.RemoveTaskDependencies( task_dependency_manager_.RemoveTaskDependencies(
task.GetTaskSpecification().TaskId()); task.GetTaskSpecification().TaskId());
@ -337,6 +338,7 @@ void LocalTaskManager::SpillWaitingTasks() {
task_dependency_manager_.RemoveTaskDependencies( task_dependency_manager_.RemoveTaskDependencies(
task.GetTaskSpecification().TaskId()); task.GetTaskSpecification().TaskId());
} }
num_waiting_task_spilled_++;
waiting_tasks_index_.erase(task_id); waiting_tasks_index_.erase(task_id);
it = waiting_task_queue_.erase(it); it = waiting_task_queue_.erase(it);
} else { } else {
@ -1059,6 +1061,10 @@ void LocalTaskManager::DebugStr(std::stringstream &buffer) const {
buffer << "Waiting tasks size: " << waiting_tasks_index_.size() << "\n"; buffer << "Waiting tasks size: " << waiting_tasks_index_.size() << "\n";
buffer << "Number of executing tasks: " << executing_task_args_.size() << "\n"; buffer << "Number of executing tasks: " << executing_task_args_.size() << "\n";
buffer << "Number of pinned task arguments: " << pinned_task_arguments_.size() << "\n"; buffer << "Number of pinned task arguments: " << pinned_task_arguments_.size() << "\n";
buffer << "Number of total spilled tasks: " << num_task_spilled_ << "\n";
buffer << "Number of spilled waiting tasks: " << num_waiting_task_spilled_ << "\n";
buffer << "Number of spilled unschedulable tasks: " << num_unschedulable_task_spilled_
<< "\n";
buffer << "Resource usage {\n"; buffer << "Resource usage {\n";
// Calculates how much resources are occupied by tasks or actors. // Calculates how much resources are occupied by tasks or actors.

View file

@ -183,6 +183,10 @@ class LocalTaskManager : public ILocalTaskManager {
void DebugStr(std::stringstream &buffer) const override; void DebugStr(std::stringstream &buffer) const override;
size_t GetNumTaskSpilled() const override { return num_task_spilled_; } size_t GetNumTaskSpilled() const override { return num_task_spilled_; }
size_t GetNumWaitingTaskSpilled() const override { return num_waiting_task_spilled_; }
size_t GetNumUnschedulableTaskSpilled() const override {
return num_unschedulable_task_spilled_;
}
private: private:
struct SchedulingClassInfo; struct SchedulingClassInfo;
@ -376,6 +380,8 @@ class LocalTaskManager : public ILocalTaskManager {
const int64_t sched_cls_cap_max_ms_; const int64_t sched_cls_cap_max_ms_;
size_t num_task_spilled_ = 0; size_t num_task_spilled_ = 0;
size_t num_waiting_task_spilled_ = 0;
size_t num_unschedulable_task_spilled_ = 0;
friend class SchedulerResourceReporter; friend class SchedulerResourceReporter;
friend class ClusterTaskManagerTest; friend class ClusterTaskManagerTest;

View file

@ -68,6 +68,8 @@ class ILocalTaskManager {
virtual void DebugStr(std::stringstream &buffer) const = 0; virtual void DebugStr(std::stringstream &buffer) const = 0;
virtual size_t GetNumTaskSpilled() const = 0; virtual size_t GetNumTaskSpilled() const = 0;
virtual size_t GetNumWaitingTaskSpilled() const = 0;
virtual size_t GetNumUnschedulableTaskSpilled() const = 0;
}; };
} // namespace raylet } // namespace raylet
} // namespace ray } // namespace ray

View file

@ -138,6 +138,10 @@ void SchedulerStats::RecordMetrics() const {
ray::stats::STATS_scheduler_tasks.Record(num_cancelled_tasks_, "Cancelled"); ray::stats::STATS_scheduler_tasks.Record(num_cancelled_tasks_, "Cancelled");
ray::stats::STATS_scheduler_tasks.Record(num_tasks_to_dispatch_, "Dispatched"); ray::stats::STATS_scheduler_tasks.Record(num_tasks_to_dispatch_, "Dispatched");
ray::stats::STATS_scheduler_tasks.Record(num_tasks_to_schedule_, "Received"); ray::stats::STATS_scheduler_tasks.Record(num_tasks_to_schedule_, "Received");
ray::stats::STATS_scheduler_tasks.Record(local_task_manager_.GetNumWaitingTaskSpilled(),
"SpilledWaiting");
ray::stats::STATS_scheduler_tasks.Record(
local_task_manager_.GetNumUnschedulableTaskSpilled(), "SpilledUnschedulable");
/// Pending task count. /// Pending task count.
ray::stats::STATS_scheduler_unscheduleable_tasks.Record(num_infeasible_tasks_, ray::stats::STATS_scheduler_unscheduleable_tasks.Record(num_infeasible_tasks_,