[Core] Put raylet ip's in resource usage report (#13871)

* .

* done?

Co-authored-by: Alex Wu <alex@anyscale.com>
This commit is contained in:
Alex Wu 2021-02-03 11:28:56 -08:00 committed by GitHub
parent 79310452e7
commit f14171ced9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 29 additions and 0 deletions

View file

@ -7,6 +7,7 @@ import time
import ray
import ray.ray_constants
import ray.services
import ray.test_utils
from ray._raylet import GlobalStateAccessor
@ -332,6 +333,31 @@ def test_backlog_report(shutdown_only):
global_state_accessor.disconnect()
def test_heartbeat_ip(shutdown_only):
    """Check that the resource usage report carries this node's IP address.

    Starts a single-CPU local cluster, then polls the resource usage batch
    exposed by the global state accessor until the first entry's
    ``node_manager_address`` equals the IP returned by
    ``ray.services.get_node_ip_address()``.
    """
    cluster = ray.init(
        num_cpus=1, _system_config={
            "report_worker_backlog": True,
        })
    global_state_accessor = GlobalStateAccessor(
        cluster["redis_address"], ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    global_state_accessor.connect()

    # Release the accessor even when the wait below raises, so a failing
    # test does not leave the connection open.
    try:
        self_ip = ray.services.get_node_ip_address()

        def self_ip_is_set():
            message = global_state_accessor.get_all_resource_usage()
            if message is None:
                return False

            resource_usage = ray.gcs_utils.ResourceUsageBatchData.FromString(
                message)
            # The batch can be empty before the first report has been
            # recorded; treat that as "not ready yet" instead of letting an
            # IndexError escape the polling predicate.
            if len(resource_usage.batch) == 0:
                return False
            resources_data = resource_usage.batch[0]
            return resources_data.node_manager_address == self_ip

        ray.test_utils.wait_for_condition(self_ip_is_set, timeout=2)
    finally:
        global_state_accessor.disconnect()
if __name__ == "__main__":
import pytest
import sys

View file

@ -327,6 +327,8 @@ message ResourcesData {
ResourceLoad resource_load_by_shape = 7;
// Whether this node manager is requesting global GC.
bool should_global_gc = 8;
// IP address of the node.
string node_manager_address = 9;
}
message ResourceUsageBatchData {

View file

@ -456,6 +456,7 @@ void NodeManager::Heartbeat() {
void NodeManager::ReportResourceUsage() {
auto resources_data = std::make_shared<rpc::ResourcesData>();
resources_data->set_node_id(self_node_id_.Binary());
resources_data->set_node_manager_address(initial_config_.node_manager_address);
// Update the local cache from the GCS remote cache; this is needed when the GCS restarts.
// We should always keep the cache view consistent.
cluster_resource_scheduler_->UpdateLastResourceUsage(