[Stats] Metrics Export User Interface Part 2 (Prometheus Service Discovery) (#9970)

* In progress.

* In Progress.

* Finish the working version.

* Write documentation.

* Addressed code review.

* Fix lint error.

* Lint.

* Addressed code review. Make test less flaky.

* Use a random port for ray start.

* Modify doc.

* Make write atomic.

SangBin Cho 2020-08-07 21:59:24 -07:00 committed by GitHub
parent d3cde16163
commit 39088ab6f2
6 changed files with 150 additions and 2 deletions


@@ -120,3 +120,34 @@ Getting Started (Cluster Launcher)
----------------------------------
When you use the Ray cluster launcher, it is common for node IP addresses to change as the cluster scales up and down.
In this case, you can use Prometheus' `file based service discovery <https://prometheus.io/docs/guides/file-sd/#installing-configuring-and-running-prometheus>`_.

Prometheus Service Discovery Support
------------------------------------
Ray auto-generates a Prometheus `service discovery file <https://prometheus.io/docs/guides/file-sd/#installing-configuring-and-running-prometheus>`_ on the head node so that Prometheus can discover the metrics agents on every node.
This allows you to easily scrape the metrics of every node in an autoscaling cluster. Let's walk through how to achieve this.

The service discovery file is generated on the head node. Note that the head node is the node where you ran ``ray start --head`` or ``ray.init()``.
On the head node, look inside Ray's ``temp_dir``. By default, it is ``/tmp/ray`` (on both Linux and macOS). There you should find a file named ``prom_metrics_service_discovery.json``.
Ray periodically updates the addresses of all metrics agents in the cluster in this file.
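For reference, the generated file follows Prometheus' file-based service discovery format: a list with a single entry whose ``labels`` set ``job: ray`` and whose ``targets`` hold the ``<NodeManagerAddress>:<MetricsExportPort>`` pair of every node in the cluster. It might look something like the following (the addresses shown here are hypothetical placeholders):

.. code-block:: json

    [
      {
        "labels": {
          "job": "ray"
        },
        "targets": [
          "192.168.1.10:62137",
          "192.168.1.11:61522"
        ]
      }
    ]
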
Now, modify your Prometheus config to use this file for service discovery.

.. code-block:: yaml

    # Prometheus config file

    # my global config
    global:
      scrape_interval: 2s
      evaluation_interval: 2s

    # Scrape from the Ray nodes listed in the service discovery file
    # generated by Ray.
    scrape_configs:
    - job_name: 'ray'
      file_sd_configs:
      - files:
        - '/tmp/ray/prom_metrics_service_discovery.json'

Prometheus will automatically detect that the file contents have changed and update the addresses it scrapes based on the service discovery file generated by Ray.


@@ -36,6 +36,7 @@ from ray.dashboard.metrics_exporter.client import Exporter
from ray.dashboard.metrics_exporter.client import MetricsExportClient
from ray.dashboard.node_stats import NodeStats
from ray.dashboard.util import to_unix_time, measures_to_dict, format_resource
from ray.metrics_agent import PrometheusServiceDiscoveryWriter
try:
    from ray.tune import Analysis
@@ -493,6 +494,8 @@ class Dashboard:
        self.dashboard_id = str(uuid.uuid4())
        self.dashboard_controller = DashboardController(
            redis_address, redis_password)
        self.service_discovery = PrometheusServiceDiscoveryWriter(
            redis_address, redis_password, temp_dir)
        # Setting the environment variable RAY_DASHBOARD_DEV=1 disables some
        # security checks in the dashboard server to ease development while
@@ -571,6 +574,7 @@ class Dashboard:
    def run(self):
        self.log_dashboard_url()
        self.dashboard_controller.start_collecting_metrics()
        self.service_discovery.start()
        if self.metrics_export_address:
            self._start_exporting_metrics()
        aiohttp.web.run_app(self.app, host=self.host, port=self.port)


@@ -1,5 +1,9 @@
import json
import logging
import os
import threading
import time
import traceback
from collections import defaultdict
from typing import List
@@ -13,6 +17,8 @@ from opencensus.tags import tag_map as tag_map_module
from opencensus.tags import tag_value as tag_value_module
from opencensus.stats import view
import ray
from ray import prometheus_exporter
from ray.core.generated.common_pb2 import MetricPoint
@@ -181,3 +187,73 @@ class MetricsAgent:
                    metric_name].view.measure._description = metric_description
                self._registry[metric_name].view.measure._unit = metric_units
            self._missing_information = False
class PrometheusServiceDiscoveryWriter(threading.Thread):
    """A class to support Prometheus service discovery.

    It supports file-based service discovery. Check out
    https://prometheus.io/docs/guides/file-sd/ for more details.

    Args:
        redis_address(str): Ray's redis address.
        redis_password(str): Ray's redis password.
        temp_dir(str): Temporary directory used by
            Ray to store logs and metadata.
    """

    def __init__(self, redis_address, redis_password, temp_dir):
        ray.state.state._initialize_global_state(
            redis_address=redis_address, redis_password=redis_password)
        self.temp_dir = temp_dir
        self.default_service_discovery_flush_period = 5
        super().__init__()

    def get_file_discovery_content(self):
        """Return the content for Prometheus service discovery."""
        nodes = ray.nodes()
        metrics_export_addresses = [
            "{}:{}".format(node["NodeManagerAddress"],
                           node["MetricsExportPort"]) for node in nodes
        ]
        return json.dumps([{
            "labels": {
                "job": "ray"
            },
            "targets": metrics_export_addresses
        }])

    def write(self):
        # Write a file based on https://prometheus.io/docs/guides/file-sd/
        # The write should be atomic. Otherwise, Prometheus raises an error
        # saying the json file format is invalid, because it can read the
        # file while it is being re-written. Note that Prometheus still works
        # even when this error occurs.
        temp_file_name = self.get_temp_file_name()
        with open(temp_file_name, "w") as json_file:
            json_file.write(self.get_file_discovery_content())
        # NOTE: os.rename is atomic, so we won't have a race condition
        # reading this file.
        os.rename(temp_file_name, self.get_target_file_name())

    def get_target_file_name(self):
        return os.path.join(
            self.temp_dir, ray.ray_constants.PROMETHEUS_SERVICE_DISCOVERY_FILE)

    def get_temp_file_name(self):
        return os.path.join(
            self.temp_dir, "{}_{}".format(
                "tmp", ray.ray_constants.PROMETHEUS_SERVICE_DISCOVERY_FILE))

    def run(self):
        while True:
            # This thread won't be broken by exceptions.
            try:
                self.write()
            except Exception as e:
                logger.warning("Writing a service discovery file, {}, "
                               "failed."
                               .format(self.get_target_file_name()))
                logger.warning(traceback.format_exc())
                logger.warning("Error message: {}".format(e))
            time.sleep(self.default_service_discovery_flush_period)
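
For context only (this is not part of the diff): here is a minimal sketch of how the writer above could be driven by hand, assuming a Ray head node is already running locally and uses the default `/tmp/ray` temp directory. The dashboard normally does this automatically, as shown in the dashboard changes above.

.. code-block:: python

    # Illustrative sketch only (not part of this change). Assumes a Ray head
    # node was already started locally (e.g. via `ray start --head`) and uses
    # the default /tmp/ray temp directory.
    import ray
    import ray.ray_constants as ray_constants
    from ray.metrics_agent import PrometheusServiceDiscoveryWriter

    addr = ray.init(address="auto")  # connect to the running head node
    writer = PrometheusServiceDiscoveryWriter(
        addr["redis_address"], ray_constants.REDIS_DEFAULT_PASSWORD, "/tmp/ray")

    # One-shot write of /tmp/ray/prom_metrics_service_discovery.json.
    writer.write()

    # Or run it as a background thread that rewrites the file every
    # `default_service_discovery_flush_period` (5) seconds.
    writer.start()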


@@ -37,6 +37,7 @@ REDIS_MINIMUM_MEMORY_BYTES = 10**7
DEFAULT_PORT = 6379
DEFAULT_DASHBOARD_PORT = 8265
PROMETHEUS_SERVICE_DISCOVERY_FILE = "prom_metrics_service_discovery.json"
# Default resource requirements for actors when no resource requirements are
# specified.
DEFAULT_ACTOR_METHOD_CPU_SIMPLE = 1


@@ -350,7 +350,7 @@ def dashboard(cluster_config_file, cluster_name, port, remote_port):
@click.option(
    "--metrics-export-port",
    type=int,
-    default=8080,
+    default=None,
    help="the port to use to expose Ray metrics through a "
    "Prometheus endpoint.")
def start(node_ip_address, redis_address, address, redis_port, port,


@@ -1,3 +1,4 @@
import json
import pytest
from collections import defaultdict
@@ -7,9 +8,12 @@ import requests
from opencensus.tags import tag_key as tag_key_module
from prometheus_client.parser import text_string_to_metric_families
import ray
from ray.core.generated.common_pb2 import MetricPoint
from ray.dashboard.util import get_unused_port
-from ray.metrics_agent import Gauge, MetricsAgent
+from ray.metrics_agent import (Gauge, MetricsAgent,
+                               PrometheusServiceDiscoveryWriter)
def generate_metrics_point(name: str,
@@ -180,6 +184,38 @@ def test_multiple_record(cleanup_agent):
    assert sample_values == [point.value for point in points]
def test_prometheus_file_based_service_discovery(ray_start_cluster):
    # Make sure the Prometheus service discovery file is correctly written
    # when the number of nodes changes dynamically.
    NUM_NODES = 5
    cluster = ray_start_cluster
    nodes = [cluster.add_node() for _ in range(NUM_NODES)]
    cluster.wait_for_nodes()
    addr = ray.init(address=cluster.address)
    redis_address = addr["redis_address"]
    writer = PrometheusServiceDiscoveryWriter(
        redis_address, ray.ray_constants.REDIS_DEFAULT_PASSWORD, "/tmp/ray")

    def get_metrics_export_address_from_node(nodes):
        return [
            "{}:{}".format(node.node_ip_address, node.metrics_export_port)
            for node in nodes
        ]

    loaded_json_data = json.loads(writer.get_file_discovery_content())[0]
    assert (set(get_metrics_export_address_from_node(nodes)) == set(
        loaded_json_data["targets"]))

    # Let's add more nodes.
    for _ in range(3):
        nodes.append(cluster.add_node())

    # Make sure the service discovery file content is correctly updated.
    loaded_json_data = json.loads(writer.get_file_discovery_content())[0]
    assert (set(get_metrics_export_address_from_node(nodes)) == set(
        loaded_json_data["targets"]))


if __name__ == "__main__":
    import sys
    sys.exit(pytest.main(["-v", __file__]))