ray/docker/kuberay-autoscaler/run_autoscaler.py
Dmitri Gekhtman a402e956a4
[KubeRay] Format autoscaling config based on RayCluster CR (#22348)
Closes #21655. At the start of each autoscaler iteration, we read the Ray Cluster CR from K8s and use it to extract the autoscaling config.
2022-02-22 11:06:37 -08:00

93 lines
3 KiB
Python

import argparse
import logging
import os
import time
import ray
from ray import ray_constants
from ray._private.ray_logging import setup_component_logger
from ray._private.services import get_node_ip_address
from ray.autoscaler._private.monitor import Monitor
from autoscaling_config import AutoscalingConfigProducer
def setup_logging() -> None:
    """Configure logging for the autoscaler process.

    Two destinations are set up:

    * The autoscaler log file (typically
      /tmp/ray/session_latest/logs/monitor.*), through Ray's component logger.
    * The container's standard stream, so that the logs are viewable with
      `kubectl logs <head-pod> -c autoscaler`.
    """
    # File destination: monitor.log inside the latest session's "logs" directory.
    session_log_dir = os.path.join(
        ray._private.utils.get_ray_temp_dir(), ray.node.SESSION_LATEST, "logs"
    )
    setup_component_logger(
        logging_level=ray_constants.LOGGER_LEVEL,  # info
        logging_format=ray_constants.LOGGER_FORMAT,
        log_dir=session_log_dir,
        filename=ray_constants.MONITOR_LOG_FILE_NAME,  # monitor.log
        max_bytes=ray_constants.LOGGING_ROTATE_BYTES,
        backup_count=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
    )

    # Stream destination for debugging with `kubectl logs`.
    # NOTE: `logging.StreamHandler()` with no argument writes to sys.stderr;
    # `kubectl logs` shows both stdout and stderr of the container.
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    stream_handler.setFormatter(logging.Formatter(ray_constants.LOGGER_FORMAT))

    # Attach the stream handler to the root logger at info level.
    root = logging.getLogger("")
    root.setLevel(logging.INFO)
    root.addHandler(stream_handler)
if __name__ == "__main__":
    # Entrypoint: configure logging, parse the CLI, then run the Ray autoscaler
    # Monitor with a config producer built from the cluster name and namespace.
    setup_logging()

    cli = argparse.ArgumentParser(description="Kuberay Autoscaler")
    cli.add_argument(
        "--cluster-name",
        type=str,
        required=True,
        help=(
            "The name of the Ray Cluster.\n"
            "Should coincide with the `metadata.name` of the RayCluster CR."
        ),
    )
    cli.add_argument(
        "--cluster-namespace",
        type=str,
        required=True,
        help=(
            "The Kubernetes namespace the Ray Cluster lives in.\n"
            "Should coincide with the `metadata.namespace` of the RayCluster CR."
        ),
    )
    cli.add_argument(
        "--redis-password",
        type=str,
        required=False,
        default=None,
        help="The password to use for Redis",
    )
    options = cli.parse_args()

    # The same IP is used both for the Monitor's GCS/Redis address and as the
    # monitor's own IP.
    node_ip = get_node_ip_address()

    # Passed to the Monitor as its `autoscaling_config`.
    config_producer = AutoscalingConfigProducer(
        options.cluster_name, options.cluster_namespace
    )

    # By the time the entrypoint gets here, the GCS might not yet have
    # collected information on the head node itself. That can produce an
    # annoying artifact at the start of the autoscaler logs: a status message
    # showing no nodes at all connected to the Ray cluster.
    # Pause briefly to avoid that artifact.
    # TODO (Dmitri): Fix StandardAutoscaler.summary() to avoid the issue
    # and remove the sleep.
    time.sleep(5)

    monitor = Monitor(
        address=f"{node_ip}:6379",
        redis_password=options.redis_password,
        # `autoscaling_config` may be either a dict or a `Callable: () -> dict`;
        # here the callable form is used.
        autoscaling_config=config_producer,
        monitor_ip=node_ip,
    )
    monitor.run()