2021-12-14 17:01:53 -08:00
|
|
|
import asyncio
|
2021-11-11 15:59:13 -08:00
|
|
|
import os
|
2022-03-01 19:27:09 -08:00
|
|
|
import pprint
|
2021-12-14 17:01:53 -08:00
|
|
|
import time
|
2022-06-21 15:13:29 -07:00
|
|
|
from subprocess import list2cmdline
|
2021-11-11 15:59:13 -08:00
|
|
|
from typing import Optional, Tuple
|
|
|
|
|
|
|
|
import click
|
|
|
|
|
2022-06-21 15:13:29 -07:00
|
|
|
import ray._private.ray_constants as ray_constants
|
|
|
|
from ray._private.storage import _load_class
|
|
|
|
from ray.autoscaler._private.cli_logger import add_click_logging_options, cf, cli_logger
|
|
|
|
from ray.dashboard.modules.dashboard_sdk import parse_runtime_env_args
|
2022-02-09 11:55:32 -08:00
|
|
|
from ray.job_submission import JobStatus, JobSubmissionClient
|
2022-02-28 15:57:46 -08:00
|
|
|
from ray.util.annotations import PublicAPI
|
2021-11-11 15:59:13 -08:00
|
|
|
|
|
|
|
|
2021-11-13 16:24:02 -08:00
|
|
|
def _get_sdk_client(
    address: Optional[str], create_cluster_if_needed: bool = False
) -> JobSubmissionClient:
    """Create a ``JobSubmissionClient`` for the requested cluster address.

    Args:
        address: Address to connect to. When ``None``, the value of the
            ``RAY_ADDRESS`` environment variable is used if it is set;
            otherwise ``None`` is handed to the client unchanged.
        create_cluster_if_needed: Forwarded as-is to ``JobSubmissionClient``
            as its second positional argument.

    Returns:
        The newly constructed client.
    """
    if address is None:
        # Fall back to the environment; stays None when the variable is unset.
        address = os.environ.get("RAY_ADDRESS")

    # Surface the address that will actually be used before connecting.
    cli_logger.labeled_value("Job submission server address", address)
    return JobSubmissionClient(address, create_cluster_if_needed)
|
2021-11-11 15:59:13 -08:00
|
|
|
|
|
|
|
|
2021-12-14 17:01:53 -08:00
|
|
|
def _log_big_success_msg(success_msg):
    """Log ``success_msg`` at success level inside a dashed banner.

    The banner is a blank line, a row of dashes as wide as the message, the
    message itself, another row of dashes, and a trailing blank line.
    """
    rule = "-" * len(success_msg)

    cli_logger.newline()
    for banner_line in (rule, success_msg, rule):
        cli_logger.success(banner_line)
    cli_logger.newline()
|
|
|
|
|
|
|
|
|
|
|
|
def _log_big_error_msg(success_msg):
    """Log a message at error level inside a dashed banner.

    Despite its name, ``success_msg`` carries the *error* text here; the
    parameter name is kept as-is so existing callers are unaffected.

    The banner is a blank line, a row of dashes as wide as the message, the
    message itself, another row of dashes, and a trailing blank line.
    """
    rule = "-" * len(success_msg)

    cli_logger.newline()
    for banner_line in (rule, success_msg, rule):
        cli_logger.error(banner_line)
    cli_logger.newline()
|
|
|
|
|
|
|
|
|
|
|
|
def _log_job_status(client: JobSubmissionClient, job_id: str):
    """Fetch the job's info from ``client`` and log a status summary.

    * SUCCEEDED: success banner.
    * STOPPED: a single warning line.
    * FAILED: error banner, followed by the status message if there is one.
    * anything else: a plain status line, followed by the status message if
      there is one.

    Args:
        client: Client used to look up the job.
        job_id: Identifier of the job to report on.
    """
    job_info = client.get_job_info(job_id)
    state = job_info.status

    if state == JobStatus.SUCCEEDED:
        _log_big_success_msg(f"Job '{job_id}' succeeded")
        return

    if state == JobStatus.STOPPED:
        cli_logger.warning(f"Job '{job_id}' was stopped")
        return

    if state == JobStatus.FAILED:
        _log_big_error_msg(f"Job '{job_id}' failed")
    else:
        # Catch-all for every remaining status.
        cli_logger.print(f"Status for job '{job_id}': {state}")

    # Both remaining paths append the status message when one is available.
    # no_format=True so the message text is printed verbatim.
    if job_info.message is not None:
        cli_logger.print(f"Status message: {job_info.message}", no_format=True)
|
2021-12-14 17:01:53 -08:00
|
|
|
|
|
|
|
|
|
|
|
async def _tail_logs(client: JobSubmissionClient, job_id: str):
    """Stream a job's logs to stdout, then log the job's status.

    Args:
        client: Client whose ``tail_job_logs`` async iterator is consumed.
        job_id: Identifier of the job whose logs are tailed.
    """
    async for log_chunk in client.tail_job_logs(job_id):
        # Chunks are written as received; no newline is appended.
        print(log_chunk, end="")

    # The log stream is exhausted; report the job's status.
    _log_job_status(client, job_id)
|
|
|
|
|
|
|
|
|
2021-11-11 15:59:13 -08:00
|
|
|
# Root click group for the "job" subcommands defined below in this module.
# Deliberately left without a docstring: click would surface one as the
# group's help text, which would change the CLI output.
@click.group("job")
def job_cli_group():
    pass
|
|
|
|
|
|
|
|
|
2022-02-28 15:57:46 -08:00
|
|
|
@job_cli_group.command()
@click.option(
    "--address",
    type=str,
    default=None,
    required=False,
    help=(
        "Address of the Ray cluster to connect to. Can also be specified "
        "using the RAY_ADDRESS environment variable."
    ),
)
@click.option(
    "--job-id",
    type=str,
    default=None,
    required=False,
    # Fixed: the help previously read "-- submission-id" (stray space), which
    # is not the spelling of the option users must type.
    help="DEPRECATED: Use --submission-id instead.",
)
@click.option(
    "--submission-id",
    type=str,
    default=None,
    required=False,
    help=(
        "Submission ID to specify for the job. "
        "If not provided, one will be generated."
    ),
)
@click.option(
    "--runtime-env",
    type=str,
    default=None,
    required=False,
    help="Path to a local YAML file containing a runtime_env definition.",
)
@click.option(
    "--runtime-env-json",
    type=str,
    default=None,
    required=False,
    help="JSON-serialized runtime_env dictionary.",
)
@click.option(
    "--working-dir",
    type=str,
    default=None,
    required=False,
    help=(
        "Directory containing files that your job will run in. Can be a "
        "local directory or a remote URI to a .zip file (S3, GS, HTTP). "
        "If specified, this overrides the option in --runtime-env."
    ),
)
@click.option(
    "--no-wait",
    is_flag=True,
    type=bool,
    default=False,
    help="If set, will not stream logs and wait for the job to exit.",
)
@add_click_logging_options
@click.argument("entrypoint", nargs=-1, required=True, type=click.UNPROCESSED)
@PublicAPI
def submit(
    address: Optional[str],
    job_id: Optional[str],
    submission_id: Optional[str],
    runtime_env: Optional[str],
    runtime_env_json: Optional[str],
    working_dir: Optional[str],
    entrypoint: Tuple[str],
    no_wait: bool,
):
    """Submits a job to be run on the cluster.

    Example:
        ray job submit -- python my_script.py --arg=val
    """
    # NOTE: the docstring above is user-facing (click shows it as the
    # command's help text), so implementation notes are kept in comments.
    #
    # Parameters mirror the click options/argument declared above:
    #   address           cluster address; _get_sdk_client falls back to
    #                     RAY_ADDRESS when this is None.
    #   job_id            deprecated alias for submission_id.
    #   submission_id     explicit ID for the submission (optional).
    #   runtime_env / runtime_env_json / working_dir
    #                     combined by parse_runtime_env_args.
    #   entrypoint        command tokens, joined with list2cmdline.
    #   no_wait           skip tailing logs after submission.
    if job_id:
        cli_logger.warning(
            "--job-id option is deprecated. Please use --submission-id instead."
        )

    # --submission-id wins when both are given; --job-id is only a fallback.
    submission_id = submission_id or job_id

    if ray_constants.RAY_JOB_SUBMIT_HOOK in os.environ:
        # Submit all args as **kwargs per the JOB_SUBMIT_HOOK contract.
        # The resolved ID is passed under both names (job_id, submission_id).
        submit_hook = _load_class(os.environ[ray_constants.RAY_JOB_SUBMIT_HOOK])
        submit_hook(
            address=address,
            job_id=submission_id,
            submission_id=submission_id,
            runtime_env=runtime_env,
            runtime_env_json=runtime_env_json,
            working_dir=working_dir,
            entrypoint=entrypoint,
            no_wait=no_wait,
        )

    client = _get_sdk_client(address, create_cluster_if_needed=True)

    final_runtime_env = parse_runtime_env_args(
        runtime_env=runtime_env,
        runtime_env_json=runtime_env_json,
        working_dir=working_dir,
    )

    # From here on, job_id is the identifier returned by the server.
    job_id = client.submit_job(
        entrypoint=list2cmdline(entrypoint),
        submission_id=submission_id,
        runtime_env=final_runtime_env,
    )

    _log_big_success_msg(f"Job '{job_id}' submitted successfully")

    with cli_logger.group("Next steps"):
        cli_logger.print("Query the logs of the job:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job logs {job_id}"))

        cli_logger.print("Query the status of the job:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job status {job_id}"))

        cli_logger.print("Request the job to be stopped:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job stop {job_id}"))

    cli_logger.newline()
    sdk_version = client.get_version()
    # sdk version 0 does not have log streaming
    if not no_wait:
        if int(sdk_version) > 0:
            cli_logger.print(
                "Tailing logs until the job exits (disable with --no-wait):"
            )
            asyncio.get_event_loop().run_until_complete(_tail_logs(client, job_id))
        else:
            cli_logger.warning(
                "Tailing logs is not enabled for job sdk client version "
                f"{sdk_version}. Please upgrade your ray to latest version "
                "for this feature."
            )
|
2021-11-11 15:59:13 -08:00
|
|
|
|
|
|
|
|
2022-02-28 15:57:46 -08:00
|
|
|
@job_cli_group.command()
@click.option(
    "--address",
    type=str,
    default=None,
    required=False,
    help=(
        "Address of the Ray cluster to connect to. "
        "Can also be specified using the RAY_ADDRESS environment variable."
    ),
)
@click.argument("job-id", type=str)
@add_click_logging_options
@PublicAPI(stability="beta")
def status(address: Optional[str], job_id: str):
    """Queries for the current status of a job.

    Example:
        ray job status <my_job_id>
    """
    # The docstring is user-facing help text; implementation notes go here.
    # Build a client for the target cluster and let the shared helper fetch
    # and print the job's status.
    sdk_client = _get_sdk_client(address)
    _log_job_status(sdk_client, job_id)
|
2021-11-11 15:59:13 -08:00
|
|
|
|
|
|
|
|
2022-02-28 15:57:46 -08:00
|
|
|
@job_cli_group.command()
@click.option(
    "--address",
    type=str,
    default=None,
    required=False,
    help=(
        "Address of the Ray cluster to connect to. "
        "Can also be specified using the RAY_ADDRESS environment variable."
    ),
)
@click.option(
    "--no-wait",
    is_flag=True,
    type=bool,
    default=False,
    help="If set, will not wait for the job to exit.",
)
@click.argument("job-id", type=str)
@add_click_logging_options
@PublicAPI(stability="beta")
def stop(address: Optional[str], no_wait: bool, job_id: str):
    """Attempts to stop a job.

    Example:
        ray job stop <my_job_id>
    """
    # The docstring is user-facing help text; implementation notes go here.
    sdk_client = _get_sdk_client(address)
    cli_logger.print(f"Attempting to stop job {job_id}")
    sdk_client.stop_job(job_id)

    # With --no-wait the stop request is fire-and-forget.
    if no_wait:
        return

    cli_logger.print(f"Waiting for job '{job_id}' to exit (disable with --no-wait):")

    # Poll once per second until the job reports one of these statuses.
    exited_statuses = {JobStatus.STOPPED, JobStatus.SUCCEEDED, JobStatus.FAILED}
    current_status = sdk_client.get_job_status(job_id)
    while current_status not in exited_statuses:
        cli_logger.print(f"Job has not exited yet. Status: {current_status}")
        time.sleep(1)
        current_status = sdk_client.get_job_status(job_id)

    _log_job_status(sdk_client, job_id)
|
|
|
|
|
2021-11-11 15:59:13 -08:00
|
|
|
|
2022-02-28 15:57:46 -08:00
|
|
|
@job_cli_group.command()
@click.option(
    "--address",
    type=str,
    default=None,
    required=False,
    help=(
        "Address of the Ray cluster to connect to. "
        "Can also be specified using the RAY_ADDRESS environment variable."
    ),
)
@click.argument("job-id", type=str)
@click.option(
    "-f",
    "--follow",
    is_flag=True,
    type=bool,
    default=False,
    help="If set, follow the logs (like `tail -f`).",
)
@add_click_logging_options
@PublicAPI(stability="beta")
def logs(address: Optional[str], job_id: str, follow: bool):
    """Gets the logs of a job.

    Example:
        ray job logs <my_job_id>
    """
    # The docstring is user-facing help text; implementation notes go here.
    sdk_client = _get_sdk_client(address)
    sdk_version = sdk_client.get_version()

    if not follow:
        # Set no_format to True because the logs may have unescaped "{" and
        # "}" and the CLILogger calls str.format().
        cli_logger.print(sdk_client.get_job_logs(job_id), end="", no_format=True)
        return

    # sdk version 0 did not have log streaming
    if int(sdk_version) <= 0:
        cli_logger.warning(
            "Tailing logs is not enabled for job sdk client version "
            f"{sdk_version}. Please upgrade your ray to latest version "
            "for this feature."
        )
        return

    asyncio.get_event_loop().run_until_complete(_tail_logs(sdk_client, job_id))
|
|
|
|
|
|
|
|
|
|
|
|
@job_cli_group.command()
@click.option(
    "--address",
    type=str,
    default=None,
    required=False,
    help=(
        "Address of the Ray cluster to connect to. "
        "Can also be specified using the RAY_ADDRESS environment variable."
    ),
)
@add_click_logging_options
@PublicAPI(stability="beta")
def list(address: Optional[str]):
    """Lists all running jobs and their information.

    Example:
        ray job list
    """
    # The docstring is user-facing help text; implementation notes go here.
    # NOTE(review): this function shadows the ``list`` builtin for the rest of
    # the module; the name is kept because it is the public command name.
    sdk_client = _get_sdk_client(address)
    jobs_report = pprint.pformat(sdk_client.list_jobs())

    # Set no_format to True because the pretty-printed job info may contain
    # unescaped "{" and "}" and the CLILogger calls str.format().
    cli_logger.print(jobs_report, no_format=True)
|