# ray/dashboard/modules/job/common.py
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, Optional, Tuple, Union
import pickle
from ray import ray_constants
from ray.experimental.internal_kv import (
_internal_kv_initialized,
_internal_kv_get,
_internal_kv_list,
_internal_kv_put,
)
from ray._private.runtime_env.packaging import parse_uri
# NOTE(edoakes): these constants should be considered a public API because
# they're exposed in the snapshot API.
# Keys used to tag job-submission info into the job's metadata.
JOB_ID_METADATA_KEY = "job_submission_id"
JOB_NAME_METADATA_KEY = "job_name"
# Version of the job submission API surface, returned to clients so they can
# check compatibility (presumably via VersionResponse — confirm against the
# HTTP handlers).
# Version 0 -> 1: Added log streaming and changed behavior of job logs cli.
CURRENT_VERSION = "1"
class JobStatus(str, Enum):
    """States a submitted job can be in.

    Inherits from ``str`` so instances compare equal to (and serialize as)
    their plain string values.
    """

    PENDING = "PENDING"
    RUNNING = "RUNNING"
    STOPPED = "STOPPED"
    SUCCEEDED = "SUCCEEDED"
    FAILED = "FAILED"

    def __str__(self) -> str:
        """Render as the bare status value, e.g. ``"RUNNING"``."""
        return self.value

    def is_terminal(self) -> bool:
        """Return True once the job can no longer change state."""
        return self in (JobStatus.STOPPED, JobStatus.SUCCEEDED, JobStatus.FAILED)
@dataclass
class JobStatusInfo:
    """A job's current status plus a human-readable explanation."""

    status: JobStatus
    message: Optional[str] = None

    def __post_init__(self):
        """Fill in a default message for the status if none was supplied."""
        if self.message is not None:
            return
        default_messages = {
            JobStatus.PENDING: (
                "Job has not started yet, likely waiting "
                "for the runtime_env to be set up."
            ),
            JobStatus.RUNNING: "Job is currently running.",
            JobStatus.STOPPED: "Job was intentionally stopped.",
            JobStatus.SUCCEEDED: "Job finished successfully.",
            JobStatus.FAILED: "Job failed.",
        }
        # Unknown statuses (none exist today) leave message as None,
        # matching the original if/elif chain's fall-through.
        self.message = default_messages.get(self.status)
class JobStatusStorageClient:
    """Stores and retrieves job status entries in the internal KV store.

    Handles formatting of the status storage key given a job id. All
    entries live under the ``KV_NAMESPACE_JOB`` namespace.
    """

    JOB_STATUS_KEY_PREFIX = "_ray_internal_job_status"
    JOB_STATUS_KEY = f"{JOB_STATUS_KEY_PREFIX}_{{job_id}}"

    def __init__(self):
        assert _internal_kv_initialized()

    def put_status(self, job_id: str, status: Union[JobStatus, JobStatusInfo]):
        """Persist the status for `job_id`, wrapping a bare JobStatus.

        Raises:
            TypeError: if `status` is neither JobStatus nor JobStatusInfo.
        """
        if isinstance(status, JobStatus):
            status = JobStatusInfo(status=status)
        elif not isinstance(status, JobStatusInfo):
            # Raise instead of `assert` so the check survives `python -O`.
            raise TypeError("status must be JobStatus or JobStatusInfo.")
        _internal_kv_put(
            self.JOB_STATUS_KEY.format(job_id=job_id),
            pickle.dumps(status),
            namespace=ray_constants.KV_NAMESPACE_JOB,
        )

    def get_status(self, job_id: str) -> Optional[JobStatusInfo]:
        """Return the stored JobStatusInfo for `job_id`, or None if absent."""
        pickled_status = _internal_kv_get(
            self.JOB_STATUS_KEY.format(job_id=job_id),
            namespace=ray_constants.KV_NAMESPACE_JOB,
        )
        if pickled_status is None:
            return None
        return pickle.loads(pickled_status)

    def get_all_jobs(self) -> Dict[str, JobStatusInfo]:
        """Return a mapping of every stored job id to its status info."""
        # BUG FIX: statuses are written under KV_NAMESPACE_JOB (see
        # put_status), so the listing must use the same namespace or it
        # will never see any of the stored keys.
        raw_keys = _internal_kv_list(
            self.JOB_STATUS_KEY_PREFIX, namespace=ray_constants.KV_NAMESPACE_JOB
        )
        # BUG FIX: the KV listing returns full keys of the form
        # f"{JOB_STATUS_KEY_PREFIX}_{job_id}"; strip the prefix and the
        # "_" separator to recover the bare job_id, otherwise get_status
        # is called with a doubly-prefixed key and always returns None.
        job_ids = []
        for raw_key in raw_keys:
            key = raw_key.decode()
            assert key.startswith(self.JOB_STATUS_KEY_PREFIX)
            job_ids.append(key[len(self.JOB_STATUS_KEY_PREFIX) + 1 :])
        return {job_id: self.get_status(job_id) for job_id in job_ids}
def uri_to_http_components(package_uri: str) -> Tuple[str, str]:
    """Split a package URI into (protocol value, package name sans ".zip").

    We need to strip the gcs:// prefix and .zip suffix to make it possible
    to pass the package_uri over HTTP.

    Raises:
        ValueError: if `package_uri` does not end in ".zip".
    """
    suffix = ".zip"
    if not package_uri.endswith(suffix):
        raise ValueError(f"package_uri ({package_uri}) does not end in .zip")
    protocol, package_name = parse_uri(package_uri)
    return protocol.value, package_name[: -len(suffix)]
def http_uri_components_to_uri(protocol: str, package_name: str) -> str:
    """Rebuild a full package URI from its HTTP-safe components.

    Inverse of uri_to_http_components: re-attaches the protocol prefix and
    the ".zip" suffix.

    Raises:
        ValueError: if `package_name` already ends in ".zip".
    """
    if package_name.endswith(".zip"):
        raise ValueError(f"package_name ({package_name}) should not end in .zip")
    return "".join((protocol, "://", package_name, ".zip"))
def validate_request_type(json_data: Dict[str, Any], request_type: dataclass) -> Any:
    """Build a `request_type` instance from a parsed JSON dict.

    Field validation is delegated to the dataclass's own __init__ /
    __post_init__; unexpected keys raise TypeError from the constructor.
    """
    request = request_type(**json_data)
    return request
@dataclass
class VersionResponse:
    """Response payload reporting API and Ray version information."""

    # Job submission API version (see CURRENT_VERSION at module level).
    version: str
    # Version string of the running Ray installation.
    ray_version: str
    # Git commit hash of the running Ray installation.
    ray_commit: str
@dataclass
class JobSubmitRequest:
    """Request body for submitting a new job.

    __post_init__ type-checks every field and raises TypeError on the
    first mismatch, so a constructed instance is always well-typed.
    """

    # Command to start execution, ex: "python script.py"
    entrypoint: str
    # Optional job_id to specify for the job. If the job_id is not specified,
    # one will be generated. If a job with the same job_id already exists, it
    # will be rejected.
    job_id: Optional[str] = None
    # Dict to setup execution environment.
    runtime_env: Optional[Dict[str, Any]] = None
    # Metadata to pass in to the JobConfig.
    metadata: Optional[Dict[str, str]] = None

    def __post_init__(self):
        """Validate field types; raises TypeError on the first bad field."""
        if not isinstance(self.entrypoint, str):
            raise TypeError(f"entrypoint must be a string, got {type(self.entrypoint)}")

        if self.job_id is not None and not isinstance(self.job_id, str):
            raise TypeError(
                f"job_id must be a string if provided, got {type(self.job_id)}"
            )

        if self.runtime_env is not None:
            # Guard clause: raise immediately on a non-dict, then check keys.
            if not isinstance(self.runtime_env, dict):
                raise TypeError(
                    f"runtime_env must be a dict, got {type(self.runtime_env)}"
                )
            for key in self.runtime_env:
                if not isinstance(key, str):
                    raise TypeError(f"runtime_env keys must be strings, got {type(key)}")

        if self.metadata is not None:
            if not isinstance(self.metadata, dict):
                raise TypeError(f"metadata must be a dict, got {type(self.metadata)}")
            # Keys are checked before values, matching the error-reporting
            # order of the original implementation.
            for key in self.metadata:
                if not isinstance(key, str):
                    raise TypeError(f"metadata keys must be strings, got {type(key)}")
            for value in self.metadata.values():
                if not isinstance(value, str):
                    raise TypeError(f"metadata values must be strings, got {type(value)}")
@dataclass
class JobSubmitResponse:
    """Response to a job submission request."""

    # The id under which the job was accepted (possibly server-generated).
    job_id: str
@dataclass
class JobStopResponse:
    """Response to a job stop request."""

    # Whether the stop request took effect — presumably True on success;
    # confirm against the handler that populates this.
    stopped: bool
@dataclass
class JobStatusResponse:
    """Response carrying a job's current status."""

    status: JobStatus
    # Human-readable detail accompanying the status (may be None).
    message: Optional[str]
# TODO(jiaodong): Support log streaming #19415
@dataclass
class JobLogsResponse:
    """Response containing a job's logs as a single string."""

    logs: str