ray/rllib/offline/json_writer.py



from datetime import datetime
import json
import logging
import os
import time
from typing import Any, Dict, List

import numpy as np
from six.moves.urllib.parse import urlparse

try:
    from smart_open import smart_open
except ImportError:
    smart_open = None

from ray.rllib.policy.sample_batch import MultiAgentBatch
from ray.rllib.offline.io_context import IOContext
from ray.rllib.offline.output_writer import OutputWriter
from ray.rllib.utils.annotations import override, PublicAPI
from ray.rllib.utils.compression import pack, compression_supported
from ray.rllib.utils.typing import FileType, SampleBatchType
from ray.util.ml_utils.json import SafeFallbackEncoder

logger = logging.getLogger(__name__)

WINDOWS_DRIVES = [chr(i) for i in range(ord("c"), ord("z") + 1)]


# TODO(jungong): use DatasetWriter to back JsonWriter, so we reduce
# codebase complexity without losing existing functionality.
@PublicAPI
class JsonWriter(OutputWriter):
    """Writer object that saves experiences in JSON file chunks."""

    @PublicAPI
    def __init__(
        self,
        path: str,
        ioctx: IOContext = None,
        max_file_size: int = 64 * 1024 * 1024,
        compress_columns: List[str] = frozenset(["obs", "new_obs"]),
    ):
"""Initializes a JsonWriter instance.
        Args:
            path: Path or URI of the output directory to save files in.
            ioctx: Current IO context object.
            max_file_size: Max size of single files before rolling over.
            compress_columns: List of sample batch columns to compress.
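
        Examples:
            A minimal usage sketch; the output directory and `sample_batch`
            here are placeholders for illustration:

            >>> from ray.rllib.offline.json_writer import JsonWriter
            >>> writer = JsonWriter("/tmp/rllib-out")  # doctest: +SKIP
            >>> writer.write(sample_batch)  # doctest: +SKIP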
"""
        logger.info(
            "You are using JsonWriter. It is recommended to use "
            "DatasetWriter instead."
        )

        self.ioctx = ioctx or IOContext()
        self.max_file_size = max_file_size
        self.compress_columns = compress_columns
        # On Windows, urlparse() reports a drive letter (e.g. "c") as the URL
        # scheme, so drive letters count as local paths rather than URIs.
        if urlparse(path).scheme not in [""] + WINDOWS_DRIVES:
            self.path_is_uri = True
        else:
            path = os.path.abspath(os.path.expanduser(path))
            # Try to create local dirs if they don't exist.
            try:
                os.makedirs(path)
            except OSError:
                pass  # Already exists.
            assert os.path.exists(path), "Failed to create {}".format(path)
            self.path_is_uri = False

        self.path = path
        self.file_index = 0
        self.bytes_written = 0
        self.cur_file = None

    @override(OutputWriter)
    def write(self, sample_batch: SampleBatchType):
        start = time.time()
        data = _to_json(sample_batch, self.compress_columns)
        f = self._get_file()
        f.write(data)
        f.write("\n")
        if hasattr(f, "flush"):  # Legacy smart_open impls.
            f.flush()
        self.bytes_written += len(data)
        logger.debug(
            "Wrote {} bytes to {} in {}s".format(len(data), f, time.time() - start)
        )

    def _get_file(self) -> FileType:
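        # Roll over to a fresh chunk file once the current one has reached
        # max_file_size. The size check runs before the next write, so a
        # chunk may exceed the limit by up to one serialized batch.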
        if not self.cur_file or self.bytes_written >= self.max_file_size:
            if self.cur_file:
                self.cur_file.close()
            timestr = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
            path = os.path.join(
                self.path,
                "output-{}_worker-{}_{}.json".format(
                    timestr, self.ioctx.worker_index, self.file_index
                ),
            )
            if self.path_is_uri:
                if smart_open is None:
                    raise ValueError(
                        "You must install the `smart_open` module to write "
                        "to URIs like {}".format(path)
                    )
                self.cur_file = smart_open(path, "w")
            else:
                self.cur_file = open(path, "w")
            self.file_index += 1
            self.bytes_written = 0
            logger.info("Writing to new output file {}".format(self.cur_file))
        return self.cur_file


def _to_jsonable(v, compress: bool) -> Any:
    if compress and compression_supported():
        return str(pack(v))
    elif isinstance(v, np.ndarray):
        return v.tolist()
    return v
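
# Rough illustration of _to_jsonable()'s fallbacks (the outputs shown are
# assumptions for illustration; exact packed text depends on the installed
# compression backend):
#   _to_jsonable(np.arange(2), compress=False) -> [0, 1]
#   _to_jsonable(np.arange(2), compress=True)  -> str(pack(...)), an opaque
#       packed text blob
#   If LZ4 is unavailable, compression_supported() is False and ndarrays
#   fall back to .tolist().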


def _to_json_dict(batch: SampleBatchType, compress_columns: List[str]) -> Dict:
    out = {}
    if isinstance(batch, MultiAgentBatch):
        out["type"] = "MultiAgentBatch"
        out["count"] = batch.count
        policy_batches = {}
        for policy_id, sub_batch in batch.policy_batches.items():
            policy_batches[policy_id] = {}
            for k, v in sub_batch.items():
                policy_batches[policy_id][k] = _to_jsonable(
                    v, compress=k in compress_columns
                )
        out["policy_batches"] = policy_batches
    else:
        out["type"] = "SampleBatch"
        for k, v in batch.items():
            out[k] = _to_jsonable(v, compress=k in compress_columns)
    return out


def _to_json(batch: SampleBatchType, compress_columns: List[str]) -> str:
    out = _to_json_dict(batch, compress_columns)
    return json.dumps(out, cls=SafeFallbackEncoder)
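

# A minimal end-to-end usage sketch: write one small SampleBatch to a
# temporary directory. The column names and values below are illustrative
# assumptions, not required fields.
if __name__ == "__main__":
    import tempfile

    from ray.rllib.policy.sample_batch import SampleBatch

    demo_batch = SampleBatch(
        {
            "obs": np.zeros((2, 4), dtype=np.float32),
            "actions": np.array([0, 1]),
            "rewards": np.array([1.0, -1.0]),
        }
    )
    out_dir = tempfile.mkdtemp()
    demo_writer = JsonWriter(out_dir)
    # Produces a file like output-<timestamp>_worker-0_0.json under out_dir.
    demo_writer.write(demo_batch)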