import collections
import logging
import numpy as np
from typing import List, Any, Dict, Optional, TYPE_CHECKING, Union

from ray.rllib.env.base_env import _DUMMY_AGENT_ID
from ray.rllib.evaluation.episode import MultiAgentEpisode
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch
from ray.rllib.utils.annotations import Deprecated, DeveloperAPI
from ray.rllib.utils.debug import summarize
from ray.rllib.utils.deprecation import deprecation_warning
from ray.rllib.utils.typing import PolicyID, AgentID
from ray.util.debug import log_once

if TYPE_CHECKING:
    from ray.rllib.agents.callbacks import DefaultCallbacks

logger = logging.getLogger(__name__)


def to_float_array(v: List[Any]) -> np.ndarray:
    arr = np.array(v)
    if arr.dtype == np.float64:
        return arr.astype(np.float32)  # save some memory
    return arr


@Deprecated(new="a child class of `SampleCollector`", error=False)
class SampleBatchBuilder:
    """Util to build a SampleBatch incrementally.

    For efficiency, SampleBatches hold values in column form (as arrays).
    However, it is useful to add data one row (dict) at a time.
    """

    _next_unroll_id = 0  # disambiguates unrolls within a single episode

    def __init__(self):
        self.buffers: Dict[str, List] = collections.defaultdict(list)
        self.count = 0

    def add_values(self, **values: Any) -> None:
        """Add the given dictionary (row) of values to this batch."""

        for k, v in values.items():
            self.buffers[k].append(v)
        self.count += 1

    def add_batch(self, batch: SampleBatch) -> None:
        """Add the given batch of values to this batch."""

        for k, column in batch.items():
            self.buffers[k].extend(column)
        self.count += batch.count

    def build_and_reset(self) -> SampleBatch:
        """Returns a sample batch including all previously added values."""

        batch = SampleBatch(
            {k: to_float_array(v)
             for k, v in self.buffers.items()})
        if SampleBatch.UNROLL_ID not in batch:
            batch[SampleBatch.UNROLL_ID] = np.repeat(
                SampleBatchBuilder._next_unroll_id, batch.count)
            SampleBatchBuilder._next_unroll_id += 1
        self.buffers.clear()
        self.count = 0
        return batch
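

# A minimal usage sketch for SampleBatchBuilder (illustrative only, not part
# of the original module; kept as a comment so importing this file has no
# side effects; the column names below are arbitrary examples):
#
#     builder = SampleBatchBuilder()
#     builder.add_values(obs=0, actions=1, rewards=1.0, dones=False)
#     builder.add_values(obs=1, actions=0, rewards=0.5, dones=True)
#     batch = builder.build_and_reset()  # SampleBatch with count == 2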


# Deprecated class: Use a child class of `SampleCollector` instead
# (which handles multi-agent setups as well).
@DeveloperAPI
class MultiAgentSampleBatchBuilder:
    """Util to build SampleBatches for each policy in a multi-agent env.

    Input data is per-agent, while output data is per-policy. There is an M:N
    mapping between agents and policies. We retain one local batch builder
    per agent. When an agent is done, its local batch is appended into the
    corresponding policy batch for the agent's policy.
    """
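
    # Illustrative sketch of the M:N agent-to-policy mapping described
    # above (agent and policy names here are hypothetical, not RLlib's):
    #
    #     policy_map = {"shared": shared_policy, "scout": scout_policy}
    #     # "agent_0", "agent_1" -> "shared"; "agent_2" -> "scout":
    #     # each agent's rows are merged into its policy's batch builder.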

    def __init__(self, policy_map: Dict[PolicyID, Policy],
                 clip_rewards: Union[bool, float],
                 callbacks: "DefaultCallbacks"):
        """Initialize a MultiAgentSampleBatchBuilder.

        Args:
            policy_map (Dict[str,Policy]): Maps policy ids to policy
                instances.
            clip_rewards (Union[bool,float]): Whether to clip rewards before
                postprocessing (at +/-1.0) or the actual value to +/- clip.
            callbacks (DefaultCallbacks): RLlib callbacks.
        """
        if log_once("MultiAgentSampleBatchBuilder"):
            deprecation_warning(
                old="MultiAgentSampleBatchBuilder", error=False)
        self.policy_map = policy_map
        self.clip_rewards = clip_rewards
        # Build the Policies' SampleBatchBuilders.
        self.policy_builders = {
            k: SampleBatchBuilder()
            for k in policy_map.keys()
        }
        # Whenever we observe a new agent, add a new SampleBatchBuilder for
        # this agent.
        self.agent_builders = {}
        # Internal agent-to-policy map.
        self.agent_to_policy = {}
        self.callbacks = callbacks
        # Number of "inference" steps taken in the environment,
        # regardless of the number of agents involved in each of these steps.
        self.count = 0

    def total(self) -> int:
        """Returns the total number of steps taken in the env (all agents).

        Returns:
            int: The number of steps taken in total in the environment over
                all agents.
        """

        return sum(a.count for a in self.agent_builders.values())

    def has_pending_agent_data(self) -> bool:
        """Returns whether there is pending unprocessed data.

        Returns:
            bool: True if there is at least one per-agent builder (with data
                in it).
        """

        return len(self.agent_builders) > 0

    @DeveloperAPI
    def add_values(self, agent_id: AgentID, policy_id: PolicyID,
                   **values: Any) -> None:
        """Add the given dictionary (row) of values to this batch.

        Args:
            agent_id (obj): Unique id for the agent we are adding values for.
            policy_id (obj): Unique id for policy controlling the agent.
            values (dict): Row of values to add for this agent.
        """

        if agent_id not in self.agent_builders:
            self.agent_builders[agent_id] = SampleBatchBuilder()
            self.agent_to_policy[agent_id] = policy_id

        # Include the current agent id for multi-agent algorithms.
        if agent_id != _DUMMY_AGENT_ID:
            values["agent_id"] = agent_id

        self.agent_builders[agent_id].add_values(**values)

    def postprocess_batch_so_far(
            self, episode: Optional[MultiAgentEpisode] = None) -> None:
        """Apply policy postprocessors to any unprocessed rows.

        This pushes the postprocessed per-agent batches onto the per-policy
        builders, clearing per-agent state.

        Args:
            episode (Optional[MultiAgentEpisode]): The Episode object that
                holds this MultiAgentSampleBatchBuilder object.
        """

        # Materialize the batches so far.
        pre_batches = {}
        for agent_id, builder in self.agent_builders.items():
            pre_batches[agent_id] = (
                self.policy_map[self.agent_to_policy[agent_id]],
                builder.build_and_reset())

        # Apply postprocessor.
        post_batches = {}
        if self.clip_rewards is True:
            for _, (_, pre_batch) in pre_batches.items():
                pre_batch["rewards"] = np.sign(pre_batch["rewards"])
        elif self.clip_rewards:
            for _, (_, pre_batch) in pre_batches.items():
                pre_batch["rewards"] = np.clip(
                    pre_batch["rewards"],
                    a_min=-self.clip_rewards,
                    a_max=self.clip_rewards)
        for agent_id, (_, pre_batch) in pre_batches.items():
            other_batches = pre_batches.copy()
            del other_batches[agent_id]
            policy = self.policy_map[self.agent_to_policy[agent_id]]
            if any(pre_batch["dones"][:-1]) or len(set(
                    pre_batch["eps_id"])) > 1:
                raise ValueError(
                    "Batches sent to postprocessing must only contain steps "
                    "from a single trajectory.", pre_batch)
            # Call the Policy's Exploration's postprocess method.
            post_batches[agent_id] = pre_batch
            if getattr(policy, "exploration", None) is not None:
                policy.exploration.postprocess_trajectory(
                    policy, post_batches[agent_id], policy.get_session())
            post_batches[agent_id] = policy.postprocess_trajectory(
                post_batches[agent_id], other_batches, episode)

        if log_once("after_post"):
            logger.info(
                "Trajectory fragment after postprocess_trajectory():\n\n{}\n".
                format(summarize(post_batches)))

        # Append into policy batches and reset.
        from ray.rllib.evaluation.rollout_worker import get_global_worker
        for agent_id, post_batch in sorted(post_batches.items()):
            self.callbacks.on_postprocess_trajectory(
                worker=get_global_worker(),
                episode=episode,
                agent_id=agent_id,
                policy_id=self.agent_to_policy[agent_id],
                policies=self.policy_map,
                postprocessed_batch=post_batch,
                original_batches=pre_batches)
            self.policy_builders[self.agent_to_policy[agent_id]].add_batch(
                post_batch)

        self.agent_builders.clear()
        self.agent_to_policy.clear()

    def check_missing_dones(self) -> None:
        """Raises an error if any agent's last collected row is not done."""
        for agent_id, builder in self.agent_builders.items():
            if builder.buffers["dones"][-1] is not True:
                raise ValueError(
                    "The environment terminated for all agents, but we still "
                    "don't have a last observation for "
                    "agent {} (policy {}). ".format(
                        agent_id, self.agent_to_policy[agent_id]) +
                    "Please ensure that you include the last observations "
                    "of all live agents when setting '__all__' done to True. "
                    "Alternatively, set no_done_at_end=True to allow this.")

    @DeveloperAPI
    def build_and_reset(self, episode: Optional[MultiAgentEpisode] = None
                        ) -> MultiAgentBatch:
        """Returns the accumulated sample batches for each policy.

        Any unprocessed rows will be first postprocessed with a policy
        postprocessor. The internal state of this builder will be reset.

        Args:
            episode (Optional[MultiAgentEpisode]): The Episode object that
                holds this MultiAgentSampleBatchBuilder object, or None.

        Returns:
            MultiAgentBatch: Returns the accumulated sample batches for each
                policy.
        """

        self.postprocess_batch_so_far(episode)
        policy_batches = {}
        for policy_id, builder in self.policy_builders.items():
            if builder.count > 0:
                policy_batches[policy_id] = builder.build_and_reset()
        old_count = self.count
        self.count = 0
        return MultiAgentBatch.wrap_as_needed(policy_batches, old_count)
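

# A hedged end-to-end usage sketch for MultiAgentSampleBatchBuilder
# (illustrative only and not part of the original module; `policy_map`,
# `episode` and the per-step column values are assumed to be supplied by
# the caller, e.g. a RolloutWorker's sampler, which also increments
# `builder.count` once per environment step):
#
#     builder = MultiAgentSampleBatchBuilder(
#         policy_map, clip_rewards=False, callbacks=DefaultCallbacks())
#     builder.add_values("agent_0", "policy_0", t=0, eps_id=0, obs=0,
#                        actions=1, rewards=1.0, dones=True)
#     builder.add_values("agent_1", "policy_0", t=0, eps_id=0, obs=3,
#                        actions=0, rewards=0.5, dones=True)
#     builder.count += 1  # one env step, regardless of how many agents acted
#     ma_batch = builder.build_and_reset(episode)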