# ray/rllib/evaluation/env_runner_v2.py

import logging
import time
from collections import defaultdict, namedtuple
from typing import TYPE_CHECKING, Dict, Iterator, List, Optional, Tuple, Union
import numpy as np
import tree # pip install dm_tree
from ray.rllib.env.base_env import ASYNC_RESET_RETURN, BaseEnv
from ray.rllib.env.wrappers.atari_wrappers import MonitorEnv, get_wrapper_by_cls
from ray.rllib.evaluation.collectors.simple_list_collector import _PolicyCollectorGroup
from ray.rllib.evaluation.episode_v2 import EpisodeV2
from ray.rllib.evaluation.metrics import RolloutMetrics
from ray.rllib.models.preprocessors import Preprocessor
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch, concat_samples
from ray.rllib.utils.annotations import DeveloperAPI
from ray.rllib.utils.filter import Filter
from ray.rllib.utils.numpy import convert_to_numpy
from ray.rllib.utils.spaces.space_utils import unbatch
from ray.rllib.utils.typing import (
ActionConnectorDataType,
AgentConnectorDataType,
AgentID,
EnvActionType,
EnvID,
EnvObsType,
MultiAgentDict,
MultiEnvDict,
PolicyID,
PolicyOutputType,
SampleBatchType,
StateBatches,
TensorStructType,
)
from ray.util.debug import log_once
if TYPE_CHECKING:
from gym.envs.classic_control.rendering import SimpleImageViewer
from ray.rllib.algorithms.callbacks import DefaultCallbacks
from ray.rllib.evaluation.rollout_worker import RolloutWorker
logger = logging.getLogger(__name__)
MIN_LARGE_BATCH_THRESHOLD = 1000
DEFAULT_LARGE_BATCH_THRESHOLD = 5000
MS_TO_SEC = 1000.0
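# A single per-agent policy-evaluation item: the sub-env and agent the data
# belongs to, plus the connector-processed SampleBatch to use as model input.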
_PolicyEvalData = namedtuple("_PolicyEvalData", ["env_id", "agent_id", "sample_batch"])
class _PerfStats:
"""Sampler perf stats that will be included in rollout metrics."""
def __init__(self, ema_coef: Optional[float] = None):
# If not None, enable Exponential Moving Average mode.
# The way we update stats is by:
# updated = (1 - ema_coef) * old + ema_coef * new
# In general, this provides more responsive stats about sampler performance.
# TODO(jungong) : make ema the default (only) mode if it works well.
self.ema_coef = ema_coef
self.iters = 0
self.raw_obs_processing_time = 0.0
self.inference_time = 0.0
self.action_processing_time = 0.0
self.env_wait_time = 0.0
self.env_render_time = 0.0
def incr(self, field: str, value: Union[int, float]):
if field == "iters":
self.iters += value
return
# All the other fields support either global average or ema mode.
if self.ema_coef is None:
# Global average.
self.__dict__[field] += value
else:
self.__dict__[field] = (1.0 - self.ema_coef) * self.__dict__[
field
] + self.ema_coef * value
def _get_avg(self):
# Combined factor: divide by iteration count (mean) and convert sec -> ms.
factor = MS_TO_SEC / self.iters
return {
# Raw observation preprocessing.
"mean_raw_obs_processing_ms": self.raw_obs_processing_time * factor,
# Computing actions through policy.
"mean_inference_ms": self.inference_time * factor,
# Processing actions (to be sent to env, e.g. clipping).
"mean_action_processing_ms": self.action_processing_time * factor,
# Waiting for environment (during poll).
"mean_env_wait_ms": self.env_wait_time * factor,
# Environment rendering (False by default).
"mean_env_render_ms": self.env_render_time * factor,
}
def _get_ema(self):
# In EMA mode, stats are already (exponentially) averaged,
# hence we only need to do the sec -> ms conversion here.
return {
# Raw observation preprocessing.
"mean_raw_obs_processing_ms": self.raw_obs_processing_time * MS_TO_SEC,
# Computing actions through policy.
"mean_inference_ms": self.inference_time * MS_TO_SEC,
# Processing actions (to be sent to env, e.g. clipping).
"mean_action_processing_ms": self.action_processing_time * MS_TO_SEC,
# Waiting for environment (during poll).
"mean_env_wait_ms": self.env_wait_time * MS_TO_SEC,
# Environment rendering (False by default).
"mean_env_render_ms": self.env_render_time * MS_TO_SEC,
}
def get(self):
if self.ema_coef is None:
return self._get_avg()
else:
return self._get_ema()
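# defaultdict variant whose default_factory receives the missing key (here:
# the env_id) as an argument, instead of being called with no arguments.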
class _NewDefaultDict(defaultdict):
def __missing__(self, env_id):
ret = self[env_id] = self.default_factory(env_id)
return ret
def _build_multi_agent_batch(
episode_id: int,
batch_builder: _PolicyCollectorGroup,
large_batch_threshold: int,
multiple_episodes_in_batch: bool,
) -> MultiAgentBatch:
"""Build MultiAgentBatch from a dict of _PolicyCollectors.
Args:
env_steps: total env steps.
policy_collectors: collected training SampleBatchs by policy.
Returns:
Always returns a sample batch in MultiAgentBatch format.
"""
ma_batch = {}
for pid, collector in batch_builder.policy_collectors.items():
if collector.agent_steps <= 0:
continue
if batch_builder.agent_steps > large_batch_threshold and log_once(
"large_batch_warning"
):
logger.warning(
"More than {} observations in {} env steps for "
"episode {} ".format(
batch_builder.agent_steps, batch_builder.env_steps, episode_id
)
+ "are buffered in the sampler. If this is more than you "
"expected, check that that you set a horizon on your "
"environment correctly and that it terminates at some "
"point. Note: In multi-agent environments, "
"`rollout_fragment_length` sets the batch size based on "
"(across-agents) environment steps, not the steps of "
"individual agents, which can result in unexpectedly "
"large batches."
+ (
"Also, you may be waiting for your Env to "
"terminate (batch_mode=`complete_episodes`). Make sure "
"it does at some point."
if not multiple_episodes_in_batch
else ""
)
)
ma_batch[pid] = collector.build()
# Create the multi agent batch.
return MultiAgentBatch(policy_batches=ma_batch, env_steps=batch_builder.env_steps)
def _batch_inference_sample_batches(eval_data: List[SampleBatch]) -> SampleBatch:
"""Batch a list of input SampleBatches into a single SampleBatch.
Args:
eval_data: list of SampleBatches.
Returns:
single batched SampleBatch.
"""
inference_batch = concat_samples(eval_data)
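# If the policy is stateful (the batch carries RNN state inputs such as
# "state_in_0"), each input SampleBatch here corresponds to a single
# per-agent step, so SEQ_LENS is simply a vector of ones (one length-1
# sequence per input batch).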
if "state_in_0" in inference_batch:
batch_size = len(eval_data)
inference_batch[SampleBatch.SEQ_LENS] = np.ones(batch_size, dtype=np.int32)
return inference_batch
@DeveloperAPI
class EnvRunnerV2:
"""Collect experiences from user environment using Connectors."""
def __init__(
self,
worker: "RolloutWorker",
base_env: BaseEnv,
horizon: Optional[int],
multiple_episodes_in_batch: bool,
callbacks: "DefaultCallbacks",
perf_stats: _PerfStats,
soft_horizon: bool,
no_done_at_end: bool,
rollout_fragment_length: int = 200,
count_steps_by: str = "env_steps",
render: Optional[bool] = None,
):
"""
Args:
worker: Reference to the current rollout worker.
base_env: Env implementing BaseEnv.
horizon: Horizon of the episode.
multiple_episodes_in_batch: Whether to pack multiple
episodes into each batch. This guarantees batches will be exactly
`rollout_fragment_length` in size.
callbacks: User callbacks to run on episode events.
perf_stats: Record perf stats into this object.
soft_horizon: Calculate rewards but don't reset the
environment when the horizon is hit.
no_done_at_end: Ignore the done=True at the end of the episode
and instead record done=False.
rollout_fragment_length: The length of a fragment to collect
before building a SampleBatch from the data and resetting
the SampleBatchBuilder object.
count_steps_by: One of "env_steps" (default) or "agent_steps".
Use "agent_steps", if you want rollout lengths to be counted
by individual agent steps. In a multi-agent env,
a single env_step contains one or more agent_steps, depending
on how many agents are present at any given time in the
ongoing episode.
render: Whether to try to render the environment after each
step.
"""
self._worker = worker
self._base_env = base_env
self._multiple_episodes_in_batch = multiple_episodes_in_batch
self._callbacks = callbacks
self._perf_stats = perf_stats
self._soft_horizon = soft_horizon
self._no_done_at_end = no_done_at_end
self._rollout_fragment_length = rollout_fragment_length
self._count_steps_by = count_steps_by
self._render = render
self._horizon = self._get_horizon(horizon)
# May be populated for image rendering.
self._simple_image_viewer: Optional[
"SimpleImageViewer"
] = self._get_simple_image_viewer()
# Keeps track of active episodes.
self._active_episodes: Dict[EnvID, EpisodeV2] = _NewDefaultDict(
self._new_episode
)
self._batch_builders: Dict[EnvID, _PolicyCollectorGroup] = _NewDefaultDict(
self._new_batch_builder
)
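# Warn (once) if more agent steps than this are buffered for a single
# episode: 10x the rollout fragment length (but at least
# MIN_LARGE_BATCH_THRESHOLD), or DEFAULT_LARGE_BATCH_THRESHOLD if the
# fragment length is infinite (e.g. batch_mode=complete_episodes).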
self._large_batch_threshold: int = (
max(MIN_LARGE_BATCH_THRESHOLD, self._rollout_fragment_length * 10)
if self._rollout_fragment_length != float("inf")
else DEFAULT_LARGE_BATCH_THRESHOLD
)
def _get_horizon(self, horizon: Optional[int]):
"""Try figuring out the proper horizon to use for rollout.
Args:
horizon: Horizon of the episode as configured on the Trainer (may be None).
"""
# Try to get Env's `max_episode_steps` prop. If it doesn't exist, ignore
# error and continue with max_episode_steps=None.
max_episode_steps = None
try:
max_episode_steps = self._base_env.get_sub_environments()[
0
].spec.max_episode_steps
except Exception:
pass
# Trainer has a given `horizon` setting.
if horizon:
# `horizon` is larger than env's limit.
if max_episode_steps and horizon > max_episode_steps:
# Try to override the env's own max-step setting with our horizon.
# If this won't work, throw an error.
try:
self._base_env.get_sub_environments()[
0
].spec.max_episode_steps = horizon
self._base_env.get_sub_environments()[
0
]._max_episode_steps = horizon
except Exception:
raise ValueError(
"Your `horizon` setting ({}) is larger than the Env's own "
"timestep limit ({}), which seems to be unsettable! Try "
"to increase the Env's built-in limit to be at least as "
"large as your wanted `horizon`.".format(
horizon, max_episode_steps
)
)
# Otherwise, set Trainer's horizon to env's max-steps.
elif max_episode_steps:
horizon = max_episode_steps
logger.debug(
"No episode horizon specified, setting it to Env's limit ({}).".format(
max_episode_steps
)
)
# No horizon/max_episode_steps -> Episodes may be infinitely long.
else:
horizon = float("inf")
logger.debug("No episode horizon specified, assuming inf.")
return horizon
def _get_simple_image_viewer(self):
"""Maybe construct a SimpleImageViewer instance for episode rendering."""
# Try to render the env, if required.
if not self._render:
return None
try:
from gym.envs.classic_control.rendering import SimpleImageViewer
return SimpleImageViewer()
except (ImportError, ModuleNotFoundError):
self._render = False # disable rendering
logger.warning(
"Could not import gym.envs.classic_control."
"rendering! Try `pip install gym[all]`."
)
return None
def _new_episode(self, env_id) -> EpisodeV2:
"""Create a new episode."""
episode = EpisodeV2(
env_id,
self._worker.policy_map,
self._worker.policy_mapping_fn,
worker=self._worker,
callbacks=self._callbacks,
)
# Call each policy's Exploration.on_episode_start method.
# Note: This may break the exploration (e.g. ParameterNoise) of
# policies in the `policy_map` that have not been recently used
# (and are therefore stashed to disk). However, we certainly do not
# want to loop through all (even stashed) policies here as that
# would counter the purpose of the LRU policy caching.
for p in self._worker.policy_map.cache.values():
if getattr(p, "exploration", None) is not None:
p.exploration.on_episode_start(
policy=p,
environment=self._base_env,
episode=episode,
tf_sess=p.get_session(),
)
# Call on_episode_start callbacks.
self._callbacks.on_episode_start(
worker=self._worker,
base_env=self._base_env,
policies=self._worker.policy_map,
episode=episode,
env_index=env_id,
)
return episode
def _new_batch_builder(self, _) -> _PolicyCollectorGroup:
"""Create a new batch builder.
We create a _PolicyCollectorGroup based on the full policy_map
as the batch builder.
"""
return _PolicyCollectorGroup(self._worker.policy_map)
def run(self) -> Iterator[SampleBatchType]:
"""Samples and yields training episodes continuously.
Yields:
Object containing state, action, reward, terminal condition,
and other fields as dictated by `policy`.
"""
while True:
self._perf_stats.incr("iters", 1)
t0 = time.time()
# Get observations from all ready agents.
# types: MultiEnvDict, MultiEnvDict, MultiEnvDict, MultiEnvDict, ...
(
unfiltered_obs,
rewards,
dones,
infos,
off_policy_actions,
) = self._base_env.poll()
env_poll_time = time.time() - t0
# Process observations and prepare for policy evaluation.
t1 = time.time()
# types: Set[EnvID], Dict[PolicyID, List[_PolicyEvalData]],
# List[Union[RolloutMetrics, SampleBatchType]]
to_eval, outputs = self._process_observations(
unfiltered_obs=unfiltered_obs,
rewards=rewards,
dones=dones,
infos=infos,
)
self._perf_stats.incr("raw_obs_processing_time", time.time() - t1)
for o in outputs:
yield o
# Do batched policy eval (across vectorized envs).
t2 = time.time()
# types: Dict[PolicyID, Tuple[TensorStructType, StateBatch, dict]]
eval_results = self._do_policy_eval(to_eval=to_eval)
self._perf_stats.incr("inference_time", time.time() - t2)
# Process results and update episode state.
t3 = time.time()
actions_to_send: Dict[
EnvID, Dict[AgentID, EnvActionType]
] = self._process_policy_eval_results(
to_eval=to_eval,
eval_results=eval_results,
off_policy_actions=off_policy_actions,
)
self._perf_stats.incr("action_processing_time", time.time() - t3)
# Return computed actions to ready envs. We also send to envs that have
# taken off-policy actions; those envs are free to ignore the action.
t4 = time.time()
self._base_env.send_actions(actions_to_send)
self._perf_stats.incr("env_wait_time", env_poll_time + time.time() - t4)
self._maybe_render()
def _get_rollout_metrics(self, episode: EpisodeV2) -> List[RolloutMetrics]:
"""Get rollout metrics from completed episode."""
# TODO(jungong) : why do we need to handle atari metrics differently?
# Can we unify atari and normal env metrics?
atari_metrics: List[RolloutMetrics] = _fetch_atari_metrics(self._base_env)
if atari_metrics is not None:
# namedtuple._replace() returns a new object, so collect the updated copies.
return [
m._replace(custom_metrics=episode.custom_metrics) for m in atari_metrics
]
# Otherwise, return RolloutMetrics for the episode.
return [
RolloutMetrics(
episode.length,
episode.total_reward,
dict(episode.agent_rewards),
episode.custom_metrics,
{},
episode.hist_data,
episode.media,
)
]
def _process_observations(
self,
unfiltered_obs: MultiEnvDict,
rewards: MultiEnvDict,
dones: MultiEnvDict,
infos: MultiEnvDict,
) -> Tuple[
Dict[PolicyID, List[_PolicyEvalData]],
List[Union[RolloutMetrics, SampleBatchType]],
]:
"""Process raw obs from env.
Group data for active agents by policy. Reset environments that are done.
Args:
unfiltered_obs: The per-env, per-agent raw observations returned by
`BaseEnv.poll()` (not yet filtered/preprocessed).
rewards: The per-env, per-agent rewards from the last env step.
dones: The per-env, per-agent done flags (incl. the "__all__" key).
infos: The per-env, per-agent info dicts.
Returns:
A tuple of:
_PolicyEvalData for active agents for policy evaluation.
SampleBatches and RolloutMetrics for completed agents for output.
"""
# Output objects.
to_eval: Dict[PolicyID, List[_PolicyEvalData]] = defaultdict(list)
outputs: List[Union[RolloutMetrics, SampleBatchType]] = []
# For each (vectorized) sub-environment.
# types: EnvID, Dict[AgentID, EnvObsType]
for env_id, env_obs in unfiltered_obs.items():
# Check for env_id having returned an error instead of a multi-agent
# obs dict. This is how our BaseEnv can tell the caller to `poll()` that
# one of its sub-environments is faulty and should be restarted (and the
# ongoing episode should not be used for training).
if isinstance(env_obs, Exception):
assert dones[env_id]["__all__"] is True, (
f"ERROR: When a sub-environment (env-id {env_id}) returns an error "
"as observation, the dones[__all__] flag must also be set to True!"
)
# all_agents_obs is an Exception here.
# Drop this episode and skip to next.
self.end_episode(env_id, env_obs)
# Tell the sampler we have got a faulty episode.
outputs.append(RolloutMetrics(episode_faulty=True))
continue
episode: EpisodeV2 = self._active_episodes[env_id]
# Episode length after this step.
next_episode_length = episode.length + 1
# Check episode termination conditions.
if dones[env_id]["__all__"] or next_episode_length >= self._horizon:
hit_horizon = (
next_episode_length >= self._horizon
and not dones[env_id]["__all__"]
)
all_agents_done = True
# Add rollout metrics.
outputs.extend(self._get_rollout_metrics(episode))
else:
hit_horizon = False
all_agents_done = False
# Special handling of common info dict.
episode.set_last_info("__common__", infos[env_id].get("__common__", {}))
# Agent sample batches grouped by policy. Each set of sample batches will
# go through agent connectors together.
sample_batches_by_policy = defaultdict(list)
# Whether an agent is done, regardless of no_done_at_end or soft_horizon.
agent_dones = {}
for agent_id, obs in env_obs.items():
assert agent_id != "__all__"
policy_id: PolicyID = episode.policy_for(agent_id)
agent_done = bool(all_agents_done or dones[env_id].get(agent_id))
agent_dones[agent_id] = agent_done
# A completely new agent is already done -> Skip entirely.
if not episode.has_init_obs(agent_id) and agent_done:
continue
values_dict = {
SampleBatch.T: episode.length - 1,
SampleBatch.ENV_ID: env_id,
SampleBatch.AGENT_INDEX: episode.agent_index(agent_id),
# Last action (SampleBatch.ACTIONS) column will be populated by
# StateBufferConnector.
# Reward received after taking action at timestep t.
SampleBatch.REWARDS: rewards[env_id].get(agent_id, 0.0),
# After taking action=a, did we reach terminal?
SampleBatch.DONES: (
False
if (
self._no_done_at_end or (hit_horizon and self._soft_horizon)
)
else agent_done
),
SampleBatch.INFOS: infos[env_id].get(agent_id, {}),
SampleBatch.NEXT_OBS: obs,
}
# Queue this obs sample for connector preprocessing.
sample_batches_by_policy[policy_id].append((agent_id, values_dict))
# The entire episode is done.
if all_agents_done:
# Let's check to see if there are any agents that haven't got the
# last "done" obs yet. If there are, we have to create fake-last
# observations for them. (the environment is not required to do so if
# dones[__all__]=True).
for agent_id in episode.get_agents():
# If the latest obs we got for this agent is done, or if its
# episode state is already done, nothing to do.
if agent_dones.get(agent_id, False) or episode.is_done(agent_id):
continue
policy_id: PolicyID = episode.policy_for(agent_id)
policy = self._worker.policy_map[policy_id]
# Create a fake (all-0s) observation.
obs_space = policy.observation_space
obs_space = getattr(obs_space, "original_space", obs_space)
values_dict = {
SampleBatch.T: episode.length - 1,
SampleBatch.ENV_ID: env_id,
SampleBatch.AGENT_INDEX: episode.agent_index(agent_id),
SampleBatch.REWARDS: 0.0,
SampleBatch.DONES: True,
SampleBatch.INFOS: {},
SampleBatch.NEXT_OBS: tree.map_structure(
np.zeros_like, obs_space.sample()
),
}
# Queue these fake obs for connector preprocessing too.
sample_batches_by_policy[policy_id].append((agent_id, values_dict))
# Run agent connectors.
processed = []
for policy_id, batches in sample_batches_by_policy.items():
policy: Policy = self._worker.policy_map[policy_id]
# Collected full MultiAgentDicts for this environment.
# Run agent connectors.
assert (
policy.agent_connectors
), "EnvRunnerV2 requires agent connectors to work."
acd_list: List[AgentConnectorDataType] = [
AgentConnectorDataType(env_id, agent_id, data)
for agent_id, data in batches
]
processed.extend(policy.agent_connectors(acd_list))
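# Each connector output carries two views of the same data:
# `for_training` feeds the episode/batch collectors, while `for_action`
# becomes the input for the next policy forward pass.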
for d in processed:
# Record transition info if applicable.
if not episode.has_init_obs(d.agent_id):
episode.add_init_obs(
d.agent_id,
d.data.for_training[SampleBatch.T],
d.data.for_training[SampleBatch.NEXT_OBS],
)
else:
episode.add_action_reward_done_next_obs(
d.agent_id, d.data.for_training
)
if not all_agents_done and not agent_dones[d.agent_id]:
# Add to eval set if env is not done and this particular agent
# is also not done.
item = _PolicyEvalData(d.env_id, d.agent_id, d.data.for_action)
to_eval[policy_id].append(item)
# Finished advancing episode by 1 step, mark it so.
episode.step()
# Exception: The very first env.poll() call causes the env to get reset
# (no step taken yet, just a single starting observation logged).
# We need to skip this callback in this case.
if episode.length > 0:
# Invoke the `on_episode_step` callback after the step is logged
# to the episode.
self._callbacks.on_episode_step(
worker=self._worker,
base_env=self._base_env,
policies=self._worker.policy_map,
episode=episode,
env_index=env_id,
)
# Episode is done for all agents (dones[__all__] == True)
# or we hit the horizon.
if all_agents_done:
is_done = dones[env_id]["__all__"]
# _handle_done_episode will build a MultiAgentBatch for all
# the agents that are done during this step of rollout in
# the case of _multiple_episodes_in_batch=False.
self._handle_done_episode(
env_id, env_obs, is_done, hit_horizon, to_eval, outputs
)
# Try to build a (possibly multi-episode) batch if we have reached
# the rollout fragment length.
if self._multiple_episodes_in_batch:
sample_batch = self._try_build_truncated_episode_multi_agent_batch(
self._batch_builders[env_id], episode
)
if sample_batch:
outputs.append(sample_batch)
# SampleBatch built from data collected by batch_builder.
# Clean up and delete the batch_builder.
del self._batch_builders[env_id]
return to_eval, outputs
def _handle_done_episode(
self,
env_id: EnvID,
env_obs: MultiAgentDict,
is_done: bool,
hit_horizon: bool,
to_eval: Dict[PolicyID, List[_PolicyEvalData]],
outputs: List[SampleBatchType],
) -> None:
"""Handle an all-finished episode.
Add collected SampleBatch to batch builder. Reset corresponding env, etc.
Args:
env_id: Environment ID.
env_obs: Last per-environment observation.
is_done: If all agents are done.
hit_horizon: Whether the episode ended because it hit horizon.
to_eval: Output container for policy eval data.
outputs: Output container for collected sample batches.
"""
check_dones = is_done and not self._no_done_at_end
episode: EpisodeV2 = self._active_episodes[env_id]
batch_builder = self._batch_builders[env_id]
episode.postprocess_episode(
batch_builder=batch_builder,
is_done=is_done or (hit_horizon and not self._soft_horizon),
check_dones=check_dones,
)
# If we are not allowed to pack the next episode into the same
# SampleBatch (batch_mode=complete_episodes) -> Build the
# MultiAgentBatch from a single episode and add it to "outputs".
# Otherwise, just postprocess and continue collecting across
# episodes.
if not self._multiple_episodes_in_batch:
ma_sample_batch = _build_multi_agent_batch(
episode.episode_id,
batch_builder,
self._large_batch_threshold,
self._multiple_episodes_in_batch,
)
if ma_sample_batch:
outputs.append(ma_sample_batch)
# SampleBatch built from data collected by batch_builder.
# Clean up and delete the batch_builder.
del self._batch_builders[env_id]
# Call each (in-memory) policy's Exploration.on_episode_end
# method.
# Note: This may break the exploration (e.g. ParameterNoise) of
# policies in the `policy_map` that have not been recently used
# (and are therefore stashed to disk). However, we certainly do not
# want to loop through all (even stashed) policies here as that
# would counter the purpose of the LRU policy caching.
for p in self._worker.policy_map.cache.values():
if getattr(p, "exploration", None) is not None:
p.exploration.on_episode_end(
policy=p,
environment=self._base_env,
episode=episode,
tf_sess=p.get_session(),
)
# Call custom on_episode_end callback.
self._callbacks.on_episode_end(
worker=self._worker,
base_env=self._base_env,
policies=self._worker.policy_map,
episode=episode,
env_index=env_id,
)
# Clean up and delete the post-processed episode now that we have collected
# its data.
self.end_episode(env_id, episode)
# Horizon hit and we have a soft horizon (no hard env reset).
if hit_horizon and self._soft_horizon:
resetted_obs: Dict[EnvID, Dict[AgentID, EnvObsType]] = {env_id: env_obs}
# Do not reset connector state if this is a soft reset.
# Basically carry RNN and other buffered state to the
# next episode from the same env.
else:
# TODO(jungong) : This will allow a single faulty env to
# take out the entire RolloutWorker indefinitely. Revisit.
while True:
resetted_obs: Dict[
EnvID, Dict[AgentID, EnvObsType]
] = self._base_env.try_reset(env_id)
if resetted_obs is None or not isinstance(resetted_obs, Exception):
break
else:
# Report a faulty episode.
outputs.append(RolloutMetrics(episode_faulty=True))
# Reset connector state if this is a hard reset.
for p in self._worker.policy_map.cache.values():
p.agent_connectors.reset(env_id)
# Reset not supported, drop this env from the ready list.
if resetted_obs is None:
if self._horizon != float("inf"):
raise ValueError(
"Setting episode horizon requires reset() support "
"from the environment."
)
# Create a new episode if this is not an async reset return.
# If reset is async, we will get its result in some future poll.
elif resetted_obs != ASYNC_RESET_RETURN:
new_episode: EpisodeV2 = self._active_episodes[env_id]
per_policy_resetted_obs: Dict[PolicyID, List] = defaultdict(list)
# types: AgentID, EnvObsType
for agent_id, raw_obs in resetted_obs[env_id].items():
policy_id: PolicyID = new_episode.policy_for(agent_id)
per_policy_resetted_obs[policy_id].append((agent_id, raw_obs))
processed = []
for policy_id, agents_obs in per_policy_resetted_obs.items():
policy = self._worker.policy_map[policy_id]
acd_list: List[AgentConnectorDataType] = [
AgentConnectorDataType(
env_id,
agent_id,
{
SampleBatch.T: new_episode.length - 1,
SampleBatch.NEXT_OBS: obs,
},
)
for agent_id, obs in agents_obs
]
# Call agent connectors on these initial obs.
processed.extend(policy.agent_connectors(acd_list))
for d in processed:
# Add initial obs to buffer.
new_episode.add_init_obs(
d.agent_id,
d.data.for_training[SampleBatch.T],
d.data.for_training[SampleBatch.NEXT_OBS],
)
item = _PolicyEvalData(d.env_id, d.agent_id, d.data.for_action)
to_eval[policy_id].append(item)
# Step after adding the initial obs. This leaves the episode at 0 env and agent steps.
new_episode.step()
def end_episode(
self, env_id: EnvID, episode_or_exception: Union[EpisodeV2, Exception]
):
"""Clena up an episode that has finished.
Args:
env_id: Env ID.
episode_or_exception: Instance of an episode if it finished successfully.
Otherwise, the exception that was thrown.
"""
# Signal the end of an episode, either successfully with an Episode or
# unsuccessfully with an Exception.
self._callbacks.on_episode_end(
worker=self._worker,
base_env=self._base_env,
policies=self._worker.policy_map,
episode=episode_or_exception,
env_index=env_id,
)
if isinstance(episode_or_exception, EpisodeV2):
episode = episode_or_exception
if episode.total_agent_steps == 0:
# A zero agent-step count means that throughout the episode, all
# observations were empty (i.e. there was never an agent in the env).
msg = (
f"Data from episode {episode.episode_id} does not show any agent "
f"interactions. Hint: Make sure for at least one timestep in the "
f"episode, env.step() returns non-empty values."
)
raise ValueError(msg)
# Clean up the episode and batch_builder for this env id.
del self._active_episodes[env_id]
def _try_build_truncated_episode_multi_agent_batch(
self, batch_builder: _PolicyCollectorGroup, episode: EpisodeV2
) -> Union[None, SampleBatch, MultiAgentBatch]:
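"""Build a MultiAgentBatch if the rollout fragment length has been reached.
Counts steps either in env-steps or agent-steps (per `count_steps_by`),
postprocesses the still-ongoing episode if needed, and returns None when
there is not enough data yet (or no agent steps at all).
"""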
# Measure batch size in env-steps.
if self._count_steps_by == "env_steps":
built_steps = batch_builder.env_steps
ongoing_steps = episode.active_env_steps
# Measure batch-size in agent-steps.
else:
built_steps = batch_builder.agent_steps
ongoing_steps = episode.active_agent_steps
# Reached the fragment-len -> We should build an MA-Batch.
if built_steps + ongoing_steps >= self._rollout_fragment_length:
if self._count_steps_by != "agent_steps":
assert built_steps + ongoing_steps == self._rollout_fragment_length, (
f"built_steps ({built_steps}) + ongoing_steps ({ongoing_steps}) != "
f"rollout_fragment_length ({self._rollout_fragment_length})."
)
# If we reached the fragment-len only because of `episode_id`
# (still ongoing) -> postprocess `episode_id` first.
if built_steps < self._rollout_fragment_length:
episode.postprocess_episode(batch_builder=batch_builder, is_done=False)
# If builder has collected some data,
# build the MA-batch and add to return values.
if batch_builder.agent_steps > 0:
return _build_multi_agent_batch(
episode.episode_id,
batch_builder,
self._large_batch_threshold,
self._multiple_episodes_in_batch,
)
# No batch-builder:
# We have reached the rollout-fragment length w/o any agent
# steps! Warn that the environment may never request any
# actions from any agents.
elif log_once("no_agent_steps"):
logger.warning(
"Your environment seems to be stepping w/o ever "
"emitting agent observations (agents are never "
"requested to act)!"
)
return None
def _do_policy_eval(
self,
to_eval: Dict[PolicyID, List[_PolicyEvalData]],
) -> Dict[PolicyID, PolicyOutputType]:
"""Call compute_actions on collected episode data to get next action.
Args:
to_eval: Mapping of policy IDs to lists of _PolicyEvalData objects
(items in these lists will be the batch's items for the model
forward pass).
Returns:
Dict mapping PolicyIDs to compute_actions_from_input_dict() outputs.
"""
policies = self._worker.policy_map
# In case policy map has changed, try to find the new policy that
# should handle all these per-agent eval data.
# Throws exception if these agents are mapped to multiple different
# policies now.
def _try_find_policy_again(eval_data: List[_PolicyEvalData]) -> Policy:
policy_id = None
for d in eval_data:
episode = self._active_episodes[d.env_id]
# Force refresh policy mapping on the episode.
pid = episode.policy_for(d.agent_id, refresh=True)
if policy_id is not None and pid != policy_id:
raise ValueError(
"Policy map changed. The list of eval data that was handled "
f"by a same policy is now handled by policy {pid} "
"and {policy_id}. "
"Please don't do this in the middle of an episode."
)
policy_id = pid
return _get_or_raise(self._worker.policy_map, policy_id)
eval_results: Dict[PolicyID, PolicyOutputType] = {}
for policy_id, eval_data in to_eval.items():
# In case the policyID has been removed from this worker, we need to
# re-assign policy_id and re-lookup the Policy object to use.
try:
policy: Policy = _get_or_raise(policies, policy_id)
except ValueError:
# policy_mapping_fn from the worker may have already been
# changed (mapping fn not staying constant within one episode).
policy: Policy = _try_find_policy_again(eval_data)
input_dict = _batch_inference_sample_batches(
[d.sample_batch for d in eval_data]
)
eval_results[policy_id] = policy.compute_actions_from_input_dict(
input_dict,
timestep=policy.global_timestep,
episodes=[self._active_episodes[t.env_id] for t in eval_data],
)
return eval_results
def _process_policy_eval_results(
self,
to_eval: Dict[PolicyID, List[_PolicyEvalData]],
eval_results: Dict[PolicyID, PolicyOutputType],
off_policy_actions: MultiEnvDict,
):
"""Process the output of policy neural network evaluation.
Records policy evaluation results into agent connectors and
returns replies to send back to agents in the env.
Args:
to_eval: Mapping of policy IDs to lists of _PolicyEvalData objects.
eval_results: Mapping of policy IDs to list of
actions, rnn-out states, extra-action-fetches dicts.
off_policy_actions: Doubly keyed dict of env-ids -> agent ids ->
off-policy-action, returned by a `BaseEnv.poll()` call.
Returns:
Nested dict of env id -> agent id -> actions to be sent to
Env (np.ndarrays).
"""
actions_to_send: Dict[EnvID, Dict[AgentID, EnvActionType]] = defaultdict(dict)
for eval_data in to_eval.values():
for d in eval_data:
actions_to_send[d.env_id] = {} # at minimum send empty dict
# types: PolicyID, List[_PolicyEvalData]
for policy_id, eval_data in to_eval.items():
actions: TensorStructType = eval_results[policy_id][0]
actions = convert_to_numpy(actions)
rnn_out: StateBatches = eval_results[policy_id][1]
extra_action_out: dict = eval_results[policy_id][2]
# In case actions is a list (representing the 0th dim of a batch of
# primitive actions), try converting it first.
if isinstance(actions, list):
actions = np.array(actions)
# Split action-component batches into single action rows.
actions: List[EnvActionType] = unbatch(actions)
policy: Policy = _get_or_raise(self._worker.policy_map, policy_id)
assert (
policy.agent_connectors and policy.action_connectors
), "EnvRunnerV2 requires action connectors to work."
# types: int, EnvActionType
for i, action in enumerate(actions):
env_id: int = eval_data[i].env_id
agent_id: AgentID = eval_data[i].agent_id
rnn_states: List[StateBatches] = [c[i] for c in rnn_out]
fetches: Dict = {k: v[i] for k, v in extra_action_out.items()}
# Post-process policy output by running them through action connectors.
ac_data = ActionConnectorDataType(
env_id, agent_id, (action, rnn_states, fetches)
)
action_to_send, rnn_states, fetches = policy.action_connectors(
ac_data
).output
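# If the env already executed an off-policy action for this agent, buffer
# that action (it is what actually happened in the env) instead of the one
# just computed; the computed action is still sent back to the env, which
# is free to ignore it.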
action_to_buffer = (
action_to_send
if env_id not in off_policy_actions
or agent_id not in off_policy_actions[env_id]
else off_policy_actions[env_id][agent_id]
)
# Notify agent connectors with this new policy output.
# Necessary for state buffering agent connectors, for example.
ac_data: AgentConnectorDataType = ActionConnectorDataType(
env_id, agent_id, (action_to_buffer, rnn_states, fetches)
)
policy.agent_connectors.on_policy_output(ac_data)
assert agent_id not in actions_to_send[env_id]
actions_to_send[env_id][agent_id] = action_to_send
return actions_to_send
def _maybe_render(self):
"""Visualize environment."""
# Check if we should render.
if not self._render or not self._simple_image_viewer:
return
t5 = time.time()
# Render can either return an RGB image (uint8 [w x h x 3] numpy
# array) or take care of rendering itself (returning True).
rendered = self._base_env.try_render()
# Rendering returned an image -> Display it in a SimpleImageViewer.
if isinstance(rendered, np.ndarray) and len(rendered.shape) == 3:
self._simple_image_viewer.imshow(rendered)
elif rendered not in [True, False, None]:
raise ValueError(
f"The env's ({self._base_env}) `try_render()` method returned an"
" unsupported value! Make sure you either return a "
"uint8/w x h x 3 (RGB) image or handle rendering in a "
"window and then return `True`."
)
self._perf_stats.incr("env_render_time", time.time() - t5)
def _fetch_atari_metrics(base_env: BaseEnv) -> Optional[List[RolloutMetrics]]:
"""Atari games have multiple logical episodes, one per life.
However, for metrics reporting we count full episodes, all lives included.
"""
sub_environments = base_env.get_sub_environments()
if not sub_environments:
return None
atari_out = []
for sub_env in sub_environments:
monitor = get_wrapper_by_cls(sub_env, MonitorEnv)
if not monitor:
return None
for eps_rew, eps_len in monitor.next_episode_results():
atari_out.append(RolloutMetrics(eps_len, eps_rew))
return atari_out
def _get_or_raise(
mapping: Dict[PolicyID, Union[Policy, Preprocessor, Filter]], policy_id: PolicyID
) -> Union[Policy, Preprocessor, Filter]:
"""Returns an object under key `policy_id` in `mapping`.
Args:
mapping: The mapping dict from policy id (str) to the actual object
(Policy, Preprocessor, etc.).
policy_id: The policy ID to lookup.
Returns:
The found object.
Raises:
ValueError: If `policy_id` cannot be found in `mapping`.
"""
if policy_id not in mapping:
raise ValueError(
"Could not find policy for agent: PolicyID `{}` not found "
"in policy map, whose keys are `{}`.".format(policy_id, mapping.keys())
)
return mapping[policy_id]