""" Decentralized Distributed PPO (DD-PPO) ====================================== Unlike APPO or PPO, learning is no longer done centralized in the trainer process. Instead, gradients are computed remotely on each rollout worker and all-reduced to sync them at each mini-batch. This allows each worker's GPU to be used both for sampling and for training. DD-PPO should be used if you have envs that require GPUs to function, or have a very large model that cannot be effectively optimized with the GPUs available on a single machine (DD-PPO allows scaling to arbitrary numbers of GPUs across multiple nodes, unlike PPO/APPO which is limited to GPUs on a single node). Paper reference: https://arxiv.org/abs/1911.00357 Note that unlike the paper, we currently do not implement straggler mitigation. """ import logging import sys import time import ray from ray.rllib.agents.ppo.ppo import DEFAULT_CONFIG as PPO_DEFAULT_CONFIG, PPOTrainer from ray.rllib.agents.trainer import Trainer from ray.rllib.evaluation.worker_set import WorkerSet from ray.rllib.execution.rollout_ops import ParallelRollouts from ray.rllib.execution.metric_ops import StandardMetricsReporting from ray.rllib.execution.common import ( STEPS_SAMPLED_COUNTER, STEPS_TRAINED_COUNTER, STEPS_TRAINED_THIS_ITER_COUNTER, LEARN_ON_BATCH_TIMER, _get_shared_metrics, _get_global_vars, ) from ray.rllib.evaluation.rollout_worker import get_global_worker from ray.rllib.utils.annotations import override from ray.rllib.utils.metrics.learner_info import LEARNER_INFO from ray.rllib.utils.sgd import do_minibatch_sgd from ray.rllib.utils.typing import TrainerConfigDict from ray.util.iter import LocalIterator logger = logging.getLogger(__name__) # fmt: off # __sphinx_doc_begin__ # Adds the following updates to the `PPOTrainer` config in # rllib/agents/ppo/ppo.py. DEFAULT_CONFIG = Trainer.merge_trainer_configs( PPO_DEFAULT_CONFIG, { # During the sampling phase, each rollout worker will collect a batch # `rollout_fragment_length * num_envs_per_worker` steps in size. "rollout_fragment_length": 100, # Vectorize the env (should enable by default since each worker has # a GPU). "num_envs_per_worker": 5, # During the SGD phase, workers iterate over minibatches of this size. # The effective minibatch size will be: # `sgd_minibatch_size * num_workers`. "sgd_minibatch_size": 50, # Number of SGD epochs per optimization round. "num_sgd_iter": 10, # Download weights between each training step. This adds a bit of # overhead but allows the user to access the weights from the trainer. "keep_local_weights_in_sync": True, # *** WARNING: configs below are DDPPO overrides over PPO; you # shouldn't need to adjust them. *** # DDPPO requires PyTorch distributed. "framework": "torch", # The communication backend for PyTorch distributed. "torch_distributed_backend": "gloo", # Learning is no longer done on the driver process, so # giving GPUs to the driver does not make sense! "num_gpus": 0, # Each rollout worker gets a GPU. "num_gpus_per_worker": 1, # Require evenly sized batches. Otherwise, # collective allreduce could fail. "truncate_episodes": True, # This is auto set based on sample batch size. 
"train_batch_size": -1, # Kl divergence penalty should be fixed to 0 in DDPPO because in order # for it to be used as a penalty, we would have to un-decentralize # DDPPO "kl_coeff": 0.0, "kl_target": 0.0 }, _allow_unknown_configs=True, ) # __sphinx_doc_end__ # fmt: on class DDPPOTrainer(PPOTrainer): @classmethod @override(PPOTrainer) def get_default_config(cls) -> TrainerConfigDict: return DEFAULT_CONFIG @override(PPOTrainer) def validate_config(self, config): """Validates the Trainer's config dict. Args: config (TrainerConfigDict): The Trainer's config to check. Raises: ValueError: In case something is wrong with the config. """ # Call (base) PPO's config validation function first. # Note that this will not touch or check on the train_batch_size=-1 # setting. super().validate_config(config) # Error if run on Win. if sys.platform in ["win32", "cygwin"]: raise ValueError( "DD-PPO not supported on Win yet! " "Due to usage of torch.distributed." ) # Auto-train_batch_size: Calculate from rollout len and # envs-per-worker. if config["train_batch_size"] == -1: config["train_batch_size"] = ( config["rollout_fragment_length"] * config["num_envs_per_worker"] ) # Users should not define `train_batch_size` directly (always -1). else: raise ValueError( "Set rollout_fragment_length instead of train_batch_size " "for DDPPO." ) # Only supported for PyTorch so far. if config["framework"] != "torch": raise ValueError("Distributed data parallel is only supported for PyTorch") if config["torch_distributed_backend"] not in ("gloo", "mpi", "nccl"): raise ValueError( "Only gloo, mpi, or nccl is supported for " "the backend of PyTorch distributed." ) # `num_gpus` must be 0/None, since all optimization happens on Workers. if config["num_gpus"]: raise ValueError( "When using distributed data parallel, you should set " "num_gpus=0 since all optimization " "is happening on workers. Enable GPUs for workers by setting " "num_gpus_per_worker=1." ) # `batch_mode` must be "truncate_episodes". if config["batch_mode"] != "truncate_episodes": raise ValueError( "Distributed data parallel requires truncate_episodes " "batch mode." ) # DDPPO doesn't support KL penalties like PPO-1. # In order to support KL penalties, DDPPO would need to become # undecentralized, which defeats the purpose of the algorithm. # Users can still tune the entropy coefficient to control the # policy entropy (similar to controlling the KL penalty). if config["kl_coeff"] != 0.0 or config["kl_target"] != 0.0: raise ValueError("DDPPO doesn't support KL penalties like PPO-1") @staticmethod @override(PPOTrainer) def execution_plan( workers: WorkerSet, config: TrainerConfigDict, **kwargs ) -> LocalIterator[dict]: """Execution plan of the DD-PPO algorithm. Defines the distributed dataflow. Args: workers (WorkerSet): The WorkerSet for training the Polic(y/ies) of the Trainer. config (TrainerConfigDict): The trainer's configuration dict. Returns: LocalIterator[dict]: The Policy class to use with PGTrainer. If None, use `default_policy` provided in build_trainer(). """ assert ( len(kwargs) == 0 ), "DDPPO execution_plan does NOT take any additional parameters" rollouts = ParallelRollouts(workers, mode="raw") # Setup the distributed processes. 
        if not workers.remote_workers():
            raise ValueError("This optimizer requires >0 remote workers.")
        ip = ray.get(workers.remote_workers()[0].get_node_ip.remote())
        port = ray.get(workers.remote_workers()[0].find_free_port.remote())
        address = "tcp://{ip}:{port}".format(ip=ip, port=port)
        logger.info("Creating torch process group with leader {}".format(address))

        # Get setup tasks in order to throw errors on failure.
        ray.get(
            [
                worker.setup_torch_data_parallel.remote(
                    url=address,
                    world_rank=i,
                    world_size=len(workers.remote_workers()),
                    backend=config["torch_distributed_backend"],
                )
                for i, worker in enumerate(workers.remote_workers())
            ]
        )
        logger.info("Torch process group init completed")

        # This function is applied remotely on each rollout worker.
        def train_torch_distributed_allreduce(batch):
            expected_batch_size = (
                config["rollout_fragment_length"] * config["num_envs_per_worker"]
            )
            this_worker = get_global_worker()
            assert batch.count == expected_batch_size, (
                "Batch size possibly out of sync between workers, expected:",
                expected_batch_size,
                "got:",
                batch.count,
            )
            logger.info(
                "Executing distributed minibatch SGD "
                "with epoch size {}, minibatch size {}".format(
                    batch.count, config["sgd_minibatch_size"]
                )
            )
            info = do_minibatch_sgd(
                batch,
                this_worker.policy_map,
                this_worker,
                config["num_sgd_iter"],
                config["sgd_minibatch_size"],
                ["advantages"],
            )
            return info, batch.count

        # Broadcast the local set of global vars.
        def update_worker_global_vars(item):
            global_vars = _get_global_vars()
            for w in workers.remote_workers():
                w.set_global_vars.remote(global_vars)
            return item

        # Have to manually record stats since we are using "raw" rollouts mode.
        class RecordStats:
            def _on_fetch_start(self):
                self.fetch_start_time = time.perf_counter()

            def __call__(self, items):
                for item in items:
                    info, count = item
                    metrics = _get_shared_metrics()
                    metrics.counters[STEPS_TRAINED_THIS_ITER_COUNTER] = count
                    metrics.counters[STEPS_SAMPLED_COUNTER] += count
                    metrics.counters[STEPS_TRAINED_COUNTER] += count
                    metrics.info[LEARNER_INFO] = info

                # Since SGD happens remotely, the time delay between fetch and
                # completion is approximately the SGD step time.
                metrics.timers[LEARN_ON_BATCH_TIMER].push(
                    time.perf_counter() - self.fetch_start_time
                )

        train_op = (
            rollouts.for_each(train_torch_distributed_allreduce)  # allreduce
            .batch_across_shards()  # List[(grad_info, count)]
            .for_each(RecordStats())
        )

        train_op = train_op.for_each(update_worker_global_vars)

        # Sync down the weights. As with the sync up, this is not really
        # needed unless the user is reading the local weights.
        if config["keep_local_weights_in_sync"]:

            def download_weights(item):
                workers.local_worker().set_weights(
                    ray.get(workers.remote_workers()[0].get_weights.remote())
                )
                return item

            train_op = train_op.for_each(download_weights)

        # In debug mode, check the allreduce successfully synced the weights.
        if logger.isEnabledFor(logging.DEBUG):

            def check_sync(item):
                weights = ray.get(
                    [w.get_weights.remote() for w in workers.remote_workers()]
                )
                sums = []
                for w in weights:
                    acc = 0
                    for p in w.values():
                        for k, v in p.items():
                            acc += v.sum()
                    sums.append(float(acc))
                logger.debug("The worker weight sums are {}".format(sums))
                assert len(set(sums)) == 1, sums

            train_op = train_op.for_each(check_sync)

        return StandardMetricsReporting(train_op, workers, config)
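

if __name__ == "__main__":
    # Minimal usage sketch (added for illustration; not part of the original
    # module). It assumes PyTorch and gym are installed and runs DD-PPO on
    # CartPole-v0 with CPU-only workers, which the default "gloo" backend
    # supports. The config overrides below are example values, not
    # requirements.
    ray.init()
    trainer = DDPPOTrainer(
        env="CartPole-v0",
        config={
            # Two rollout workers -> a torch.distributed world of size 2.
            "num_workers": 2,
            # Override the GPU-per-worker default so this runs on CPU only.
            "num_gpus_per_worker": 0,
        },
    )
    for _ in range(3):
        result = trainer.train()
        print("episode_reward_mean:", result["episode_reward_mean"])
    trainer.stop()
    ray.shutdown()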