"""
Asynchronous Proximal Policy Optimization (APPO)
================================================

This file defines the distributed Trainer class for the asynchronous version
of proximal policy optimization (APPO).
See `appo_[tf|torch]_policy.py` for the definition of the policy loss.

Detailed documentation:
https://docs.ray.io/en/master/rllib-algorithms.html#appo
"""
from typing import Optional, Type

from ray.rllib.agents.trainer import Trainer
from ray.rllib.agents.ppo.appo_tf_policy import AsyncPPOTFPolicy
from ray.rllib.agents.ppo.ppo import UpdateKL
from ray.rllib.agents import impala
from ray.rllib.policy.policy import Policy
from ray.rllib.execution.common import (
    STEPS_SAMPLED_COUNTER,
    LAST_TARGET_UPDATE_TS,
    NUM_TARGET_UPDATES,
    _get_shared_metrics,
)
from ray.rllib.utils.annotations import override
from ray.rllib.utils.typing import PartialTrainerConfigDict, TrainerConfigDict

# yapf: disable
# __sphinx_doc_begin__

# Overrides/extends the `ImpalaTrainer` default config (defined in
# rllib/agents/impala/impala.py) with the APPO-specific values below.
# Default APPO trainer config: the IMPALA defaults merged with the
# APPO-specific entries of the dict literal below.
DEFAULT_CONFIG = impala.ImpalaTrainer.merge_trainer_configs(
    impala.DEFAULT_CONFIG,  # See keys in impala.py, which are also supported.
    {
        # Whether to use V-trace weighted advantages. If false, PPO GAE
        # advantages will be used instead.
        "vtrace": True,

        # == These two options only apply if vtrace: False ==
        # Should use a critic as a baseline (otherwise don't use value
        # baseline; required for using GAE).
        "use_critic": True,
        # If true, use the Generalized Advantage Estimator (GAE)
        # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
        "use_gae": True,
        # GAE(lambda) parameter
        "lambda": 1.0,

        # == PPO surrogate loss options ==
        "clip_param": 0.4,

        # == PPO KL Loss options ==
        # When `use_kl_loss` is true, `UpdateTargetAndKL` (below in this file)
        # also updates the KL coefficient each time it refreshes the target
        # network.
        "use_kl_loss": False,
        "kl_coeff": 1.0,
        "kl_target": 0.01,

        # == IMPALA optimizer params (see documentation in impala.py) ==
        "rollout_fragment_length": 50,
        "train_batch_size": 500,
        "min_time_s_per_reporting": 10,
        "num_workers": 2,
        "num_gpus": 0,
        "num_multi_gpu_tower_stacks": 1,
        # NOTE: `num_sgd_iter * minibatch_buffer_size` is additionally used by
        # `UpdateTargetAndKL` as the number of newly sampled steps that must
        # be exceeded before the target network is refreshed.
        "minibatch_buffer_size": 1,
        "num_sgd_iter": 1,
        "replay_proportion": 0.0,
        "replay_buffer_num_slots": 100,
        "learner_queue_size": 16,
        "learner_queue_timeout": 300,
        "max_sample_requests_in_flight_per_worker": 2,
        "broadcast_interval": 1,
        "grad_clip": 40.0,
        "opt_type": "adam",
        "lr": 0.0005,
        "lr_schedule": None,
        "decay": 0.99,
        "momentum": 0.0,
        "epsilon": 0.1,
        "vf_loss_coeff": 0.5,
        "entropy_coeff": 0.01,
        "entropy_coeff_schedule": None,
    },
    # NOTE(review): presumably lets the dict above introduce keys that are not
    # present in `impala.DEFAULT_CONFIG` -- confirm against
    # `merge_trainer_configs`.
    _allow_unknown_configs=True,
)

# __sphinx_doc_end__
# yapf: enable


class UpdateTargetAndKL:
    """Callable hook that periodically refreshes the target network.

    An instance is invoked with the learner's `fetches`. Once the number of
    sampled steps since the last refresh exceeds
    `num_sgd_iter * minibatch_buffer_size`, it calls `update_target()` on
    every policy-to-train of the local worker and, if `use_kl_loss` is set in
    the config, also runs the KL-coefficient update.
    """

    def __init__(self, workers, config):
        """Store the worker set / config and derive the refresh threshold.

        Args:
            workers: Worker set whose local worker holds the policies whose
                target networks get refreshed.
            config: Trainer config; must contain `num_sgd_iter`,
                `minibatch_buffer_size` and (at call time) `use_kl_loss`.
        """
        self.workers = workers
        self.config = config
        self.update_kl = UpdateKL(workers)
        sgd_iters = config["num_sgd_iter"]
        buffer_size = config["minibatch_buffer_size"]
        # Sampled-step gap that must be exceeded before the next refresh.
        self.target_update_freq = sgd_iters * buffer_size

    def __call__(self, fetches):
        """Refresh the target network (and KL coeff) if enough steps passed.

        Args:
            fetches: Learner results; only forwarded to the KL update.
        """
        counters = _get_shared_metrics().counters
        steps_sampled = counters[STEPS_SAMPLED_COUNTER]
        steps_since_refresh = steps_sampled - counters[LAST_TARGET_UPDATE_TS]

        target_is_stale = steps_since_refresh > self.target_update_freq
        if not target_is_stale:
            return

        # Book-keeping first, then the actual refresh.
        counters[NUM_TARGET_UPDATES] += 1
        counters[LAST_TARGET_UPDATE_TS] = steps_sampled

        local_worker = self.workers.local_worker()
        local_worker.foreach_policy_to_train(
            lambda policy, _unused: policy.update_target()
        )

        # The KL coefficient is only adjusted together with a target refresh.
        if self.config["use_kl_loss"]:
            self.update_kl(fetches)


class APPOTrainer(impala.ImpalaTrainer):
    """Trainer for asynchronous PPO (APPO), built on top of `ImpalaTrainer`.

    Compared to its IMPALA base class, this trainer installs
    `UpdateTargetAndKL` as the `after_train_step` hook and initializes the
    target network of every policy-to-train right after construction.
    """

    def __init__(self, config, *args, **kwargs):
        """Build the trainer and initialize the target network(s).

        Args:
            config: Partial trainer config (or None). It is NOT modified; a
                shallow copy carrying the APPO hook is handed to the base
                class instead.
            *args: Forwarded unchanged to the base class constructor.
            **kwargs: Forwarded unchanged to the base class constructor.
        """
        # Work on a shallow copy so the caller's dict is not polluted with the
        # APPO-specific hook (the same dict may be reused for other trainers),
        # and tolerate `config=None` instead of failing on item assignment.
        config = dict(config) if config is not None else {}

        # Before init: Add the update target and kl hook.
        # This hook is called explicitly after each learner step in the
        # execution setup for IMPALA.
        config["after_train_step"] = UpdateTargetAndKL

        super().__init__(config, *args, **kwargs)

        # After init: Initialize target net.
        self.workers.local_worker().foreach_policy_to_train(
            lambda p, _: p.update_target()
        )

    @classmethod
    @override(Trainer)
    def get_default_config(cls) -> TrainerConfigDict:
        """Return the module-level APPO `DEFAULT_CONFIG`."""
        return DEFAULT_CONFIG

    @override(Trainer)
    def get_default_policy_class(
        self, config: PartialTrainerConfigDict
    ) -> Optional[Type[Policy]]:
        """Pick the APPO policy class matching `config["framework"]`.

        Args:
            config: Trainer config; must contain the `framework` key.

        Returns:
            The torch policy for "torch", the TF policy for "tf", its eager
            variant for "tf2"/"tfe", and None for any other framework value.
        """
        framework = config["framework"]
        if framework == "torch":
            # Imported lazily so torch is only required when actually used.
            from ray.rllib.agents.ppo.appo_torch_policy import AsyncPPOTorchPolicy

            return AsyncPPOTorchPolicy
        if framework == "tf":
            return AsyncPPOTFPolicy
        if framework in ["tf2", "tfe"]:
            return AsyncPPOTFPolicy.as_eager()
        # Unrecognized framework: no default policy class.
        return None