"""Using an RLlib-ready RecSim environment and the SlateQ algorithm for solving recommendation system problems. This example supports three different RecSim (RLlib-ready) environments, configured via the --env option: - "long-term-satisfaction" - "interest-exploration" - "interest-evolution" """ import argparse import numpy as np from scipy.stats import sem import ray from ray import tune from ray.rllib.algorithms import slateq from ray.rllib.algorithms import dqn from ray.rllib.examples.env.recommender_system_envs_with_recsim import ( InterestEvolutionRecSimEnv, InterestExplorationRecSimEnv, LongTermSatisfactionRecSimEnv, ) from ray.rllib.utils.test_utils import check_learning_achieved from ray.tune.logger import pretty_print parser = argparse.ArgumentParser() parser.add_argument( "--run", type=str, default="SlateQ", choices=["SlateQ", "DQN"], help=("Select agent policy. Choose from: DQN and SlateQ. Default value: SlateQ."), ) parser.add_argument( "--framework", choices=["tf", "tf2", "tfe", "torch"], default="tf", help="The DL framework specifier.", ) parser.add_argument( "--env", type=str, default="interest-evolution", choices=["interest-evolution", "interest-exploration", "long-term-satisfaction"], help=("Select the RecSim env to use."), ) parser.add_argument("--learning-starts", type=int, default=20000) parser.add_argument( "--random-test-episodes", type=int, default=0, help="The number of test episodes to run with a random agent to figure out " "up front what the random baseline reward is.", ) parser.add_argument( "--use-tune", action="store_true", help=( "Run with Tune so that the results are logged into Tensorboard. " "For debugging, it's easier to run without Ray Tune." ), ) parser.add_argument("--tune-num-samples", type=int, default=1) parser.add_argument( "--env-num-candidates", type=int, default=100, help="The number of candidates that the agent has to pick " "`--env-slate-size` from each timestep. These candidates will be " "sampled by the environment's built-in document sampler model.", ) parser.add_argument( "--env-slate-size", type=int, default=2, help="The size of the slate to recommend (from out of " "`--env-num-candidates` sampled docs) each timestep.", ) parser.add_argument( "--env-dont-resample-documents", action="store_true", help="Whether to NOT resample `--env-num-candidates` docs " "each timestep. 
If set, the env will only sample `--env-num-candidates`" " once at the beginning and the agent always has to pick " "`--env-slate-size` docs from this sample.", ) parser.add_argument("--env-seed", type=int, default=0) parser.add_argument("--num-cpus", type=int, default=0) parser.add_argument("--num-gpus", type=float, default=0) parser.add_argument("--num-workers", type=int, default=0) parser.add_argument( "--local-mode", action="store_true", help="Init Ray in local mode for easier debugging.", ) parser.add_argument("--as-test", action="store_true") parser.add_argument("--stop-iters", type=int, default=200) parser.add_argument("--stop-reward", type=float, default=160.0) parser.add_argument("--stop-timesteps", type=int, default=150000) def main(): args = parser.parse_args() ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode) env_config = { "num_candidates": args.env_num_candidates, "resample_documents": not args.env_dont_resample_documents, "slate_size": args.env_slate_size, "seed": args.env_seed, "convert_to_discrete_action_space": args.run == "DQN", } config = { "env": ( InterestEvolutionRecSimEnv if args.env == "interest-evolution" else InterestExplorationRecSimEnv if args.env == "interest-exploration" else LongTermSatisfactionRecSimEnv ), "framework": args.framework, "num_gpus": args.num_gpus, "num_workers": args.num_workers, "env_config": env_config, "replay_buffer_config": { "learning_starts": args.learning_starts, }, } # Perform a test run on the env with a random agent to see, what # the random baseline reward is. if args.random_test_episodes: print( f"Running {args.random_test_episodes} episodes to get a random " "agent's baseline reward ..." ) env = config["env"](config=env_config) env.reset() num_episodes = 0 episode_rewards = [] episode_reward = 0.0 while num_episodes < args.random_test_episodes: action = env.action_space.sample() _, r, d, _ = env.step(action) episode_reward += r if d: num_episodes += 1 episode_rewards.append(episode_reward) episode_reward = 0.0 env.reset() print( f"Ran {args.random_test_episodes} episodes with a random agent " "reaching a mean episode return of " f"{np.mean(episode_rewards)}+/-{sem(episode_rewards)}." ) if args.use_tune: stop = { "training_iteration": args.stop_iters, "timesteps_total": args.stop_timesteps, "episode_reward_mean": args.stop_reward, } results = tune.run( args.run, stop=stop, config=config, num_samples=args.tune_num_samples, verbose=2, ) if args.as_test: check_learning_achieved(results, args.stop_reward) else: # Directly run using the trainer interface (good for debugging). if args.run == "DQN": trainer = dqn.DQN(config=config) else: trainer = slateq.SlateQ(config=config) for i in range(10): result = trainer.train() print(pretty_print(result)) ray.shutdown() if __name__ == "__main__": main()
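# Example invocations (a sketch: the filename `recsim_slateq_example.py` is
# only an assumption about where this script is saved and should be adjusted
# accordingly; all flags used are defined by the argparse setup above).
#
#   # Train SlateQ (the default algorithm) on the interest-evolution env
#   # (the default env) via Tune:
#   python recsim_slateq_example.py --use-tune
#
#   # Run DQN on the long-term-satisfaction env (the action space gets
#   # converted to a discrete one for DQN), first measuring a random agent's
#   # baseline return over 100 episodes:
#   python recsim_slateq_example.py --run DQN --env long-term-satisfaction \
#       --random-test-episodes 100 --use-tune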