"""Example of using LinUCB on a RecSim environment. """

from matplotlib import pyplot as plt
import pandas as pd
import time

from ray import tune
import ray.rllib.examples.env.recsim_recommender_system_envs  # noqa


if __name__ == "__main__":
    # Script entry point: train a LinUCB bandit on a RecSim environment via
    # Tune, then plot the mean episode reward against the agent timesteps.
    ray.init()

    config = {
        # "RecSim-v1" is a pre-registered RecSim env.
        # Alternatively, you can do:
        # `from ray.rllib.examples.env.recsim_recommender_system_envs import ...`
        # - LongTermSatisfactionRecSimEnv
        # - InterestExplorationRecSimEnv
        # - InterestEvolutionRecSimEnv
        # Then: "env": [the imported RecSim class]
        "env": "RecSim-v1",
        "env_config": {
            "convert_to_discrete_action_space": True,
            "wrap_for_bandits": True,
        },
    }

    # Number of Tune training iterations to run before stopping. The actual
    # number of environment time steps will be
    # 10 * timesteps_per_iteration (100 by default) = 1,000.
    training_iterations = 10

    print("Running training for %s time steps" % training_iterations)

    start_time = time.time()
    analysis = tune.run(
        "BanditLinUCB",
        config=config,
        stop={"training_iteration": training_iterations},
        num_samples=1,
        checkpoint_at_end=False,
    )

    print("The trials took", time.time() - start_time, "seconds\n")

    # Stack the per-trial result tables into a single frame.
    # `DataFrame.append` was removed in pandas 2.0, so concatenate all trial
    # frames in one `pd.concat` call instead of appending them one at a time.
    frame = pd.concat(
        list(analysis.trial_dataframes.values()), ignore_index=True
    )

    # Aggregate the mean episode reward across trials for each timestep count.
    x = frame.groupby("agent_timesteps_total")["episode_reward_mean"].aggregate(
        ["mean", "max", "min", "std"]
    )

    # Plot the mean reward with a +/- one standard deviation band around it.
    plt.plot(x["mean"])
    plt.fill_between(
        x.index, x["mean"] - x["std"], x["mean"] + x["std"], color="b", alpha=0.2
    )
    plt.title("Episode reward mean")
    plt.xlabel("Training steps")
    plt.show()