# ray/rllib/examples/bandit/tune_lin_ts_train_wheel_env.py
""" Example of using Linear Thompson Sampling on WheelBandit environment.
For more information on WheelBandit, see https://arxiv.org/abs/1802.09127 .
"""
import argparse
import time

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

import ray
from ray import tune
from ray.rllib.algorithms.bandit.bandit import BanditLinTSTrainer
from ray.rllib.examples.env.bandit_envs_discrete import WheelBanditEnv


def plot_model_weights(means, covs, ax):
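    """Scatter samples from each arm's weight posterior onto `ax`.

    For each arm, 5,000 points are drawn from a 2-d Gaussian parameterized by
    the arm's (down-scaled) posterior mean and covariance, so the point clouds
    visualize how concentrated each arm's learned weights are.
    """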
    fmts = ["bo", "ro", "yx", "k+", "gx"]
    labels = ["arm{}".format(i) for i in range(5)]
    ax.set_title("Weights distributions of arms")
    for i in range(0, 5):
        x, y = np.random.multivariate_normal(means[i] / 30, covs[i], 5000).T
        ax.plot(x, y, fmts[i], label=labels[i])
    ax.set_aspect("equal")
    ax.grid(True, which="both")
    ax.axhline(y=0, color="k")
    ax.axvline(x=0, color="k")
    ax.legend(loc="best")
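

# A conceptual sketch (hypothetical helper, not RLlib's internal implementation)
# of the Linear Thompson Sampling decision rule this example trains: draw one
# weight vector per arm from its posterior N(theta_i, cov_i), score each arm by
# the dot product with the context, and play the highest-scoring arm.
def lin_ts_select_arm_sketch(context, means, covs, rng=np.random):
    sampled_scores = [
        context @ rng.multivariate_normal(means[i], covs[i])
        for i in range(len(means))
    ]
    return int(np.argmax(sampled_scores))

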
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--framework",
        choices=["tf2", "torch"],
        default="torch",
        help="The DL framework specifier.",
    )
    args = parser.parse_args()
    print(f"Running with following CLI args: {args}")

    ray.init(num_cpus=2)

    config = {
        "env": WheelBanditEnv,
"framework": args.framework,
"eager_tracing": (args.framework == "tf2"),
}
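
    # "BanditLinTS" passed to `tune.run()` below is the registered algorithm
    # name that resolves to the BanditLinTSTrainer imported above; the config
    # dict is forwarded to it unchanged.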

    # Actual env steps per `train()` call will be
    # 10 * `min_sample_timesteps_per_reporting` (100 by default) = 1,000
    training_iterations = 10

    print("Running training for %s training iterations." % training_iterations)

    start_time = time.time()
    analysis = tune.run(
        "BanditLinTS",
        config=config,
        stop={"training_iteration": training_iterations},
        num_samples=1,
        checkpoint_at_end=True,
    )

    print("The trials took", time.time() - start_time, "seconds\n")

    # Aggregate the per-trial progress dataframes and plot the mean episode
    # reward over time.
    frame = pd.DataFrame()
    for key, df in analysis.trial_dataframes.items():
        frame = pd.concat([frame, df], ignore_index=True)
    x = frame.groupby("agent_timesteps_total")["episode_reward_mean"].aggregate(
        ["mean", "max", "min", "std"]
    )

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))
    ax1.plot(x["mean"])
    ax1.set_title("Episode reward mean")
    ax1.set_xlabel("Training steps")

    # Restore trainer from checkpoint
    trial = analysis.trials[0]
    trainer = BanditLinTSTrainer(config=config)
    trainer.restore(trial.checkpoint.value)

    # Get model to plot arm weights distribution
    model = trainer.get_policy().model
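    # Each arm of the LinTS model keeps a Bayesian linear-regression posterior
    # over its reward weights: `theta` is the posterior mean and `covariance`
    # the posterior covariance (2-dimensional here, matching the wheel context).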
    means = [model.arms[i].theta.numpy() for i in range(5)]
    covs = [model.arms[i].covariance.numpy() for i in range(5)]

    # Plot weight distributions for different arms
    plot_model_weights(means, covs, ax2)
    fig.tight_layout()
    plt.show()