ray/rllib/examples/bandit/tune_lin_ucb_train_recommendation.py

""" Example of using LinUCB on a recommendation environment with parametric
    actions. """

import argparse
from matplotlib import pyplot as plt
import os
import pandas as pd
import time

import ray
from ray import tune
from ray.tune import register_env
from ray.rllib.env.wrappers.recsim import (
    MultiDiscreteToDiscreteActionWrapper,
    RecSimObservationBanditWrapper,
)
from ray.rllib.examples.env.bandit_envs_recommender_system import (
    ParametricRecSys,
)

# ParametricRecSys exposes a RecSim-style API, so it has to be wrapped before
# our Bandits agent can work with it.
def _make_parametric_recsys_env(cfg):
    """Build a ParametricRecSys from `cfg`, wrapped for the Bandits agent."""
    recsys = ParametricRecSys(**cfg)
    bandit_obs_env = RecSimObservationBanditWrapper(recsys)
    return MultiDiscreteToDiscreteActionWrapper(bandit_obs_env)


register_env("ParametricRecSysEnv", _make_parametric_recsys_env)

if __name__ == "__main__":
    # Script entry point: parse the CLI, run two LinUCB trials through Tune on
    # the registered "ParametricRecSysEnv", then plot the mean episode reward
    # (with a +/- one standard deviation band) across the trials.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--framework",
        choices=["tf2", "torch"],
        default="torch",
        help="The DL framework specifier.",
    )
    args = parser.parse_args()
    print(f"Running with following CLI args: {args}")

    # Temp fix to avoid OMP conflict.
    os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

    ray.init()

    config = {
        "framework": args.framework,
        # Eager tracing is only switched on when running with tf2.
        "eager_tracing": (args.framework == "tf2"),
        "env": "ParametricRecSysEnv",
        # Passed as keyword arguments to ParametricRecSys by the env factory
        # registered above.
        "env_config": {
            "embedding_size": 20,
            "num_docs_to_select_from": 10,
            "slate_size": 1,
            "num_docs_in_db": 100,
            "num_users_in_db": 1,
            "user_time_budget": 1.0,
        },
        "num_envs_per_worker": 2,  # Test with batched inference.
        # NOTE(review): evaluation_interval (20) is larger than the number of
        # training iterations below (10), so evaluation presumably never
        # triggers in this example -- confirm whether that is intended.
        "evaluation_interval": 20,
        "evaluation_duration": 100,
        "evaluation_duration_unit": "episodes",
        "simple_optimizer": True,
    }

    # Number of Tune training iterations per trial. Going by the original
    # note, each iteration covers `timesteps_per_iteration` (100 by default)
    # env steps, i.e. roughly 10 * 100 = 1,000 timesteps per trial.
    # NOTE(review): confirm that default against the installed RLlib version.
    training_iterations = 10

    print("Running training for %s time steps" % training_iterations)

    start_time = time.time()
    analysis = tune.run(
        "BanditLinUCB",
        config=config,
        stop={"training_iteration": training_iterations},
        num_samples=2,
        checkpoint_at_end=False,
    )

    print("The trials took", time.time() - start_time, "seconds\n")

    # Analyze the episode rewards of the trials: stack the per-trial result
    # tables into one frame. `pd.concat` replaces the former
    # `DataFrame.append` loop, which was deprecated in pandas 1.4 and removed
    # in pandas 2.0.
    frame = pd.concat(list(analysis.trial_dataframes.values()), ignore_index=True)

    # Aggregate the reward across trials at each recorded timestep count.
    reward_stats = frame.groupby("agent_timesteps_total")[
        "episode_reward_mean"
    ].aggregate(["mean", "max", "min", "std"])

    mean_reward = reward_stats["mean"]
    std_reward = reward_stats["std"]

    plt.plot(mean_reward)
    # Shaded band: one standard deviation around the mean.
    plt.fill_between(
        reward_stats.index,
        mean_reward - std_reward,
        mean_reward + std_reward,
        color="b",
        alpha=0.2,
    )
    plt.title("Episode reward mean")
    plt.xlabel("Training steps")
    plt.show()
Contextual Bandit algorithms (WIP) (#7642) 2020-03-26 13:41:16 -07:00			`""" Example of using LinUCB on a recommendation environment with parametric`
			`actions. """`

[RLlib] TF2 Bandit Agent (#22838) 2022-03-21 08:55:55 -07:00			`import argparse`
Contextual Bandit algorithms (WIP) (#7642) 2020-03-26 13:41:16 -07:00			`from matplotlib import pyplot as plt`
[RLlib] Move bandits into main agents folder; Make RecSim adapter more accessible; (#21773) 2022-01-27 13:58:12 +01:00			`import os`
Contextual Bandit algorithms (WIP) (#7642) 2020-03-26 13:41:16 -07:00			`import pandas as pd`
[RLlib] Move bandits into main agents folder; Make RecSim adapter more accessible; (#21773) 2022-01-27 13:58:12 +01:00			`import time`
Contextual Bandit algorithms (WIP) (#7642) 2020-03-26 13:41:16 -07:00
[RLlib] Enable Bandits to work in batches mode(s) (vector envs + multiple workers + train_batch_sizes > 1). (#22465) 2022-02-17 22:32:26 +01:00			`import ray`
Contextual Bandit algorithms (WIP) (#7642) 2020-03-26 13:41:16 -07:00			`from ray import tune`
[RLlib] Update bandit_envs_recommender_system (#22421) 2022-02-24 13:43:41 -08:00			`from ray.tune import register_env`
			`from ray.rllib.env.wrappers.recsim import (`
			`MultiDiscreteToDiscreteActionWrapper,`
			`RecSimObservationBanditWrapper,`
			`)`
			`from ray.rllib.examples.env.bandit_envs_recommender_system import (`
			`ParametricRecSys,`
			`)`

			`# Because ParametricRecSys follows RecSim's API, we have to wrap it before`
			`# it can work with our Bandits agent.`
			`register_env(`
			`"ParametricRecSysEnv",`
			`lambda cfg: MultiDiscreteToDiscreteActionWrapper(`
			`RecSimObservationBanditWrapper(ParametricRecSys(**cfg))`
			`),`
			`)`
Contextual Bandit algorithms (WIP) (#7642) 2020-03-26 13:41:16 -07:00
			`if __name__ == "__main__":`
[RLlib] TF2 Bandit Agent (#22838) 2022-03-21 08:55:55 -07:00			`parser = argparse.ArgumentParser()`
			`parser.add_argument(`
			`"--framework",`
			`choices=["tf2", "torch"],`
			`default="torch",`
			`help="The DL framework specifier.",`
			`)`
			`args = parser.parse_args()`
			`print(f"Running with following CLI args: {args}")`

[RLlib] Enable Bandits to work in batches mode(s) (vector envs + multiple workers + train_batch_sizes > 1). (#22465) 2022-02-17 22:32:26 +01:00			`# Temp fix to avoid OMP conflict.`
Contextual Bandit algorithms (WIP) (#7642) 2020-03-26 13:41:16 -07:00			`os.environ["KMP_DUPLICATE_LIB_OK"] = "True"`

[RLlib] Enable Bandits to work in batches mode(s) (vector envs + multiple workers + train_batch_sizes > 1). (#22465) 2022-02-17 22:32:26 +01:00			`ray.init()`

[RLlib] Move bandits into main agents folder; Make RecSim adapter more accessible; (#21773) 2022-01-27 13:58:12 +01:00			`config = {`
[RLlib] TF2 Bandit Agent (#22838) 2022-03-21 08:55:55 -07:00			`"framework": args.framework,`
			`"eager_tracing": (args.framework == "tf2"),`
[RLlib] Update bandit_envs_recommender_system (#22421) 2022-02-24 13:43:41 -08:00			`"env": "ParametricRecSysEnv",`
			`"env_config": {`
			`"embedding_size": 20,`
			`"num_docs_to_select_from": 10,`
			`"slate_size": 1,`
			`"num_docs_in_db": 100,`
			`"num_users_in_db": 1,`
			`"user_time_budget": 1.0,`
			`},`
[RLlib] Enable Bandits to work in batches mode(s) (vector envs + multiple workers + train_batch_sizes > 1). (#22465) 2022-02-17 22:32:26 +01:00			`"num_envs_per_worker": 2, # Test with batched inference.`
[RLlib] Update bandit_envs_recommender_system (#22421) 2022-02-24 13:43:41 -08:00			`"evaluation_interval": 20,`
			`"evaluation_duration": 100,`
			`"evaluation_duration_unit": "episodes",`
			`"simple_optimizer": True,`
[RLlib] Move bandits into main agents folder; Make RecSim adapter more accessible; (#21773) 2022-01-27 13:58:12 +01:00			`}`
Contextual Bandit algorithms (WIP) (#7642) 2020-03-26 13:41:16 -07:00
[RLlib] Fix bandit example scripts and add all scripts to CI testing suite. 2021-06-15 13:30:31 +02:00			`# Actual training_iterations will be 10 * timesteps_per_iteration`
Contextual Bandit algorithms (WIP) (#7642) 2020-03-26 13:41:16 -07:00			`# (100 by default) = 2,000`
[RLlib] Fix bandit example scripts and add all scripts to CI testing suite. 2021-06-15 13:30:31 +02:00			`training_iterations = 10`
Contextual Bandit algorithms (WIP) (#7642) 2020-03-26 13:41:16 -07:00
			`print("Running training for %s time steps" % training_iterations)`

			`start_time = time.time()`
			`analysis = tune.run(`
[RLlib] Move bandits into main agents folder; Make RecSim adapter more accessible; (#21773) 2022-01-27 13:58:12 +01:00			`"BanditLinUCB",`
			`config=config,`
Contextual Bandit algorithms (WIP) (#7642) 2020-03-26 13:41:16 -07:00			`stop={"training_iteration": training_iterations},`
[RLlib] Fix bandit example scripts and add all scripts to CI testing suite. 2021-06-15 13:30:31 +02:00			`num_samples=2,`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`checkpoint_at_end=False,`
			`)`
Contextual Bandit algorithms (WIP) (#7642) 2020-03-26 13:41:16 -07:00
			`print("The trials took", time.time() - start_time, "seconds\n")`

			`# Analyze cumulative regrets of the trials`
			`frame = pd.DataFrame()`
			`for key, df in analysis.trial_dataframes.items():`
			`frame = frame.append(df, ignore_index=True)`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`x = frame.groupby("agent_timesteps_total")["episode_reward_mean"].aggregate(`
			`["mean", "max", "min", "std"]`
			`)`
Contextual Bandit algorithms (WIP) (#7642) 2020-03-26 13:41:16 -07:00
			`plt.plot(x["mean"])`
			`plt.fill_between(`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`x.index, x["mean"] - x["std"], x["mean"] + x["std"], color="b", alpha=0.2`
			`)`
[RLlib] Fix bandit example scripts and add all scripts to CI testing suite. 2021-06-15 13:30:31 +02:00			`plt.title("Episode reward mean")`
Contextual Bandit algorithms (WIP) (#7642) 2020-03-26 13:41:16 -07:00			`plt.xlabel("Training steps")`
			`plt.show()`