ray/rllib/examples/bandit/tune_lin_ucb_train_recommendation.py

""" Example of using LinUCB on a recommendation environment with parametric
actions. """
import argparse
from matplotlib import pyplot as plt
import os
import pandas as pd
import time
import ray
from ray import air, tune
from ray.tune import register_env
from ray.rllib.env.wrappers.recsim import (
    MultiDiscreteToDiscreteActionWrapper,
    RecSimObservationBanditWrapper,
)
from ray.rllib.examples.env.bandit_envs_recommender_system import (
    ParametricRecSys,
)

# Because ParametricRecSys follows RecSim's API, we have to wrap it before
# it can work with our Bandits agent.
register_env(
    "ParametricRecSysEnv",
    lambda cfg: MultiDiscreteToDiscreteActionWrapper(
        RecSimObservationBanditWrapper(ParametricRecSys(**cfg))
    ),
)
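
# `MultiDiscreteToDiscreteActionWrapper` flattens RecSim's MultiDiscrete slate
# action into a single Discrete action (RLlib's Bandit algorithms only handle
# Discrete action spaces), while `RecSimObservationBanditWrapper` reshapes the
# RecSim observation dict into the flat, per-item ("parametric actions") format
# the Bandit policies expect.
#
# To inspect the resulting spaces, one could build a single wrapped env by
# hand. A minimal, commented-out sketch (it simply reuses the same kwargs as
# the `env_config` below and has not been validated here):
#
# env = MultiDiscreteToDiscreteActionWrapper(
#     RecSimObservationBanditWrapper(
#         ParametricRecSys(
#             embedding_size=20,
#             num_docs_to_select_from=10,
#             slate_size=1,
#             num_docs_in_db=100,
#             num_users_in_db=1,
#             user_time_budget=1.0,
#         )
#     )
# )
# print(env.observation_space, env.action_space)
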
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--framework",
        choices=["tf2", "torch"],
        default="torch",
        help="The DL framework specifier.",
    )
    args = parser.parse_args()
    print(f"Running with following CLI args: {args}")

    # Temp fix to avoid OMP conflict.
    os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

    ray.init()

    config = {
"framework": args.framework,
"eager_tracing": (args.framework == "tf2"),
"env": "ParametricRecSysEnv",
"env_config": {
"embedding_size": 20,
"num_docs_to_select_from": 10,
"slate_size": 1,
"num_docs_in_db": 100,
"num_users_in_db": 1,
"user_time_budget": 1.0,
},
"num_envs_per_worker": 2, # Test with batched inference.
"evaluation_interval": 20,
"evaluation_duration": 100,
"evaluation_duration_unit": "episodes",
"simple_optimizer": True,
}
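
    # The same setup could also be expressed via RLlib's config-builder API.
    # A rough, commented-out sketch only (it assumes `BanditLinUCBConfig` is
    # exposed by `ray.rllib.algorithms.bandit` in this Ray version and has not
    # been validated here):
    #
    # from ray.rllib.algorithms.bandit import BanditLinUCBConfig
    #
    # ucb_config = (
    #     BanditLinUCBConfig()
    #     .environment("ParametricRecSysEnv", env_config=config["env_config"])
    #     .framework(args.framework, eager_tracing=args.framework == "tf2")
    #     .rollouts(num_envs_per_worker=2)
    #     .evaluation(
    #         evaluation_interval=20,
    #         evaluation_duration=100,
    #         evaluation_duration_unit="episodes",
    #     )
    # )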

    # Actual env timesteps per `train()` call will be
    # 10 * min_sample_timesteps_per_iteration (100 by default) = 1,000.
    training_iterations = 10

    print("Running training for %s iterations." % training_iterations)

    start_time = time.time()
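
    # Run two independent trials of the same config (`num_samples=2` below) so
    # that the reward curves can be aggregated (mean +/- std) across trials.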
    tuner = tune.Tuner(
        "BanditLinUCB",
        param_space=config,
        run_config=air.RunConfig(
            stop={"training_iteration": training_iterations},
            checkpoint_config=air.CheckpointConfig(
                checkpoint_at_end=False,
            ),
        ),
        tune_config=tune.TuneConfig(
            num_samples=2,
        ),
    )
    results = tuner.fit()

    print("The trials took", time.time() - start_time, "seconds\n")
    # Aggregate the per-trial reward curves into a single DataFrame.
    frame = pd.DataFrame()
    for result in results:
        # `DataFrame.append` was removed in pandas 2.0; use `pd.concat` instead.
        frame = pd.concat([frame, result.metrics_dataframe], ignore_index=True)
    x = frame.groupby("agent_timesteps_total")["episode_reward_mean"].aggregate(
        ["mean", "max", "min", "std"]
    )
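
    # `x` is indexed by `agent_timesteps_total`; the shaded band below shows the
    # mean episode reward +/- one std across the trials at each timestep count.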
plt.plot(x["mean"])
plt.fill_between(
x.index, x["mean"] - x["std"], x["mean"] + x["std"], color="b", alpha=0.2
)
plt.title("Episode reward mean")
plt.xlabel("Training steps")
plt.show()