ray/rllib/examples/bandit/tune_lin_ucb_train_recommendation.py

""" Example of using LinUCB on a recommendation environment with parametric
actions. """
import argparse
from matplotlib import pyplot as plt
import os
import pandas as pd
import time
import ray
from ray import air, tune
from ray.tune import register_env
from ray.rllib.env.wrappers.recsim import (
    MultiDiscreteToDiscreteActionWrapper,
    RecSimObservationBanditWrapper,
)
from ray.rllib.examples.env.bandit_envs_recommender_system import (
    ParametricRecSys,
)

# Because ParametricRecSys follows RecSim's API, we have to wrap it before
# it can work with our Bandits agent.
register_env(
    "ParametricRecSysEnv",
    lambda cfg: MultiDiscreteToDiscreteActionWrapper(
        RecSimObservationBanditWrapper(ParametricRecSys(**cfg))
    ),
)
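
# `MultiDiscreteToDiscreteActionWrapper` flattens RecSim's MultiDiscrete slate
# action into a single Discrete action (RLlib's Bandit algorithms only handle
# Discrete action spaces), while `RecSimObservationBanditWrapper` reshapes the
# RecSim observation dict into the flat, per-item ("parametric actions") format
# the Bandit policies expect.
#
# To inspect the resulting spaces, one could build a single wrapped env by
# hand. A minimal, commented-out sketch (it simply reuses the same kwargs as
# the `env_config` below and has not been validated here):
#
# env = MultiDiscreteToDiscreteActionWrapper(
#     RecSimObservationBanditWrapper(
#         ParametricRecSys(
#             embedding_size=20,
#             num_docs_to_select_from=10,
#             slate_size=1,
#             num_docs_in_db=100,
#             num_users_in_db=1,
#             user_time_budget=1.0,
#         )
#     )
# )
# print(env.observation_space, env.action_space)
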
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--framework",
        choices=["tf2", "torch"],
        default="torch",
        help="The DL framework specifier.",
    )
    args = parser.parse_args()
    print(f"Running with following CLI args: {args}")

    # Temp fix to avoid OMP conflict.
    os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

    ray.init()

    config = {
"framework": args.framework,
"eager_tracing": (args.framework == "tf2"),
"env": "ParametricRecSysEnv",
"env_config": {
"embedding_size": 20,
"num_docs_to_select_from": 10,
"slate_size": 1,
"num_docs_in_db": 100,
"num_users_in_db": 1,
"user_time_budget": 1.0,
},
"num_envs_per_worker": 2, # Test with batched inference.
"evaluation_interval": 20,
"evaluation_duration": 100,
"evaluation_duration_unit": "episodes",
"simple_optimizer": True,
}
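
    # The same setup could also be expressed via RLlib's config-builder API.
    # A rough, commented-out sketch only (it assumes `BanditLinUCBConfig` is
    # exposed by `ray.rllib.algorithms.bandit` in this Ray version and has not
    # been validated here):
    #
    # from ray.rllib.algorithms.bandit import BanditLinUCBConfig
    #
    # ucb_config = (
    #     BanditLinUCBConfig()
    #     .environment("ParametricRecSysEnv", env_config=config["env_config"])
    #     .framework(args.framework, eager_tracing=args.framework == "tf2")
    #     .rollouts(num_envs_per_worker=2)
    #     .evaluation(
    #         evaluation_interval=20,
    #         evaluation_duration=100,
    #         evaluation_duration_unit="episodes",
    #     )
    # )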

    # Actual env timesteps per `train()` call will be
    # 10 * min_sample_timesteps_per_iteration (100 by default) = 1,000.
    training_iterations = 10

    print("Running training for %s iterations." % training_iterations)

    start_time = time.time()
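
    # Run two independent trials of the same config (`num_samples=2` below) so
    # that the reward curves can be aggregated (mean +/- std) across trials.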
    tuner = tune.Tuner(
        "BanditLinUCB",
        param_space=config,
        run_config=air.RunConfig(
            stop={"training_iteration": training_iterations},
            checkpoint_config=air.CheckpointConfig(
                checkpoint_at_end=False,
            ),
        ),
        tune_config=tune.TuneConfig(
            num_samples=2,
        ),
    )
    results = tuner.fit()

    print("The trials took", time.time() - start_time, "seconds\n")
    # Aggregate the per-trial reward curves into a single DataFrame.
    frame = pd.DataFrame()
    for result in results:
        # `DataFrame.append` was removed in pandas 2.0; use `pd.concat` instead.
        frame = pd.concat([frame, result.metrics_dataframe], ignore_index=True)
    x = frame.groupby("agent_timesteps_total")["episode_reward_mean"].aggregate(
        ["mean", "max", "min", "std"]
    )
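
    # `x` is indexed by `agent_timesteps_total`; the shaded band below shows the
    # mean episode reward +/- one std across the trials at each timestep count.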
plt.plot(x["mean"])
plt.fill_between(
x.index, x["mean"] - x["std"], x["mean"] + x["std"], color="b", alpha=0.2
)
plt.title("Episode reward mean")
plt.xlabel("Training steps")
plt.show()