# RLlib tuned example: SlateQ on the RecSim "long-term satisfaction" env.
# Reconstructed from a corrupted copy (interleaved timestamp/`|` residue
# removed; YAML nesting restored). Key order and all values/comments are
# unchanged from the original content lines.
long-term-satisfaction-recsim-env-slateq:
  env: ray.rllib.examples.env.recommender_system_envs_with_recsim.LongTermSatisfactionRecSimEnv
  run: SlateQ
  stop:
    # Random baseline rewards:
    # num_candidates=20; slate_size=2; resample=true: ~951
    # num_candidates=50; slate_size=3; resample=true: ~946
    evaluation/episode_reward_mean: 1000.0
    timesteps_total: 200000
  config:
    # Works for both tf and torch.
    framework: tf

    metrics_num_episodes_for_smoothing: 200

    # RLlib/RecSim wrapper specific settings:
    env_config:
      config:
        # Each step, sample `num_candidates` documents using the env-internal
        # document sampler model (a logic that creates n documents to select
        # the slate from).
        resample_documents: true
        num_candidates: 50
        # How many documents to recommend (out of `num_candidates`) each
        # timestep?
        slate_size: 2
        # Should the action space be purely Discrete? Useful for algos that
        # don't support MultiDiscrete (e.g. DQN or Bandits).
        # SlateQ handles MultiDiscrete action spaces.
        convert_to_discrete_action_space: false
        seed: 42

    exploration_config:
      warmup_timesteps: 10000
      epsilon_timesteps: 60000

    target_network_update_freq: 3200