# ray/rllib/tuned_examples/slateq/parametric-item-reco-env-slateq.yaml
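#
# How to run this tuned example (a sketch, assuming the pre-2.0 RLlib CLI that
# shipped alongside these YAML tuned examples; the exact command may differ for
# your Ray version):
#
#   rllib train -f parametric-item-reco-env-slateq.yaml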

parametric-item-reco-env-slateq:
    env: ray.rllib.examples.env.bandit_envs_recommender_system.ParametricItemRecoEnv
    run: SlateQ
    stop:
        #evaluation/episode_reward_mean: 48.0
        timesteps_total: 200000
    config:
        # SlateQ is only supported for torch so far.
        framework: torch
        # Smooth reported episode metrics over the last 200 episodes.
        metrics_num_episodes_for_smoothing: 200
        # SoftQ-style exploration: sample slates from a softmax over Q-values;
        # lower temperature means greedier behavior.
        exploration_config:
            temperature: 0.7
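        # ParametricItemRecoEnv is a toy recommender-system env that samples
        # random feature vectors for users and items. Rough meaning of the keys
        # below (inferred from the key names; see the env source for details):
        # the simulation has `num_users` users and a catalog of `num_items`
        # items, `num_candidates` of which are offered each step; the agent
        # recommends a slate of `slate_size` of them, and `feature_dim` is the
        # size of the user/item feature vectors.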
        # Env c'tor kwargs:
        env_config:
            config:
                slate_q: true
                num_users: 50
                num_items: 1000
                num_candidates: 50
                slate_size: 1
                feature_dim: 16
        # Clip gradients at this value.
        grad_clip: 10.0
        #double_q: false
        #slateq_strategy: MYOP
        # Larger networks seem to help (large obs/action spaces).
        hiddens: [512, 512]
        # Larger batch sizes seem to help (more stability, even with higher lr).
        train_batch_size: 64
        # Sample on the local worker only and train on CPU.
        num_workers: 0
        num_gpus: 0
        # Separate learning rates for SlateQ's user-choice model and Q-network.
        lr_choice_model: 0.01
        lr_q_model: 0.01
        # Sync the target network every 500 sampled timesteps (tau=1.0 means a
        # full/hard copy rather than a soft update).
        target_network_update_freq: 500
        tau: 1.0
        # Evaluation settings: evaluate every training iteration, in parallel to
        # training, using 4 dedicated evaluation workers and running 200 episodes
        # per evaluation round.
        evaluation_interval: 1
        evaluation_num_workers: 4
        evaluation_duration: 200
        evaluation_duration_unit: episodes
        evaluation_parallel_to_training: true
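
# For reference, a rough Python equivalent of launching this experiment with
# Ray Tune (a sketch only, assuming a Ray 1.x-era API in which SlateQ and this
# YAML format were available; kept as comments so this file stays valid YAML):
#
#   from ray import tune
#   from ray.rllib.examples.env.bandit_envs_recommender_system import (
#       ParametricItemRecoEnv,
#   )
#
#   tune.run(
#       "SlateQ",
#       stop={"timesteps_total": 200000},
#       config={
#           "env": ParametricItemRecoEnv,
#           "framework": "torch",
#           # ...remaining keys exactly as in the `config:` block above...
#       },
#   )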