# ray/rllib/tuned_examples/slateq/long-term-satisfaction-recsim-env-slateq.yaml

# Experiment: train SlateQ on the RecSim-based LongTermSatisfactionRecSimEnv.
long-term-satisfaction-recsim-env-slateq:
    env: ray.rllib.examples.env.recommender_system_envs_with_recsim.LongTermSatisfactionRecSimEnv
    run: SlateQ
    # Stopping criteria (presumably the run ends as soon as either one is
    # met -- confirm against the launcher that consumes this file).
    stop:
        # Random baseline rewards:
        # num_candidates=20; slate_size=2; resample=true: ~951
        # num_candidates=50; slate_size=3; resample=true: ~946
        # NOTE(review): neither baseline matches the combination configured
        # below (num_candidates=50; slate_size=2) exactly.
        evaluation/episode_reward_mean: 1000.0
        timesteps_total: 200000
    config:
        # SlateQ only supported for torch so far.
        framework: torch

        # Presumably the number of most recent episodes over which reported
        # metrics are averaged -- TODO confirm.
        metrics_num_episodes_for_smoothing: 1000

        # RLlib/RecSim wrapper specific settings:
        env_config:
            config:
                # Each step, sample `num_candidates` documents using the env-internal
                # document sampler model (a logic that creates n documents to select
                # the slate from).
                resample_documents: true
                num_candidates: 50
                # How many documents to recommend (out of `num_candidates`) each
                # timestep?
                slate_size: 2
                # Should the action space be purely Discrete? Useful for algos that
                # don't support MultiDiscrete (e.g. DQN or Bandits).
                # SlateQ handles MultiDiscrete action spaces.
                convert_to_discrete_action_space: false
                # Fixed seed handed to the environment (presumably for
                # reproducible document/user sampling -- TODO confirm).
                seed: 42

        # Exploration strategy and its temperature parameter.
        exploration_config:
            type: SlateSoftQ
            temperature: 0.7

        # Presumably the hidden-layer sizes of the Q-model -- TODO confirm
        # which network(s) this applies to.
        hiddens: [256, 256]

        # No extra workers and no GPUs are requested for training
        # (evaluation workers are configured separately below).
        num_workers: 0
        num_gpus: 0

        # Separate learning rates for the choice model and the Q-model.
        lr_choice_model: 0.002
        lr_q_model: 0.001

        # Target-network update settings.
        # NOTE(review): tau=1.0 presumably means each update fully copies the
        # weights into the target network (hard update) -- confirm; the unit
        # of the update frequency is not visible in this file.
        target_network_update_freq: 800
        tau: 1.0

        # Evaluation settings.
        # Each evaluation runs for 200 episodes (duration + unit below) on 4
        # dedicated evaluation workers. The reward-based stop criterion above
        # reads an `evaluation/...` metric, so it presumably depends on
        # evaluation being enabled here.
        evaluation_interval: 1
        evaluation_num_workers: 4
        evaluation_duration: 200
        evaluation_duration_unit: episodes
        evaluation_parallel_to_training: true