interest-evolution-recsim-env-slateq:
    env: ray.rllib.examples.env.recommender_system_envs_with_recsim.InterestEvolutionRecSimEnv
    run: SlateQ
    stop:
        episode_reward_mean: 160.0
        timesteps_total: 100000
    config:
        framework: tf

        # RLlib/RecSim wrapper specific settings:
        env_config:
            # Env class specified above takes one `config` arg in its c'tor:
            config:
                # Each step, sample `num_candidates` documents using the env-internal
                # document sampler model (the logic that creates n documents to select
                # the slate from).
                resample_documents: true
                num_candidates: 10
                # How many documents to recommend (out of `num_candidates`) each
                # timestep?
                slate_size: 2
                # Should the action space be purely Discrete? Useful for algos that
                # don't support MultiDiscrete (e.g. DQN or Bandits).
                # SlateQ handles MultiDiscrete action spaces.
                convert_to_discrete_action_space: false
                seed: 0

        # Fake 2 GPUs.
        num_gpus: 2
        _fake_gpus: true

        exploration_config:
            warmup_timesteps: 10000
            epsilon_timesteps: 25000

        replay_buffer_config:
            capacity: 100000

        # Double learning rate and batch size.
        lr: 0.002
        train_batch_size: 64

        learning_starts: 10000
        target_network_update_freq: 3200

        metrics_num_episodes_for_smoothing: 200
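
# Usage note (an assumption added for illustration, not part of the original tuned
# example): experiment files in this format are typically launched through the RLlib
# CLI, e.g. `rllib train -f <path-to-this-file>` on Ray versions whose `rllib train`
# command accepts a YAML experiment file via `-f/--config-file`. Replace
# `<path-to-this-file>` with the actual location of this file.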