invertedpendulum-td3:
    # TD3 with stopping conditions and network sizes tuned specifically for
    # InvertedPendulum. Should be able to reach 1,000 reward (the maximum
    # achievable) in 10,000 to 20,000 steps.
    env: InvertedPendulum-v2
    run: TD3
    # Stopping criteria for the trial. NOTE(review): Ray Tune is expected to
    # end the trial as soon as any one of these is met -- confirm against the
    # Tune version in use.
    stop:
        # Kept just below the 1,000 maximum noted above; a threshold above
        # the maximum could never be met, leaving only the time/timestep
        # caps to end the run.
        episode_reward_mean: 999.9
        time_total_s: 900  # 15 minutes
        timesteps_total: 1000000
    config:
        # === Model ===
        # Hidden-layer sizes for the actor and critic networks (two layers
        # of 32 units each), kept small for this low-dimensional task.
        actor_hiddens: [32, 32]
        critic_hiddens: [32, 32]

        # === Exploration ===
        # Both set to 1,000 steps. NOTE(review): presumably learning begins
        # right when the initial pure-exploration phase ends -- confirm
        # against the RLlib TD3/DDPG config documentation.
        learning_starts: 1000
        pure_exploration_steps: 1000

        # === Evaluation ===
        # NOTE(review): presumably evaluates after every training iteration,
        # over 5 episodes each time -- confirm against the RLlib docs.
        evaluation_interval: 1
        evaluation_num_episodes: 5