# TD3 with stopping conditions and network size tuned specifically for
# InvertedPendulum. Should be able to reach 1,000 reward (the maximum
# achievable) in 10,000 to 20,000 steps.
invertedpendulum-td3:
  env: InvertedPendulum-v2
  run: TD3

  # The run ends as soon as any one of these criteria is met.
  stop:
    # NOTE(review): 9999.9 is above the 1,000 maximum reward stated in the
    # header comment, so this criterion can presumably never trigger and the
    # run will end on one of the two limits below instead. Confirm whether
    # 999.9 was intended.
    episode_reward_mean: 9999.9
    time_total_s: 900  # 15 minutes
    timesteps_total: 1000000

  config:
    # === Model ===
    actor_hiddens: [32, 32]
    critic_hiddens: [32, 32]

    # === Exploration ===
    learning_starts: 1000
    pure_exploration_steps: 1000

    # === Evaluation ===
    evaluation_interval: 1
    evaluation_num_episodes: 5