invertedpendulum-td3:
  # TD3 experiment with stopping conditions and network sizes tuned
  # specifically for InvertedPendulum. Should be able to reach 1,000 reward
  # (the maximum achievable) in 10,000 to 20,000 steps.
  env: InvertedPendulum-v2
  run: TD3

  # Stopping criteria.
  stop:
    # NOTE(review): the header above gives 1,000 as the maximum achievable
    # reward, so this threshold looks unreachable and the time/timestep limits
    # below would presumably be what ends the run -- confirm the intended
    # value.
    episode_reward_mean: 9999.9
    time_total_s: 900  # 15 minutes
    timesteps_total: 1000000

  config:
    # Works for both torch and tf.
    framework: tf

    # === Model ===
    # Hidden-layer sizes for the actor and critic networks.
    actor_hiddens: [32, 32]
    critic_hiddens: [32, 32]

    # === Exploration ===
    replay_buffer_config:
      learning_starts: 1000
    exploration_config:
      random_timesteps: 1000

    # === Evaluation ===
    evaluation_interval: 10
    evaluation_duration: 5