# Pendulum SAC can attain -150+ reward in 6-7k timesteps.
# Configurations are similar to the original softlearning/sac codebase.
pendulum_sac:
    env: Pendulum-v0
    run: SAC
    stop:
        episode_reward_mean: -150
    config:
        horizon: 200
        soft_horizon: False
        Q_model:
            hidden_activation: relu
            hidden_layer_sizes: [256, 256]
        policy_model:
            hidden_activation: relu
            hidden_layer_sizes: [256, 256]
        tau: 0.005
        target_entropy: auto
        no_done_at_end: True
        n_step: 1
        sample_batch_size: 1
        prioritized_replay: False
        train_batch_size: 256
        target_network_update_freq: 1
        timesteps_per_iteration: 1000
        learning_starts: 256
        optimization:
            actor_learning_rate: 0.0003
            critic_learning_rate: 0.0003
            entropy_learning_rate: 0.0003
        num_workers: 0
        num_gpus: 0
        clip_actions: False
        normalize_actions: True
        evaluation_interval: 1
        metrics_smoothing_episodes: 5
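
# A minimal sketch of how to launch this tuned example via the RLlib CLI,
# assuming Ray with RLlib is installed and this config is saved as
# pendulum_sac.yaml (the filename here is illustrative):
#
#   rllib train -f pendulum_sac.yaml
#
# Tune will run SAC on Pendulum-v0 until episode_reward_mean reaches -150,
# per the stop condition above.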