cartpole-appo-vtrace-separate-losses:
    env: CartPole-v0
    run: APPO
    stop:
        episode_reward_mean: 150
        timesteps_total: 200000
    config:
        # Only works for tf|tf2 so far.
        framework: tf
        # Switch on >1 loss/optimizer API for TFPolicy and EagerTFPolicy.
        _tf_policy_handles_more_than_one_loss: true
        # APPO will produce two separate loss terms:
        # policy loss + value function loss.
        _separate_vf_optimizer: true
        # Separate learning rate for the value function branch.
        _lr_vf: 0.00075

        num_envs_per_worker: 5
        num_workers: 1
        num_gpus: 0
        observation_filter: MeanStdFilter
        num_sgd_iter: 6
        vf_loss_coeff: 0.01
        vtrace: true
        model:
            fcnet_hiddens: [32]
            fcnet_activation: linear
            # Make sure we really have completely separate branches.
            vf_share_layers: false
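
# Usage sketch (assumptions: this file is saved locally under the name below and
# the `rllib` CLI from a matching Ray/RLlib installation is on PATH; the exact
# flag spelling may differ between RLlib versions):
#   rllib train -f cartpole-appo-vtrace-separate-losses.yaml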