pendulum_crr:
  env: 'Pendulum-v1'
  run: CRR
  stop:
    # We could make this -200, but given that we only have 4 CPUs for our
    # tests, we have to settle for -300.
    evaluation/episode_reward_mean: -300
    timesteps_total: 2000000
  config:
    input: 'dataset'
    input_config:
      paths: 'tests/data/pendulum/pendulum_replay_v1.1.0.zip'
      format: 'json'
    framework: torch
    num_workers: 3
    gamma: 0.99
    train_batch_size: 1024

    # actor/critic networks
    critic_hidden_activation: 'relu'
    critic_hiddens: [256, 256]
    critic_lr: 0.0003
    actor_hidden_activation: 'relu'
    actor_hiddens: [256, 256]
    actor_lr: 0.0003
    actions_in_input_normalized: True
    clip_actions: True

    # Q-function update settings
    twin_q: True
    target_network_update_freq: 1
    tau: 0.0001

    # evaluation
    evaluation_config:
      explore: False
      input: sampler
    evaluation_duration: 10
    evaluation_duration_unit: episodes
    evaluation_interval: 1
    evaluation_num_workers: 1
    evaluation_parallel_to_training: True

    # replay buffer
    # specific to CRR
    temperature: 1.0
    weight_type: exp
    advantage_type: max
    max_weight: 20.0
    n_action_sample: 4
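
# Usage sketch (not part of the config; the exact CLI varies by Ray version,
# and the file name below is hypothetical, so substitute this file's real path):
#
#   rllib train -f pendulum-v1-crr.yaml       # older Ray releases
#   rllib train file pendulum-v1-crr.yaml     # newer Ray releases
#
# Equivalently from Python (a minimal sketch, assuming ray[rllib] and pyyaml
# are installed; tuned-example YAMLs map directly onto Tune experiment specs):
#
#   import yaml
#   from ray import tune
#
#   with open("pendulum-v1-crr.yaml") as f:
#       experiments = yaml.safe_load(f)   # {"pendulum_crr": {...}}
#   tune.run_experiments(experiments)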