from ray.rllib.agents.ppo.appo_policy import AsyncPPOTFPolicy
from ray.rllib.agents.trainer import with_base_config
from ray.rllib.agents.ppo.ppo import update_kl
from ray.rllib.agents import impala

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_base_config(impala.DEFAULT_CONFIG, {
    # Whether to use V-trace weighted advantages. If false, PPO GAE advantages
    # will be used instead.
    "vtrace": False,

    # == These two options only apply if vtrace: False ==
    # Should use a critic as a baseline (otherwise don't use value baseline;
    # required for using GAE).
    "use_critic": True,
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # GAE(lambda) parameter
    "lambda": 1.0,

    # == PPO surrogate loss options ==
    "clip_param": 0.4,

    # == PPO KL Loss options ==
    "use_kl_loss": False,
    "kl_coeff": 1.0,
    "kl_target": 0.01,

    # == IMPALA optimizer params (see documentation in impala.py) ==
    "rollout_fragment_length": 50,
    "train_batch_size": 500,
    "min_iter_time_s": 10,
    "num_workers": 2,
    "num_gpus": 0,
    "num_data_loader_buffers": 1,
    "minibatch_buffer_size": 1,
    "num_sgd_iter": 1,
    "replay_proportion": 0.0,
    "replay_buffer_num_slots": 100,
    "learner_queue_size": 16,
    "learner_queue_timeout": 300,
    "max_sample_requests_in_flight_per_worker": 2,
    "broadcast_interval": 1,
    "grad_clip": 40.0,
    "opt_type": "adam",
    "lr": 0.0005,
    "lr_schedule": None,
    "decay": 0.99,
    "momentum": 0.0,
    "epsilon": 0.1,
    "vf_loss_coeff": 0.5,
    "entropy_coeff": 0.01,
    "entropy_coeff_schedule": None,
})
# __sphinx_doc_end__
# yapf: enable


def update_target_and_kl(trainer, fetches):
    # Check how many minibatches the LearnerThread has stepped through since
    # the last target update.
    learner_steps = trainer.optimizer.learner.num_steps
    if learner_steps >= trainer.target_update_frequency:
        # Update the target network on the local worker's policies.
        trainer.optimizer.learner.num_steps = 0
        trainer.workers.local_worker().foreach_trainable_policy(
            lambda p, _: p.update_target())

        # Also update the KL coeff, if the KL loss is enabled.
        if trainer.config["use_kl_loss"]:
            update_kl(trainer, trainer.optimizer.learner.stats)


def initialize_target(trainer):
    # Runs once after trainer init: do an initial target network sync and
    # compute how many learner steps should pass between target updates.
    trainer.workers.local_worker().foreach_trainable_policy(
        lambda p, _: p.update_target())
    trainer.target_update_frequency = trainer.config["num_sgd_iter"] \
        * trainer.config["minibatch_buffer_size"]


# APPO reuses the IMPALA trainer, swapping in the asynchronous PPO policy and
# wiring up the target-network / KL-coefficient update hooks defined above.
APPOTrainer = impala.ImpalaTrainer.with_updates(
    name="APPO",
    default_config=DEFAULT_CONFIG,
    default_policy=AsyncPPOTFPolicy,
    get_policy_class=lambda _: AsyncPPOTFPolicy,
    after_init=initialize_target,
    after_optimizer_step=update_target_and_kl)
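

# ---------------------------------------------------------------------------
# Usage sketch (not part of the trainer definition above). A minimal, hedged
# example of training APPO on a Gym environment, assuming this module ships
# inside the released `ray` package so that APPOTrainer is importable as
# defined here. The environment name and config overrides are illustrative.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import ray

    ray.init()
    trainer = APPOTrainer(
        env="CartPole-v0",
        config={
            "num_workers": 2,
            # Turn on the PPO-style KL loss so that update_target_and_kl
            # also adapts the KL coefficient at each target update.
            "use_kl_loss": True,
        })
    # Run a few training iterations and report the mean episode reward
    # (may be NaN until enough episodes have completed).
    for _ in range(3):
        result = trainer.train()
        print(result["episode_reward_mean"])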