from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from ray.rllib.agents.ppo.appo_policy import AsyncPPOTFPolicy
from ray.rllib.agents.trainer import with_base_config
from ray.rllib.agents.ppo.ppo import update_kl
from ray.rllib.agents import impala

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_base_config(impala.DEFAULT_CONFIG, {
    # Whether to use V-trace weighted advantages. If false, PPO GAE advantages
    # will be used instead.
    "vtrace": False,

    # == These two options only apply if vtrace: False ==
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # GAE(lambda) parameter
    "lambda": 1.0,

    # == PPO surrogate loss options ==
    "clip_param": 0.4,

    # == PPO KL loss options ==
    "use_kl_loss": False,
    "kl_coeff": 1.0,
    "kl_target": 0.01,

    # == IMPALA optimizer params (see documentation in impala.py) ==
    "sample_batch_size": 50,
    "train_batch_size": 500,
    "min_iter_time_s": 10,
    "num_workers": 2,
    "num_gpus": 0,
    "num_data_loader_buffers": 1,
    "minibatch_buffer_size": 1,
    "num_sgd_iter": 1,
    "replay_proportion": 0.0,
    "replay_buffer_num_slots": 100,
    "learner_queue_size": 16,
    "learner_queue_timeout": 300,
    "max_sample_requests_in_flight_per_worker": 2,
    "broadcast_interval": 1,
    "grad_clip": 40.0,
    "opt_type": "adam",
    "lr": 0.0005,
    "lr_schedule": None,
    "decay": 0.99,
    "momentum": 0.0,
    "epsilon": 0.1,
    "vf_loss_coeff": 0.5,
    "entropy_coeff": 0.01,
    "entropy_coeff_schedule": None,
})
# __sphinx_doc_end__
# yapf: enable
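
# A minimal usage sketch (illustrative only, not executed on import): any key
# in the merged config above can be overridden when building a trainer config,
# for example:
#
#     config = DEFAULT_CONFIG.copy()
#     config["vtrace"] = True       # use V-trace weighted advantages
#     config["num_workers"] = 4     # run more rollout workers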


def update_target_and_kl(trainer, fetches):
    # Update the target network and the KL coefficient based on how many
    # steps the LearnerThread has stepped through since the last update.
    learner_steps = trainer.optimizer.learner.num_steps
    if learner_steps >= trainer.target_update_frequency:

        # Update the target network.
        trainer.optimizer.learner.num_steps = 0
        trainer.workers.local_worker().foreach_trainable_policy(
            lambda p, _: p.update_target())

        # Also update the KL coefficient.
        if trainer.config["use_kl_loss"]:
            update_kl(trainer, trainer.optimizer.learner.stats)


def initialize_target(trainer):
    # Initialize the target networks on the local worker's policies and set
    # how many learner steps must elapse between target updates (with the
    # defaults num_sgd_iter=1 and minibatch_buffer_size=1, this is every step).
    trainer.workers.local_worker().foreach_trainable_policy(
        lambda p, _: p.update_target())
    trainer.target_update_frequency = trainer.config["num_sgd_iter"] \
        * trainer.config["minibatch_buffer_size"]


APPOTrainer = impala.ImpalaTrainer.with_updates(
    name="APPO",
    default_config=DEFAULT_CONFIG,
    default_policy=AsyncPPOTFPolicy,
    get_policy_class=lambda _: AsyncPPOTFPolicy,
    after_init=initialize_target,
    after_optimizer_step=update_target_and_kl)
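

if __name__ == "__main__":
    # Minimal end-to-end sketch, assuming a local Ray runtime and the Gym
    # environment "CartPole-v0" are available; guarded so it only runs when
    # this file is executed directly, never on import.
    import ray

    ray.init()
    trainer = APPOTrainer(config={"num_workers": 2}, env="CartPole-v0")
    for _ in range(3):
        result = trainer.train()
        # train() returns a result dict; episode_reward_mean is one of its
        # standard metrics.
        print("episode_reward_mean:", result["episode_reward_mean"])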