from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer
from ray.rllib.agents.qmix.qmix_policy import QMixTorchPolicy
from ray.rllib.optimizers import SyncBatchReplayOptimizer

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    # === QMix ===
    # Mixing network. Either "qmix", "vdn", or None
    "mixer": "qmix",
    # Size of the mixing network embedding
    "mixing_embed_dim": 32,
    # Whether to use Double_Q learning
    "double_q": True,
    # Optimize over complete episodes by default.
    "batch_mode": "complete_episodes",

    # === Evaluation ===
    # Evaluate with epsilon=0 every `evaluation_interval` training iterations.
    # The evaluation stats will be reported under the "evaluation" metric key.
    # Note that evaluation is currently not parallelized, and that for Ape-X
    # metrics are already only reported for the lowest epsilon workers.
    "evaluation_interval": None,
    # Number of episodes to run per evaluation period.
    "evaluation_num_episodes": 10,
    # === Exploration ===
    # Max num timesteps for annealing schedules. Exploration is annealed
    # from 1.0 to exploration_final_eps over schedule_max_timesteps *
    # exploration_fraction timesteps.
    "schedule_max_timesteps": 100000,
    # Number of env steps to optimize for before returning
    "timesteps_per_iteration": 1000,
    # Fraction of entire training period over which the exploration rate is
    # annealed
    "exploration_fraction": 0.1,
    # Final value of random action probability
    "exploration_final_eps": 0.02,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 500,

    # === Replay buffer ===
    # Size of the replay buffer in steps.
    "buffer_size": 10000,

    # === Optimization ===
    # Learning rate for RMSProp optimizer
    "lr": 0.0005,
    # RMSProp alpha
    "optim_alpha": 0.99,
    # RMSProp epsilon
    "optim_eps": 0.00001,
    # If not None, clip gradients during optimization at this value
    "grad_norm_clipping": 10,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 1000,
    # Update the replay buffer with this many samples at once. Note that
    # this setting applies per-worker if num_workers > 1.
    "sample_batch_size": 4,
    # Size of a batch sampled from the replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": 32,

    # === Parallelism ===
    # Number of workers used to collect samples. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Whether to use a distribution of epsilons across workers for exploration.
    "per_worker_exploration": False,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent training iterations from taking less than this many seconds
    "min_iter_time_s": 1,

    # === Model ===
    "model": {
        "lstm_cell_size": 64,
        "max_seq_len": 999999,
    },
})
# __sphinx_doc_end__
# yapf: enable


def make_sync_batch_optimizer(workers, config):
    # Build the synchronous batch-replay optimizer used by QMIX.
    return SyncBatchReplayOptimizer(
        workers,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        train_batch_size=config["train_batch_size"])


# QMixTrainer reuses the generic off-policy (DQN-style) trainer, swapping in
# the QMix torch policy and the synchronous batch-replay optimizer above.
QMixTrainer = GenericOffPolicyTrainer.with_updates(
    name="QMIX",
    default_config=DEFAULT_CONFIG,
    default_policy=QMixTorchPolicy,
    make_policy_optimizer=make_sync_batch_optimizer)
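

# Usage sketch (hedged): one way this trainer might be driven. QMIX expects a
# grouped multi-agent environment; "grouped_twostep" below is a hypothetical
# registered env id used purely for illustration, and the config overrides are
# arbitrary examples of DEFAULT_CONFIG keys defined above.
#
#     import ray
#     from ray.rllib.agents.qmix import QMixTrainer
#
#     ray.init()
#     trainer = QMixTrainer(
#         env="grouped_twostep",        # hypothetical grouped env id
#         config={
#             "mixer": "qmix",          # or "vdn", or None for no mixer
#             "num_workers": 0,
#             "train_batch_size": 32,
#         })
#     for _ in range(10):
#         result = trainer.train()      # one training iteration per call
#         print(result["episode_reward_mean"])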