from ray.rllib.policy.dynamic_tf_policy import DynamicTFPolicy
from ray.rllib.policy import eager_tf_policy
from ray.rllib.policy.policy import Policy, LEARNER_STATS_KEY
from ray.rllib.policy.tf_policy import TFPolicy
from ray.rllib.utils import add_mixins
from ray.rllib.utils.annotations import override, DeveloperAPI
from ray.rllib.utils import try_import_tf

tf = try_import_tf()


@DeveloperAPI
def build_tf_policy(name,
                    loss_fn,
                    get_default_config=None,
                    postprocess_fn=None,
                    stats_fn=None,
                    optimizer_fn=None,
                    gradients_fn=None,
                    apply_gradients_fn=None,
                    grad_stats_fn=None,
                    extra_action_fetches_fn=None,
                    extra_learn_fetches_fn=None,
                    before_init=None,
                    before_loss_init=None,
                    after_init=None,
                    make_model=None,
                    action_sampler_fn=None,
                    log_likelihood_fn=None,
                    mixins=None,
                    get_batch_divisibility_req=None,
                    obs_include_prev_action_reward=True):
"""Helper function for creating a dynamic tf policy at runtime.
|
|
|
|
Functions will be run in this order to initialize the policy:
|
|
1. Placeholder setup: postprocess_fn
|
|
2. Loss init: loss_fn, stats_fn
|
|
3. Optimizer init: optimizer_fn, gradients_fn, apply_gradients_fn,
|
|
grad_stats_fn
|
|
|
|
This means that you can e.g., depend on any policy attributes created in
|
|
the running of `loss_fn` in later functions such as `stats_fn`.
|
|
|
|
In eager mode, the following functions will be run repeatedly on each
|
|
eager execution: loss_fn, stats_fn, gradients_fn, apply_gradients_fn,
|
|
and grad_stats_fn.
|
|
|
|
This means that these functions should not define any variables internally,
|
|
otherwise they will fail in eager mode execution. Variable should only
|
|
be created in make_model (if defined).
|
|
|
|
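    For example (a minimal sketch; `my_loss_fn`, `my_stats_fn`, and the
    `_mean_rew` attribute below are hypothetical):

        def my_loss_fn(policy, model, dist_class, train_batch):
            logits, _ = model.from_batch(train_batch)
            action_dist = dist_class(logits, model)
            # Attributes created here during loss init ...
            policy._mean_rew = tf.reduce_mean(train_batch["rewards"])
            return -tf.reduce_mean(
                action_dist.logp(train_batch["actions"]) *
                train_batch["rewards"])

        def my_stats_fn(policy, train_batch):
            # ... can be read here, since stats are fetched after the loss
            # has been built.
            return {"mean_rew": policy._mean_rew}
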
    Arguments:
        name (str): name of the policy (e.g., "PPOTFPolicy")
        loss_fn (func): function that returns a loss tensor given
            (policy, model, dist_class, train_batch)
        get_default_config (func): optional function that returns the default
            config to merge with any overrides
        postprocess_fn (func): optional experience postprocessing function
            that takes the same args as Policy.postprocess_trajectory()
        stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and batch input tensors
        optimizer_fn (func): optional function that returns a tf.Optimizer
            given the policy and config
        gradients_fn (func): optional function that returns a list of
            gradients given (policy, optimizer, loss). If not specified,
            this defaults to optimizer.compute_gradients(loss)
        apply_gradients_fn (func): optional function that returns an apply
            gradients op given (policy, optimizer, grads_and_vars)
        grad_stats_fn (func): optional function that returns a dict of
            TF fetches given the policy, batch input, and gradient tensors
        extra_action_fetches_fn (func): optional function that returns
            a dict of TF fetches given the policy object
        extra_learn_fetches_fn (func): optional function that returns a dict
            of extra values to fetch and return when learning on a batch
        before_init (func): optional function to run at the beginning of
            policy init that takes the same arguments as the policy
            constructor
        before_loss_init (func): optional function to run prior to loss
            init that takes the same arguments as the policy constructor
        after_init (func): optional function to run at the end of policy init
            that takes the same arguments as the policy constructor
        make_model (func): optional function that returns a ModelV2 object
            given (policy, obs_space, action_space, config).
            All policy variables should be created in this function. If not
            specified, a default model will be created.
        action_sampler_fn (Optional[callable]): An optional callable returning
            a tuple of action and action prob tensors given
            (policy, model, input_dict, obs_space, action_space, config).
            If None, a default action distribution will be used.
        log_likelihood_fn (Optional[callable]): A callable returning a
            log-likelihood op. If None, a default class is used and
            distribution inputs (for parameterization) will be generated by
            a model call.
        mixins (list): list of any class mixins for the returned policy class.
            These mixins will be applied in order and will have higher
            precedence than the DynamicTFPolicy class
        get_batch_divisibility_req (func): optional function that returns
            the divisibility requirement for sample batches
        obs_include_prev_action_reward (bool): whether to include the
            previous action and reward in the model input

    Returns:
        a DynamicTFPolicy instance that uses the specified args
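
    Examples:
        A rough sketch of building and using a policy class from the
        functions above (`MyTFPolicy` and `MyOtherTFPolicy` are
        hypothetical names):

            MyTFPolicy = build_tf_policy(
                name="MyTFPolicy",
                loss_fn=my_loss_fn,
                stats_fn=my_stats_fn)

            # A copy with one build argument overridden, and the
            # eager-mode variant of the same policy.
            MyOtherTFPolicy = MyTFPolicy.with_updates(
                name="MyOtherTFPolicy", stats_fn=None)
            MyEagerTFPolicy = MyTFPolicy.as_eager()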
"""
|
|
    original_kwargs = locals().copy()
    base = add_mixins(DynamicTFPolicy, mixins)

    class policy_cls(base):
        def __init__(self,
                     obs_space,
                     action_space,
                     config,
                     existing_model=None,
                     existing_inputs=None):
            if get_default_config:
                config = dict(get_default_config(), **config)

            if before_init:
                before_init(self, obs_space, action_space, config)

            # Wrap the user-provided `before_loss_init` so that the extra
            # action fetches are also set up before loss initialization.
            def before_loss_init_wrapper(policy, obs_space, action_space,
                                         config):
                if before_loss_init:
                    before_loss_init(policy, obs_space, action_space, config)
                if extra_action_fetches_fn is None:
                    self._extra_action_fetches = {}
                else:
                    self._extra_action_fetches = extra_action_fetches_fn(self)

            DynamicTFPolicy.__init__(
                self,
                obs_space,
                action_space,
                config,
                loss_fn,
                stats_fn=stats_fn,
                grad_stats_fn=grad_stats_fn,
                before_loss_init=before_loss_init_wrapper,
                make_model=make_model,
                action_sampler_fn=action_sampler_fn,
                log_likelihood_fn=log_likelihood_fn,
                existing_model=existing_model,
                existing_inputs=existing_inputs,
                get_batch_divisibility_req=get_batch_divisibility_req,
                obs_include_prev_action_reward=obs_include_prev_action_reward)

            if after_init:
                after_init(self, obs_space, action_space, config)

        @override(Policy)
        def postprocess_trajectory(self,
                                   sample_batch,
                                   other_agent_batches=None,
                                   episode=None):
            if not postprocess_fn:
                return sample_batch
            return postprocess_fn(self, sample_batch, other_agent_batches,
                                  episode)

        @override(TFPolicy)
        def optimizer(self):
            if optimizer_fn:
                return optimizer_fn(self, self.config)
            else:
                return base.optimizer(self)

        @override(TFPolicy)
        def gradients(self, optimizer, loss):
            if gradients_fn:
                return gradients_fn(self, optimizer, loss)
            else:
                return base.gradients(self, optimizer, loss)

        @override(TFPolicy)
        def build_apply_op(self, optimizer, grads_and_vars):
            if apply_gradients_fn:
                return apply_gradients_fn(self, optimizer, grads_and_vars)
            else:
                return base.build_apply_op(self, optimizer, grads_and_vars)

        @override(TFPolicy)
        def extra_compute_action_fetches(self):
            return dict(
                base.extra_compute_action_fetches(self),
                **self._extra_action_fetches)

        @override(TFPolicy)
        def extra_compute_grad_fetches(self):
            if extra_learn_fetches_fn:
                # Auto-add an empty learner stats dict if needed.
                return dict({
                    LEARNER_STATS_KEY: {}
                }, **extra_learn_fetches_fn(self))
            else:
                return base.extra_compute_grad_fetches(self)

    def with_updates(**overrides):
        # Build a new policy class with some of the original build
        # arguments overridden.
        return build_tf_policy(**dict(original_kwargs, **overrides))

    def as_eager():
        # Build the eager-mode variant of this policy class.
        return eager_tf_policy.build_eager_tf_policy(**original_kwargs)

    policy_cls.with_updates = staticmethod(with_updates)
    policy_cls.as_eager = staticmethod(as_eager)
    policy_cls.__name__ = name
    policy_cls.__qualname__ = name
    return policy_cls