ray/rllib/models/torch/torch_action_dist.py


import numpy as np
from ray.rllib.models.action_dist import ActionDistribution
from ray.rllib.utils.annotations import override
from ray.rllib.utils import try_import_torch
torch, nn = try_import_torch()
class TorchDistributionWrapper(ActionDistribution):
"""Wrapper class for torch.distributions."""
@override(ActionDistribution)
def __init__(self, inputs, model):
if not isinstance(inputs, torch.Tensor):
inputs = torch.Tensor(inputs)
super().__init__(inputs, model)
# Store the last sample here.
self.last_sample = None
@override(ActionDistribution)
def logp(self, actions):
return self.dist.log_prob(actions)
@override(ActionDistribution)
def entropy(self):
return self.dist.entropy()
@override(ActionDistribution)
def kl(self, other):
return torch.distributions.kl.kl_divergence(self.dist, other.dist)
@override(ActionDistribution)
def sample(self):
self.last_sample = self.dist.sample()
return self.last_sample
@override(ActionDistribution)
def sampled_action_logp(self):
assert self.last_sample is not None
return self.logp(self.last_sample)
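

# A minimal sketch (not part of the original file) of how this wrapper is
# meant to be extended: a subclass only has to set `self.dist` to a torch
# distribution object; `sample`, `logp`, `entropy` and `kl` above are then
# inherited. The Bernoulli choice and the logits below are made-up examples.
def _example_wrapper_subclass():
    class _ExampleBernoulli(TorchDistributionWrapper):
        def __init__(self, inputs, model=None):
            super().__init__(inputs, model)
            self.dist = torch.distributions.bernoulli.Bernoulli(
                logits=self.inputs)

    dist = _ExampleBernoulli(torch.tensor([[0.5, -1.0]]))
    action = dist.sample()  # One 0/1 sample per logit, shape [1, 2].
    return action, dist.sampled_action_logp(), dist.entropy()
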
class TorchCategorical(TorchDistributionWrapper):
"""Wrapper class for PyTorch Categorical distribution."""
@override(ActionDistribution)
def __init__(self, inputs, model=None, temperature=1.0):
if temperature != 1.0:
assert temperature > 0.0, \
"Categorical `temperature` must be > 0.0!"
inputs /= temperature
super().__init__(inputs, model)
self.dist = torch.distributions.categorical.Categorical(
logits=self.inputs)
@override(ActionDistribution)
def deterministic_sample(self):
self.last_sample = self.dist.probs.argmax(dim=1)
return self.last_sample
@staticmethod
@override(ActionDistribution)
def required_model_output_shape(action_space, model_config):
return action_space.n
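

# A minimal sketch (not part of the original file) of TorchCategorical usage;
# the logits and temperature values below are made up. A temperature < 1.0
# sharpens the distribution, > 1.0 flattens it, and `deterministic_sample`
# returns the argmax of the (temperature-scaled) probs.
def _example_torch_categorical():
    logits = torch.tensor([[1.0, 2.0, 0.5]])
    sharp = TorchCategorical(logits.clone(), model=None, temperature=0.1)
    flat = TorchCategorical(logits.clone(), model=None, temperature=10.0)
    greedy = sharp.deterministic_sample()  # -> tensor([1])
    return greedy, sharp.entropy(), flat.entropy()  # sharp entropy < flat
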
class TorchMultiCategorical(TorchDistributionWrapper):
"""MultiCategorical distribution for MultiDiscrete action spaces."""
@override(TorchDistributionWrapper)
def __init__(self, inputs, model, input_lens):
super().__init__(inputs, model)
# If input_lens is np.ndarray or list, force-make it a tuple.
inputs_split = self.inputs.split(tuple(input_lens), dim=1)
self.cats = [
torch.distributions.categorical.Categorical(logits=input_)
for input_ in inputs_split
]
@override(TorchDistributionWrapper)
def sample(self):
arr = [cat.sample() for cat in self.cats]
self.last_sample = torch.stack(arr, dim=1)
return self.last_sample
@override(ActionDistribution)
def deterministic_sample(self):
arr = [torch.argmax(cat.probs, -1) for cat in self.cats]
self.last_sample = torch.stack(arr, dim=1)
return self.last_sample
@override(TorchDistributionWrapper)
def logp(self, actions):
# If a tensor is provided, unstack it into a list.
if isinstance(actions, torch.Tensor):
actions = torch.unbind(actions, dim=1)
logps = torch.stack(
[cat.log_prob(act) for cat, act in zip(self.cats, actions)])
return torch.sum(logps, dim=0)
@override(ActionDistribution)
def multi_entropy(self):
return torch.stack([cat.entropy() for cat in self.cats], dim=1)
@override(TorchDistributionWrapper)
def entropy(self):
return torch.sum(self.multi_entropy(), dim=1)
@override(ActionDistribution)
def multi_kl(self, other):
return torch.stack(
[
torch.distributions.kl.kl_divergence(cat, oth_cat)
for cat, oth_cat in zip(self.cats, other.cats)
],
dim=1,
)
@override(TorchDistributionWrapper)
def kl(self, other):
return torch.sum(self.multi_kl(other), dim=1)
@staticmethod
@override(ActionDistribution)
def required_model_output_shape(action_space, model_config):
return np.sum(action_space.nvec)
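

# A minimal sketch (not part of the original file) of TorchMultiCategorical
# usage: a MultiDiscrete space with sub-spaces of size 2 and 3 needs
# 2 + 3 = 5 model outputs, and `input_lens=[2, 3]` tells the distribution
# how to split them. The batch size and logits below are made up.
def _example_torch_multi_categorical():
    flat_logits = torch.randn(4, 5)  # Batch of 4; 2 + 3 logits per row.
    dist = TorchMultiCategorical(flat_logits, model=None, input_lens=[2, 3])
    actions = dist.sample()  # One action per sub-space, shape [4, 2].
    return actions, dist.logp(actions), dist.entropy()
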
class TorchDiagGaussian(TorchDistributionWrapper):
"""Wrapper class for PyTorch Normal distribution."""
@override(ActionDistribution)
def __init__(self, inputs, model):
super().__init__(inputs, model)
mean, log_std = torch.chunk(inputs, 2, dim=1)
self.dist = torch.distributions.normal.Normal(mean, torch.exp(log_std))
@override(ActionDistribution)
def deterministic_sample(self):
self.last_sample = self.dist.mean
return self.last_sample
@override(TorchDistributionWrapper)
def logp(self, actions):
return super().logp(actions).sum(-1)
@override(TorchDistributionWrapper)
def entropy(self):
return super().entropy().sum(-1)
@override(TorchDistributionWrapper)
def kl(self, other):
return super().kl(other).sum(-1)
@staticmethod
@override(ActionDistribution)
def required_model_output_shape(action_space, model_config):
return np.prod(action_space.shape) * 2
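

# A minimal sketch (not part of the original file) of TorchDiagGaussian
# usage: for a 3-dim continuous action space the model emits 2 * 3 = 6
# outputs, interpreted as 3 means followed by 3 log-stds. Values below are
# made up.
def _example_torch_diag_gaussian():
    model_out = torch.cat(
        [torch.zeros(2, 3), torch.full((2, 3), -1.0)], dim=1)
    dist = TorchDiagGaussian(model_out, model=None)
    greedy = dist.deterministic_sample()  # The means: zeros, shape [2, 3].
    action = dist.sample()
    return greedy, action, dist.logp(action)  # logp summed over action dims.
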
class TorchDeterministic(TorchDistributionWrapper):
"""Action distribution that returns the input values directly.
This is similar to DiagGaussian with standard deviation zero (thus only
requiring the "mean" values as NN output).
"""
@override(ActionDistribution)
def deterministic_sample(self):
return self.inputs
@override(TorchDistributionWrapper)
def sampled_action_logp(self):
return 0.0
@override(TorchDistributionWrapper)
def sample(self):
return self.deterministic_sample()
@staticmethod
@override(ActionDistribution)
def required_model_output_shape(action_space, model_config):
return np.prod(action_space.shape)
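

# A minimal sketch (not part of the original file): TorchDeterministic simply
# passes the model output through, so `sample` and `deterministic_sample`
# coincide and the sampled-action log-prob is 0.0. Values below are made up.
def _example_torch_deterministic():
    actions = torch.tensor([[0.25, -0.75]])
    dist = TorchDeterministic(actions, model=None)
    assert torch.equal(dist.sample(), dist.deterministic_sample())
    return dist.sample(), dist.sampled_action_logp()  # -> (actions, 0.0)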