Mirror of https://github.com/vale981/ray (synced 2025-03-10 05:16:49 -04:00)

108 lines
4.6 KiB
Python
import numpy as np
from gym.spaces import Box
from scipy.stats import norm
from tensorflow.python.eager.context import eager_mode
import unittest

from ray.rllib.models.tf.tf_action_dist import Categorical, SquashedGaussian
from ray.rllib.utils import try_import_tf
from ray.rllib.utils.numpy import MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT
from ray.rllib.utils.test_utils import check

tf = try_import_tf()


class TestDistributions(unittest.TestCase):
    """Tests ActionDistribution classes."""

    def test_categorical(self):
        """Tests the Categorical ActionDistribution (tf only)."""
        num_samples = 100000
        logits = tf.placeholder(tf.float32, shape=(None, 10))
        z = 8 * (np.random.rand(10) - 0.5)
        data = np.tile(z, (num_samples, 1))
        c = Categorical(logits, {})  # dummy config dict
        sample_op = c.sample()
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        samples = sess.run(sample_op, feed_dict={logits: data})
        # Check that sampling frequencies roughly match the softmax
        # probabilities implied by the logits.
        counts = np.zeros(10)
        for sample in samples:
            counts[sample] += 1.0
        probs = np.exp(z) / np.sum(np.exp(z))
        self.assertTrue(np.sum(np.abs(probs - counts / num_samples)) <= 0.01)

    def test_squashed_gaussian(self):
        """Tests the SquashedGaussian ActionDistribution (tf-eager only)."""
        with eager_mode():
            input_space = Box(-1.0, 1.0, shape=(200, 10))
            low, high = -2.0, 1.0

            # Batch of size=n and deterministic.
            inputs = input_space.sample()
            means, _ = np.split(inputs, 2, axis=-1)
            squashed_distribution = SquashedGaussian(
                inputs, {}, low=low, high=high)
            expected = ((np.tanh(means) + 1.0) / 2.0) * (high - low) + low
            # Sample n times, expect always mean value (deterministic draw).
            out = squashed_distribution.deterministic_sample()
            check(out, expected)

            # Batch of size=n and non-deterministic -> expect roughly the mean.
            inputs = input_space.sample()
            means, log_stds = np.split(inputs, 2, axis=-1)
            squashed_distribution = SquashedGaussian(
                inputs, {}, low=low, high=high)
            expected = ((np.tanh(means) + 1.0) / 2.0) * (high - low) + low
            values = squashed_distribution.sample()
            self.assertTrue(np.max(values) < high)
            self.assertTrue(np.min(values) > low)

            check(np.mean(values), expected.mean(), decimals=1)

            # Test log-likelihood outputs.
            sampled_action_logp = squashed_distribution.sampled_action_logp()
            # Convert to parameters for distr.
            stds = np.exp(
                np.clip(log_stds, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT))
            # Unsquash values, then get log-llh from regular gaussian.
            unsquashed_values = np.arctanh((values - low) /
                                           (high - low) * 2.0 - 1.0)
            log_prob_unsquashed = \
                np.sum(np.log(norm.pdf(unsquashed_values, means, stds)), -1)
            # Correct for the tanh squashing (change-of-variables Jacobian).
            log_prob = log_prob_unsquashed - \
                np.sum(np.log(1 - np.tanh(unsquashed_values) ** 2),
                       axis=-1)
            check(np.mean(sampled_action_logp), np.mean(log_prob), rtol=0.01)

            # NN output.
            means = np.array([[0.1, 0.2, 0.3, 0.4, 50.0],
                              [-0.1, -0.2, -0.3, -0.4, -1.0]])
            log_stds = np.array([[0.8, -0.2, 0.3, -1.0, 2.0],
                                 [0.7, -0.3, 0.4, -0.9, 2.0]])
            squashed_distribution = SquashedGaussian(
                np.concatenate([means, log_stds], axis=-1), {},
                low=low,
                high=high)
            # Convert to parameters for distr.
            stds = np.exp(log_stds)
            # Values to get log-likelihoods for.
            values = np.array([[0.9, 0.2, 0.4, -0.1, -1.05],
                               [-0.9, -0.2, 0.4, -0.1, -1.05]])

            # Unsquash values, then get log-llh from regular gaussian.
            unsquashed_values = np.arctanh((values - low) /
                                           (high - low) * 2.0 - 1.0)
            log_prob_unsquashed = \
                np.sum(np.log(norm.pdf(unsquashed_values, means, stds)), -1)
            log_prob = log_prob_unsquashed - \
                np.sum(np.log(1 - np.tanh(unsquashed_values) ** 2),
                       axis=-1)

            out = squashed_distribution.logp(values)
            check(out, log_prob)


if __name__ == "__main__":
    unittest.main(verbosity=1)
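For reference, the two numpy log-likelihood checks above rest on the change-of-variables identity for a tanh-squashed Gaussian. As a sketch in notation introduced here (u denotes the pre-squash Gaussian variable; it is not a name used in the file): with u ~ N(mu, sigma) per dimension and a = low + (high - low) * (tanh(u) + 1) / 2, the exact log-density is

\log p(a) = \sum_i \Big[\log \mathcal{N}(u_i;\,\mu_i,\sigma_i) - \log\big(1 - \tanh^2(u_i)\big) - \log\tfrac{\mathrm{high}-\mathrm{low}}{2}\Big],
\qquad u_i = \operatorname{arctanh}\!\Big(\tfrac{2\,(a_i - \mathrm{low})}{\mathrm{high}-\mathrm{low}} - 1\Big).

The numpy reference in the test keeps only the Gaussian term and the tanh-Jacobian correction and drops the constant \log\tfrac{\mathrm{high}-\mathrm{low}}{2} per dimension, so the logp() and sampled_action_logp() outputs it is compared against presumably follow the same convention.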