# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for V-trace.
|
|
|
|
For details and theory see:
|
|
|
|
"IMPALA: Scalable Distributed Deep-RL with
|
|
Importance Weighted Actor-Learner Architectures"
|
|
by Espeholt, Soyer, Munos et al.
|
|
"""
|
|
|
|
from gym.spaces import Box
|
|
import numpy as np
|
|
import unittest
|
|
|
|
from ray.rllib.algorithms.impala import vtrace_tf as vtrace_tf
|
|
from ray.rllib.algorithms.impala import vtrace_torch as vtrace_torch
|
|
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
|
from ray.rllib.utils.numpy import softmax
|
|
from ray.rllib.utils.test_utils import check, framework_iterator
|
|
|
|
tf1, tf, tfv = try_import_tf()
|
|
torch, nn = try_import_torch()
|
|
|
|
|
|
def _ground_truth_calculation(
    vtrace,
    discounts,
    log_rhos,
    rewards,
    values,
    bootstrap_value,
    clip_rho_threshold,
    clip_pg_rho_threshold,
):
    """Calculates the ground truth for V-trace in Python/Numpy."""
    vs = []
    seq_len = len(discounts)
    rhos = np.exp(log_rhos)
    cs = np.minimum(rhos, 1.0)
    clipped_rhos = rhos
    if clip_rho_threshold:
        clipped_rhos = np.minimum(rhos, clip_rho_threshold)
    clipped_pg_rhos = rhos
    if clip_pg_rho_threshold:
        clipped_pg_rhos = np.minimum(rhos, clip_pg_rho_threshold)
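    # Purely illustrative example of the clipping above: with
    # rhos = [0.5, 2.0, 5.0] and clip_rho_threshold = 3.7, clipped_rhos
    # becomes [0.5, 2.0, 3.7], while cs = min(rhos, 1.0) = [0.5, 1.0, 1.0].
    # Only the rho terms use the configurable thresholds; the c_i truncation
    # level is fixed at 1.0 in this ground-truth implementation.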

    # This is a very inefficient way to calculate the V-trace ground truth.
    # We calculate it this way because it is close to the mathematical
    # notation of V-trace:
    # v_s = V(x_s)
    #       + \sum^{T-1}_{t=s} \gamma^{t-s}
    #         * \prod_{i=s}^{t-1} c_i
    #         * \rho_t (r_t + \gamma V(x_{t+1}) - V(x_t))
    # Note that when we take the product over c_i, we write `s:t`, as the
    # paper's notation is inclusive of `t-1`, but Python slicing is exclusive.
    # Also note that np.prod([]) == 1.
    values_t_plus_1 = np.concatenate([values[1:], bootstrap_value[None, :]], axis=0)
    for s in range(seq_len):
        v_s = np.copy(values[s])  # Very important copy.
        for t in range(s, seq_len):
            v_s += (
                np.prod(discounts[s:t], axis=0)
                * np.prod(cs[s:t], axis=0)
                * clipped_rhos[t]
                * (rewards[t] + discounts[t] * values_t_plus_1[t] - values[t])
            )
        vs.append(v_s)
    vs = np.stack(vs, axis=0)
    pg_advantages = clipped_pg_rhos * (
        rewards
        + discounts * np.concatenate([vs[1:], bootstrap_value[None, :]], axis=0)
        - values
    )

    return vtrace.VTraceReturns(vs=vs, pg_advantages=pg_advantages)


class LogProbsFromLogitsAndActionsTest(unittest.TestCase):
    def test_log_probs_from_logits_and_actions(self):
        """Tests log_probs_from_logits_and_actions."""
        seq_len = 7
        num_actions = 3
        batch_size = 4

        for fw, sess in framework_iterator(frameworks=("torch", "tf"), session=True):
            vtrace = vtrace_tf if fw != "torch" else vtrace_torch
            policy_logits = Box(
                -1.0, 1.0, (seq_len, batch_size, num_actions), np.float32
            ).sample()
            actions = np.random.randint(
                0, num_actions - 1, size=(seq_len, batch_size), dtype=np.int32
            )

            if fw == "torch":
                action_log_probs_tensor = vtrace.log_probs_from_logits_and_actions(
                    torch.from_numpy(policy_logits), torch.from_numpy(actions)
                )
            else:
                action_log_probs_tensor = vtrace.log_probs_from_logits_and_actions(
                    policy_logits, actions
                )

            # Ground Truth
            # Using broadcasting to create a mask that indexes action logits
            action_index_mask = actions[..., None] == np.arange(num_actions)

            def index_with_mask(array, mask):
                return array[mask].reshape(*array.shape[:-1])
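
            # For example (illustrative only): if actions[t, b] == 2, the mask
            # row over the action axis is [False, False, True], so indexing
            # np.log(softmax(policy_logits)) with it keeps only the log-prob
            # of action 2, reshaped back to [T, B].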

            # Note: Normally log(softmax) is not a good idea because it's not
            # numerically stable. However, in this test we have well-behaved
            # values.
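            # (A numerically stable alternative would be the log-softmax
            # identity: logits - logsumexp(logits, axis=-1, keepdims=True).)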
            ground_truth_v = index_with_mask(
                np.log(softmax(policy_logits)), action_index_mask
            )

            if sess:
                action_log_probs_tensor = sess.run(action_log_probs_tensor)
            check(action_log_probs_tensor, ground_truth_v)


class VtraceTest(unittest.TestCase):
    def test_vtrace(self):
        """Tests V-trace against ground truth data calculated in python."""
        seq_len = 5
        batch_size = 10

        # Create log_rhos such that rho will span from near-zero to above the
        # clipping thresholds. In particular, calculate log_rhos in
        # [-2.5, 2.5), so that rho is in approx [0.08, 12.2).
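        # (This follows from rho = exp(log_rho): exp(-2.5) ~= 0.082 and
        # exp(2.5) ~= 12.18.)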
        space_w_time = Box(-1.0, 1.0, (seq_len, batch_size), np.float32)
        space_only_batch = Box(-1.0, 1.0, (batch_size,), np.float32)
        log_rhos = space_w_time.sample() / (batch_size * seq_len)
        log_rhos = 5 * (log_rhos - 0.5)  # [0.0, 1.0) -> [-2.5, 2.5).
        values = {
            "log_rhos": log_rhos,
            # T, B where B_i: [0.9 / (i+1)] * T
            "discounts": np.array(
                [[0.9 / (b + 1) for b in range(batch_size)] for _ in range(seq_len)]
            ),
            "rewards": space_w_time.sample(),
            "values": space_w_time.sample() / batch_size,
            "bootstrap_value": space_only_batch.sample() + 1.0,
            "clip_rho_threshold": 3.7,
            "clip_pg_rho_threshold": 2.2,
        }
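        # Shape note: log_rhos, discounts, rewards and values are all
        # [T, B] = [5, 10]; bootstrap_value is [B] = [10].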

        for fw, sess in framework_iterator(frameworks=("torch", "tf"), session=True):
            vtrace = vtrace_tf if fw != "torch" else vtrace_torch
            output = vtrace.from_importance_weights(**values)
            if sess:
                output = sess.run(output)

            ground_truth_v = _ground_truth_calculation(vtrace, **values)
            check(output, ground_truth_v)

    def test_vtrace_from_logits(self):
        """Tests V-trace calculated from logits."""
        seq_len = 5
        batch_size = 15
        num_actions = 3
        clip_rho_threshold = None  # No clipping.
        clip_pg_rho_threshold = None  # No clipping.
        space = Box(-1.0, 1.0, (seq_len, batch_size, num_actions))
        action_space = Box(
            0,
            num_actions - 1,
            (
                seq_len,
                batch_size,
            ),
            dtype=np.int32,
        )
        space_w_time = Box(
            -1.0,
            1.0,
            (
                seq_len,
                batch_size,
            ),
        )
        space_only_batch = Box(-1.0, 1.0, (batch_size,))

        for fw, sess in framework_iterator(frameworks=("torch", "tf"), session=True):
            vtrace = vtrace_tf if fw != "torch" else vtrace_torch

            if fw == "tf":
                # Intentionally leaving shapes unspecified to test if V-trace
                # can deal with that.
                inputs_ = {
                    # T, B, NUM_ACTIONS
                    "behaviour_policy_logits": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None, None]
                    ),
                    # T, B, NUM_ACTIONS
                    "target_policy_logits": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None, None]
                    ),
                    "actions": tf1.placeholder(dtype=tf.int32, shape=[None, None]),
                    "discounts": tf1.placeholder(dtype=tf.float32, shape=[None, None]),
                    "rewards": tf1.placeholder(dtype=tf.float32, shape=[None, None]),
                    "values": tf1.placeholder(dtype=tf.float32, shape=[None, None]),
                    "bootstrap_value": tf1.placeholder(dtype=tf.float32, shape=[None]),
                }
            else:
                inputs_ = {
                    # T, B, NUM_ACTIONS
                    "behaviour_policy_logits": space.sample(),
                    # T, B, NUM_ACTIONS
                    "target_policy_logits": space.sample(),
                    "actions": action_space.sample(),
                    "discounts": space_w_time.sample(),
                    "rewards": space_w_time.sample(),
                    "values": space_w_time.sample(),
                    "bootstrap_value": space_only_batch.sample(),
                }
            from_logits_output = vtrace.from_logits(
                clip_rho_threshold=clip_rho_threshold,
                clip_pg_rho_threshold=clip_pg_rho_threshold,
                **inputs_
            )

            if fw != "torch":
                target_log_probs = vtrace.log_probs_from_logits_and_actions(
                    inputs_["target_policy_logits"], inputs_["actions"]
                )
                behaviour_log_probs = vtrace.log_probs_from_logits_and_actions(
                    inputs_["behaviour_policy_logits"], inputs_["actions"]
                )
            else:
                target_log_probs = vtrace.log_probs_from_logits_and_actions(
                    torch.from_numpy(inputs_["target_policy_logits"]),
                    torch.from_numpy(inputs_["actions"]),
                )
                behaviour_log_probs = vtrace.log_probs_from_logits_and_actions(
                    torch.from_numpy(inputs_["behaviour_policy_logits"]),
                    torch.from_numpy(inputs_["actions"]),
                )
            log_rhos = target_log_probs - behaviour_log_probs
            ground_truth = (log_rhos, behaviour_log_probs, target_log_probs)
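            # log_rhos are the log importance-sampling ratios
            # log(pi_target(a|x) / pi_behaviour(a|x)) for the sampled actions.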

            if sess:
                values = {
                    "behaviour_policy_logits": space.sample(),
                    "target_policy_logits": space.sample(),
                    "actions": action_space.sample(),
                    "discounts": space_w_time.sample(),
                    "rewards": space_w_time.sample(),
                    "values": space_w_time.sample() / batch_size,
                    "bootstrap_value": space_only_batch.sample() + 1.0,
                }
                feed_dict = {inputs_[k]: v for k, v in values.items()}
                from_logits_output = sess.run(from_logits_output, feed_dict=feed_dict)
                log_rhos, behaviour_log_probs, target_log_probs = sess.run(
                    ground_truth, feed_dict=feed_dict
                )

                # Calculate V-trace using the ground truth logits.
                from_iw = vtrace.from_importance_weights(
                    log_rhos=log_rhos,
                    discounts=values["discounts"],
                    rewards=values["rewards"],
                    values=values["values"],
                    bootstrap_value=values["bootstrap_value"],
                    clip_rho_threshold=clip_rho_threshold,
                    clip_pg_rho_threshold=clip_pg_rho_threshold,
                )
                from_iw = sess.run(from_iw)
            else:
                from_iw = vtrace.from_importance_weights(
                    log_rhos=log_rhos,
                    discounts=inputs_["discounts"],
                    rewards=inputs_["rewards"],
                    values=inputs_["values"],
                    bootstrap_value=inputs_["bootstrap_value"],
                    clip_rho_threshold=clip_rho_threshold,
                    clip_pg_rho_threshold=clip_pg_rho_threshold,
                )

            check(from_iw.vs, from_logits_output.vs)
            check(from_iw.pg_advantages, from_logits_output.pg_advantages)
            check(behaviour_log_probs, from_logits_output.behaviour_action_log_probs)
            check(target_log_probs, from_logits_output.target_action_log_probs)
            check(log_rhos, from_logits_output.log_rhos)

    def test_higher_rank_inputs_for_importance_weights(self):
        """Checks support for additional dimensions in inputs."""
        for fw in framework_iterator(frameworks=("torch", "tf"), session=True):
            vtrace = vtrace_tf if fw != "torch" else vtrace_torch
            if fw == "tf":
                inputs_ = {
                    "log_rhos": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None, 1]
                    ),
                    "discounts": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None, 1]
                    ),
                    "rewards": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None, 42]
                    ),
                    "values": tf1.placeholder(dtype=tf.float32, shape=[None, None, 42]),
                    "bootstrap_value": tf1.placeholder(
                        dtype=tf.float32, shape=[None, 42]
                    ),
                }
            else:
                inputs_ = {
                    "log_rhos": Box(-1.0, 1.0, (8, 10, 1)).sample(),
                    "discounts": Box(-1.0, 1.0, (8, 10, 1)).sample(),
                    "rewards": Box(-1.0, 1.0, (8, 10, 42)).sample(),
                    "values": Box(-1.0, 1.0, (8, 10, 42)).sample(),
                    "bootstrap_value": Box(-1.0, 1.0, (10, 42)).sample(),
                }
            output = vtrace.from_importance_weights(**inputs_)
            check(int(output.vs.shape[-1]), 42)

    def test_inconsistent_rank_inputs_for_importance_weights(self):
        """Test one of many possible errors in shape of inputs."""
        for fw in framework_iterator(frameworks=("torch", "tf"), session=True):
            vtrace = vtrace_tf if fw != "torch" else vtrace_torch
            if fw == "tf":
                inputs_ = {
                    "log_rhos": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None, 1]
                    ),
                    "discounts": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None, 1]
                    ),
                    "rewards": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None, 42]
                    ),
                    "values": tf1.placeholder(dtype=tf.float32, shape=[None, None, 42]),
                    # Should be [None, 42].
                    "bootstrap_value": tf1.placeholder(dtype=tf.float32, shape=[None]),
                }
            else:
                inputs_ = {
                    "log_rhos": Box(-1.0, 1.0, (7, 15, 1)).sample(),
                    "discounts": Box(-1.0, 1.0, (7, 15, 1)).sample(),
                    "rewards": Box(-1.0, 1.0, (7, 15, 42)).sample(),
                    "values": Box(-1.0, 1.0, (7, 15, 42)).sample(),
                    # Should be [15, 42].
                    "bootstrap_value": Box(-1.0, 1.0, (7,)).sample(),
                }
            with self.assertRaisesRegex(
                (ValueError, AssertionError), "must have rank 2"
            ):
                vtrace.from_importance_weights(**inputs_)


if __name__ == "__main__":
    tf.test.main()