ray/rllib/examples/env/mbmpo_env.py

from gym.envs.classic_control import PendulumEnv, CartPoleEnv
import numpy as np

# MuJoCo may not be installed.
HalfCheetahEnv = HopperEnv = None

try:
    from gym.envs.mujoco import HalfCheetahEnv, HopperEnv
except Exception:
    pass


class CartPoleWrapper(CartPoleEnv):
    """Wrapper for the Cartpole-v0 environment.

    Adds an additional `reward` method for some model-based RL algos (e.g.
    MB-MPO).
    """

    def reward(self, obs, action, obs_next):
        # obs = batch * [pos, vel, angle, rotation_rate]
        x = obs_next[:, 0]
        theta = obs_next[:, 2]

        # 1.0 if we are still on, 0.0 if we are terminated due to bounds
        # (angular or x-axis) being breached.
        rew = 1.0 - ((x < -self.x_threshold) | (x > self.x_threshold) |
                     (theta < -self.theta_threshold_radians) |
                     (theta > self.theta_threshold_radians)).astype(np.float32)

        return rew


class PendulumWrapper(PendulumEnv):
    """Wrapper for the Pendulum-v1 environment.

    Adds an additional `reward` method for some model-based RL algos (e.g.
    MB-MPO).
    """

    def reward(self, obs, action, obs_next):
        # obs = [cos(theta), sin(theta), dtheta/dt]
        # To get the angle back from obs: atan2(sin(theta), cos(theta)).
        theta = np.arctan2(
            np.clip(obs[:, 1], -1.0, 1.0), np.clip(obs[:, 0], -1.0, 1.0))
        # Do everything in (B,) space (single theta-, action- and
        # reward values).
        a = np.clip(action, -self.max_torque, self.max_torque)[0]
        costs = self.angle_normalize(theta) ** 2 + \
            0.1 * obs[:, 2] ** 2 + 0.001 * (a ** 2)
        return -costs

    @staticmethod
    def angle_normalize(x):
        return (((x + np.pi) % (2 * np.pi)) - np.pi)


class HalfCheetahWrapper(HalfCheetahEnv or object):
    """Wrapper for the MuJoCo HalfCheetah-v2 environment.

    Adds an additional `reward` method for some model-based RL algos (e.g.
    MB-MPO).
    """

    def reward(self, obs, action, obs_next):
        if obs.ndim == 2 and action.ndim == 2:
            assert obs.shape == obs_next.shape
            forward_vel = obs_next[:, 8]
            ctrl_cost = 0.1 * np.sum(np.square(action), axis=1)
            reward = forward_vel - ctrl_cost
            return np.minimum(np.maximum(-1000.0, reward), 1000.0)
        else:
            forward_vel = obs_next[8]
            ctrl_cost = 0.1 * np.square(action).sum()
            reward = forward_vel - ctrl_cost
            return np.minimum(np.maximum(-1000.0, reward), 1000.0)


class HopperWrapper(HopperEnv or object):
    """Wrapper for the MuJoCo Hopper-v2 environment.

    Adds an additional `reward` method for some model-based RL algos (e.g.
    MB-MPO).
    """

    def reward(self, obs, action, obs_next):
        alive_bonus = 1.0
        assert obs.ndim == 2 and action.ndim == 2
        assert (obs.shape == obs_next.shape
                and action.shape[0] == obs.shape[0])
        vel = obs_next[:, 5]
        ctrl_cost = 1e-3 * np.sum(np.square(action), axis=1)
        reward = vel + alive_bonus - ctrl_cost
        return np.minimum(np.maximum(-1000.0, reward), 1000.0)


if __name__ == "__main__":
    env = PendulumWrapper()
    env.reset()
    for _ in range(100):
        env.step(env.action_space.sample())
        env.render()