import gym
import numpy as np


class LookAndPush(gym.Env):
    """Memory-requiring env: the best sequence of actions depends on previous states.

    Optimal behavior:
    0) a=0 -> observe the next state (s'), which is the "hidden" state.
       If a=1 here, the hidden state is not observed.
    1) a=1 to always jump to s=2 (no matter what the previous state was).
    2) a=1 to move to s=3.
    3) a=1 to move to s=4.
    4) a=0 OR 1, depending on the s' observed after 0): +10 reward and done;
       otherwise: -10 reward and done.
    """

    def __init__(self):
        # Two actions: 0 = look (observe), 1 = push (advance).
        self.action_space = gym.spaces.Discrete(2)
        # Five observable states: s = 0..4.
        self.observation_space = gym.spaces.Discrete(5)
        self._state = None
        self._case = None  # The hidden 0/1 case, drawn at reset().

    def reset(self):
        self._state = 2
        # Randomly pick the hidden case; observable via a=0 at s=2.
        self._case = np.random.choice(2)
        return self._state

    def step(self, action):
        assert self.action_space.contains(action)

        if self._state == 4:
            # Terminal decision: only a=1 with a hidden case of 1 pays off.
            if action and self._case:
                return self._state, 10., True, {}
            else:
                return self._state, -10., True, {}
        else:
            if action:
                # Pushing advances the state (from s=0 it jumps back to s=2).
                if self._state == 0:
                    self._state = 2
                else:
                    self._state += 1
            elif self._state == 2:
                # Looking at s=2 reveals the hidden case as the next state.
                self._state = self._case

        return self._state, -1, False, {}

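
# A minimal sketch of the rollout described in the docstring above (the
# helper name is illustrative, not part of the env's API): look once at s=2
# to reveal the hidden case, then push until the terminal decision at s=4.
def _demo_optimal_rollout():
    env = LookAndPush()
    env.reset()                         # always starts at s=2
    obs, reward, done, _ = env.step(0)  # a=0 at s=2: obs is the hidden case
    case, total_reward = obs, reward
    while not done:
        obs, reward, done, _ = env.step(1)  # a=1: push toward s=4
        total_reward += reward
    return case, total_reward
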

class OneHot(gym.Wrapper):
    """Wrapper that presents a discrete env's observations as one-hot vectors."""

    def __init__(self, env):
        super(OneHot, self).__init__(env)
        self.observation_space = gym.spaces.Box(
            0., 1., (env.observation_space.n, ))

    def reset(self, **kwargs):
        obs = self.env.reset(**kwargs)
        return self._encode_obs(obs)

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        return self._encode_obs(obs), reward, done, info

    def _encode_obs(self, obs):
        # One-hot: all zeros except a 1.0 at the observed index (np.ones here
        # would yield an uninformative all-ones vector).
        new_obs = np.zeros(self.env.observation_space.n)
        new_obs[obs] = 1.0
        return new_obs
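

# A usage sketch (illustrative): wrap LookAndPush so observations come out
# as one-hot vectors instead of integers.
if __name__ == "__main__":
    env = OneHot(LookAndPush())
    obs = env.reset()
    print(obs)  # s=2 after reset -> [0. 0. 1. 0. 0.]
    obs, reward, done, info = env.step(1)
    print(obs, reward, done)  # s=3 -> [0. 0. 0. 1. 0.], reward -1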