# __quick_start_begin__
import gym
from ray.rllib.algorithms.ppo import PPO


# Define your problem using Python and OpenAI's gym API:
class SimpleCorridor(gym.Env):
    """Corridor in which an agent must learn to move right to reach the exit.

    ---------------------
    | S | 1 | 2 | 3 | G |   S=start; G=goal; corridor_length=5
    ---------------------

    Possible actions to choose from are: 0=left; 1=right.
    Observations are floats indicating the current field index, e.g. 0.0 for
    the starting position, 1.0 for the field next to the starting position, etc.
    Rewards are -0.1 for all steps, except when reaching the goal (+1.0).
    """

    def __init__(self, config):
        self.end_pos = config["corridor_length"]
        self.cur_pos = 0
        self.action_space = gym.spaces.Discrete(2)  # left and right
        self.observation_space = gym.spaces.Box(0.0, self.end_pos, shape=(1,))

    def reset(self):
        """Resets the episode and returns the initial observation of the new one."""
        self.cur_pos = 0
        # Return initial observation.
        return [self.cur_pos]

    def step(self, action):
        """Takes a single step in the episode given `action`.

        Returns:
            New observation, reward, done-flag, info-dict (empty).
        """
        # Walk left.
        if action == 0 and self.cur_pos > 0:
            self.cur_pos -= 1
        # Walk right.
        elif action == 1:
            self.cur_pos += 1
        # Set `done` flag when end of corridor (goal) reached.
        done = self.cur_pos >= self.end_pos
        # +1.0 when goal reached, otherwise -0.1.
        reward = 1.0 if done else -0.1
        return [self.cur_pos], reward, done, {}


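# Hedged aside (not part of the original snippet): a quick sanity check of the
# env class above, using random actions. Everything here is defined in this
# file or the standard gym API; the reflected random walk terminates with
# probability 1.
_check_env = SimpleCorridor({"corridor_length": 5})
_obs = _check_env.reset()
_done = False
while not _done:
    _obs, _rew, _done, _ = _check_env.step(_check_env.action_space.sample())
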
# Create an RLlib Algorithm instance.
algo = PPO(
    config={
        # Env class to use (here: our gym.Env sub-class from above).
        "env": SimpleCorridor,
        # Config dict to be passed to our custom env's constructor.
        "env_config": {
            # Use corridor with 20 fields (including S and G).
            "corridor_length": 20
        },
        # Parallelize environment rollouts.
        "num_workers": 3,
    }
)

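# Hedged aside (not part of the original snippet): newer RLlib releases also
# provide a builder-style config API. A rough equivalent of the dict config
# above (method names can vary between Ray versions) would look like:
#
#   from ray.rllib.algorithms.ppo import PPOConfig
#   algo = (
#       PPOConfig()
#       .environment(SimpleCorridor, env_config={"corridor_length": 20})
#       .rollouts(num_rollout_workers=3)
#       .build()
#   )
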
# Train for n iterations and report results (mean episode rewards).
# Since we have to move right at least 20 times in the env to reach the goal
# (`done` only becomes True once cur_pos >= 20) and each move gives us -0.1
# reward (except the last move at the end: +1.0), we can expect to reach an
# optimal episode reward of -0.1 * 19 + 1.0 = -0.9.
for i in range(5):
    results = algo.train()
    print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")

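# Hedged aside (not part of the original snippet): the trained state can be
# persisted via the Algorithm checkpoint API; the exact return type of
# `save()` differs between Ray versions (path string vs. checkpoint object).
checkpoint = algo.save()
print(f"Checkpoint saved: {checkpoint}")
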
# Perform inference (action computations) based on given env observations.
# Note that we are using a slightly different env here (corridor length 10
# instead of 20); however, this should still work as the agent has
# (hopefully) learned to "just always walk right!"
env = SimpleCorridor({"corridor_length": 10})
# Get the initial observation (should be: [0.0] for the starting position).
obs = env.reset()
done = False
total_reward = 0.0
# Play one episode.
while not done:
    # Compute a single action, given the current observation
    # from the environment.
    action = algo.compute_single_action(obs)
    # Apply the computed action in the environment.
    obs, reward, done, info = env.step(action)
    # Sum up rewards for reporting purposes.
    total_reward += reward
# Report results.
print(f"Played 1 episode; total-reward={total_reward}")

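# Hedged aside (not part of the original snippet): when finished, the
# algorithm's rollout workers can be shut down to free resources.
algo.stop()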
# __quick_start_end__