[rllib] Better example rnn envs (#5300)

Eric Liang 2019-07-28 14:07:18 -07:00 committed by GitHub
parent 1465a30ea9
commit 3bdd114282
4 changed files with 85 additions and 16 deletions


@@ -326,7 +326,10 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
/ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_keras_model.py --run=DQN --stop=50
docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
- /ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_keras_rnn_model.py --run=PPO --stop=50
+ /ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_keras_rnn_model.py --run=PPO --stop=50 --env=RepeatAfterMeEnv
+ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
+ /ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_keras_rnn_model.py --run=PPO --stop=50 --env=RepeatInitialEnv
docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
/ray/ci/suppress_output python /ray/python/ray/rllib/examples/parametric_action_cartpole.py --run=PG --stop=50


@@ -35,9 +35,11 @@ DEFAULT_CONFIG = with_common_config({
"lr": 5e-5,
# Learning rate schedule
"lr_schedule": None,
- # Share layers for value function
+ # Share layers for value function. If you set this to True, it's important
+ # to tune vf_loss_coeff.
"vf_share_layers": False,
- # Coefficient of the value function loss
+ # Coefficient of the value function loss. It's important to tune this if
+ # you set vf_share_layers: True
"vf_loss_coeff": 1.0,
# Coefficient of the entropy regularizer
"entropy_coeff": 0.0,


@@ -4,12 +4,15 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
+ from gym.spaces import Discrete
import numpy as np
+ import random
import argparse
import ray
from ray import tune
- from ray.rllib.examples.cartpole_lstm import CartPoleStatelessEnv
+ from ray.tune.registry import register_env
from ray.rllib.models import ModelCatalog
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.recurrent_tf_modelv2 import RecurrentTFModelV2
@@ -20,7 +23,8 @@ tf = try_import_tf()
parser = argparse.ArgumentParser()
parser.add_argument("--run", type=str, default="PPO")
parser.add_argument("--stop", type=int, default=200)
parser.add_argument("--env", type=str, default="RepeatAfterMeEnv")
parser.add_argument("--stop", type=int, default=90)
class MyKerasRNN(RecurrentTFModelV2):
@@ -55,14 +59,12 @@ class MyKerasRNN(RecurrentTFModelV2):
initial_state=[state_in_h, state_in_c])
# Postprocess LSTM output with another hidden layer and compute values
- dense2 = tf.keras.layers.Dense(
- hiddens_size, activation=tf.nn.relu, name="dense2")(lstm_out)
logits = tf.keras.layers.Dense(
self.num_outputs,
activation=tf.keras.activations.linear,
name="logits")(dense2)
name="logits")(lstm_out)
values = tf.keras.layers.Dense(
- 1, activation=None, name="values")(dense2)
+ 1, activation=None, name="values")(lstm_out)
# Create the RNN model
self.rnn_model = tf.keras.Model(
@@ -89,20 +91,82 @@
return tf.reshape(self._value_out, [-1])
class RepeatInitialEnv(gym.Env):
"""Simple env in which the policy learns to repeat the initial observation
seen at timestep 0."""
def __init__(self):
self.observation_space = Discrete(2)
self.action_space = Discrete(2)
self.token = None
self.num_steps = 0
def reset(self):
self.token = random.choice([0, 1])
self.num_steps = 0
return self.token
def step(self, action):
if action == self.token:
reward = 1
else:
reward = -1
self.num_steps += 1
done = self.num_steps > 100
return 0, reward, done, {}
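
Why RepeatInitialEnv needs memory: the informative token appears only in the reset observation, and every subsequent observation is 0, so a memoryless policy cannot beat chance. A quick sanity-check loop (illustrative only, assuming the class above is in scope):

    env = RepeatInitialEnv()
    first_obs = env.reset()  # the only informative observation in the episode
    done, total = False, 0
    while not done:
        # Always answer with the remembered initial token.
        _, reward, done, _ = env.step(first_obs)
        total += reward
    print("episode return:", total)  # +1 per step with perfect recall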
class RepeatAfterMeEnv(gym.Env):
"""Simple env in which the policy learns to repeat a previous observation
token after a given delay."""
def __init__(self, config):
self.observation_space = Discrete(2)
self.action_space = Discrete(2)
self.delay = config["repeat_delay"]
assert self.delay >= 1, "delay must be at least 1"
self.history = []
def reset(self):
self.history = [0] * self.delay
return self._next_obs()
def step(self, action):
if action == self.history[-(1 + self.delay)]:
reward = 1
else:
reward = -1
done = len(self.history) > 100
return self._next_obs(), reward, done, {}
def _next_obs(self):
token = random.choice([0, 1])
self.history.append(token)
return token
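
For RepeatAfterMeEnv, the oracle policy echoes the observation seen repeat_delay steps earlier; the zeros written into history at reset mean the first few correct answers are 0. A short rollout sketch (illustrative only, assuming the class above is in scope):

    env = RepeatAfterMeEnv({"repeat_delay": 2})
    seen = [0, 0, env.reset()]  # zero padding plus the first observed token
    done, total = False, 0
    while not done:
        # Repeat the token that was observed two steps before the current one.
        obs, reward, done, _ = env.step(seen[-3])
        seen.append(obs)
        total += reward
    print("episode return:", total)  # +1 every step for this oracle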
if __name__ == "__main__":
ray.init()
args = parser.parse_args()
ModelCatalog.register_custom_model("rnn", MyKerasRNN)
register_env("RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c))
register_env("RepeatInitialEnv", lambda _: RepeatInitialEnv())
tune.run(
args.run,
stop={"episode_reward_mean": args.stop},
config={
"env": CartPoleStatelessEnv,
"num_envs_per_worker": 4,
"num_sgd_iter": 3,
"vf_loss_coeff": 1e-4,
"env": args.env,
"env_config": {
"repeat_delay": 2,
},
"gamma": 0.9,
"num_workers": 0,
"num_envs_per_worker": 20,
"entropy_coeff": 0.001,
"num_sgd_iter": 5,
"vf_loss_coeff": 1e-5,
"model": {
"custom_model": "rnn",
"max_seq_len": 7,
"max_seq_len": 20,
},
})
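
For quick local iteration, the same setup can also be run without Tune by building the trainer directly. This is only a sketch: it assumes ray.init(), the custom-model registration, and the env registrations from the block above have already executed, and it uses the PPOTrainer class from ray.rllib.agents.ppo as of this Ray version.

    from ray.rllib.agents.ppo import PPOTrainer

    trainer = PPOTrainer(
        env="RepeatAfterMeEnv",
        config={
            "env_config": {"repeat_delay": 2},
            "gamma": 0.9,
            "num_workers": 0,
            "entropy_coeff": 0.001,
            "num_sgd_iter": 5,
            "vf_loss_coeff": 1e-5,
            "model": {"custom_model": "rnn", "max_seq_len": 20},
        })
    for _ in range(5):
        result = trainer.train()
        print("episode_reward_mean:", result["episode_reward_mean"])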


@@ -444,7 +444,7 @@ def test_actor_deletion(ray_start_regular):
def test_actor_deletion_with_gpus(shutdown_only):
- ray.init(num_cpus=1, num_gpus=1)
+ ray.init(num_cpus=1, num_gpus=1, object_store_memory=int(10**8))
# When an actor that uses a GPU exits, make sure that the GPU resources
# are released.
@@ -1740,7 +1740,7 @@ def test_nondeterministic_reconstruction_concurrent_forks(
@pytest.fixture
def setup_queue_actor():
- ray.init(num_cpus=1)
+ ray.init(num_cpus=1, object_store_memory=int(10**8))
@ray.remote
class Queue(object):