[rllib] Better example rnn envs (#5300)

parent 1465a30ea9
commit 3bdd114282

4 changed files with 85 additions and 16 deletions

@@ -326,7 +326,10 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_keras_model.py --run=DQN --stop=50

 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
-    /ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_keras_rnn_model.py --run=PPO --stop=50
+    /ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_keras_rnn_model.py --run=PPO --stop=50 --env=RepeatAfterMeEnv
+
+docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
+    /ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_keras_rnn_model.py --run=PPO --stop=50 --env=RepeatInitialEnv

 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/ci/suppress_output python /ray/python/ray/rllib/examples/parametric_action_cartpole.py --run=PG --stop=50

@@ -35,9 +35,11 @@ DEFAULT_CONFIG = with_common_config({
     "lr": 5e-5,
     # Learning rate schedule
     "lr_schedule": None,
-    # Share layers for value function
+    # Share layers for value function. If you set this to True, it's important
+    # to tune vf_loss_coeff.
     "vf_share_layers": False,
-    # Coefficient of the value function loss
+    # Coefficient of the value function loss. It's important to tune this if
+    # you set vf_share_layers: True
     "vf_loss_coeff": 1.0,
     # Coefficient of the entropy regularizer
     "entropy_coeff": 0.0,
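
Note on the updated comments: with vf_share_layers enabled, the value-function loss flows through the shared layers, so vf_loss_coeff usually has to be scaled down. A minimal Tune sketch of such a sweep (the config keys are the ones documented above; the environment and values are illustrative assumptions, not part of this commit):

from ray import tune

# Hypothetical sweep: share value-function layers and tune vf_loss_coeff,
# as the updated config comments advise. Env and values are assumptions.
tune.run(
    "PPO",
    stop={"episode_reward_mean": 150},
    config={
        "env": "CartPole-v0",
        "vf_share_layers": True,
        "vf_loss_coeff": tune.grid_search([1e-5, 1e-4, 1e-2, 1.0]),
    })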

@@ -4,12 +4,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import gym
+from gym.spaces import Discrete
 import numpy as np
+import random
 import argparse

 import ray
 from ray import tune
-from ray.rllib.examples.cartpole_lstm import CartPoleStatelessEnv
+from ray.tune.registry import register_env
 from ray.rllib.models import ModelCatalog
 from ray.rllib.models.modelv2 import ModelV2
 from ray.rllib.models.tf.recurrent_tf_modelv2 import RecurrentTFModelV2

@@ -20,7 +23,8 @@ tf = try_import_tf()

 parser = argparse.ArgumentParser()
 parser.add_argument("--run", type=str, default="PPO")
-parser.add_argument("--stop", type=int, default=200)
+parser.add_argument("--env", type=str, default="RepeatAfterMeEnv")
+parser.add_argument("--stop", type=int, default=90)


 class MyKerasRNN(RecurrentTFModelV2):

@@ -55,14 +59,12 @@ class MyKerasRNN(RecurrentTFModelV2):
                 initial_state=[state_in_h, state_in_c])

         # Postprocess LSTM output with another hidden layer and compute values
-        dense2 = tf.keras.layers.Dense(
-            hiddens_size, activation=tf.nn.relu, name="dense2")(lstm_out)
         logits = tf.keras.layers.Dense(
             self.num_outputs,
             activation=tf.keras.activations.linear,
-            name="logits")(dense2)
+            name="logits")(lstm_out)
         values = tf.keras.layers.Dense(
-            1, activation=None, name="values")(dense2)
+            1, activation=None, name="values")(lstm_out)

         # Create the RNN model
         self.rnn_model = tf.keras.Model(
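
The change above drops the intermediate dense2 layer, so the policy logits and the value output now branch directly off the LSTM output. A standalone Keras sketch of that two-head layout (sizes and observation shape are illustrative assumptions, not taken from this file):

import numpy as np
import tensorflow as tf

num_outputs = 2  # assumption: matches a Discrete(2) action space
cell_size = 64   # assumption: illustrative LSTM size
obs_dim = 4      # assumption: illustrative observation size

inputs = tf.keras.layers.Input(shape=(None, obs_dim), name="obs")
lstm_out = tf.keras.layers.LSTM(
    cell_size, return_sequences=True, name="lstm")(inputs)
# Both heads read the LSTM output directly; no intermediate hidden layer.
logits = tf.keras.layers.Dense(num_outputs, activation=None, name="logits")(lstm_out)
values = tf.keras.layers.Dense(1, activation=None, name="values")(lstm_out)
model = tf.keras.Model(inputs, [logits, values])

out_logits, out_values = model.predict(np.zeros((1, 5, obs_dim), dtype=np.float32))
print(out_logits.shape, out_values.shape)  # (1, 5, 2) (1, 5, 1)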

@@ -89,20 +91,82 @@ class MyKerasRNN(RecurrentTFModelV2):
         return tf.reshape(self._value_out, [-1])


+class RepeatInitialEnv(gym.Env):
+    """Simple env in which the policy learns to repeat the initial observation
+    seen at timestep 0."""
+
+    def __init__(self):
+        self.observation_space = Discrete(2)
+        self.action_space = Discrete(2)
+        self.token = None
+        self.num_steps = 0
+
+    def reset(self):
+        self.token = random.choice([0, 1])
+        self.num_steps = 0
+        return self.token
+
+    def step(self, action):
+        if action == self.token:
+            reward = 1
+        else:
+            reward = -1
+        self.num_steps += 1
+        done = self.num_steps > 100
+        return 0, reward, done, {}
+
+
+class RepeatAfterMeEnv(gym.Env):
+    """Simple env in which the policy learns to repeat a previous observation
+    token after a given delay."""
+
+    def __init__(self, config):
+        self.observation_space = Discrete(2)
+        self.action_space = Discrete(2)
+        self.delay = config["repeat_delay"]
+        assert self.delay >= 1, "delay must be at least 1"
+        self.history = []
+
+    def reset(self):
+        self.history = [0] * self.delay
+        return self._next_obs()
+
+    def step(self, action):
+        if action == self.history[-(1 + self.delay)]:
+            reward = 1
+        else:
+            reward = -1
+        done = len(self.history) > 100
+        return self._next_obs(), reward, done, {}
+
+    def _next_obs(self):
+        token = random.choice([0, 1])
+        self.history.append(token)
+        return token
+
+
 if __name__ == "__main__":
     ray.init()
     args = parser.parse_args()
     ModelCatalog.register_custom_model("rnn", MyKerasRNN)
+    register_env("RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c))
+    register_env("RepeatInitialEnv", lambda _: RepeatInitialEnv())
     tune.run(
         args.run,
         stop={"episode_reward_mean": args.stop},
         config={
-            "env": CartPoleStatelessEnv,
-            "num_envs_per_worker": 4,
-            "num_sgd_iter": 3,
-            "vf_loss_coeff": 1e-4,
+            "env": args.env,
+            "env_config": {
+                "repeat_delay": 2,
+            },
+            "gamma": 0.9,
+            "num_workers": 0,
+            "num_envs_per_worker": 20,
+            "entropy_coeff": 0.001,
+            "num_sgd_iter": 5,
+            "vf_loss_coeff": 1e-5,
             "model": {
                 "custom_model": "rnn",
-                "max_seq_len": 7,
+                "max_seq_len": 20,
             },
         })
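
A quick sanity check of the new environments: RepeatAfterMeEnv rewards the agent for echoing the observation it saw repeat_delay steps earlier, so a memoryless random policy averages roughly zero return while a recurrent policy can solve it. A standalone rollout sketch (assumes the class definitions above are in scope):

env = RepeatAfterMeEnv({"repeat_delay": 2})
obs = env.reset()
done, total_reward = False, 0
while not done:
    action = env.action_space.sample()  # random actions: ~0 expected return
    obs, reward, done, _ = env.step(action)
    total_reward += reward
print("random-policy return:", total_reward)

The CI commands at the top of the diff exercise both environments through the new --env flag, e.g. --env=RepeatAfterMeEnv and --env=RepeatInitialEnv.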

@@ -444,7 +444,7 @@ def test_actor_deletion(ray_start_regular):


 def test_actor_deletion_with_gpus(shutdown_only):
-    ray.init(num_cpus=1, num_gpus=1)
+    ray.init(num_cpus=1, num_gpus=1, object_store_memory=int(10**8))

     # When an actor that uses a GPU exits, make sure that the GPU resources
     # are released.

@@ -1740,7 +1740,7 @@ def test_nondeterministic_reconstruction_concurrent_forks(

 @pytest.fixture
 def setup_queue_actor():
-    ray.init(num_cpus=1)
+    ray.init(num_cpus=1, object_store_memory=int(10**8))

     @ray.remote
     class Queue(object):
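
Both test changes cap the object store at int(10**8) bytes (about 100 MB) so each test's local Ray instance stays small. Equivalent standalone call, shown only for illustration:

import ray

# ~100 MB object store; same keyword argument the updated tests pass to ray.init.
ray.init(num_cpus=1, object_store_memory=int(10**8))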