[rllib] Rename algorithms (#890)

* rename algorithms

* fix

* fix jenkins test

* fix documentation

* fix
Philipp Moritz 2017-08-29 16:56:42 -07:00 committed by Robert Nishihara
parent e1831792f8
commit 164a8f368e
30 changed files with 83 additions and 88 deletions

View file

@ -8,14 +8,14 @@ You can run training with
::
python train.py --env CartPole-v0 --alg PolicyGradient
python train.py --env CartPole-v0 --alg PPO
The available algorithms are:
- ``PolicyGradient`` is a proximal variant of
- ``PPO`` is a proximal variant of
`TRPO <https://arxiv.org/abs/1502.05477>`__.
- ``EvolutionStrategies`` is described in `this
- ``ES`` is described in `this
paper <https://arxiv.org/abs/1703.03864>`__. Our implementation
borrows code from
`here <https://github.com/openai/evolution-strategies-starter>`__.
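For reference, a minimal sketch of driving one of the renamed agents from Python rather than through train.py, assuming the constructor signature (env_name, config, upload_dir=None) and the train() method shown in the diffs below; the config override is illustrative:

import ray
import ray.rllib.ppo as ppo  # renamed from ray.rllib.policy_gradient in this commit

ray.init()

config = ppo.DEFAULT_CONFIG.copy()
config["num_workers"] = 1  # illustrative override of one default setting

agent = ppo.PPOAgent("CartPole-v0", config)
result = agent.train()  # one training iteration; returns a TrainingResult (see common.py)
print(result)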

View file

@ -1,3 +1,3 @@
from ray.rllib.a3c.a3c import A3C, DEFAULT_CONFIG
from ray.rllib.a3c.a3c import A3CAgent, DEFAULT_CONFIG
__all__ = ["A3C", "DEFAULT_CONFIG"]
__all__ = ["A3CAgent", "DEFAULT_CONFIG"]

View file

@ -11,7 +11,7 @@ import os
import ray
from ray.rllib.a3c.runner import RunnerThread, process_rollout
from ray.rllib.a3c.envs import create_env
from ray.rllib.common import Algorithm, TrainingResult
from ray.rllib.common import Agent, TrainingResult
from ray.rllib.a3c.shared_model_lstm import SharedModelLSTM
@ -87,11 +87,11 @@ class Runner(object):
return gradient, info
class A3C(Algorithm):
class A3CAgent(Agent):
def __init__(self, env_name, config,
policy_cls=SharedModelLSTM, upload_dir=None):
config.update({"alg": "A3C"})
Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
Agent.__init__(self, env_name, config, upload_dir=upload_dir)
self.env = create_env(env_name)
self.policy = policy_cls(
self.env.observation_space.shape, self.env.action_space)

View file

@ -54,11 +54,11 @@ TrainingResult = namedtuple("TrainingResult", [
])
class Algorithm(object):
"""All RLlib algorithms extend this base class.
class Agent(object):
"""All RLlib agents extend this base class.
Algorithm objects retain internal model state between calls to train(), so
you should create a new algorithm instance for each training session.
Agent objects retain internal model state between calls to train(), so
you should create a new agent instance for each training session.
Attributes:
env_name (str): Name of the OpenAI gym environment to train against.
@ -69,7 +69,7 @@ class Algorithm(object):
"""
def __init__(self, env_name, config, upload_dir=None):
"""Initialize an RLLib algorithm.
"""Initialize an RLLib agent.
Args:
env_name (str): The name of the OpenAI gym environment to use.
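To illustrate the base class contract, here is a hypothetical agent that follows the same pattern the concrete agents below use (record the algorithm name in the config, call Agent.__init__, then build the state that train() updates); MyAgent and "MyAlg" are made-up names:

from ray.rllib.common import Agent, TrainingResult


class MyAgent(Agent):
    def __init__(self, env_name, config, upload_dir=None):
        config.update({"alg": "MyAlg"})  # each agent records which algorithm it runs
        Agent.__init__(self, env_name, config, upload_dir=upload_dir)
        # construct the policy, workers, etc. here, as A3CAgent/DQNAgent/PPOAgent do

    def train(self):
        # run one training iteration and return a TrainingResult namedtuple
        raise NotImplementedError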

View file

@ -2,6 +2,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.dqn.dqn import DQN, DEFAULT_CONFIG
from ray.rllib.dqn.dqn import DQNAgent, DEFAULT_CONFIG
__all__ = ["DQN", "DEFAULT_CONFIG"]
__all__ = ["DQNAgent", "DEFAULT_CONFIG"]

View file

@ -10,7 +10,7 @@ import pickle
import os
import tensorflow as tf
from ray.rllib.common import Algorithm, TrainingResult
from ray.rllib.common import Agent, TrainingResult
from ray.rllib.dqn import logger, models
from ray.rllib.dqn.common.atari_wrappers_deprecated \
import wrap_dqn, ScaledFloatFrame
@ -102,11 +102,11 @@ DEFAULT_CONFIG = dict(
num_cpu=16)
class DQN(Algorithm):
class DQNAgent(Agent):
def __init__(self, env_name, config, upload_dir=None):
config.update({"alg": "DQN"})
Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
Agent.__init__(self, env_name, config, upload_dir=upload_dir)
with tf.Graph().as_default():
self._init()

View file

@ -0,0 +1,3 @@
from ray.rllib.es.es import (ESAgent, DEFAULT_CONFIG)
__all__ = ["ESAgent", "DEFAULT_CONFIG"]

View file

@ -15,14 +15,14 @@ import time
import tensorflow as tf
import ray
from ray.rllib.common import Algorithm, TrainingResult
from ray.rllib.common import Agent, TrainingResult
from ray.rllib.models import ModelCatalog
from ray.rllib.evolution_strategies import optimizers
from ray.rllib.evolution_strategies import policies
from ray.rllib.evolution_strategies import tabular_logger as tlogger
from ray.rllib.evolution_strategies import tf_util
from ray.rllib.evolution_strategies import utils
from ray.rllib.es import optimizers
from ray.rllib.es import policies
from ray.rllib.es import tabular_logger as tlogger
from ray.rllib.es import tf_util
from ray.rllib.es import utils
Result = namedtuple("Result", [
@ -160,11 +160,11 @@ class Worker(object):
ob_count=task_ob_stat.count)
class EvolutionStrategies(Algorithm):
class ESAgent(Agent):
def __init__(self, env_name, config, upload_dir=None):
config.update({"alg": "EvolutionStrategies"})
Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
Agent.__init__(self, env_name, config, upload_dir=upload_dir)
with tf.Graph().as_default():
self._init()

View file

@ -13,7 +13,7 @@ import h5py
import numpy as np
import tensorflow as tf
from ray.rllib.evolution_strategies import tf_util as U
from ray.rllib.es import tf_util as U
from ray.rllib.models import ModelCatalog
logger = logging.getLogger(__name__)

View file

@ -1,4 +0,0 @@
from ray.rllib.evolution_strategies.evolution_strategies import (
EvolutionStrategies, DEFAULT_CONFIG)
__all__ = ["EvolutionStrategies", "DEFAULT_CONFIG"]

View file

@ -39,7 +39,7 @@ class LocalSyncParallelOptimizer(object):
processed.
build_loss: Function that takes the specified inputs and returns an
object with a 'loss' property that is a scalar Tensor. For example,
ray.rllib.policy_gradient.ProximalPolicyLoss.
ray.rllib.ppo.ProximalPolicyLoss.
logdir: Directory to place debugging output in.
"""

View file

@ -1,4 +0,0 @@
from ray.rllib.policy_gradient.policy_gradient import (
PolicyGradient, DEFAULT_CONFIG)
__all__ = ["PolicyGradient", "DEFAULT_CONFIG"]

View file

@ -0,0 +1,3 @@
from ray.rllib.ppo.ppo import (PPOAgent, DEFAULT_CONFIG)
__all__ = ["PPOAgent", "DEFAULT_CONFIG"]

View file

@ -11,10 +11,10 @@ import tensorflow as tf
from tensorflow.python import debug as tf_debug
import ray
from ray.rllib.common import Algorithm, TrainingResult
from ray.rllib.policy_gradient.agent import Agent, RemoteAgent
from ray.rllib.policy_gradient.rollout import collect_samples
from ray.rllib.policy_gradient.utils import shuffle
from ray.rllib.common import Agent, TrainingResult
from ray.rllib.ppo.runner import Runner, RemoteRunner
from ray.rllib.ppo.rollout import collect_samples
from ray.rllib.ppo.utils import shuffle
DEFAULT_CONFIG = {
@ -75,11 +75,11 @@ DEFAULT_CONFIG = {
}
class PolicyGradient(Algorithm):
class PPOAgent(Agent):
def __init__(self, env_name, config, upload_dir=None):
config.update({"alg": "PolicyGradient"})
config.update({"alg": "PPO"})
Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
Agent.__init__(self, env_name, config, upload_dir=upload_dir)
with tf.Graph().as_default():
self._init()
@ -88,9 +88,9 @@ class PolicyGradient(Algorithm):
self.global_step = 0
self.j = 0
self.kl_coeff = self.config["kl_coeff"]
self.model = Agent(self.env_name, 1, self.config, self.logdir, False)
self.model = Runner(self.env_name, 1, self.config, self.logdir, False)
self.agents = [
RemoteAgent.remote(
RemoteRunner.remote(
self.env_name, 1, self.config, self.logdir, True)
for _ in range(self.config["num_workers"])]
self.start_time = time.time()
@ -121,10 +121,10 @@ class PolicyGradient(Algorithm):
if self.file_writer:
traj_stats = tf.Summary(value=[
tf.Summary.Value(
tag="policy_gradient/rollouts/mean_reward",
tag="ppo/rollouts/mean_reward",
simple_value=total_reward),
tf.Summary.Value(
tag="policy_gradient/rollouts/traj_len_mean",
tag="ppo/rollouts/traj_len_mean",
simple_value=traj_len_mean)])
self.file_writer.add_summary(traj_stats, self.global_step)
self.global_step += 1
@ -191,7 +191,7 @@ class PolicyGradient(Algorithm):
values = []
if i == config["num_sgd_iter"] - 1:
metric_prefix = "policy_gradient/sgd/final_iter/"
metric_prefix = "ppo/sgd/final_iter/"
values.append(tf.Summary.Value(
tag=metric_prefix + "kl_coeff",
simple_value=self.kl_coeff))

View file

@ -5,8 +5,8 @@ from __future__ import print_function
import numpy as np
import ray
from ray.rllib.policy_gradient.filter import NoFilter
from ray.rllib.policy_gradient.utils import concatenate
from ray.rllib.ppo.filter import NoFilter
from ray.rllib.ppo.utils import concatenate
def rollouts(policy, env, horizon, observation_filter=NoFilter(),

View file

@ -14,12 +14,12 @@ import ray
from ray.rllib.parallel import LocalSyncParallelOptimizer
from ray.rllib.models import ModelCatalog
from ray.rllib.policy_gradient.env import BatchedEnv
from ray.rllib.policy_gradient.loss import ProximalPolicyLoss
from ray.rllib.policy_gradient.filter import MeanStdFilter
from ray.rllib.policy_gradient.rollout import (
from ray.rllib.ppo.env import BatchedEnv
from ray.rllib.ppo.loss import ProximalPolicyLoss
from ray.rllib.ppo.filter import MeanStdFilter
from ray.rllib.ppo.rollout import (
rollouts, add_return_values, add_advantage_values)
from ray.rllib.policy_gradient.utils import flatten, concatenate
from ray.rllib.ppo.utils import flatten, concatenate
# TODO(pcm): Make sure that both observation_filter and reward_filter
# are correctly handled, i.e. (a) the values are accumulated across
@ -28,9 +28,9 @@ from ray.rllib.policy_gradient.utils import flatten, concatenate
# as part of the checkpoint so training can resume properly.
class Agent(object):
class Runner(object):
"""
Agent class that holds the simulator environment and the policy.
Runner class that holds the simulator environment and the policy.
Initializes the tensorflow graphs for both training and evaluation.
One common policy graph is initialized on '/cpu:0' and holds all the shared
@ -244,4 +244,4 @@ class Agent(object):
return concatenate(trajectories), total_rewards, trajectory_lengths
RemoteAgent = ray.remote(Agent)
RemoteRunner = ray.remote(Runner)
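The last line above uses Ray's actor API: ray.remote(SomeClass) produces a class whose instances run in their own worker process, with the constructor and methods invoked via .remote() and results fetched with ray.get(). A minimal standalone sketch of that pattern (Counter is an invented example class):

import ray


class Counter(object):
    def __init__(self):
        self.value = 0

    def increment(self):
        self.value += 1
        return self.value


RemoteCounter = ray.remote(Counter)

ray.init()
counter = RemoteCounter.remote()        # starts the actor
object_id = counter.increment.remote()  # method calls return object IDs immediately
print(ray.get(object_id))               # -> 1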

View file

@ -8,7 +8,7 @@ import tensorflow as tf
from numpy.testing import assert_allclose
from ray.rllib.models.action_dist import Categorical
from ray.rllib.policy_gradient.utils import flatten, concatenate
from ray.rllib.ppo.utils import flatten, concatenate
# TODO(ekl): move to rllib/models dir

View file

@ -1,14 +1,14 @@
#!/bin/bash
python train.py --env Hopper-v1 --config '{"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
python train.py --env Hopper-v1 --config '{"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}' --alg PPO --upload-dir s3://bucketname/
python train.py --env CartPole-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
python train.py --env CartPole-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}' --alg PPO --upload-dir s3://bucketname/
python train.py --env Walker2d-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
python train.py --env Walker2d-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64}' --alg PPO --upload-dir s3://bucketname/
python train.py --env Humanoid-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "use_gae": false}' --alg PolicyGradient --upload-dir s3://bucketname/
python train.py --env Humanoid-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "use_gae": false}' --alg PPO --upload-dir s3://bucketname/
python train.py --env Humanoid-v1 --config '{"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "write_logs": false}' --alg PolicyGradient --upload-dir s3://bucketname/
python train.py --env Humanoid-v1 --config '{"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "write_logs": false}' --alg PPO --upload-dir s3://bucketname/
python train.py --env PongNoFrameskip-v0 --alg DQN --upload-dir s3://bucketname/

View file

@ -7,20 +7,18 @@ from __future__ import print_function
import ray
import random
from ray.rllib.dqn import DQN, DEFAULT_CONFIG as DQN_CONFIG
from ray.rllib.evolution_strategies import (
EvolutionStrategies, DEFAULT_CONFIG as ES_CONFIG)
from ray.rllib.policy_gradient import (
PolicyGradient, DEFAULT_CONFIG as PG_CONFIG)
from ray.rllib.a3c import A3C, DEFAULT_CONFIG as A3C_CONFIG
from ray.rllib.dqn import (DQNAgent, DEFAULT_CONFIG as DQN_CONFIG)
from ray.rllib.es import (ESAgent, DEFAULT_CONFIG as ES_CONFIG)
from ray.rllib.ppo import (PPOAgent, DEFAULT_CONFIG as PG_CONFIG)
from ray.rllib.a3c import (A3CAgent, DEFAULT_CONFIG as A3C_CONFIG)
ray.init()
for (cls, default_config) in [
(DQN, DQN_CONFIG),
(DQNAgent, DQN_CONFIG),
# TODO(ekl) this fails with multiple ES instances in a process
(EvolutionStrategies, ES_CONFIG),
(PolicyGradient, PG_CONFIG),
(A3C, A3C_CONFIG)]:
(ESAgent, ES_CONFIG),
(PPOAgent, PG_CONFIG),
(A3CAgent, A3C_CONFIG)]:
config = default_config.copy()
config["num_sgd_iter"] = 5
config["episodes_per_batch"] = 100

View file

@ -10,8 +10,8 @@ import os
import sys
import ray
import ray.rllib.policy_gradient as pg
import ray.rllib.evolution_strategies as es
import ray.rllib.ppo as ppo
import ray.rllib.es as es
import ray.rllib.dqn as dqn
import ray.rllib.a3c as a3c
@ -42,30 +42,29 @@ if __name__ == "__main__":
ray.init(redis_address=args.redis_address)
env_name = args.env
if args.alg == "PolicyGradient":
config = pg.DEFAULT_CONFIG.copy()
if args.alg == "PPO":
config = ppo.DEFAULT_CONFIG.copy()
config.update(json_config)
alg = pg.PolicyGradient(
alg = ppo.PPOAgent(
env_name, config, upload_dir=args.upload_dir)
elif args.alg == "EvolutionStrategies":
elif args.alg == "ES":
config = es.DEFAULT_CONFIG.copy()
config.update(json_config)
alg = es.EvolutionStrategies(
alg = es.ESAgent(
env_name, config, upload_dir=args.upload_dir)
elif args.alg == "DQN":
config = dqn.DEFAULT_CONFIG.copy()
config.update(json_config)
alg = dqn.DQN(
alg = dqn.DQNAgent(
env_name, config, upload_dir=args.upload_dir)
elif args.alg == "A3C":
config = a3c.DEFAULT_CONFIG.copy()
config.update(json_config)
alg = a3c.A3C(
alg = a3c.A3CAgent(
env_name, config, upload_dir=args.upload_dir)
else:
assert False, ("Unknown algorithm, check --alg argument. Valid "
"choices are PolicyGradient, EvolutionStrategies, "
"DQN and A3C.")
"choices are PPO, ES, DQN and A3C.")
result_logger = ray.rllib.common.RLLibLogger(
os.path.join(alg.logdir, "result.json"))

View file

@ -66,21 +66,21 @@ docker run --shm-size=10G --memory=10G $DOCKER_SHA \
docker run --shm-size=10G --memory=10G $DOCKER_SHA \
python /ray/python/ray/rllib/train.py \
--env CartPole-v1 \
--alg PolicyGradient \
--alg PPO \
--num-iterations 2 \
--config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "sgd_stepsize": 1e-4, "sgd_batchsize": 64, "timesteps_per_batch": 2000, "num_workers": 1}'
docker run --shm-size=10G --memory=10G $DOCKER_SHA \
python /ray/python/ray/rllib/train.py \
--env CartPole-v1 \
--alg PolicyGradient \
--alg PPO \
--num-iterations 2 \
--config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "sgd_stepsize": 1e-4, "sgd_batchsize": 64, "timesteps_per_batch": 2000, "num_workers": 1, "use_gae": false}'
docker run --shm-size=10G --memory=10G $DOCKER_SHA \
python /ray/python/ray/rllib/train.py \
--env Pendulum-v0 \
--alg EvolutionStrategies \
--alg ES \
--num-iterations 2 \
--config '{"stepsize": 0.01}'