From 164a8f368e4e07cc8610354c431f256510825511 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Tue, 29 Aug 2017 16:56:42 -0700 Subject: [PATCH] [rllib] Rename algorithms (#890) * rename algorithms * fix * fix jenkins test * fix documentation * fix --- python/ray/rllib/README.rst | 6 ++--- python/ray/rllib/a3c/__init__.py | 4 ++-- python/ray/rllib/a3c/a3c.py | 6 ++--- python/ray/rllib/common.py | 10 ++++---- python/ray/rllib/dqn/__init__.py | 4 ++-- python/ray/rllib/dqn/dqn.py | 6 ++--- python/ray/rllib/es/__init__.py | 3 +++ .../evolution_strategies.py => es/es.py} | 16 ++++++------- .../optimizers.py | 0 .../{evolution_strategies => es}/policies.py | 2 +- .../tabular_logger.py | 0 .../{evolution_strategies => es}/tf_util.py | 0 .../{evolution_strategies => es}/utils.py | 0 .../rllib/{evolution_strategies => es}/viz.py | 0 .../rllib/evolution_strategies/__init__.py | 4 ---- python/ray/rllib/parallel.py | 2 +- python/ray/rllib/policy_gradient/__init__.py | 4 ---- python/ray/rllib/ppo/__init__.py | 3 +++ .../ray/rllib/{policy_gradient => ppo}/env.py | 0 .../rllib/{policy_gradient => ppo}/filter.py | 0 .../rllib/{policy_gradient => ppo}/loss.py | 0 .../policy_gradient.py => ppo/ppo.py} | 24 +++++++++---------- .../rllib/{policy_gradient => ppo}/rollout.py | 4 ++-- .../agent.py => ppo/runner.py} | 16 ++++++------- .../{policy_gradient => ppo}/test/test.py | 2 +- .../rllib/{policy_gradient => ppo}/utils.py | 0 python/ray/rllib/test/test.sh | 10 ++++---- .../ray/rllib/test/test_checkpoint_restore.py | 18 +++++++------- python/ray/rllib/train.py | 21 ++++++++-------- test/jenkins_tests/run_multi_node_tests.sh | 6 ++--- 30 files changed, 83 insertions(+), 88 deletions(-) create mode 100644 python/ray/rllib/es/__init__.py rename python/ray/rllib/{evolution_strategies/evolution_strategies.py => es/es.py} (96%) rename python/ray/rllib/{evolution_strategies => es}/optimizers.py (100%) rename python/ray/rllib/{evolution_strategies => es}/policies.py (99%) rename python/ray/rllib/{evolution_strategies => es}/tabular_logger.py (100%) rename python/ray/rllib/{evolution_strategies => es}/tf_util.py (100%) rename python/ray/rllib/{evolution_strategies => es}/utils.py (100%) rename python/ray/rllib/{evolution_strategies => es}/viz.py (100%) delete mode 100644 python/ray/rllib/evolution_strategies/__init__.py delete mode 100644 python/ray/rllib/policy_gradient/__init__.py create mode 100644 python/ray/rllib/ppo/__init__.py rename python/ray/rllib/{policy_gradient => ppo}/env.py (100%) rename python/ray/rllib/{policy_gradient => ppo}/filter.py (100%) rename python/ray/rllib/{policy_gradient => ppo}/loss.py (100%) rename python/ray/rllib/{policy_gradient/policy_gradient.py => ppo/ppo.py} (93%) rename python/ray/rllib/{policy_gradient => ppo}/rollout.py (97%) rename python/ray/rllib/{policy_gradient/agent.py => ppo/runner.py} (96%) rename python/ray/rllib/{policy_gradient => ppo}/test/test.py (97%) rename python/ray/rllib/{policy_gradient => ppo}/utils.py (100%) diff --git a/python/ray/rllib/README.rst b/python/ray/rllib/README.rst index f15d48897..4834aac03 100644 --- a/python/ray/rllib/README.rst +++ b/python/ray/rllib/README.rst @@ -8,14 +8,14 @@ You can run training with :: - python train.py --env CartPole-v0 --alg PolicyGradient + python train.py --env CartPole-v0 --alg PPO The available algorithms are: -- ``PolicyGradient`` is a proximal variant of +- ``PPO`` is a proximal variant of `TRPO `__. -- ``EvolutionStrategies`` is decribed in `this +- ``ES`` is decribed in `this paper `__. 
Our implementation borrows code from `here `__. diff --git a/python/ray/rllib/a3c/__init__.py b/python/ray/rllib/a3c/__init__.py index 6df6b2cc6..2d9aaede4 100644 --- a/python/ray/rllib/a3c/__init__.py +++ b/python/ray/rllib/a3c/__init__.py @@ -1,3 +1,3 @@ -from ray.rllib.a3c.a3c import A3C, DEFAULT_CONFIG +from ray.rllib.a3c.a3c import A3CAgent, DEFAULT_CONFIG -__all__ = ["A3C", "DEFAULT_CONFIG"] +__all__ = ["A3CAgent", "DEFAULT_CONFIG"] diff --git a/python/ray/rllib/a3c/a3c.py b/python/ray/rllib/a3c/a3c.py index ed791ba86..f82dea64a 100644 --- a/python/ray/rllib/a3c/a3c.py +++ b/python/ray/rllib/a3c/a3c.py @@ -11,7 +11,7 @@ import os import ray from ray.rllib.a3c.runner import RunnerThread, process_rollout from ray.rllib.a3c.envs import create_env -from ray.rllib.common import Algorithm, TrainingResult +from ray.rllib.common import Agent, TrainingResult from ray.rllib.a3c.shared_model_lstm import SharedModelLSTM @@ -87,11 +87,11 @@ class Runner(object): return gradient, info -class A3C(Algorithm): +class A3CAgent(Agent): def __init__(self, env_name, config, policy_cls=SharedModelLSTM, upload_dir=None): config.update({"alg": "A3C"}) - Algorithm.__init__(self, env_name, config, upload_dir=upload_dir) + Agent.__init__(self, env_name, config, upload_dir=upload_dir) self.env = create_env(env_name) self.policy = policy_cls( self.env.observation_space.shape, self.env.action_space) diff --git a/python/ray/rllib/common.py b/python/ray/rllib/common.py index 88edb8c0a..6434d87f0 100644 --- a/python/ray/rllib/common.py +++ b/python/ray/rllib/common.py @@ -54,11 +54,11 @@ TrainingResult = namedtuple("TrainingResult", [ ]) -class Algorithm(object): - """All RLlib algorithms extend this base class. +class Agent(object): + """All RLlib agents extend this base class. - Algorithm objects retain internal model state between calls to train(), so - you should create a new algorithm instance for each training session. + Agent objects retain internal model state between calls to train(), so + you should create a new agent instance for each training session. Attributes: env_name (str): Name of the OpenAI gym environment to train against. @@ -69,7 +69,7 @@ class Algorithm(object): """ def __init__(self, env_name, config, upload_dir=None): - """Initialize an RLLib algorithm. + """Initialize an RLLib agent. Args: env_name (str): The name of the OpenAI gym environment to use. 
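The hunks above cover the user-visible half of the rename: the README now advertises ``PPO`` and ``ES``, and the ``Algorithm`` base class in ``common.py`` becomes ``Agent`` while keeping the same ``(env_name, config, upload_dir=None)`` constructor and ``train()`` contract. A minimal usage sketch under those assumptions follows; the environment name and the single training call are illustrative only and are not part of this patch.

::

    # Hypothetical post-rename user code (sketch). Before this patch the import
    # was `from ray.rllib.a3c import A3C` and the class subclassed Algorithm.
    import ray
    from ray.rllib.a3c import A3CAgent, DEFAULT_CONFIG

    ray.init()
    config = DEFAULT_CONFIG.copy()
    # A3CAgent(env_name, config) builds the environment and policy internally.
    agent = A3CAgent("PongDeterministic-v4", config)
    result = agent.train()  # one iteration; agents keep state between calls
    print(result)
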
diff --git a/python/ray/rllib/dqn/__init__.py b/python/ray/rllib/dqn/__init__.py index 6640753d2..d42995b4e 100644 --- a/python/ray/rllib/dqn/__init__.py +++ b/python/ray/rllib/dqn/__init__.py @@ -2,6 +2,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from ray.rllib.dqn.dqn import DQN, DEFAULT_CONFIG +from ray.rllib.dqn.dqn import DQNAgent, DEFAULT_CONFIG -__all__ = ["DQN", "DEFAULT_CONFIG"] +__all__ = ["DQNAgent", "DEFAULT_CONFIG"] diff --git a/python/ray/rllib/dqn/dqn.py b/python/ray/rllib/dqn/dqn.py index c6e43598a..dbed542dc 100644 --- a/python/ray/rllib/dqn/dqn.py +++ b/python/ray/rllib/dqn/dqn.py @@ -10,7 +10,7 @@ import pickle import os import tensorflow as tf -from ray.rllib.common import Algorithm, TrainingResult +from ray.rllib.common import Agent, TrainingResult from ray.rllib.dqn import logger, models from ray.rllib.dqn.common.atari_wrappers_deprecated \ import wrap_dqn, ScaledFloatFrame @@ -102,11 +102,11 @@ DEFAULT_CONFIG = dict( num_cpu=16) -class DQN(Algorithm): +class DQNAgent(Agent): def __init__(self, env_name, config, upload_dir=None): config.update({"alg": "DQN"}) - Algorithm.__init__(self, env_name, config, upload_dir=upload_dir) + Agent.__init__(self, env_name, config, upload_dir=upload_dir) with tf.Graph().as_default(): self._init() diff --git a/python/ray/rllib/es/__init__.py b/python/ray/rllib/es/__init__.py new file mode 100644 index 000000000..b459494a9 --- /dev/null +++ b/python/ray/rllib/es/__init__.py @@ -0,0 +1,3 @@ +from ray.rllib.es.es import (ESAgent, DEFAULT_CONFIG) + +__all__ = ["ESAgent", "DEFAULT_CONFIG"] diff --git a/python/ray/rllib/evolution_strategies/evolution_strategies.py b/python/ray/rllib/es/es.py similarity index 96% rename from python/ray/rllib/evolution_strategies/evolution_strategies.py rename to python/ray/rllib/es/es.py index ea14c29ba..4052f6ed0 100644 --- a/python/ray/rllib/evolution_strategies/evolution_strategies.py +++ b/python/ray/rllib/es/es.py @@ -15,14 +15,14 @@ import time import tensorflow as tf import ray -from ray.rllib.common import Algorithm, TrainingResult +from ray.rllib.common import Agent, TrainingResult from ray.rllib.models import ModelCatalog -from ray.rllib.evolution_strategies import optimizers -from ray.rllib.evolution_strategies import policies -from ray.rllib.evolution_strategies import tabular_logger as tlogger -from ray.rllib.evolution_strategies import tf_util -from ray.rllib.evolution_strategies import utils +from ray.rllib.es import optimizers +from ray.rllib.es import policies +from ray.rllib.es import tabular_logger as tlogger +from ray.rllib.es import tf_util +from ray.rllib.es import utils Result = namedtuple("Result", [ @@ -160,11 +160,11 @@ class Worker(object): ob_count=task_ob_stat.count) -class EvolutionStrategies(Algorithm): +class ESAgent(Agent): def __init__(self, env_name, config, upload_dir=None): config.update({"alg": "EvolutionStrategies"}) - Algorithm.__init__(self, env_name, config, upload_dir=upload_dir) + Agent.__init__(self, env_name, config, upload_dir=upload_dir) with tf.Graph().as_default(): self._init() diff --git a/python/ray/rllib/evolution_strategies/optimizers.py b/python/ray/rllib/es/optimizers.py similarity index 100% rename from python/ray/rllib/evolution_strategies/optimizers.py rename to python/ray/rllib/es/optimizers.py diff --git a/python/ray/rllib/evolution_strategies/policies.py b/python/ray/rllib/es/policies.py similarity index 99% rename from python/ray/rllib/evolution_strategies/policies.py rename 
to python/ray/rllib/es/policies.py index bafdcd85c..617d23a7d 100644 --- a/python/ray/rllib/evolution_strategies/policies.py +++ b/python/ray/rllib/es/policies.py @@ -13,7 +13,7 @@ import h5py import numpy as np import tensorflow as tf -from ray.rllib.evolution_strategies import tf_util as U +from ray.rllib.es import tf_util as U from ray.rllib.models import ModelCatalog logger = logging.getLogger(__name__) diff --git a/python/ray/rllib/evolution_strategies/tabular_logger.py b/python/ray/rllib/es/tabular_logger.py similarity index 100% rename from python/ray/rllib/evolution_strategies/tabular_logger.py rename to python/ray/rllib/es/tabular_logger.py diff --git a/python/ray/rllib/evolution_strategies/tf_util.py b/python/ray/rllib/es/tf_util.py similarity index 100% rename from python/ray/rllib/evolution_strategies/tf_util.py rename to python/ray/rllib/es/tf_util.py diff --git a/python/ray/rllib/evolution_strategies/utils.py b/python/ray/rllib/es/utils.py similarity index 100% rename from python/ray/rllib/evolution_strategies/utils.py rename to python/ray/rllib/es/utils.py diff --git a/python/ray/rllib/evolution_strategies/viz.py b/python/ray/rllib/es/viz.py similarity index 100% rename from python/ray/rllib/evolution_strategies/viz.py rename to python/ray/rllib/es/viz.py diff --git a/python/ray/rllib/evolution_strategies/__init__.py b/python/ray/rllib/evolution_strategies/__init__.py deleted file mode 100644 index 6b064796e..000000000 --- a/python/ray/rllib/evolution_strategies/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from ray.rllib.evolution_strategies.evolution_strategies import ( - EvolutionStrategies, DEFAULT_CONFIG) - -__all__ = ["EvolutionStrategies", "DEFAULT_CONFIG"] diff --git a/python/ray/rllib/parallel.py b/python/ray/rllib/parallel.py index 3aee4c170..5deff3c12 100644 --- a/python/ray/rllib/parallel.py +++ b/python/ray/rllib/parallel.py @@ -39,7 +39,7 @@ class LocalSyncParallelOptimizer(object): processed. build_loss: Function that takes the specified inputs and returns an object with a 'loss' property that is a scalar Tensor. For example, - ray.rllib.policy_gradient.ProximalPolicyLoss. + ray.rllib.ppo.ProximalPolicyLoss. logdir: Directory to place debugging output in. 
""" diff --git a/python/ray/rllib/policy_gradient/__init__.py b/python/ray/rllib/policy_gradient/__init__.py deleted file mode 100644 index bcd558423..000000000 --- a/python/ray/rllib/policy_gradient/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from ray.rllib.policy_gradient.policy_gradient import ( - PolicyGradient, DEFAULT_CONFIG) - -__all__ = ["PolicyGradient", "DEFAULT_CONFIG"] diff --git a/python/ray/rllib/ppo/__init__.py b/python/ray/rllib/ppo/__init__.py new file mode 100644 index 000000000..c039f1248 --- /dev/null +++ b/python/ray/rllib/ppo/__init__.py @@ -0,0 +1,3 @@ +from ray.rllib.ppo.ppo import (PPOAgent, DEFAULT_CONFIG) + +__all__ = ["PPOAgent", "DEFAULT_CONFIG"] diff --git a/python/ray/rllib/policy_gradient/env.py b/python/ray/rllib/ppo/env.py similarity index 100% rename from python/ray/rllib/policy_gradient/env.py rename to python/ray/rllib/ppo/env.py diff --git a/python/ray/rllib/policy_gradient/filter.py b/python/ray/rllib/ppo/filter.py similarity index 100% rename from python/ray/rllib/policy_gradient/filter.py rename to python/ray/rllib/ppo/filter.py diff --git a/python/ray/rllib/policy_gradient/loss.py b/python/ray/rllib/ppo/loss.py similarity index 100% rename from python/ray/rllib/policy_gradient/loss.py rename to python/ray/rllib/ppo/loss.py diff --git a/python/ray/rllib/policy_gradient/policy_gradient.py b/python/ray/rllib/ppo/ppo.py similarity index 93% rename from python/ray/rllib/policy_gradient/policy_gradient.py rename to python/ray/rllib/ppo/ppo.py index ec7fce4cf..f41d6eba7 100644 --- a/python/ray/rllib/policy_gradient/policy_gradient.py +++ b/python/ray/rllib/ppo/ppo.py @@ -11,10 +11,10 @@ import tensorflow as tf from tensorflow.python import debug as tf_debug import ray -from ray.rllib.common import Algorithm, TrainingResult -from ray.rllib.policy_gradient.agent import Agent, RemoteAgent -from ray.rllib.policy_gradient.rollout import collect_samples -from ray.rllib.policy_gradient.utils import shuffle +from ray.rllib.common import Agent, TrainingResult +from ray.rllib.ppo.runner import Runner, RemoteRunner +from ray.rllib.ppo.rollout import collect_samples +from ray.rllib.ppo.utils import shuffle DEFAULT_CONFIG = { @@ -75,11 +75,11 @@ DEFAULT_CONFIG = { } -class PolicyGradient(Algorithm): +class PPOAgent(Agent): def __init__(self, env_name, config, upload_dir=None): - config.update({"alg": "PolicyGradient"}) + config.update({"alg": "PPO"}) - Algorithm.__init__(self, env_name, config, upload_dir=upload_dir) + Agent.__init__(self, env_name, config, upload_dir=upload_dir) with tf.Graph().as_default(): self._init() @@ -88,9 +88,9 @@ class PolicyGradient(Algorithm): self.global_step = 0 self.j = 0 self.kl_coeff = self.config["kl_coeff"] - self.model = Agent(self.env_name, 1, self.config, self.logdir, False) + self.model = Runner(self.env_name, 1, self.config, self.logdir, False) self.agents = [ - RemoteAgent.remote( + RemoteRunner.remote( self.env_name, 1, self.config, self.logdir, True) for _ in range(self.config["num_workers"])] self.start_time = time.time() @@ -121,10 +121,10 @@ class PolicyGradient(Algorithm): if self.file_writer: traj_stats = tf.Summary(value=[ tf.Summary.Value( - tag="policy_gradient/rollouts/mean_reward", + tag="ppo/rollouts/mean_reward", simple_value=total_reward), tf.Summary.Value( - tag="policy_gradient/rollouts/traj_len_mean", + tag="ppo/rollouts/traj_len_mean", simple_value=traj_len_mean)]) self.file_writer.add_summary(traj_stats, self.global_step) self.global_step += 1 @@ -191,7 +191,7 @@ class PolicyGradient(Algorithm): values = [] 
if i == config["num_sgd_iter"] - 1: - metric_prefix = "policy_gradient/sgd/final_iter/" + metric_prefix = "ppo/sgd/final_iter/" values.append(tf.Summary.Value( tag=metric_prefix + "kl_coeff", simple_value=self.kl_coeff)) diff --git a/python/ray/rllib/policy_gradient/rollout.py b/python/ray/rllib/ppo/rollout.py similarity index 97% rename from python/ray/rllib/policy_gradient/rollout.py rename to python/ray/rllib/ppo/rollout.py index b44666c3f..a96d290a7 100644 --- a/python/ray/rllib/policy_gradient/rollout.py +++ b/python/ray/rllib/ppo/rollout.py @@ -5,8 +5,8 @@ from __future__ import print_function import numpy as np import ray -from ray.rllib.policy_gradient.filter import NoFilter -from ray.rllib.policy_gradient.utils import concatenate +from ray.rllib.ppo.filter import NoFilter +from ray.rllib.ppo.utils import concatenate def rollouts(policy, env, horizon, observation_filter=NoFilter(), diff --git a/python/ray/rllib/policy_gradient/agent.py b/python/ray/rllib/ppo/runner.py similarity index 96% rename from python/ray/rllib/policy_gradient/agent.py rename to python/ray/rllib/ppo/runner.py index 456568f64..cc71c9d65 100644 --- a/python/ray/rllib/policy_gradient/agent.py +++ b/python/ray/rllib/ppo/runner.py @@ -14,12 +14,12 @@ import ray from ray.rllib.parallel import LocalSyncParallelOptimizer from ray.rllib.models import ModelCatalog -from ray.rllib.policy_gradient.env import BatchedEnv -from ray.rllib.policy_gradient.loss import ProximalPolicyLoss -from ray.rllib.policy_gradient.filter import MeanStdFilter -from ray.rllib.policy_gradient.rollout import ( +from ray.rllib.ppo.env import BatchedEnv +from ray.rllib.ppo.loss import ProximalPolicyLoss +from ray.rllib.ppo.filter import MeanStdFilter +from ray.rllib.ppo.rollout import ( rollouts, add_return_values, add_advantage_values) -from ray.rllib.policy_gradient.utils import flatten, concatenate +from ray.rllib.ppo.utils import flatten, concatenate # TODO(pcm): Make sure that both observation_filter and reward_filter # are correctly handled, i.e. (a) the values are accumulated accross @@ -28,9 +28,9 @@ from ray.rllib.policy_gradient.utils import flatten, concatenate # as part of the checkpoint so training can resume properly. -class Agent(object): +class Runner(object): """ - Agent class that holds the simulator environment and the policy. + Runner class that holds the simulator environment and the policy. Initializes the tensorflow graphs for both training and evaluation. 
One common policy graph is initialized on '/cpu:0' and holds all the shared @@ -244,4 +244,4 @@ class Agent(object): return concatenate(trajectories), total_rewards, trajectory_lengths -RemoteAgent = ray.remote(Agent) +RemoteRunner = ray.remote(Runner) diff --git a/python/ray/rllib/policy_gradient/test/test.py b/python/ray/rllib/ppo/test/test.py similarity index 97% rename from python/ray/rllib/policy_gradient/test/test.py rename to python/ray/rllib/ppo/test/test.py index a9ff78d4e..6ab59af93 100644 --- a/python/ray/rllib/policy_gradient/test/test.py +++ b/python/ray/rllib/ppo/test/test.py @@ -8,7 +8,7 @@ import tensorflow as tf from numpy.testing import assert_allclose from ray.rllib.models.action_dist import Categorical -from ray.rllib.policy_gradient.utils import flatten, concatenate +from ray.rllib.ppo.utils import flatten, concatenate # TODO(ekl): move to rllib/models dir diff --git a/python/ray/rllib/policy_gradient/utils.py b/python/ray/rllib/ppo/utils.py similarity index 100% rename from python/ray/rllib/policy_gradient/utils.py rename to python/ray/rllib/ppo/utils.py diff --git a/python/ray/rllib/test/test.sh b/python/ray/rllib/test/test.sh index 30d6cf427..f40ba1383 100755 --- a/python/ray/rllib/test/test.sh +++ b/python/ray/rllib/test/test.sh @@ -1,14 +1,14 @@ #!/bin/bash -python train.py --env Hopper-v1 --config '{"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}' --alg PolicyGradient --upload-dir s3://bucketname/ +python train.py --env Hopper-v1 --config '{"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}' --alg PPO --upload-dir s3://bucketname/ -python train.py --env CartPole-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}' --alg PolicyGradient --upload-dir s3://bucketname/ +python train.py --env CartPole-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}' --alg PPO --upload-dir s3://bucketname/ -python train.py --env Walker2d-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64}' --alg PolicyGradient --upload-dir s3://bucketname/ +python train.py --env Walker2d-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, 
"allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64}' --alg PPO --upload-dir s3://bucketname/ -python train.py --env Humanoid-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "use_gae": false}' --alg PolicyGradient --upload-dir s3://bucketname/ +python train.py --env Humanoid-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "use_gae": false}' --alg PPO --upload-dir s3://bucketname/ -python train.py --env Humanoid-v1 --config '{"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "write_logs": false}' --alg PolicyGradient --upload-dir s3://bucketname/ +python train.py --env Humanoid-v1 --config '{"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "write_logs": false}' --alg PPO --upload-dir s3://bucketname/ python train.py --env PongNoFrameskip-v0 --alg DQN --upload-dir s3://bucketname/ diff --git a/python/ray/rllib/test/test_checkpoint_restore.py b/python/ray/rllib/test/test_checkpoint_restore.py index 149b8aab0..bd1281239 100755 --- a/python/ray/rllib/test/test_checkpoint_restore.py +++ b/python/ray/rllib/test/test_checkpoint_restore.py @@ -7,20 +7,18 @@ from __future__ import print_function import ray import random -from ray.rllib.dqn import DQN, DEFAULT_CONFIG as DQN_CONFIG -from ray.rllib.evolution_strategies import ( - EvolutionStrategies, DEFAULT_CONFIG as ES_CONFIG) -from ray.rllib.policy_gradient import ( - PolicyGradient, DEFAULT_CONFIG as PG_CONFIG) -from ray.rllib.a3c import A3C, DEFAULT_CONFIG as A3C_CONFIG +from ray.rllib.dqn import (DQNAgent, DEFAULT_CONFIG as DQN_CONFIG) +from ray.rllib.es import (ESAgent, DEFAULT_CONFIG as ES_CONFIG) +from ray.rllib.ppo import (PPOAgent, DEFAULT_CONFIG as PG_CONFIG) +from ray.rllib.a3c import (A3CAgent, DEFAULT_CONFIG as A3C_CONFIG) ray.init() for (cls, default_config) in [ - (DQN, DQN_CONFIG), + (DQNAgent, DQN_CONFIG), # TODO(ekl) this fails with multiple ES instances in a process - (EvolutionStrategies, ES_CONFIG), - (PolicyGradient, PG_CONFIG), - (A3C, A3C_CONFIG)]: + (ESAgent, ES_CONFIG), + (PPOAgent, PG_CONFIG), + (A3CAgent, A3C_CONFIG)]: config = default_config.copy() config["num_sgd_iter"] = 5 config["episodes_per_batch"] = 100 diff --git a/python/ray/rllib/train.py b/python/ray/rllib/train.py index 156cf4d7a..403365857 100755 --- a/python/ray/rllib/train.py +++ 
b/python/ray/rllib/train.py @@ -10,8 +10,8 @@ import os import sys import ray -import ray.rllib.policy_gradient as pg -import ray.rllib.evolution_strategies as es +import ray.rllib.ppo as ppo +import ray.rllib.es as es import ray.rllib.dqn as dqn import ray.rllib.a3c as a3c @@ -42,30 +42,29 @@ if __name__ == "__main__": ray.init(redis_address=args.redis_address) env_name = args.env - if args.alg == "PolicyGradient": - config = pg.DEFAULT_CONFIG.copy() + if args.alg == "PPO": + config = ppo.DEFAULT_CONFIG.copy() config.update(json_config) - alg = pg.PolicyGradient( + alg = ppo.PPOAgent( env_name, config, upload_dir=args.upload_dir) - elif args.alg == "EvolutionStrategies": + elif args.alg == "ES": config = es.DEFAULT_CONFIG.copy() config.update(json_config) - alg = es.EvolutionStrategies( + alg = es.ESAgent( env_name, config, upload_dir=args.upload_dir) elif args.alg == "DQN": config = dqn.DEFAULT_CONFIG.copy() config.update(json_config) - alg = dqn.DQN( + alg = dqn.DQNAgent( env_name, config, upload_dir=args.upload_dir) elif args.alg == "A3C": config = a3c.DEFAULT_CONFIG.copy() config.update(json_config) - alg = a3c.A3C( + alg = a3c.A3CAgent( env_name, config, upload_dir=args.upload_dir) else: assert False, ("Unknown algorithm, check --alg argument. Valid " - "choices are PolicyGradient, EvolutionStrategies, " - "DQN and A3C.") + "choices are PPO, ES, DQN and A3C.") result_logger = ray.rllib.common.RLLibLogger( os.path.join(alg.logdir, "result.json")) diff --git a/test/jenkins_tests/run_multi_node_tests.sh b/test/jenkins_tests/run_multi_node_tests.sh index e71e518ed..0fb3cbf43 100755 --- a/test/jenkins_tests/run_multi_node_tests.sh +++ b/test/jenkins_tests/run_multi_node_tests.sh @@ -66,21 +66,21 @@ docker run --shm-size=10G --memory=10G $DOCKER_SHA \ docker run --shm-size=10G --memory=10G $DOCKER_SHA \ python /ray/python/ray/rllib/train.py \ --env CartPole-v1 \ - --alg PolicyGradient \ + --alg PPO \ --num-iterations 2 \ --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "sgd_stepsize": 1e-4, "sgd_batchsize": 64, "timesteps_per_batch": 2000, "num_workers": 1}' docker run --shm-size=10G --memory=10G $DOCKER_SHA \ python /ray/python/ray/rllib/train.py \ --env CartPole-v1 \ - --alg PolicyGradient \ + --alg PPO \ --num-iterations 2 \ --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "sgd_stepsize": 1e-4, "sgd_batchsize": 64, "timesteps_per_batch": 2000, "num_workers": 1, "use_gae": false}' docker run --shm-size=10G --memory=10G $DOCKER_SHA \ python /ray/python/ray/rllib/train.py \ --env Pendulum-v0 \ - --alg EvolutionStrategies \ + --alg ES \ --num-iterations 2 \ --config '{"stepsize": 0.01}'
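
Taken together, the patch is a pure rename of the public entry points; constructors, config dictionaries, and the ``train.py`` flags keep their shapes. The sketch below summarizes the new import surface and command-line names, with the mapping drawn from the diffs above; nothing in it introduces API beyond this patch.

::

    # Module/class renames in this patch:
    #   ray.rllib.policy_gradient.PolicyGradient           -> ray.rllib.ppo.PPOAgent
    #   ray.rllib.evolution_strategies.EvolutionStrategies -> ray.rllib.es.ESAgent
    #   ray.rllib.dqn.DQN                                   -> ray.rllib.dqn.DQNAgent
    #   ray.rllib.a3c.A3C                                   -> ray.rllib.a3c.A3CAgent
    #   ray.rllib.common.Algorithm (base class)             -> ray.rllib.common.Agent
    from ray.rllib.ppo import PPOAgent
    from ray.rllib.es import ESAgent
    from ray.rllib.dqn import DQNAgent
    from ray.rllib.a3c import A3CAgent

    # Each agent is constructed as cls(env_name, config), as in train.py.
    # The train.py --alg values follow the same scheme:
    #   PolicyGradient -> PPO, EvolutionStrategies -> ES; DQN and A3C are unchanged.

Downstream scripts only need to update their import paths and ``--alg`` strings; note that the TensorFlow summary tags also move from ``policy_gradient/...`` to ``ppo/...``, so existing TensorBoard dashboards keyed on the old tags will need the same rename.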