[rllib] Rename algorithms (#890)
* rename algorithms
* fix
* fix jenkins test
* fix documentation
* fix
parent e1831792f8 · commit 164a8f368e
30 changed files with 83 additions and 88 deletions
@@ -8,14 +8,14 @@ You can run training with

 ::

-    python train.py --env CartPole-v0 --alg PolicyGradient
+    python train.py --env CartPole-v0 --alg PPO

 The available algorithms are:

-- ``PolicyGradient`` is a proximal variant of
+- ``PPO`` is a proximal variant of
   `TRPO <https://arxiv.org/abs/1502.05477>`__.

-- ``EvolutionStrategies`` is described in `this
+- ``ES`` is described in `this
   paper <https://arxiv.org/abs/1703.03864>`__. Our implementation
   borrows code from
   `here <https://github.com/openai/evolution-strategies-starter>`__.
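For orientation, the same training run can also be started from the Python API using the renamed classes. This is an illustrative sketch only, assuming the ray.rllib.ppo layout introduced in this change; the config overrides and environment name are example values, not taken from the diff.

import ray
import ray.rllib.ppo as ppo

ray.init()

# Start from the shipped defaults and override an example setting.
config = ppo.DEFAULT_CONFIG.copy()
config["num_workers"] = 1

# PPOAgent replaces the old PolicyGradient class.
agent = ppo.PPOAgent("CartPole-v0", config)
result = agent.train()  # one training iteration; expected to report a TrainingResult
print(result)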
@@ -1,3 +1,3 @@
-from ray.rllib.a3c.a3c import A3C, DEFAULT_CONFIG
+from ray.rllib.a3c.a3c import A3CAgent, DEFAULT_CONFIG

-__all__ = ["A3C", "DEFAULT_CONFIG"]
+__all__ = ["A3CAgent", "DEFAULT_CONFIG"]
@@ -11,7 +11,7 @@ import os
 import ray
 from ray.rllib.a3c.runner import RunnerThread, process_rollout
 from ray.rllib.a3c.envs import create_env
-from ray.rllib.common import Algorithm, TrainingResult
+from ray.rllib.common import Agent, TrainingResult
 from ray.rllib.a3c.shared_model_lstm import SharedModelLSTM

@@ -87,11 +87,11 @@ class Runner(object):
         return gradient, info


-class A3C(Algorithm):
+class A3CAgent(Agent):
     def __init__(self, env_name, config,
                  policy_cls=SharedModelLSTM, upload_dir=None):
         config.update({"alg": "A3C"})
-        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
+        Agent.__init__(self, env_name, config, upload_dir=upload_dir)
         self.env = create_env(env_name)
         self.policy = policy_cls(
             self.env.observation_space.shape, self.env.action_space)
@@ -54,11 +54,11 @@ TrainingResult = namedtuple("TrainingResult", [
 ])


-class Algorithm(object):
-    """All RLlib algorithms extend this base class.
+class Agent(object):
+    """All RLlib agents extend this base class.

-    Algorithm objects retain internal model state between calls to train(), so
-    you should create a new algorithm instance for each training session.
+    Agent objects retain internal model state between calls to train(), so
+    you should create a new agent instance for each training session.

     Attributes:
         env_name (str): Name of the OpenAI gym environment to train against.

@@ -69,7 +69,7 @@ class Algorithm(object):
     """

     def __init__(self, env_name, config, upload_dir=None):
-        """Initialize an RLLib algorithm.
+        """Initialize an RLLib agent.

         Args:
             env_name (str): The name of the OpenAI gym environment to use.
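Every concrete agent in this change follows the same construction pattern against the renamed base class. A minimal hypothetical subclass, sketched only from what the diff shows about Agent.__init__ (MyAgent and the "MyAlg" string are illustrative, not part of the change):

from ray.rllib.common import Agent


class MyAgent(Agent):
    # Hypothetical example; PPOAgent, DQNAgent, ESAgent and A3CAgent follow
    # this pattern and additionally build their models and implement train().
    def __init__(self, env_name, config, upload_dir=None):
        config.update({"alg": "MyAlg"})  # illustrative algorithm name
        Agent.__init__(self, env_name, config, upload_dir=upload_dir)

    def train(self):
        # A real agent runs one training iteration here and reports results.
        raise NotImplementedError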
@@ -2,6 +2,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-from ray.rllib.dqn.dqn import DQN, DEFAULT_CONFIG
+from ray.rllib.dqn.dqn import DQNAgent, DEFAULT_CONFIG

-__all__ = ["DQN", "DEFAULT_CONFIG"]
+__all__ = ["DQNAgent", "DEFAULT_CONFIG"]
@@ -10,7 +10,7 @@ import pickle
 import os
 import tensorflow as tf

-from ray.rllib.common import Algorithm, TrainingResult
+from ray.rllib.common import Agent, TrainingResult
 from ray.rllib.dqn import logger, models
 from ray.rllib.dqn.common.atari_wrappers_deprecated \
     import wrap_dqn, ScaledFloatFrame
@@ -102,11 +102,11 @@ DEFAULT_CONFIG = dict(
     num_cpu=16)


-class DQN(Algorithm):
+class DQNAgent(Agent):
     def __init__(self, env_name, config, upload_dir=None):
         config.update({"alg": "DQN"})

-        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
+        Agent.__init__(self, env_name, config, upload_dir=upload_dir)

         with tf.Graph().as_default():
             self._init()
python/ray/rllib/es/__init__.py (new file, +3)
@@ -0,0 +1,3 @@
+from ray.rllib.es.es import (ESAgent, DEFAULT_CONFIG)
+
+__all__ = ["ESAgent", "DEFAULT_CONFIG"]
@@ -15,14 +15,14 @@ import time
 import tensorflow as tf

 import ray
-from ray.rllib.common import Algorithm, TrainingResult
+from ray.rllib.common import Agent, TrainingResult
 from ray.rllib.models import ModelCatalog

-from ray.rllib.evolution_strategies import optimizers
-from ray.rllib.evolution_strategies import policies
-from ray.rllib.evolution_strategies import tabular_logger as tlogger
-from ray.rllib.evolution_strategies import tf_util
-from ray.rllib.evolution_strategies import utils
+from ray.rllib.es import optimizers
+from ray.rllib.es import policies
+from ray.rllib.es import tabular_logger as tlogger
+from ray.rllib.es import tf_util
+from ray.rllib.es import utils


 Result = namedtuple("Result", [
@@ -160,11 +160,11 @@ class Worker(object):
             ob_count=task_ob_stat.count)


-class EvolutionStrategies(Algorithm):
+class ESAgent(Agent):
     def __init__(self, env_name, config, upload_dir=None):
         config.update({"alg": "EvolutionStrategies"})

-        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
+        Agent.__init__(self, env_name, config, upload_dir=upload_dir)

         with tf.Graph().as_default():
             self._init()
@@ -13,7 +13,7 @@ import h5py
 import numpy as np
 import tensorflow as tf

-from ray.rllib.evolution_strategies import tf_util as U
+from ray.rllib.es import tf_util as U
 from ray.rllib.models import ModelCatalog

 logger = logging.getLogger(__name__)
@@ -1,4 +0,0 @@
-from ray.rllib.evolution_strategies.evolution_strategies import (
-    EvolutionStrategies, DEFAULT_CONFIG)
-
-__all__ = ["EvolutionStrategies", "DEFAULT_CONFIG"]
@@ -39,7 +39,7 @@ class LocalSyncParallelOptimizer(object):
             processed.
         build_loss: Function that takes the specified inputs and returns an
             object with a 'loss' property that is a scalar Tensor. For example,
-            ray.rllib.policy_gradient.ProximalPolicyLoss.
+            ray.rllib.ppo.ProximalPolicyLoss.
         logdir: Directory to place debugging output in.
     """

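The build_loss contract above only requires a callable that accepts the optimizer's input tensors and returns an object exposing a scalar `loss` Tensor. A toy stand-in illustrating that contract (the input names and the squared-error loss are assumptions, not part of this change):

import tensorflow as tf


class SimpleSquaredLoss(object):
    # Any object with a scalar 'loss' Tensor satisfies the interface;
    # ray.rllib.ppo.ProximalPolicyLoss is the real example cited above.
    def __init__(self, observations, targets):
        self.loss = tf.reduce_mean(tf.square(observations - targets))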
@@ -1,4 +0,0 @@
-from ray.rllib.policy_gradient.policy_gradient import (
-    PolicyGradient, DEFAULT_CONFIG)
-
-__all__ = ["PolicyGradient", "DEFAULT_CONFIG"]
python/ray/rllib/ppo/__init__.py (new file, +3)
@@ -0,0 +1,3 @@
+from ray.rllib.ppo.ppo import (PPOAgent, DEFAULT_CONFIG)
+
+__all__ = ["PPOAgent", "DEFAULT_CONFIG"]
@@ -11,10 +11,10 @@ import tensorflow as tf
 from tensorflow.python import debug as tf_debug

 import ray
-from ray.rllib.common import Algorithm, TrainingResult
-from ray.rllib.policy_gradient.agent import Agent, RemoteAgent
-from ray.rllib.policy_gradient.rollout import collect_samples
-from ray.rllib.policy_gradient.utils import shuffle
+from ray.rllib.common import Agent, TrainingResult
+from ray.rllib.ppo.runner import Runner, RemoteRunner
+from ray.rllib.ppo.rollout import collect_samples
+from ray.rllib.ppo.utils import shuffle


 DEFAULT_CONFIG = {
@@ -75,11 +75,11 @@ DEFAULT_CONFIG = {
 }


-class PolicyGradient(Algorithm):
+class PPOAgent(Agent):
     def __init__(self, env_name, config, upload_dir=None):
-        config.update({"alg": "PolicyGradient"})
+        config.update({"alg": "PPO"})

-        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
+        Agent.__init__(self, env_name, config, upload_dir=upload_dir)

         with tf.Graph().as_default():
             self._init()
@@ -88,9 +88,9 @@ class PolicyGradient(Algorithm):
         self.global_step = 0
         self.j = 0
         self.kl_coeff = self.config["kl_coeff"]
-        self.model = Agent(self.env_name, 1, self.config, self.logdir, False)
+        self.model = Runner(self.env_name, 1, self.config, self.logdir, False)
         self.agents = [
-            RemoteAgent.remote(
+            RemoteRunner.remote(
                 self.env_name, 1, self.config, self.logdir, True)
             for _ in range(self.config["num_workers"])]
         self.start_time = time.time()
@@ -121,10 +121,10 @@ class PolicyGradient(Algorithm):
         if self.file_writer:
             traj_stats = tf.Summary(value=[
                 tf.Summary.Value(
-                    tag="policy_gradient/rollouts/mean_reward",
+                    tag="ppo/rollouts/mean_reward",
                     simple_value=total_reward),
                 tf.Summary.Value(
-                    tag="policy_gradient/rollouts/traj_len_mean",
+                    tag="ppo/rollouts/traj_len_mean",
                     simple_value=traj_len_mean)])
             self.file_writer.add_summary(traj_stats, self.global_step)
         self.global_step += 1
@@ -191,7 +191,7 @@ class PolicyGradient(Algorithm):

             values = []
             if i == config["num_sgd_iter"] - 1:
-                metric_prefix = "policy_gradient/sgd/final_iter/"
+                metric_prefix = "ppo/sgd/final_iter/"
                 values.append(tf.Summary.Value(
                     tag=metric_prefix + "kl_coeff",
                     simple_value=self.kl_coeff))
@@ -5,8 +5,8 @@ from __future__ import print_function
 import numpy as np
 import ray

-from ray.rllib.policy_gradient.filter import NoFilter
-from ray.rllib.policy_gradient.utils import concatenate
+from ray.rllib.ppo.filter import NoFilter
+from ray.rllib.ppo.utils import concatenate


 def rollouts(policy, env, horizon, observation_filter=NoFilter(),
@@ -14,12 +14,12 @@ import ray

 from ray.rllib.parallel import LocalSyncParallelOptimizer
 from ray.rllib.models import ModelCatalog
-from ray.rllib.policy_gradient.env import BatchedEnv
-from ray.rllib.policy_gradient.loss import ProximalPolicyLoss
-from ray.rllib.policy_gradient.filter import MeanStdFilter
-from ray.rllib.policy_gradient.rollout import (
+from ray.rllib.ppo.env import BatchedEnv
+from ray.rllib.ppo.loss import ProximalPolicyLoss
+from ray.rllib.ppo.filter import MeanStdFilter
+from ray.rllib.ppo.rollout import (
     rollouts, add_return_values, add_advantage_values)
-from ray.rllib.policy_gradient.utils import flatten, concatenate
+from ray.rllib.ppo.utils import flatten, concatenate

 # TODO(pcm): Make sure that both observation_filter and reward_filter
 # are correctly handled, i.e. (a) the values are accumulated across
@@ -28,9 +28,9 @@ from ray.rllib.policy_gradient.utils import flatten, concatenate
 # as part of the checkpoint so training can resume properly.


-class Agent(object):
+class Runner(object):
     """
-    Agent class that holds the simulator environment and the policy.
+    Runner class that holds the simulator environment and the policy.

     Initializes the tensorflow graphs for both training and evaluation.
     One common policy graph is initialized on '/cpu:0' and holds all the shared
@@ -244,4 +244,4 @@ class Agent(object):
         return concatenate(trajectories), total_rewards, trajectory_lengths


-RemoteAgent = ray.remote(Agent)
+RemoteRunner = ray.remote(Runner)
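ray.remote(Runner) wraps the same class as a Ray actor, so the renamed PPOAgent keeps one local Runner as the driver-side model and a set of RemoteRunner actors as rollout workers. A sketch of that pattern, mirroring the constructor calls shown earlier in this diff (the argument values here are illustrative):

# Local driver-side model.
model = Runner(env_name, 1, config, logdir, False)
# Remote rollout workers created through the ray.remote wrapper.
workers = [
    RemoteRunner.remote(env_name, 1, config, logdir, True)
    for _ in range(config["num_workers"])]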
@@ -8,7 +8,7 @@ import tensorflow as tf
 from numpy.testing import assert_allclose

 from ray.rllib.models.action_dist import Categorical
-from ray.rllib.policy_gradient.utils import flatten, concatenate
+from ray.rllib.ppo.utils import flatten, concatenate


 # TODO(ekl): move to rllib/models dir
@@ -1,14 +1,14 @@
 #!/bin/bash

-python train.py --env Hopper-v1 --config '{"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
+python train.py --env Hopper-v1 --config '{"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}' --alg PPO --upload-dir s3://bucketname/

-python train.py --env CartPole-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
+python train.py --env CartPole-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}' --alg PPO --upload-dir s3://bucketname/

-python train.py --env Walker2d-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
+python train.py --env Walker2d-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64}' --alg PPO --upload-dir s3://bucketname/

-python train.py --env Humanoid-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "use_gae": false}' --alg PolicyGradient --upload-dir s3://bucketname/
+python train.py --env Humanoid-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "use_gae": false}' --alg PPO --upload-dir s3://bucketname/

-python train.py --env Humanoid-v1 --config '{"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "write_logs": false}' --alg PolicyGradient --upload-dir s3://bucketname/
+python train.py --env Humanoid-v1 --config '{"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "write_logs": false}' --alg PPO --upload-dir s3://bucketname/

 python train.py --env PongNoFrameskip-v0 --alg DQN --upload-dir s3://bucketname/

@@ -7,20 +7,18 @@ from __future__ import print_function
 import ray
 import random

-from ray.rllib.dqn import DQN, DEFAULT_CONFIG as DQN_CONFIG
-from ray.rllib.evolution_strategies import (
-    EvolutionStrategies, DEFAULT_CONFIG as ES_CONFIG)
-from ray.rllib.policy_gradient import (
-    PolicyGradient, DEFAULT_CONFIG as PG_CONFIG)
-from ray.rllib.a3c import A3C, DEFAULT_CONFIG as A3C_CONFIG
+from ray.rllib.dqn import (DQNAgent, DEFAULT_CONFIG as DQN_CONFIG)
+from ray.rllib.es import (ESAgent, DEFAULT_CONFIG as ES_CONFIG)
+from ray.rllib.ppo import (PPOAgent, DEFAULT_CONFIG as PG_CONFIG)
+from ray.rllib.a3c import (A3CAgent, DEFAULT_CONFIG as A3C_CONFIG)

 ray.init()
 for (cls, default_config) in [
-        (DQN, DQN_CONFIG),
+        (DQNAgent, DQN_CONFIG),
         # TODO(ekl) this fails with multiple ES instances in a process
-        (EvolutionStrategies, ES_CONFIG),
-        (PolicyGradient, PG_CONFIG),
-        (A3C, A3C_CONFIG)]:
+        (ESAgent, ES_CONFIG),
+        (PPOAgent, PG_CONFIG),
+        (A3CAgent, A3C_CONFIG)]:
     config = default_config.copy()
     config["num_sgd_iter"] = 5
     config["episodes_per_batch"] = 100
@@ -10,8 +10,8 @@ import os
 import sys

 import ray
-import ray.rllib.policy_gradient as pg
-import ray.rllib.evolution_strategies as es
+import ray.rllib.ppo as ppo
+import ray.rllib.es as es
 import ray.rllib.dqn as dqn
 import ray.rllib.a3c as a3c

@@ -42,30 +42,29 @@ if __name__ == "__main__":
     ray.init(redis_address=args.redis_address)

     env_name = args.env
-    if args.alg == "PolicyGradient":
-        config = pg.DEFAULT_CONFIG.copy()
+    if args.alg == "PPO":
+        config = ppo.DEFAULT_CONFIG.copy()
         config.update(json_config)
-        alg = pg.PolicyGradient(
+        alg = ppo.PPOAgent(
             env_name, config, upload_dir=args.upload_dir)
-    elif args.alg == "EvolutionStrategies":
+    elif args.alg == "ES":
         config = es.DEFAULT_CONFIG.copy()
         config.update(json_config)
-        alg = es.EvolutionStrategies(
+        alg = es.ESAgent(
             env_name, config, upload_dir=args.upload_dir)
     elif args.alg == "DQN":
         config = dqn.DEFAULT_CONFIG.copy()
         config.update(json_config)
-        alg = dqn.DQN(
+        alg = dqn.DQNAgent(
             env_name, config, upload_dir=args.upload_dir)
     elif args.alg == "A3C":
         config = a3c.DEFAULT_CONFIG.copy()
         config.update(json_config)
-        alg = a3c.A3C(
+        alg = a3c.A3CAgent(
             env_name, config, upload_dir=args.upload_dir)
     else:
         assert False, ("Unknown algorithm, check --alg argument. Valid "
-                       "choices are PolicyGradient, EvolutionStrategies, "
-                       "DQN and A3C.")
+                       "choices are PPO, ES, DQN and A3C.")

     result_logger = ray.rllib.common.RLLibLogger(
         os.path.join(alg.logdir, "result.json"))
@@ -66,21 +66,21 @@ docker run --shm-size=10G --memory=10G $DOCKER_SHA \
 docker run --shm-size=10G --memory=10G $DOCKER_SHA \
     python /ray/python/ray/rllib/train.py \
     --env CartPole-v1 \
-    --alg PolicyGradient \
+    --alg PPO \
     --num-iterations 2 \
     --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "sgd_stepsize": 1e-4, "sgd_batchsize": 64, "timesteps_per_batch": 2000, "num_workers": 1}'

 docker run --shm-size=10G --memory=10G $DOCKER_SHA \
     python /ray/python/ray/rllib/train.py \
     --env CartPole-v1 \
-    --alg PolicyGradient \
+    --alg PPO \
     --num-iterations 2 \
     --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "sgd_stepsize": 1e-4, "sgd_batchsize": 64, "timesteps_per_batch": 2000, "num_workers": 1, "use_gae": false}'

 docker run --shm-size=10G --memory=10G $DOCKER_SHA \
     python /ray/python/ray/rllib/train.py \
     --env Pendulum-v0 \
-    --alg EvolutionStrategies \
+    --alg ES \
    --num-iterations 2 \
     --config '{"stepsize": 0.01}'