diff --git a/.buildkite/pipeline.ml.yml b/.buildkite/pipeline.ml.yml index 1bb0b495c..eb984f182 100644 --- a/.buildkite/pipeline.ml.yml +++ b/.buildkite/pipeline.ml.yml @@ -129,7 +129,7 @@ # Test all tests in the `agents` (soon to be "trainers") dir: - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only - --test_tag_filters=algorithms_dir_generic,-multi_gpu + --test_tag_filters=trainers_dir_generic,-multi_gpu --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... @@ -141,7 +141,7 @@ # Test all tests in the `agents` (soon to be "trainers") dir: - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only - --test_tag_filters=algorithms_dir,-algorithms_dir_generic,-multi_gpu + --test_tag_filters=trainers_dir,-trainers_dir_generic,-multi_gpu --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... @@ -154,7 +154,7 @@ # "learning_tests|quick_train|examples|tests_dir". - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only - --test_tag_filters=-learning_tests,-quick_train,-memory_leak_tests,-examples,-tests_dir,-algorithms_dir,-documentation,-multi_gpu + --test_tag_filters=-learning_tests,-quick_train,-memory_leak_tests,-examples,-tests_dir,-trainers_dir,-documentation,-multi_gpu --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... diff --git a/README.rst b/README.rst index 8cf4a608e..1b31c4388 100644 --- a/README.rst +++ b/README.rst @@ -179,7 +179,7 @@ It offers high scalability and unified APIs for a .. code-block:: python import gym - from ray.rllib.algorithms.ppo import PPO + from ray.rllib.agents.ppo import PPOTrainer # Define your problem using python and openAI's gym API: @@ -229,7 +229,7 @@ It offers high scalability and unified APIs for a # Create an RLlib Trainer instance. - trainer = PPO( + trainer = PPOTrainer( config={ # Env class to use (here: our gym.Env sub-class from above). "env": SimpleCorridor, diff --git a/doc/source/ray-overview/doc_test/ray_rllib.py b/doc/source/ray-overview/doc_test/ray_rllib.py index 0b828b65c..9455e84a3 100644 --- a/doc/source/ray-overview/doc_test/ray_rllib.py +++ b/doc/source/ray-overview/doc_test/ray_rllib.py @@ -1,8 +1,8 @@ from ray import tune -from ray.rllib.algorithms.ppo import PPO +from ray.rllib.agents.ppo import PPOTrainer tune.run( - PPO, + PPOTrainer, stop={"episode_len_mean": 20}, config={"env": "CartPole-v0", "framework": "torch", "log_level": "INFO"}, ) diff --git a/doc/source/rllib/core-concepts.rst b/doc/source/rllib/core-concepts.rst index c25e5acb0..9f504c23a 100644 --- a/doc/source/rllib/core-concepts.rst +++ b/doc/source/rllib/core-concepts.rst @@ -25,14 +25,14 @@ Trainers also implement the :ref:`Tune Trainable API ` for easy You have three ways to interact with a trainer. You can use the basic Python API or the command line to train it, or you can use Ray Tune to tune hyperparameters of your reinforcement learning algorithm. -The following example shows three equivalent ways of interacting with the ``PPO`` Trainer, +The following example shows three equivalent ways of interacting with the ``PPOTrainer``, which implements the proximal policy optimization algorithm in RLlib. .. tabbed:: Basic RLlib Trainer .. code-block:: python - trainer = PPO(env="CartPole-v0", config={"train_batch_size": 4000}) + trainer = PPOTrainer(env="CartPole-v0", config={"train_batch_size": 4000}) while True: print(trainer.train()) @@ -47,7 +47,7 @@ which implements the proximal policy optimization algorithm in RLlib. .. 
code-block:: python from ray import tune - tune.run(PPO, config={"env": "CartPole-v0", "train_batch_size": 4000}) + tune.run(PPOTrainer, config={"env": "CartPole-v0", "train_batch_size": 4000}) diff --git a/doc/source/rllib/doc_code/training.py b/doc/source/rllib/doc_code/training.py index 109de48bf..5317e922e 100644 --- a/doc/source/rllib/doc_code/training.py +++ b/doc/source/rllib/doc_code/training.py @@ -22,9 +22,9 @@ prep.transform(env.reset()).shape # __query_action_dist_start__ # Get a reference to the policy import numpy as np -from ray.rllib.algorithms.ppo import PPO +from ray.rllib.agents.ppo import PPOTrainer -trainer = PPO(env="CartPole-v0", config={"framework": "tf2", "num_workers": 0}) +trainer = PPOTrainer(env="CartPole-v0", config={"framework": "tf2", "num_workers": 0}) policy = trainer.get_policy() # diff --git a/doc/source/rllib/package_ref/policy/custom_policies.rst b/doc/source/rllib/package_ref/policy/custom_policies.rst index 5632c7444..173940ebd 100644 --- a/doc/source/rllib/package_ref/policy/custom_policies.rst +++ b/doc/source/rllib/package_ref/policy/custom_policies.rst @@ -23,4 +23,4 @@ framework-agnostic policy), * :py:meth:`~ray.rllib.policy.policy.Policy.postprocess_trajectory` * :py:meth:`~ray.rllib.policy.policy.Policy.loss` -`See here for an example on how to override TorchPolicy `_. +`See here for an example on how to override TorchPolicy `_. diff --git a/doc/source/rllib/rllib-algorithms.rst b/doc/source/rllib/rllib-algorithms.rst index 45cac18de..51aee4a19 100644 --- a/doc/source/rllib/rllib-algorithms.rst +++ b/doc/source/rllib/rllib-algorithms.rst @@ -130,7 +130,7 @@ Importance Weighted Actor-Learner Architecture (IMPALA) ------------------------------------------------------- |pytorch| |tensorflow| `[paper] `__ -`[implementation] `__ +`[implementation] `__ In IMPALA, a central learner runs SGD in a tight loop while asynchronously pulling sample batches from many actor processes. RLlib's IMPALA implementation uses DeepMind's reference `V-trace code `__. Note that we do not provide a deep residual network out of the box, but one can be plugged in as a `custom model `__. Multiple learner GPUs and experience replay are also supported. .. figure:: images/impala-arch.svg @@ -168,7 +168,7 @@ SpaceInvaders 843 ~300 **IMPALA-specific configs** (see also `common configs `__): -.. literalinclude:: ../../../rllib/algorithms/impala/impala.py +.. literalinclude:: ../../../rllib/agents/impala/impala.py :language: python :start-after: __sphinx_doc_begin__ :end-before: __sphinx_doc_end__ @@ -179,7 +179,7 @@ Asynchronous Proximal Policy Optimization (APPO) ------------------------------------------------ |pytorch| |tensorflow| `[paper] `__ -`[implementation] `__ +`[implementation] `__ We include an asynchronous variant of Proximal Policy Optimization (PPO) based on the IMPALA architecture. This is similar to IMPALA but using a surrogate policy loss with clipping. Compared to synchronous PPO, APPO is more efficient in wall-clock time due to its use of asynchronous sampling. Using a clipped loss also allows for multiple SGD passes, and therefore the potential for better sample efficiency compared to IMPALA. V-trace can also be enabled to correct for off-policy samples. .. tip:: @@ -190,11 +190,11 @@ We include an asynchronous variant of Proximal Policy Optimization (PPO) based o APPO architecture (same as IMPALA) -Tuned examples: `PongNoFrameskip-v4 `__ +Tuned examples: `PongNoFrameskip-v4 `__ **APPO-specific configs** (see also `common configs `__): -.. 
literalinclude:: ../../../rllib/algorithms/appo/appo.py +.. literalinclude:: ../../../rllib/agents/ppo/appo.py :language: python :start-after: __sphinx_doc_begin__ :end-before: __sphinx_doc_end__ @@ -205,7 +205,7 @@ Decentralized Distributed Proximal Policy Optimization (DD-PPO) --------------------------------------------------------------- |pytorch| `[paper] `__ -`[implementation] `__ +`[implementation] `__ Unlike APPO or PPO, with DD-PPO policy improvement is no longer done centralized in the trainer process. Instead, gradients are computed remotely on each rollout worker and all-reduced at each mini-batch using `torch distributed `__. This allows each worker's GPU to be used both for sampling and for training. .. tip:: @@ -216,11 +216,11 @@ Unlike APPO or PPO, with DD-PPO policy improvement is no longer done centralized DD-PPO architecture (both sampling and learning are done on worker GPUs) -Tuned examples: `CartPole-v0 `__, `BreakoutNoFrameskip-v4 `__ +Tuned examples: `CartPole-v0 `__, `BreakoutNoFrameskip-v4 `__ **DDPPO-specific configs** (see also `common configs `__): -.. literalinclude:: ../../../rllib/algorithms/ddppo/ddppo.py +.. literalinclude:: ../../../rllib/agents/ppo/ddppo.py :language: python :start-after: __sphinx_doc_begin__ :end-before: __sphinx_doc_end__ @@ -396,7 +396,7 @@ Proximal Policy Optimization (PPO) ---------------------------------- |pytorch| |tensorflow| `[paper] `__ -`[implementation] `__ +`[implementation] `__ PPO's clipped objective supports multiple SGD passes over the same batch of experiences. RLlib's multi-GPU optimizer pins that data in GPU memory to avoid unnecessary transfers from host memory, substantially improving performance over a naive implementation. PPO scales out using multiple workers for experience collection, and also to multiple GPUs for SGD. .. tip:: @@ -445,7 +445,7 @@ HalfCheetah 9664 ~7700 **PPO-specific configs** (see also `common configs `__): -.. literalinclude:: ../../../rllib/algorithms/ppo/ppo.py +.. literalinclude:: ../../../rllib/agents/ppo/ppo.py :language: python :start-after: __sphinx_doc_begin__ :end-before: __sphinx_doc_end__ diff --git a/doc/source/rllib/rllib-concepts.rst b/doc/source/rllib/rllib-concepts.rst index 82965f288..5a06e3549 100644 --- a/doc/source/rllib/rllib-concepts.rst +++ b/doc/source/rllib/rllib-concepts.rst @@ -210,11 +210,11 @@ You might be wondering how RLlib makes the advantages placeholder automatically In the above section you saw how to compose a simple policy gradient algorithm with RLlib. In this example, we'll dive into how PPO is defined within RLlib and how you can modify it. -First, check out the `PPO trainer definition `__: +First, check out the `PPO trainer definition `__: .. code-block:: python - class PPO(Trainer): + class PPOTrainer(Trainer): @classmethod @override(Trainer) def get_default_config(cls) -> TrainerConfigDict: @@ -280,7 +280,7 @@ Suppose we want to customize PPO to use an asynchronous-gradient optimization st .. 
code-block:: python - from ray.rllib.algorithms.ppo import PPO + from ray.rllib.agents.ppo import PPOTrainer from ray.rllib.execution.rollout_ops import AsyncGradients from ray.rllib.execution.train_ops import ApplyGradients from ray.rllib.execution.metric_ops import StandardMetricsReporting @@ -307,7 +307,7 @@ Now let's look at each PPO policy definition: PPOTFPolicy = build_tf_policy( name="PPOTFPolicy", - get_default_config=lambda: ray.rllib.algorithms.ppo.ppo.PPOConfig().to_dict(), + get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, loss_fn=ppo_surrogate_loss, stats_fn=kl_and_loss_stats, extra_action_out_fn=vf_preds_and_logits_fetches, @@ -562,8 +562,8 @@ You can use the ``with_updates`` method on Trainers and Policy objects built wit .. code-block:: python - from ray.rllib.algorithms.ppo import PPO - from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTFPolicy + from ray.rllib.agents.ppo import PPOTrainer + from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy CustomPolicy = PPOTFPolicy.with_updates( name="MyCustomPPOTFPolicy", diff --git a/doc/source/rllib/rllib-env.rst b/doc/source/rllib/rllib-env.rst index b87a2fafd..7a675e6b5 100644 --- a/doc/source/rllib/rllib-env.rst +++ b/doc/source/rllib/rllib-env.rst @@ -33,7 +33,7 @@ You can pass either a string name or a Python class to specify an environment. B return , , , ray.init() - trainer = ppo.PPO(env=MyEnv, config={ + trainer = ppo.PPOTrainer(env=MyEnv, config={ "env_config": {}, # config to pass to env class }) @@ -50,7 +50,7 @@ You can also register a custom env creator function with a string name. This fun return MyEnv(...) # return an env instance register_env("my_env", env_creator) - trainer = ppo.PPO(env="my_env") + trainer = ppo.PPOTrainer(env="my_env") For a full runnable code example using the custom environment API, see `custom_env.py `__. diff --git a/doc/source/rllib/rllib-models.rst b/doc/source/rllib/rllib-models.rst index 55520eb99..d6b354b0a 100644 --- a/doc/source/rllib/rllib-models.rst +++ b/doc/source/rllib/rllib-models.rst @@ -215,7 +215,7 @@ Once implemented, your TF model can then be registered and used in place of a bu .. code-block:: python import ray - import ray.rllib.algorithms.ppo as ppo + import ray.rllib.agents.ppo as ppo from ray.rllib.models import ModelCatalog from ray.rllib.models.tf.tf_modelv2 import TFModelV2 @@ -227,7 +227,7 @@ Once implemented, your TF model can then be registered and used in place of a bu ModelCatalog.register_custom_model("my_tf_model", MyModelClass) ray.init() - trainer = ppo.PPO(env="CartPole-v0", config={ + trainer = ppo.PPOTrainer(env="CartPole-v0", config={ "model": { "custom_model": "my_tf_model", # Extra kwargs to be passed to your model's c'tor. @@ -282,7 +282,7 @@ Once implemented, your PyTorch model can then be registered and used in place of ModelCatalog.register_custom_model("my_torch_model", CustomTorchModel) ray.init() - trainer = ppo.PPO(env="CartPole-v0", config={ + trainer = ppo.PPOTrainer(env="CartPole-v0", config={ "framework": "torch", "model": { "custom_model": "my_torch_model", @@ -488,7 +488,7 @@ Similar to custom models and preprocessors, you can also specify a custom action .. 
code-block:: python import ray - import ray.rllib.algorithms.ppo as ppo + import ray.rllib.agents.ppo as ppo from ray.rllib.models import ModelCatalog from ray.rllib.models.preprocessors import Preprocessor @@ -508,7 +508,7 @@ Similar to custom models and preprocessors, you can also specify a custom action ModelCatalog.register_custom_action_dist("my_dist", MyActionDist) ray.init() - trainer = ppo.PPO(env="CartPole-v0", config={ + trainer = ppo.PPOTrainer(env="CartPole-v0", config={ "model": { "custom_action_dist": "my_dist", }, diff --git a/doc/source/rllib/rllib-offline.rst b/doc/source/rllib/rllib-offline.rst index 8c420734d..446526ab4 100644 --- a/doc/source/rllib/rllib-offline.rst +++ b/doc/source/rllib/rllib-offline.rst @@ -238,7 +238,7 @@ You can configure experience input for an agent using the following options: objects, which have the advantage of being type safe, allowing users to set different config settings within meaningful sub-categories (e.g. ``my_config.offline_data(input_=[xyz])``), and offer the ability to construct a Trainer instance from these config objects (via their ``.build()`` method). - So far, this is only supported for some Trainer classes, such as :py:class:`~ray.rllib.algorithms.ppo.ppo.PPO`, + So far, this is only supported for some Trainer classes, such as :py:class:`~ray.rllib.agents.ppo.ppo.PPOTrainer`, but we are rolling this out right now across all RLlib. @@ -335,7 +335,7 @@ You can configure experience output for an agent using the following options: objects, which have the advantage of being type safe, allowing users to set different config settings within meaningful sub-categories (e.g. ``my_config.offline_data(input_=[xyz])``), and offer the ability to construct a Trainer instance from these config objects (via their ``.build()`` method). - So far, this is only supported for some Trainer classes, such as :py:class:`~ray.rllib.algorithms.ppo.ppo.PPO`, + So far, this is only supported for some Trainer classes, such as :py:class:`~ray.rllib.agents.ppo.ppo.PPOTrainer`, but we are rolling this out right now across all RLlib. .. code-block:: python diff --git a/doc/source/rllib/rllib-training.rst b/doc/source/rllib/rllib-training.rst index b75f69edf..280f0fac3 100644 --- a/doc/source/rllib/rllib-training.rst +++ b/doc/source/rllib/rllib-training.rst @@ -164,7 +164,7 @@ Common Parameters objects, which have the advantage of being type safe, allowing users to set different config settings within meaningful sub-categories (e.g. ``my_config.training(lr=0.0003)``), and offer the ability to construct a Trainer instance from these config objects (via their ``build()`` method). - So far, this is only supported for some Trainer classes, such as :py:class:`~ray.rllib.algorithms.ppo.ppo.PPO`, + So far, this is only supported for some Trainer classes, such as :py:class:`~ray.rllib.agents.ppo.ppo.PPOTrainer`, but we are rolling this out right now across all RLlib. The following is a list of the common algorithm hyper-parameters: @@ -705,14 +705,14 @@ Here is an example of the basic usage (for a more complete example, see `custom_ .. code-block:: python import ray - import ray.rllib.algorithms.ppo as ppo + import ray.rllib.agents.ppo as ppo from ray.tune.logger import pretty_print ray.init() config = ppo.DEFAULT_CONFIG.copy() config["num_gpus"] = 0 config["num_workers"] = 1 - trainer = ppo.PPO(config=config, env="CartPole-v0") + trainer = ppo.PPOTrainer(config=config, env="CartPole-v0") # Can optionally call trainer.restore(path) to load a checkpoint. 
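For readers following the basic-usage hunk above: with the reverted `agents` API, that snippet continues into a plain train-and-checkpoint loop. A minimal sketch under that assumption (the iteration count and checkpoint frequency are illustrative, not part of the patch):

```python
# Minimal training loop for the agents-era API shown in the hunk above.
import ray
import ray.rllib.agents.ppo as ppo
from ray.tune.logger import pretty_print

ray.init()
config = ppo.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["num_workers"] = 1
trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")

for i in range(3):  # illustrative number of iterations
    result = trainer.train()
    print(pretty_print(result))
    checkpoint = trainer.save()  # returns the path of the saved checkpoint
    print("checkpoint saved at", checkpoint)
```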
@@ -783,7 +783,7 @@ It also simplifies saving the trained agent. For example: # tune.run() allows setting a custom log directory (other than ``~/ray-results``) # and automatically saving the trained agent analysis = ray.tune.run( - ppo.PPO, + ppo.PPOTrainer, config=config, local_dir=log_dir, stop=stop_criteria, @@ -807,7 +807,7 @@ Loading and restoring a trained agent from a checkpoint is simple: .. code-block:: python - agent = ppo.PPO(config=config, env=env_class) + agent = ppo.PPOTrainer(config=config, env=env_class) agent.restore(checkpoint_path) @@ -1340,10 +1340,10 @@ customizations to your training loop. import ray from ray import tune - from ray.rllib.algorithms.ppo import PPO + from ray.rllib.agents.ppo import PPOTrainer def train(config, reporter): - trainer = PPO(config=config, env=YourEnv) + trainer = PPOTrainer(config=config, env=YourEnv) while True: result = trainer.train() reporter(**result) diff --git a/doc/source/serve/tutorials/rllib.md b/doc/source/serve/tutorials/rllib.md index 2ae8a41c7..cf71f144a 100644 --- a/doc/source/serve/tutorials/rllib.md +++ b/doc/source/serve/tutorials/rllib.md @@ -28,25 +28,22 @@ We will train and checkpoint a simple PPO model with the `CartPole-v0` environme In this tutorial we simply write to local disk, but in production you might want to consider using a cloud storage solution like S3 or a shared file system. -Let's get started by defining a `PPO` instance, training it for one iteration and then creating a checkpoint: +Let's get started by defining a `PPOTrainer` instance, training it for one iteration and then creating a checkpoint: ```{code-cell} python3 :tags: [remove-output] import ray -import ray.rllib.algorithms.ppo as ppo +import ray.rllib.agents.ppo as ppo from ray import serve def train_ppo_model(): - # Configure our PPO algorithm. - config = ppo.PPOConfig()\ - .framework("torch")\ - .rollouts(num_rollout_workers=0) - # Create a `PPO` Trainer instance from the config. - trainer = config.build(env="CartPole-v0") - # Train for one iteration. + trainer = ppo.PPOTrainer( + config={"framework": "torch", "num_workers": 0}, + env="CartPole-v0", + ) + # Train for one iteration trainer.train() - # Save state of the trained Trainer in a checkpoint. trainer.save("/tmp/rllib_checkpoint") return "/tmp/rllib_checkpoint/checkpoint_000001/checkpoint-1" @@ -57,7 +54,7 @@ checkpoint_path = train_ppo_model() You create deployments with Ray Serve by using the `@serve.deployment` on a class that implements two methods: - The `__init__` call creates the deployment instance and loads your data once. - In the below example we restore our `PPO` Trainer from the checkpoint we just created. + In the below example we restore our `PPOTrainer` from the checkpoint we just created. - The `__call__` method will be invoked every request. For each incoming request, this method has access to a `request` object, which is a [Starlette Request](https://www.starlette.io/requests/). 
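To make the `__call__` contract above concrete, here is a hedged client-side sketch. It assumes the deployment from this tutorial is running locally under the `/cartpole-ppo` route and that the handler reads a JSON `"observation"` field and replies with an action; the handler body sits outside this hunk, so the exact request/response schema is an assumption.

```python
# Query the served PPO policy with a single CartPole observation.
import gym
import requests

env = gym.make("CartPole-v0")
obs = env.reset()

resp = requests.get(
    "http://localhost:8000/cartpole-ppo",  # route_prefix of the deployment
    json={"observation": obs.tolist()},    # assumed request schema
)
print("response from the served policy:", resp.json())
```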
@@ -75,10 +72,13 @@ from starlette.requests import Request @serve.deployment(route_prefix="/cartpole-ppo") class ServePPOModel: def __init__(self, checkpoint_path) -> None: - config = ppo.PPOConfig()\ - .framework("torch")\ - .rollouts(num_rollout_workers=0) - self.trainer = config.build(env="CartPole-v0") + self.trainer = ppo.PPOTrainer( + config={ + "framework": "torch", + "num_workers": 0, + }, + env="CartPole-v0", + ) self.trainer.restore(checkpoint_path) async def __call__(self, request: Request): diff --git a/python/ray/tests/test_usage_stats.py b/python/ray/tests/test_usage_stats.py index 5c2a68e03..49cfc6cce 100644 --- a/python/ray/tests/test_usage_stats.py +++ b/python/ray/tests/test_usage_stats.py @@ -595,7 +595,7 @@ provider: ray_usage_lib._recorded_library_usages.clear() if os.environ.get("RAY_MINIMAL") != "1": from ray import tune # noqa: F401 - from ray.rllib.algorithms.ppo import PPO # noqa: F401 + from ray.rllib.agents.ppo import PPOTrainer # noqa: F401 from ray import train # noqa: F401 ray.init(address=cluster.address) diff --git a/python/ray/tune/tests/test_integration_wandb.py b/python/ray/tune/tests/test_integration_wandb.py index e3855f329..da0cc98a3 100644 --- a/python/ray/tune/tests/test_integration_wandb.py +++ b/python/ray/tune/tests/test_integration_wandb.py @@ -351,12 +351,12 @@ class WandbIntegrationTest(unittest.TestCase): """Test compatibility with RLlib configuration dicts""" # Local import to avoid tune dependency on rllib try: - from ray.rllib.algorithms.ppo import PPO + from ray.rllib.agents.ppo import PPOTrainer except ImportError: self.skipTest("ray[rllib] not available") return - class WandbPPOTrainer(_MockWandbTrainableMixin, PPO): + class WandbPPOTrainer(_MockWandbTrainableMixin, PPOTrainer): pass config = { diff --git a/release/tune_tests/cloud_tests/workloads/_tune_script.py b/release/tune_tests/cloud_tests/workloads/_tune_script.py index c07c6f1e6..837dfe95e 100644 --- a/release/tune_tests/cloud_tests/workloads/_tune_script.py +++ b/release/tune_tests/cloud_tests/workloads/_tune_script.py @@ -8,7 +8,7 @@ import time import ray from ray import tune from ray.rllib.agents import DefaultCallbacks -from ray.rllib.algorithms.ppo import PPO +from ray.rllib.agents.ppo import PPOTrainer def fn_trainable(config, checkpoint_dir=None): @@ -68,7 +68,7 @@ def run_tune( if trainable == "rllib_str": train = "PPO" else: - train = PPO + train = PPOTrainer config = { "env": "CartPole-v1", diff --git a/rllib/BUILD b/rllib/BUILD index ac9a4b390..eb6d7a26c 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -16,7 +16,7 @@ # -- "fake_gpus": Tests that run using 2 fake GPUs. # - Quick agent compilation/tune-train tests, tagged "quick_train". -# NOTE: These should be obsoleted in favor of "algorithms_dir" tests as +# NOTE: These should be obsoleted in favor of "trainers_dir" tests as # they cover the same functionaliy. # - Folder-bound tests, tagged with the name of the top-level dir: @@ -27,7 +27,7 @@ # - `policy` directory tests. # - `utils` directory tests. -# - Trainer ("agents") tests, tagged "algorithms_dir". +# - Trainer ("agents") tests, tagged "trainers_dir". 
# - Tests directory (everything in rllib/tests/...), tagged: "tests_dir" and # "tests_dir_[A-Z]" @@ -168,8 +168,8 @@ py_test( tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete"], size = "large", srcs = ["tests/run_regression_tests.py"], - data = ["tuned_examples/appo/cartpole-appo.yaml"], - args = ["--yaml-dir=tuned_examples/appo"] + data = ["tuned_examples/ppo/cartpole-appo.yaml"], + args = ["--yaml-dir=tuned_examples/ppo"] ) # py_test( @@ -178,8 +178,8 @@ py_test( # tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete"], # size = "large", # srcs = ["tests/run_regression_tests.py"], -# data = ["tuned_examples/appo/cartpole-appo-vtrace.yaml"], -# args = ["--yaml-dir=tuned_examples/appo"] +# data = ["tuned_examples/ppo/cartpole-appo-vtrace.yaml"], +# args = ["--yaml-dir=tuned_examples/ppo"] # ) py_test( @@ -189,9 +189,9 @@ py_test( size = "large", srcs = ["tests/run_regression_tests.py"], data = [ - "tuned_examples/appo/cartpole-appo-vtrace-separate-losses.yaml" + "tuned_examples/ppo/cartpole-appo-vtrace-separate-losses.yaml" ], - args = ["--yaml-dir=tuned_examples/appo"] + args = ["--yaml-dir=tuned_examples/ppo"] ) # py_test( @@ -200,8 +200,8 @@ py_test( # tags = ["team:ml", "learning_tests", "learning_tests_discrete"], # size = "large", # srcs = ["tests/run_regression_tests.py"], -# data = ["tuned_examples/appo/frozenlake-appo-vtrace.yaml"], -# args = ["--yaml-dir=tuned_examples/appo"] +# data = ["tuned_examples/ppo/frozenlake-appo-vtrace.yaml"], +# args = ["--yaml-dir=tuned_examples/ppo"] # ) py_test( @@ -210,8 +210,8 @@ py_test( tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"], size = "large", srcs = ["tests/run_regression_tests.py"], - data = ["tuned_examples/appo/cartpole-appo-vtrace-fake-gpus.yaml"], - args = ["--yaml-dir=tuned_examples/appo"] + data = ["tuned_examples/ppo/cartpole-appo-vtrace-fake-gpus.yaml"], + args = ["--yaml-dir=tuned_examples/ppo"] ) # ARS @@ -268,8 +268,8 @@ py_test( tags = ["team:ml", "torch_only", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete"], size = "large", srcs = ["tests/run_regression_tests.py"], - data = glob(["tuned_examples/ddppo/cartpole-ddppo.yaml"]), - args = ["--yaml-dir=tuned_examples/ddppo"] + data = glob(["tuned_examples/ppo/cartpole-ddppo.yaml"]), + args = ["--yaml-dir=tuned_examples/ppo"] ) py_test( @@ -278,8 +278,8 @@ py_test( tags = ["team:ml", "torch_only", "learning_tests", "learning_tests_pendulum", "learning_tests_continuous"], size = "large", srcs = ["tests/run_regression_tests.py"], - data = glob(["tuned_examples/ddppo/pendulum-ddppo.yaml"]), - args = ["--yaml-dir=tuned_examples/ddppo"] + data = glob(["tuned_examples/ppo/pendulum-ddppo.yaml"]), + args = ["--yaml-dir=tuned_examples/ppo"] ) # DQN @@ -637,13 +637,13 @@ py_test( # Agents (Compilation, Losses, simple agent functionality tests) # rllib/agents/ # -# Tag: algorithms_dir +# Tag: trainers_dir # -------------------------------------------------------------------- # Generic (all Trainers) py_test( name = "test_callbacks", - tags = ["team:ml", "algorithms_dir", "algorithms_dir_generic"], + tags = ["team:ml", "trainers_dir", "trainers_dir_generic"], size = "medium", srcs = ["agents/tests/test_callbacks.py"] ) @@ -651,31 +651,31 @@ py_test( py_test( name = "test_memory_leaks_generic", main = "agents/tests/test_memory_leaks.py", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "large", srcs = 
["agents/tests/test_memory_leaks.py"] ) py_test( name = "test_trainer", - tags = ["team:ml", "algorithms_dir", "algorithms_dir_generic"], + tags = ["team:ml", "trainers_dir", "trainers_dir_generic"], size = "large", srcs = ["agents/tests/test_trainer.py"] ) py_test( name = "tests/test_worker_failures", - tags = ["team:ml", "tests_dir", "algorithms_dir_generic"], + tags = ["team:ml", "tests_dir", "trainers_dir_generic"], size = "large", srcs = ["agents/tests/test_worker_failures.py"] ) -# Specific Algorithms +# Specific Trainers (Algorithms) # A2C py_test( name = "test_a2c", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "large", srcs = ["algorithms/a2c/tests/test_a2c.py"] ) @@ -683,7 +683,7 @@ py_test( # A3C py_test( name = "test_a3c", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "large", srcs = ["algorithms/a3c/tests/test_a3c.py"] ) @@ -691,7 +691,7 @@ py_test( # AlphaStar py_test( name = "test_alpha_star", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "large", srcs = ["algorithms/alpha_star/tests/test_alpha_star.py"] ) @@ -699,39 +699,31 @@ py_test( # AlphaZero py_test( name = "test_alpha_zero", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "large", srcs = ["algorithms/alpha_zero/tests/test_alpha_zero.py"] ) -# APEX-DQN +# APEXTrainer (DQN) py_test( name = "test_apex_dqn", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "large", srcs = ["agents/dqn/tests/test_apex_dqn.py"] ) -# APEX-DDPG +# APEXDDPGTrainer py_test( name = "test_apex_ddpg", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "medium", srcs = ["algorithms/ddpg/tests/test_apex_ddpg.py"] ) -# APPO -py_test( - name = "test_appo", - tags = ["team:ml", "algorithms_dir"], - size = "large", - srcs = ["algorithms/appo/tests/test_appo.py"] -) - # ARS py_test( name = "test_ars", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "medium", srcs = ["algorithms/ars/tests/test_ars.py"] ) @@ -739,39 +731,31 @@ py_test( # Bandits py_test( name = "test_bandits", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "medium", srcs = ["algorithms/bandit/tests/test_bandits.py"], ) -# CQL +# CQLTrainer py_test( name = "test_cql", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "medium", srcs = ["algorithms/cql/tests/test_cql.py"] ) -# DDPG +# DDPGTrainer py_test( name = "test_ddpg", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "large", srcs = ["algorithms/ddpg/tests/test_ddpg.py"] ) -# DDPPO -py_test( - name = "test_ddppo", - tags = ["team:ml", "algorithms_dir"], - size = "medium", - srcs = ["algorithms/ddppo/tests/test_ddppo.py"] -) - -# DQN +# DQNTrainer py_test( name = "test_dqn", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "large", srcs = ["algorithms/dqn/tests/test_dqn.py"] ) @@ -779,7 +763,7 @@ py_test( # Dreamer py_test( name = "test_dreamer", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "medium", srcs = ["algorithms/dreamer/tests/test_dreamer.py"] ) @@ -787,137 +771,153 @@ py_test( # ES py_test( name = "test_es", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "medium", srcs = ["algorithms/es/tests/test_es.py"] ) -# Impala +# IMPALA py_test( name = 
"test_impala", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "large", - srcs = ["algorithms/impala/tests/test_impala.py"] + srcs = ["agents/impala/tests/test_impala.py"] ) py_test( name = "test_vtrace", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "small", - srcs = ["algorithms/impala/tests/test_vtrace.py"] + srcs = ["agents/impala/tests/test_vtrace.py"] ) -# MARWIL +# MARWILTrainer py_test( name = "test_marwil", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "large", # Include the json data file. data = ["tests/data/cartpole/large.json"], srcs = ["algorithms/marwil/tests/test_marwil.py"] ) -# BC (sub-type of MARWIL) +# BCTrainer (sub-type of MARWIL) py_test( name = "test_bc", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "large", # Include the json data file. data = ["tests/data/cartpole/large.json"], srcs = ["algorithms/marwil/tests/test_bc.py"] ) -# MADDPG +# MADDPGTrainer py_test( name = "test_maddpg", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "medium", srcs = ["algorithms/maddpg/tests/test_maddpg.py"] ) -# MAML +# MAMLTrainer py_test( name = "test_maml", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "medium", srcs = ["algorithms/maml/tests/test_maml.py"] ) -# MBMPO +# MBMPOTrainer py_test( name = "test_mbmpo", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "medium", srcs = ["algorithms/mbmpo/tests/test_mbmpo.py"] ) -# PG +# PGTrainer py_test( name = "test_pg", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "large", srcs = ["algorithms/pg/tests/test_pg.py"] ) -# PPO +# PPOTrainer py_test( name = "test_ppo", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "large", - srcs = ["algorithms/ppo/tests/test_ppo.py"] + srcs = ["agents/ppo/tests/test_ppo.py"] ) -# QMix +# PPO: DDPPO +py_test( + name = "test_ddppo", + tags = ["team:ml", "trainers_dir"], + size = "medium", + srcs = ["agents/ppo/tests/test_ddppo.py"] +) + +# PPO: APPO +py_test( + name = "test_appo", + tags = ["team:ml", "trainers_dir"], + size = "large", + srcs = ["agents/ppo/tests/test_appo.py"] +) + +# QMixTrainer py_test( name = "test_qmix", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "medium", srcs = ["algorithms/qmix/tests/test_qmix.py"] ) -# R2D2 +# R2D2Trainer py_test( name = "test_r2d2", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "large", srcs = ["agents/dqn/tests/test_r2d2.py"] ) -# RNNSAC +# RNNSACTrainer py_test( name = "test_rnnsac", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "medium", srcs = ["algorithms/sac/tests/test_rnnsac.py"] ) -# SAC +# SACTrainer py_test( name = "test_sac", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "large", srcs = ["algorithms/sac/tests/test_sac.py"] ) -# SimpleQ +# SimpleQTrainer py_test( name = "test_simple_q", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "medium", srcs = ["algorithms/dqn/tests/test_simple_q.py"] ) -# SlateQ +# SlateQTrainer py_test( name = "test_slateq", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "medium", srcs = ["algorithms/slateq/tests/test_slateq.py"] ) -# TD3 +# TD3Trainer py_test( 
name = "test_td3", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], size = "large", srcs = ["algorithms/ddpg/tests/test_td3.py"] ) @@ -928,7 +928,7 @@ py_test( py_test( name = "random_agent", - tags = ["team:ml", "algorithms_dir"], + tags = ["team:ml", "trainers_dir"], main = "contrib/random_agent/random_agent.py", size = "small", srcs = ["contrib/random_agent/random_agent.py"] @@ -957,8 +957,8 @@ py_test( main = "utils/tests/run_memory_leak_tests.py", size = "large", srcs = ["utils/tests/run_memory_leak_tests.py"], - data = ["tuned_examples/appo/memory-leak-test-appo.yaml"], - args = ["--yaml-dir=tuned_examples/appo"] + data = ["tuned_examples/ppo/memory-leak-test-appo.yaml"], + args = ["--yaml-dir=tuned_examples/ppo"] ) py_test( diff --git a/rllib/README.rst b/rllib/README.rst index 791c4f524..32acd1867 100644 --- a/rllib/README.rst +++ b/rllib/README.rst @@ -123,7 +123,7 @@ Quick First Experiment .. code-block:: python import gym - from ray.rllib.algorithms.ppo import PPO + from ray.rllib.agents.ppo import PPOTrainer # Define your problem using python and openAI's gym API: @@ -176,7 +176,7 @@ Quick First Experiment # Create an RLlib Trainer instance to learn how to act in the above # environment. - trainer = PPO( + trainer = PPOTrainer( config={ # Env class to use (here: our gym.Env sub-class from above). "env": ParrotEnv, diff --git a/rllib/agents/dqn/__init__.py b/rllib/agents/dqn/__init__.py index 0268b2f86..9b5097bec 100644 --- a/rllib/agents/dqn/__init__.py +++ b/rllib/agents/dqn/__init__.py @@ -1,7 +1,8 @@ +from ray.rllib.agents.dqn.apex import ApexConfig, ApexTrainer, APEX_DEFAULT_CONFIG from ray.rllib.algorithms.dqn.dqn import DQNConfig, DQNTrainer, DEFAULT_CONFIG from ray.rllib.algorithms.dqn.dqn_tf_policy import DQNTFPolicy from ray.rllib.algorithms.dqn.dqn_torch_policy import DQNTorchPolicy -from ray.rllib.agents.dqn.r2d2 import R2D2Config, R2D2Trainer, R2D2_DEFAULT_CONFIG +from ray.rllib.agents.dqn.r2d2 import R2D2Trainer, R2D2_DEFAULT_CONFIG from ray.rllib.agents.dqn.r2d2_torch_policy import R2D2TorchPolicy from ray.rllib.algorithms.dqn.simple_q import ( SimpleQConfig, @@ -10,7 +11,6 @@ from ray.rllib.algorithms.dqn.simple_q import ( ) from ray.rllib.algorithms.dqn.simple_q_tf_policy import SimpleQTFPolicy from ray.rllib.algorithms.dqn.simple_q_torch_policy import SimpleQTorchPolicy -from ray.rllib.agents.dqn.apex import ApexConfig, ApexTrainer, APEX_DEFAULT_CONFIG __all__ = [ "ApexConfig", @@ -19,7 +19,6 @@ __all__ = [ "DQNTFPolicy", "DQNTorchPolicy", "DQNTrainer", - "R2D2Config", "R2D2TorchPolicy", "R2D2Trainer", "SimpleQConfig", diff --git a/rllib/agents/impala/__init__.py b/rllib/agents/impala/__init__.py index be3ab20df..07c4f39ab 100644 --- a/rllib/agents/impala/__init__.py +++ b/rllib/agents/impala/__init__.py @@ -1,17 +1,7 @@ -from ray.rllib.algorithms.impala.impala import ( - DEFAULT_CONFIG, - ImpalaConfig, - Impala as ImpalaTrainer, -) -from ray.rllib.utils.deprecation import deprecation_warning - +from ray.rllib.agents.impala.impala import DEFAULT_CONFIG, ImpalaConfig, ImpalaTrainer __all__ = [ "ImpalaConfig", "ImpalaTrainer", "DEFAULT_CONFIG", ] - -deprecation_warning( - "ray.rllib.agents.impala", "ray.rllib.algorithms.impala", error=False -) diff --git a/rllib/algorithms/impala/impala.py b/rllib/agents/impala/impala.py similarity index 97% rename from rllib/algorithms/impala/impala.py rename to rllib/agents/impala/impala.py index 56b1d9bab..ec3d4796c 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/agents/impala/impala.py 
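Before the `impala.py` hunks below, it may help to see the import path this revert restores. A small sketch assuming the exports listed in `rllib/agents/impala/__init__.py` above; the hyperparameters mirror the `ImpalaConfig` docstring further down, scaled down so the example runs on a single machine:

```python
# Build an ImpalaTrainer through the reverted agents-level config API.
from ray.rllib.agents.impala import ImpalaConfig

config = (
    ImpalaConfig()
    .training(lr=0.0003, train_batch_size=512)
    .resources(num_gpus=0)            # docstring example uses 4 GPUs
    .rollouts(num_rollout_workers=2)  # docstring example uses 64 workers
)
trainer = config.build(env="CartPole-v1")
print(trainer.train())
```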
@@ -56,10 +56,10 @@ logger = logging.getLogger(__name__) class ImpalaConfig(TrainerConfig): - """Defines a configuration class from which an Impala can be built. + """Defines a configuration class from which an ImpalaTrainer can be built. Example: - >>> from ray.rllib.algorithms.impala import ImpalaConfig + >>> from ray.rllib.agents.impala import ImpalaConfig >>> config = ImpalaConfig().training(lr=0.0003, train_batch_size=512)\ ... .resources(num_gpus=4)\ ... .rollouts(num_rollout_workers=64) @@ -69,7 +69,7 @@ class ImpalaConfig(TrainerConfig): >>> trainer.train() Example: - >>> from ray.rllib.algorithms.impala import ImpalaConfig + >>> from ray.rllib.agents.impala import ImpalaConfig >>> from ray import tune >>> config = ImpalaConfig() >>> # Print out some default values. @@ -89,7 +89,7 @@ class ImpalaConfig(TrainerConfig): def __init__(self, trainer_class=None): """Initializes a ImpalaConfig instance.""" - super().__init__(trainer_class=trainer_class or Impala) + super().__init__(trainer_class=trainer_class or ImpalaTrainer) # fmt: off # __sphinx_doc_begin__ @@ -433,7 +433,7 @@ class BroadcastUpdateLearnerWeights: self.workers.local_worker().set_global_vars(_get_global_vars()) -class Impala(Trainer): +class ImpalaTrainer(Trainer): """Importance weighted actor/learner architecture (IMPALA) Trainer == Overview of data flow in IMPALA == @@ -458,31 +458,31 @@ class Impala(Trainer): ) -> Optional[Type[Policy]]: if config["framework"] == "torch": if config["vtrace"]: - from ray.rllib.algorithms.impala.impala_torch_policy import ( - ImpalaTorchPolicy, + from ray.rllib.agents.impala.vtrace_torch_policy import ( + VTraceTorchPolicy, ) - return ImpalaTorchPolicy + return VTraceTorchPolicy else: from ray.rllib.algorithms.a3c.a3c_torch_policy import A3CTorchPolicy return A3CTorchPolicy elif config["framework"] == "tf": if config["vtrace"]: - from ray.rllib.algorithms.impala.impala_tf_policy import ( - ImpalaTF1Policy, + from ray.rllib.agents.impala.vtrace_tf_policy import ( + VTraceStaticGraphTFPolicy, ) - return ImpalaTF1Policy + return VTraceStaticGraphTFPolicy else: from ray.rllib.algorithms.a3c.a3c_tf_policy import A3CTFPolicy return A3CTFPolicy else: if config["vtrace"]: - from ray.rllib.algorithms.impala.impala_tf_policy import ImpalaTF2Policy + from ray.rllib.agents.impala.vtrace_tf_policy import VTraceEagerTFPolicy - return ImpalaTF2Policy + return VTraceEagerTFPolicy else: from ray.rllib.algorithms.a3c.a3c_tf_policy import A3CTFPolicy @@ -940,14 +940,14 @@ class AggregatorWorker: return platform.node() -# Deprecated: Use ray.rllib.algorithms.impala.ImpalaConfig instead! +# Deprecated: Use ray.rllib.agents.pg.PGConfig instead! 
class _deprecated_default_config(dict): def __init__(self): super().__init__(ImpalaConfig().to_dict()) @Deprecated( - old="ray.rllib.agents.impala.impala::DEFAULT_CONFIG", - new="ray.rllib.algorithms.impala.impala::IMPALAConfig(...)", + old="ray.rllib.agents.impala.default_config::DEFAULT_CONFIG", + new="ray.rllib.agents.impala.impala.IMPALAConfig(...)", error=False, ) def __getitem__(self, item): diff --git a/rllib/algorithms/impala/tests/test_impala.py b/rllib/agents/impala/tests/test_impala.py similarity index 96% rename from rllib/algorithms/impala/tests/test_impala.py rename to rllib/agents/impala/tests/test_impala.py index bb63791d7..8a440f1af 100644 --- a/rllib/algorithms/impala/tests/test_impala.py +++ b/rllib/agents/impala/tests/test_impala.py @@ -1,7 +1,7 @@ import unittest import ray -import ray.rllib.algorithms.impala as impala +import ray.rllib.agents.impala as impala from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY @@ -25,7 +25,7 @@ class TestIMPALA(unittest.TestCase): ray.shutdown() def test_impala_compilation(self): - """Test whether Impala can be built with both frameworks.""" + """Test whether an ImpalaTrainer can be built with both frameworks.""" config = ( impala.ImpalaConfig() .resources(num_gpus=0) diff --git a/rllib/algorithms/impala/tests/test_vtrace.py b/rllib/agents/impala/tests/test_vtrace.py similarity index 99% rename from rllib/algorithms/impala/tests/test_vtrace.py rename to rllib/agents/impala/tests/test_vtrace.py index 83a50767d..e20e6d74e 100644 --- a/rllib/algorithms/impala/tests/test_vtrace.py +++ b/rllib/agents/impala/tests/test_vtrace.py @@ -24,8 +24,8 @@ from gym.spaces import Box import numpy as np import unittest -from ray.rllib.algorithms.impala import vtrace_tf as vtrace_tf -from ray.rllib.algorithms.impala import vtrace_torch as vtrace_torch +from ray.rllib.agents.impala import vtrace_tf as vtrace_tf +from ray.rllib.agents.impala import vtrace_torch as vtrace_torch from ray.rllib.utils.framework import try_import_tf, try_import_torch from ray.rllib.utils.numpy import softmax from ray.rllib.utils.test_utils import check, framework_iterator diff --git a/rllib/algorithms/impala/vtrace_tf.py b/rllib/agents/impala/vtrace_tf.py similarity index 100% rename from rllib/algorithms/impala/vtrace_tf.py rename to rllib/agents/impala/vtrace_tf.py diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/agents/impala/vtrace_tf_policy.py similarity index 96% rename from rllib/algorithms/impala/impala_tf_policy.py rename to rllib/agents/impala/vtrace_tf_policy.py index 475baac92..4a8d1281c 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/agents/impala/vtrace_tf_policy.py @@ -8,7 +8,7 @@ import gym from typing import Dict, List, Type, Union import ray -from ray.rllib.algorithms.impala import vtrace_tf as vtrace +from ray.rllib.agents.impala import vtrace_tf as vtrace from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.tf.tf_action_dist import Categorical, TFActionDistribution from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2 @@ -23,7 +23,6 @@ from ray.rllib.utils.typing import ( LocalOptimizer, ModelGradients, TensorType, - TFPolicyV2Type, ) tf1, tf, tfv = try_import_tf() @@ -219,7 +218,7 @@ class VTraceOptimizer: pass # TODO: maybe standardize this function, so the choice of optimizers are more - # predictable for common algorithms. 
+ # predictable for common agents. def optimizer( self, ) -> Union["tf.keras.optimizers.Optimizer", List["tf.keras.optimizers.Optimizer"]]: @@ -254,19 +253,19 @@ class VTraceOptimizer: # We need this builder function because we want to share the same # custom logics between TF1 dynamic and TF2 eager policies. -def get_impala_tf_policy(base: TFPolicyV2Type) -> TFPolicyV2Type: - """Construct an ImpalaTFPolicy inheriting either dynamic or eager base policies. +def get_vtrace_tf_policy(base: type) -> type: + """Construct an VTraceTFPolicy inheriting either dynamic or eager base policies. Args: base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2. Returns: - A TF Policy to be used with Impala. + A TF Policy to be used with ImpalaTrainer. """ # VTrace mixins are placed in front of more general mixins to make sure # their functions like optimizer() overrides all the other implementations # (e.g., LearningRateSchedule.optimizer()) - class ImpalaTFPolicy( + class VTraceTFPolicy( VTraceClipGradients, VTraceOptimizer, LearningRateSchedule, @@ -284,9 +283,7 @@ def get_impala_tf_policy(base: TFPolicyV2Type) -> TFPolicyV2Type: # First thing first, enable eager execution if necessary. base.enable_eager_execution_if_necessary() - config = dict( - ray.rllib.algorithms.impala.impala.ImpalaConfig().to_dict(), **config - ) + config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config) # Initialize base class. base.__init__( @@ -437,8 +434,8 @@ def get_impala_tf_policy(base: TFPolicyV2Type) -> TFPolicyV2Type: def get_batch_divisibility_req(self) -> int: return self.config["rollout_fragment_length"] - return ImpalaTFPolicy + return VTraceTFPolicy -ImpalaTF1Policy = get_impala_tf_policy(DynamicTFPolicyV2) -ImpalaTF2Policy = get_impala_tf_policy(EagerTFPolicyV2) +VTraceStaticGraphTFPolicy = get_vtrace_tf_policy(DynamicTFPolicyV2) +VTraceEagerTFPolicy = get_vtrace_tf_policy(EagerTFPolicyV2) diff --git a/rllib/algorithms/impala/vtrace_torch.py b/rllib/agents/impala/vtrace_torch.py similarity index 99% rename from rllib/algorithms/impala/vtrace_torch.py rename to rllib/agents/impala/vtrace_torch.py index e5d23fbb1..f92d141fd 100644 --- a/rllib/algorithms/impala/vtrace_torch.py +++ b/rllib/agents/impala/vtrace_torch.py @@ -29,7 +29,7 @@ multi_from_logits method accepts lists of tensors instead of just tensors. 
""" -from ray.rllib.algorithms.impala.vtrace_tf import VTraceFromLogitsReturns, VTraceReturns +from ray.rllib.agents.impala.vtrace_tf import VTraceFromLogitsReturns, VTraceReturns from ray.rllib.models.torch.torch_action_dist import TorchCategorical from ray.rllib.utils import force_list from ray.rllib.utils.framework import try_import_torch diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/agents/impala/vtrace_torch_policy.py similarity index 98% rename from rllib/algorithms/impala/impala_torch_policy.py rename to rllib/agents/impala/vtrace_torch_policy.py index 52e46b6fe..cdb7ad749 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/agents/impala/vtrace_torch_policy.py @@ -4,7 +4,7 @@ import numpy as np from typing import Dict, List, Type, Union import ray -import ray.rllib.algorithms.impala.vtrace_torch as vtrace +import ray.rllib.agents.impala.vtrace_torch as vtrace from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.action_dist import ActionDistribution from ray.rllib.models.torch.torch_action_dist import TorchCategorical @@ -188,18 +188,16 @@ class VTraceOptimizer: # VTrace mixins are placed in front of more general mixins to make sure # their functions like optimizer() overrides all the other implementations # (e.g., LearningRateSchedule.optimizer()) -class ImpalaTorchPolicy( +class VTraceTorchPolicy( VTraceOptimizer, LearningRateSchedule, EntropyCoeffSchedule, TorchPolicyV2, ): - """PyTorch policy class used with Impala.""" + """PyTorch policy class used with ImpalaTrainer.""" def __init__(self, observation_space, action_space, config): - config = dict( - ray.rllib.algorithms.impala.impala.ImpalaConfig().to_dict(), **config - ) + config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config) VTraceOptimizer.__init__(self) # Need to initialize learning rate variable before calling diff --git a/rllib/agents/ppo/README.md b/rllib/agents/ppo/README.md new file mode 100644 index 000000000..4095f3550 --- /dev/null +++ b/rllib/agents/ppo/README.md @@ -0,0 +1,37 @@ +# Proximal Policy Optimization (PPO) + +## Overview + +[PPO](https://arxiv.org/abs/1707.06347) is a model-free on-policy RL algorithm that works well for both discrete and continuous action space environments. PPO utilizes an actor-critic framework, where there are two networks, an actor (policy network) and critic network (value function). + +There are two formulations of PPO, which are both implemented in RLlib. The first formulation of PPO imitates the prior paper [TRPO](https://arxiv.org/abs/1502.05477) without the complexity of second-order optimization. In this formulation, for every iteration, an old version of an actor-network is saved and the agent seeks to optimize the RL objective while staying close to the old policy. This makes sure that the agent does not destabilize during training. In the second formulation, To mitigate destructive large policy updates, an issue discovered for vanilla policy gradient methods, PPO introduces the surrogate objective, which clips large action probability ratios between the current and old policy. Clipping has been shown in the paper to significantly improve training stability and speed. + +## Distributed PPO Algorithms + +PPO is a core algorithm in RLlib due to its ability to scale well with the number of nodes. In RLlib, we provide various implementation of distributed PPO, with different underlying execution plans, as shown below. + +Distributed baseline PPO is a synchronous distributed RL algorithm. 
Data collection nodes, which represent the old policy, gather data synchronously to create a large pool of on-policy data from which the agent performs minibatch gradient descent on. + +On the other hand, Asychronous PPO (APPO) opts to imitate IMPALA as its distributed execution plan. Data collection nodes gather data asynchronously, which are collected in a circular replay buffer. A target network and doubly-importance sampled surrogate objective is introduced to enforce training stability in the asynchronous data-collection setting. + +Lastly, Decentralized Distributed PPO (DDPPO) removes the assumption that gradient-updates must be done on a central node. Instead, gradients are computed remotely on each data collection node and all-reduced at each mini-batch using torch distributed. This allows each worker’s GPU to be used both for sampling and for training. + +## Documentation & Implementation: + +1) Proximal Policy Optimization (PPO). + + **[Detailed Documentation](https://docs.ray.io/en/master/rllib-algorithms.html#ppo)** + + **[Implementation](https://github.com/ray-project/ray/blob/master/rllib/agents/ppo/ppo.py)** + +2) [Asynchronous Proximal Policy Optimization (APPO)](https://arxiv.org/abs/1912.00167). + + **[Detailed Documentation](https://docs.ray.io/en/master/rllib-algorithms.html#appo)** + + **[Implementation](https://github.com/ray-project/ray/blob/master/rllib/agents/ppo/appo.py)** + +3) [Decentralized Distributed Proximal Policy Optimization (DDPPO)](https://arxiv.org/abs/1911.00357) + + **[Detailed Documentation](https://docs.ray.io/en/master/rllib-algorithms.html#decentralized-distributed-proximal-policy-optimization-dd-ppo)** + + **[Implementation](https://github.com/ray-project/ray/blob/master/rllib/agents/ppo/ddppo.py)** diff --git a/rllib/agents/ppo/__init__.py b/rllib/agents/ppo/__init__.py index c1e8bedda..dd998c21d 100644 --- a/rllib/agents/ppo/__init__.py +++ b/rllib/agents/ppo/__init__.py @@ -1,23 +1,18 @@ -from ray.rllib.algorithms.ppo.ppo import PPOConfig, PPO as PPOTrainer, DEFAULT_CONFIG -from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy, PPOTF2Policy -from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy -from ray.rllib.algorithms.appo.appo import APPOConfig, APPO as APPOTrainer -from ray.rllib.algorithms.appo.appo_tf_policy import APPOTF1Policy, APPOTF2Policy -from ray.rllib.algorithms.appo.appo_torch_policy import APPOTorchPolicy -from ray.rllib.algorithms.ddppo.ddppo import DDPPOConfig, DDPPO as DDPPOTrainer +from ray.rllib.agents.ppo.ppo import PPOConfig, PPOTrainer, DEFAULT_CONFIG +from ray.rllib.agents.ppo.ppo_tf_policy import PPOStaticGraphTFPolicy, PPOEagerTFPolicy +from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy +from ray.rllib.agents.ppo.appo import APPOConfig, APPOTrainer +from ray.rllib.agents.ppo.ddppo import DDPPOConfig, DDPPOTrainer __all__ = [ "APPOConfig", - "APPOTF1Policy", - "APPOTF2Policy", - "APPOTorchPolicy", "APPOTrainer", "DDPPOConfig", "DDPPOTrainer", "DEFAULT_CONFIG", "PPOConfig", - "PPOTF1Policy", - "PPOTF2Policy", + "PPOStaticGraphTFPolicy", + "PPOEagerTFPolicy", "PPOTorchPolicy", "PPOTrainer", ] diff --git a/rllib/algorithms/appo/appo.py b/rllib/agents/ppo/appo.py similarity index 90% rename from rllib/algorithms/appo/appo.py rename to rllib/agents/ppo/appo.py index f2ca5f02a..f17be9875 100644 --- a/rllib/algorithms/appo/appo.py +++ b/rllib/agents/ppo/appo.py @@ -13,7 +13,7 @@ from typing import Optional, Type import logging from ray.rllib.utils.metrics.learner_info import 
LEARNER_STATS_KEY -from ray.rllib.algorithms.impala import Impala, ImpalaConfig +from ray.rllib.agents.impala import ImpalaTrainer, ImpalaConfig from ray.rllib.policy.policy import Policy from ray.rllib.utils.annotations import override from ray.rllib.utils.deprecation import Deprecated @@ -33,10 +33,10 @@ logger = logging.getLogger(__name__) class APPOConfig(ImpalaConfig): - """Defines a configuration class from which an APPO Trainer can be built. + """Defines a APPOTrainer configuration class from which a new Trainer can be built. Example: - >>> from ray.rllib.algorithms.appo import APPOConfig + >>> from ray.rllib.agents.ppo import APPOConfig >>> config = APPOConfig().training(lr=0.01, grad_clip=30.0)\ ... .resources(num_gpus=1)\ ... .rollouts(num_rollout_workers=16) @@ -46,7 +46,7 @@ class APPOConfig(ImpalaConfig): >>> trainer.train() Example: - >>> from ray.rllib.algorithms.appo import APPOConfig + >>> from ray.rllib.agents.ppo import APPOConfig >>> from ray import tune >>> config = APPOConfig() >>> # Print out some default values. @@ -66,7 +66,7 @@ class APPOConfig(ImpalaConfig): def __init__(self, trainer_class=None): """Initializes a APPOConfig instance.""" - super().__init__(trainer_class=trainer_class or APPO) + super().__init__(trainer_class=trainer_class or APPOTrainer) # fmt: off # __sphinx_doc_begin__ @@ -166,9 +166,8 @@ class APPOConfig(ImpalaConfig): return self -class APPO(Impala): +class APPOTrainer(ImpalaTrainer): def __init__(self, config, *args, **kwargs): - """Initializes a DDPPO instance.""" super().__init__(config, *args, **kwargs) # After init: Initialize target net. @@ -227,7 +226,7 @@ class APPO(Impala): # Worker. self.workers.local_worker().foreach_policy_to_train(update) - @override(Impala) + @override(ImpalaTrainer) def training_iteration(self) -> ResultDict: train_results = super().training_iteration() @@ -237,36 +236,36 @@ class APPO(Impala): return train_results @classmethod - @override(Impala) + @override(ImpalaTrainer) def get_default_config(cls) -> TrainerConfigDict: return APPOConfig().to_dict() - @override(Impala) + @override(ImpalaTrainer) def get_default_policy_class( self, config: PartialTrainerConfigDict ) -> Optional[Type[Policy]]: if config["framework"] == "torch": - from ray.rllib.algorithms.appo.appo_torch_policy import APPOTorchPolicy + from ray.rllib.agents.ppo.appo_torch_policy import APPOTorchPolicy return APPOTorchPolicy elif config["framework"] == "tf": - from ray.rllib.algorithms.appo.appo_tf_policy import APPOTF1Policy + from ray.rllib.agents.ppo.appo_tf_policy import APPOStaticGraphTFPolicy - return APPOTF1Policy + return APPOStaticGraphTFPolicy else: - from ray.rllib.algorithms.appo.appo_tf_policy import APPOTF2Policy + from ray.rllib.agents.ppo.appo_tf_policy import APPOEagerTFPolicy - return APPOTF2Policy + return APPOEagerTFPolicy -# Deprecated: Use ray.rllib.algorithms.appo.APPOConfig instead! +# Deprecated: Use ray.rllib.agents.ppo.APPOConfig instead! 
class _deprecated_default_config(dict): def __init__(self): super().__init__(APPOConfig().to_dict()) @Deprecated( - old="ray.rllib.agents.ppo.appo::DEFAULT_CONFIG", - new="ray.rllib.algorithms.appo.appo::APPOConfig(...)", + old="ray.rllib.agents.ppo.appo.DEFAULT_CONFIG", + new="ray.rllib.agents.ppo.appo.APPOConfig(...)", error=False, ) def __getitem__(self, item): diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/agents/ppo/appo_tf_policy.py similarity index 97% rename from rllib/algorithms/appo/appo_tf_policy.py rename to rllib/agents/ppo/appo_tf_policy.py index bca21eb0f..3d5e893ee 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/agents/ppo/appo_tf_policy.py @@ -11,8 +11,8 @@ import gym from typing import Dict, List, Optional, Type, Union import ray -from ray.rllib.algorithms.impala import vtrace_tf as vtrace -from ray.rllib.algorithms.impala.impala_tf_policy import ( +from ray.rllib.agents.impala import vtrace_tf as vtrace +from ray.rllib.agents.impala.vtrace_tf_policy import ( _make_time_major, VTraceClipGradients, VTraceOptimizer, @@ -123,7 +123,7 @@ def get_appo_tf_policy(base: type) -> type: base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2. Returns: - A TF Policy to be used with Impala. + A TF Policy to be used with ImpalaTrainer. """ class APPOTFPolicy( @@ -147,9 +147,7 @@ def get_appo_tf_policy(base: type) -> type: # First thing first, enable eager execution if necessary. base.enable_eager_execution_if_necessary() - config = dict( - ray.rllib.algorithms.appo.appo.APPOConfig().to_dict(), **config - ) + config = dict(ray.rllib.agents.ppo.appo.DEFAULT_CONFIG, **config) # Although this is a no-op, we call __init__ here to make it clear # that base.__init__ will use the make_model() call. @@ -472,5 +470,5 @@ def get_appo_tf_policy(base: type) -> type: return APPOTFPolicy -APPOTF1Policy = get_appo_tf_policy(DynamicTFPolicyV2) -APPOTF2Policy = get_appo_tf_policy(EagerTFPolicyV2) +APPOStaticGraphTFPolicy = get_appo_tf_policy(DynamicTFPolicyV2) +APPOEagerTFPolicy = get_appo_tf_policy(EagerTFPolicyV2) diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/agents/ppo/appo_torch_policy.py similarity index 97% rename from rllib/algorithms/appo/appo_torch_policy.py rename to rllib/agents/ppo/appo_torch_policy.py index ac10d00bd..d3fdc326d 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/agents/ppo/appo_torch_policy.py @@ -11,12 +11,12 @@ import logging from typing import Any, Dict, List, Optional, Type, Union import ray -from ray.rllib.algorithms.appo.appo_tf_policy import make_appo_model -import ray.rllib.algorithms.impala.vtrace_torch as vtrace -from ray.rllib.algorithms.impala.impala_torch_policy import ( +import ray.rllib.agents.impala.vtrace_torch as vtrace +from ray.rllib.agents.impala.vtrace_torch_policy import ( make_time_major, VTraceOptimizer, ) +from ray.rllib.agents.ppo.appo_tf_policy import make_appo_model from ray.rllib.evaluation.episode import Episode from ray.rllib.evaluation.postprocessing import ( compute_gae_for_sample_batch, @@ -65,10 +65,10 @@ class APPOTorchPolicy( TargetNetworkMixin, TorchPolicyV2, ): - """PyTorch policy class used with APPO.""" + """PyTorch policy class used with APPOTrainer.""" def __init__(self, observation_space, action_space, config): - config = dict(ray.rllib.algorithms.appo.appo.APPOConfig().to_dict(), **config) + config = dict(ray.rllib.agents.ppo.appo.DEFAULT_CONFIG, **config) # Although this is a no-op, we call __init__ here to make it clear # that base.__init__ will 
use the make_model() call. diff --git a/rllib/algorithms/ddppo/ddppo.py b/rllib/agents/ppo/ddppo.py similarity index 95% rename from rllib/algorithms/ddppo/ddppo.py rename to rllib/agents/ppo/ddppo.py index 242f64aad..5d52d5740 100644 --- a/rllib/algorithms/ddppo/ddppo.py +++ b/rllib/agents/ppo/ddppo.py @@ -21,7 +21,7 @@ import time from typing import Callable, Optional, Union import ray -from ray.rllib.algorithms.ppo import PPOConfig, PPO +from ray.rllib.agents.ppo.ppo import PPOConfig, PPOTrainer from ray.rllib.evaluation.postprocessing import Postprocessing from ray.rllib.evaluation.rollout_worker import RolloutWorker from ray.rllib.execution.common import ( @@ -52,10 +52,10 @@ logger = logging.getLogger(__name__) class DDPPOConfig(PPOConfig): - """Defines a configuration class from which a DDPPO Trainer can be built. + """Defines a PPOTrainer configuration class from which a PPOTrainer can be built. Example: - >>> from ray.rllib.algorithms.ddppo import DDPPOConfig + >>> from ray.rllib.agents.ppo import DDPPOConfig >>> config = DDPPOConfig().training(lr=0.003, keep_local_weights_in_sync=True)\ ... .resources(num_gpus=1)\ ... .rollouts(num_workers=10) @@ -65,7 +65,7 @@ class DDPPOConfig(PPOConfig): >>> trainer.train() Example: - >>> from ray.rllib.algorithms.ddppo import DDPPOConfig + >>> from ray.rllib.agents.ppo import DDPPOConfig >>> from ray import tune >>> config = DDPPOConfig() >>> # Print out some default values. @@ -85,7 +85,7 @@ class DDPPOConfig(PPOConfig): def __init__(self, trainer_class=None): """Initializes a DDPPOConfig instance.""" - super().__init__(trainer_class=trainer_class or DDPPO) + super().__init__(trainer_class=trainer_class or DDPPOTrainer) # fmt: off # __sphinx_doc_begin__ @@ -157,7 +157,7 @@ class DDPPOConfig(PPOConfig): return self -class DDPPO(PPO): +class DDPPOTrainer(PPOTrainer): def __init__( self, config: Optional[PartialTrainerConfigDict] = None, @@ -166,7 +166,7 @@ class DDPPO(PPO): remote_checkpoint_dir: Optional[str] = None, sync_function_tpl: Optional[str] = None, ): - """Initializes a DDPPO instance. + """Initializes a DDPPOTrainer instance. Args: config: Algorithm-specific configuration dict. @@ -195,11 +195,11 @@ class DDPPO(PPO): ) * config.get("num_envs_per_worker", DEFAULT_CONFIG["num_envs_per_worker"]) @classmethod - @override(PPO) + @override(PPOTrainer) def get_default_config(cls) -> TrainerConfigDict: return DDPPOConfig().to_dict() - @override(PPO) + @override(PPOTrainer) def validate_config(self, config): """Validates the Trainer's config dict. @@ -250,7 +250,7 @@ class DDPPO(PPO): if config["kl_coeff"] != 0.0 or config["kl_target"] != 0.0: raise ValueError("DDPPO doesn't support KL penalties like PPO-1") - @override(PPO) + @override(PPOTrainer) def setup(self, config: PartialTrainerConfigDict): super().setup(config) @@ -281,7 +281,7 @@ class DDPPO(PPO): ray_wait_timeout_s=0.03, ) - @override(PPO) + @override(PPOTrainer) def training_iteration(self) -> ResultDict: # Shortcut. first_worker = self.workers.remote_workers()[0] @@ -372,14 +372,14 @@ class DDPPO(PPO): } -# Deprecated: Use ray.rllib.algorithms.ddppo.DDPPOConfig instead! +# Deprecated: Use ray.rllib.agents.ppo.DDPPOConfig instead! 
class _deprecated_default_config(dict): def __init__(self): super().__init__(DDPPOConfig().to_dict()) @Deprecated( - old="ray.rllib.agents.ppo.ddppo::DEFAULT_CONFIG", - new="ray.rllib.algorithms.ddppo.ddppo::DDPPOConfig(...)", + old="ray.rllib.agents.ppo.ddppo.DEFAULT_CONFIG", + new="ray.rllib.agents.ppo.ddppo.DDPPOConfig(...)", error=False, ) def __getitem__(self, item): diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/agents/ppo/ppo.py similarity index 95% rename from rllib/algorithms/ppo/ppo.py rename to rllib/agents/ppo/ppo.py index 2b2a17f1d..8026cce44 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/agents/ppo/ppo.py @@ -40,10 +40,10 @@ logger = logging.getLogger(__name__) class PPOConfig(TrainerConfig): - """Defines a configuration class from which a PPO Trainer can be built. + """Defines a PPOTrainer configuration class from which a PPOTrainer can be built. Example: - >>> from ray.rllib.algorithms.ppo import PPOConfig + >>> from ray.rllib.agents.ppo import PPOConfig >>> config = PPOConfig().training(gamma=0.9, lr=0.01, kl_coeff=0.3)\ ... .resources(num_gpus=0)\ ... .rollouts(num_workers=4) @@ -53,7 +53,7 @@ class PPOConfig(TrainerConfig): >>> trainer.train() Example: - >>> from ray.rllib.algorithms.ppo import PPOConfig + >>> from ray.rllib.agents.ppo import PPOConfig >>> from ray import tune >>> config = PPOConfig() >>> # Print out some default values. @@ -73,7 +73,7 @@ class PPOConfig(TrainerConfig): def __init__(self, trainer_class=None): """Initializes a PPOConfig instance.""" - super().__init__(trainer_class=trainer_class or PPO) + super().__init__(trainer_class=trainer_class or PPOTrainer) # fmt: off # __sphinx_doc_begin__ @@ -267,7 +267,7 @@ def warn_about_bad_reward_scales(config, result): return result -class PPO(Trainer): +class PPOTrainer(Trainer): # TODO: Change the return value of this method to return a TrainerConfig object # instead. @classmethod @@ -365,17 +365,17 @@ class PPO(Trainer): @override(Trainer) def get_default_policy_class(self, config: TrainerConfigDict) -> Type[Policy]: if config["framework"] == "torch": - from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy + from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy return PPOTorchPolicy elif config["framework"] == "tf": - from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy + from ray.rllib.agents.ppo.ppo_tf_policy import PPOStaticGraphTFPolicy - return PPOTF1Policy + return PPOStaticGraphTFPolicy else: - from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF2Policy + from ray.rllib.agents.ppo.ppo_tf_policy import PPOEagerTFPolicy - return PPOTF2Policy + return PPOEagerTFPolicy @ExperimentalAPI def training_iteration(self) -> ResultDict: @@ -455,14 +455,14 @@ class PPO(Trainer): return train_results -# Deprecated: Use ray.rllib.algorithms.ppo.PPOConfig instead! +# Deprecated: Use ray.rllib.agents.ppo.PPOConfig instead! 
class _deprecated_default_config(dict): def __init__(self): super().__init__(PPOConfig().to_dict()) @Deprecated( - old="ray.rllib.agents.ppo.ppo::DEFAULT_CONFIG", - new="ray.rllib.algorithms.ppo.ppo::PPOConfig(...)", + old="ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG", + new="ray.rllib.agents.ppo.ppo.PPOConfig(...)", error=False, ) def __getitem__(self, item): diff --git a/rllib/algorithms/ppo/ppo_tf_policy.py b/rllib/agents/ppo/ppo_tf_policy.py similarity index 97% rename from rllib/algorithms/ppo/ppo_tf_policy.py rename to rllib/agents/ppo/ppo_tf_policy.py index 5fa5fadba..3c38b0f42 100644 --- a/rllib/algorithms/ppo/ppo_tf_policy.py +++ b/rllib/agents/ppo/ppo_tf_policy.py @@ -60,7 +60,7 @@ def get_ppo_tf_policy(base: TFPolicyV2Type) -> TFPolicyV2Type: base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2. Returns: - A TF Policy to be used with PPO. + A TF Policy to be used with PPOTrainer. """ class PPOTFPolicy( @@ -81,7 +81,7 @@ def get_ppo_tf_policy(base: TFPolicyV2Type) -> TFPolicyV2Type: # First thing first, enable eager execution if necessary. base.enable_eager_execution_if_necessary() - config = dict(ray.rllib.algorithms.ppo.ppo.PPOConfig().to_dict(), **config) + config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config) validate_config(config) # Initialize base class. @@ -241,5 +241,5 @@ def get_ppo_tf_policy(base: TFPolicyV2Type) -> TFPolicyV2Type: return PPOTFPolicy -PPOTF1Policy = get_ppo_tf_policy(DynamicTFPolicyV2) -PPOTF2Policy = get_ppo_tf_policy(EagerTFPolicyV2) +PPOStaticGraphTFPolicy = get_ppo_tf_policy(DynamicTFPolicyV2) +PPOEagerTFPolicy = get_ppo_tf_policy(EagerTFPolicyV2) diff --git a/rllib/algorithms/ppo/ppo_torch_policy.py b/rllib/agents/ppo/ppo_torch_policy.py similarity index 97% rename from rllib/algorithms/ppo/ppo_torch_policy.py rename to rllib/agents/ppo/ppo_torch_policy.py index 725bfea34..cd6d0667d 100644 --- a/rllib/algorithms/ppo/ppo_torch_policy.py +++ b/rllib/agents/ppo/ppo_torch_policy.py @@ -2,7 +2,7 @@ import logging from typing import Dict, List, Type, Union import ray -from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config +from ray.rllib.agents.ppo.ppo_tf_policy import validate_config from ray.rllib.evaluation.postprocessing import ( Postprocessing, compute_gae_for_sample_batch, @@ -39,10 +39,10 @@ class PPOTorchPolicy( KLCoeffMixin, TorchPolicyV2, ): - """PyTorch policy class used with PPO.""" + """PyTorch policy class used with PPOTrainer.""" def __init__(self, observation_space, action_space, config): - config = dict(ray.rllib.algorithms.ppo.ppo.PPOConfig().to_dict(), **config) + config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config) validate_config(config) TorchPolicyV2.__init__( diff --git a/rllib/algorithms/appo/tests/test_appo.py b/rllib/agents/ppo/tests/test_appo.py similarity index 91% rename from rllib/algorithms/appo/tests/test_appo.py rename to rllib/agents/ppo/tests/test_appo.py index 7ee60e64a..2f55d8ac4 100644 --- a/rllib/algorithms/appo/tests/test_appo.py +++ b/rllib/agents/ppo/tests/test_appo.py @@ -1,7 +1,7 @@ import unittest import ray -import ray.rllib.algorithms.appo as appo +import ray.rllib.agents.ppo as ppo from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY from ray.rllib.utils.test_utils import ( @@ -21,8 +21,8 @@ class TestAPPO(unittest.TestCase): ray.shutdown() def test_appo_compilation(self): - """Test whether APPO can be built with both frameworks.""" - config = 
appo.APPOConfig().rollouts(num_rollout_workers=1) + """Test whether an APPOTrainer can be built with both frameworks.""" + config = ppo.appo.APPOConfig().rollouts(num_rollout_workers=1) num_iterations = 2 for _ in framework_iterator(config, with_eager_tracing=True): @@ -47,9 +47,11 @@ class TestAPPO(unittest.TestCase): trainer.stop() def test_appo_compilation_use_kl_loss(self): - """Test whether APPO can be built with kl_loss enabled.""" + """Test whether an APPOTrainer can be built with kl_loss enabled.""" config = ( - appo.APPOConfig().rollouts(num_rollout_workers=1).training(use_kl_loss=True) + ppo.appo.APPOConfig() + .rollouts(num_rollout_workers=1) + .training(use_kl_loss=True) ) num_iterations = 2 @@ -66,7 +68,7 @@ class TestAPPO(unittest.TestCase): # Not explicitly setting this should cause a warning, but not fail. # config["_tf_policy_handles_more_than_one_loss"] = True config = ( - appo.APPOConfig() + ppo.appo.APPOConfig() .rollouts(num_rollout_workers=1) .training(_separate_vf_optimizer=True, _lr_vf=0.002) ) @@ -89,7 +91,7 @@ class TestAPPO(unittest.TestCase): def test_appo_entropy_coeff_schedule(self): # Initial lr, doesn't really matter because of the schedule below. config = ( - appo.APPOConfig() + ppo.appo.APPOConfig() .rollouts( num_rollout_workers=1, batch_mode="truncate_episodes", diff --git a/rllib/algorithms/ddppo/tests/test_ddppo.py b/rllib/agents/ppo/tests/test_ddppo.py similarity index 89% rename from rllib/algorithms/ddppo/tests/test_ddppo.py rename to rllib/agents/ppo/tests/test_ddppo.py index c0c5c5222..1538c7283 100644 --- a/rllib/algorithms/ddppo/tests/test_ddppo.py +++ b/rllib/agents/ppo/tests/test_ddppo.py @@ -2,7 +2,7 @@ import unittest import pytest import ray -import ray.rllib.algorithms.ddppo as ddppo +import ray.rllib.agents.ppo as ppo from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY from ray.rllib.utils.test_utils import ( @@ -23,8 +23,8 @@ class TestDDPPO(unittest.TestCase): ray.shutdown() def test_ddppo_compilation(self): - """Test whether DDPPO can be built with both frameworks.""" - config = ddppo.DDPPOConfig().resources(num_gpus_per_worker=0) + """Test whether a DDPPOTrainer can be built with both frameworks.""" + config = ppo.DDPPOConfig().resources(num_gpus_per_worker=0) num_iterations = 2 @@ -44,7 +44,7 @@ class TestDDPPO(unittest.TestCase): def test_ddppo_schedule(self): """Test whether lr_schedule will anneal lr to 0""" - config = ddppo.DDPPOConfig() + config = ppo.DDPPOConfig() config.resources(num_gpus_per_worker=0) config.training(lr_schedule=[[0, config.lr], [1000, 0.0]]) @@ -64,7 +64,7 @@ class TestDDPPO(unittest.TestCase): def test_validate_config(self): """Test if DDPPO will raise errors after invalid configs are passed.""" - config = ddppo.DDPPOConfig().training(kl_coeff=1.0) + config = ppo.DDPPOConfig().training(kl_coeff=1.0) msg = "DDPPO doesn't support KL penalties like PPO-1" with pytest.raises(ValueError, match=msg): config.build(env="CartPole-v0") diff --git a/rllib/algorithms/ppo/tests/test_ppo.py b/rllib/agents/ppo/tests/test_ppo.py similarity index 96% rename from rllib/algorithms/ppo/tests/test_ppo.py rename to rllib/agents/ppo/tests/test_ppo.py index eee120cef..ad070ca40 100644 --- a/rllib/algorithms/ppo/tests/test_ppo.py +++ b/rllib/agents/ppo/tests/test_ppo.py @@ -3,9 +3,9 @@ import unittest import ray from ray.rllib.agents.callbacks import DefaultCallbacks -import ray.rllib.algorithms.ppo as ppo -from 
ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF2Policy -from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy +import ray.rllib.agents.ppo as ppo +from ray.rllib.agents.ppo.ppo_tf_policy import PPOEagerTFPolicy +from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy from ray.rllib.evaluation.postprocessing import ( compute_gae_for_sample_batch, Postprocessing, @@ -89,7 +89,7 @@ class TestPPO(unittest.TestCase): ray.shutdown() def test_ppo_compilation_and_schedule_mixins(self): - """Test whether PPO can be built with all frameworks.""" + """Test whether a PPOTrainer can be built with all frameworks.""" # Build a PPOConfig object. config = ( @@ -172,7 +172,7 @@ class TestPPO(unittest.TestCase): # Test against all frameworks. for fw in framework_iterator(config): # Default Agent should be setup with StochasticSampling. - trainer = ppo.PPO(config=config, env="FrozenLake-v1") + trainer = ppo.PPOTrainer(config=config, env="FrozenLake-v1") # explore=False, always expect the same (deterministic) action. a_ = trainer.compute_single_action( obs, explore=False, prev_action=np.array(2), prev_reward=np.array(1.0) @@ -223,7 +223,7 @@ class TestPPO(unittest.TestCase): ) for fw, sess in framework_iterator(config, session=True): - trainer = ppo.PPO(config=config, env="CartPole-v0") + trainer = ppo.PPOTrainer(config=config, env="CartPole-v0") policy = trainer.get_policy() # Check the free log std var is created. @@ -265,7 +265,7 @@ class TestPPO(unittest.TestCase): # Expect warning. print(f"Accessing learning-rate from legacy config dict: {ppo_config['lr']}") # Build Trainer. - ppo_trainer = ppo.PPO(config=ppo_config, env="CartPole-v1") + ppo_trainer = ppo.PPOTrainer(config=ppo_config, env="CartPole-v1") print(ppo_trainer.train()) def test_ppo_loss_function(self): @@ -286,7 +286,7 @@ class TestPPO(unittest.TestCase): ) for fw, sess in framework_iterator(config, session=True): - trainer = ppo.PPO(config=config, env="CartPole-v0") + trainer = ppo.PPOTrainer(config=config, env="CartPole-v0") policy = trainer.get_policy() # Check no free log std var by default. @@ -313,7 +313,7 @@ class TestPPO(unittest.TestCase): # Calculate actual PPO loss. 
if fw in ["tf2", "tfe"]: - PPOTF2Policy.loss(policy, policy.model, Categorical, train_batch) + PPOEagerTFPolicy.loss(policy, policy.model, Categorical, train_batch) elif fw == "torch": PPOTorchPolicy.loss( policy, policy.model, policy.dist_class, train_batch diff --git a/rllib/agents/registry.py b/rllib/agents/registry.py index 5103d0063..70cb00476 100644 --- a/rllib/agents/registry.py +++ b/rllib/agents/registry.py @@ -8,49 +8,55 @@ from ray.rllib.contrib.registry import CONTRIBUTED_ALGORITHMS def _import_a2c(): import ray.rllib.algorithms.a2c as a2c - return a2c.A2C, a2c.A2CConfig().to_dict() + return a2c.A2C, a2c.A2C_DEFAULT_CONFIG def _import_a3c(): import ray.rllib.algorithms.a3c as a3c - return a3c.A3C, a3c.A3CConfig().to_dict() + return a3c.A3C, a3c.DEFAULT_CONFIG def _import_alpha_star(): - import ray.rllib.algorithms.alpha_star as alpha_star + from ray.rllib.algorithms.alpha_star.alpha_star import ( + AlphaStarTrainer, + DEFAULT_CONFIG, + ) - return alpha_star.AlphaStarTrainer, alpha_star.AlphaStarConfig().to_dict() + return AlphaStarTrainer, DEFAULT_CONFIG def _import_alpha_zero(): - import ray.rllib.algorithms.alpha_zero as alpha_zero + from ray.rllib.algorithms.alpha_zero.alpha_zero import ( + AlphaZeroTrainer, + DEFAULT_CONFIG, + ) - return alpha_zero.AlphaZeroTrainer, alpha_zero.AlphaZeroConfig().to_dict() + return AlphaZeroTrainer, DEFAULT_CONFIG def _import_apex(): from ray.rllib.agents import dqn - return dqn.ApexTrainer, dqn.apex.ApexConfig().to_dict() + return dqn.ApexTrainer, dqn.apex.APEX_DEFAULT_CONFIG def _import_apex_ddpg(): from ray.rllib.algorithms import ddpg - return ddpg.ApexDDPGTrainer, ddpg.apex.ApexDDPGConfig().to_dict() + return ddpg.ApexDDPGTrainer, ddpg.apex.APEX_DDPG_DEFAULT_CONFIG def _import_appo(): - import ray.rllib.algorithms.appo as appo + from ray.rllib.agents import ppo - return appo.APPO, appo.APPOConfig().to_dict() + return ppo.APPOTrainer, ppo.appo.DEFAULT_CONFIG def _import_ars(): from ray.rllib.algorithms import ars - return ars.ARSTrainer, ars.ARSConfig().to_dict() + return ars.ARSTrainer, ars.DEFAULT_CONFIG def _import_bandit_lints(): @@ -68,127 +74,127 @@ def _import_bandit_linucb(): def _import_bc(): from ray.rllib.algorithms import marwil - return marwil.BCTrainer, marwil.BCConfig().to_dict() + return marwil.BCTrainer, marwil.DEFAULT_CONFIG def _import_cql(): from ray.rllib.algorithms import cql - return cql.CQLTrainer, cql.CQLConfig().to_dict() + return cql.CQLTrainer, cql.DEFAULT_CONFIG def _import_ddpg(): from ray.rllib.algorithms import ddpg - return ddpg.DDPGTrainer, ddpg.DDPGConfig().to_dict() + return ddpg.DDPGTrainer, ddpg.DEFAULT_CONFIG def _import_ddppo(): - import ray.rllib.algorithms.ddppo as ddppo + from ray.rllib.agents import ppo - return ddppo.DDPPO, ddppo.DDPPOConfig().to_dict() + return ppo.DDPPOTrainer, ppo.DEFAULT_CONFIG def _import_dqn(): from ray.rllib.algorithms import dqn - return dqn.DQNTrainer, dqn.DQNConfig().to_dict() + return dqn.DQNTrainer, dqn.DEFAULT_CONFIG def _import_dreamer(): from ray.rllib.algorithms import dreamer - return dreamer.DREAMERTrainer, dreamer.DREAMERConfig().to_dict() + return dreamer.DREAMERTrainer, dreamer.DEFAULT_CONFIG def _import_es(): from ray.rllib.algorithms import es - return es.ESTrainer, es.ESConfig().to_dict() + return es.ESTrainer, es.DEFAULT_CONFIG def _import_impala(): - import ray.rllib.algorithms.impala as impala + from ray.rllib.agents import impala - return impala.Impala, impala.ImpalaConfig().to_dict() + return impala.ImpalaTrainer, impala.DEFAULT_CONFIG def 
_import_maddpg(): - import ray.rllib.algorithms.maddpg as maddpg + from ray.rllib.agents import maddpg - return maddpg.MADDPGTrainer, maddpg.MADDPGConfig().to_dict() + return maddpg.MADDPGTrainer, maddpg.DEFAULT_CONFIG def _import_maml(): from ray.rllib.algorithms import maml - return maml.MAMLTrainer, maml.MAMLConfig().to_dict() + return maml.MAMLTrainer, maml.DEFAULT_CONFIG def _import_marwil(): from ray.rllib.algorithms import marwil - return marwil.MARWILTrainer, marwil.MARWILConfig().to_dict() + return marwil.MARWILTrainer, marwil.DEFAULT_CONFIG def _import_mbmpo(): from ray.rllib.algorithms import mbmpo - return mbmpo.MBMPOTrainer, mbmpo.MBMPOConfig().to_dict() + return mbmpo.MBMPOTrainer, mbmpo.DEFAULT_CONFIG def _import_pg(): from ray.rllib.algorithms import pg - return pg.PGTrainer, pg.PGConfig().to_dict() + return pg.PGTrainer, pg.DEFAULT_CONFIG def _import_ppo(): - import ray.rllib.algorithms.ppo as ppo + from ray.rllib.agents import ppo - return ppo.PPO, ppo.PPOConfig().to_dict() + return ppo.PPOTrainer, ppo.DEFAULT_CONFIG def _import_qmix(): from ray.rllib.algorithms import qmix - return qmix.QMixTrainer, qmix.QMixConfig().to_dict() + return qmix.QMixTrainer, qmix.DEFAULT_CONFIG def _import_r2d2(): from ray.rllib.agents import dqn - return dqn.R2D2Trainer, dqn.R2D2Config().to_dict() + return dqn.R2D2Trainer, dqn.R2D2_DEFAULT_CONFIG def _import_sac(): from ray.rllib.algorithms import sac - return sac.SACTrainer, sac.SACConfig().to_dict() + return sac.SACTrainer, sac.DEFAULT_CONFIG def _import_rnnsac(): from ray.rllib.algorithms import sac - return sac.RNNSACTrainer, sac.RNNSACConfig().to_dict() + return sac.RNNSACTrainer, sac.RNNSAC_DEFAULT_CONFIG def _import_simple_q(): from ray.rllib.algorithms import dqn - return dqn.SimpleQTrainer, dqn.simple_q.SimpleQConfig().to_dict() + return dqn.SimpleQTrainer, dqn.simple_q.DEFAULT_CONFIG def _import_slate_q(): from ray.rllib.algorithms import slateq - return slateq.SlateQTrainer, slateq.SlateQConfig().to_dict() + return slateq.SlateQTrainer, slateq.DEFAULT_CONFIG def _import_td3(): from ray.rllib.algorithms import ddpg - return ddpg.TD3Trainer, ddpg.td3.TD3Config().to_dict() + return ddpg.TD3Trainer, ddpg.td3.TD3_DEFAULT_CONFIG ALGORITHMS = { diff --git a/rllib/agents/tests/test_memory_leaks.py b/rllib/agents/tests/test_memory_leaks.py index 814e17602..58ced3a11 100644 --- a/rllib/agents/tests/test_memory_leaks.py +++ b/rllib/agents/tests/test_memory_leaks.py @@ -2,7 +2,7 @@ import unittest import ray import ray.rllib.algorithms.dqn as dqn -import ray.rllib.algorithms.ppo as ppo +import ray.rllib.agents.ppo as ppo from ray.rllib.examples.env.memory_leaking_env import MemoryLeakingEnv from ray.rllib.examples.policy.memory_leaking_policy import MemoryLeakingPolicy from ray.rllib.policy.policy import PolicySpec @@ -30,7 +30,7 @@ class TestMemoryLeaks(unittest.TestCase): config["env_config"] = { "static_samples": True, } - trainer = ppo.PPO(config=config) + trainer = ppo.PPOTrainer(config=config) results = check_memory_leaks(trainer, to_check={"env"}, repeats=150) assert results["env"] trainer.stop() diff --git a/rllib/agents/trainer.py b/rllib/agents/trainer.py index 8b8445914..97a398b5a 100644 --- a/rllib/agents/trainer.py +++ b/rllib/agents/trainer.py @@ -1431,9 +1431,9 @@ class Trainer(Trainable): If None, the output format will be DL framework specific. Example: - >>> from ray.rllib.algorithms.ppo import PPO + >>> from ray.rllib.agents.ppo import PPOTrainer >>> # Use a Trainer from RLlib or define your own. - >>> trainer = PPO(...) 
# doctest: +SKIP + >>> trainer = PPOTrainer(...) # doctest: +SKIP >>> for _ in range(10): # doctest: +SKIP >>> trainer.train() # doctest: +SKIP >>> trainer.export_policy_model("/tmp/dir") # doctest: +SKIP @@ -1456,9 +1456,9 @@ class Trainer(Trainable): policy_id: Optional policy id to export. Example: - >>> from ray.rllib.algorithms.ppo import PPO + >>> from ray.rllib.agents.ppo import PPOTrainer >>> # Use a Trainer from RLlib or define your own. - >>> trainer = PPO(...) # doctest: +SKIP + >>> trainer = PPOTrainer(...) # doctest: +SKIP >>> for _ in range(10): # doctest: +SKIP >>> trainer.train() # doctest: +SKIP >>> trainer.export_policy_checkpoint("/tmp/export_dir") # doctest: +SKIP @@ -1478,8 +1478,8 @@ class Trainer(Trainable): policy_id: Optional policy id to import into. Example: - >>> from ray.rllib.algorithms.ppo import PPO - >>> trainer = PPO(...) # doctest: +SKIP + >>> from ray.rllib.agents.ppo import PPOTrainer + >>> trainer = PPOTrainer(...) # doctest: +SKIP >>> trainer.import_policy_model_from_h5("/tmp/weights.h5") # doctest: +SKIP >>> for _ in range(10): # doctest: +SKIP >>> trainer.train() # doctest: +SKIP diff --git a/rllib/algorithms/a3c/a3c.py b/rllib/algorithms/a3c/a3c.py index c3596dd38..4116076c5 100644 --- a/rllib/algorithms/a3c/a3c.py +++ b/rllib/algorithms/a3c/a3c.py @@ -31,7 +31,7 @@ logger = logging.getLogger(__name__) class A3CConfig(TrainerConfig): - """Defines a configuration class from which a A3C Trainer can be built. + """Defines a PPOTrainer configuration class from which a PPOTrainer can be built. Example: >>> from ray import tune diff --git a/rllib/algorithms/alpha_star/alpha_star.py b/rllib/algorithms/alpha_star/alpha_star.py index c7977f4d5..399e2adad 100644 --- a/rllib/algorithms/alpha_star/alpha_star.py +++ b/rllib/algorithms/alpha_star/alpha_star.py @@ -12,7 +12,7 @@ from ray.actor import ActorHandle from ray.rllib.algorithms.alpha_star.distributed_learners import DistributedLearners from ray.rllib.algorithms.alpha_star.league_builder import AlphaStarLeagueBuilder from ray.rllib.agents.trainer import Trainer -import ray.rllib.algorithms.appo.appo as appo +import ray.rllib.agents.ppo.appo as appo from ray.rllib.evaluation.rollout_worker import RolloutWorker from ray.rllib.execution.parallel_requests import ( AsyncRequestsManager, @@ -232,12 +232,12 @@ class AlphaStarConfig(appo.APPOConfig): return self -class AlphaStarTrainer(appo.APPO): - _allow_unknown_subkeys = appo.APPO._allow_unknown_subkeys + [ +class AlphaStarTrainer(appo.APPOTrainer): + _allow_unknown_subkeys = appo.APPOTrainer._allow_unknown_subkeys + [ "league_builder_config", ] _override_all_subkeys_if_type_changes = ( - appo.APPO._override_all_subkeys_if_type_changes + appo.APPOTrainer._override_all_subkeys_if_type_changes + [ "league_builder_config", ] @@ -310,11 +310,11 @@ class AlphaStarTrainer(appo.APPO): ) @classmethod - @override(appo.APPO) + @override(appo.APPOTrainer) def get_default_config(cls) -> TrainerConfigDict: return AlphaStarConfig().to_dict() - @override(appo.APPO) + @override(appo.APPOTrainer) def validate_config(self, config: TrainerConfigDict): # Create the LeagueBuilder object, allowing it to build the multiagent # config as well. @@ -323,7 +323,7 @@ class AlphaStarTrainer(appo.APPO): ) super().validate_config(config) - @override(appo.APPO) + @override(appo.APPOTrainer) def setup(self, config: PartialTrainerConfigDict): # Call super's setup to validate config, create RolloutWorkers # (train and eval), etc.. 
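Editor's note: the alpha_star.py hunks above and below track the rename mechanically; the parent class name appears both as the base class and inside every @override(...) decorator, which is why each occurrence flips from appo.APPO to appo.APPOTrainer. As a minimal sketch of that idiom under the reverted layout (only the module path, APPOTrainer, APPOConfig, override, and TrainerConfigDict come from this patch; the MyAPPOVariant class and its config tweak are hypothetical):

    import ray.rllib.agents.ppo.appo as appo
    from ray.rllib.utils.annotations import override
    from ray.rllib.utils.typing import TrainerConfigDict


    class MyAPPOVariant(appo.APPOTrainer):
        """Hypothetical APPOTrainer subclass, mirroring how AlphaStarTrainer hooks in."""

        @classmethod
        @override(appo.APPOTrainer)
        def get_default_config(cls) -> TrainerConfigDict:
            # Start from APPO's defaults and adjust a single value (illustrative only).
            config = appo.APPOConfig().to_dict()
            config["num_workers"] = 2
            return config

        @override(appo.APPOTrainer)
        def setup(self, config):
            # The parent setup builds rollout workers etc.; custom state would go here.
            super().setup(config)

This is the same subclass-plus-@override pattern the AlphaStarTrainer hunks keep in sync with the rename.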
@@ -604,7 +604,7 @@ class AlphaStarTrainer(appo.APPO): return train_results - @override(appo.APPO) + @override(appo.APPOTrainer) def __getstate__(self) -> dict: state = super().__getstate__() state.update( @@ -614,7 +614,7 @@ class AlphaStarTrainer(appo.APPO): ) return state - @override(appo.APPO) + @override(appo.APPOTrainer) def __setstate__(self, state: dict) -> None: state_copy = state.copy() self.league_builder.__setstate__(state.pop("league_builder", {})) diff --git a/rllib/algorithms/appo/README.md b/rllib/algorithms/appo/README.md deleted file mode 100644 index 458659c34..000000000 --- a/rllib/algorithms/appo/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# Asynchronous Proximal Policy Optimization (APPO) - -## Overview - -[PPO](https://arxiv.org/abs/1707.06347) is a model-free on-policy RL algorithm that works -well for both discrete and continuous action space environments. PPO utilizes an -actor-critic framework, where there are two networks, an actor (policy network) and -critic network (value function). - -## Distributed PPO Algorithms - -### Distributed baseline PPO -[See implementation here](https://github.com/ray-project/ray/blob/master/rllib/algorithms/ppo/ppo.py) - -### Asychronous PPO (APPO) .. - -.. opts to imitate IMPALA as its distributed execution plan. -Data collection nodes gather data asynchronously, which are collected in a circular replay -buffer. A target network and doubly-importance sampled surrogate objective is introduced -to enforce training stability in the asynchronous data-collection setting. -[See implementation here](https://github.com/ray-project/ray/blob/master/rllib/algorithms/appo/appo.py) - -### Decentralized Distributed PPO (DDPPO) - -[See implementation here](https://github.com/ray-project/ray/blob/master/rllib/algorithms/ddppo/ddppo.py) - - -## Documentation & Implementation: - -### [Asynchronous Proximal Policy Optimization (APPO)](https://arxiv.org/abs/1912.00167). 
- -**[Detailed Documentation](https://docs.ray.io/en/master/rllib-algorithms.html#appo)** - -**[Implementation](https://github.com/ray-project/ray/blob/master/rllib/agents/ppo/appo.py)** diff --git a/rllib/algorithms/appo/__init__.py b/rllib/algorithms/appo/__init__.py deleted file mode 100644 index cc03908e2..000000000 --- a/rllib/algorithms/appo/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from ray.rllib.algorithms.appo.appo import APPO, APPOConfig, DEFAULT_CONFIG -from ray.rllib.algorithms.appo.appo_tf_policy import APPOTF1Policy, APPOTF2Policy -from ray.rllib.algorithms.appo.appo_torch_policy import APPOTorchPolicy - -__all__ = [ - "APPO", - "APPOConfig", - "APPOTF1Policy", - "APPOTF2Policy", - "APPOTorchPolicy", - "DEFAULT_CONFIG", -] diff --git a/rllib/algorithms/ddpg/apex.py b/rllib/algorithms/ddpg/apex.py index 6c2282fa0..d98e4a5ab 100644 --- a/rllib/algorithms/ddpg/apex.py +++ b/rllib/algorithms/ddpg/apex.py @@ -2,8 +2,8 @@ from typing import List, Optional from ray.actor import ActorHandle from ray.rllib.agents import Trainer -from ray.rllib.algorithms.ddpg.ddpg import DDPGConfig, DDPGTrainer from ray.rllib.agents.dqn.apex import ApexTrainer +from ray.rllib.algorithms.ddpg.ddpg import DDPGConfig, DDPGTrainer from ray.rllib.evaluation.worker_set import WorkerSet from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import TrainerConfigDict diff --git a/rllib/algorithms/ddppo/README.md b/rllib/algorithms/ddppo/README.md deleted file mode 100644 index 1d8fedfa4..000000000 --- a/rllib/algorithms/ddppo/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# Decentralized Distributed Proximal Policy Optimization (DDPPO) - -## Overview - -[PPO](https://arxiv.org/abs/1707.06347) is a model-free on-policy RL algorithm that works -well for both discrete and continuous action space environments. PPO utilizes an -actor-critic framework, where there are two networks, an actor (policy network) and -critic network (value function). - -## Distributed PPO Algorithms - -### Distributed baseline PPO -[See implementation here](https://github.com/ray-project/ray/blob/master/rllib/algorithms/ppo/ppo.py) - -### Asychronous PPO (APPO) -[See implementation here](https://github.com/ray-project/ray/blob/master/rllib/algorithms/appo/appo.py) - - -### Decentralized Distributed PPO (DDPPO) .. - -.. removes the assumption that gradient-updates must -be done on a central node. Instead, gradients are computed remotely on each data -collection node and all-reduced at each mini-batch using torch distributed. This allows -each worker’s GPU to be used both for sampling and for training. 
- -[See implementation here](https://github.com/ray-project/ray/blob/master/rllib/algorithms/ddppo/ddppo.py) - - -## Documentation & Implementation: - -### [Decentralized Distributed Proximal Policy Optimization (DDPPO)](https://arxiv.org/abs/1911.00357) - -**[Detailed Documentation](https://docs.ray.io/en/master/rllib-algorithms.html#decentralized-distributed-proximal-policy-optimization-dd-ppo)** - -**[Implementation](https://github.com/ray-project/ray/blob/master/rllib/algorithms/ddppo/ddppo.py)** diff --git a/rllib/algorithms/ddppo/__init__.py b/rllib/algorithms/ddppo/__init__.py deleted file mode 100644 index b33460d78..000000000 --- a/rllib/algorithms/ddppo/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from ray.rllib.algorithms.ddppo.ddppo import DDPPOConfig, DDPPO, DEFAULT_CONFIG - -__all__ = [ - "DDPPOConfig", - "DDPPO", - "DEFAULT_CONFIG", -] diff --git a/rllib/algorithms/dreamer/dreamer.py b/rllib/algorithms/dreamer/dreamer.py index db7752d49..231f89c0f 100644 --- a/rllib/algorithms/dreamer/dreamer.py +++ b/rllib/algorithms/dreamer/dreamer.py @@ -28,7 +28,7 @@ logger = logging.getLogger(__name__) class DREAMERConfig(TrainerConfig): - """Defines a configuration class from which a Dreamer Trainer can be built. + """Defines a PPOTrainer configuration class from which a PPOTrainer can be built. Example: >>> from ray.rllib.algorithms.dreamer import DREAMERConfig diff --git a/rllib/algorithms/impala/__init__.py b/rllib/algorithms/impala/__init__.py deleted file mode 100644 index 626022747..000000000 --- a/rllib/algorithms/impala/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from ray.rllib.algorithms.impala.impala import Impala, ImpalaConfig, DEFAULT_CONFIG -from ray.rllib.algorithms.impala.impala_tf_policy import ( - ImpalaTF1Policy, - ImpalaTF2Policy, -) -from ray.rllib.algorithms.impala.impala_torch_policy import ImpalaTorchPolicy - -__all__ = [ - "ImpalaConfig", - "Impala", - "ImpalaTF1Policy", - "ImpalaTF2Policy", - "ImpalaTorchPolicy", - "DEFAULT_CONFIG", -] diff --git a/rllib/algorithms/maml/maml_tf_policy.py b/rllib/algorithms/maml/maml_tf_policy.py index ccc6894b8..5fe1682b2 100644 --- a/rllib/algorithms/maml/maml_tf_policy.py +++ b/rllib/algorithms/maml/maml_tf_policy.py @@ -2,7 +2,7 @@ import logging from typing import Dict, List, Type, Union import ray -from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config +from ray.rllib.agents.ppo.ppo_tf_policy import validate_config from ray.rllib.evaluation.postprocessing import ( Postprocessing, compute_gae_for_sample_batch, diff --git a/rllib/algorithms/maml/maml_torch_policy.py b/rllib/algorithms/maml/maml_torch_policy.py index c40901db9..ca469e8bb 100644 --- a/rllib/algorithms/maml/maml_torch_policy.py +++ b/rllib/algorithms/maml/maml_torch_policy.py @@ -2,7 +2,7 @@ import logging from typing import Dict, List, Type, Union import ray -from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config +from ray.rllib.agents.ppo.ppo_tf_policy import validate_config from ray.rllib.evaluation.postprocessing import ( Postprocessing, compute_gae_for_sample_batch, diff --git a/rllib/algorithms/ppo/README.md b/rllib/algorithms/ppo/README.md deleted file mode 100644 index 154c4629b..000000000 --- a/rllib/algorithms/ppo/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# Proximal Policy Optimization (PPO) - -## Overview - -[PPO](https://arxiv.org/abs/1707.06347) is a model-free on-policy RL algorithm that works -well for both discrete and continuous action space environments. 
PPO utilizes an -actor-critic framework, where there are two networks, an actor (policy network) and -critic network (value function). - -There are two formulations of PPO, which are both implemented in RLlib. The first -formulation of PPO imitates the prior paper [TRPO](https://arxiv.org/abs/1502.05477) -without the complexity of second-order optimization. In this formulation, for every -iteration, an old version of an actor-network is saved and the agent seeks to optimize -the RL objective while staying close to the old policy. This makes sure that the agent -does not destabilize during training. In the second formulation, To mitigate destructive -large policy updates, an issue discovered for vanilla policy gradient methods, PPO -introduces the surrogate objective, which clips large action probability ratios between -the current and old policy. Clipping has been shown in the paper to significantly -improve training stability and speed. - -## Distributed PPO Algorithms - -PPO is a core algorithm in RLlib due to its ability to scale well with the number of nodes. - -In RLlib, we provide various implementations of distributed PPO, with different underlying -execution plans, as shown below: - -### Distributed baseline PPO .. -.. is a synchronous distributed RL algorithm (this algo here). -Data collection nodes, which represent the old policy, gather data synchronously to -create a large pool of on-policy data from which the agent performs minibatch -gradient descent on. - -### Asychronous PPO (APPO) - -[See implementation here](https://github.com/ray-project/ray/blob/master/rllib/algorithms/appo/appo.py) - -### Decentralized Distributed PPO (DDPPO) - -[See implementation here](https://github.com/ray-project/ray/blob/master/rllib/algorithms/ddppo/ddppo.py) - - -## Documentation & Implementation: - -### Proximal Policy Optimization (PPO). 
- -**[Detailed Documentation](https://docs.ray.io/en/master/rllib-algorithms.html#ppo)** - -**[Implementation](https://github.com/ray-project/ray/blob/master/rllib/algorithms/ppo/ppo.py)** diff --git a/rllib/algorithms/ppo/__init__.py b/rllib/algorithms/ppo/__init__.py deleted file mode 100644 index c592b38f7..000000000 --- a/rllib/algorithms/ppo/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from ray.rllib.algorithms.ppo.ppo import PPOConfig, PPO, DEFAULT_CONFIG -from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy, PPOTF2Policy -from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy - -__all__ = [ - "PPOConfig", - "PPOTF1Policy", - "PPOTF2Policy", - "PPOTorchPolicy", - "PPO", - "DEFAULT_CONFIG", -] diff --git a/rllib/evaluation/tests/test_trajectory_view_api.py b/rllib/evaluation/tests/test_trajectory_view_api.py index 3101c1381..b2addf289 100644 --- a/rllib/evaluation/tests/test_trajectory_view_api.py +++ b/rllib/evaluation/tests/test_trajectory_view_api.py @@ -7,7 +7,7 @@ import unittest import ray from ray.rllib.agents.callbacks import DefaultCallbacks import ray.rllib.algorithms.dqn as dqn -import ray.rllib.algorithms.ppo as ppo +import ray.rllib.agents.ppo as ppo from ray.rllib.examples.env.debug_counter_env import MultiAgentDebugCounterEnv from ray.rllib.examples.env.multi_agent import MultiAgentPendulum from ray.rllib.evaluation.rollout_worker import RolloutWorker @@ -104,7 +104,7 @@ class TestTrajectoryViewAPI(unittest.TestCase): config["model"]["lstm_use_prev_reward"] = True for _ in framework_iterator(config): - trainer = ppo.PPO(config, env="CartPole-v0") + trainer = ppo.PPOTrainer(config, env="CartPole-v0") policy = trainer.get_policy() view_req_model = policy.model.view_requirements view_req_policy = policy.view_requirements @@ -168,7 +168,7 @@ class TestTrajectoryViewAPI(unittest.TestCase): config["env_config"] = {"config": {"start_at_t": 1}} # first obs is [1.0] for _ in framework_iterator(config, frameworks="tf2"): - trainer = ppo.PPO( + trainer = ppo.PPOTrainer( config, env="ray.rllib.examples.env.debug_counter_env.DebugCounterEnv", ) @@ -322,6 +322,7 @@ class TestTrajectoryViewAPI(unittest.TestCase): print(batch) def test_counting_by_agent_steps(self): + """Test whether a PPOTrainer can be built with all frameworks.""" config = copy.deepcopy(ppo.DEFAULT_CONFIG) num_agents = 3 @@ -341,7 +342,7 @@ class TestTrajectoryViewAPI(unittest.TestCase): config["env_config"] = {"num_agents": num_agents} num_iterations = 2 - trainer = ppo.PPO(config=config) + trainer = ppo.PPOTrainer(config=config) results = None for i in range(num_iterations): results = trainer.train() diff --git a/rllib/examples/action_masking.py b/rllib/examples/action_masking.py index 3db2b076e..9b2736f2e 100644 --- a/rllib/examples/action_masking.py +++ b/rllib/examples/action_masking.py @@ -147,7 +147,7 @@ if __name__ == "__main__": raise ValueError("This example only supports APPO and PPO.") ppo_config = ppo.DEFAULT_CONFIG.copy() ppo_config.update(config) - trainer = ppo.PPO(config=ppo_config, env=ActionMaskEnv) + trainer = ppo.PPOTrainer(config=ppo_config, env=ActionMaskEnv) # run manual training loop and print results after each iteration for _ in range(args.stop_iters): result = trainer.train() diff --git a/rllib/examples/attention_net.py b/rllib/examples/attention_net.py index 8ce9bc060..16496b7cb 100644 --- a/rllib/examples/attention_net.py +++ b/rllib/examples/attention_net.py @@ -170,7 +170,7 @@ if __name__ == "__main__": raise ValueError("Only support --run PPO with --no-tune.") 
ppo_config = ppo.DEFAULT_CONFIG.copy() ppo_config.update(config) - trainer = ppo.PPO(config=ppo_config, env=args.env) + trainer = ppo.PPOTrainer(config=ppo_config, env=args.env) # run manual training loop and print results after each iteration for _ in range(args.stop_iters): result = trainer.train() diff --git a/rllib/examples/autoregressive_action_dist.py b/rllib/examples/autoregressive_action_dist.py index 6a64370e6..c388439ca 100644 --- a/rllib/examples/autoregressive_action_dist.py +++ b/rllib/examples/autoregressive_action_dist.py @@ -163,7 +163,7 @@ if __name__ == "__main__": raise ValueError("Only support --run PPO with --no-tune.") ppo_config = ppo.DEFAULT_CONFIG.copy() ppo_config.update(config) - trainer = ppo.PPO(config=ppo_config, env=CorrelatedActionsEnv) + trainer = ppo.PPOTrainer(config=ppo_config, env=CorrelatedActionsEnv) # run manual training loop and print results after each iteration for _ in range(args.stop_iters): result = trainer.train() diff --git a/rllib/examples/cartpole_lstm.py b/rllib/examples/cartpole_lstm.py index 33f827d76..a0fe86e92 100644 --- a/rllib/examples/cartpole_lstm.py +++ b/rllib/examples/cartpole_lstm.py @@ -86,9 +86,9 @@ if __name__ == "__main__": # Example (use `config` from the above code): # >> import numpy as np - # >> from ray.rllib.algorithms.ppo import PPO + # >> from ray.rllib.agents.ppo import PPOTrainer # >> - # >> trainer = PPO(config) + # >> trainer = PPOTrainer(config) # >> lstm_cell_size = config["model"]["lstm_cell_size"] # >> env = StatelessCartPole() # >> obs = env.reset() diff --git a/rllib/examples/centralized_critic.py b/rllib/examples/centralized_critic.py index 517dc068c..273b539d3 100644 --- a/rllib/examples/centralized_critic.py +++ b/rllib/examples/centralized_critic.py @@ -20,12 +20,12 @@ import os import ray from ray import tune -from ray.rllib.algorithms.ppo.ppo import PPO -from ray.rllib.algorithms.ppo.ppo_tf_policy import ( - PPOTF1Policy, - PPOTF2Policy, +from ray.rllib.agents.ppo.ppo import PPOTrainer +from ray.rllib.agents.ppo.ppo_tf_policy import ( + PPOStaticGraphTFPolicy, + PPOEagerTFPolicy, ) -from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy +from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy from ray.rllib.evaluation.postprocessing import compute_advantages, Postprocessing from ray.rllib.examples.env.two_step_game import TwoStepGame from ray.rllib.examples.models.centralized_critic_models import ( @@ -209,8 +209,8 @@ def get_ccppo_policy(base): return CCPPOTFPolicy -CCPPOStaticGraphTFPolicy = get_ccppo_policy(PPOTF1Policy) -CCPPOEagerTFPolicy = get_ccppo_policy(PPOTF2Policy) +CCPPOStaticGraphTFPolicy = get_ccppo_policy(PPOStaticGraphTFPolicy) +CCPPOEagerTFPolicy = get_ccppo_policy(PPOEagerTFPolicy) class CCPPOTorchPolicy(CentralizedValueMixin, PPOTorchPolicy): @@ -231,8 +231,8 @@ class CCPPOTorchPolicy(CentralizedValueMixin, PPOTorchPolicy): ) -class CCTrainer(PPO): - @override(PPO) +class CCTrainer(PPOTrainer): + @override(PPOTrainer) def get_default_policy_class(self, config): if config["framework"] == "torch": return CCPPOTorchPolicy diff --git a/rllib/examples/coin_game_env.py b/rllib/examples/coin_game_env.py index c1a1067f7..91a706856 100644 --- a/rllib/examples/coin_game_env.py +++ b/rllib/examples/coin_game_env.py @@ -7,7 +7,7 @@ import os import ray from ray import tune -from ray.rllib.algorithms.ppo import PPO +from ray.rllib.agents.ppo import PPOTrainer from ray.rllib.examples.env.coin_game_non_vectorized_env import CoinGame, AsymCoinGame parser = 
argparse.ArgumentParser() @@ -71,7 +71,7 @@ def main(debug, stop_iters=2000, tf=False, asymmetric_env=False): } tune_analysis = tune.run( - PPO, + PPOTrainer, config=rllib_config, stop=stop, checkpoint_freq=0, diff --git a/rllib/examples/custom_env.py b/rllib/examples/custom_env.py index 4c6587847..d73521f18 100644 --- a/rllib/examples/custom_env.py +++ b/rllib/examples/custom_env.py @@ -187,7 +187,7 @@ if __name__ == "__main__": ppo_config.update(config) # use fixed learning rate instead of grid search (needs tune) ppo_config["lr"] = 1e-3 - trainer = ppo.PPO(config=ppo_config, env=SimpleCorridor) + trainer = ppo.PPOTrainer(config=ppo_config, env=SimpleCorridor) # run manual training loop and print results after each iteration for _ in range(args.stop_iters): result = trainer.train() diff --git a/rllib/examples/custom_experiment.py b/rllib/examples/custom_experiment.py index 0e3391d81..37abcae72 100644 --- a/rllib/examples/custom_experiment.py +++ b/rllib/examples/custom_experiment.py @@ -3,7 +3,7 @@ import argparse import ray from ray import tune -import ray.rllib.algorithms.ppo as ppo +from ray.rllib.agents import ppo parser = argparse.ArgumentParser() parser.add_argument("--train-iterations", type=int, default=10) @@ -11,7 +11,7 @@ parser.add_argument("--train-iterations", type=int, default=10) def experiment(config): iterations = config.pop("train-iterations") - train_agent = ppo.PPO(config=config, env="CartPole-v0") + train_agent = ppo.PPOTrainer(config=config, env="CartPole-v0") checkpoint = None train_results = {} @@ -25,7 +25,7 @@ def experiment(config): # Manual Eval config["num_workers"] = 0 - eval_agent = ppo.PPO(config=config, env="CartPole-v0") + eval_agent = ppo.PPOTrainer(config=config, env="CartPole-v0") eval_agent.restore(checkpoint) env = eval_agent.workers.local_worker().env @@ -53,5 +53,5 @@ if __name__ == "__main__": tune.run( experiment, config=config, - resources_per_trial=ppo.PPO.default_resource_request(config), + resources_per_trial=ppo.PPOTrainer.default_resource_request(config), ) diff --git a/rllib/examples/custom_rnn_model.py b/rllib/examples/custom_rnn_model.py index 370d1f442..8e2372b90 100644 --- a/rllib/examples/custom_rnn_model.py +++ b/rllib/examples/custom_rnn_model.py @@ -90,9 +90,9 @@ if __name__ == "__main__": # Example (use `config` from the above code): # >> import numpy as np - # >> from ray.rllib.algorithms.ppo import PPO + # >> from ray.rllib.agents.ppo import PPOTrainer # >> - # >> trainer = PPO(config) + # >> trainer = PPOTrainer(config) # >> lstm_cell_size = config["model"]["custom_model_config"]["cell_size"] # >> env = RepeatAfterMeEnv({}) # >> obs = env.reset() diff --git a/rllib/examples/custom_train_fn.py b/rllib/examples/custom_train_fn.py index 2e8f087d6..f1567d45a 100644 --- a/rllib/examples/custom_train_fn.py +++ b/rllib/examples/custom_train_fn.py @@ -10,7 +10,7 @@ import os import ray from ray import tune -from ray.rllib.algorithms.ppo import PPO +from ray.rllib.agents.ppo import PPOTrainer parser = argparse.ArgumentParser() parser.add_argument( @@ -25,7 +25,7 @@ def my_train_fn(config, reporter): iterations = config.pop("train-iterations", 10) # Train for n iterations with high LR - agent1 = PPO(env="CartPole-v0", config=config) + agent1 = PPOTrainer(env="CartPole-v0", config=config) for _ in range(iterations): result = agent1.train() result["phase"] = 1 @@ -36,7 +36,7 @@ def my_train_fn(config, reporter): # Train for n iterations with low LR config["lr"] = 0.0001 - agent2 = PPO(env="CartPole-v0", config=config) + agent2 = 
PPOTrainer(env="CartPole-v0", config=config) agent2.restore(state) for _ in range(iterations): result = agent2.train() @@ -58,5 +58,5 @@ if __name__ == "__main__": "num_workers": 0, "framework": args.framework, } - resources = PPO.default_resource_request(config) + resources = PPOTrainer.default_resource_request(config) tune.run(my_train_fn, resources_per_trial=resources, config=config) diff --git a/rllib/examples/documentation/custom_gym_env.py b/rllib/examples/documentation/custom_gym_env.py index 11353c0c8..37b491478 100644 --- a/rllib/examples/documentation/custom_gym_env.py +++ b/rllib/examples/documentation/custom_gym_env.py @@ -2,7 +2,7 @@ import gym import ray -import ray.rllib.algorithms.ppo as ppo +from ray.rllib.agents import ppo class SimpleCorridor(gym.Env): @@ -35,7 +35,7 @@ config = { }, } -trainer = ppo.PPO(config=config) +trainer = ppo.PPOTrainer(config=config) for _ in range(3): print(trainer.train()) # __rllib-custom-gym-env-end__ diff --git a/rllib/examples/documentation/rllib_in_60s.py b/rllib/examples/documentation/rllib_in_60s.py index 18b3b7edf..4defad394 100644 --- a/rllib/examples/documentation/rllib_in_60s.py +++ b/rllib/examples/documentation/rllib_in_60s.py @@ -1,6 +1,6 @@ # __rllib-in-60s-begin__ # Import the RL algorithm (Trainer) we would like to use. -from ray.rllib.algorithms.ppo import PPO +from ray.rllib.agents.ppo import PPOTrainer # Configure the algorithm. config = { @@ -28,7 +28,7 @@ config = { } # Create our RLlib Trainer. -trainer = PPO(config=config) +trainer = PPOTrainer(config=config) # Run it for n training iterations. A training iteration includes # parallel sample collection by the environment workers as well as diff --git a/rllib/examples/documentation/rllib_on_ray_readme.py b/rllib/examples/documentation/rllib_on_ray_readme.py index b8001d76a..a5cac81f9 100644 --- a/rllib/examples/documentation/rllib_on_ray_readme.py +++ b/rllib/examples/documentation/rllib_on_ray_readme.py @@ -1,6 +1,6 @@ # __quick_start_begin__ import gym -from ray.rllib.algorithms.ppo import PPO +from ray.rllib.agents.ppo import PPOTrainer # Define your problem using python and openAI's gym API: @@ -49,7 +49,7 @@ class SimpleCorridor(gym.Env): # Create an RLlib Trainer instance. -trainer = PPO( +trainer = PPOTrainer( config={ # Env class to use (here: our gym.Env sub-class from above). "env": SimpleCorridor, diff --git a/rllib/examples/documentation/rllib_on_rllib_readme.py b/rllib/examples/documentation/rllib_on_rllib_readme.py index abf14a24b..aeff7ce18 100644 --- a/rllib/examples/documentation/rllib_on_rllib_readme.py +++ b/rllib/examples/documentation/rllib_on_rllib_readme.py @@ -1,5 +1,5 @@ import gym -from ray.rllib.algorithms.ppo import PPO +from ray.rllib.agents.ppo import PPOTrainer # Define your problem using python and openAI's gym API: @@ -51,7 +51,7 @@ class ParrotEnv(gym.Env): # Create an RLlib Trainer instance to learn how to act in the above # environment. -trainer = PPO( +trainer = PPOTrainer( config={ # Env class to use (here: our gym.Env sub-class from above). 
"env": ParrotEnv, diff --git a/rllib/examples/export/onnx_tf.py b/rllib/examples/export/onnx_tf.py index 361baa16a..d259a48d9 100644 --- a/rllib/examples/export/onnx_tf.py +++ b/rllib/examples/export/onnx_tf.py @@ -1,6 +1,6 @@ import numpy as np import ray -import ray.rllib.algorithms.ppo as ppo +import ray.rllib.agents.ppo as ppo import onnxruntime import os import shutil @@ -24,7 +24,7 @@ test_data = { # Start Ray and initialize a PPO trainer ray.init() -trainer = ppo.PPO(config=config, env="CartPole-v0") +trainer = ppo.PPOTrainer(config=config, env="CartPole-v0") # You could train the model here # trainer.train() diff --git a/rllib/examples/export/onnx_torch.py b/rllib/examples/export/onnx_torch.py index e48c73f32..1c461aa2e 100644 --- a/rllib/examples/export/onnx_torch.py +++ b/rllib/examples/export/onnx_torch.py @@ -2,7 +2,7 @@ from distutils.version import LooseVersion import numpy as np import ray -import ray.rllib.algorithms.ppo as ppo +import ray.rllib.agents.ppo as ppo import onnxruntime import os import shutil @@ -28,7 +28,7 @@ test_data = { # Start Ray and initialize a PPO trainer ray.init() -trainer = ppo.PPO(config=config, env="CartPole-v0") +trainer = ppo.PPOTrainer(config=config, env="CartPole-v0") # You could train the model here # trainer.train() diff --git a/rllib/examples/fractional_gpus.py b/rllib/examples/fractional_gpus.py index 2397a452d..9ff34ab7c 100644 --- a/rllib/examples/fractional_gpus.py +++ b/rllib/examples/fractional_gpus.py @@ -104,8 +104,8 @@ if __name__ == "__main__": # Note: The above GPU settings should also work in case you are not # running via tune.run(), but instead do: - # >> from ray.rllib.algorithms.ppo import PPO - # >> trainer = PPO(config=config) + # >> from ray.rllib.agents.ppo import PPOTrainer + # >> trainer = PPOTrainer(config=config) # >> for _ in range(10): # >> results = trainer.train() # >> print(results) diff --git a/rllib/examples/lstm_auto_wrapping.py b/rllib/examples/lstm_auto_wrapping.py index 20f383ca1..88e24df33 100644 --- a/rllib/examples/lstm_auto_wrapping.py +++ b/rllib/examples/lstm_auto_wrapping.py @@ -1,7 +1,7 @@ import numpy as np import ray -import ray.rllib.algorithms.ppo as ppo +import ray.rllib.agents.ppo as ppo from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.framework import try_import_torch @@ -40,7 +40,7 @@ if __name__ == "__main__": ModelCatalog.register_custom_model("my_torch_model", MyCustomModel) # Create the Trainer. 
- trainer = ppo.PPO( + trainer = ppo.PPOTrainer( env="CartPole-v0", config={ "framework": "torch", diff --git a/rllib/examples/multi_agent_two_trainers.py b/rllib/examples/multi_agent_two_trainers.py index b8f9d9367..0a6a6dd08 100644 --- a/rllib/examples/multi_agent_two_trainers.py +++ b/rllib/examples/multi_agent_two_trainers.py @@ -14,10 +14,10 @@ import os import ray from ray.rllib.algorithms.dqn import DQNTrainer, DQNTFPolicy, DQNTorchPolicy -from ray.rllib.algorithms.ppo import ( - PPO, - PPOTF1Policy, - PPOTF2Policy, +from ray.rllib.agents.ppo import ( + PPOTrainer, + PPOStaticGraphTFPolicy, + PPOEagerTFPolicy, PPOTorchPolicy, ) from ray.rllib.examples.env.multi_agent import MultiAgentCartPole @@ -66,9 +66,9 @@ if __name__ == "__main__": if framework == "torch": return PPOTorchPolicy elif framework == "tf": - return PPOTF1Policy + return PPOStaticGraphTFPolicy else: - return PPOTF2Policy + return PPOEagerTFPolicy elif algorithm == "DQN": if framework == "torch": return DQNTorchPolicy @@ -100,7 +100,7 @@ if __name__ == "__main__": else: return "dqn_policy" - ppo_trainer = PPO( + ppo_trainer = PPOTrainer( env="multi_agent_cartpole", config={ "multiagent": { diff --git a/rllib/examples/remote_envs_with_inference_done_on_main_node.py b/rllib/examples/remote_envs_with_inference_done_on_main_node.py index 0a8805178..73badfc45 100644 --- a/rllib/examples/remote_envs_with_inference_done_on_main_node.py +++ b/rllib/examples/remote_envs_with_inference_done_on_main_node.py @@ -13,7 +13,7 @@ import argparse import os import ray -from ray.rllib.algorithms.ppo import PPO +from ray.rllib.agents.ppo import PPOTrainer from ray.rllib.agents.trainer import Trainer from ray.rllib.utils.annotations import override from ray.rllib.utils.test_utils import check_learning_achieved @@ -76,10 +76,10 @@ def get_cli_args(): # The modified Trainer class we will use. This is the exact same -# as a PPO, but with the additional default_resource_request +# as a PPOTrainer, but with the additional default_resource_request # override, telling tune that it's ok (not mandatory) to place our # n remote envs on a different node (each env using 1 CPU). -class PPOTrainerRemoteInference(PPO): +class PPOTrainerRemoteInference(PPOTrainer): @classmethod @override(Trainer) def default_resource_request(cls, config): diff --git a/rllib/examples/restore_1_of_n_agents_from_checkpoint.py b/rllib/examples/restore_1_of_n_agents_from_checkpoint.py index 94cf5fedc..16e588a2a 100644 --- a/rllib/examples/restore_1_of_n_agents_from_checkpoint.py +++ b/rllib/examples/restore_1_of_n_agents_from_checkpoint.py @@ -16,7 +16,7 @@ import random import ray from ray import tune -from ray.rllib.algorithms.ppo import PPO +from ray.rllib.agents.ppo import PPOTrainer from ray.rllib.examples.env.multi_agent import MultiAgentCartPole from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.test_utils import check_learning_achieved @@ -102,7 +102,7 @@ if __name__ == "__main__": print(f".. best checkpoint was: {best_checkpoint}") # Create a new dummy Trainer to "fix" our checkpoint. - new_trainer = PPO(config=config) + new_trainer = PPOTrainer(config=config) # Get untrained weights for all policies. untrained_weights = new_trainer.get_weights() # Restore all policies from checkpoint. 
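Editor's note: the example-file hunks in this patch all reduce to the same calling convention, so here is a compact sketch of it under the reverted layout. Only the import path, PPOTrainer, DEFAULT_CONFIG, train(), and restore() are taken from this diff; the environment name, the override values, the iteration count, and the use of the Tune Trainable save() call are illustrative assumptions:

    import ray
    import ray.rllib.agents.ppo as ppo

    ray.init()

    # Plain-dict config route used by these examples: DEFAULT_CONFIG plus overrides.
    config = ppo.DEFAULT_CONFIG.copy()
    config["num_workers"] = 1
    config["framework"] = "torch"

    trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
    for _ in range(2):
        print(trainer.train())

    # Save a checkpoint and restore it into a fresh trainer, mirroring the
    # checkpoint-based examples in this patch.
    checkpoint_path = trainer.save()
    restored = ppo.PPOTrainer(config=config, env="CartPole-v0")
    restored.restore(checkpoint_path)

    ray.shutdown()

Per the ppo.py and test hunks, the config-object route remains available as well; ppo.PPOConfig().rollouts(num_workers=1).build(env="CartPole-v0") should yield an equivalent PPOTrainer.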
diff --git a/rllib/examples/sb2rllib_rllib_example.py b/rllib/examples/sb2rllib_rllib_example.py index 936bae364..e88ab2d4d 100644 --- a/rllib/examples/sb2rllib_rllib_example.py +++ b/rllib/examples/sb2rllib_rllib_example.py @@ -7,7 +7,7 @@ Run example: python sb2rllib_rllib_example.py """ import gym import ray -import ray.rllib.algorithms.ppo as ppo +import ray.rllib.agents.ppo as ppo # settings used for both stable baselines and rllib env_name = "CartPole-v1" @@ -30,7 +30,7 @@ checkpoint_path = analysis.get_best_checkpoint(trial=analysis.get_best_trial()) print(f"Trained model saved at {checkpoint_path}") # load and restore model -agent = ppo.PPO(env=env_name) +agent = ppo.PPOTrainer(env=env_name) agent.restore(checkpoint_path) print(f"Agent loaded from saved model at {checkpoint_path}") diff --git a/rllib/examples/self_play_league_based_with_open_spiel.py b/rllib/examples/self_play_league_based_with_open_spiel.py index d7f748419..2564186b6 100644 --- a/rllib/examples/self_play_league_based_with_open_spiel.py +++ b/rllib/examples/self_play_league_based_with_open_spiel.py @@ -40,7 +40,7 @@ import re import ray from ray import tune from ray.rllib.agents.callbacks import DefaultCallbacks -from ray.rllib.algorithms.ppo import PPO +from ray.rllib.agents.ppo import PPOTrainer from ray.rllib.examples.self_play_with_open_spiel import ask_user_for_action from ray.rllib.examples.policy.random_policy import RandomPolicy from ray.rllib.env.wrappers.open_spiel import OpenSpielEnv @@ -340,7 +340,7 @@ if __name__ == "__main__": # human on command line. if args.num_episodes_human_play > 0: num_episodes = 0 - trainer = PPO(config=dict(config, **{"explore": False})) + trainer = PPOTrainer(config=dict(config, **{"explore": False})) if args.from_checkpoint: trainer.restore(args.from_checkpoint) else: diff --git a/rllib/examples/self_play_with_open_spiel.py b/rllib/examples/self_play_with_open_spiel.py index f541fcdae..b3201f0ae 100644 --- a/rllib/examples/self_play_with_open_spiel.py +++ b/rllib/examples/self_play_with_open_spiel.py @@ -28,7 +28,7 @@ import sys import ray from ray import tune from ray.rllib.agents.callbacks import DefaultCallbacks -from ray.rllib.algorithms.ppo import PPO +from ray.rllib.agents.ppo import PPOTrainer from ray.rllib.examples.policy.random_policy import RandomPolicy from ray.rllib.env.wrappers.open_spiel import OpenSpielEnv from ray.rllib.policy.policy import PolicySpec @@ -242,7 +242,7 @@ if __name__ == "__main__": # human on command line. if args.num_episodes_human_play > 0: num_episodes = 0 - trainer = PPO(config=dict(config, **{"explore": False})) + trainer = PPOTrainer(config=dict(config, **{"explore": False})) if args.from_checkpoint: trainer.restore(args.from_checkpoint) else: diff --git a/rllib/examples/sumo_env_local.py b/rllib/examples/sumo_env_local.py index bc0498854..4b195c7de 100644 --- a/rllib/examples/sumo_env_local.py +++ b/rllib/examples/sumo_env_local.py @@ -19,7 +19,7 @@ from pprint import pformat import ray from ray import tune -from ray.rllib.algorithms.ppo import ppo +from ray.rllib.agents.ppo import ppo from ray.rllib.examples.simulators.sumo import marlenvironment from ray.rllib.utils.test_utils import check_learning_achieved @@ -78,7 +78,7 @@ if __name__ == "__main__": tune.register_env("sumo_test_env", marlenvironment.env_creator) # Algorithm. 
- policy_class = ppo.PPOTF1Policy + policy_class = ppo.PPOStaticGraphTFPolicy config = ppo.DEFAULT_CONFIG config["framework"] = "tf" config["gamma"] = 0.99 diff --git a/rllib/examples/trajectory_view_api.py b/rllib/examples/trajectory_view_api.py index 76fcc7326..e785d4054 100644 --- a/rllib/examples/trajectory_view_api.py +++ b/rllib/examples/trajectory_view_api.py @@ -2,7 +2,7 @@ import argparse import numpy as np import ray -from ray.rllib.algorithms.ppo import PPO +from ray.rllib.agents.ppo import PPOTrainer from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole from ray.rllib.examples.models.trajectory_view_utilizing_models import ( FrameStackingCartPoleModel, @@ -94,7 +94,7 @@ if __name__ == "__main__": ) checkpoint_path = checkpoints[0][0] - trainer = PPO(config) + trainer = PPOTrainer(config) trainer.restore(checkpoint_path) # Inference loop. diff --git a/rllib/examples/two_trainer_workflow.py b/rllib/examples/two_trainer_workflow.py index ec9ff13ef..2cbfc6d1b 100644 --- a/rllib/examples/two_trainer_workflow.py +++ b/rllib/examples/two_trainer_workflow.py @@ -15,9 +15,9 @@ from ray.rllib.agents.trainer import Trainer from ray.rllib.algorithms.dqn.dqn import DEFAULT_CONFIG as DQN_CONFIG from ray.rllib.algorithms.dqn.dqn_tf_policy import DQNTFPolicy from ray.rllib.algorithms.dqn.dqn_torch_policy import DQNTorchPolicy -from ray.rllib.algorithms.ppo.ppo import DEFAULT_CONFIG as PPO_CONFIG -from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy -from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy +from ray.rllib.agents.ppo.ppo import DEFAULT_CONFIG as PPO_CONFIG +from ray.rllib.agents.ppo.ppo_tf_policy import PPOStaticGraphTFPolicy +from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy from ray.rllib.evaluation.postprocessing import Postprocessing from ray.rllib.execution.rollout_ops import synchronous_parallel_sample from ray.rllib.execution.train_ops import train_one_step @@ -179,7 +179,9 @@ if __name__ == "__main__": # policy configs, we have to explicitly set it in the multiagent config: policies = { "ppo_policy": ( - PPOTorchPolicy if args.torch or args.mixed_torch_tf else PPOTF1Policy, + PPOTorchPolicy + if args.torch or args.mixed_torch_tf + else PPOStaticGraphTFPolicy, None, None, ppo_config, diff --git a/rllib/models/modelv2.py b/rllib/models/modelv2.py index 624cab877..017669099 100644 --- a/rllib/models/modelv2.py +++ b/rllib/models/modelv2.py @@ -283,8 +283,8 @@ class ModelV2: h5_file: The h5 file name to import weights from. Example: - >>> from ray.rllib.algorithms.ppo import PPO - >>> trainer = PPO(...) # doctest: +SKIP + >>> from ray.rllib.agents.ppo import PPOTrainer + >>> trainer = PPOTrainer(...) 
# doctest: +SKIP >>> trainer.import_policy_model_from_h5("/tmp/weights.h5") # doctest: +SKIP >>> for _ in range(10): # doctest: +SKIP >>> trainer.train() # doctest: +SKIP diff --git a/rllib/models/tests/test_models.py b/rllib/models/tests/test_models.py index f50edef3a..58fe761df 100644 --- a/rllib/models/tests/test_models.py +++ b/rllib/models/tests/test_models.py @@ -3,7 +3,7 @@ import numpy as np import unittest import ray -import ray.rllib.algorithms.ppo as ppo +import ray.rllib.agents.ppo as ppo from ray.rllib.examples.models.modelv3 import RNNModel from ray.rllib.models.tf.tf_modelv2 import TFModelV2 from ray.rllib.models.tf.fcnet import FullyConnectedNetwork @@ -71,7 +71,7 @@ class TestModels(unittest.TestCase): }, "num_workers": 0, } - trainer = ppo.PPO(config=config) + trainer = ppo.PPOTrainer(config=config) for _ in range(2): results = trainer.train() print(results) diff --git a/rllib/models/tests/test_preprocessors.py b/rllib/models/tests/test_preprocessors.py index afcfcd4dd..bb0fb1eed 100644 --- a/rllib/models/tests/test_preprocessors.py +++ b/rllib/models/tests/test_preprocessors.py @@ -4,7 +4,7 @@ import numpy as np import unittest import ray -import ray.rllib.algorithms.ppo as ppo +import ray.rllib.agents.ppo as ppo from ray.rllib.models.catalog import ModelCatalog from ray.rllib.models.preprocessors import ( DictFlatteningPreprocessor, @@ -67,7 +67,7 @@ class TestPreprocessors(unittest.TestCase): num_iterations = 1 # Only supported for tf so far. for _ in framework_iterator(config): - trainer = ppo.PPO(config=config) + trainer = ppo.PPOTrainer(config=config) for i in range(num_iterations): results = trainer.train() check_train_results(results) diff --git a/rllib/policy/tests/test_compute_log_likelihoods.py b/rllib/policy/tests/test_compute_log_likelihoods.py index a1e8afc73..c5ad1c539 100644 --- a/rllib/policy/tests/test_compute_log_likelihoods.py +++ b/rllib/policy/tests/test_compute_log_likelihoods.py @@ -5,7 +5,7 @@ import unittest import ray import ray.rllib.algorithms.dqn as dqn import ray.rllib.algorithms.pg as pg -import ray.rllib.algorithms.ppo as ppo +import ray.rllib.agents.ppo as ppo import ray.rllib.algorithms.sac as sac from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.test_utils import check, framework_iterator @@ -164,14 +164,14 @@ class TestComputeLogLikelihood(unittest.TestCase): config["model"]["fcnet_hiddens"] = [10] config["model"]["fcnet_activation"] = "linear" prev_a = np.array([0.0]) - do_test_log_likelihood(ppo.PPO, config, prev_a, continuous=True) + do_test_log_likelihood(ppo.PPOTrainer, config, prev_a, continuous=True) def test_ppo_discr(self): """Tests PPO's (discr. actions) compute_log_likelihoods method.""" config = ppo.DEFAULT_CONFIG.copy() config["seed"] = 42 prev_a = np.array(0) - do_test_log_likelihood(ppo.PPO, config, prev_a) + do_test_log_likelihood(ppo.PPOTrainer, config, prev_a) def test_sac_cont(self): """Tests SAC's (cont. 
actions) compute_log_likelihoods method.""" diff --git a/rllib/tests/test_execution.py b/rllib/tests/test_execution.py index f6a4e8470..a587f870c 100644 --- a/rllib/tests/test_execution.py +++ b/rllib/tests/test_execution.py @@ -5,7 +5,7 @@ import queue import unittest import ray -from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy +from ray.rllib.agents.ppo.ppo_tf_policy import PPOStaticGraphTFPolicy from ray.rllib.evaluation.worker_set import WorkerSet from ray.rllib.evaluation.rollout_worker import RolloutWorker from ray.rllib.execution.common import STEPS_SAMPLED_COUNTER, STEPS_TRAINED_COUNTER @@ -38,13 +38,13 @@ def iter_list(values): def make_workers(n): local = RolloutWorker( env_creator=lambda _: gym.make("CartPole-v0"), - policy_spec=PPOTF1Policy, + policy_spec=PPOStaticGraphTFPolicy, rollout_fragment_length=100, ) remotes = [ RolloutWorker.as_remote().remote( env_creator=lambda _: gym.make("CartPole-v0"), - policy_spec=PPOTF1Policy, + policy_spec=PPOStaticGraphTFPolicy, rollout_fragment_length=100, ) for _ in range(n) diff --git a/rllib/tests/test_lstm.py b/rllib/tests/test_lstm.py index d6f3b33db..238619c44 100644 --- a/rllib/tests/test_lstm.py +++ b/rllib/tests/test_lstm.py @@ -3,7 +3,7 @@ import pickle import unittest import ray -from ray.rllib.algorithms.ppo import PPO +from ray.rllib.agents.ppo import PPOTrainer from ray.rllib.examples.env.debug_counter_env import DebugCounterEnv from ray.rllib.examples.models.rnn_spy_model import RNNSpyModel from ray.rllib.models import ModelCatalog @@ -176,7 +176,7 @@ class TestRNNSequencing(unittest.TestCase): def test_simple_optimizer_sequencing(self): ModelCatalog.register_custom_model("rnn", RNNSpyModel) register_env("counter", lambda _: DebugCounterEnv()) - ppo = PPO( + ppo = PPOTrainer( env="counter", config={ "num_workers": 0, @@ -244,7 +244,7 @@ class TestRNNSequencing(unittest.TestCase): def test_minibatch_sequencing(self): ModelCatalog.register_custom_model("rnn", RNNSpyModel) register_env("counter", lambda _: DebugCounterEnv()) - ppo = PPO( + ppo = PPOTrainer( env="counter", config={ "shuffle_sequences": False, # for deterministic testing diff --git a/rllib/tests/test_nn_framework_import_errors.py b/rllib/tests/test_nn_framework_import_errors.py index beb627abe..450e7e6c7 100644 --- a/rllib/tests/test_nn_framework_import_errors.py +++ b/rllib/tests/test_nn_framework_import_errors.py @@ -2,7 +2,7 @@ import os import pytest -import ray.rllib.algorithms.ppo as ppo +import ray.rllib.agents.ppo as ppo from ray.rllib.utils.test_utils import framework_iterator @@ -18,7 +18,7 @@ def test_dont_import_tf_error(): with pytest.raises( ImportError, match="However, there was no installation found." 
): - ppo.PPO(config, env="CartPole-v1") + ppo.PPOTrainer(config, env="CartPole-v1") def test_dont_import_torch_error(): @@ -29,7 +29,7 @@ def test_dont_import_torch_error(): os.environ["RLLIB_TEST_NO_TORCH_IMPORT"] = "1" config = {"framework": "torch"} with pytest.raises(ImportError, match="However, there was no installation found."): - ppo.PPO(config, env="CartPole-v1") + ppo.PPOTrainer(config, env="CartPole-v1") if __name__ == "__main__": diff --git a/rllib/tests/test_ray_client.py b/rllib/tests/test_ray_client.py index 6628bab80..71292cd37 100644 --- a/rllib/tests/test_ray_client.py +++ b/rllib/tests/test_ray_client.py @@ -5,7 +5,7 @@ import unittest import pytest import ray from ray import tune -import ray.rllib.algorithms.ppo as ppo +from ray.rllib.agents import ppo from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole from ray.util.client.ray_client_helpers import ray_start_client_server @@ -29,7 +29,7 @@ class TestRayClient(unittest.TestCase): "num_workers": 0, "framework": "tf", } - resources = ppo.PPO.default_resource_request(config) + resources = ppo.PPOTrainer.default_resource_request(config) from ray.rllib.examples.custom_train_fn import my_train_fn tune.run(my_train_fn, resources_per_trial=resources, config=config) @@ -63,7 +63,7 @@ class TestRayClient(unittest.TestCase): tune.run( experiment, config=config, - resources_per_trial=ppo.PPO.default_resource_request(config), + resources_per_trial=ppo.PPOTrainer.default_resource_request(config), ) diff --git a/rllib/train.py b/rllib/train.py index a68e2890e..16759dfe1 100755 --- a/rllib/train.py +++ b/rllib/train.py @@ -23,10 +23,10 @@ Training example via RLlib CLI: rllib train --run DQN --env CartPole-v0 Grid search example via RLlib CLI: - rllib train -f tuned_examples/cartpole-ppo-grid-search-example.yaml + rllib train -f tuned_examples/cartpole-grid-search-example.yaml Grid search example via executable: - ./train.py -f tuned_examples/cartpole-ppo-grid-search-example.yaml + ./train.py -f tuned_examples/cartpole-grid-search-example.yaml Note that -f overrides all other trial-specific command-line options. 
""" diff --git a/rllib/tuned_examples/ddppo/atari-ddppo.yaml b/rllib/tuned_examples/ppo/atari-ddppo.yaml similarity index 100% rename from rllib/tuned_examples/ddppo/atari-ddppo.yaml rename to rllib/tuned_examples/ppo/atari-ddppo.yaml diff --git a/rllib/tuned_examples/appo/cartpole-appo-vtrace-fake-gpus.yaml b/rllib/tuned_examples/ppo/cartpole-appo-vtrace-fake-gpus.yaml similarity index 100% rename from rllib/tuned_examples/appo/cartpole-appo-vtrace-fake-gpus.yaml rename to rllib/tuned_examples/ppo/cartpole-appo-vtrace-fake-gpus.yaml diff --git a/rllib/tuned_examples/appo/cartpole-appo-vtrace-separate-losses.yaml b/rllib/tuned_examples/ppo/cartpole-appo-vtrace-separate-losses.yaml similarity index 100% rename from rllib/tuned_examples/appo/cartpole-appo-vtrace-separate-losses.yaml rename to rllib/tuned_examples/ppo/cartpole-appo-vtrace-separate-losses.yaml diff --git a/rllib/tuned_examples/appo/cartpole-appo-vtrace.yaml b/rllib/tuned_examples/ppo/cartpole-appo-vtrace.yaml similarity index 100% rename from rllib/tuned_examples/appo/cartpole-appo-vtrace.yaml rename to rllib/tuned_examples/ppo/cartpole-appo-vtrace.yaml diff --git a/rllib/tuned_examples/appo/cartpole-appo.yaml b/rllib/tuned_examples/ppo/cartpole-appo.yaml similarity index 100% rename from rllib/tuned_examples/appo/cartpole-appo.yaml rename to rllib/tuned_examples/ppo/cartpole-appo.yaml diff --git a/rllib/tuned_examples/ddppo/cartpole-ddppo.yaml b/rllib/tuned_examples/ppo/cartpole-ddppo.yaml similarity index 100% rename from rllib/tuned_examples/ddppo/cartpole-ddppo.yaml rename to rllib/tuned_examples/ppo/cartpole-ddppo.yaml diff --git a/rllib/tuned_examples/ppo/cartpole-ppo-grid-search-example.yaml b/rllib/tuned_examples/ppo/cartpole-grid-search-example.yaml similarity index 90% rename from rllib/tuned_examples/ppo/cartpole-ppo-grid-search-example.yaml rename to rllib/tuned_examples/ppo/cartpole-grid-search-example.yaml index 0bf1f098e..2e622d336 100644 --- a/rllib/tuned_examples/ppo/cartpole-ppo-grid-search-example.yaml +++ b/rllib/tuned_examples/ppo/cartpole-grid-search-example.yaml @@ -1,4 +1,4 @@ -cartpole-ppo-grid-search-example: +cartpole-ppo: env: CartPole-v0 run: PPO stop: diff --git a/rllib/tuned_examples/appo/frozenlake-appo-vtrace.yaml b/rllib/tuned_examples/ppo/frozenlake-appo-vtrace.yaml similarity index 100% rename from rllib/tuned_examples/appo/frozenlake-appo-vtrace.yaml rename to rllib/tuned_examples/ppo/frozenlake-appo-vtrace.yaml diff --git a/rllib/tuned_examples/appo/halfcheetah-appo.yaml b/rllib/tuned_examples/ppo/halfcheetah-appo.yaml similarity index 100% rename from rllib/tuned_examples/appo/halfcheetah-appo.yaml rename to rllib/tuned_examples/ppo/halfcheetah-appo.yaml diff --git a/rllib/tuned_examples/appo/memory-leak-test-appo.yaml b/rllib/tuned_examples/ppo/memory-leak-test-appo.yaml similarity index 100% rename from rllib/tuned_examples/appo/memory-leak-test-appo.yaml rename to rllib/tuned_examples/ppo/memory-leak-test-appo.yaml diff --git a/rllib/tuned_examples/appo/pendulum-appo.yaml b/rllib/tuned_examples/ppo/pendulum-appo.yaml similarity index 100% rename from rllib/tuned_examples/appo/pendulum-appo.yaml rename to rllib/tuned_examples/ppo/pendulum-appo.yaml diff --git a/rllib/tuned_examples/ddppo/pendulum-ddppo.yaml b/rllib/tuned_examples/ppo/pendulum-ddppo.yaml similarity index 100% rename from rllib/tuned_examples/ddppo/pendulum-ddppo.yaml rename to rllib/tuned_examples/ppo/pendulum-ddppo.yaml diff --git a/rllib/tuned_examples/appo/pong-appo.yaml b/rllib/tuned_examples/ppo/pong-appo.yaml 
similarity index 100% rename from rllib/tuned_examples/appo/pong-appo.yaml rename to rllib/tuned_examples/ppo/pong-appo.yaml diff --git a/rllib/utils/error.py b/rllib/utils/error.py index 47e7b166b..ea229e145 100644 --- a/rllib/utils/error.py +++ b/rllib/utils/error.py @@ -56,5 +56,5 @@ To change the config for the `rllib train|rollout` command, use To change the config for `tune.run()` in a script: Modify the python dict passed to `tune.run(config=[...])`. To change the config for an RLlib Trainer instance: Modify the python dict - passed to the Trainer's constructor, e.g. `PPO(config=[...])`. + passed to the Trainer's constructor, e.g. `PPOTrainer(config=[...])`. """ diff --git a/rllib/utils/exploration/tests/test_curiosity.py b/rllib/utils/exploration/tests/test_curiosity.py index 9dcef9aa8..3bc0c1382 100644 --- a/rllib/utils/exploration/tests/test_curiosity.py +++ b/rllib/utils/exploration/tests/test_curiosity.py @@ -8,7 +8,7 @@ import unittest import ray from ray import tune from ray.rllib.agents.callbacks import DefaultCallbacks -import ray.rllib.algorithms.ppo as ppo +import ray.rllib.agents.ppo as ppo from ray.rllib.utils.test_utils import check_learning_achieved, framework_iterator from ray.rllib.utils.numpy import one_hot from ray.tune import register_env @@ -180,7 +180,7 @@ class TestCuriosity(unittest.TestCase): "type": "StochasticSampling", }, } - trainer = ppo.PPO(config=config) + trainer = ppo.PPOTrainer(config=config) learnt = False for i in range(num_iterations): result = trainer.train() @@ -199,7 +199,7 @@ class TestCuriosity(unittest.TestCase): # config["exploration_config"] = { # "type": "StochasticSampling", # } - # trainer = ppo.PPO(config=config) + # trainer = ppo.PPOTrainer(config=config) # rewards_wo = 0.0 # for _ in range(num_iterations): # result = trainer.train() @@ -251,7 +251,7 @@ class TestCuriosity(unittest.TestCase): } for _ in framework_iterator(config, frameworks="torch"): # To replay: - # trainer = ppo.PPO(config=config) + # trainer = ppo.PPOTrainer(config=config) # trainer.restore("[checkpoint file]") # env = env_maker(config["env_config"]) # s = env.reset() diff --git a/rllib/utils/exploration/tests/test_explorations.py b/rllib/utils/exploration/tests/test_explorations.py index b88f7f96f..8b3bb76ac 100644 --- a/rllib/utils/exploration/tests/test_explorations.py +++ b/rllib/utils/exploration/tests/test_explorations.py @@ -8,9 +8,9 @@ import ray.rllib.algorithms.a3c as a3c import ray.rllib.algorithms.ddpg as ddpg import ray.rllib.algorithms.ddpg.td3 as td3 import ray.rllib.algorithms.dqn as dqn -import ray.rllib.algorithms.impala as impala +import ray.rllib.agents.impala as impala import ray.rllib.algorithms.pg as pg -import ray.rllib.algorithms.ppo as ppo +import ray.rllib.agents.ppo as ppo import ray.rllib.algorithms.sac as sac from ray.rllib.utils import check, framework_iterator @@ -34,7 +34,7 @@ def do_test_explorations( local_config = core_config.copy() if exploration == "Random": # TODO(sven): Random doesn't work for IMPALA yet. 
- if run is impala.Impala: + if run is impala.ImpalaTrainer: continue local_config["exploration_config"] = {"type": "Random"} print("exploration={}".format(exploration or "default")) @@ -139,7 +139,7 @@ class TestExplorations(unittest.TestCase): def test_impala(self): do_test_explorations( - impala.Impala, + impala.ImpalaTrainer, "CartPole-v0", dict(impala.DEFAULT_CONFIG.copy(), num_gpus=0), np.array([0.0, 0.1, 0.0, 0.0]), @@ -157,7 +157,7 @@ class TestExplorations(unittest.TestCase): def test_ppo_discr(self): do_test_explorations( - ppo.PPO, + ppo.PPOTrainer, "CartPole-v0", ppo.DEFAULT_CONFIG, np.array([0.0, 0.1, 0.0, 0.0]), @@ -166,7 +166,7 @@ class TestExplorations(unittest.TestCase): def test_ppo_cont(self): do_test_explorations( - ppo.PPO, + ppo.PPOTrainer, "Pendulum-v1", ppo.DEFAULT_CONFIG, np.array([0.0, 0.1, 0.0]), diff --git a/rllib/utils/exploration/tests/test_random_encoder.py b/rllib/utils/exploration/tests/test_random_encoder.py index f8a960e7f..f6a24de2b 100644 --- a/rllib/utils/exploration/tests/test_random_encoder.py +++ b/rllib/utils/exploration/tests/test_random_encoder.py @@ -3,8 +3,7 @@ import unittest import pytest import ray -import ray.rllib.algorithms.ppo as ppo -import ray.rllib.algorithms.sac as sac +from ray.rllib.agents import ppo, sac from ray.rllib.agents.callbacks import RE3UpdateCallbacks @@ -25,11 +24,11 @@ class TestRE3(unittest.TestCase): Both the on-policy and off-policy setups are validated. """ if rl_algorithm == "PPO": - config = ppo.PPOConfig().to_dict() - trainer_cls = ppo.PPO + config = ppo.DEFAULT_CONFIG.copy() + trainer_cls = ppo.PPOTrainer beta_schedule = "constant" elif rl_algorithm == "SAC": - config = sac.SACConfig().to_dict() + config = sac.DEFAULT_CONFIG.copy() trainer_cls = sac.SACTrainer beta_schedule = "linear_decay" diff --git a/rllib/utils/tests/test_errors.py b/rllib/utils/tests/test_errors.py index 6b2880022..96f64cf7c 100644 --- a/rllib/utils/tests/test_errors.py +++ b/rllib/utils/tests/test_errors.py @@ -1,7 +1,7 @@ import unittest import ray -import ray.rllib.algorithms.impala as impala +import ray.rllib.agents.impala as impala import ray.rllib.algorithms.pg as pg from ray.rllib.utils.error import EnvError from ray.rllib.utils.test_utils import framework_iterator @@ -32,7 +32,7 @@ class TestErrors(unittest.TestCase): RuntimeError, # (?s): "dot matches all" (also newlines). "(?s)Found 0 GPUs on your machine.+To change the config", - lambda: impala.Impala(config=config, env=env), + lambda: impala.ImpalaTrainer(config=config, env=env), ) def test_bad_envs(self):