From 96693055bd4b466e438564f20e9adaef52e4471b Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Mon, 20 Jun 2022 15:54:00 +0200 Subject: [PATCH] [RLlib] More Trainer -> Algorithm renaming cleanups. (#25869) --- .buildkite/pipeline.ml.yml | 8 ++-- doc/source/rllib/rllib-training.rst | 4 +- rllib/BUILD | 39 +++++++-------- rllib/agents/tests/__init__.py | 0 .../alpha_zero/tests/test_alpha_zero.py | 5 +- rllib/algorithms/appo/tests/test_appo.py | 48 +++++++++---------- rllib/algorithms/ars/tests/test_ars.py | 8 ++-- rllib/algorithms/es/tests/test_es.py | 8 ++-- rllib/algorithms/marwil/marwil.py | 4 +- rllib/algorithms/r2d2/r2d2.py | 6 +-- rllib/algorithms/r2d2/tests/test_r2d2.py | 6 +-- rllib/algorithms/sac/sac.py | 4 +- rllib/algorithms/td3/td3.py | 4 +- .../tests/test_algorithm.py} | 6 +-- .../tests/test_callbacks.py | 16 +++---- .../tests/test_memory_leaks.py | 12 ++--- .../tests/test_worker_failures.py | 0 rllib/connectors/action/pipeline.py | 6 +-- rllib/connectors/agent/pipeline.py | 4 +- rllib/connectors/connector.py | 4 +- rllib/env/multi_agent_env.py | 2 +- .../tests/test_trajectory_view_api.py | 16 +++---- rllib/examples/cartpole_lstm.py | 6 +-- rllib/examples/eager_execution.py | 6 +-- rllib/examples/offline_rl.py | 13 +++-- rllib/examples/random_parametric_agent.py | 8 ++-- ...e_envs_with_inference_done_on_main_node.py | 10 ++-- rllib/examples/two_trainer_workflow.py | 8 ++-- rllib/examples/vizdoom_with_attention_net.py | 2 +- rllib/execution/metric_ops.py | 2 +- rllib/offline/estimators/tests/test_ope.py | 18 +++---- rllib/policy/eager_tf_policy.py | 2 +- rllib/policy/torch_mixins.py | 2 +- rllib/tests/test_dependency_torch.py | 6 +-- rllib/tests/test_placement_groups.py | 6 +-- rllib/tests/test_timesteps.py | 9 ++-- rllib/utils/annotations.py | 16 +++---- rllib/utils/debug/memory.py | 6 +-- rllib/utils/metrics/__init__.py | 2 +- 39 files changed, 166 insertions(+), 166 deletions(-) delete mode 100644 rllib/agents/tests/__init__.py rename rllib/{agents/tests/test_trainer.py => algorithms/tests/test_algorithm.py} (98%) rename rllib/{agents => algorithms}/tests/test_callbacks.py (89%) rename rllib/{agents => algorithms}/tests/test_memory_leaks.py (85%) rename rllib/{agents => algorithms}/tests/test_worker_failures.py (100%) diff --git a/.buildkite/pipeline.ml.yml b/.buildkite/pipeline.ml.yml index ceda116bb..8487630ba 100644 --- a/.buildkite/pipeline.ml.yml +++ b/.buildkite/pipeline.ml.yml @@ -123,24 +123,24 @@ --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... -- label: ":brain: RLlib: Trainer Tests (generic)" +- label: ":brain: RLlib: Algorithm Tests (generic)" conditions: ["RAY_CI_RLLIB_DIRECTLY_AFFECTED"] commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - RLLIB_TESTING=1 PYTHON=3.7 ./ci/env/install-dependencies.sh - # Test all tests in the `agents` (soon to be "trainers") dir: + # Test all tests in the `algorithms` dir: - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=algorithms_dir_generic,-multi_gpu --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... 
-- label: ":brain: RLlib: Trainer Tests (specific algos)" +- label: ":brain: RLlib: Algorithm Tests (specific algos)" conditions: ["RAY_CI_RLLIB_DIRECTLY_AFFECTED"] commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - RLLIB_TESTING=1 PYTHON=3.7 ./ci/env/install-dependencies.sh - # Test all tests in the `agents` (soon to be "trainers") dir: + # Test all tests in the `algorithms` dir: - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=algorithms_dir,-algorithms_dir_generic,-multi_gpu diff --git a/doc/source/rllib/rllib-training.rst b/doc/source/rllib/rllib-training.rst index f6c6aac37..56d87544a 100644 --- a/doc/source/rllib/rllib-training.rst +++ b/doc/source/rllib/rllib-training.rst @@ -740,7 +740,7 @@ Here is an example of the basic usage (for a more complete example, see `custom_ # NOTE: In order for this to work, your (custom) model needs to implement # the `import_from_h5` method. # See https://github.com/ray-project/ray/blob/master/rllib/tests/test_model_imports.py - # for detailed examples for tf- and torch trainers/models. + # for detailed examples for tf- and torch policies/models. .. note:: @@ -1270,7 +1270,7 @@ Below are some examples of how the custom evaluation metrics are reported nested Sample output for `python custom_eval.py --custom-eval` ------------------------------------------------------------------------ - INFO trainer.py:631 -- Running custom eval function + INFO algorithm.py:631 -- Running custom eval function Update corridor length to 4 Update corridor length to 7 Custom evaluation round 1 diff --git a/rllib/BUILD b/rllib/BUILD index 878628b1c..257224686 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -15,7 +15,7 @@ # actions vs continuous actions. # -- "fake_gpus": Tests that run using 2 fake GPUs. -# - Quick agent compilation/tune-train tests, tagged "quick_train". +# - Quick algo compilation/tune-train tests, tagged "quick_train". # NOTE: These should be obsoleted in favor of "algorithms_dir" tests as # they cover the same functionaliy. @@ -28,7 +28,7 @@ # - `policy` directory tests. # - `utils` directory tests. -# - Trainer ("agents") tests, tagged "algorithms_dir". +# - Algorithm tests, tagged "algorithms_dir". # - Tests directory (everything in rllib/tests/...), tagged: "tests_dir" and # "tests_dir_[A-Z]" @@ -65,7 +65,7 @@ load("//bazel:python.bzl", "py_test_module_list") # -------------------------------------------------------------------- -# Agents learning regression tests. +# Algorithms learning regression tests. 
# # Tag: learning_tests # @@ -685,40 +685,41 @@ py_test( # -------------------------------------------------------------------- -# Agents (Compilation, Losses, simple agent functionality tests) +# Algorithms (Compilation, Losses, simple functionality tests) # rllib/algorithms/ # # Tag: algorithms_dir # -------------------------------------------------------------------- -# Generic (all Trainers) +# Generic (all Algorithms) + +py_test( + name = "test_algorithm", + tags = ["team:rllib", "algorithms_dir", "algorithms_dir_generic"], + size = "large", + srcs = ["algorithms/tests/test_algorithm.py"] +) + py_test( name = "test_callbacks", tags = ["team:rllib", "algorithms_dir", "algorithms_dir_generic"], size = "medium", - srcs = ["agents/tests/test_callbacks.py"] + srcs = ["algorithms/tests/test_callbacks.py"] ) py_test( name = "test_memory_leaks_generic", - main = "agents/tests/test_memory_leaks.py", + main = "algorithms/tests/test_memory_leaks.py", tags = ["team:rllib", "algorithms_dir"], size = "large", - srcs = ["agents/tests/test_memory_leaks.py"] -) - -py_test( - name = "test_trainer", - tags = ["team:rllib", "algorithms_dir", "algorithms_dir_generic"], - size = "large", - srcs = ["agents/tests/test_trainer.py"] + srcs = ["algorithms/tests/test_memory_leaks.py"] ) py_test( name = "tests/test_worker_failures", tags = ["team:rllib", "tests_dir", "algorithms_dir_generic"], size = "large", - srcs = ["agents/tests/test_worker_failures.py"] + srcs = ["algorithms/tests/test_worker_failures.py"] ) # Specific Algorithms @@ -809,7 +810,7 @@ py_test( py_test( name = "test_cql", tags = ["team:rllib", "algorithms_dir"], - size = "medium", + size = "large", srcs = ["algorithms/cql/tests/test_cql.py"] ) @@ -982,7 +983,7 @@ py_test( ) # -------------------------------------------------------------------- -# contrib Agents +# contrib Algorithms # -------------------------------------------------------------------- py_test( @@ -1071,7 +1072,7 @@ py_test( ) # -------------------------------------------------------------------- -# Agents (quick training test iterations via `rllib train`) +# Algorithms (quick training test iterations via `rllib train`) # # Tag: quick_train # diff --git a/rllib/agents/tests/__init__.py b/rllib/agents/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rllib/algorithms/alpha_zero/tests/test_alpha_zero.py b/rllib/algorithms/alpha_zero/tests/test_alpha_zero.py index 7d3c30e4d..5531642c1 100644 --- a/rllib/algorithms/alpha_zero/tests/test_alpha_zero.py +++ b/rllib/algorithms/alpha_zero/tests/test_alpha_zero.py @@ -30,11 +30,12 @@ class TestAlphaZero(unittest.TestCase): # Only working for torch right now. 
for _ in framework_iterator(config, frameworks="torch"): - trainer = config.build() + algo = config.build() for i in range(num_iterations): - results = trainer.train() + results = algo.train() check_train_results(results) print(results) + algo.stop() if __name__ == "__main__": diff --git a/rllib/algorithms/appo/tests/test_appo.py b/rllib/algorithms/appo/tests/test_appo.py index 55ee3880f..fc8fe64ba 100644 --- a/rllib/algorithms/appo/tests/test_appo.py +++ b/rllib/algorithms/appo/tests/test_appo.py @@ -28,23 +28,23 @@ class TestAPPO(unittest.TestCase): for _ in framework_iterator(config, with_eager_tracing=True): print("w/o v-trace") config.vtrace = False - trainer = config.build(env="CartPole-v0") + algo = config.build(env="CartPole-v0") for i in range(num_iterations): - results = trainer.train() + results = algo.train() check_train_results(results) print(results) - check_compute_single_action(trainer) - trainer.stop() + check_compute_single_action(algo) + algo.stop() print("w/ v-trace") config.vtrace = True - trainer = config.build(env="CartPole-v0") + algo = config.build(env="CartPole-v0") for i in range(num_iterations): - results = trainer.train() + results = algo.train() check_train_results(results) print(results) - check_compute_single_action(trainer) - trainer.stop() + check_compute_single_action(algo) + algo.stop() def test_appo_compilation_use_kl_loss(self): """Test whether APPO can be built with kl_loss enabled.""" @@ -54,13 +54,13 @@ class TestAPPO(unittest.TestCase): num_iterations = 2 for _ in framework_iterator(config, with_eager_tracing=True): - trainer = config.build(env="CartPole-v0") + algo = config.build(env="CartPole-v0") for i in range(num_iterations): - results = trainer.train() + results = algo.train() check_train_results(results) print(results) - check_compute_single_action(trainer) - trainer.stop() + check_compute_single_action(algo) + algo.stop() def test_appo_two_tf_optimizers(self): # Not explicitly setting this should cause a warning, but not fail. @@ -78,13 +78,13 @@ class TestAPPO(unittest.TestCase): # Only supported for tf so far. for _ in framework_iterator(config, frameworks=("tf2", "tf")): - trainer = config.build(env="CartPole-v0") + algo = config.build(env="CartPole-v0") for i in range(num_iterations): - results = trainer.train() + results = algo.train() check_train_results(results) print(results) - check_compute_single_action(trainer) - trainer.stop() + check_compute_single_action(algo) + algo.stop() def test_appo_entropy_coeff_schedule(self): # Initial lr, doesn't really matter because of the schedule below. @@ -113,33 +113,33 @@ class TestAPPO(unittest.TestCase): # which entropy coeff depends on, is updated after each worker rollout. config.min_time_s_per_iteration = 0 - def _step_n_times(trainer, n: int): - """Step trainer n times. + def _step_n_times(algo, n: int): + """Step Algorithm n times. Returns: learning rate at the end of the execution. """ for _ in range(n): - results = trainer.train() - print(trainer.workers.local_worker().global_vars) + results = algo.train() + print(algo.workers.local_worker().global_vars) print(results) return results["info"][LEARNER_INFO][DEFAULT_POLICY_ID][LEARNER_STATS_KEY][ "entropy_coeff" ] for _ in framework_iterator(config): - trainer = config.build(env="CartPole-v0") + algo = config.build(env="CartPole-v0") - coeff = _step_n_times(trainer, 10) # 200 timesteps + coeff = _step_n_times(algo, 10) # 200 timesteps # Should be close to the starting coeff of 0.01. 
self.assertLessEqual(coeff, 0.01) self.assertGreaterEqual(coeff, 0.001) - coeff = _step_n_times(trainer, 20) # 400 timesteps + coeff = _step_n_times(algo, 20) # 400 timesteps # Should have annealed to the final coeff of 0.0001. self.assertLessEqual(coeff, 0.001) - trainer.stop() + algo.stop() if __name__ == "__main__": diff --git a/rllib/algorithms/ars/tests/test_ars.py b/rllib/algorithms/ars/tests/test_ars.py index 0031b61ef..ce8a79d79 100644 --- a/rllib/algorithms/ars/tests/test_ars.py +++ b/rllib/algorithms/ars/tests/test_ars.py @@ -33,13 +33,13 @@ class TestARS(unittest.TestCase): num_iterations = 2 for _ in framework_iterator(config): - trainer = config.build(env="CartPole-v0") + algo = config.build(env="CartPole-v0") for i in range(num_iterations): - results = trainer.train() + results = algo.train() print(results) - check_compute_single_action(trainer) - trainer.stop() + check_compute_single_action(algo) + algo.stop() if __name__ == "__main__": diff --git a/rllib/algorithms/es/tests/test_es.py b/rllib/algorithms/es/tests/test_es.py index 1363570e0..826993d0a 100644 --- a/rllib/algorithms/es/tests/test_es.py +++ b/rllib/algorithms/es/tests/test_es.py @@ -29,13 +29,13 @@ class TestES(unittest.TestCase): for _ in framework_iterator(config): for env in ["CartPole-v0", "Pendulum-v1"]: - trainer = config.build(env=env) + algo = config.build(env=env) for i in range(num_iterations): - results = trainer.train() + results = algo.train() print(results) - check_compute_single_action(trainer) - trainer.stop() + check_compute_single_action(algo) + algo.stop() ray.shutdown() diff --git a/rllib/algorithms/marwil/marwil.py b/rllib/algorithms/marwil/marwil.py index db3a646f7..b4f91ee6b 100644 --- a/rllib/algorithms/marwil/marwil.py +++ b/rllib/algorithms/marwil/marwil.py @@ -37,8 +37,8 @@ class MARWILConfig(AlgorithmConfig): ... .offline_data(input_=["./rllib/tests/data/cartpole/large.json"]) >>> print(config.to_dict()) >>> # Build a Algorithm object from the config and run 1 training iteration. - >>> trainer = config.build() - >>> trainer.train() + >>> algo = config.build() + >>> algo.train() Example: >>> from ray.rllib.algorithms.marwil import MARWILConfig diff --git a/rllib/algorithms/r2d2/r2d2.py b/rllib/algorithms/r2d2/r2d2.py index 882fae3da..ea59f6e2f 100644 --- a/rllib/algorithms/r2d2/r2d2.py +++ b/rllib/algorithms/r2d2/r2d2.py @@ -30,9 +30,9 @@ class R2D2Config(DQNConfig): >>> .resources(num_gpus=1)\ >>> .rollouts(num_rollout_workers=30)\ >>> .environment("CartPole-v1") - >>> trainer = R2D2(config=config) + >>> algo = R2D2(config=config) >>> while True: - >>> trainer.train() + >>> algo.train() Example: >>> from ray.rllib.algorithms.r2d2.r2d2 import R2D2Config @@ -170,8 +170,6 @@ class R2D2Config(DQNConfig): return self -# Build an R2D2 trainer, which uses the framework specific Policy -# determined in `get_policy_class()` above. class R2D2(DQN): """Recurrent Experience Replay in Distrib. Reinforcement Learning (R2D2). diff --git a/rllib/algorithms/r2d2/tests/test_r2d2.py b/rllib/algorithms/r2d2/tests/test_r2d2.py index 675f668c7..5d53aba56 100644 --- a/rllib/algorithms/r2d2/tests/test_r2d2.py +++ b/rllib/algorithms/r2d2/tests/test_r2d2.py @@ -78,14 +78,14 @@ class TestR2D2(unittest.TestCase): # Test building an R2D2 agent in all frameworks. 
for _ in framework_iterator(config, with_eager_tracing=True): - trainer = config.build(env="CartPole-v0") + algo = config.build(env="CartPole-v0") for i in range(num_iterations): - results = trainer.train() + results = algo.train() check_train_results(results) check_batch_sizes(results) print(results) - check_compute_single_action(trainer, include_state=True) + check_compute_single_action(algo, include_state=True) if __name__ == "__main__": diff --git a/rllib/algorithms/sac/sac.py b/rllib/algorithms/sac/sac.py index 1112573ff..ca508b7e6 100644 --- a/rllib/algorithms/sac/sac.py +++ b/rllib/algorithms/sac/sac.py @@ -29,8 +29,8 @@ class SACConfig(AlgorithmConfig): ... .rollouts(num_rollout_workers=4) >>> print(config.to_dict()) >>> # Build a Algorithm object from the config and run 1 training iteration. - >>> trainer = config.build(env="CartPole-v1") - >>> trainer.train() + >>> algo = config.build(env="CartPole-v1") + >>> algo.train() """ def __init__(self, algo_class=None): diff --git a/rllib/algorithms/td3/td3.py b/rllib/algorithms/td3/td3.py index 74355f1de..e7d8b6184 100644 --- a/rllib/algorithms/td3/td3.py +++ b/rllib/algorithms/td3/td3.py @@ -18,8 +18,8 @@ class TD3Config(DDPGConfig): >>> config = TD3Config().training(lr=0.01).resources(num_gpus=1) >>> print(config.to_dict()) >>> # Build a Algorithm object from the config and run one training iteration. - >>> trainer = config.build(env="Pendulum-v1") - >>> trainer.train() + >>> algo = config.build(env="Pendulum-v1") + >>> algo.train() Example: >>> from ray.rllib.algorithms.ddpg.td3 import TD3Config diff --git a/rllib/agents/tests/test_trainer.py b/rllib/algorithms/tests/test_algorithm.py similarity index 98% rename from rllib/agents/tests/test_trainer.py rename to rllib/algorithms/tests/test_algorithm.py index dba4a9759..fbdaf0c59 100644 --- a/rllib/agents/tests/test_trainer.py +++ b/rllib/algorithms/tests/test_algorithm.py @@ -38,10 +38,10 @@ class TestAlgorithm(unittest.TestCase): algo = pg.PG(env="CartPole-v0", config=standard_config) # When (we validate config 2 times). - # Try deprecated `Trainer._validate_config()` method (static). + # Try deprecated `Algorithm._validate_config()` method (static). algo._validate_config(standard_config, algo) config_v1 = copy.deepcopy(standard_config) - # Try new method: `Trainer.validate_config()` (non-static). + # Try new method: `Algorithm.validate_config()` (non-static). algo.validate_config(standard_config) config_v2 = copy.deepcopy(standard_config) @@ -239,7 +239,7 @@ class TestAlgorithm(unittest.TestCase): algo_wo_env_on_driver.stop() # Try again using `create_env_on_driver=True`. - # This force-adds the env on the local-worker, so this Trainer + # This force-adds the env on the local-worker, so this Algorithm # can `evaluate` even though it doesn't have an evaluation-worker # set. 
config.create_env_on_local_worker = True diff --git a/rllib/agents/tests/test_callbacks.py b/rllib/algorithms/tests/test_callbacks.py similarity index 89% rename from rllib/agents/tests/test_callbacks.py rename to rllib/algorithms/tests/test_callbacks.py index 8f8e37fa5..ddbbb90bb 100644 --- a/rllib/agents/tests/test_callbacks.py +++ b/rllib/algorithms/tests/test_callbacks.py @@ -47,13 +47,13 @@ class TestCallbacks(unittest.TestCase): config = dict(base_config, callbacks=callbacks) for _ in framework_iterator(config, frameworks=("tf", "torch")): - trainer = dqn.DQN(config=config) + algo = dqn.DQN(config=config) # Fake the counter on the local worker (doesn't have an env) and # set it to -1 so the below `foreach_worker()` won't fail. - trainer.workers.local_worker().sum_sub_env_vector_indices = -1 + algo.workers.local_worker().sum_sub_env_vector_indices = -1 # Get sub-env vector index sums from the 2 remote workers: - sum_sub_env_vector_indices = trainer.workers.foreach_worker( + sum_sub_env_vector_indices = algo.workers.foreach_worker( lambda w: w.sum_sub_env_vector_indices ) # Local worker has no environments -> Expect the -1 special @@ -63,7 +63,7 @@ class TestCallbacks(unittest.TestCase): # of 6 (sum of vector indices: 0 + 1 + 2 + 3). self.assertTrue(sum_sub_env_vector_indices[1] == 6) self.assertTrue(sum_sub_env_vector_indices[2] == 6) - trainer.stop() + algo.stop() def test_on_sub_environment_created_with_remote_envs(self): base_config = { @@ -84,13 +84,13 @@ class TestCallbacks(unittest.TestCase): config = dict(base_config, callbacks=callbacks) for _ in framework_iterator(config, frameworks=("tf", "torch")): - trainer = dqn.DQN(config=config) + algo = dqn.DQN(config=config) # Fake the counter on the local worker (doesn't have an env) and # set it to -1 so the below `foreach_worker()` won't fail. - trainer.workers.local_worker().sum_sub_env_vector_indices = -1 + algo.workers.local_worker().sum_sub_env_vector_indices = -1 # Get sub-env vector index sums from the 2 remote workers: - sum_sub_env_vector_indices = trainer.workers.foreach_worker( + sum_sub_env_vector_indices = algo.workers.foreach_worker( lambda w: w.sum_sub_env_vector_indices ) # Local worker has no environments -> Expect the -1 special @@ -100,7 +100,7 @@ class TestCallbacks(unittest.TestCase): # of 6 (sum of vector indices: 0 + 1 + 2 + 3). 
self.assertTrue(sum_sub_env_vector_indices[1] == 6) self.assertTrue(sum_sub_env_vector_indices[2] == 6) - trainer.stop() + algo.stop() if __name__ == "__main__": diff --git a/rllib/agents/tests/test_memory_leaks.py b/rllib/algorithms/tests/test_memory_leaks.py similarity index 85% rename from rllib/agents/tests/test_memory_leaks.py rename to rllib/algorithms/tests/test_memory_leaks.py index 42a603381..eff42a4ff 100644 --- a/rllib/agents/tests/test_memory_leaks.py +++ b/rllib/algorithms/tests/test_memory_leaks.py @@ -30,10 +30,10 @@ class TestMemoryLeaks(unittest.TestCase): config["env_config"] = { "static_samples": True, } - trainer = ppo.PPO(config=config) - results = check_memory_leaks(trainer, to_check={"env"}, repeats=150) + algo = ppo.PPO(config=config) + results = check_memory_leaks(algo, to_check={"env"}, repeats=150) assert results["env"] - trainer.stop() + algo.stop() def test_leaky_policy(self): """Tests, whether our diagnostics tools can detect leaks in a policy.""" @@ -45,10 +45,10 @@ class TestMemoryLeaks(unittest.TestCase): config["multiagent"]["policies"] = { "default_policy": PolicySpec(policy_class=MemoryLeakingPolicy), } - trainer = dqn.DQN(config=config) - results = check_memory_leaks(trainer, to_check={"policy"}, repeats=300) + algo = dqn.DQN(config=config) + results = check_memory_leaks(algo, to_check={"policy"}, repeats=300) assert results["policy"] - trainer.stop() + algo.stop() if __name__ == "__main__": diff --git a/rllib/agents/tests/test_worker_failures.py b/rllib/algorithms/tests/test_worker_failures.py similarity index 100% rename from rllib/agents/tests/test_worker_failures.py rename to rllib/algorithms/tests/test_worker_failures.py diff --git a/rllib/connectors/action/pipeline.py b/rllib/connectors/action/pipeline.py index ba9ad55e3..49e1a982d 100644 --- a/rllib/connectors/action/pipeline.py +++ b/rllib/connectors/action/pipeline.py @@ -12,7 +12,7 @@ from ray.rllib.connectors.connector import ( from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.utils.typing import ( ActionConnectorDataType, - TrainerConfigDict, + AlgorithmConfigDict, ) @@ -50,8 +50,8 @@ register_connector(ActionConnectorPipeline.__name__, ActionConnectorPipeline) @DeveloperAPI -def get_action_connectors_from_trainer_config( - config: TrainerConfigDict, action_space: gym.Space +def get_action_connectors_from_algorithm_config( + config: AlgorithmConfigDict, action_space: gym.Space ) -> ActionConnectorPipeline: connectors = [] return ActionConnectorPipeline(connectors) diff --git a/rllib/connectors/agent/pipeline.py b/rllib/connectors/agent/pipeline.py index 595f02c39..1e2ad7dfa 100644 --- a/rllib/connectors/agent/pipeline.py +++ b/rllib/connectors/agent/pipeline.py @@ -15,7 +15,7 @@ from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.utils.typing import ( ActionConnectorDataType, AgentConnectorDataType, - TrainerConfigDict, + AlgorithmConfigDict, ) @@ -67,7 +67,7 @@ register_connector(AgentConnectorPipeline.__name__, AgentConnectorPipeline) # TODO(jungong) : finish this. 
@DeveloperAPI def get_agent_connectors_from_config( - config: TrainerConfigDict, obs_space: gym.Space + config: AlgorithmConfigDict, obs_space: gym.Space ) -> AgentConnectorPipeline: connectors = [FlattenDataAgentConnector()] diff --git a/rllib/connectors/connector.py b/rllib/connectors/connector.py index 492683a20..27bbf778a 100644 --- a/rllib/connectors/connector.py +++ b/rllib/connectors/connector.py @@ -13,8 +13,8 @@ from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.utils.typing import ( ActionConnectorDataType, AgentConnectorDataType, + AlgorithmConfigDict, TensorType, - TrainerConfigDict, ) logger = logging.getLogger(__name__) @@ -34,7 +34,7 @@ class ConnectorContext: def __init__( self, - config: TrainerConfigDict = None, + config: AlgorithmConfigDict = None, model_initial_states: List[TensorType] = None, observation_space: gym.Space = None, action_space: gym.Space = None, diff --git a/rllib/env/multi_agent_env.py b/rllib/env/multi_agent_env.py index 80be168bc..e22a7380b 100644 --- a/rllib/env/multi_agent_env.py +++ b/rllib/env/multi_agent_env.py @@ -30,7 +30,7 @@ class MultiAgentEnv(gym.Env): """An environment that hosts multiple independent agents. Agents are identified by (string) agent ids. Note that these "agents" here - are not to be confused with RLlib Trainers, which are also sometimes + are not to be confused with RLlib Algorithms, which are also sometimes referred to as "agents" or "RL agents". """ diff --git a/rllib/evaluation/tests/test_trajectory_view_api.py b/rllib/evaluation/tests/test_trajectory_view_api.py index 5d49ebd29..de9298d03 100644 --- a/rllib/evaluation/tests/test_trajectory_view_api.py +++ b/rllib/evaluation/tests/test_trajectory_view_api.py @@ -168,16 +168,16 @@ class TestTrajectoryViewAPI(unittest.TestCase): config["env_config"] = {"config": {"start_at_t": 1}} # first obs is [1.0] for _ in framework_iterator(config, frameworks="tf2"): - trainer = ppo.PPO( + algo = ppo.PPO( config, env="ray.rllib.examples.env.debug_counter_env.DebugCounterEnv", ) - rw = trainer.workers.local_worker() + rw = algo.workers.local_worker() sample = rw.sample() - assert sample.count == trainer.config["rollout_fragment_length"] - results = trainer.train() + assert sample.count == algo.config["rollout_fragment_length"] + results = algo.train() assert results["timesteps_total"] == config["train_batch_size"] - trainer.stop() + algo.stop() def test_traj_view_next_action(self): action_space = Discrete(2) @@ -341,10 +341,10 @@ class TestTrajectoryViewAPI(unittest.TestCase): config["env_config"] = {"num_agents": num_agents} num_iterations = 2 - trainer = ppo.PPO(config=config) + algo = ppo.PPO(config=config) results = None for i in range(num_iterations): - results = trainer.train() + results = algo.train() self.assertEqual(results["agent_timesteps_total"], results["timesteps_total"]) self.assertEqual( results["num_env_steps_trained"] * num_agents, @@ -358,7 +358,7 @@ class TestTrajectoryViewAPI(unittest.TestCase): results["agent_timesteps_total"], (num_iterations + 1) * config["train_batch_size"], ) - trainer.stop() + algo.stop() def test_get_single_step_input_dict_batch_repeat_value_larger_1(self): """Test whether a SampleBatch produces the correct 1-step input dict.""" diff --git a/rllib/examples/cartpole_lstm.py b/rllib/examples/cartpole_lstm.py index 33f827d76..b1672cfd8 100644 --- a/rllib/examples/cartpole_lstm.py +++ b/rllib/examples/cartpole_lstm.py @@ -81,14 +81,14 @@ if __name__ == "__main__": "episode_reward_mean": args.stop_reward, } - # To run the Trainer 
without tune.run, using our LSTM model and + # To run the Algorithm without tune.run, using our LSTM model and # manual state-in handling, do the following: # Example (use `config` from the above code): # >> import numpy as np # >> from ray.rllib.algorithms.ppo import PPO # >> - # >> trainer = PPO(config) + # >> algo = PPO(config) # >> lstm_cell_size = config["model"]["lstm_cell_size"] # >> env = StatelessCartPole() # >> obs = env.reset() @@ -101,7 +101,7 @@ if __name__ == "__main__": # >> prev_r = 0.0 # >> # >> while True: - # >> a, state_out, _ = trainer.compute_single_action( + # >> a, state_out, _ = algo.compute_single_action( # .. obs, state, prev_a, prev_r) # >> obs, reward, done, _ = env.step(a) # >> if done: diff --git a/rllib/examples/eager_execution.py b/rllib/examples/eager_execution.py index a570578ff..d716209ee 100644 --- a/rllib/examples/eager_execution.py +++ b/rllib/examples/eager_execution.py @@ -92,8 +92,8 @@ MyTFPolicy = build_tf_policy( ) -# Create a new Trainer using the Policy defined above. -class MyTrainer(Algorithm): +# Create a new Algorithm using the Policy defined above. +class MyAlgo(Algorithm): def get_default_policy_class(self, config): return MyTFPolicy @@ -117,7 +117,7 @@ if __name__ == "__main__": "episode_reward_mean": args.stop_reward, } - results = tune.run(MyTrainer, stop=stop, config=config, verbose=1) + results = tune.run(MyAlgo, stop=stop, config=config, verbose=1) if args.as_test: check_learning_achieved(results, args.stop_reward) diff --git a/rllib/examples/offline_rl.py b/rllib/examples/offline_rl.py index cd955923d..6782ab06b 100644 --- a/rllib/examples/offline_rl.py +++ b/rllib/examples/offline_rl.py @@ -83,11 +83,11 @@ if __name__ == "__main__": min_reward = -300 # Test for torch framework (tf not implemented yet). - trainer = cql.CQL(config=config) + algo = cql.CQL(config=config) learnt = False for i in range(num_iterations): print(f"Iter {i}") - eval_results = trainer.train().get("evaluation") + eval_results = algo.train().get("evaluation") if eval_results: print("... R={}".format(eval_results["episode_reward_mean"])) # Learn until some reward is reached on an actual live env. @@ -101,7 +101,7 @@ if __name__ == "__main__": ) # Get policy, model, and replay-buffer. - pol = trainer.get_policy() + pol = algo.get_policy() cql_model = pol.model from ray.rllib.algorithms.cql.cql import replay_buffer @@ -116,7 +116,7 @@ if __name__ == "__main__": final_q_values = torch.min(q_values, twin_q_values) print(final_q_values) - # Example on how to do evaluation on the trained Trainer + # Example on how to do evaluation on the trained Algorithm. # using the data from our buffer. # Get a sample (MultiAgentBatch). multi_agent_batch = replay_buffer.sample(num_items=config["train_batch_size"]) @@ -128,11 +128,10 @@ if __name__ == "__main__": model_out, _ = cql_model({"obs": obs}) # The estimated Q-values from the (historic) actions in the batch. q_values_old = cql_model.get_q_values(model_out, torch.from_numpy(batch["actions"])) - # The estimated Q-values for the new actions computed - # by our trainer policy. + # The estimated Q-values for the new actions computed by our policy. 
    actions_new = pol.compute_actions_from_input_dict({"obs": obs})[0]
     q_values_new = cql_model.get_q_values(model_out, torch.from_numpy(actions_new))
 
     print(f"Q-val batch={q_values_old}")
     print(f"Q-val policy={q_values_new}")
 
-    trainer.stop()
+    algo.stop()
diff --git a/rllib/examples/random_parametric_agent.py b/rllib/examples/random_parametric_agent.py
index 8a17402ed..2082e7423 100644
--- a/rllib/examples/random_parametric_agent.py
+++ b/rllib/examples/random_parametric_agent.py
@@ -58,10 +58,10 @@ class RandomParametricPolicy(Policy, ABC):
         pass
 
 
-class RandomParametricTrainer(Algorithm):
-    """Algo with Policy and config defined above and overriding `training_iteration`.
+class RandomParametricAlgorithm(Algorithm):
+    """Algo with Policy and config defined above and overriding `training_step`.
 
-    Overrides the `training_iteration` method, which only runs a (dummy)
+    Overrides the `training_step` method, which only runs a (dummy)
     rollout and performs no learning.
     """
 
@@ -79,7 +79,7 @@ class RandomParametricPolicy(Policy, ABC):
 
 def main():
     register_env("pa_cartpole", lambda _: ParametricActionsCartPole(10))
-    algo = RandomParametricTrainer(env="pa_cartpole")
+    algo = RandomParametricAlgorithm(env="pa_cartpole")
     result = algo.train()
     assert result["episode_reward_mean"] > 10, result
     print("Test: OK")
diff --git a/rllib/examples/remote_envs_with_inference_done_on_main_node.py b/rllib/examples/remote_envs_with_inference_done_on_main_node.py
index f1d8b82e5..2cae5891b 100644
--- a/rllib/examples/remote_envs_with_inference_done_on_main_node.py
+++ b/rllib/examples/remote_envs_with_inference_done_on_main_node.py
@@ -75,10 +75,10 @@ def get_cli_args():
     return args
 
 
-# The modified Trainer class we will use. This is the exact same
-# as a PPO, but with the additional default_resource_request
-# override, telling tune that it's ok (not mandatory) to place our
-# n remote envs on a different node (each env using 1 CPU).
+# The modified Algorithm class we will use:
+# Subclassing from PPO, our algo will only modify `default_resource_request`,
+# telling Ray Tune that it's ok (not mandatory) to place our n remote envs on a
+# different node (each env using 1 CPU).
 class PPORemoteInference(PPO):
     @classmethod
     @override(Algorithm)
@@ -145,7 +145,7 @@ if __name__ == "__main__":
         ):
             break
 
-    # Run with Tune for auto env and trainer creation and TensorBoard.
+    # Run with Tune for auto env and algorithm creation and TensorBoard.
     else:
         stop = {
             "training_iteration": args.stop_iters,
diff --git a/rllib/examples/two_trainer_workflow.py b/rllib/examples/two_trainer_workflow.py
index ba96715a4..bdc54fbcc 100644
--- a/rllib/examples/two_trainer_workflow.py
+++ b/rllib/examples/two_trainer_workflow.py
@@ -64,12 +64,12 @@ parser.add_argument(
 )
 
 
-# Define new Trainer with custom execution_plan/workflow.
-class MyTrainer(Algorithm):
+# Define new Algorithm with custom execution_plan/workflow.
+class MyAlgo(Algorithm):
     @classmethod
     @override(Algorithm)
     def get_default_config(cls) -> AlgorithmConfigDict:
-        # Run this Trainer with new `training_iteration` API and set some PPO-specific
+        # Run this Algorithm with new `training_step` API and set some PPO-specific
         # parameters.
return with_common_config( { @@ -218,7 +218,7 @@ if __name__ == "__main__": "episode_reward_mean": args.stop_reward, } - results = tune.run(MyTrainer, config=config, stop=stop) + results = tune.run(MyAlgo, config=config, stop=stop) if args.as_test: check_learning_achieved(results, args.stop_reward) diff --git a/rllib/examples/vizdoom_with_attention_net.py b/rllib/examples/vizdoom_with_attention_net.py index c7e640bac..774e530a1 100644 --- a/rllib/examples/vizdoom_with_attention_net.py +++ b/rllib/examples/vizdoom_with_attention_net.py @@ -17,7 +17,7 @@ parser.add_argument( type=str, default=None, help="Full path to a checkpoint file for restoring a previously saved " - "Trainer state.", + "Algorithm state.", ) parser.add_argument("--num-workers", type=int, default=0) parser.add_argument( diff --git a/rllib/execution/metric_ops.py b/rllib/execution/metric_ops.py index c6187ecaa..a709fccbe 100644 --- a/rllib/execution/metric_ops.py +++ b/rllib/execution/metric_ops.py @@ -27,7 +27,7 @@ def StandardMetricsReporting( train_op: Operator for executing training steps. We ignore the output values. workers: Rollout workers to collect metrics from. - config: Trainer configuration, used to determine the frequency + config: Algorithm configuration, used to determine the frequency of stats reporting. selected_workers: Override the list of remote workers to collect metrics from. diff --git a/rllib/offline/estimators/tests/test_ope.py b/rllib/offline/estimators/tests/test_ope.py index c7e567443..350df560e 100644 --- a/rllib/offline/estimators/tests/test_ope.py +++ b/rllib/offline/estimators/tests/test_ope.py @@ -51,7 +51,7 @@ class TestOPE(unittest.TestCase): .framework("torch") .rollouts(batch_mode="complete_episodes") ) - cls.trainer = config.build() + cls.algo = config.build() # Train DQN for evaluation policy tune.run( @@ -80,7 +80,7 @@ class TestOPE(unittest.TestCase): done = False rewards = [] while not done: - act = cls.trainer.compute_single_action(obs) + act = cls.algo.compute_single_action(obs) obs, reward, done, _ = env.step(act) rewards.append(reward) ret = 0 @@ -105,7 +105,7 @@ class TestOPE(unittest.TestCase): name = "is" estimator = ImportanceSampling( name=name, - policy=self.trainer.get_policy(), + policy=self.algo.get_policy(), gamma=self.gamma, ) estimator.process(self.batch) @@ -118,7 +118,7 @@ class TestOPE(unittest.TestCase): name = "wis" estimator = WeightedImportanceSampling( name=name, - policy=self.trainer.get_policy(), + policy=self.algo.get_policy(), gamma=self.gamma, ) estimator.process(self.batch) @@ -131,7 +131,7 @@ class TestOPE(unittest.TestCase): name = "dm_qreg" estimator = DirectMethod( name=name, - policy=self.trainer.get_policy(), + policy=self.algo.get_policy(), gamma=self.gamma, q_model_type="qreg", **self.model_config, @@ -146,7 +146,7 @@ class TestOPE(unittest.TestCase): name = "dm_fqe" estimator = DirectMethod( name=name, - policy=self.trainer.get_policy(), + policy=self.algo.get_policy(), gamma=self.gamma, q_model_type="fqe", **self.model_config, @@ -161,7 +161,7 @@ class TestOPE(unittest.TestCase): name = "dr_qreg" estimator = DoublyRobust( name=name, - policy=self.trainer.get_policy(), + policy=self.algo.get_policy(), gamma=self.gamma, q_model_type="qreg", **self.model_config, @@ -176,7 +176,7 @@ class TestOPE(unittest.TestCase): name = "dr_fqe" estimator = DoublyRobust( name=name, - policy=self.trainer.get_policy(), + policy=self.algo.get_policy(), gamma=self.gamma, q_model_type="fqe", **self.model_config, @@ -187,7 +187,7 @@ class TestOPE(unittest.TestCase): 
self.mean_ret[name] = np.mean([e.metrics["v_new"] for e in estimates]) self.std_ret[name] = np.std([e.metrics["v_new"] for e in estimates]) - def test_ope_in_trainer(self): + def test_ope_in_algo(self): # TODO (rohan): Add performance tests for off_policy_estimation_methods, # with fixed seeds and hyperparameters pass diff --git a/rllib/policy/eager_tf_policy.py b/rllib/policy/eager_tf_policy.py index d871a6849..819354c9d 100644 --- a/rllib/policy/eager_tf_policy.py +++ b/rllib/policy/eager_tf_policy.py @@ -294,7 +294,7 @@ def _build_eager_tf_policy( much simpler, but has lower performance. You shouldn't need to call this directly. Rather, prefer to build a TF - graph policy and use set {"framework": "tfe"} in the trainer config to have + graph policy and use set {"framework": "tfe"} in the Algorithm's config to have it automatically be converted to an eager policy. This has the same signature as build_tf_policy().""" diff --git a/rllib/policy/torch_mixins.py b/rllib/policy/torch_mixins.py index 4d7641856..221a146c9 100644 --- a/rllib/policy/torch_mixins.py +++ b/rllib/policy/torch_mixins.py @@ -78,7 +78,7 @@ class EntropyCoeffSchedule: class KLCoeffMixin: """Assigns the `update_kl()` method to a TorchPolicy. - This is used by Trainers to update the KL coefficient + This is used by Algorithms to update the KL coefficient after each learning step based on `config.kl_target` and the measured KL value (from the train_batch). """ diff --git a/rllib/tests/test_dependency_torch.py b/rllib/tests/test_dependency_torch.py index 19e8d3e0d..9fde8698e 100755 --- a/rllib/tests/test_dependency_torch.py +++ b/rllib/tests/test_dependency_torch.py @@ -7,7 +7,7 @@ if __name__ == "__main__": # Do not import torch for testing purposes. os.environ["RLLIB_TEST_NO_TORCH_IMPORT"] = "1" - # Test registering (includes importing) all Trainers. + # Test registering (includes importing) all Algorithms. from ray.rllib import _register_all # This should surface any dependency on torch, e.g. inside function @@ -19,7 +19,7 @@ if __name__ == "__main__": assert "torch" not in sys.modules, "`torch` initially present, when it shouldn't!" # Note: No ray.init(), to test it works without Ray - trainer = A2C( + algo = A2C( env="CartPole-v0", config={ "framework": "tf", @@ -31,7 +31,7 @@ if __name__ == "__main__": }, }, ) - trainer.train() + algo.train() assert ( "torch" not in sys.modules diff --git a/rllib/tests/test_placement_groups.py b/rllib/tests/test_placement_groups.py index b4d3b57d5..8522aa199 100644 --- a/rllib/tests/test_placement_groups.py +++ b/rllib/tests/test_placement_groups.py @@ -57,10 +57,10 @@ class TestPlacementGroups(unittest.TestCase): config["env"] = "CartPole-v0" config["framework"] = "tf" - # Create a trainer with an overridden default_resource_request + # Create an Algorithm with an overridden default_resource_request # method that returns a PlacementGroupFactory. 
- class MyTrainer(PG): + class MyAlgo(PG): @classmethod def default_resource_request(cls, config): head_bundle = {"CPU": 1, "GPU": 0} @@ -70,7 +70,7 @@ class TestPlacementGroups(unittest.TestCase): strategy=config["placement_strategy"], ) - tune.register_trainable("my_trainable", MyTrainer) + tune.register_trainable("my_trainable", MyAlgo) global trial_executor trial_executor = RayTrialExecutor(reuse_actors=False) diff --git a/rllib/tests/test_timesteps.py b/rllib/tests/test_timesteps.py index e1f6590bb..307c5981d 100644 --- a/rllib/tests/test_timesteps.py +++ b/rllib/tests/test_timesteps.py @@ -27,11 +27,11 @@ class TestTimeSteps(unittest.TestCase): obs_batch = np.array([1]) for _ in framework_iterator(config): - trainer = pg.PG(config=config, env=RandomEnv) - policy = trainer.get_policy() + algo = pg.PG(config=config, env=RandomEnv) + policy = algo.get_policy() for i in range(1, 21): - trainer.compute_single_action(obs) + algo.compute_single_action(obs) check(policy.global_timestep, i) for i in range(1, 21): policy.compute_actions(obs_batch) @@ -45,7 +45,8 @@ class TestTimeSteps(unittest.TestCase): for i in range(1, 11): policy.compute_actions(obs_batch) check(policy.global_timestep, i + crazy_timesteps) - trainer.train() + algo.train() + algo.stop() if __name__ == "__main__": diff --git a/rllib/utils/annotations.py b/rllib/utils/annotations.py index 6f31093e7..59957b6bd 100644 --- a/rllib/utils/annotations.py +++ b/rllib/utils/annotations.py @@ -36,18 +36,18 @@ def PublicAPI(obj): can expect these APIs to remain stable across RLlib releases. Subclasses that inherit from a ``@PublicAPI`` base class can be - assumed part of the RLlib public API as well (e.g., all trainer classes - are in public API because Trainer is ``@PublicAPI``). + assumed part of the RLlib public API as well (e.g., all Algorithm classes + are in public API because Algorithm is ``@PublicAPI``). - In addition, you can assume all trainer configurations are part of their + In addition, you can assume all algo configurations are part of their public API as well. Examples: - >>> # Indicates that the `Trainer` class is exposed to end users + >>> # Indicates that the `Algorithm` class is exposed to end users >>> # of RLlib and will remain stable across RLlib releases. >>> from ray import tune >>> @PublicAPI # doctest: +SKIP - >>> class Trainer(tune.Trainable): # doctest: +SKIP + >>> class Algorithm(tune.Trainable): # doctest: +SKIP ... ... # doctest: +SKIP """ @@ -110,7 +110,7 @@ def ExperimentalAPI(obj): def OverrideToImplementCustomLogic(obj): """Users should override this in their sub-classes to implement custom logic. - Used in Trainer and Policy to tag methods that need overriding, e.g. + Used in Algorithm and Policy to tag methods that need overriding, e.g. `Policy.loss()`. Examples: @@ -132,9 +132,9 @@ def OverrideToImplementCustomLogic_CallToSuperRecommended(obj): Thereby, it is recommended (but not required) to call the super-class' corresponding method. - Used in Trainer and Policy to tag methods that need overriding, but the + Used in Algorithm and Policy to tag methods that need overriding, but the super class' method should still be called, e.g. - `Trainer.setup()`. + `Algorithm.setup()`. 
Examples: >>> from ray import tune diff --git a/rllib/utils/debug/memory.py b/rllib/utils/debug/memory.py index 5d0eeeead..0f5a9e82a 100644 --- a/rllib/utils/debug/memory.py +++ b/rllib/utils/debug/memory.py @@ -36,7 +36,7 @@ Suspect = DeveloperAPI( @DeveloperAPI def check_memory_leaks( - trainer, + algorithm, to_check: Optional[Set[str]] = None, repeats: Optional[int] = None, max_num_trials: int = 3, @@ -49,7 +49,7 @@ def check_memory_leaks( un-GC'd items to memory. Args: - trainer: The Algorithm instance to test. + algorithm: The Algorithm instance to test. to_check: Set of strings to indentify components to test. Allowed strings are: "env", "policy", "model", "rollout_worker". By default, check all of these. @@ -62,7 +62,7 @@ def check_memory_leaks( A defaultdict(list) with keys being the `to_check` strings and values being lists of Suspect instances that were found. """ - local_worker = trainer.workers.local_worker() + local_worker = algorithm.workers.local_worker() # Which components should we test? to_check = to_check or {"env", "model", "policy", "rollout_worker"} diff --git a/rllib/utils/metrics/__init__.py b/rllib/utils/metrics/__init__.py index 080bc6b14..8960e960e 100644 --- a/rllib/utils/metrics/__init__.py +++ b/rllib/utils/metrics/__init__.py @@ -12,7 +12,7 @@ NUM_AGENT_STEPS_TRAINED_THIS_ITER = "num_agent_steps_trained_this_iter" LAST_TARGET_UPDATE_TS = "last_target_update_ts" NUM_TARGET_UPDATES = "num_target_updates" -# Performance timers (keys for Trainer._timers or metrics.timers). +# Performance timers (keys for Algorithm._timers or metrics.timers). TRAINING_ITERATION_TIMER = "training_iteration" APPLY_GRADS_TIMER = "apply_grad" COMPUTE_GRADS_TIMER = "compute_grads"
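
For reference, the usage pattern these diffs converge on is the one shown in the updated `SACConfig` docstring above: build an `Algorithm` from an `AlgorithmConfig`, call `train()`, then `stop()`. Below is a minimal runnable sketch of that pattern; the hyperparameters are illustrative only and taken from the docstring example.

```python
# Minimal sketch of the post-rename usage pattern (config.build() -> Algorithm),
# based on the SACConfig docstring example above; hyperparameters are illustrative.
import ray
from ray.rllib.algorithms.sac import SACConfig

ray.init()

config = (
    SACConfig()
    .training(gamma=0.9, lr=0.01)
    .resources(num_gpus=0)
    .rollouts(num_rollout_workers=4)
)

# `config.build()` now returns an `Algorithm` (formerly `Trainer`) instance.
algo = config.build(env="CartPole-v1")
print(algo.train())  # Run one training iteration and print the result dict.
algo.stop()

ray.shutdown()
```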