ray/rllib/algorithms/ddpg/apex.py
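
"""Ape-X variant of DDPG (distributed prioritized experience replay).

Layers the Ape-X execution logic from ApexTrainer (rllib/agents/dqn/apex.py) on
top of DDPGTrainer, providing the ApexDDPGTrainer class and its default config.
"""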

from typing import List
from ray.actor import ActorHandle
from ray.rllib.agents import Trainer
from ray.rllib.agents.dqn.apex import ApexTrainer
from ray.rllib.algorithms.ddpg.ddpg import DDPGConfig, DDPGTrainer
from ray.rllib.evaluation.worker_set import WorkerSet
from ray.rllib.utils.annotations import override
from ray.rllib.utils.deprecation import DEPRECATED_VALUE
from ray.rllib.utils.typing import (
    PartialTrainerConfigDict,
    ResultDict,
    TrainerConfigDict,
)
from ray.util.iter import LocalIterator

APEX_DDPG_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs(
    DDPGConfig().to_dict(),  # see the options in ddpg.py; they are also supported
    {
        "optimizer": {
            "max_weight_sync_delay": 400,
            "num_replay_buffer_shards": 4,
            "debug": False,
        },
        "exploration_config": {"type": "PerWorkerOrnsteinUhlenbeckNoise"},
        "n_step": 3,
        "num_gpus": 0,
        "num_workers": 32,
        "replay_buffer_config": {
            "capacity": 2000000,
            "no_local_replay_buffer": True,
            # Specify prioritized replay by supplying a buffer type that supports
            # prioritization, for example: MultiAgentPrioritizedReplayBuffer
            # (see the sketch below this config dict).
            "prioritized_replay": DEPRECATED_VALUE,
            "learning_starts": 50000,
            # Whether all shards of the replay buffer must be co-located
            # with the learner process (running the execution plan).
            # This is preferred b/c the learner process should have quick
            # access to the data from the buffer shards, avoiding network
            # traffic each time samples from the buffer(s) are drawn.
            # Set this to False to relax this constraint and allow replay
            # shards to be created on node(s) other than the one on which
            # the learner is located.
            "replay_buffer_shards_colocated_with_driver": True,
            "worker_side_prioritization": True,
        },
        "train_batch_size": 512,
        "rollout_fragment_length": 50,
        # Update the target network every `target_network_update_freq` sample timesteps.
        "target_network_update_freq": 500000,
        "min_sample_timesteps_per_reporting": 25000,
        "min_time_s_per_reporting": 30,
        "training_intensity": 1,
        # Max number of in-flight requests to each sampling worker; see the
        # AsyncRequestsManager class for more details.
        # Tuning these values is important when running experiments with large
        # sample batches: if the sample batches are large, the object store may
        # fill up and start spilling objects to disk, which can make any
        # asynchronous requests, and thus your experiment, very slow. You can
        # inspect the object store during your experiment by running `ray memory`
        # on your head node, and by using the Ray dashboard. If the object store
        # is filling up, turn down the number of remote requests in flight, or
        # enable compression in your experiment.
        "max_requests_in_flight_per_sampler_worker": 2,
        "max_requests_in_flight_per_replay_worker": float("inf"),
        "timeout_s_sampler_manager": 0.0,
        "timeout_s_replay_manager": 0.0,
    },
    _allow_unknown_configs=True,
)
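
# A minimal override sketch (illustrative, not part of the original defaults):
# per-key overrides passed to the trainer are merged on top of
# APEX_DDPG_DEFAULT_CONFIG, so prioritized replay can be selected by supplying
# a buffer type that supports it, e.g.:
#
#     config = {
#         "num_workers": 8,
#         "replay_buffer_config": {
#             "type": "MultiAgentPrioritizedReplayBuffer",
#             "capacity": 1000000,
#         },
#     }
#     trainer = ApexDDPGTrainer(env="Pendulum-v1", config=config)
#
# The environment name and the chosen values above are assumptions for
# illustration only.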


class ApexDDPGTrainer(DDPGTrainer, ApexTrainer):
    @classmethod
    @override(DDPGTrainer)
    def get_default_config(cls) -> TrainerConfigDict:
        return APEX_DDPG_DEFAULT_CONFIG

    @override(DDPGTrainer)
    def setup(self, config: PartialTrainerConfigDict):
        return ApexTrainer.setup(self, config)

    @override(DDPGTrainer)
    def training_iteration(self) -> ResultDict:
        """Use APEX-DQN's training iteration function."""
        return ApexTrainer.training_iteration(self)

    @override(Trainer)
    def on_worker_failures(
        self, removed_workers: List[ActorHandle], new_workers: List[ActorHandle]
    ):
        """Handle the failures of remote sampling workers.

        Args:
            removed_workers: Removed worker ids.
            new_workers: Ids of newly created workers.
        """
        self._sampling_actor_manager.remove_workers(removed_workers)
        self._sampling_actor_manager.add_workers(new_workers)

    @staticmethod
    @override(DDPGTrainer)
    def execution_plan(
        workers: WorkerSet, config: dict, **kwargs
    ) -> LocalIterator[dict]:
        """Use APEX-DQN's execution plan."""
        return ApexTrainer.execution_plan(workers, config, **kwargs)
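

# Hedged usage sketch (not part of the original module): run this file directly
# to smoke-test ApexDDPGTrainer on a small continuous-control task. The
# environment name ("Pendulum-v1"), worker count, and iteration count below are
# illustrative assumptions, not values taken from the original file.
if __name__ == "__main__":
    import ray

    ray.init()
    trainer = ApexDDPGTrainer(
        env="Pendulum-v1",
        config={
            # Scale down from the 32-worker default for a single-machine run.
            "num_workers": 4,
            "replay_buffer_config": {"learning_starts": 1000},
        },
    )
    for i in range(3):
        result = trainer.train()
        print(f"iter {i}: episode_reward_mean={result.get('episode_reward_mean')}")
    trainer.stop()
    ray.shutdown()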