2020-05-21 10:16:18 -07:00
|
|
|
import copy
|
2021-06-08 16:27:02 +02:00
|
|
|
from six.moves import queue
|
2021-02-08 12:05:16 +01:00
|
|
|
import threading
|
2021-07-20 14:58:13 -04:00
|
|
|
from typing import Dict, Optional
|
2020-05-21 10:16:18 -07:00
|
|
|
|
2022-06-23 21:30:01 +02:00
|
|
|
from ray.util.timer import _Timer
|
2021-06-01 15:40:28 +01:00
|
|
|
from ray.rllib.evaluation.rollout_worker import RolloutWorker
|
2022-05-17 13:43:49 +02:00
|
|
|
from ray.rllib.execution.minibatch_buffer import MinibatchBuffer
|
2020-07-11 22:06:35 +02:00
|
|
|
from ray.rllib.utils.framework import try_import_tf
|
2022-01-29 18:41:57 -08:00
|
|
|
from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder, LEARNER_INFO
|
2021-11-23 23:01:05 +01:00
|
|
|
from ray.rllib.utils.metrics.window_stat import WindowStat
|
2021-06-01 15:40:28 +01:00
|
|
|
from ray.util.iter import _NextValueNotReady
|
2020-05-21 10:16:18 -07:00
|
|
|
|
# Resolve TensorFlow handles once at module import: the tf1-compat module,
# the base tf module, and the detected version. NOTE(review): presumably
# try_import_tf() degrades gracefully (returns None handles) when TF is not
# installed — confirm against ray.rllib.utils.framework.
tf1, tf, tfv = try_import_tf()
|
2020-05-21 10:16:18 -07:00
|
|
|
|
|
|
|
class LearnerThread(threading.Thread):
    """Background thread that updates the local model from sample trajectories.

    The learner thread communicates with the main thread through Queues. This
    is needed since Ray operations can only be run on the main thread. In
    addition, moving heavyweight gradient ops session runs off the main thread
    improves overall throughput.
    """

    def __init__(
        self,
        local_worker: RolloutWorker,
        minibatch_buffer_size: int,
        num_sgd_iter: int,
        learner_queue_size: int,
        learner_queue_timeout: int,
    ):
        """Initialize the learner thread.

        Args:
            local_worker: process local rollout worker holding
                policies this thread will call learn_on_batch() on
            minibatch_buffer_size: max number of train batches to store
                in the minibatching buffer
            num_sgd_iter: number of passes to learn on per train batch
            learner_queue_size: max size of queue of inbound
                train batches to this thread
            learner_queue_timeout: raise an exception if the queue has
                been empty for this long in seconds
        """
        threading.Thread.__init__(self)
        # Rolling window over recent inqueue sizes, reported in metrics.
        self.learner_queue_size = WindowStat("size", 50)
        self.local_worker = local_worker
        # Inbound train batches from the main thread; bounded so producers
        # block (backpressure) instead of growing memory without limit.
        self.inqueue = queue.Queue(maxsize=learner_queue_size)
        # Outbound (env-steps, agent-steps, learner-info) tuples to the
        # main thread; unbounded.
        self.outqueue = queue.Queue()
        # Buffer that re-serves each inbound batch `num_sgd_iter` times.
        self.minibatch_buffer = MinibatchBuffer(
            inqueue=self.inqueue,
            size=minibatch_buffer_size,
            timeout=learner_queue_timeout,
            num_passes=num_sgd_iter,
            init_num_passes=num_sgd_iter,
        )
        # Timers for the metrics reported via `add_learner_metrics()`.
        self.queue_timer = _Timer()
        self.grad_timer = _Timer()
        self.load_timer = _Timer()
        self.load_wait_timer = _Timer()
        # Daemon thread: don't keep the process alive on shutdown.
        self.daemon = True
        # Set True after each successful update so the main thread knows to
        # broadcast fresh weights.
        self.weights_updated = False
        # Results dict of the most recent `learn_on_batch()` call(s).
        self.learner_info = {}
        self.stopped = False
        self.num_steps = 0

    def run(self) -> None:
        """Run `step()` in a loop until `self.stopped` is set."""
        # Switch on eager mode if configured.
        if self.local_worker.policy_config.get("framework") in ["tf2", "tfe"]:
            tf1.enable_eager_execution()
        while not self.stopped:
            self.step()

    def step(self) -> Optional[_NextValueNotReady]:
        """Perform one learning update on the next available train batch.

        Returns:
            `_NextValueNotReady` if no batch arrived within the queue
            timeout, else None (implicitly) after a completed update.
        """
        with self.queue_timer:
            try:
                batch, _ = self.minibatch_buffer.get()
            except queue.Empty:
                # No batch available yet; signal the caller to retry.
                return _NextValueNotReady()

        with self.grad_timer:
            # Use LearnerInfoBuilder as a unified way to build the final
            # results dict from `learn_on_loaded_batch` call(s).
            # This makes sure results dicts always have the same structure
            # no matter the setup (multi-GPU, multi-agent, minibatch SGD,
            # tf vs torch).
            learner_info_builder = LearnerInfoBuilder(num_devices=1)
            multi_agent_results = self.local_worker.learn_on_batch(batch)
            for pid, results in multi_agent_results.items():
                learner_info_builder.add_learn_on_batch_results(results, pid)
            self.learner_info = learner_info_builder.finalize()
            self.weights_updated = True

        self.num_steps += 1

        # Put tuple: env-steps, agent-steps, and learner info into the queue.
        self.outqueue.put((batch.count, batch.agent_steps(), self.learner_info))
        self.learner_queue_size.push(self.inqueue.qsize())

    def add_learner_metrics(self, result: Dict, overwrite_learner_info=True) -> Dict:
        """Add internal metrics to a result dict.

        Args:
            result: The results dict to mutate (its "info" sub-dict is
                updated in place).
            overwrite_learner_info: If True, also write a deep copy of the
                latest learner info under the LEARNER_INFO key.

        Returns:
            The same (mutated) `result` dict, for chaining.
        """

        def timer_to_ms(timer):
            # Mean timer value in milliseconds, rounded for readability.
            return round(1000 * timer.mean, 3)

        # Build the metrics dict once; previously this entire structure was
        # duplicated across both branches of `overwrite_learner_info`.
        info = {
            "learner_queue": self.learner_queue_size.stats(),
            "timing_breakdown": {
                "learner_grad_time_ms": timer_to_ms(self.grad_timer),
                "learner_load_time_ms": timer_to_ms(self.load_timer),
                "learner_load_wait_time_ms": timer_to_ms(self.load_wait_timer),
                "learner_dequeue_time_ms": timer_to_ms(self.queue_timer),
            },
        }
        if overwrite_learner_info:
            # Deep-copy so later in-thread updates don't mutate the snapshot
            # handed to the caller.
            info[LEARNER_INFO] = copy.deepcopy(self.learner_info)
        result["info"].update(info)
        return result