ray/rllib/execution/train_ops.py


import logging
import math
from typing import Dict

import numpy as np

from ray.rllib.execution.common import (
    LEARN_ON_BATCH_TIMER,
    LOAD_BATCH_TIMER,
)
from ray.rllib.utils.annotations import DeveloperAPI
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.metrics import NUM_ENV_STEPS_TRAINED, NUM_AGENT_STEPS_TRAINED
from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder
from ray.rllib.utils.sgd import do_minibatch_sgd

tf1, tf, tfv = try_import_tf()

logger = logging.getLogger(__name__)
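
# A minimal wiring sketch (hypothetical: `algo` is some Algorithm instance, and
# the `simple_optimizer` check is just one plausible way to pick a path):
# inside a custom `training_step()`, a sampled batch is handed to one of the
# two train ops defined below.
#
#     train_batch = synchronous_parallel_sample(algo.workers)
#     if algo.config.get("simple_optimizer"):
#         results = train_one_step(algo, train_batch)
#     else:
#         results = multi_gpu_train_one_step(algo, train_batch)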


@DeveloperAPI
def train_one_step(algorithm, train_batch, policies_to_train=None) -> Dict:
    """Function that improves all policies in `train_batch` on the local worker.

    Examples:
        >>> from ray.rllib.execution.rollout_ops import synchronous_parallel_sample
        >>> algo = [...] # doctest: +SKIP
        >>> train_batch = synchronous_parallel_sample(algo.workers) # doctest: +SKIP
        >>> # This trains the policy on one batch.
        >>> results = train_one_step(algo, train_batch) # doctest: +SKIP
        {"default_policy": ...}

    Updates the NUM_ENV_STEPS_TRAINED and NUM_AGENT_STEPS_TRAINED counters as
    well as the LEARN_ON_BATCH_TIMER timer of the `algorithm` object.
    """
    config = algorithm.config
    workers = algorithm.workers
    local_worker = workers.local_worker()
    num_sgd_iter = config.get("num_sgd_iter", 1)
    sgd_minibatch_size = config.get("sgd_minibatch_size", 0)
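
    # Worked example (hypothetical values): with num_sgd_iter=10 and
    # sgd_minibatch_size=128 on a 4000-sample train batch, the
    # `do_minibatch_sgd` branch below runs 10 epochs of ~31 minibatches each;
    # with the defaults above (num_sgd_iter=1, sgd_minibatch_size=0), a single
    # `learn_on_batch` update is performed instead.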

    learn_timer = algorithm._timers[LEARN_ON_BATCH_TIMER]
    with learn_timer:
        # Subsample minibatches (size=`sgd_minibatch_size`) from the
        # train batch and loop through train batch `num_sgd_iter` times.
        if num_sgd_iter > 1 or sgd_minibatch_size > 0:
            info = do_minibatch_sgd(
                train_batch,
                {
                    pid: local_worker.get_policy(pid)
                    for pid in policies_to_train
                    or local_worker.get_policies_to_train(train_batch)
                },
                local_worker,
                num_sgd_iter,
                sgd_minibatch_size,
                [],
            )
        # Single update step using train batch.
        else:
            info = local_worker.learn_on_batch(train_batch)

    learn_timer.push_units_processed(train_batch.count)
    algorithm._counters[NUM_ENV_STEPS_TRAINED] += train_batch.count
    algorithm._counters[NUM_AGENT_STEPS_TRAINED] += train_batch.agent_steps()

    return info


@DeveloperAPI
def multi_gpu_train_one_step(algorithm, train_batch) -> Dict:
    """Multi-GPU version of train_one_step.

    Uses the policies' `load_batch_into_buffer` and `learn_on_loaded_batch`
    methods to be more efficient w.r.t. CPU/GPU data transfers. For example,
    when doing multiple passes through a train batch (e.g. for PPO) via
    `config.num_sgd_iter`, the actual train batch is only split once and
    loaded once onto the GPU(s).

    Examples:
        >>> from ray.rllib.execution.rollout_ops import synchronous_parallel_sample
        >>> algo = [...] # doctest: +SKIP
        >>> train_batch = synchronous_parallel_sample(algo.workers) # doctest: +SKIP
        >>> # This trains the policy on one batch.
        >>> results = multi_gpu_train_one_step(algo, train_batch) # doctest: +SKIP
        {"default_policy": ...}

    Updates the NUM_ENV_STEPS_TRAINED and NUM_AGENT_STEPS_TRAINED counters as
    well as the LOAD_BATCH_TIMER and LEARN_ON_BATCH_TIMER timers of the
    Algorithm instance.
    """
    config = algorithm.config
    workers = algorithm.workers
    local_worker = workers.local_worker()
    num_sgd_iter = config.get("num_sgd_iter", 1)
    sgd_minibatch_size = config.get("sgd_minibatch_size", config["train_batch_size"])

    # Determine the number of devices (GPUs or 1 CPU) we use.
    num_devices = int(math.ceil(config["num_gpus"] or 1))

    # Make sure the total batch size is divisible by the number of devices.
    # Batch size per tower.
    per_device_batch_size = sgd_minibatch_size // num_devices
    # Total batch size.
    batch_size = per_device_batch_size * num_devices
    assert batch_size % num_devices == 0
    assert batch_size >= num_devices, "Batch size too small!"
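
    # Worked example (hypothetical values): sgd_minibatch_size=500 and
    # num_gpus=3 give per_device_batch_size=166 and batch_size=498; the
    # remainder (2 samples per minibatch) is simply never addressed by the
    # offset arithmetic in the SGD loop below, so every tower sees an equally
    # sized shard.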

    # Handle everything as if multi-agent.
    train_batch = train_batch.as_multi_agent()

    # Load data into GPUs.
    load_timer = algorithm._timers[LOAD_BATCH_TIMER]
    with load_timer:
        num_loaded_samples = {}
        for policy_id, batch in train_batch.policy_batches.items():
            # Not a policy-to-train.
            if not local_worker.is_policy_to_train(policy_id, train_batch):
                continue

            # Decompress SampleBatch, in case some columns are compressed.
            batch.decompress_if_needed()

            # Load the entire train batch into the Policy's only buffer
            # (idx=0). Policies only have more than one buffer if we are
            # training asynchronously.
            num_loaded_samples[policy_id] = local_worker.policy_map[
                policy_id
            ].load_batch_into_buffer(batch, buffer_index=0)
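
            # `num_loaded_samples[policy_id]` now holds the per-device sample
            # count returned by `load_batch_into_buffer`; it is consumed as
            # `samples_per_device` in the SGD loop below.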

    # Execute minibatch SGD on loaded data.
    learn_timer = algorithm._timers[LEARN_ON_BATCH_TIMER]
    with learn_timer:
        # Use LearnerInfoBuilder as a unified way to build the final
        # results dict from `learn_on_loaded_batch` call(s).
        # This makes sure results dicts always have the same structure
        # no matter the setup (multi-GPU, multi-agent, minibatch SGD,
        # tf vs torch).
        learner_info_builder = LearnerInfoBuilder(num_devices=num_devices)
        for policy_id, samples_per_device in num_loaded_samples.items():
            policy = local_worker.policy_map[policy_id]
            num_batches = max(1, int(samples_per_device) // int(per_device_batch_size))
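            # Worked example (hypothetical values): samples_per_device=1000
            # and per_device_batch_size=128 give num_batches=7; each of the
            # `num_sgd_iter` epochs below visits those 7 pre-loaded minibatches
            # in a fresh random order via `permutation`.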
            logger.debug("== sgd epochs for {} ==".format(policy_id))
            for _ in range(num_sgd_iter):
                permutation = np.random.permutation(num_batches)
                for batch_index in range(num_batches):
                    # Learn on the pre-loaded data in the buffer.
                    # Note: For minibatch SGD, the data is an offset into
                    # the pre-loaded entire train batch.
                    results = policy.learn_on_loaded_batch(
                        permutation[batch_index] * per_device_batch_size,
                        buffer_index=0,
                    )
                    learner_info_builder.add_learn_on_batch_results(
                        results, policy_id
                    )

        # Tower reduce and finalize results.
        learner_info = learner_info_builder.finalize()

    load_timer.push_units_processed(train_batch.count)
    learn_timer.push_units_processed(train_batch.count)

    # TODO: Move this into Trainer's `training_iteration` method for
    #  better transparency.
    algorithm._counters[NUM_ENV_STEPS_TRAINED] += train_batch.count
    algorithm._counters[NUM_AGENT_STEPS_TRAINED] += train_batch.agent_steps()

    return learner_info