"""Utils for minibatch SGD across multiple RLlib policies."""

import logging
import random

import numpy as np

from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch
from ray.rllib.utils.annotations import DeveloperAPI
from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder

logger = logging.getLogger(__name__)

@DeveloperAPI
def standardized(array: np.ndarray):
    """Shift *array* to zero mean and scale it to unit standard deviation.

    Args:
        array (np.ndarray): Array of values to normalize.

    Returns:
        The normalized array. The divisor is clamped at 1e-4, so an
        all-constant input yields zeros rather than a division by zero.
    """
    centered = array - array.mean()
    scale = max(1e-4, array.std())
    return centered / scale


@DeveloperAPI
def minibatches(samples: SampleBatch, sgd_minibatch_size: int, shuffle: bool = True):
    """Return a generator yielding minibatches from a sample batch.

    Args:
        samples: SampleBatch to split up.
        sgd_minibatch_size: Size of minibatches to return.
        shuffle: Whether to shuffle the order of the generated minibatches.
            Note that in case of a non-recurrent policy, the incoming batch
            is globally shuffled first regardless of this setting, before
            the minibatches are generated from it!

    Yields:
        SampleBatch: Each of size `sgd_minibatch_size`.
    """
    # A falsy minibatch size means: use the whole batch as one minibatch.
    if not sgd_minibatch_size:
        yield samples
        return

    if isinstance(samples, MultiAgentBatch):
        raise NotImplementedError(
            "Minibatching not implemented for multi-agent in simple mode"
        )

    # Non-recurrent case: globally shuffle all rows up-front.
    if "state_in_0" not in samples and "state_out_0" not in samples:
        samples.shuffle()

    data_slices, state_slices = samples._get_slice_indices(sgd_minibatch_size)

    if state_slices:
        # Recurrent case: shuffle data- and state-slices while keeping
        # them linked together.
        linked_slices = list(zip(data_slices, state_slices))
        if shuffle:
            random.shuffle(linked_slices)
        for (start, stop), (s_start, s_stop) in linked_slices:
            yield samples.slice(start, stop, s_start, s_stop)
    else:
        if shuffle:
            random.shuffle(data_slices)
        for start, stop in data_slices:
            yield samples.slice(start, stop)


@DeveloperAPI
def do_minibatch_sgd(
    samples,
    policies,
    local_worker,
    num_sgd_iter,
    sgd_minibatch_size,
    standardize_fields,
):
    """Execute minibatch SGD.

    Args:
        samples (SampleBatch): Batch of samples to optimize.
        policies (dict): Dictionary of policies to optimize.
        local_worker (RolloutWorker): Master rollout worker instance.
        num_sgd_iter (int): Number of epochs of optimization to take.
        sgd_minibatch_size (int): Size of minibatches to use for optimization.
        standardize_fields (list): List of sample field names that should be
            normalized prior to optimization.

    Returns:
        averaged info fetches over the last SGD epoch taken.

    Raises:
        ValueError: If a recurrent policy's `max_seq_len` exceeds
            `sgd_minibatch_size` (slicing would break up sequences).
    """
    # Handle everything as if multi-agent.
    samples = samples.as_multi_agent()

    # Use LearnerInfoBuilder as a unified way to build the final
    # results dict from `learn_on_loaded_batch` call(s).
    # This makes sure results dicts always have the same structure
    # no matter the setup (multi-GPU, multi-agent, minibatch SGD,
    # tf vs torch).
    learner_info_builder = LearnerInfoBuilder(num_devices=1)
    for policy_id, policy in policies.items():
        # Skip policies that received no samples this round.
        if policy_id not in samples.policy_batches:
            continue

        batch = samples.policy_batches[policy_id]
        for field in standardize_fields:
            batch[field] = standardized(batch[field])

        # Check to make sure that the sgd_minibatch_size is not smaller
        # than max_seq_len otherwise this will cause indexing errors while
        # performing sgd when using a RNN or Attention model
        if (
            policy.is_recurrent()
            and policy.config["model"]["max_seq_len"] > sgd_minibatch_size
        ):
            # Bug fix: the two concatenated literals previously lacked a
            # separating space, producing "than`max_seq_len`" in the message.
            raise ValueError(
                "`sgd_minibatch_size` ({}) cannot be smaller than "
                "`max_seq_len` ({}).".format(
                    sgd_minibatch_size, policy.config["model"]["max_seq_len"]
                )
            )

        for _ in range(num_sgd_iter):
            for minibatch in minibatches(batch, sgd_minibatch_size):
                results = local_worker.learn_on_batch(
                    MultiAgentBatch({policy_id: minibatch}, minibatch.count)
                )[policy_id]
                learner_info_builder.add_learn_on_batch_results(results, policy_id)

    learner_info = learner_info_builder.finalize()
    return learner_info
|