ray/rllib/agents/slateq/slateq.py

"""
SlateQ (Reinforcement Learning for Recommendation)
==================================================

This file defines the trainer class for the SlateQ algorithm from the
`"Reinforcement Learning for Slate-based Recommender Systems: A Tractable
Decomposition and Practical Methodology" <https://arxiv.org/abs/1905.12767>`_
paper.

See `slateq_torch_policy.py` for the definition of the policy. Currently, only
PyTorch is supported. The algorithm is written and tested for Google's RecSim
environment (https://github.com/google-research/recsim).
"""

import logging
from typing import List, Type

from ray.rllib.agents.slateq.slateq_tf_policy import SlateQTFPolicy
from ray.rllib.agents.slateq.slateq_torch_policy import SlateQTorchPolicy
from ray.rllib.agents.trainer import Trainer, with_common_config
from ray.rllib.evaluation.worker_set import WorkerSet
from ray.rllib.execution.concurrency_ops import Concurrently
from ray.rllib.execution.metric_ops import StandardMetricsReporting
from ray.rllib.execution.replay_ops import Replay, StoreToReplayBuffer
from ray.rllib.execution.rollout_ops import ParallelRollouts
from ray.rllib.execution.train_ops import (
    MultiGPUTrainOneStep,
    TrainOneStep,
    UpdateTargetNetwork,
)
from ray.rllib.policy.policy import Policy
from ray.rllib.utils.annotations import override
from ray.rllib.utils.deprecation import DEPRECATED_VALUE
from ray.rllib.utils.typing import TrainerConfigDict
from ray.util.iter import LocalIterator

logger = logging.getLogger(__name__)


# fmt: off
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    # === Model ===
    # Dense-layer setup for each the n (document) candidate Q-network stacks.
    "fcnet_hiddens_per_candidate": [256, 32],

    # === Exploration Settings ===
    "exploration_config": {
        # The Exploration class to use.
        # Must be SlateEpsilonGreedy or SlateSoftQ to handle the problem that
        # the action space of the policy is different from the space used inside
        # the exploration component.
        # E.g.: action_space=MultiDiscrete([5, 5]) <- slate-size=2, num-docs=5,
        # but action distribution is Categorical(5*4) -> all possible unique slates.
        "type": "SlateEpsilonGreedy",
        "warmup_timesteps": 20000,
        "epsilon_timesteps": 250000,
        "final_epsilon": 0.01,
    },
    # Switch to greedy actions in evaluation workers.
    "evaluation_config": {
        "explore": False,
    },

    # Minimum env steps to optimize for per train call. This value does
    # not affect learning, only the length of iterations.
    "timesteps_per_iteration": 1000,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 3200,
    # Update the target by \tau * policy + (1-\tau) * target_policy.
    "tau": 1.0,

    # If True, use huber loss instead of squared loss for critic network
    # Conventionally, no need to clip gradients if using a huber loss
    "use_huber": False,
    # Threshold of the huber loss.
    "huber_threshold": 1.0,

    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
    "buffer_size": DEPRECATED_VALUE,
    "replay_buffer_config": {
        "type": "MultiAgentReplayBuffer",
        "capacity": 100000,
    },
    # The number of contiguous environment steps to replay at once. This may
    # be set to greater than 1 to support recurrent models.
    "replay_sequence_length": 1,
    # Whether to LZ4 compress observations
    "compress_observations": False,
    # If set, this will fix the ratio of replayed from a buffer and learned on
    # timesteps to sampled from an environment and stored in the replay buffer
    # timesteps. Otherwise, the replay will proceed at the native ratio
    # determined by (train_batch_size / rollout_fragment_length).
    "training_intensity": None,

    # === Optimization ===
    # Learning rate for RMSprop optimizer for the q-model.
    "lr": 0.00025,
    # Learning rate schedule.
    # In the format of [[timestep, value], [timestep, value], ...]
    # A schedule should normally start from timestep 0.
    "lr_schedule": None,
    # Learning rate for adam optimizer for the user choice model.
    "lr_choice_model": 1e-3,  # Only relevant for framework=torch.

    # RMSProp epsilon hyper parameter.
    "rmsprop_epsilon": 1e-5,
    # If not None, clip gradients during optimization at this value
    "grad_clip": None,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 20000,
    # Update the replay buffer with this many samples at once. Note that
    # this setting applies per-worker if num_workers > 1.
    "rollout_fragment_length": 4,
    # Size of a batch sampled from replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": 32,

    # === Parallelism ===
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you"re using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent reporting frequency from going lower than this time span.
    "min_time_s_per_reporting": 1,

    # Switch on no-preprocessors for easier Q-model coding.
    "_disable_preprocessor_api": True,
})
# __sphinx_doc_end__
# fmt: on


def calculate_round_robin_weights(config: TrainerConfigDict) -> List[float]:
    """Calculate the round robin weights for the rollout and train steps"""
    if not config["training_intensity"]:
        return [1, 1]
    # e.g., 32 / 4 -> native ratio of 8.0
    native_ratio = config["train_batch_size"] / config["rollout_fragment_length"]
    # Training intensity is specified in terms of
    # (steps_replayed / steps_sampled), so adjust for the native ratio.
    weights = [1, config["training_intensity"] / native_ratio]
    return weights


class SlateQTrainer(Trainer):
    @classmethod
    @override(Trainer)
    def get_default_config(cls) -> TrainerConfigDict:
        return DEFAULT_CONFIG

    @override(Trainer)
    def get_default_policy_class(self, config: TrainerConfigDict) -> Type[Policy]:
        if config["framework"] == "torch":
            return SlateQTorchPolicy
        else:
            return SlateQTFPolicy

    @staticmethod
    @override(Trainer)
    def execution_plan(
        workers: WorkerSet, config: TrainerConfigDict, **kwargs
    ) -> LocalIterator[dict]:
        assert (
            "local_replay_buffer" in kwargs
        ), "SlateQ execution plan requires a local replay buffer."

        rollouts = ParallelRollouts(workers, mode="bulk_sync")

        # We execute the following steps concurrently:
        # (1) Generate rollouts and store them in our local replay buffer.
        # Calling next() on store_op drives this.
        store_op = rollouts.for_each(
            StoreToReplayBuffer(local_buffer=kwargs["local_replay_buffer"])
        )

        if config["simple_optimizer"]:
            train_step_op = TrainOneStep(workers)
        else:
            train_step_op = MultiGPUTrainOneStep(
                workers=workers,
                sgd_minibatch_size=config["train_batch_size"],
                num_sgd_iter=1,
                num_gpus=config["num_gpus"],
                _fake_gpus=config["_fake_gpus"],
            )

        # (2) Read and train on experiences from the replay buffer. Every batch
        # returned from the LocalReplay() iterator is passed to TrainOneStep to
        # take a SGD step.
        replay_op = (
            Replay(local_buffer=kwargs["local_replay_buffer"])
            .for_each(train_step_op)
            .for_each(
                UpdateTargetNetwork(workers, config["target_network_update_freq"])
            )
        )

        # Alternate deterministically between (1) and (2). Only return the
        # output of (2) since training metrics are not available until (2)
        # runs.
        train_op = Concurrently(
            [store_op, replay_op],
            mode="round_robin",
            output_indexes=[1],
            round_robin_weights=calculate_round_robin_weights(config),
        )

        return StandardMetricsReporting(train_op, workers, config)
[RLlib] Implement the SlateQ algorithm (#11450) 2020-11-03 00:52:04 -08:00			`"""`
			`SlateQ (Reinforcement Learning for Recommendation)`
			`==================================================`

			`This file defines the trainer class for the SlateQ algorithm from the`
			`"Reinforcement Learning for Slate-based Recommender Systems: A Tractable
			Decomposition and Practical Methodology" <https://arxiv.org/abs/1905.12767>`_
			`paper.`

			See `slateq_torch_policy.py` for the definition of the policy. Currently, only
			`PyTorch is supported. The algorithm is written and tested for Google's RecSim`
			`environment (https://github.com/google-research/recsim).`
			`"""`

			`import logging`
			`from typing import List, Type`

[RLlib] Slate-Q tf implementation and tests/benchmarks. (#22389) 2022-02-22 09:36:44 +01:00			`from ray.rllib.agents.slateq.slateq_tf_policy import SlateQTFPolicy`
[RLlib] Implement the SlateQ algorithm (#11450) 2020-11-03 00:52:04 -08:00			`from ray.rllib.agents.slateq.slateq_torch_policy import SlateQTorchPolicy`
[RLlib] Sub-class `Trainer` (instead of `build_trainer()`): All remaining classes; soft-deprecate `build_trainer`. (#20725) 2021-12-04 22:05:26 +01:00			`from ray.rllib.agents.trainer import Trainer, with_common_config`
[RLlib] Implement the SlateQ algorithm (#11450) 2020-11-03 00:52:04 -08:00			`from ray.rllib.evaluation.worker_set import WorkerSet`
			`from ray.rllib.execution.concurrency_ops import Concurrently`
			`from ray.rllib.execution.metric_ops import StandardMetricsReporting`
			`from ray.rllib.execution.replay_ops import Replay, StoreToReplayBuffer`
			`from ray.rllib.execution.rollout_ops import ParallelRollouts`
[RLlib] SlateQ (tf GPU + multi-GPU) + Bandit fixes (#23276) 2022-03-18 13:45:16 +01:00			`from ray.rllib.execution.train_ops import (`
			`MultiGPUTrainOneStep,`
			`TrainOneStep,`
			`UpdateTargetNetwork,`
			`)`
[RLlib] Implement the SlateQ algorithm (#11450) 2020-11-03 00:52:04 -08:00			`from ray.rllib.policy.policy import Policy`
[RLlib] Sub-class `Trainer` (instead of `build_trainer()`): All remaining classes; soft-deprecate `build_trainer`. (#20725) 2021-12-04 22:05:26 +01:00			`from ray.rllib.utils.annotations import override`
[RLlib] Unify the way we create local replay buffer for all agents (#19627) * [RLlib] Unify the way we create and use LocalReplayBuffer for all the agents. This change 1. Get rid of the try...except clause when we call execution_plan(), and get rid of the Deprecation warning as a result. 2. Fix the execution_plan() call in Trainer._try_recover() too. 3. Most importantly, makes it much easier to create and use different types of local replay buffers for all our agents. E.g., allow us to easily create a reservoir sampling replay buffer for APPO agent for Riot in the near future. * Introduce explicit configuration for replay buffer types. * Fix is_training key error. * actually deprecate buffer_size field. 2021-10-26 11:56:02 -07:00			`from ray.rllib.utils.deprecation import DEPRECATED_VALUE`
[RLlib] Implement the SlateQ algorithm (#11450) 2020-11-03 00:52:04 -08:00			`from ray.rllib.utils.typing import TrainerConfigDict`
			`from ray.util.iter import LocalIterator`

			`logger = logging.getLogger(__name__)`


[CI] Replace YAPF disables with Black disables (#21982) 2022-02-08 16:29:25 -08:00			`# fmt: off`
[RLlib] Implement the SlateQ algorithm (#11450) 2020-11-03 00:52:04 -08:00			`# __sphinx_doc_begin__`
			`DEFAULT_CONFIG = with_common_config({`
			`# === Model ===`
[RLlib] Slate-Q tf implementation and tests/benchmarks. (#22389) 2022-02-22 09:36:44 +01:00			`# Dense-layer setup for each the n (document) candidate Q-network stacks.`
[RLlib] SlateQ: Add a hard-task learning test to weekly regression suite. (#22544) 2022-02-25 21:58:16 +01:00			`"fcnet_hiddens_per_candidate": [256, 32],`
[RLlib] Implement the SlateQ algorithm (#11450) 2020-11-03 00:52:04 -08:00
			`# === Exploration Settings ===`
			`"exploration_config": {`
			`# The Exploration class to use.`
[RLlib] Slate-Q tf implementation and tests/benchmarks. (#22389) 2022-02-22 09:36:44 +01:00			`# Must be SlateEpsilonGreedy or SlateSoftQ to handle the problem that`
			`# the action space of the policy is different from the space used inside`
			`# the exploration component.`
			`# E.g.: action_space=MultiDiscrete([5, 5]) <- slate-size=2, num-docs=5,`
			`# but action distribution is Categorical(5*4) -> all possible unique slates.`
[RLlib] Cleanup SlateQ algo; add test + add target Q-net (#21827) 2022-02-04 17:01:12 +01:00			`"type": "SlateEpsilonGreedy",`
[RLlib] Slate-Q tf implementation and tests/benchmarks. (#22389) 2022-02-22 09:36:44 +01:00			`"warmup_timesteps": 20000,`
			`"epsilon_timesteps": 250000,`
			`"final_epsilon": 0.01,`
[RLlib] Implement the SlateQ algorithm (#11450) 2020-11-03 00:52:04 -08:00			`},`
			`# Switch to greedy actions in evaluation workers.`
			`"evaluation_config": {`
			`"explore": False,`
			`},`

			`# Minimum env steps to optimize for per train call. This value does`
			`# not affect learning, only the length of iterations.`
			`"timesteps_per_iteration": 1000,`
[RLlib] Cleanup SlateQ algo; add test + add target Q-net (#21827) 2022-02-04 17:01:12 +01:00			# Update the target network every `target_network_update_freq` steps.
[RLlib] Slate-Q tf implementation and tests/benchmarks. (#22389) 2022-02-22 09:36:44 +01:00			`"target_network_update_freq": 3200,`
[RLlib] Cleanup SlateQ algo; add test + add target Q-net (#21827) 2022-02-04 17:01:12 +01:00			`# Update the target by \tau * policy + (1-\tau) * target_policy.`
[RLlib] Slate-Q tf implementation and tests/benchmarks. (#22389) 2022-02-22 09:36:44 +01:00			`"tau": 1.0,`

			`# If True, use huber loss instead of squared loss for critic network`
			`# Conventionally, no need to clip gradients if using a huber loss`
			`"use_huber": False,`
[RLlib] SlateQ: Add a hard-task learning test to weekly regression suite. (#22544) 2022-02-25 21:58:16 +01:00			`# Threshold of the huber loss.`
[RLlib] Slate-Q tf implementation and tests/benchmarks. (#22389) 2022-02-22 09:36:44 +01:00			`"huber_threshold": 1.0,`
[RLlib] R2D2 Implementation. (#13933) 2021-02-25 12:18:11 +01:00
[RLlib] Implement the SlateQ algorithm (#11450) 2020-11-03 00:52:04 -08:00			`# === Replay buffer ===`
			`# Size of the replay buffer. Note that if async_updates is set, then`
			`# each worker will have a replay buffer of this size.`
[RLlib] Unify the way we create local replay buffer for all agents (#19627) * [RLlib] Unify the way we create and use LocalReplayBuffer for all the agents. This change 1. Get rid of the try...except clause when we call execution_plan(), and get rid of the Deprecation warning as a result. 2. Fix the execution_plan() call in Trainer._try_recover() too. 3. Most importantly, makes it much easier to create and use different types of local replay buffers for all our agents. E.g., allow us to easily create a reservoir sampling replay buffer for APPO agent for Riot in the near future. * Introduce explicit configuration for replay buffer types. * Fix is_training key error. * actually deprecate buffer_size field. 2021-10-26 11:56:02 -07:00			`"buffer_size": DEPRECATED_VALUE,`
			`"replay_buffer_config": {`
[RLlib] Replay buffer API (cleanups; docstrings; renames; move into `rllib/execution/buffers` dir) (#20552) 2021-11-19 11:57:37 +01:00			`"type": "MultiAgentReplayBuffer",`
[RLlib] Cleanup SlateQ algo; add test + add target Q-net (#21827) 2022-02-04 17:01:12 +01:00			`"capacity": 100000,`
[RLlib] Unify the way we create local replay buffer for all agents (#19627) * [RLlib] Unify the way we create and use LocalReplayBuffer for all the agents. This change 1. Get rid of the try...except clause when we call execution_plan(), and get rid of the Deprecation warning as a result. 2. Fix the execution_plan() call in Trainer._try_recover() too. 3. Most importantly, makes it much easier to create and use different types of local replay buffers for all our agents. E.g., allow us to easily create a reservoir sampling replay buffer for APPO agent for Riot in the near future. * Introduce explicit configuration for replay buffer types. * Fix is_training key error. * actually deprecate buffer_size field. 2021-10-26 11:56:02 -07:00			`},`
[RLlib] R2D2 Implementation. (#13933) 2021-02-25 12:18:11 +01:00			`# The number of contiguous environment steps to replay at once. This may`
			`# be set to greater than 1 to support recurrent models.`
			`"replay_sequence_length": 1,`
[RLlib] Implement the SlateQ algorithm (#11450) 2020-11-03 00:52:04 -08:00			`# Whether to LZ4 compress observations`
			`"compress_observations": False,`
			`# If set, this will fix the ratio of replayed from a buffer and learned on`
			`# timesteps to sampled from an environment and stored in the replay buffer`
			`# timesteps. Otherwise, the replay will proceed at the native ratio`
			`# determined by (train_batch_size / rollout_fragment_length).`
			`"training_intensity": None,`

			`# === Optimization ===`
[RLlib] SlateQ: Add a hard-task learning test to weekly regression suite. (#22544) 2022-02-25 21:58:16 +01:00			`# Learning rate for RMSprop optimizer for the q-model.`
			`"lr": 0.00025,`
[RLlib] Slate-Q tf implementation and tests/benchmarks. (#22389) 2022-02-22 09:36:44 +01:00			`# Learning rate schedule.`
			`# In the format of [[timestep, value], [timestep, value], ...]`
			`# A schedule should normally start from timestep 0.`
[RLlib] SlateQ: Add a hard-task learning test to weekly regression suite. (#22544) 2022-02-25 21:58:16 +01:00			`"lr_schedule": None,`
			`# Learning rate for adam optimizer for the user choice model.`
			`"lr_choice_model": 1e-3, # Only relevant for framework=torch.`
[RLlib] Slate-Q tf implementation and tests/benchmarks. (#22389) 2022-02-22 09:36:44 +01:00
[RLlib] SlateQ: Add a hard-task learning test to weekly regression suite. (#22544) 2022-02-25 21:58:16 +01:00			`# RMSProp epsilon hyper parameter.`
			`"rmsprop_epsilon": 1e-5,`
[RLlib] Implement the SlateQ algorithm (#11450) 2020-11-03 00:52:04 -08:00			`# If not None, clip gradients during optimization at this value`
[RLlib] Slate-Q tf implementation and tests/benchmarks. (#22389) 2022-02-22 09:36:44 +01:00			`"grad_clip": None,`
[RLlib] Implement the SlateQ algorithm (#11450) 2020-11-03 00:52:04 -08:00			`# How many steps of the model to sample before learning starts.`
[RLlib] Slate-Q tf implementation and tests/benchmarks. (#22389) 2022-02-22 09:36:44 +01:00			`"learning_starts": 20000,`
[RLlib] Implement the SlateQ algorithm (#11450) 2020-11-03 00:52:04 -08:00			`# Update the replay buffer with this many samples at once. Note that`
			`# this setting applies per-worker if num_workers > 1.`
[RLlib] Slate-Q tf implementation and tests/benchmarks. (#22389) 2022-02-22 09:36:44 +01:00			`"rollout_fragment_length": 4,`
[RLlib] Implement the SlateQ algorithm (#11450) 2020-11-03 00:52:04 -08:00			`# Size of a batch sampled from replay buffer for training. Note that`
			`# if async_updates is set, then each worker returns gradients for a`
			`# batch of this size.`
			`"train_batch_size": 32,`

			`# === Parallelism ===`
			`# Number of workers for collecting samples with. This only makes sense`
			`# to increase if your environment is particularly slow to sample, or if`
			`# you"re using the Async or Ape-X optimizers.`
			`"num_workers": 0,`
			`# Whether to compute priorities on workers.`
			`"worker_side_prioritization": False,`
[RLlib] Preparatory PR for multi-agent multi-GPU learner (alpha-star style) #03 (#21652) 2022-01-25 14:16:58 +01:00			`# Prevent reporting frequency from going lower than this time span.`
			`"min_time_s_per_reporting": 1,`
[RLlib] Implement the SlateQ algorithm (#11450) 2020-11-03 00:52:04 -08:00
[RLlib] SlateQ: Add a hard-task learning test to weekly regression suite. (#22544) 2022-02-25 21:58:16 +01:00			`# Switch on no-preprocessors for easier Q-model coding.`
			`"_disable_preprocessor_api": True,`
[RLlib] Implement the SlateQ algorithm (#11450) 2020-11-03 00:52:04 -08:00			`})`
			`# __sphinx_doc_end__`
[CI] Replace YAPF disables with Black disables (#21982) 2022-02-08 16:29:25 -08:00			`# fmt: on`
[RLlib] Implement the SlateQ algorithm (#11450) 2020-11-03 00:52:04 -08:00

			`def calculate_round_robin_weights(config: TrainerConfigDict) -> List[float]:`
			`"""Calculate the round robin weights for the rollout and train steps"""`
			`if not config["training_intensity"]:`
			`return [1, 1]`
			`# e.g., 32 / 4 -> native ratio of 8.0`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`native_ratio = config["train_batch_size"] / config["rollout_fragment_length"]`
[RLlib] Implement the SlateQ algorithm (#11450) 2020-11-03 00:52:04 -08:00			`# Training intensity is specified in terms of`
			`# (steps_replayed / steps_sampled), so adjust for the native ratio.`
			`weights = [1, config["training_intensity"] / native_ratio]`
			`return weights`


[RLlib] Sub-class `Trainer` (instead of `build_trainer()`): All remaining classes; soft-deprecate `build_trainer`. (#20725) 2021-12-04 22:05:26 +01:00			`class SlateQTrainer(Trainer):`
			`@classmethod`
			`@override(Trainer)`
			`def get_default_config(cls) -> TrainerConfigDict:`
			`return DEFAULT_CONFIG`

			`@override(Trainer)`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`def get_default_policy_class(self, config: TrainerConfigDict) -> Type[Policy]:`
[RLlib] SlateQ: Add a hard-task learning test to weekly regression suite. (#22544) 2022-02-25 21:58:16 +01:00			`if config["framework"] == "torch":`
[RLlib] Sub-class `Trainer` (instead of `build_trainer()`): All remaining classes; soft-deprecate `build_trainer`. (#20725) 2021-12-04 22:05:26 +01:00			`return SlateQTorchPolicy`
[RLlib] Slate-Q tf implementation and tests/benchmarks. (#22389) 2022-02-22 09:36:44 +01:00			`else:`
			`return SlateQTFPolicy`
[RLlib] Sub-class `Trainer` (instead of `build_trainer()`): All remaining classes; soft-deprecate `build_trainer`. (#20725) 2021-12-04 22:05:26 +01:00
			`@staticmethod`
			`@override(Trainer)`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`def execution_plan(`
			`workers: WorkerSet, config: TrainerConfigDict, **kwargs`
			`) -> LocalIterator[dict]:`
			`assert (`
			`"local_replay_buffer" in kwargs`
			`), "SlateQ execution plan requires a local replay buffer."`
[RLlib] Sub-class `Trainer` (instead of `build_trainer()`): All remaining classes; soft-deprecate `build_trainer`. (#20725) 2021-12-04 22:05:26 +01:00
			`rollouts = ParallelRollouts(workers, mode="bulk_sync")`

			`# We execute the following steps concurrently:`
			`# (1) Generate rollouts and store them in our local replay buffer.`
			`# Calling next() on store_op drives this.`
			`store_op = rollouts.for_each(`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`StoreToReplayBuffer(local_buffer=kwargs["local_replay_buffer"])`
			`)`
[RLlib] Sub-class `Trainer` (instead of `build_trainer()`): All remaining classes; soft-deprecate `build_trainer`. (#20725) 2021-12-04 22:05:26 +01:00
[RLlib] SlateQ (tf GPU + multi-GPU) + Bandit fixes (#23276) 2022-03-18 13:45:16 +01:00			`if config["simple_optimizer"]:`
			`train_step_op = TrainOneStep(workers)`
			`else:`
			`train_step_op = MultiGPUTrainOneStep(`
			`workers=workers,`
			`sgd_minibatch_size=config["train_batch_size"],`
			`num_sgd_iter=1,`
			`num_gpus=config["num_gpus"],`
			`_fake_gpus=config["_fake_gpus"],`
			`)`

[RLlib] Sub-class `Trainer` (instead of `build_trainer()`): All remaining classes; soft-deprecate `build_trainer`. (#20725) 2021-12-04 22:05:26 +01:00			`# (2) Read and train on experiences from the replay buffer. Every batch`
			`# returned from the LocalReplay() iterator is passed to TrainOneStep to`
			`# take a SGD step.`
[RLlib] Cleanup SlateQ algo; add test + add target Q-net (#21827) 2022-02-04 17:01:12 +01:00			`replay_op = (`
			`Replay(local_buffer=kwargs["local_replay_buffer"])`
[RLlib] SlateQ (tf GPU + multi-GPU) + Bandit fixes (#23276) 2022-03-18 13:45:16 +01:00			`.for_each(train_step_op)`
[RLlib] Cleanup SlateQ algo; add test + add target Q-net (#21827) 2022-02-04 17:01:12 +01:00			`.for_each(`
			`UpdateTargetNetwork(workers, config["target_network_update_freq"])`
			`)`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`)`
[RLlib] Sub-class `Trainer` (instead of `build_trainer()`): All remaining classes; soft-deprecate `build_trainer`. (#20725) 2021-12-04 22:05:26 +01:00
[RLlib] SlateQ: Add a hard-task learning test to weekly regression suite. (#22544) 2022-02-25 21:58:16 +01:00			`# Alternate deterministically between (1) and (2). Only return the`
			`# output of (2) since training metrics are not available until (2)`
			`# runs.`
			`train_op = Concurrently(`
			`[store_op, replay_op],`
			`mode="round_robin",`
			`output_indexes=[1],`
			`round_robin_weights=calculate_round_robin_weights(config),`
			`)`
[RLlib] Sub-class `Trainer` (instead of `build_trainer()`): All remaining classes; soft-deprecate `build_trainer`. (#20725) 2021-12-04 22:05:26 +01:00
			`return StandardMetricsReporting(train_op, workers, config)`