[RLlib] Add Decision Transformer (DT) (#27890)

Charles Sun 2022-08-17 13:49:13 -07:00 committed by GitHub
parent 6be4bf8be3
commit edde905741
12 changed files with 1050 additions and 3 deletions

rllib/BUILD

@@ -381,6 +381,35 @@ py_test(
args = ["--yaml-dir=tuned_examples/dqn"]
)
# DT
py_test(
name = "learning_tests_pendulum_dt",
main = "tests/run_regression_tests.py",
tags = ["team:rllib", "torch_only", "learning_tests", "learning_tests_pendulum", "learning_tests_continuous"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
# Include an offline json data file as well.
data = [
"tuned_examples/dt/pendulum-v1-dt.yaml",
"tests/data/pendulum/pendulum_expert_sac_50eps.zip",
],
args = ["--yaml-dir=tuned_examples/dt"]
)
py_test(
name = "learning_tests_cartpole_dt",
main = "tests/run_regression_tests.py",
tags = ["team:rllib", "torch_only", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
# Include an offline json data file as well.
data = [
"tuned_examples/dt/cartpole-v0-dt.yaml",
"tests/data/cartpole/large.json",
],
args = ["--yaml-dir=tuned_examples/dt"]
)
# Simple-Q
py_test(
name = "learning_tests_cartpole_simpleq",
@@ -928,6 +957,14 @@ py_test(
srcs = ["algorithms/dt/tests/test_dt_policy.py"]
)
py_test(
name = "test_dt",
tags = ["team:rllib", "algorithms_dir"],
size = "medium",
srcs = ["algorithms/dt/tests/test_dt.py"],
data = ["tests/data/pendulum/large.json"],
)
# ES
py_test(
name = "test_es",
@@ -3148,6 +3185,16 @@ py_test(
args = ["--stop-iters=2", "--framework=torch"]
)
py_test(
name = "examples/inference_and_serving/policy_inference_after_training_with_dt_torch",
main = "examples/inference_and_serving/policy_inference_after_training_with_dt.py",
tags = ["team:rllib", "exclusive", "examples", "examples_P"],
size = "medium",
srcs = ["examples/inference_and_serving/policy_inference_after_training_with_dt.py"],
data = ["tests/data/cartpole/large.json"],
args = ["--input-files=tests/data/cartpole/large.json"]
)
py_test(
name = "examples/inference_and_serving/policy_inference_after_training_with_lstm_tf",
main = "examples/inference_and_serving/policy_inference_after_training_with_lstm.py",

rllib/algorithms/dt/__init__.py (new file)

@@ -0,0 +1,6 @@
from ray.rllib.algorithms.dt.dt import DT, DTConfig
__all__ = [
"DT",
"DTConfig",
]
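For orientation, a minimal sketch of how these exports are typically used (the values below mirror the compilation test and tuned examples added later in this commit; the offline-data path is a placeholder):

from ray.rllib.algorithms.dt import DTConfig

config = (
    DTConfig()
    .environment(env="Pendulum-v1", clip_actions=True, normalize_actions=True)
    .framework("torch")
    .offline_data(
        input_="dataset",
        # Placeholder path; any RLlib "dataset" json input works here.
        input_config={"format": "json", "paths": "/path/to/pendulum_offline.json"},
        actions_in_input_normalized=True,
    )
    .training(
        train_batch_size=200,
        replay_buffer_config={"capacity": 8},
        model={"max_seq_len": 4},
        num_layers=1,
        num_heads=1,
        embed_dim=64,
    )
    .evaluation(target_return=-120)
    .rollouts(num_rollout_workers=0, horizon=200)
)
# Build and train the DT Algorithm.
algo = config.build()
results = algo.train()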

rllib/algorithms/dt/dt.py (new file)

@@ -0,0 +1,401 @@
import logging
import math
from typing import List, Optional, Type, Tuple, Dict, Any, Union
from ray.rllib import SampleBatch
from ray.rllib.algorithms.algorithm import Algorithm, AlgorithmConfig
from ray.rllib.algorithms.dt.segmentation_buffer import MultiAgentSegmentationBuffer
from ray.rllib.execution import synchronous_parallel_sample
from ray.rllib.execution.train_ops import multi_gpu_train_one_step, train_one_step
from ray.rllib.policy import Policy
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
from ray.rllib.utils.annotations import override, PublicAPI
from ray.rllib.utils.metrics import (
NUM_AGENT_STEPS_SAMPLED,
NUM_ENV_STEPS_SAMPLED,
SAMPLE_TIMER,
NUM_AGENT_STEPS_TRAINED,
)
from ray.rllib.utils.typing import (
AlgorithmConfigDict,
ResultDict,
TensorStructType,
PolicyID,
TensorType,
)
logger = logging.getLogger(__name__)
class DTConfig(AlgorithmConfig):
def __init__(self, algo_class=None):
super().__init__(algo_class=algo_class or DT)
# fmt: off
# __sphinx_doc_begin__
# DT-specific settings.
# Required settings during training and evaluation:
# Initial return to go used as target during rollout.
self.target_return = None
# Rollout horizon/maximum episode length.
self.horizon = None
# Model settings:
self.model = {
# Transformer (GPT) context length.
"max_seq_len": 5,
}
# Transformer (GPT) settings:
self.embed_dim = 128
self.num_layers = 2
self.num_heads = 1
self.embed_pdrop = 0.1
self.resid_pdrop = 0.1
self.attn_pdrop = 0.1
# Optimization settings:
self.lr = 1e-4
self.lr_schedule = None
self.optimizer = {
# Weight decay for Adam optimizer.
"weight_decay": 1e-4,
# Betas for Adam optimizer.
"betas": (0.9, 0.95),
}
self.grad_clip = None
# Coefficients on the loss for each of the heads.
# By default, only use the actions outputs for training.
self.loss_coef_actions = 1
self.loss_coef_obs = 0
self.loss_coef_returns_to_go = 0
self.replay_buffer_config = {
# Number of trajectories/episodes the segmentation buffer holds.
# Increase for more data shuffling, at the cost of higher memory usage.
"capacity": 20,
# Do not change the type of replay buffer.
"type": MultiAgentSegmentationBuffer,
}
# __sphinx_doc_end__
# fmt: on
# Overwriting the trainer config default
# If data ingestion/sample_time is slow, increase this.
self.num_workers = 0
# Number of training_step calls between evaluation rollouts.
self.min_train_timesteps_per_iteration = 5000
# Don't change
self.offline_sampling = True
self.postprocess_inputs = True
self.discount = None
def training(
self,
*,
replay_buffer_config: Optional[Dict[str, Any]],
embed_dim: Optional[int] = None,
num_layers: Optional[int] = None,
num_heads: Optional[int] = None,
embed_pdrop: Optional[float] = None,
resid_pdrop: Optional[float] = None,
attn_pdrop: Optional[float] = None,
grad_clip: Optional[float] = None,
loss_coef_actions: Optional[float] = None,
loss_coef_obs: Optional[float] = None,
loss_coef_returns_to_go: Optional[float] = None,
lr_schedule: Optional[List[List[Union[int, float]]]] = None,
**kwargs,
) -> "DTConfig":
"""
=== DT configs
Args:
replay_buffer_config: Replay buffer config.
{
"capacity": How many trajectories/episodes does the buffer hold.
}
embed_dim: Dimension of the embeddings in the GPT model.
num_layers: Number of attention layers in the GPT model.
num_heads: Number of attention heads in the GPT model. Must divide
embed_dim evenly.
embed_pdrop: Dropout probability of the embedding layer of the GPT model.
resid_pdrop: Dropout probability of the residual layer of the GPT model.
attn_pdrop: Dropout probability of the attention layer of the GPT model.
grad_clip: If specified, clip the global norm of gradients by this amount.
lr_schedule: Learning rate schedule. In the format of
[[timestep, lr-value], [timestep, lr-value], ...]
Intermediary timesteps will be assigned to interpolated learning rate
values. A schedule should normally start from timestep 0.
loss_coef_actions: Coefficient on the loss for the actions output.
Defaults to 1.
loss_coef_obs: Coefficient on the loss for the obs output. Defaults to 0.
Set to a value greater than 0 to regress on the obs output.
loss_coef_returns_to_go: Coefficient on the loss for the returns_to_go
output. Defaults to 0. Set to a value greater than 0 to regress on the
returns_to_go output.
**kwargs: Forward compatibility kwargs
Returns:
This updated DTConfig object.
"""
super().training(**kwargs)
if replay_buffer_config is not None:
self.replay_buffer_config = replay_buffer_config
if embed_dim is not None:
self.embed_dim = embed_dim
if num_layers is not None:
self.num_layers = num_layers
if num_heads is not None:
self.num_heads = num_heads
if embed_pdrop is not None:
self.embed_pdrop = embed_pdrop
if resid_pdrop is not None:
self.resid_pdrop = resid_pdrop
if attn_pdrop is not None:
self.attn_pdrop = attn_pdrop
if grad_clip is not None:
self.grad_clip = grad_clip
if lr_schedule is not None:
self.lr_schedule = lr_schedule
if loss_coef_actions is not None:
self.loss_coef_actions = loss_coef_actions
if loss_coef_obs is not None:
self.loss_coef_obs = loss_coef_obs
if loss_coef_returns_to_go is not None:
self.loss_coef_returns_to_go = loss_coef_returns_to_go
return self
def evaluation(
self,
*,
target_return: Optional[float] = None,
**kwargs,
) -> "DTConfig":
"""
=== DT configs
Args:
target_return: The target return-to-go for inference/evaluation.
**kwargs: Forward compatibility kwargs
Returns:
This updated DTConfig object.
"""
super().evaluation(**kwargs)
if target_return is not None:
self.target_return = target_return
return self
class DT(Algorithm):
"""Implements Decision Transformer: https://arxiv.org/abs/2106.01345"""
# TODO: We have a circular dependency for the default config:
# config -> Trainer -> config. Defining the Config class in the
# same file for now as a workaround.
@override(Algorithm)
def validate_config(self, config: AlgorithmConfigDict) -> None:
"""Validates the Trainer's config dict.
Args:
config: The Trainer's config to check.
Raises:
ValueError: In case something is wrong with the config.
"""
# Call super's validation method.
super().validate_config(config)
# target_return must be specified
assert (
self.config.get("target_return") is not None
), "Must specify a target return (total sum of rewards)."
# horizon must be specified and >= 2
assert self.config.get("horizon") is not None, "Must specify rollout horizon."
assert self.config["horizon"] >= 2, "rollout horizon must be at least 2."
# replay_buffer's type must be MultiAgentSegmentationBuffer
assert (
self.config.get("replay_buffer_config") is not None
), "Must specify replay_buffer_config."
replay_buffer_type = self.config["replay_buffer_config"].get("type")
assert (
replay_buffer_type == MultiAgentSegmentationBuffer
), "replay_buffer's type must be MultiAgentSegmentationBuffer."
# max_seq_len must be specified in model
model_max_seq_len = self.config["model"].get("max_seq_len")
assert model_max_seq_len is not None, "Must specify model's max_seq_len."
# User shouldn't need to specify replay_buffer's max_seq_len.
# Autofill for replay buffer API. If they did specify, make sure it
# matches with model's max_seq_len
buffer_max_seq_len = self.config["replay_buffer_config"].get("max_seq_len")
if buffer_max_seq_len is None:
self.config["replay_buffer_config"]["max_seq_len"] = model_max_seq_len
else:
assert (
buffer_max_seq_len == model_max_seq_len
), "replay_buffer's max_seq_len must equal model's max_seq_len."
# Same thing for buffer's max_ep_len, which should be autofilled from
# rollout's horizon, or check that it matches if user specified.
buffer_max_ep_len = self.config["replay_buffer_config"].get("max_ep_len")
if buffer_max_ep_len is None:
self.config["replay_buffer_config"]["max_ep_len"] = self.config["horizon"]
else:
assert (
buffer_max_ep_len == self.config["horizon"]
), "replay_buffer's max_ep_len must equal rollout horizon."
@classmethod
@override(Algorithm)
def get_default_config(cls) -> AlgorithmConfigDict:
return DTConfig().to_dict()
@override(Algorithm)
def get_default_policy_class(self, config: AlgorithmConfigDict) -> Type[Policy]:
if config["framework"] == "torch":
from ray.rllib.algorithms.dt.dt_torch_policy import DTTorchPolicy
return DTTorchPolicy
else:
raise ValueError("Non-torch frameworks are not supported yet!")
@override(Algorithm)
def training_step(self) -> ResultDict:
with self._timers[SAMPLE_TIMER]:
# TODO: Add ability to do obs_filter for offline sampling.
train_batch = synchronous_parallel_sample(worker_set=self.workers)
train_batch = train_batch.as_multi_agent()
self._counters[NUM_AGENT_STEPS_SAMPLED] += train_batch.agent_steps()
self._counters[NUM_ENV_STEPS_SAMPLED] += train_batch.env_steps()
# Because each sample is a segment of max_seq_len transitions, doing
# the division makes it so the total number of transitions per train
# step is consistent.
num_steps = train_batch.env_steps()
batch_size = int(math.ceil(num_steps / self.config["model"]["max_seq_len"]))
# Add the batch of episodes to the segmentation buffer.
self.local_replay_buffer.add(train_batch)
# Sample a batch of segments.
train_batch = self.local_replay_buffer.sample(batch_size)
# Postprocess batch before we learn on it.
post_fn = self.config.get("before_learn_on_batch") or (lambda b, *a: b)
train_batch = post_fn(train_batch, self.workers, self.config)
# Learn on training batch.
# Use simple optimizer (only for multi-agent or tf-eager; all other
# cases should use the multi-GPU optimizer, even if only using 1 GPU)
if self.config.get("simple_optimizer", False):
train_results = train_one_step(self, train_batch)
else:
train_results = multi_gpu_train_one_step(self, train_batch)
# Update learning rate scheduler.
global_vars = {
# Note: this counts the number of segments trained, not timesteps.
# i.e. NUM_AGENT_STEPS_TRAINED: B, NUM_AGENT_STEPS_SAMPLED: B*T
"timestep": self._counters[NUM_AGENT_STEPS_TRAINED],
}
self.workers.local_worker().set_global_vars(global_vars)
return train_results
@PublicAPI
@override(Algorithm)
def compute_single_action(
self,
*args,
input_dict: Optional[SampleBatch] = None,
full_fetch: bool = True,
**kwargs,
) -> Tuple[TensorStructType, List[TensorType], Dict[str, TensorType]]:
"""Computes an action for the specified policy on the local worker.
Note that you can also access the policy object through
self.get_policy(policy_id) and call compute_single_action() on it
directly.
Args:
input_dict: A SampleBatch taken from get_initial_input_dict or
get_next_input_dict.
full_fetch: Whether to return extra action fetch results.
This is always True for DT.
kwargs: forward compatibility args.
Returns:
A tuple containing: (
the computed action,
list of RNN states (empty for DT),
extra action output (pass to get_next_input_dict),
)
"""
assert input_dict is not None, (
"DT must take in input_dict for inference. "
"See get_initial_input_dict() and get_next_input_dict()."
)
assert (
full_fetch
), "DT needs full_fetch=True. Pass extra into get_next_input_dict()."
return super().compute_single_action(
*args, input_dict=input_dict.copy(), full_fetch=full_fetch, **kwargs
)
@PublicAPI
def get_initial_input_dict(
self,
observation: TensorStructType,
policy_id: PolicyID = DEFAULT_POLICY_ID,
) -> SampleBatch:
"""Get the initial input_dict to be passed into compute_single_action.
Args:
observation: first (unbatched) observation from env.reset()
policy_id: Policy to query (only applies to multi-agent).
Default: "default_policy".
Returns:
The input_dict for inference.
"""
policy = self.get_policy(policy_id)
return policy.get_initial_input_dict(observation)
@PublicAPI
def get_next_input_dict(
self,
input_dict: SampleBatch,
action: TensorStructType,
reward: TensorStructType,
next_obs: TensorStructType,
extra: Dict[str, TensorType],
policy_id: PolicyID = DEFAULT_POLICY_ID,
) -> SampleBatch:
"""Returns a new input_dict after stepping through the environment once.
Args:
input_dict: the input dict passed into compute_single_action.
action: the (unbatched) action taken this step.
reward: the (unbatched) reward from env.step
next_obs: the (unbatched) next observation from env.step
extra: the extra action out from compute_single_action.
For DT this contains the current returns-to-go *before* the current
reward is subtracted from target_return.
policy_id: Policy to query (only applies to multi-agent).
Default: "default_policy".
Returns:
A new input_dict to be passed into compute_single_action.
"""
policy = self.get_policy(policy_id)
return policy.get_next_input_dict(input_dict, action, reward, next_obs, extra)
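To make the segment-batch arithmetic in training_step concrete, a small worked example (the numbers are taken from the compilation test added below, not mandated by the algorithm):

import math

# 200 transitions sampled from the offline reader per training_step, with a
# model context of max_seq_len=4, means each sampled segment covers up to 4
# transitions, so ceil(200 / 4) = 50 segments are drawn from the buffer.
num_steps = 200
max_seq_len = 4
batch_size = int(math.ceil(num_steps / max_seq_len))
assert batch_size == 50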

rllib/algorithms/dt/segmentation_buffer.py

@@ -58,9 +58,6 @@ class SegmentationBuffer:
self._add_single_episode(episode)
def _add_single_episode(self, episode: SampleBatch):
# Truncate if episode too long.
# Note: sometimes this happens if the dataset shuffles such that the
# same episode is concatenated together twice (which is okay).
ep_len = episode.env_steps()
if ep_len > self.max_ep_len:

rllib/algorithms/dt/tests/test_dt.py (new file)

@@ -0,0 +1,269 @@
from pathlib import Path
import os
import unittest
from typing import Dict
import gym
import numpy as np
import ray
from ray.rllib import SampleBatch
from ray.rllib.algorithms.dt import DTConfig
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.utils.test_utils import check_train_results
tf1, tf, tfv = try_import_tf()
torch, _ = try_import_torch()
def _assert_input_dict_equals(d1: Dict[str, np.ndarray], d2: Dict[str, np.ndarray]):
for key in d1.keys():
assert key in d2.keys()
for key in d2.keys():
assert key in d1.keys()
for key in d1.keys():
assert isinstance(d1[key], np.ndarray)
assert isinstance(d2[key], np.ndarray)
assert d1[key].shape == d2[key].shape
assert np.allclose(d1[key], d2[key])
class TestDT(unittest.TestCase):
@classmethod
def setUpClass(cls):
ray.init()
@classmethod
def tearDownClass(cls):
ray.shutdown()
def test_dt_compilation(self):
"""Test whether a DT algorithm can be built with all supported frameworks."""
rllib_dir = Path(__file__).parent.parent.parent.parent
data_file = os.path.join(rllib_dir, "tests/data/pendulum/large.json")
input_config = {
"paths": data_file,
"format": "json",
}
config = (
DTConfig()
.environment(
env="Pendulum-v1",
clip_actions=True,
normalize_actions=True,
)
.framework("torch")
.offline_data(
input_="dataset",
input_config=input_config,
actions_in_input_normalized=True,
)
.training(
train_batch_size=200,
replay_buffer_config={
"capacity": 8,
},
model={
"max_seq_len": 4,
},
num_layers=1,
num_heads=1,
embed_dim=64,
)
.evaluation(
target_return=-120,
evaluation_interval=2,
evaluation_num_workers=0,
evaluation_duration=10,
evaluation_duration_unit="episodes",
evaluation_parallel_to_training=False,
evaluation_config={"input": "sampler", "explore": False},
)
.rollouts(
num_rollout_workers=0,
horizon=200,
)
.reporting(
min_train_timesteps_per_iteration=10,
)
)
num_iterations = 4
for _ in ["torch"]:
algo = config.build()
# Check that 4 training iterations run without errors.
for i in range(num_iterations):
results = algo.train()
check_train_results(results)
print(results)
if (i + 1) % 2 == 0:
# evaluation happens every 2 iterations
eval_results = results["evaluation"]
print(
f"iter={algo.iteration} "
f"R={eval_results['episode_reward_mean']}"
)
# do example inference rollout
env = gym.make("Pendulum-v1")
obs = env.reset()
input_dict = algo.get_initial_input_dict(obs)
for _ in range(200):
action, _, extra = algo.compute_single_action(input_dict=input_dict)
obs, reward, done, _ = env.step(action)
if done:
break
else:
input_dict = algo.get_next_input_dict(
input_dict,
action,
reward,
obs,
extra,
)
env.close()
algo.stop()
def test_inference_methods(self):
"""Test inference methods."""
config = (
DTConfig()
.environment(
env="Pendulum-v1",
clip_actions=True,
normalize_actions=True,
)
.framework("torch")
.training(
train_batch_size=200,
replay_buffer_config={
"capacity": 8,
},
model={
"max_seq_len": 3,
},
num_layers=1,
num_heads=1,
embed_dim=64,
)
.evaluation(
target_return=-120,
)
.rollouts(
num_rollout_workers=0,
horizon=200,
)
)
algo = config.build()
# Do a controlled fake rollout for 2 steps and check input_dict
# first input_dict
obs = np.array([0.0, 1.0, 2.0])
input_dict = algo.get_initial_input_dict(obs)
target = SampleBatch(
{
SampleBatch.OBS: np.array(
[
[0.0, 0.0, 0.0],
[0.0, 0.0, 0.0],
[0.0, 1.0, 2.0],
],
dtype=np.float32,
),
SampleBatch.ACTIONS: np.array([[0.0], [0.0]], dtype=np.float32),
SampleBatch.RETURNS_TO_GO: np.array([0.0, 0.0], dtype=np.float32),
SampleBatch.REWARDS: np.zeros((), dtype=np.float32),
SampleBatch.T: np.array([-1, -1], dtype=np.int32),
}
)
_assert_input_dict_equals(input_dict, target)
# forward pass with first input_dict
action, _, extra = algo.compute_single_action(input_dict=input_dict)
assert action.shape == (1,)
assert SampleBatch.RETURNS_TO_GO in extra
assert np.isclose(extra[SampleBatch.RETURNS_TO_GO], -120.0)
# second input_dict
action = np.array([0.5])
obs = np.array([3.0, 4.0, 5.0])
reward = -10.0
input_dict = algo.get_next_input_dict(
input_dict,
action,
reward,
obs,
extra,
)
target = SampleBatch(
{
SampleBatch.OBS: np.array(
[
[0.0, 0.0, 0.0],
[0.0, 1.0, 2.0],
[3.0, 4.0, 5.0],
],
dtype=np.float32,
),
SampleBatch.ACTIONS: np.array([[0.0], [0.5]], dtype=np.float32),
SampleBatch.RETURNS_TO_GO: np.array([0.0, -120.0], dtype=np.float32),
SampleBatch.REWARDS: np.asarray(-10.0),
SampleBatch.T: np.array([-1, 0], dtype=np.int32),
}
)
_assert_input_dict_equals(input_dict, target)
# forward pass with second input_dict
action, _, extra = algo.compute_single_action(input_dict=input_dict)
assert action.shape == (1,)
assert SampleBatch.RETURNS_TO_GO in extra
assert np.isclose(extra[SampleBatch.RETURNS_TO_GO], -110.0)
# third input_dict
action = np.array([-0.2])
obs = np.array([6.0, 7.0, 8.0])
reward = -20.0
input_dict = algo.get_next_input_dict(
input_dict,
action,
reward,
obs,
extra,
)
target = SampleBatch(
{
SampleBatch.OBS: np.array(
[
[0.0, 1.0, 2.0],
[3.0, 4.0, 5.0],
[6.0, 7.0, 8.0],
],
dtype=np.float32,
),
SampleBatch.ACTIONS: np.array([[0.5], [-0.2]], dtype=np.float32),
SampleBatch.RETURNS_TO_GO: np.array([-120, -110.0], dtype=np.float32),
SampleBatch.REWARDS: np.asarray(-20.0),
SampleBatch.T: np.array([0, 1], dtype=np.int32),
}
)
_assert_input_dict_equals(input_dict, target)
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
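The returns-to-go values asserted above follow directly from the rule documented in dt.py: the first query uses target_return, and each later query subtracts the reward just received. A plain-Python sketch of that bookkeeping (illustrative only, not the policy's internal code):

target_return = -120.0
rewards = [-10.0, -20.0]

rtg = target_return  # first compute_single_action reports -120.0
for r in rewards:
    rtg -= r  # -110.0 after the first step (as asserted above), then -90.0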

rllib/algorithms/registry.py

@@ -108,6 +108,12 @@ def _import_dreamer():
return dreamer.Dreamer, dreamer.DreamerConfig().to_dict()
def _import_dt():
import ray.rllib.algorithms.dt as dt
return dt.DT, dt.DTConfig().to_dict()
def _import_es():
import ray.rllib.algorithms.es as es
@@ -215,6 +221,7 @@ ALGORITHMS = {
"DDPPO": _import_ddppo,
"DQN": _import_dqn,
"Dreamer": _import_dreamer,
"DT": _import_dt,
"IMPALA": _import_impala,
"APPO": _import_appo,
"AlphaStar": _import_alpha_star,
@@ -309,6 +316,7 @@ POLICIES = {
"DQNTFPolicy": "dqn.dqn_tf_policy",
"DQNTorchPolicy": "dqn.dqn_torch_policy",
"DreamerTorchPolicy": "dreamer.dreamer_torch_policy",
"DTTorchPolicy": "dt.dt_torch_policy",
"ESTFPolicy": "es.es_tf_policy",
"ESTorchPolicy": "es.es_torch_policy",
"ImpalaTF1Policy": "impala.impala_tf_policy",

rllib/examples/inference_and_serving/policy_inference_after_training_with_dt.py (new file)

@@ -0,0 +1,182 @@
"""
Example showing how you can use your trained Decision Transformer (DT) policy for
inference (computing actions) in an environment.
"""
import argparse
from pathlib import Path
import gym
import os
import ray
from ray import air, tune
from ray.rllib.algorithms.dt import DTConfig
from ray.rllib.algorithms.registry import get_algorithm_class
from ray.tune.utils.log import Verbosity
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--num-cpus", type=int, default=0)
parser.add_argument(
"--input-files",
nargs="+",
default=[
os.path.join(
os.path.dirname(os.path.abspath(__file__)),
"../../tests/data/cartpole/large.json",
)
],
help="List of paths to offline json files/zips for training.",
)
parser.add_argument(
"--num-episodes-during-inference",
type=int,
default=10,
help="Number of episodes to do inference over after training.",
)
args = parser.parse_args()
ray.init(num_cpus=args.num_cpus or None)
# Bazel makes it hard to find files specified in `args` (and `data`).
# Look for them here.
input_files = []
for input_file in args.input_files:
if not os.path.exists(input_file):
# This script runs in the ray/rllib/examples/inference_and_serving dir.
rllib_dir = Path(__file__).parent.parent.parent
input_dir = rllib_dir.absolute().joinpath(input_file)
input_files.append(str(input_dir))
else:
input_files.append(input_file)
# Get max_ep_len
env = gym.make("CartPole-v0")
max_ep_len = env.spec.max_episode_steps
env.close()
# Training config
config = (
DTConfig()
.environment(
env="CartPole-v0",
clip_actions=False,
normalize_actions=False,
)
.framework("torch")
.offline_data(
input_="dataset",
input_config={
"format": "json",
"paths": input_files,
},
actions_in_input_normalized=True,
)
.training(
lr=0.01,
optimizer={
"weight_decay": 0.1,
"betas": [0.9, 0.999],
},
train_batch_size=512,
replay_buffer_config={
"capacity": 20,
},
model={
"max_seq_len": 3,
},
num_layers=1,
num_heads=1,
embed_dim=64,
)
# Need to do evaluation rollouts for stopping condition.
.evaluation(
target_return=200.0,
evaluation_interval=1,
evaluation_num_workers=1,
evaluation_duration=10,
evaluation_duration_unit="episodes",
evaluation_parallel_to_training=False,
evaluation_config={"input": "sampler", "explore": False},
)
.rollouts(
num_rollout_workers=0,
# This needs to be specified
horizon=max_ep_len,
)
.reporting(
min_train_timesteps_per_iteration=5000,
)
.resources(
# Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")),
)
)
config = config.to_dict()
# Configure when to stop training.
# Note that for an offline RL algorithm, we don't do training rollouts;
# instead we have to rely on evaluation rollouts.
stop = {
"evaluation/episode_reward_mean": 200.0,
"training_iteration": 100,
}
print("Training policy until desired reward/iterations. ...")
tuner = tune.Tuner(
"DT",
param_space=config,
run_config=air.RunConfig(
stop=stop,
verbose=Verbosity.V3_TRIAL_DETAILS,
checkpoint_config=air.CheckpointConfig(
checkpoint_frequency=1,
checkpoint_at_end=True,
),
),
)
results = tuner.fit()
print("Training completed. Restoring new Algorithm for action inference.")
# Get the last checkpoint from the above training run.
checkpoint = results.get_best_result().checkpoint
# Create new Algorithm and restore its state from the last checkpoint.
algo = get_algorithm_class("DT")(config=config)
algo.restore(checkpoint)
# Create the env to do inference in.
env = gym.make("CartPole-v0")
obs = env.reset()
input_dict = algo.get_initial_input_dict(obs)
num_episodes = 0
total_rewards = 0.0
while num_episodes < args.num_episodes_during_inference:
# Compute an action (`a`).
a, _, extra = algo.compute_single_action(input_dict=input_dict)
# Send the computed action `a` to the env.
obs, reward, done, _ = env.step(a)
# Add to total rewards.
total_rewards += reward
# Is the episode `done`? -> Reset.
if done:
print(f"Episode {num_episodes+1} - return: {total_rewards}")
obs = env.reset()
input_dict = algo.get_initial_input_dict(obs)
num_episodes += 1
total_rewards = 0.0
# Episode is still ongoing -> Continue.
else:
input_dict = algo.get_next_input_dict(
input_dict,
a,
reward,
obs,
extra,
)
env.close()
ray.shutdown()

rllib/tuned_examples/dt/cartpole-v0-dt.yaml (new file)

@@ -0,0 +1,42 @@
cartpole_dt:
env: 'CartPole-v0'
run: DT
stop:
evaluation/episode_reward_mean: 200
training_iteration: 100
config:
input: 'dataset'
input_config:
paths: 'tests/data/cartpole/large.json'
format: 'json'
num_workers: 3
actions_in_input_normalized: True
clip_actions: False
# training
framework: torch
train_batch_size: 512
min_train_timesteps_per_iteration: 5000
target_return: 200
lr: 0.01
optimizer:
weight_decay: 0.1
betas: [0.9, 0.999]
replay_buffer_config:
capacity: 20
# model
model:
max_seq_len: 3
num_layers: 1
num_heads: 1
embed_dim: 64
# rollout
horizon: 200
# evaluation
evaluation_config:
explore: False
input: sampler
evaluation_duration: 10
evaluation_duration_unit: episodes
evaluation_interval: 1
evaluation_num_workers: 1
evaluation_parallel_to_training: True

rllib/tuned_examples/dt/pendulum-v1-dt.yaml (new file)

@@ -0,0 +1,46 @@
pendulum_dt:
env: 'Pendulum-v1'
run: DT
stop:
# We could make this higher, but given that we have 4 cpus for our tests, we will have to settle for -300.
evaluation/episode_reward_mean: -300
timesteps_total: 20000000
config:
input: 'dataset'
input_config:
paths: 'tests/data/pendulum/pendulum_expert_sac_50eps.zip'
format: 'json'
num_workers: 3
actions_in_input_normalized: True
clip_actions: True
normalize_actions: True
# training
framework: torch
train_batch_size: 512
min_train_timesteps_per_iteration: 5000
target_return: -120.0
lr: 0.0
lr_schedule: [[0, 0.0], [10000, 0.01]]
grad_clip: 1.0
optimizer:
weight_decay: 0.1
betas: [0.9, 0.999]
replay_buffer_config:
capacity: 20
# model
model:
max_seq_len: 3
num_layers: 1
num_heads: 1
embed_dim: 64
# rollout
horizon: 200
# evaluation
evaluation_config:
explore: False
input: sampler
evaluation_duration: 10
evaluation_duration_unit: episodes
evaluation_interval: 1
evaluation_num_workers: 1
evaluation_parallel_to_training: True
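Note the warm-up trick in this config: lr starts at 0.0 and lr_schedule linearly interpolates it up to 0.01 over the first 10000 schedule timesteps (which, per the global_vars note in dt.py, count trained segments rather than sampled env steps). A small sketch of the interpolation implied by the schedule format documented in DTConfig.training(), illustrative only:

def lr_at(t, schedule=((0, 0.0), (10000, 0.01))):
    # Linear interpolation between the two schedule points, clamped at the end.
    (t0, v0), (t1, v1) = schedule
    if t >= t1:
        return v1
    return v0 + (v1 - v0) * (t - t0) / (t1 - t0)

assert lr_at(0) == 0.0
assert abs(lr_at(5000) - 0.005) < 1e-9
assert lr_at(20000) == 0.01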


@@ -0,0 +1,49 @@
pendulum_medium_expert_dt:
env: 'Pendulum-v1'
run: DT
stop:
# We could make this higher, but given that we have 4 cpus for our tests, we will have to settle for -350.
evaluation/episode_reward_mean: -350
timesteps_total: 20000000
config:
input: 'dataset'
input_config:
paths: [
'tests/data/pendulum/pendulum_expert_sac_50eps.zip',
'tests/data/pendulum/pendulum_medium_sac_50eps.zip',
]
format: 'json'
num_workers: 3
actions_in_input_normalized: True
clip_actions: True
normalize_actions: True
# training
framework: torch
train_batch_size: 512
min_train_timesteps_per_iteration: 5000
target_return: -120.0
lr: 0.0
lr_schedule: [[0, 0.0], [100000, 0.01]]
grad_clip: 1.0
optimizer:
weight_decay: 0.1
betas: [0.9, 0.999]
replay_buffer_config:
capacity: 20
# model
model:
max_seq_len: 3
num_layers: 1
num_heads: 1
embed_dim: 64
# rollout
horizon: 200
# evaluation
evaluation_config:
explore: False
input: sampler
evaluation_duration: 10
evaluation_duration_unit: episodes
evaluation_interval: 1
evaluation_num_workers: 1
evaluation_parallel_to_training: True
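Both tuned examples are wired into the learning tests added to the BUILD file above; locally they can be run the same way, e.g. python tests/run_regression_tests.py --yaml-dir=tuned_examples/dt from the rllib/ directory (the offline data paths in the yamls are relative to that directory).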