[RLlib] Fix all example scripts to run on GPUs. (#11105)

Sven Mika 2020-10-02 23:07:44 +02:00 committed by GitHub
parent 5a42ed1848
commit c17169dc11
56 changed files with 221 additions and 98 deletions

View file

@@ -27,14 +27,7 @@ APEX_DDPG_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs(
     },
 )
-def validate_config(config):
-    if config.get("framework") == "tfe":
-        raise ValueError("APEX_DDPG does not support tf-eager yet!")
 ApexDDPGTrainer = DDPGTrainer.with_updates(
     name="APEX_DDPG",
     default_config=APEX_DDPG_DEFAULT_CONFIG,
-    validate_config=validate_config,
     execution_plan=apex_execution_plan)

View file

@@ -251,7 +251,10 @@ class KLCoeffMixin:
         self.kl_coeff_val = config["kl_coeff"]
         # The current KL value (as tf Variable for in-graph operations).
         self.kl_coeff = get_variable(
-            float(self.kl_coeff_val), tf_name="kl_coeff", trainable=False)
+            float(self.kl_coeff_val),
+            tf_name="kl_coeff",
+            trainable=False,
+            framework=config["framework"])
         # Constant target value.
         self.kl_target = config["kl_target"]
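Why passing `framework` matters here: as the `get_variable` hunk near the end of this diff shows, the utility only builds a `tf.Variable` for the tf/tf2/tfe frameworks and otherwise hands back a framework-appropriate value, so without the argument the torch and eager code paths would receive the wrong kind of object. A rough sketch of that dispatch (simplified, not RLlib's exact implementation):

def get_variable_sketch(value, framework=None, tf_name="var", trainable=False):
    # Simplified illustration of RLlib's get_variable() dispatch.
    if framework in ["tf2", "tf", "tfe"]:
        import tensorflow as tf
        return tf.Variable(value, name=tf_name, trainable=trainable)
    # torch / no framework: the caller just keeps the raw Python value.
    return value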

View file

@@ -37,7 +37,7 @@ FAKE_BATCH = {
 class TestPPO(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        ray.init(local_mode=True)
+        ray.init()
     @classmethod
     def tearDownClass(cls):

View file

@@ -166,7 +166,8 @@ class TestSAC(unittest.TestCase):
         # Set all weights (of all nets) to fixed values.
         if weights_dict is None:
-            assert fw in ["tf", "tfe"]  # Start with the tf vars-dict.
+            # Start with the tf vars-dict.
+            assert fw in ["tf2", "tf", "tfe"]
             weights_dict = policy.get_weights()
             if fw == "tfe":
                 log_alpha = weights_dict[10]

View file

@@ -1,4 +1,5 @@
 import argparse
+import os
 import ray
 from ray import tune
@@ -42,6 +43,8 @@ if __name__ == "__main__":
             "repeat_delay": 2,
         },
         "gamma": 0.99,
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", 0)),
         "num_workers": 0,
         "num_envs_per_worker": 20,
         "entropy_coeff": 0.001,

View file

@@ -11,6 +11,7 @@ This examples shows both.
 """
 import argparse
+import os
 import ray
 from ray import tune
@@ -44,7 +45,8 @@ if __name__ == "__main__":
     config = {
         "env": CorrelatedActionsEnv,
         "gamma": 0.5,
-        "num_gpus": 0,
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         "model": {
             "custom_model": "autoregressive_model",
             "custom_action_dist": "binary_autoreg_dist",
@@ -58,7 +60,7 @@ if __name__ == "__main__":
         "episode_reward_mean": args.stop_reward,
     }
-    results = tune.run(args.run, stop=stop, config=config)
+    results = tune.run(args.run, stop=stop, config=config, verbose=1)
     if args.as_test:
         check_learning_achieved(results, args.stop_reward)

View file

@@ -1,6 +1,7 @@
 """Example of using a custom model with batch norm."""
 import argparse
+import os
 import ray
 from ray import tune
@@ -32,6 +33,8 @@ if __name__ == "__main__":
         "model": {
             "custom_model": "bn_model",
         },
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         "num_workers": 0,
         "framework": "torch" if args.torch else "tf",
     }
@@ -42,7 +45,7 @@ if __name__ == "__main__":
         "episode_reward_mean": args.stop_reward,
     }
-    results = tune.run(args.run, stop=stop, config=config)
+    results = tune.run(args.run, stop=stop, config=config, verbose=1)
     if args.as_test:
         check_learning_achieved(results, args.stop_reward)

View file

@@ -1,4 +1,5 @@
 import argparse
+import os
 from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole
 from ray.rllib.utils.test_utils import check_learning_achieved
@@ -35,8 +36,11 @@ if __name__ == "__main__":
     }
     config = dict(
-        configs[args.run], **{
+        configs[args.run],
+        **{
             "env": StatelessCartPole,
+            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
             "model": {
                 "use_lstm": True,
                 "lstm_use_prev_action_reward": args.use_prev_action_reward,

View file

@@ -16,6 +16,7 @@ modifies the environment.
 import argparse
 import numpy as np
 from gym.spaces import Discrete
+import os
 import ray
 from ray import tune
@@ -90,7 +91,7 @@ def centralized_critic_postprocessing(policy,
                 sample_batch[OPPONENT_OBS], policy.device),
             convert_to_torch_tensor(
                 sample_batch[OPPONENT_ACTION], policy.device)) \
-            .detach().numpy()
+            .cpu().detach().numpy()
     else:
         sample_batch[SampleBatch.VF_PREDS] = policy.compute_central_vf(
             sample_batch[SampleBatch.CUR_OBS], sample_batch[OPPONENT_OBS],
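The extra `.cpu()` is what makes this postprocessing step GPU-safe: `.numpy()` raises a TypeError on a CUDA tensor, so the value-function output has to be detached and copied back to host memory first. A small illustration (guarded so it also runs without a GPU):

import torch

vf_preds = torch.zeros(4)
if torch.cuda.is_available():
    vf_preds = vf_preds.cuda()

# Safe on both CPU and CUDA tensors; calling vf_preds.numpy() directly
# would fail for a CUDA tensor.
print(vf_preds.cpu().detach().numpy())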
@@ -137,14 +138,22 @@ def loss_with_central_critic(policy, model, dist_class, train_batch):
     return loss
-def setup_mixins(policy, obs_space, action_space, config):
-    # copied from PPO
+def setup_tf_mixins(policy, obs_space, action_space, config):
+    # Copied from PPOTFPolicy (w/o ValueNetworkMixin).
     KLCoeffMixin.__init__(policy, config)
     EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
                                   config["entropy_coeff_schedule"])
     LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
+def setup_torch_mixins(policy, obs_space, action_space, config):
+    # Copied from PPOTorchPolicy (w/o ValueNetworkMixin).
+    TorchKLCoeffMixin.__init__(policy, config)
+    TorchEntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
+                                       config["entropy_coeff_schedule"])
+    TorchLR.__init__(policy, config["lr"], config["lr_schedule"])
 def central_vf_stats(policy, train_batch, grads):
     # Report the explained variance of the central value function.
     return {
@@ -158,7 +167,7 @@ CCPPOTFPolicy = PPOTFPolicy.with_updates(
     name="CCPPOTFPolicy",
     postprocess_fn=centralized_critic_postprocessing,
     loss_fn=loss_with_central_critic,
-    before_loss_init=setup_mixins,
+    before_loss_init=setup_tf_mixins,
     grad_stats_fn=central_vf_stats,
     mixins=[
         LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
@@ -169,7 +178,7 @@ CCPPOTorchPolicy = PPOTorchPolicy.with_updates(
     name="CCPPOTorchPolicy",
     postprocess_fn=centralized_critic_postprocessing,
     loss_fn=loss_with_central_critic,
-    before_init=setup_mixins,
+    before_init=setup_torch_mixins,
     mixins=[
         TorchLR, TorchEntropyCoeffSchedule, TorchKLCoeffMixin,
         CentralizedValueMixin
@@ -188,7 +197,7 @@ CCTrainer = PPOTrainer.with_updates(
 )
 if __name__ == "__main__":
-    ray.init(local_mode=True)
+    ray.init()
     args = parser.parse_args()
     ModelCatalog.register_custom_model(
@@ -198,6 +207,8 @@ if __name__ == "__main__":
     config = {
         "env": TwoStepGame,
         "batch_mode": "complete_episodes",
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         "num_workers": 0,
         "multiagent": {
             "policies": {
@@ -222,7 +233,7 @@ if __name__ == "__main__":
         "episode_reward_mean": args.stop_reward,
     }
-    results = tune.run(CCTrainer, config=config, stop=stop)
+    results = tune.run(CCTrainer, config=config, stop=stop, verbose=1)
     if args.as_test:
         check_learning_achieved(results, args.stop_reward)

View file

@@ -12,6 +12,7 @@ modifies the policy to add a centralized value function.
 import numpy as np
 from gym.spaces import Dict, Discrete
 import argparse
+import os
 from ray import tune
 from ray.rllib.agents.callbacks import DefaultCallbacks
@@ -87,6 +88,8 @@ if __name__ == "__main__":
         "env": TwoStepGame,
         "batch_mode": "complete_episodes",
         "callbacks": FillInActions,
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         "num_workers": 0,
         "multiagent": {
             "policies": {

View file

@@ -8,7 +8,9 @@ For PyTorch / TF eager mode, use the --torch and --eager flags.
 """
 import argparse
+import os
+import ray
 from ray import tune
 from ray.rllib.models import ModelCatalog
 from ray.rllib.examples.env.simple_rpg import SimpleRPG
@@ -17,9 +19,10 @@ from ray.rllib.examples.models.simple_rpg_model import CustomTorchRPGModel, \
 parser = argparse.ArgumentParser()
 parser.add_argument(
-    "--framework", choices=["tf", "tfe", "torch"], default="tf")
+    "--framework", choices=["tf2", "tf", "tfe", "torch"], default="tf2")
 if __name__ == "__main__":
+    ray.init()
     args = parser.parse_args()
     if args.framework == "torch":
         ModelCatalog.register_custom_model("my_model", CustomTorchRPGModel)
@@ -31,6 +34,8 @@ if __name__ == "__main__":
         "env": SimpleRPG,
         "rollout_fragment_length": 1,
         "train_batch_size": 2,
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         "num_workers": 0,
         "model": {
             "custom_model": "my_model",

View file

@@ -11,6 +11,7 @@ import argparse
 import gym
 from gym.spaces import Discrete, Box
 import numpy as np
+import os
 import ray
 from ray import tune
@@ -114,6 +115,8 @@ if __name__ == "__main__":
         "env_config": {
             "corridor_length": 5,
         },
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         "model": {
             "custom_model": "my_model",
         },

View file

@@ -67,6 +67,7 @@ Result for PG_SimpleCorridor_0de4e686:
 """
 import argparse
+import os
 import ray
 from ray import tune
@@ -137,7 +138,9 @@ if __name__ == "__main__":
             "corridor_length": 10,
         },
         "horizon": 20,
+        "log_level": "INFO",
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         # Training rollouts will be collected using just the learner
         # process, but evaluation will be done in parallel with two

View file

@@ -5,6 +5,7 @@ for running perf microbenchmarks.
 """
 import argparse
+import os
 import ray
 import ray.tune as tune
@@ -32,7 +33,8 @@ if __name__ == "__main__":
         "model": {
             "custom_model": "fast_model"
         },
-        "num_gpus": 0,
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         "num_workers": 2,
         "num_envs_per_worker": 10,
         "num_data_loader_buffers": 1,
@@ -40,7 +42,7 @@ if __name__ == "__main__":
         "broadcast_interval": 50,
         "rollout_fragment_length": 100,
         "train_batch_size": sample_from(
-            lambda spec: 1000 * max(1, spec.config.num_gpus)),
+            lambda spec: 1000 * max(1, spec.config.num_gpus or 1)),
         "fake_sampler": True,
         "framework": "torch" if args.torch else "tf",
     }
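The added `or 1` is not redundant with the `max(1, ...)`: `max()` cannot compare an int against `None`, so the fallback keeps the lambda from crashing when `num_gpus` resolves to None in the spec, while `max(1, ...)` still bounds the multiplier from below when it is 0. A quick check:

num_gpus = None
print(1000 * max(1, num_gpus or 1))  # -> 1000
# print(1000 * max(1, num_gpus))     # TypeError: comparison of NoneType and int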
@@ -50,6 +52,6 @@ if __name__ == "__main__":
         "timesteps_total": args.stop_timesteps,
     }
-    tune.run("IMPALA", config=config, stop=stop)
+    tune.run("IMPALA", config=config, stop=stop, verbose=1)
     ray.shutdown()

View file

@@ -1,6 +1,7 @@
 """Example of using a custom ModelV2 Keras-style model."""
 import argparse
+import os
 import ray
 from ray import tune
@@ -119,11 +120,12 @@ if __name__ == "__main__":
         args.run,
         stop={"episode_reward_mean": args.stop},
         config=dict(
-            extra_config, **{
-                "log_level": "INFO",
+            extra_config,
+            **{
                 "env": "BreakoutNoFrameskip-v4"
                 if args.use_vision_network else "CartPole-v0",
-                "num_gpus": 0,
+                # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+                "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
                 "callbacks": {
                     "on_train_result": check_has_custom_metric,
                 },

View file

@@ -50,6 +50,8 @@ if __name__ == "__main__":
     config = {
         "env": "CartPole-v0",
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         "num_workers": 0,
         "model": {
             "custom_model": "custom_loss",
@@ -64,4 +66,4 @@ if __name__ == "__main__":
         "training_iteration": args.stop_iters,
     }
-    tune.run("PG", config=config, stop=stop)
+    tune.run("PG", config=config, stop=stop, verbose=1)

View file

@@ -7,14 +7,19 @@ custom metric.
 from typing import Dict
 import argparse
 import numpy as np
+import os
 import ray
 from ray import tune
+from ray.rllib.agents.callbacks import DefaultCallbacks
 from ray.rllib.env import BaseEnv
+from ray.rllib.evaluation import MultiAgentEpisode, RolloutWorker
 from ray.rllib.policy import Policy
 from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.evaluation import MultiAgentEpisode, RolloutWorker
-from ray.rllib.agents.callbacks import DefaultCallbacks
+parser = argparse.ArgumentParser()
+parser.add_argument("--torch", action="store_true")
+parser.add_argument("--stop-iters", type=int, default=2000)
 class MyCallbacks(DefaultCallbacks):
@@ -65,8 +70,6 @@ class MyCallbacks(DefaultCallbacks):
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--stop-iters", type=int, default=2000)
     args = parser.parse_args()
     ray.init()
@@ -79,7 +82,9 @@ if __name__ == "__main__":
             "env": "CartPole-v0",
             "num_envs_per_worker": 2,
             "callbacks": MyCallbacks,
-            "framework": "tf",
+            "framework": "torch" if args.torch else "tf",
+            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         }).trials
     # verify custom metrics for integration tests

View file

@@ -2,6 +2,7 @@
 import argparse
 import numpy as np
+import os
 import ray
 from ray import tune
@@ -73,6 +74,8 @@ if __name__ == "__main__":
                 "on_postprocess_traj": on_postprocess_traj,
             },
             "framework": "tf",
+            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         }).trials
     # verify custom metrics for integration tests

View file

@@ -1,6 +1,7 @@
 """Example of using a custom RNN keras model."""
 import argparse
+import os
 import ray
 from ray import tune
@@ -37,6 +38,8 @@ if __name__ == "__main__":
             "repeat_delay": 2,
         },
         "gamma": 0.9,
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         "num_workers": 0,
         "num_envs_per_worker": 20,
         "entropy_coeff": 0.001,

View file

@@ -1,4 +1,5 @@
 import argparse
+import os
 import ray
 from ray import tune
@@ -50,6 +51,8 @@ if __name__ == "__main__":
         stop={"training_iteration": args.stop_iters},
         config={
             "env": "CartPole-v0",
+            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
             "num_workers": 2,
             "framework": "tf",
         })

View file

@@ -1,4 +1,5 @@
 import argparse
+import os
 import ray
 from ray import tune
@@ -36,6 +37,8 @@ if __name__ == "__main__":
         stop={"training_iteration": args.stop_iters},
         config={
             "env": "CartPole-v0",
+            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
             "num_workers": 2,
             "framework": "torch",
         })

View file

@@ -6,6 +6,7 @@ This example shows:
 You can visualize experiment results in ~/ray_results using TensorBoard.
 """
 import argparse
+import os
 import ray
 from ray import tune
@@ -43,6 +44,8 @@ if __name__ == "__main__":
     args = parser.parse_args()
     config = {
         "lr": 0.01,
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         "num_workers": 0,
         "framework": "torch" if args.torch else "tf",
     }

View file

@@ -1,4 +1,5 @@
 import argparse
+import os
 import random
 import ray
@@ -58,12 +59,14 @@ MyTrainer = build_trainer(
 )
 if __name__ == "__main__":
-    ray.init()
+    ray.init(local_mode=True)
     args = parser.parse_args()
     ModelCatalog.register_custom_model("eager_model", EagerModel)
     config = {
         "env": "CartPole-v0",
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         "num_workers": 0,
         "model": {
             "custom_model": "eager_model"
@@ -76,7 +79,7 @@ if __name__ == "__main__":
         "episode_reward_mean": args.stop_reward,
     }
-    results = tune.run(MyTrainer, stop=stop, config=config)
+    results = tune.run(MyTrainer, stop=stop, config=config, verbose=1)
     if args.as_test:
         check_learning_achieved(results, args.stop_reward)

View file

@@ -25,6 +25,7 @@ using --flat in this example.
 import argparse
 from gym.spaces import Discrete, Tuple
 import logging
+import os
 import ray
 from ray import tune
@@ -75,7 +76,6 @@ if __name__ == "__main__":
         config = {
             "env": HierarchicalWindyMazeEnv,
             "num_workers": 0,
-            "log_level": "INFO",
             "entropy_coeff": 0.01,
             "multiagent": {
                 "policies": {
@@ -94,6 +94,8 @@ if __name__ == "__main__":
                 "policy_mapping_fn": function(policy_mapping_fn),
             },
             "framework": "torch" if args.torch else "tf",
+            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         }
         results = tune.run("PPO", stop=stop, config=config, verbose=1)

View file

@@ -5,8 +5,9 @@
 import argparse
 from gym.spaces import Discrete, Box
 import numpy as np
-from ray.rllib.agents.ppo import PPOTrainer
+import os
+from ray import tune
 from ray.rllib.examples.env.random_env import RandomEnv
 from ray.rllib.examples.models.mobilenet_v2_with_lstm_models import \
     MobileV2PlusRNNModel, TorchMobileV2PlusRNNModel
@@ -21,6 +22,9 @@ cnn_shape_torch = (3, 224, 224)
 parser = argparse.ArgumentParser()
 parser.add_argument("--torch", action="store_true")
+parser.add_argument("--stop-iters", type=int, default=200)
+parser.add_argument("--stop-reward", type=float, default=0.0)
+parser.add_argument("--stop-timesteps", type=int, default=100000)
 if __name__ == "__main__":
     args = parser.parse_args()
@@ -30,8 +34,15 @@ if __name__ == "__main__":
         "my_model", TorchMobileV2PlusRNNModel
         if args.torch else MobileV2PlusRNNModel)
+    stop = {
+        "training_iteration": args.stop_iters,
+        "timesteps_total": args.stop_timesteps,
+        "episode_reward_mean": args.stop_reward,
+    }
     # Configure our Trainer.
     config = {
+        "env": RandomEnv,
         "framework": "torch" if args.torch else "tf",
         "model": {
             "custom_model": "my_model",
@@ -42,6 +53,8 @@ if __name__ == "__main__":
             "max_seq_len": 20,
         },
         "vf_share_layers": True,
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         "num_workers": 0,  # no parallelism
         "env_config": {
             "action_space": Discrete(2),
@@ -54,5 +67,4 @@ if __name__ == "__main__":
         },
     }
-    trainer = PPOTrainer(config=config, env=RandomEnv)
-    print(trainer.train())
+    tune.run("PPO", config=config, stop=stop, verbose=1)

View file

@@ -131,8 +131,8 @@ class TorchBinaryAutoregressiveDistribution(TorchDistributionWrapper):
     def _a1_distribution(self):
         BATCH = self.inputs.shape[0]
-        a1_logits, _ = self.model.action_module(self.inputs,
-                                                torch.zeros((BATCH, 1)))
+        zeros = torch.zeros((BATCH, 1)).to(self.inputs.device)
+        a1_logits, _ = self.model.action_module(self.inputs, zeros)
         a1_dist = TorchCategorical(a1_logits)
         return a1_dist
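Creating the zeros directly on `self.inputs.device` keeps every tensor that feeds the action module on the same device; on a GPU-placed policy, mixing a CPU-allocated helper tensor with CUDA inputs raises a device-mismatch RuntimeError. A minimal illustration of the pattern:

import torch
import torch.nn as nn

action_module = nn.Linear(2, 2)
if torch.cuda.is_available():
    action_module = action_module.cuda()

device = next(action_module.parameters()).device
inputs = torch.randn(4, 1, device=device)
# Helper tensors must follow the inputs' device, or the cat/linear below
# fails with a device-mismatch error when the model lives on the GPU.
zeros = torch.zeros((inputs.shape[0], 1)).to(inputs.device)
logits = action_module(torch.cat([inputs, zeros], dim=1))
print(logits.shape)  # torch.Size([4, 2])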

View file

@@ -116,7 +116,7 @@ class TorchCustomLossModel(TorchModelV2, nn.Module):
         # Define a secondary loss by building a graph copy with weight sharing.
         obs = restore_original_dimensions(
-            torch.from_numpy(batch["obs"]).float(),
+            torch.from_numpy(batch["obs"]).float().to(policy_loss[0].device),
             self.obs_space,
             tensorlib="torch")
         logits, _ = self.forward({"obs": obs}, [], None)
@@ -130,8 +130,8 @@ class TorchCustomLossModel(TorchModelV2, nn.Module):
         # Compute the IL loss.
         action_dist = TorchCategorical(logits, self.model_config)
-        imitation_loss = torch.mean(
-            -action_dist.logp(torch.from_numpy(batch["actions"])))
+        imitation_loss = torch.mean(-action_dist.logp(
+            torch.from_numpy(batch["actions"]).to(policy_loss[0].device)))
         self.imitation_loss_metric = imitation_loss.item()
         self.policy_loss_metric = np.mean([l.item() for l in policy_loss])

View file

@@ -57,8 +57,8 @@ class TorchFastModel(TorchModelV2, nn.Module):
                               model_config, name)
         nn.Module.__init__(self)
-        self.bias = torch.tensor(
-            [0.0], dtype=torch.float32, requires_grad=True)
+        self.bias = nn.Parameter(
+            torch.tensor([0.0], dtype=torch.float32, requires_grad=True))
         # Only needed to give some params to the optimizer (even though,
         # they are never used anywhere).
@@ -67,8 +67,9 @@ class TorchFastModel(TorchModelV2, nn.Module):
     @override(ModelV2)
     def forward(self, input_dict, state, seq_lens):
-        self._output = self.bias + \
-            torch.zeros(size=(input_dict["obs"].shape[0], self.num_outputs))
+        self._output = self.bias + torch.zeros(
+            size=(input_dict["obs"].shape[0], self.num_outputs)).to(
+                self.bias.device)
         return self._output, []
     @override(ModelV2)
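Wrapping the bias in `nn.Parameter` (instead of keeping a bare `torch.tensor` attribute) registers it with the module, so it shows up in `parameters()` for the optimizer and is moved by `.to(device)` together with the model; the `forward()` change then reuses `self.bias.device` to allocate the zeros on whatever device the model ended up on. A quick check of the registration difference:

import torch
import torch.nn as nn

class PlainTensorBias(nn.Module):
    def __init__(self):
        super().__init__()
        self.bias = torch.tensor([0.0], requires_grad=True)  # not registered

class ParameterBias(nn.Module):
    def __init__(self):
        super().__init__()
        self.bias = nn.Parameter(torch.tensor([0.0]))  # registered

print(len(list(PlainTensorBias().parameters())))  # 0 -> optimizer sees nothing
print(len(list(ParameterBias().parameters())))    # 1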

View file

@@ -89,14 +89,15 @@ class MobileV2PlusRNNModel(RecurrentNetwork):
         return tf.reshape(self._value_out, [-1])
-class TorchMobileV2PlusRNNModel(TorchRNN):
+class TorchMobileV2PlusRNNModel(TorchRNN, nn.Module):
     """A conv. + recurrent torch net example using a pre-trained MobileNet."""
     def __init__(self, obs_space, action_space, num_outputs, model_config,
                  name, cnn_shape):
-        super().__init__(obs_space, action_space, num_outputs, model_config,
-                         name)
+        TorchRNN.__init__(self, obs_space, action_space, num_outputs,
+                          model_config, name)
+        nn.Module.__init__(self)
         self.lstm_state_size = 16
         self.cnn_shape = list(cnn_shape)
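The two explicit `__init__` calls follow the usual pattern for RLlib torch models that mix a ModelV2-style base class with `nn.Module`: `nn.Module.__init__` has to run before any submodules are assigned, otherwise PyTorch raises "cannot assign module before Module.__init__() call". The general shape of the pattern, with placeholder class names:

import torch.nn as nn

class ModelBase:  # stand-in for an RLlib model base class
    def __init__(self, name):
        self.name = name

class MyRNNModel(ModelBase, nn.Module):
    def __init__(self, name):
        ModelBase.__init__(self, name)
        nn.Module.__init__(self)      # must come before submodule assignment
        self.lstm = nn.LSTM(8, 16, batch_first=True)

print(sum(p.numel() for p in MyRNNModel("m").parameters()) > 0)  # True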

View file

@@ -125,12 +125,13 @@ class TorchSharedWeightsModel(TorchModelV2, nn.Module):
             activation_fn=None,
             initializer=torch.nn.init.xavier_uniform_,
         )
+        self._global_shared_layer = TORCH_GLOBAL_SHARED_LAYER
         self._output = None
     @override(ModelV2)
     def forward(self, input_dict, state, seq_lens):
         out = self.first_layer(input_dict["obs"])
-        self._output = TORCH_GLOBAL_SHARED_LAYER(out)
+        self._output = self._global_shared_layer(out)
         model_out = self.last_layer(self._output)
         return model_out, []
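Storing the globally shared layer on `self` (rather than only calling the module-level object inside `forward`) registers it as a submodule, so its weights are part of `parameters()`/`state_dict()` and it is moved by `.to(device)` along with the rest of the model. A compact demonstration of that registration (the layer here is illustrative):

import torch.nn as nn

SHARED_LAYER = nn.Linear(4, 4)  # module-level, shared by all model instances

class SharedWeightsModel(nn.Module):
    def __init__(self):
        super().__init__()
        self._shared = SHARED_LAYER  # attribute assignment registers it

m1, m2 = SharedWeightsModel(), SharedWeightsModel()
print("_shared" in dict(m1.named_modules()))   # True: tracked as a submodule
print(m1._shared.weight is m2._shared.weight)  # True: weights are shared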

View file

@@ -11,6 +11,7 @@ execution, set the TF_TIMELINE_DIR environment variable.
 import argparse
 import gym
+import os
 import random
 import ray
@@ -75,6 +76,8 @@ if __name__ == "__main__":
             "num_agents": args.num_agents,
         },
         "simple_optimizer": args.simple,
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         "num_sgd_iter": 10,
         "multiagent": {
             "policies": policies,

View file

@@ -15,6 +15,7 @@ Result for PG_multi_cartpole_0:
 import argparse
 import gym
+import os
 import ray
 from ray import tune
@@ -60,6 +61,8 @@ if __name__ == "__main__":
                 lambda agent_id: ["pg_policy", "random"][agent_id % 2]),
         },
         "framework": "torch" if args.torch else "tf",
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
     }
     results = tune.run("PG", config=config, stop=stop, verbose=1)

View file

@@ -10,6 +10,7 @@ For a simpler example, see also: multiagent_cartpole.py
 import argparse
 import gym
+import os
 import ray
 from ray.rllib.agents.dqn import DQNTrainer, DQNTFPolicy, DQNTorchPolicy
@@ -38,9 +39,9 @@ if __name__ == "__main__":
     # Simple environment with 4 independent cartpole entities
     register_env("multi_agent_cartpole",
                  lambda _: MultiAgentCartPole({"num_agents": 4}))
-    single_env = gym.make("CartPole-v0")
-    obs_space = single_env.observation_space
-    act_space = single_env.action_space
+    single_dummy_env = gym.make("CartPole-v0")
+    obs_space = single_dummy_env.observation_space
+    act_space = single_dummy_env.action_space
     # You can also have multiple policies per trainer, but here we just
     # show one each for PPO and DQN.
@@ -69,6 +70,8 @@ if __name__ == "__main__":
             # disable filters, otherwise we would need to synchronize those
             # as well to the DQN agent
             "observation_filter": "NoFilter",
+            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
             "framework": "torch" if args.torch else "tf",
         })
@@ -82,6 +85,8 @@ if __name__ == "__main__":
             },
             "gamma": 0.95,
             "n_step": 3,
+            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
             "framework": "torch" if args.torch or args.mixed_torch_tf else "tf"
         })

View file

@@ -1,5 +1,6 @@
 import argparse
 from gym.spaces import Dict, Tuple, Box, Discrete
+import os
 import ray
 import ray.tune as tune
@@ -40,6 +41,8 @@ if __name__ == "__main__":
         "gamma": 0.0,  # No history in Env (bandit problem).
         "lr": 0.0005,
         "num_envs_per_worker": 20,
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         "num_sgd_iter": 4,
         "num_workers": 0,
         "vf_loss_coeff": 0.01,

View file

@@ -15,6 +15,7 @@ Working configurations are given below.
 """
 import argparse
+import os
 import ray
 from ray import tune
@@ -55,14 +56,18 @@ if __name__ == "__main__":
     else:
         cfg = {}
-    config = dict({
-        "env": "pa_cartpole",
-        "model": {
-            "custom_model": "pa_model",
-        },
-        "num_workers": 0,
-        "framework": "torch" if args.torch else "tf",
-    }, **cfg)
+    config = dict(
+        {
+            "env": "pa_cartpole",
+            "model": {
+                "custom_model": "pa_model",
+            },
+            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
+            "num_workers": 0,
+            "framework": "torch" if args.torch else "tf",
+        },
+        **cfg)
     stop = {
         "training_iteration": args.stop_iters,

View file

@@ -1,15 +1,13 @@
 from copy import deepcopy
-import ray
-try:
-    from ray.rllib.agents.agent import get_agent_class
-except ImportError:
-    from ray.rllib.agents.registry import get_agent_class
-from ray.tune.registry import register_env
-from ray.rllib.env import PettingZooEnv
+from numpy import float32
+import os
 from pettingzoo.butterfly import pistonball_v0
 from supersuit import normalize_obs_v0, dtype_v0, color_reduction_v0
-from numpy import float32
+import ray
+from ray.rllib.agents.registry import get_agent_class
+from ray.rllib.env import PettingZooEnv
+from ray.tune.registry import register_env
 if __name__ == "__main__":
     """For this script, you need:
@@ -37,7 +35,7 @@ if __name__ == "__main__":
     config = deepcopy(get_agent_class(alg_name)._default_config)
     # 2. Set environment config. This will be passed to
-    # the env_creator function via the register env lambda below
+    # the env_creator function via the register env lambda below.
     config["env_config"] = {"local_ratio": 0.5}
     # 3. Register env
@@ -58,6 +56,8 @@ if __name__ == "__main__":
         "policy_mapping_fn": lambda agent_id: "av"
     }
+    # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+    config["num_gpus"] = int(os.environ.get("RLLIB_NUM_GPUS", "0"))
     config["log_level"] = "DEBUG"
     config["num_workers"] = 1
     # Fragment length, collected at once from each worker and for each agent!

View file

@@ -9,6 +9,7 @@ This demonstrates running the following policies in competition:
 import argparse
 from gym.spaces import Discrete
+import os
 import random
 from ray import tune
@@ -63,6 +64,8 @@ def run_heuristic_vs_learned(args, use_lstm=False, trainer="PG"):
     config = {
         "env": RockPaperScissors,
         "gamma": 0.9,
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "num_workers": 0,
        "num_envs_per_worker": 4,
        "rollout_fragment_length": 10,

View file

@@ -8,6 +8,7 @@ collection and policy optimization.
 import argparse
 import gym
 import numpy as np
+import os
 import ray
 from ray import tune
@@ -32,6 +33,7 @@ class CustomPolicy(Policy):
     def __init__(self, observation_space, action_space, config):
         super().__init__(observation_space, action_space, config)
+        self.config["framework"] = None
         # example parameter
         self.w = 1.0
@@ -107,7 +109,8 @@ if __name__ == "__main__":
     tune.run(
         training_workflow,
         resources_per_trial={
-            "gpu": 1 if args.gpu else 0,
+            "gpu": 1 if args.gpu
+            or int(os.environ.get("RLLIB_FORCE_NUM_GPUS", 0)) else 0,
             "cpu": 1,
             "extra_cpu": args.num_workers,
         },
@@ -115,4 +118,5 @@ if __name__ == "__main__":
             "num_workers": args.num_workers,
            "num_iters": args.num_iters,
        },
+        verbose=1,
     )

View file

@@ -69,7 +69,7 @@ parser.add_argument(
 if __name__ == "__main__":
     args = parser.parse_args()
-    ray.init(local_mode=True)
+    ray.init()
     # Create a fake-env for the server. This env will never be used (neither
     # for sampling, nor for evaluation) and its obs/action Spaces do not

View file

@@ -10,6 +10,7 @@ See also: centralized_critic.py for centralized critic PPO on this game.
 import argparse
 from gym.spaces import Tuple, MultiDiscrete, Dict, Discrete
+import os
 import ray
 from ray import tune
@@ -77,6 +78,8 @@ if __name__ == "__main__":
                 "policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
             },
             "framework": "torch" if args.torch else "tf",
+            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         }
         group = False
     elif args.run == "QMIX":
@@ -93,11 +96,17 @@ if __name__ == "__main__":
                 "separate_state_space": True,
                 "one_hot_state_encoding": True
             },
+            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
             "framework": "torch" if args.torch else "tf",
         }
         group = True
     else:
-        config = {"framework": "torch" if args.torch else "tf"}
+        config = {
+            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
+            "framework": "torch" if args.torch else "tf",
+        }
         group = False
     ray.init(num_cpus=args.num_cpus or None)

View file

@@ -7,6 +7,7 @@ via a custom training workflow.
 import argparse
 import gym
+import os
 import ray
 from ray import tune
@@ -139,6 +140,8 @@ if __name__ == "__main__":
             "policy_mapping_fn": policy_mapping_fn,
             "policies_to_train": ["dqn_policy", "ppo_policy"],
         },
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         "framework": "torch" if args.torch else "tf",
     }

View file

@@ -21,6 +21,7 @@ $ python unity3d_env_local.py --env 3DBall --stop-reward [..] [--torch]?
 """
 import argparse
+import os
 import ray
 from ray import tune
@@ -99,6 +100,8 @@ if __name__ == "__main__":
         "gamma": 0.99,
         "sgd_minibatch_size": 256,
         "train_batch_size": 4000,
+        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
+        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
         "num_sgd_iter": 20,
         "rollout_fragment_length": 200,
         "clip_param": 0.2,

View file

@@ -307,7 +307,7 @@ class ModelCatalog:
             model_cls = ModelCatalog._wrap_if_needed(model_cls,
                                                      model_interface)
-            if framework in ["tf", "tfe"]:
+            if framework in ["tf2", "tf", "tfe"]:
                 # Track and warn if vars were created but not registered.
                 created = set()

View file

@@ -423,7 +423,7 @@ class TestDistributions(unittest.TestCase):
     def test_gumbel_softmax(self):
         """Tests the GumbelSoftmax ActionDistribution (tf + eager only)."""
         for fw, sess in framework_iterator(
-                frameworks=["tf", "tfe"], session=True):
+                frameworks=("tf2", "tf", "tfe"), session=True):
             batch_size = 1000
             num_categories = 5
             input_space = Box(-1.0, 1.0, shape=(batch_size, num_categories))

View file

@@ -200,7 +200,7 @@ def build_eager_tf_policy(name,
     class eager_policy_cls(base):
         def __init__(self, observation_space, action_space, config):
             assert tf.executing_eagerly()
-            self.framework = "tfe"
+            self.framework = config.get("framework", "tfe")
             Policy.__init__(self, observation_space, action_space, config)
             self._is_training = False
             self._loss_initialized = False

View file

@@ -88,7 +88,7 @@ class SampleBatch:
     @staticmethod
     @PublicAPI
-    def concat_samples(samples: List[Dict[str, TensorType]]) -> \
+    def concat_samples(samples: List["SampleBatch"]) -> \
             Union["SampleBatch", "MultiAgentBatch"]:
         """Concatenates n data dicts or MultiAgentBatches.

View file

@@ -392,7 +392,7 @@ class TorchPolicy(Policy):
             grad_info["allreduce_latency"] += time.time() - start
-        # Step the optimizer
+        # Step the optimizers.
         for i, opt in enumerate(self._optimizers):
             opt.step()

View file

@@ -66,7 +66,8 @@ def build_torch_policy(
         mixins: Optional[List[type]] = None,
         view_requirements_fn: Optional[Callable[[], Dict[
             str, ViewRequirement]]] = None,
-        get_batch_divisibility_req: Optional[Callable[[Policy], int]] = None):
+        get_batch_divisibility_req: Optional[Callable[[Policy], int]] = None
+) -> Type[TorchPolicy]:
     """Helper function for creating a torch policy class at runtime.
     Args:
@@ -167,7 +168,8 @@ def build_torch_policy(
             sample batches. If None, will assume a value of 1.
     Returns:
-        type: TorchPolicy child class constructed from the specified args.
+        Type[TorchPolicy]: TorchPolicy child class constructed from the
+            specified args.
     """
     original_kwargs = locals().copy()
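For context on the new return annotation, a skeletal and purely illustrative use of the template: `build_torch_policy` returns a `TorchPolicy` subclass assembled from the supplied functions, not a policy instance. The loss function and its signature below are placeholders, not RLlib code.

import torch
from ray.rllib.policy.torch_policy_template import build_torch_policy

def dummy_loss(policy, model, dist_class, train_batch):
    # Illustrative only: a constant, differentiable scalar "loss".
    return torch.zeros((), requires_grad=True)

# The result is a class (a TorchPolicy subclass) that trainers instantiate later.
MyTorchPolicy = build_torch_policy(name="MyTorchPolicy", loss_fn=dummy_loss)
print(MyTorchPolicy)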

View file

@@ -96,7 +96,7 @@ def ckpt_restore_test(alg_name, tfe=False):
         if optim_state:
             s2 = alg2.get_policy().get_state().get("_optimizer_variables")
             # Tf -> Compare states 1:1.
-            if fw in ["tf", "tfe"]:
+            if fw in ["tf2", "tf", "tfe"]:
                 check(s2, optim_state)
             # For torch, optimizers have state_dicts with keys=params,
             # which are different for the two models (ignore these

View file

@@ -59,22 +59,22 @@ __all__ = [
     "add_mixins",
     "check",
     "check_compute_single_action",
+    "deep_update",
     "deprecation_warning",
     "fc",
     "force_list",
     "force_tuple",
     "framework_iterator",
     "lstm",
-    "one_hot",
-    "relu",
-    "sigmoid",
-    "softmax",
-    "deep_update",
     "merge_dicts",
+    "one_hot",
     "override",
+    "relu",
     "renamed_function",
     "renamed_agent",
     "renamed_class",
+    "sigmoid",
+    "softmax",
     "try_import_tf",
     "try_import_tfp",
     "try_import_torch",

View file

@@ -211,7 +211,7 @@ class Curiosity(Exploration):
         })
         phi, next_phi = torch.chunk(phis, 2)
         actions_tensor = torch.from_numpy(
-            sample_batch[SampleBatch.ACTIONS]).long()
+            sample_batch[SampleBatch.ACTIONS]).long().to(policy.device)
         # Predict next phi with forward model.
         predicted_next_phi = self.model._curiosity_forward_fcnet(

View file

@@ -57,7 +57,7 @@ class EpsilonGreedy(Exploration):
             0, framework=framework, tf_name="timestep")
         # Build the tf-info-op.
-        if self.framework in ["tf", "tfe"]:
+        if self.framework in ["tf2", "tf", "tfe"]:
             self._tf_info_op = self.get_info()
     @override(Exploration)
@@ -68,7 +68,7 @@ class EpsilonGreedy(Exploration):
                    explore: bool = True):
         q_values = action_distribution.inputs
-        if self.framework in ["tf", "tfe"]:
+        if self.framework in ["tf2", "tf", "tfe"]:
             return self._get_tf_exploration_action_op(q_values, explore,
                                                       timestep)
         else:

View file

@@ -72,7 +72,7 @@ class GaussianNoise(Exploration):
             0, framework=self.framework, tf_name="timestep")
         # Build the tf-info-op.
-        if self.framework in ["tf", "tfe"]:
+        if self.framework in ["tf2", "tf", "tfe"]:
             self._tf_info_op = self.get_info()
     @override(Exploration)

View file

@@ -291,7 +291,7 @@ class ParameterNoise(Exploration):
         """Samples new noise and stores it in `self.noise`."""
         if self.framework == "tf":
             tf_sess.run(self.tf_sample_new_noise_op)
-        elif self.framework == "tfe":
+        elif self.framework in ["tfe", "tf2"]:
             self._tf_sample_new_noise_op()
         else:
             for i in range(len(self.noise)):
@@ -340,7 +340,7 @@ class ParameterNoise(Exploration):
         # Add stored noise to the model's parameters.
         if self.framework == "tf":
             tf_sess.run(self.tf_add_stored_noise_op)
-        elif self.framework == "tfe":
+        elif self.framework in ["tf2", "tfe"]:
             self._tf_add_stored_noise_op()
         else:
             for i in range(len(self.noise)):
@@ -378,7 +378,7 @@ class ParameterNoise(Exploration):
         # Removes the stored noise from the model's parameters.
         if self.framework == "tf":
             tf_sess.run(self.tf_remove_noise_op)
-        elif self.framework == "tfe":
+        elif self.framework in ["tf2", "tfe"]:
             self._tf_remove_noise_op()
         else:
             for var, noise in zip(self.model_variables, self.noise):

View file

@@ -46,7 +46,7 @@ class Random(Exploration):
                        timestep: Union[int, TensorType],
                        explore: bool = True):
         # Instantiate the distribution object.
-        if self.framework in ["tf", "tfe"]:
+        if self.framework in ["tf2", "tf", "tfe"]:
             return self.get_tf_exploration_action_op(action_distribution,
                                                      explore)
         else:

View file

@@ -190,7 +190,7 @@ def get_variable(value,
         any: A framework-specific variable (tf.Variable, torch.tensor, or
             python primitive).
     """
-    if framework in ["tf", "tfe"]:
+    if framework in ["tf2", "tf", "tfe"]:
         import tensorflow as tf
         dtype = dtype or getattr(
             value, "dtype", tf.float32