diff --git a/rllib/BUILD b/rllib/BUILD
index 89f0b4403..41ff14485 100644
--- a/rllib/BUILD
+++ b/rllib/BUILD
@@ -11,10 +11,9 @@
 # Currently we have the following categories:
 
 # - Learning tests/regression, tagged:
-#   -- "learning_tests_[tf|tf2|torch]": Distinguish tf/tf2 vs torch.
-#   -- "learning_tests_[discrete|continuous]_[tf|tf2|torch]": distinguish discrete
-#      actions vs continuous actions AND tf vs torch.
-#   -- "fake_gpus_[tf|torch]": Tests that run using 2 fake GPUs.
+#   -- "learning_tests_[discrete|continuous]": distinguish discrete
+#      actions vs continuous actions.
+#   -- "fake_gpus": Tests that run using 2 fake GPUs.
 
 # - Quick agent compilation/tune-train tests, tagged "quick_train".
 #   NOTE: These should be obsoleted in favor of "trainers_dir" tests as
@@ -413,6 +412,37 @@ py_test(
     args = ["--yaml-dir=tuned_examples/ppo"]
 )
 
+# QMIX
+py_test(
+    name = "learning_tests_two_step_game_qmix",
+    main = "tests/run_regression_tests.py",
+    tags = ["team:ml", "learning_tests", "learning_tests_discrete"],
+    size = "large",
+    srcs = ["tests/run_regression_tests.py"],
+    data = ["tuned_examples/qmix/two-step-game-qmix.yaml"],
+    args = ["--yaml-dir=tuned_examples/qmix", "--framework=torch"]
+)
+
+py_test(
+    name = "learning_tests_two_step_game_qmix_vdn_mixer",
+    main = "tests/run_regression_tests.py",
+    tags = ["team:ml", "learning_tests", "learning_tests_discrete"],
+    size = "large",
+    srcs = ["tests/run_regression_tests.py"],
+    data = ["tuned_examples/qmix/two-step-game-qmix-vdn-mixer.yaml"],
+    args = ["--yaml-dir=tuned_examples/qmix", "--framework=torch"]
+)
+
+py_test(
+    name = "learning_tests_two_step_game_qmix_no_mixer",
+    main = "tests/run_regression_tests.py",
+    tags = ["team:ml", "learning_tests", "learning_tests_discrete"],
+    size = "large",
+    srcs = ["tests/run_regression_tests.py"],
+    data = ["tuned_examples/qmix/two-step-game-qmix-no-mixer.yaml"],
+    args = ["--yaml-dir=tuned_examples/qmix", "--framework=torch"]
+)
+
 # R2D2
 py_test(
     name = "learning_tests_stateless_cartpole_r2d2",
@@ -2683,15 +2713,6 @@ py_test(
     args = ["--as-test", "--framework=torch", "--stop-reward=7", "--run=PG"]
 )
 
-py_test(
-    name = "examples/two_step_game_qmix",
-    main = "examples/two_step_game.py",
-    tags = ["team:ml", "examples", "examples_T"],
-    size = "large",
-    srcs = ["examples/two_step_game.py"],
-    args = ["--as-test", "--framework=torch", "--stop-reward=7", "--run=QMIX"]
-)
-
 py_test(
     name = "contrib/bandits/examples/lin_ts",
     main = "contrib/bandits/examples/simple_context_bandit.py",
diff --git a/rllib/agents/qmix/qmix.py b/rllib/agents/qmix/qmix.py
index 17cd77f51..a53ee96c1 100644
--- a/rllib/agents/qmix/qmix.py
+++ b/rllib/agents/qmix/qmix.py
@@ -34,8 +34,9 @@ DEFAULT_CONFIG = with_common_config({
         "type": "EpsilonGreedy",
         # Config for the Exploration class' constructor:
         "initial_epsilon": 1.0,
-        "final_epsilon": 0.02,
-        "epsilon_timesteps": 10000,  # Timesteps over which to anneal epsilon.
+        "final_epsilon": 0.01,
+        # Timesteps over which to anneal epsilon.
+        "epsilon_timesteps": 40000,
 
         # For soft_q, use:
         # "exploration_config" = {
diff --git a/rllib/agents/qmix/tests/test_qmix.py b/rllib/agents/qmix/tests/test_qmix.py
index c5dad9d98..f141a6d8c 100644
--- a/rllib/agents/qmix/tests/test_qmix.py
+++ b/rllib/agents/qmix/tests/test_qmix.py
@@ -24,9 +24,10 @@ class AvailActionsTestEnv(MultiAgentEnv):
 
     def __init__(self, env_config):
         self.state = None
-        self.avail = env_config["avail_action"]
+        self.avail = env_config.get("avail_actions", [3])
         self.action_mask = np.array([0] * 10)
-        self.action_mask[env_config["avail_action"]] = 1
+        for a in self.avail:
+            self.action_mask[a] = 1
 
     def reset(self):
         self.state = 0
@@ -34,22 +35,31 @@ class AvailActionsTestEnv(MultiAgentEnv):
             "agent_1": {
                 "obs": self.observation_space["obs"].sample(),
                 "action_mask": self.action_mask
-            }
+            },
+            "agent_2": {
+                "obs": self.observation_space["obs"].sample(),
+                "action_mask": self.action_mask
+            },
         }
 
     def step(self, action_dict):
         if self.state > 0:
-            assert action_dict["agent_1"] == self.avail, \
+            assert (action_dict["agent_1"] in self.avail and
+                    action_dict["agent_2"] in self.avail), \
                 "Failed to obey available actions mask!"
         self.state += 1
-        rewards = {"agent_1": 1}
+        rewards = {"agent_1": 1, "agent_2": 0.5}
         obs = {
             "agent_1": {
                 "obs": self.observation_space["obs"].sample(),
                 "action_mask": self.action_mask
+            },
+            "agent_2": {
+                "obs": self.observation_space["obs"].sample(),
+                "action_mask": self.action_mask
             }
         }
-        dones = {"__all__": self.state > 20}
+        dones = {"__all__": self.state >= 20}
         return obs, rewards, dones, {}
 
 
@@ -64,28 +74,33 @@ class TestQMix(unittest.TestCase):
 
     def test_avail_actions_qmix(self):
         grouping = {
-            "group_1": ["agent_1"],  # trivial grouping for testing
+            "group_1": ["agent_1", "agent_2"],
         }
-        obs_space = Tuple([AvailActionsTestEnv.observation_space])
-        act_space = Tuple([AvailActionsTestEnv.action_space])
+        obs_space = Tuple([
+            AvailActionsTestEnv.observation_space,
+            AvailActionsTestEnv.observation_space
+        ])
+        act_space = Tuple([
+            AvailActionsTestEnv.action_space, AvailActionsTestEnv.action_space
+        ])
         register_env(
             "action_mask_test",
             lambda config: AvailActionsTestEnv(config).with_agent_groups(
                 grouping, obs_space=obs_space, act_space=act_space))
 
-        agent = QMixTrainer(
+        trainer = QMixTrainer(
             env="action_mask_test",
             config={
                 "num_envs_per_worker": 5,  # test with vectorization on
                 "env_config": {
-                    "avail_action": 3,
+                    "avail_actions": [3, 4, 8],
                 },
                 "framework": "torch",
             })
         for _ in range(4):
-            agent.train()  # OK if it doesn't trip the action assertion error
-        assert agent.train()["episode_reward_mean"] == 21.0
-        agent.stop()
+            trainer.train()  # OK if it doesn't trip the action assertion error
+        assert trainer.train()["episode_reward_mean"] == 30.0
+        trainer.stop()
         ray.shutdown()
 
 
diff --git a/rllib/examples/env/two_step_game.py b/rllib/examples/env/two_step_game.py
index a6a50607b..221c64e59 100644
--- a/rllib/examples/env/two_step_game.py
+++ b/rllib/examples/env/two_step_game.py
@@ -1,4 +1,4 @@
-from gym.spaces import MultiDiscrete, Dict, Discrete
+from gym.spaces import Dict, Discrete, MultiDiscrete, Tuple
 import numpy as np
 
 from ray.rllib.env.multi_agent_env import MultiAgentEnv, ENV_STATE
@@ -109,3 +109,23 @@ class TwoStepGame(MultiAgentEnv):
             return np.concatenate([self.state, [2]])
         else:
             return np.flatnonzero(self.state)[0] + 3
+
+
+class TwoStepGameWithGroupedAgents(MultiAgentEnv):
+    def __init__(self, env_config):
+        env = TwoStepGame(env_config)
+        tuple_obs_space = Tuple([env.observation_space, env.observation_space])
+        tuple_act_space = Tuple([env.action_space, env.action_space])
+
+        self.env = env.with_agent_groups(
+            groups={"agents": [0, 1]},
+            obs_space=tuple_obs_space,
+            act_space=tuple_act_space)
+        self.observation_space = self.env.observation_space
+        self.action_space = self.env.action_space
+
+    def reset(self):
+        return self.env.reset()
+
+    def step(self, actions):
+        return self.env.step(actions)
diff --git a/rllib/examples/two_step_game.py b/rllib/examples/two_step_game.py
index 5e90c3a0b..9d7927e63 100644
--- a/rllib/examples/two_step_game.py
+++ b/rllib/examples/two_step_game.py
@@ -14,7 +14,7 @@ import os
 
 import ray
 from ray import tune
-from ray.tune import register_env, grid_search
+from ray.tune import register_env
 from ray.rllib.env.multi_agent_env import ENV_STATE
 from ray.rllib.examples.env.two_step_game import TwoStepGame
 from ray.rllib.policy.policy import PolicySpec
@@ -32,6 +32,12 @@ parser.add_argument(
     default="tf",
     help="The DL framework specifier.")
 parser.add_argument("--num-cpus", type=int, default=0)
+parser.add_argument(
+    "--mixer",
+    type=str,
+    default="qmix",
+    choices=["qmix", "vdn", "none"],
+    help="The mixer model to use.")
 parser.add_argument(
     "--as-test",
     action="store_true",
@@ -45,12 +51,12 @@ parser.add_argument(
 parser.add_argument(
     "--stop-timesteps",
     type=int,
-    default=50000,
+    default=70000,
     help="Number of timesteps to train.")
 parser.add_argument(
     "--stop-reward",
     type=float,
-    default=7.0,
+    default=8.0,
     help="Reward at which we stop training.")
 parser.add_argument(
     "--local-mode",
@@ -116,11 +122,10 @@ if __name__ == "__main__":
             "rollout_fragment_length": 4,
             "train_batch_size": 32,
             "exploration_config": {
-                "epsilon_timesteps": 5000,
-                "final_epsilon": 0.05,
+                "final_epsilon": 0.0,
             },
             "num_workers": 0,
-            "mixer": grid_search([None, "qmix"]),
+            "mixer": args.mixer,
             "env_config": {
                 "separate_state_space": True,
                 "one_hot_state_encoding": True
@@ -147,9 +152,6 @@ if __name__ == "__main__":
         "env": "grouped_twostep" if group else TwoStepGame,
     })
 
-    if args.as_test:
-        config["seed"] = 1234
-
     results = tune.run(args.run, stop=stop, config=config, verbose=2)
 
     if args.as_test:
diff --git a/rllib/tests/run_regression_tests.py b/rllib/tests/run_regression_tests.py
index cfc0ba8f3..a65403881 100644
--- a/rllib/tests/run_regression_tests.py
+++ b/rllib/tests/run_regression_tests.py
@@ -53,6 +53,10 @@ parser.add_argument(
 if __name__ == "__main__":
     args = parser.parse_args()
 
+    # Error if deprecated --torch option used.
+    if args.torch:
+        deprecation_warning(old="--torch", new="--framework=torch", error=True)
+
     # Bazel regression test mode: Get path to look for yaml files.
     # Get the path or single file to use.
     rllib_dir = Path(__file__).parent.parent
@@ -81,13 +85,14 @@ if __name__ == "__main__":
         assert len(experiments) == 1,\
             "Error, can only run a single experiment per yaml file!"
 
-        # Add torch option to exp config.
         exp = list(experiments.values())[0]
         exp["config"]["framework"] = args.framework
-        if args.torch:
-            deprecation_warning(old="--torch", new="--framework=torch")
-            exp["config"]["framework"] = "torch"
-            args.framework = "torch"
+
+        # QMIX does not support tf yet -> skip.
+        if exp["run"] == "QMIX" and args.framework != "torch":
+            print(f"Skipping framework='{args.framework}' for QMIX.")
+            continue
+
         # Always run with eager-tracing when framework=tf2.
         if args.framework in ["tf2", "tfe"]:
             exp["config"]["eager_tracing"] = True
diff --git a/rllib/tests/test_io.py b/rllib/tests/test_io.py
index 480b857da..c8cb6239a 100644
--- a/rllib/tests/test_io.py
+++ b/rllib/tests/test_io.py
@@ -42,7 +42,7 @@ class AgentIOTest(unittest.TestCase):
         shutil.rmtree(self.test_dir)
         ray.shutdown()
 
-    def writeOutputs(self, output, fw):
+    def write_outputs(self, output, fw):
         agent = PGTrainer(
             env="CartPole-v0",
             config={
@@ -53,23 +53,23 @@ class AgentIOTest(unittest.TestCase):
         agent.train()
         return agent
 
-    def testAgentOutputOk(self):
+    def test_agent_output_ok(self):
         for fw in framework_iterator(frameworks=("torch", "tf")):
-            self.writeOutputs(self.test_dir, fw)
+            self.write_outputs(self.test_dir, fw)
             self.assertEqual(len(os.listdir(self.test_dir + fw)), 1)
             reader = JsonReader(self.test_dir + fw + "/*.json")
             reader.next()
 
-    def testAgentOutputLogdir(self):
+    def test_agent_output_logdir(self):
         """Test special value 'logdir' as Agent's output."""
         for fw in framework_iterator():
-            agent = self.writeOutputs("logdir", fw)
+            agent = self.write_outputs("logdir", fw)
             self.assertEqual(
                 len(glob.glob(agent.logdir + "/output-*.json")), 1)
 
-    def testAgentInputDir(self):
+    def test_agent_input_dir(self):
         for fw in framework_iterator(frameworks=("torch", "tf")):
-            self.writeOutputs(self.test_dir, fw)
+            self.write_outputs(self.test_dir, fw)
             agent = PGTrainer(
                 env="CartPole-v0",
                 config={
@@ -81,16 +81,16 @@ class AgentIOTest(unittest.TestCase):
         self.assertEqual(result["timesteps_total"], 250)  # read from input
         self.assertTrue(np.isnan(result["episode_reward_mean"]))
 
-    def testSplitByEpisode(self):
+    def test_split_by_episode(self):
         splits = SAMPLES.split_by_episode()
         self.assertEqual(len(splits), 3)
         self.assertEqual(splits[0].count, 2)
         self.assertEqual(splits[1].count, 1)
         self.assertEqual(splits[2].count, 1)
 
-    def testAgentInputPostprocessingEnabled(self):
+    def test_agent_input_postprocessing_enabled(self):
         for fw in framework_iterator(frameworks=("tf", "torch")):
-            self.writeOutputs(self.test_dir, fw)
+            self.write_outputs(self.test_dir, fw)
 
             # Rewrite the files to drop advantages and value_targets for
             # testing
@@ -100,7 +100,7 @@ class AgentIOTest(unittest.TestCase):
                     for line in f.readlines():
                         data = json.loads(line)
                         # Data won't contain rewards as these are not included
-                        # in the writeOutputs run (not needed in the
+                        # in the write_outputs run (not needed in the
                         # SampleBatch). Flip out "rewards" for "advantages"
                         # just for testing.
                         data["rewards"] = data["advantages"]
@@ -125,9 +125,9 @@ class AgentIOTest(unittest.TestCase):
         self.assertEqual(result["timesteps_total"], 250)  # read from input
         self.assertTrue(np.isnan(result["episode_reward_mean"]))
 
-    def testAgentInputEvalSim(self):
+    def test_agent_input_eval_sim(self):
         for fw in framework_iterator():
-            self.writeOutputs(self.test_dir, fw)
+            self.write_outputs(self.test_dir, fw)
             agent = PGTrainer(
                 env="CartPole-v0",
                 config={
@@ -142,9 +142,9 @@ class AgentIOTest(unittest.TestCase):
                 time.sleep(0.1)
             assert False, "did not see any simulation results"
 
-    def testAgentInputList(self):
+    def test_agent_input_list(self):
         for fw in framework_iterator(frameworks=("torch", "tf")):
-            self.writeOutputs(self.test_dir, fw)
+            self.write_outputs(self.test_dir, fw)
             agent = PGTrainer(
                 env="CartPole-v0",
                 config={
@@ -157,9 +157,9 @@ class AgentIOTest(unittest.TestCase):
         self.assertEqual(result["timesteps_total"], 250)  # read from input
         self.assertTrue(np.isnan(result["episode_reward_mean"]))
 
-    def testAgentInputDict(self):
+    def test_agent_input_dict(self):
         for fw in framework_iterator():
-            self.writeOutputs(self.test_dir, fw)
+            self.write_outputs(self.test_dir, fw)
             agent = PGTrainer(
                 env="CartPole-v0",
                 config={
@@ -174,7 +174,7 @@ class AgentIOTest(unittest.TestCase):
             result = agent.train()
             self.assertTrue(not np.isnan(result["episode_reward_mean"]))
 
-    def testMultiAgent(self):
+    def test_multi_agent(self):
         register_env("multi_agent_cartpole",
                      lambda _: MultiAgentCartPole({"num_agents": 10}))
 
@@ -234,7 +234,7 @@ class AgentIOTest(unittest.TestCase):
         ]
         for input_procedure in test_input_procedure:
             for fw in framework_iterator(frameworks=("torch", "tf")):
-                self.writeOutputs(self.test_dir, fw)
+                self.write_outputs(self.test_dir, fw)
                 agent = PGTrainer(
                     env="CartPole-v0",
                     config={
diff --git a/rllib/tuned_examples/qmix/two-step-game-qmix-no-mixer.yaml b/rllib/tuned_examples/qmix/two-step-game-qmix-no-mixer.yaml
new file mode 100644
index 000000000..f2b529211
--- /dev/null
+++ b/rllib/tuned_examples/qmix/two-step-game-qmix-no-mixer.yaml
@@ -0,0 +1,22 @@
+two-step-game-qmix-without-mixer:
+    env: ray.rllib.examples.env.two_step_game.TwoStepGameWithGroupedAgents
+    run: QMIX
+    stop:
+        episode_reward_mean: 7.0
+        timesteps_total: 70000
+    config:
+        # QMIX only supports torch for now.
+        framework: torch
+
+        env_config:
+            env_config:
+                separate_state_space: true
+                one_hot_state_encoding: true
+
+        exploration_config:
+            final_epsilon: 0.0
+
+        rollout_fragment_length: 4
+        train_batch_size: 32
+        num_workers: 0
+        mixer: null
diff --git a/rllib/tuned_examples/qmix/two-step-game-qmix-vdn-mixer.yaml b/rllib/tuned_examples/qmix/two-step-game-qmix-vdn-mixer.yaml
new file mode 100644
index 000000000..4e8baffa4
--- /dev/null
+++ b/rllib/tuned_examples/qmix/two-step-game-qmix-vdn-mixer.yaml
@@ -0,0 +1,22 @@
+two-step-game-qmix-with-vdn-mixer:
+    env: ray.rllib.examples.env.two_step_game.TwoStepGameWithGroupedAgents
+    run: QMIX
+    stop:
+        episode_reward_mean: 7.0
+        timesteps_total: 70000
+    config:
+        # QMIX only supports torch for now.
+        framework: torch
+
+        env_config:
+            env_config:
+                separate_state_space: true
+                one_hot_state_encoding: true
+
+        exploration_config:
+            final_epsilon: 0.0
+
+        rollout_fragment_length: 4
+        train_batch_size: 32
+        num_workers: 0
+        mixer: vdn
diff --git a/rllib/tuned_examples/qmix/two-step-game-qmix.yaml b/rllib/tuned_examples/qmix/two-step-game-qmix.yaml
new file mode 100644
index 000000000..36359b780
--- /dev/null
+++ b/rllib/tuned_examples/qmix/two-step-game-qmix.yaml
@@ -0,0 +1,23 @@
+two-step-game-qmix-with-qmix-mixer:
+    env: ray.rllib.examples.env.two_step_game.TwoStepGameWithGroupedAgents
+    run: QMIX
+    stop:
+        episode_reward_mean: 8.0
+        timesteps_total: 70000
+    config:
+        # QMIX only supports torch for now.
+        framework: torch
+
+        env_config:
+            env_config:
+                separate_state_space: true
+                one_hot_state_encoding: true
+
+        # W/o this setting, won't get to 8.0 reward.
+        exploration_config:
+            final_epsilon: 0.0
+
+        rollout_fragment_length: 4
+        train_batch_size: 32
+        num_workers: 0
+        mixer: qmix