diff --git a/rllib/BUILD b/rllib/BUILD
index 89f0b4403..41ff14485 100644
--- a/rllib/BUILD
+++ b/rllib/BUILD
@@ -11,10 +11,9 @@
 # Currently we have the following categories:
 
 # - Learning tests/regression, tagged:
-#   -- "learning_tests_[tf|tf2|torch]": Distinguish tf/tf2 vs torch.
-#   -- "learning_tests_[discrete|continuous]_[tf|tf2|torch]": distinguish discrete
-#      actions vs continuous actions AND tf vs torch.
-#   -- "fake_gpus_[tf|torch]": Tests that run using 2 fake GPUs.
+#   -- "learning_tests_[discrete|continuous]": distinguish discrete
+#      actions vs continuous actions.
+#   -- "fake_gpus": Tests that run using 2 fake GPUs.
 
 # - Quick agent compilation/tune-train tests, tagged "quick_train".
 #   NOTE: These should be obsoleted in favor of "trainers_dir" tests as
@@ -413,6 +412,37 @@ py_test(
     args = ["--yaml-dir=tuned_examples/ppo"]
 )
 
+# QMIX
+py_test(
+    name = "learning_tests_two_step_game_qmix",
+    main = "tests/run_regression_tests.py",
+    tags = ["team:ml", "learning_tests", "learning_tests_discrete"],
+    size = "large",
+    srcs = ["tests/run_regression_tests.py"],
+    data = ["tuned_examples/qmix/two-step-game-qmix.yaml"],
+    args = ["--yaml-dir=tuned_examples/qmix", "--framework=torch"]
+)
+
+py_test(
+    name = "learning_tests_two_step_game_qmix_vdn_mixer",
+    main = "tests/run_regression_tests.py",
+    tags = ["team:ml", "learning_tests", "learning_tests_discrete"],
+    size = "large",
+    srcs = ["tests/run_regression_tests.py"],
+    data = ["tuned_examples/qmix/two-step-game-qmix-vdn-mixer.yaml"],
+    args = ["--yaml-dir=tuned_examples/qmix", "--framework=torch"]
+)
+
+py_test(
+    name = "learning_tests_two_step_game_qmix_no_mixer",
+    main = "tests/run_regression_tests.py",
+    tags = ["team:ml", "learning_tests", "learning_tests_discrete"],
+    size = "large",
+    srcs = ["tests/run_regression_tests.py"],
+    data = ["tuned_examples/qmix/two-step-game-qmix-no-mixer.yaml"],
+    args = ["--yaml-dir=tuned_examples/qmix", "--framework=torch"]
+)
+
 # R2D2
 py_test(
     name = "learning_tests_stateless_cartpole_r2d2",
@@ -2683,15 +2713,6 @@ py_test(
     args = ["--as-test", "--framework=torch", "--stop-reward=7", "--run=PG"]
 )
 
-py_test(
-    name = "examples/two_step_game_qmix",
-    main = "examples/two_step_game.py",
-    tags = ["team:ml", "examples", "examples_T"],
-    size = "large",
-    srcs = ["examples/two_step_game.py"],
-    args = ["--as-test", "--framework=torch", "--stop-reward=7", "--run=QMIX"]
-)
-
 py_test(
     name = "contrib/bandits/examples/lin_ts",
     main = "contrib/bandits/examples/simple_context_bandit.py",
diff --git a/rllib/agents/qmix/qmix.py b/rllib/agents/qmix/qmix.py
index 17cd77f51..a53ee96c1 100644
--- a/rllib/agents/qmix/qmix.py
+++ b/rllib/agents/qmix/qmix.py
@@ -34,8 +34,9 @@ DEFAULT_CONFIG = with_common_config({
         "type": "EpsilonGreedy",
         # Config for the Exploration class' constructor:
         "initial_epsilon": 1.0,
-        "final_epsilon": 0.02,
-        "epsilon_timesteps": 10000,  # Timesteps over which to anneal epsilon.
+        "final_epsilon": 0.01,
+        # Timesteps over which to anneal epsilon.
+        "epsilon_timesteps": 40000,
 
         # For soft_q, use:
         # "exploration_config" = {
diff --git a/rllib/agents/qmix/tests/test_qmix.py b/rllib/agents/qmix/tests/test_qmix.py
index c5dad9d98..f141a6d8c 100644
--- a/rllib/agents/qmix/tests/test_qmix.py
+++ b/rllib/agents/qmix/tests/test_qmix.py
@@ -24,9 +24,10 @@ class AvailActionsTestEnv(MultiAgentEnv):
 
     def __init__(self, env_config):
         self.state = None
-        self.avail = env_config["avail_action"]
+        self.avail = env_config.get("avail_actions", [3])
         self.action_mask = np.array([0] * 10)
-        self.action_mask[env_config["avail_action"]] = 1
+        for a in self.avail:
+            self.action_mask[a] = 1
 
     def reset(self):
         self.state = 0
@@ -34,22 +35,31 @@ class AvailActionsTestEnv(MultiAgentEnv):
             "agent_1": {
                 "obs": self.observation_space["obs"].sample(),
                 "action_mask": self.action_mask
-            }
+            },
+            "agent_2": {
+                "obs": self.observation_space["obs"].sample(),
+                "action_mask": self.action_mask
+            },
         }
 
     def step(self, action_dict):
         if self.state > 0:
-            assert action_dict["agent_1"] == self.avail, \
+            assert (action_dict["agent_1"] in self.avail and
+                    action_dict["agent_2"] in self.avail), \
                 "Failed to obey available actions mask!"
         self.state += 1
-        rewards = {"agent_1": 1}
+        rewards = {"agent_1": 1, "agent_2": 0.5}
         obs = {
             "agent_1": {
                 "obs": self.observation_space["obs"].sample(),
                 "action_mask": self.action_mask
+            },
+            "agent_2": {
+                "obs": self.observation_space["obs"].sample(),
+                "action_mask": self.action_mask
             }
         }
-        dones = {"__all__": self.state > 20}
+        dones = {"__all__": self.state >= 20}
         return obs, rewards, dones, {}
 
 
@@ -64,28 +74,33 @@ class TestQMix(unittest.TestCase):
 
     def test_avail_actions_qmix(self):
         grouping = {
-            "group_1": ["agent_1"],  # trivial grouping for testing
+            "group_1": ["agent_1", "agent_2"],
         }
-        obs_space = Tuple([AvailActionsTestEnv.observation_space])
-        act_space = Tuple([AvailActionsTestEnv.action_space])
+        obs_space = Tuple([
+            AvailActionsTestEnv.observation_space,
+            AvailActionsTestEnv.observation_space
+        ])
+        act_space = Tuple([
+            AvailActionsTestEnv.action_space, AvailActionsTestEnv.action_space
+        ])
         register_env(
             "action_mask_test",
             lambda config: AvailActionsTestEnv(config).with_agent_groups(
                 grouping, obs_space=obs_space, act_space=act_space))
 
-        agent = QMixTrainer(
+        trainer = QMixTrainer(
             env="action_mask_test",
             config={
                 "num_envs_per_worker": 5,  # test with vectorization on
                 "env_config": {
-                    "avail_action": 3,
+                    "avail_actions": [3, 4, 8],
                 },
                 "framework": "torch",
             })
         for _ in range(4):
-            agent.train()  # OK if it doesn't trip the action assertion error
-        assert agent.train()["episode_reward_mean"] == 21.0
-        agent.stop()
+            trainer.train()  # OK if it doesn't trip the action assertion error
+        assert trainer.train()["episode_reward_mean"] == 30.0
+        trainer.stop()
         ray.shutdown()
 
 
diff --git a/rllib/examples/env/two_step_game.py b/rllib/examples/env/two_step_game.py
index a6a50607b..221c64e59 100644
--- a/rllib/examples/env/two_step_game.py
+++ b/rllib/examples/env/two_step_game.py
@@ -1,4 +1,4 @@
-from gym.spaces import MultiDiscrete, Dict, Discrete
+from gym.spaces import Dict, Discrete, MultiDiscrete, Tuple
 import numpy as np
 
 from ray.rllib.env.multi_agent_env import MultiAgentEnv, ENV_STATE
@@ -109,3 +109,23 @@ class TwoStepGame(MultiAgentEnv):
             return np.concatenate([self.state, [2]])
         else:
             return np.flatnonzero(self.state)[0] + 3
+
+
+class TwoStepGameWithGroupedAgents(MultiAgentEnv):
+    def __init__(self, env_config):
+        env = TwoStepGame(env_config)
+        tuple_obs_space = Tuple([env.observation_space, env.observation_space])
+        tuple_act_space = Tuple([env.action_space, env.action_space])
+
+        self.env = env.with_agent_groups(
+            groups={"agents": [0, 1]},
+            obs_space=tuple_obs_space,
+            act_space=tuple_act_space)
+        self.observation_space = self.env.observation_space
+        self.action_space = self.env.action_space
+
+    def reset(self):
+        return self.env.reset()
+
+    def step(self, actions):
+        return self.env.step(actions)
diff --git a/rllib/examples/two_step_game.py b/rllib/examples/two_step_game.py
index 5e90c3a0b..9d7927e63 100644
--- a/rllib/examples/two_step_game.py
+++ b/rllib/examples/two_step_game.py
@@ -14,7 +14,7 @@ import os
 
 import ray
 from ray import tune
-from ray.tune import register_env, grid_search
+from ray.tune import register_env
 from ray.rllib.env.multi_agent_env import ENV_STATE
 from ray.rllib.examples.env.two_step_game import TwoStepGame
 from ray.rllib.policy.policy import PolicySpec
@@ -32,6 +32,12 @@ parser.add_argument(
     default="tf",
     help="The DL framework specifier.")
 parser.add_argument("--num-cpus", type=int, default=0)
+parser.add_argument(
+    "--mixer",
+    type=str,
+    default="qmix",
+    choices=["qmix", "vdn", "none"],
+    help="The mixer model to use.")
 parser.add_argument(
     "--as-test",
     action="store_true",
@@ -45,12 +51,12 @@ parser.add_argument(
 parser.add_argument(
     "--stop-timesteps",
     type=int,
-    default=50000,
+    default=70000,
     help="Number of timesteps to train.")
 parser.add_argument(
     "--stop-reward",
     type=float,
-    default=7.0,
+    default=8.0,
     help="Reward at which we stop training.")
 parser.add_argument(
     "--local-mode",
@@ -116,11 +122,10 @@ if __name__ == "__main__":
             "rollout_fragment_length": 4,
             "train_batch_size": 32,
             "exploration_config": {
-                "epsilon_timesteps": 5000,
-                "final_epsilon": 0.05,
+                "final_epsilon": 0.0,
             },
             "num_workers": 0,
-            "mixer": grid_search([None, "qmix"]),
+            "mixer": args.mixer,
             "env_config": {
                 "separate_state_space": True,
                 "one_hot_state_encoding": True
@@ -147,9 +152,6 @@ if __name__ == "__main__":
         "env": "grouped_twostep" if group else TwoStepGame,
     })
 
-    if args.as_test:
-        config["seed"] = 1234
-
     results = tune.run(args.run, stop=stop, config=config, verbose=2)
 
     if args.as_test:
diff --git a/rllib/tests/run_regression_tests.py b/rllib/tests/run_regression_tests.py
index cfc0ba8f3..a65403881 100644
--- a/rllib/tests/run_regression_tests.py
+++ b/rllib/tests/run_regression_tests.py
@@ -53,6 +53,10 @@ parser.add_argument(
 if __name__ == "__main__":
     args = parser.parse_args()
 
+    # Error if deprecated --torch option used.
+    if args.torch:
+        deprecation_warning(old="--torch", new="--framework=torch", error=True)
+
     # Bazel regression test mode: Get path to look for yaml files.
     # Get the path or single file to use.
     rllib_dir = Path(__file__).parent.parent
@@ -81,13 +85,14 @@ if __name__ == "__main__":
         assert len(experiments) == 1,\
             "Error, can only run a single experiment per yaml file!"
 
-        # Add torch option to exp config.
         exp = list(experiments.values())[0]
         exp["config"]["framework"] = args.framework
-        if args.torch:
-            deprecation_warning(old="--torch", new="--framework=torch")
-            exp["config"]["framework"] = "torch"
-            args.framework = "torch"
+
+        # QMIX does not support tf yet -> skip.
+        if exp["run"] == "QMIX" and args.framework != "torch":
+            print(f"Skipping framework='{args.framework}' for QMIX.")
+            continue
+
         # Always run with eager-tracing when framework=tf2.
         if args.framework in ["tf2", "tfe"]:
             exp["config"]["eager_tracing"] = True
diff --git a/rllib/tests/test_io.py b/rllib/tests/test_io.py
index 480b857da..c8cb6239a 100644
--- a/rllib/tests/test_io.py
+++ b/rllib/tests/test_io.py
@@ -42,7 +42,7 @@ class AgentIOTest(unittest.TestCase):
         shutil.rmtree(self.test_dir)
         ray.shutdown()
 
-    def writeOutputs(self, output, fw):
+    def write_outputs(self, output, fw):
         agent = PGTrainer(
             env="CartPole-v0",
             config={
@@ -53,23 +53,23 @@ class AgentIOTest(unittest.TestCase):
         agent.train()
         return agent
 
-    def testAgentOutputOk(self):
+    def test_agent_output_ok(self):
         for fw in framework_iterator(frameworks=("torch", "tf")):
-            self.writeOutputs(self.test_dir, fw)
+            self.write_outputs(self.test_dir, fw)
             self.assertEqual(len(os.listdir(self.test_dir + fw)), 1)
             reader = JsonReader(self.test_dir + fw + "/*.json")
             reader.next()
 
-    def testAgentOutputLogdir(self):
+    def test_agent_output_logdir(self):
         """Test special value 'logdir' as Agent's output."""
         for fw in framework_iterator():
-            agent = self.writeOutputs("logdir", fw)
+            agent = self.write_outputs("logdir", fw)
             self.assertEqual(
                 len(glob.glob(agent.logdir + "/output-*.json")), 1)
 
-    def testAgentInputDir(self):
+    def test_agent_input_dir(self):
         for fw in framework_iterator(frameworks=("torch", "tf")):
-            self.writeOutputs(self.test_dir, fw)
+            self.write_outputs(self.test_dir, fw)
             agent = PGTrainer(
                 env="CartPole-v0",
                 config={
@@ -81,16 +81,16 @@ class AgentIOTest(unittest.TestCase):
         self.assertEqual(result["timesteps_total"], 250)  # read from input
         self.assertTrue(np.isnan(result["episode_reward_mean"]))
 
-    def testSplitByEpisode(self):
+    def test_split_by_episode(self):
         splits = SAMPLES.split_by_episode()
         self.assertEqual(len(splits), 3)
         self.assertEqual(splits[0].count, 2)
         self.assertEqual(splits[1].count, 1)
         self.assertEqual(splits[2].count, 1)
 
-    def testAgentInputPostprocessingEnabled(self):
+    def test_agent_input_postprocessing_enabled(self):
         for fw in framework_iterator(frameworks=("tf", "torch")):
-            self.writeOutputs(self.test_dir, fw)
+            self.write_outputs(self.test_dir, fw)
 
             # Rewrite the files to drop advantages and value_targets for
             # testing
@@ -100,7 +100,7 @@ class AgentIOTest(unittest.TestCase):
                     for line in f.readlines():
                         data = json.loads(line)
                         # Data won't contain rewards as these are not included
-                        # in the writeOutputs run (not needed in the
+                        # in the write_outputs run (not needed in the
                         # SampleBatch). Flip out "rewards" for "advantages"
                         # just for testing.
                         data["rewards"] = data["advantages"]
@@ -125,9 +125,9 @@ class AgentIOTest(unittest.TestCase):
         self.assertEqual(result["timesteps_total"], 250)  # read from input
         self.assertTrue(np.isnan(result["episode_reward_mean"]))
 
-    def testAgentInputEvalSim(self):
+    def test_agent_input_eval_sim(self):
         for fw in framework_iterator():
-            self.writeOutputs(self.test_dir, fw)
+            self.write_outputs(self.test_dir, fw)
             agent = PGTrainer(
                 env="CartPole-v0",
                 config={
@@ -142,9 +142,9 @@ class AgentIOTest(unittest.TestCase):
                 time.sleep(0.1)
             assert False, "did not see any simulation results"
 
-    def testAgentInputList(self):
+    def test_agent_input_list(self):
         for fw in framework_iterator(frameworks=("torch", "tf")):
-            self.writeOutputs(self.test_dir, fw)
+            self.write_outputs(self.test_dir, fw)
             agent = PGTrainer(
                 env="CartPole-v0",
                 config={
@@ -157,9 +157,9 @@ class AgentIOTest(unittest.TestCase):
         self.assertEqual(result["timesteps_total"], 250)  # read from input
         self.assertTrue(np.isnan(result["episode_reward_mean"]))
 
-    def testAgentInputDict(self):
+    def test_agent_input_dict(self):
         for fw in framework_iterator():
-            self.writeOutputs(self.test_dir, fw)
+            self.write_outputs(self.test_dir, fw)
             agent = PGTrainer(
                 env="CartPole-v0",
                 config={
@@ -174,7 +174,7 @@ class AgentIOTest(unittest.TestCase):
             result = agent.train()
             self.assertTrue(not np.isnan(result["episode_reward_mean"]))
 
-    def testMultiAgent(self):
+    def test_multi_agent(self):
         register_env("multi_agent_cartpole",
                      lambda _: MultiAgentCartPole({"num_agents": 10}))
 
@@ -234,7 +234,7 @@ class AgentIOTest(unittest.TestCase):
         ]
         for input_procedure in test_input_procedure:
             for fw in framework_iterator(frameworks=("torch", "tf")):
-                self.writeOutputs(self.test_dir, fw)
+                self.write_outputs(self.test_dir, fw)
                 agent = PGTrainer(
                     env="CartPole-v0",
                     config={
diff --git a/rllib/tuned_examples/qmix/two-step-game-qmix-no-mixer.yaml b/rllib/tuned_examples/qmix/two-step-game-qmix-no-mixer.yaml
new file mode 100644
index 000000000..f2b529211
--- /dev/null
+++ b/rllib/tuned_examples/qmix/two-step-game-qmix-no-mixer.yaml
@@ -0,0 +1,22 @@
+two-step-game-qmix-without-mixer:
+    env: ray.rllib.examples.env.two_step_game.TwoStepGameWithGroupedAgents
+    run: QMIX
+    stop:
+        episode_reward_mean: 7.0
+        timesteps_total: 70000
+    config:
+        # QMIX only supports torch for now.
+        framework: torch
+
+        env_config:
+            env_config:
+                separate_state_space: true
+                one_hot_state_encoding: true
+
+        exploration_config:
+            final_epsilon: 0.0
+
+        rollout_fragment_length: 4
+        train_batch_size: 32
+        num_workers: 0
+        mixer: null
diff --git a/rllib/tuned_examples/qmix/two-step-game-qmix-vdn-mixer.yaml b/rllib/tuned_examples/qmix/two-step-game-qmix-vdn-mixer.yaml
new file mode 100644
index 000000000..4e8baffa4
--- /dev/null
+++ b/rllib/tuned_examples/qmix/two-step-game-qmix-vdn-mixer.yaml
@@ -0,0 +1,22 @@
+two-step-game-qmix-with-vdn-mixer:
+    env: ray.rllib.examples.env.two_step_game.TwoStepGameWithGroupedAgents
+    run: QMIX
+    stop:
+        episode_reward_mean: 7.0
+        timesteps_total: 70000
+    config:
+        # QMIX only supports torch for now.
+        framework: torch
+
+        env_config:
+            env_config:
+                separate_state_space: true
+                one_hot_state_encoding: true
+
+        exploration_config:
+            final_epsilon: 0.0
+
+        rollout_fragment_length: 4
+        train_batch_size: 32
+        num_workers: 0
+        mixer: vdn
diff --git a/rllib/tuned_examples/qmix/two-step-game-qmix.yaml b/rllib/tuned_examples/qmix/two-step-game-qmix.yaml
new file mode 100644
index 000000000..36359b780
--- /dev/null
+++ b/rllib/tuned_examples/qmix/two-step-game-qmix.yaml
@@ -0,0 +1,23 @@
+two-step-game-qmix-with-qmix-mixer:
+    env: ray.rllib.examples.env.two_step_game.TwoStepGameWithGroupedAgents
+    run: QMIX
+    stop:
+        episode_reward_mean: 8.0
+        timesteps_total: 70000
+    config:
+        # QMIX only supports torch for now.
+        framework: torch
+
+        env_config:
+            env_config:
+                separate_state_space: true
+                one_hot_state_encoding: true
+
+        # W/o this setting, won't get to 8.0 reward.
+        exploration_config:
+            final_epsilon: 0.0
+
+        rollout_fragment_length: 4
+        train_batch_size: 32
+        num_workers: 0
+        mixer: qmix