# ray/rllib/examples/two_step_game.py

"""The two-step game from QMIX: https://arxiv.org/pdf/1803.11485.pdf

Configurations you can try:
    - normal policy gradients (PG)
    - MADDPG
    - QMIX

See also: centralized_critic.py for centralized critic PPO on this game.
"""

import argparse
from gym.spaces import Dict, Discrete, Tuple, MultiDiscrete
import logging
import os

import ray
from ray import air, tune
from ray.tune import register_env
from ray.rllib.algorithms.qmix import QMixConfig
from ray.rllib.env.multi_agent_env import ENV_STATE
from ray.rllib.examples.env.two_step_game import TwoStepGame
from ray.rllib.policy.policy import PolicySpec
from ray.rllib.utils.test_utils import check_learning_achieved

logger = logging.getLogger(__name__)

# Command-line interface of this example script. The parsed values are
# consumed by the `__main__` section of this file.
parser = argparse.ArgumentParser()
# `--run` is compared against "MADDPG" and "QMIX" to pick a dedicated config;
# any other value falls through to a generic config for `TwoStepGame`.
parser.add_argument(
    "--run", type=str, default="PG", help="The RLlib-registered algorithm to use."
)
# Forwarded as the "framework" config key by the MADDPG and generic configs;
# the QMIX config built in this file does not read it.
parser.add_argument(
    "--framework",
    choices=["tf", "tf2", "tfe", "torch"],
    default="tf",
    help="The DL framework specifier.",
)
# 0 is a sentinel: the script passes `args.num_cpus or None` to `ray.init()`,
# so the default never requests literally zero CPUs.
parser.add_argument(
    "--num-cpus",
    type=int,
    default=0,
    help="Number of CPUs to pass to ray.init(); 0 leaves the choice to Ray.",
)
# Only read when building the QMIX config.
parser.add_argument(
    "--mixer",
    type=str,
    default="qmix",
    choices=["qmix", "vdn", "none"],
    help="The mixer model to use.",
)
parser.add_argument(
    "--as-test",
    action="store_true",
    help="Whether this script should be run as a test: --stop-reward must "
    "be achieved within --stop-timesteps AND --stop-iters.",
)
# The three `--stop-*` values together form the stopping criteria handed to
# Tune; `--stop-reward` is also the threshold checked under `--as-test`.
parser.add_argument(
    "--stop-iters", type=int, default=200, help="Number of iterations to train."
)
parser.add_argument(
    "--stop-timesteps", type=int, default=70000, help="Number of timesteps to train."
)
parser.add_argument(
    "--stop-reward", type=float, default=8.0, help="Reward at which we stop training."
)
parser.add_argument(
    "--local-mode",
    action="store_true",
    help="Init Ray in local mode for easier debugging.",
)

if __name__ == "__main__":
    # Script entry point: parse the CLI, start Ray, build an algorithm config
    # for the requested `--run` value, and train it through Tune.
    args = parser.parse_args()

    # `--num-cpus 0` (the default) is turned into None here instead of
    # requesting zero CPUs.
    ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)

    # Backward compatibility: still accept the old descriptor, but warn and
    # map it onto the current one.
    if args.run == "contrib/MADDPG":
        logger.warning(
            "`contrib/MADDPG` is no longer a valid algorithm descriptor! "
            "Use `MADDPG` instead."
        )
        args.run = "MADDPG"

    # Grouped variant of the env: agents 0 and 1 are put into a single group
    # ("group_1"), with one Dict observation and one action per member.
    grouping = {
        "group_1": [0, 1],
    }
    obs_space = Tuple(
        [
            Dict(
                {
                    "obs": MultiDiscrete([2, 2, 2, 3]),
                    ENV_STATE: MultiDiscrete([2, 2, 2]),
                }
            ),
            Dict(
                {
                    "obs": MultiDiscrete([2, 2, 2, 3]),
                    ENV_STATE: MultiDiscrete([2, 2, 2]),
                }
            ),
        ]
    )
    act_space = Tuple(
        [
            TwoStepGame.action_space,
            TwoStepGame.action_space,
        ]
    )
    # Registered unconditionally; only the QMIX config below refers to it.
    register_env(
        "grouped_twostep",
        lambda config: TwoStepGame(config).with_agent_groups(
            grouping, obs_space=obs_space, act_space=act_space
        ),
    )

    if args.run == "MADDPG":
        # Per-agent spaces for the ungrouped env. They get their own names so
        # that `obs_space` / `act_space`, which the "grouped_twostep" creator
        # above refers to, are never rebound to something else.
        maddpg_obs_space = Discrete(6)
        maddpg_act_space = TwoStepGame.action_space
        config = {
            "env": TwoStepGame,
            "env_config": {
                "actions_are_logits": True,
            },
            "num_steps_sampled_before_learning_starts": 100,
            "multiagent": {
                # One policy per agent; each is told its agent id via config.
                "policies": {
                    "pol1": PolicySpec(
                        observation_space=maddpg_obs_space,
                        action_space=maddpg_act_space,
                        config={"agent_id": 0},
                    ),
                    "pol2": PolicySpec(
                        observation_space=maddpg_obs_space,
                        action_space=maddpg_act_space,
                        config={"agent_id": 1},
                    ),
                },
                # Falsy agent id (0) -> "pol1", any other id -> "pol2".
                "policy_mapping_fn": (lambda aid, **kwargs: "pol2" if aid else "pol1"),
            },
            "framework": args.framework,
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        }
    elif args.run == "QMIX":
        # QMIX runs on the grouped env registered above; `--framework` is not
        # consulted in this branch.
        config = (
            QMixConfig()
            .training(mixer=args.mixer, train_batch_size=32)
            .rollouts(num_rollout_workers=0, rollout_fragment_length=4)
            .exploration(
                exploration_config={
                    "final_epsilon": 0.0,
                }
            )
            .environment(
                env="grouped_twostep",
                env_config={
                    "separate_state_space": True,
                    "one_hot_state_encoding": True,
                },
            )
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")))
        )
        # Tune receives a plain dict as its param space.
        config = config.to_dict()
    else:
        # Any other algorithm: plain `TwoStepGame` with a minimal config.
        config = {
            "env": TwoStepGame,
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
            "framework": args.framework,
        }

    # Stopping criteria handed to Tune, taken from the `--stop-*` flags.
    stop = {
        "episode_reward_mean": args.stop_reward,
        "timesteps_total": args.stop_timesteps,
        "training_iteration": args.stop_iters,
    }

    try:
        results = tune.Tuner(
            args.run,
            run_config=air.RunConfig(stop=stop, verbose=2),
            param_space=config,
        ).fit()

        # In test mode, verify the results against `--stop-reward`.
        if args.as_test:
            check_learning_achieved(results, args.stop_reward)
    finally:
        # Shut Ray down even when training or the learning check raises.
        ray.shutdown()
[rllib] Centralized critic / PPO example on TwoStepGame (#5392) 2019-08-08 14:03:28 -07:00			`"""The two-step game from QMIX: https://arxiv.org/pdf/1803.11485.pdf`

			`Configurations you can try:`
			`- normal policy gradients (PG)`
[RLlib] MADDPG: Move into agents folder (from contrib) and use `training_iteration` method. (#24502) 2022-05-06 12:35:21 +02:00			`- MADDPG`
[rllib] Centralized critic / PPO example on TwoStepGame (#5392) 2019-08-08 14:03:28 -07:00			`- QMIX`

			`See also: centralized_critic.py for centralized critic PPO on this game.`
			`"""`
[rllib] Q-Mix implementation (Q-Mix, VDN, IQN, and Ape-X variants) (#3548) 2018-12-18 10:40:01 -08:00
			`import argparse`
[RLlib] Redo simplify multi agent config dict: Reverted b/c seemed to break test_typing (non RLlib test). (#17046) 2021-07-15 05:51:24 -04:00			`from gym.spaces import Dict, Discrete, Tuple, MultiDiscrete`
[RLlib] Config objects for DDPG and SimpleQ. (#24339) 2022-05-12 16:12:42 +02:00			`import logging`
[RLlib] Fix all example scripts to run on GPUs. (#11105) 2020-10-02 23:07:44 +02:00			`import os`
[rllib] Q-Mix implementation (Q-Mix, VDN, IQN, and Ape-X variants) (#3548) 2018-12-18 10:40:01 -08:00
			`import ray`
[air] update rllib example to use Tuner API. (#26987) update rllib example to use Tuner API. Signed-off-by: xwjiang2010 <xwjiang2010@gmail.com> 2022-07-27 04:12:59 -07:00			`from ray import air, tune`
[RLlib] QMIX better defaults + added to CI learning tests (#21332) 2022-01-04 08:54:41 +01:00			`from ray.tune import register_env`
[RLlib] Agents to algos: DQN w/o Apex and R2D2, DDPG/TD3, SAC, SlateQ, QMIX, PG, Bandits (#24896) 2022-05-19 09:30:42 -07:00			`from ray.rllib.algorithms.qmix import QMixConfig`
[RLlib] rllib/examples folder restructuring (#8250) Cleans up of the rllib/examples folder by moving all example Envs into rllibexamples/env (so they can be used by other scripts and tests as well). 2020-05-01 22:59:34 +02:00			`from ray.rllib.env.multi_agent_env import ENV_STATE`
			`from ray.rllib.examples.env.two_step_game import TwoStepGame`
[RLlib] Redo simplify multi agent config dict: Reverted b/c seemed to break test_typing (non RLlib test). (#17046) 2021-07-15 05:51:24 -04:00			`from ray.rllib.policy.policy import PolicySpec`
[RLlib] Examples folder restructuring (Model examples; final part). (#8278) - This PR completes any previously missing PyTorch Model counterparts to TFModels in examples/models. - It also makes sure, all example scripts in the rllib/examples folder are tested for both frameworks and learn the given task (this is often currently not checked) using a --as-test flag in connection with a --stop-reward. 2020-05-12 08:23:10 +02:00			`from ray.rllib.utils.test_utils import check_learning_achieved`
[rllib] Q-Mix implementation (Q-Mix, VDN, IQN, and Ape-X variants) (#3548) 2018-12-18 10:40:01 -08:00
[RLlib] Config objects for DDPG and SimpleQ. (#24339) 2022-05-12 16:12:42 +02:00			`logger = logging.getLogger(__name__)`

[rllib] Q-Mix implementation (Q-Mix, VDN, IQN, and Ape-X variants) (#3548) 2018-12-18 10:40:01 -08:00			`parser = argparse.ArgumentParser()`
[RLlib] Examples scripts add argparse help and replace `--torch` with `--framework`. (#15832) 2021-05-18 13:18:12 +02:00			`parser.add_argument(`
			`"--run", type=str, default="PG", help="The RLlib-registered algorithm to use."`
			`)`
			`parser.add_argument(`
			`"--framework",`
			`choices=["tf", "tf2", "tfe", "torch"],`
			`default="tf",`
			`help="The DL framework specifier.",`
			`)`
[RLlib] Move all jenkins RLlib-tests into bazel (rllib/BUILD). (#7178) * commit * comment 2020-02-15 23:50:44 +01:00			`parser.add_argument("--num-cpus", type=int, default=0)`
[RLlib] QMIX better defaults + added to CI learning tests (#21332) 2022-01-04 08:54:41 +01:00			`parser.add_argument(`
			`"--mixer",`
			`type=str,`
			`default="qmix",`
			`choices=["qmix", "vdn", "none"],`
			`help="The mixer model to use.",`
			`)`
[RLlib] Examples scripts add argparse help and replace `--torch` with `--framework`. (#15832) 2021-05-18 13:18:12 +02:00			`parser.add_argument(`
			`"--as-test",`
			`action="store_true",`
			`help="Whether this script should be run as a test: --stop-reward must "`
			`"be achieved within --stop-timesteps AND --stop-iters.",`
			`)`
			`parser.add_argument(`
			`"--stop-iters", type=int, default=200, help="Number of iterations to train."`
			`)`
			`parser.add_argument(`
			`"--stop-timesteps", type=int, default=70000, help="Number of timesteps to train."`
			`)`
			`parser.add_argument(`
			`"--stop-reward", type=float, default=8.0, help="Reward at which we stop training."`
			`)`
[RLlib] POC: Separate losses for APPO/IMPALA. Enable TFPolicy to handle multiple optimizers/losses (like TorchPolicy). (#18669) 2021-09-21 22:00:14 +02:00			`parser.add_argument(`
			`"--local-mode",`
			`action="store_true",`
			`help="Init Ray in local mode for easier debugging.",`
			`)`
[rllib] Q-Mix implementation (Q-Mix, VDN, IQN, and Ape-X variants) (#3548) 2018-12-18 10:40:01 -08:00
			`if __name__ == "__main__":`
			`args = parser.parse_args()`

[RLlib] POC: Separate losses for APPO/IMPALA. Enable TFPolicy to handle multiple optimizers/losses (like TorchPolicy). (#18669) 2021-09-21 22:00:14 +02:00			`ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)`
[RLlib] Issue 15973: Trainer.with_updates(validate_config=...) behaves confusingly. (#16429) 2021-06-19 22:42:00 +02:00
[RLlib] Config objects for DDPG and SimpleQ. (#24339) 2022-05-12 16:12:42 +02:00			`if args.run == "contrib/MADDPG":`
			`logger.warning(`
			"`contrib/MADDPG` is not longer a valid algorithm descriptor! "
			"Use `MADDPG` instead."
			`)`
			`args.run = "MADDPG"`

[rllib] Q-Mix implementation (Q-Mix, VDN, IQN, and Ape-X variants) (#3548) 2018-12-18 10:40:01 -08:00			`grouping = {`
MADDPG implementation in RLlib (#5348) 2019-08-06 19:22:06 -04:00			`"group_1": [0, 1],`
[rllib] Q-Mix implementation (Q-Mix, VDN, IQN, and Ape-X variants) (#3548) 2018-12-18 10:40:01 -08:00			`}`
			`obs_space = Tuple(`
			`[`
Qmix on gpu and with non-stacked-obs environment state support (#5751) 2019-10-08 13:18:07 -07:00			`Dict(`
			`{`
			`"obs": MultiDiscrete([2, 2, 2, 3]),`
			`ENV_STATE: MultiDiscrete([2, 2, 2]),`
			`}`
			`),`
			`Dict(`
			`{`
			`"obs": MultiDiscrete([2, 2, 2, 3]),`
			`ENV_STATE: MultiDiscrete([2, 2, 2]),`
			`}`
			`),`
[rllib] Q-Mix implementation (Q-Mix, VDN, IQN, and Ape-X variants) (#3548) 2018-12-18 10:40:01 -08:00			`]`
			`)`
			`act_space = Tuple(`
			`[`
			`TwoStepGame.action_space,`
			`TwoStepGame.action_space,`
			`]`
			`)`
			`register_env(`
			`"grouped_twostep",`
			`lambda config: TwoStepGame(config).with_agent_groups(`
			`grouping, obs_space=obs_space, act_space=act_space`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`),`
[rllib] Q-Mix implementation (Q-Mix, VDN, IQN, and Ape-X variants) (#3548) 2018-12-18 10:40:01 -08:00			`)`

[RLlib] MADDPG: Move into agents folder (from contrib) and use `training_iteration` method. (#24502) 2022-05-06 12:35:21 +02:00			`if args.run == "MADDPG":`
[RLlib] Redo simplify multi agent config dict: Reverted b/c seemed to break test_typing (non RLlib test). (#17046) 2021-07-15 05:51:24 -04:00			`obs_space = Discrete(6)`
			`act_space = TwoStepGame.action_space`
MADDPG implementation in RLlib (#5348) 2019-08-06 19:22:06 -04:00			`config = {`
[RLlib] Retry agents -> algorithms. with proper doc changes this time. (#24797) 2022-05-16 00:45:32 -07:00			`"env": TwoStepGame,`
MADDPG implementation in RLlib (#5348) 2019-08-06 19:22:06 -04:00			`"env_config": {`
			`"actions_are_logits": True,`
			`},`
[RLlib] Move learning_starts logic from buffers into `training_step()`. (#26032) 2022-08-11 13:07:30 +02:00			`"num_steps_sampled_before_learning_starts": 100,`
MADDPG implementation in RLlib (#5348) 2019-08-06 19:22:06 -04:00			`"multiagent": {`
			`"policies": {`
[RLlib] Redo simplify multi agent config dict: Reverted b/c seemed to break test_typing (non RLlib test). (#17046) 2021-07-15 05:51:24 -04:00			`"pol1": PolicySpec(`
			`observation_space=obs_space,`
			`action_space=act_space,`
			`config={"agent_id": 0},`
			`),`
			`"pol2": PolicySpec(`
			`observation_space=obs_space,`
			`action_space=act_space,`
			`config={"agent_id": 1},`
			`),`
MADDPG implementation in RLlib (#5348) 2019-08-06 19:22:06 -04:00			`},`
[RLlib] Re-do: Trainer: Support add and delete Policies. (#16569) 2021-06-21 13:46:01 +02:00			`"policy_mapping_fn": (lambda aid, **kwargs: "pol2" if aid else "pol1"),`
MADDPG implementation in RLlib (#5348) 2019-08-06 19:22:06 -04:00			`},`
[RLlib] Examples scripts add argparse help and replace `--torch` with `--framework`. (#15832) 2021-05-18 13:18:12 +02:00			`"framework": args.framework,`
[RLlib] Fix all example scripts to run on GPUs. (#11105) 2020-10-02 23:07:44 +02:00			# Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
			`"num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),`
MADDPG implementation in RLlib (#5348) 2019-08-06 19:22:06 -04:00			`}`
			`elif args.run == "QMIX":`
[RLlib] Retry agents -> algorithms. with proper doc changes this time. (#24797) 2022-05-16 00:45:32 -07:00			`config = (`
			`QMixConfig()`
			`.training(mixer=args.mixer, train_batch_size=32)`
			`.rollouts(num_rollout_workers=0, rollout_fragment_length=4)`
			`.exploration(`
			`exploration_config={`
			`"final_epsilon": 0.0,`
			`}`
			`)`
			`.environment(`
			`env="grouped_twostep",`
			`env_config={`
			`"separate_state_space": True,`
			`"one_hot_state_encoding": True,`
			`},`
			`)`
			`.resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")))`
			`)`
			`config = config.to_dict()`
[rllib] Q-Mix implementation (Q-Mix, VDN, IQN, and Ape-X variants) (#3548) 2018-12-18 10:40:01 -08:00			`else:`
[RLlib] Fix all example scripts to run on GPUs. (#11105) 2020-10-02 23:07:44 +02:00			`config = {`
[RLlib] Retry agents -> algorithms. with proper doc changes this time. (#24797) 2022-05-16 00:45:32 -07:00			`"env": TwoStepGame,`
[RLlib] Fix all example scripts to run on GPUs. (#11105) 2020-10-02 23:07:44 +02:00			# Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
			`"num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),`
[RLlib] Examples scripts add argparse help and replace `--torch` with `--framework`. (#15832) 2021-05-18 13:18:12 +02:00			`"framework": args.framework,`
[RLlib] Fix all example scripts to run on GPUs. (#11105) 2020-10-02 23:07:44 +02:00			`}`
[rllib] Q-Mix implementation (Q-Mix, VDN, IQN, and Ape-X variants) (#3548) 2018-12-18 10:40:01 -08:00
[RLlib] Examples folder restructuring (Model examples; final part). (#8278) - This PR completes any previously missing PyTorch Model counterparts to TFModels in examples/models. - It also makes sure, all example scripts in the rllib/examples folder are tested for both frameworks and learn the given task (this is often currently not checked) using a --as-test flag in connection with a --stop-reward. 2020-05-12 08:23:10 +02:00			`stop = {`
			`"episode_reward_mean": args.stop_reward,`
			`"timesteps_total": args.stop_timesteps,`
[RLlib] Examples scripts add argparse help and replace `--torch` with `--framework`. (#15832) 2021-05-18 13:18:12 +02:00			`"training_iteration": args.stop_iters,`
[RLlib] Examples folder restructuring (Model examples; final part). (#8278) - This PR completes any previously missing PyTorch Model counterparts to TFModels in examples/models. - It also makes sure, all example scripts in the rllib/examples folder are tested for both frameworks and learn the given task (this is often currently not checked) using a --as-test flag in connection with a --stop-reward. 2020-05-12 08:23:10 +02:00			`}`

[air] update rllib example to use Tuner API. (#26987) update rllib example to use Tuner API. Signed-off-by: xwjiang2010 <xwjiang2010@gmail.com> 2022-07-27 04:12:59 -07:00			`results = tune.Tuner(`
			`args.run,`
			`run_config=air.RunConfig(stop=stop, verbose=2),`
			`param_space=config,`
			`).fit()`
[RLlib] Examples folder restructuring (Model examples; final part). (#8278) - This PR completes any previously missing PyTorch Model counterparts to TFModels in examples/models. - It also makes sure, all example scripts in the rllib/examples folder are tested for both frameworks and learn the given task (this is often currently not checked) using a --as-test flag in connection with a --stop-reward. 2020-05-12 08:23:10 +02:00
			`if args.as_test:`
			`check_learning_achieved(results, args.stop_reward)`

			`ray.shutdown()`