ray/rllib/examples/attention_net.py

"""
Example of using an RL agent (default: PPO) with an AttentionNet model,
which is useful for environments where state is important but not explicitly
part of the observations.

For example, in the "repeat after me" environment (default here), the agent
needs to repeat an observation from n timesteps before.
AttentionNet keeps state of previous observations and uses transformers to
learn a policy that successfully repeats previous observations.
Without attention, the RL agent only "sees" the last observation, not the one
n timesteps ago and cannot learn to repeat this previous observation.

AttentionNet (GTrXL) paper: https://arxiv.org/abs/1910.06764

This example script also shows how to train and test a PPO agent with an
AttentionNet model manually, i.e., without using Tune.

---
Run this example with defaults (using Tune and AttentionNet on the "repeat
after me" environment):
$ python attention_net.py
Then run again without attention:
$ python attention_net.py --no-attention
Compare the learning curve on TensorBoard:
$ cd ~/ray-results/; tensorboard --logdir .
There will be a huge difference between the version with and without attention!

Other options for running this example:
$ python attention_net.py --help
"""
import argparse
import os

import numpy as np

import ray
from ray import tune
from ray.rllib.agents import ppo
from ray.rllib.examples.env.look_and_push import LookAndPush, OneHot
from ray.rllib.examples.env.repeat_after_me_env import RepeatAfterMeEnv
from ray.rllib.examples.env.repeat_initial_obs_env import RepeatInitialObsEnv
from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.test_utils import check_learning_achieved
from ray.tune import registry
from ray.tune.logger import pretty_print

tf1, tf, tfv = try_import_tf()
# Names of the environments this example can train on; used as the allowed
# values of the `--env` CLI option.
SUPPORTED_ENVS = [
    "RepeatAfterMeEnv", "RepeatInitialObsEnv", "LookAndPush",
    "StatelessCartPole"
]


def get_cli_args():
    """Create the CLI parser for this example and parse the command line.

    The parsed arguments are also printed to stdout.

    Returns:
        argparse.Namespace: The parsed arguments, with attributes
            `no_attention`, `env`, `run`, `num_cpus`, `framework`,
            `stop_iters`, `stop_timesteps`, `stop_reward`, `as_test`,
            `no_tune` and `local_mode`.
    """
    parser = argparse.ArgumentParser()

    # example-specific args
    parser.add_argument(
        "--no-attention",
        action="store_true",
        help="Do NOT use attention. For comparison: The agent will not learn.")
    parser.add_argument(
        "--env",
        choices=SUPPORTED_ENVS,
        default="RepeatAfterMeEnv",
        help="The environment to train on.")

    # general args
    parser.add_argument(
        "--run", default="PPO", help="The RLlib-registered algorithm to use.")
    parser.add_argument(
        "--num-cpus",
        type=int,
        default=3,
        help="Number of CPUs to init Ray with (0: do not set a CPU count).")
    parser.add_argument(
        "--framework",
        choices=["tf", "tf2", "tfe", "torch"],
        default="tf",
        help="The DL framework specifier.")
    parser.add_argument(
        "--stop-iters",
        type=int,
        default=200,
        help="Number of iterations to train.")
    parser.add_argument(
        "--stop-timesteps",
        type=int,
        default=500000,
        help="Number of timesteps to train.")
    parser.add_argument(
        "--stop-reward",
        type=float,
        default=80.0,
        help="Reward at which we stop training.")
    parser.add_argument(
        "--as-test",
        action="store_true",
        help="Whether this script should be run as a test: --stop-reward must "
        "be achieved within --stop-timesteps AND --stop-iters.")
    # NOTE: adjacent string literals are concatenated verbatim, so the
    # separating space after "Here," must be part of one of the literals.
    parser.add_argument(
        "--no-tune",
        action="store_true",
        help="Run without Tune using a manual train loop instead. Here, "
        "there is no TensorBoard support.")
    parser.add_argument(
        "--local-mode",
        action="store_true",
        help="Init Ray in local mode for easier debugging.")

    args = parser.parse_args()
    print(f"Running with following CLI args: {args}")
    return args


if __name__ == "__main__":
    # Script entry point: parse the CLI, start Ray, build the RLlib config
    # with the attention model settings, then either train manually with a
    # PPOTrainer (`--no-tune`) or hand everything over to `tune.run()`.
    args = get_cli_args()

    # `--num-cpus 0` is forwarded as None rather than a literal 0.
    ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)

    # Everything after `ray.init()` runs inside try/finally so that Ray is
    # shut down even if training or the learning check raises.
    try:
        # register custom environments
        registry.register_env(
            "RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c))
        registry.register_env(
            "RepeatInitialObsEnv", lambda _: RepeatInitialObsEnv())
        registry.register_env(
            "LookAndPush", lambda _: OneHot(LookAndPush()))
        registry.register_env(
            "StatelessCartPole", lambda _: StatelessCartPole())

        # main part: RLlib config with AttentionNet model
        config = {
            "env": args.env,
            # This env_config is only used for the RepeatAfterMeEnv env.
            "env_config": {
                "repeat_delay": 2,
            },
            "gamma": 0.99,
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", 0)),
            "num_envs_per_worker": 20,
            "entropy_coeff": 0.001,
            "num_sgd_iter": 10,
            "vf_loss_coeff": 1e-5,
            "model": {
                # Attention net wrapping (for tf) can already use the
                # native keras model versions. For torch, this will have
                # no effect.
                "_use_default_native_models": True,
                "use_attention": not args.no_attention,
                "max_seq_len": 10,
                "attention_num_transformer_units": 1,
                "attention_dim": 32,
                "attention_memory_inference": 10,
                "attention_memory_training": 10,
                "attention_num_heads": 1,
                "attention_head_dim": 32,
                "attention_position_wise_mlp_dim": 32,
            },
            "framework": args.framework,
        }

        # Stopping criteria handed to Tune (only used in the Tune branch).
        stop = {
            "training_iteration": args.stop_iters,
            "timesteps_total": args.stop_timesteps,
            "episode_reward_mean": args.stop_reward,
        }

        # Manual training loop (no Ray tune).
        if args.no_tune:
            # manual training loop using PPO and manually keeping track of
            # state
            if args.run != "PPO":
                raise ValueError("Only support --run PPO with --no-tune.")
            ppo_config = ppo.DEFAULT_CONFIG.copy()
            ppo_config.update(config)
            trainer = ppo.PPOTrainer(config=ppo_config, env=args.env)
            # run manual training loop and print results after each
            # iteration
            for _ in range(args.stop_iters):
                result = trainer.train()
                print(pretty_print(result))
                # stop training if the target train steps or reward are
                # reached
                if (result["timesteps_total"] >= args.stop_timesteps
                        or result["episode_reward_mean"] >=
                        args.stop_reward):
                    break

            # Run manual test loop (only for RepeatAfterMe env).
            # NOTE(review): the state handling below presumes the attention
            # model is active; with --no-attention the policy may not
            # accept/return this state -- confirm before relying on it.
            if args.env == "RepeatAfterMeEnv":
                print("Finished training. "
                      "Running manual test/inference loop.")
                # prepare env
                env = RepeatAfterMeEnv(config["env_config"])
                obs = env.reset()
                done = False
                total_reward = 0
                # start with all zeros as state: one
                # [memory, attention_dim] array per transformer unit
                model_config = config["model"]
                num_transformers = model_config[
                    "attention_num_transformer_units"]
                attention_dim = model_config["attention_dim"]
                memory = model_config["attention_memory_inference"]
                state = [
                    np.zeros([memory, attention_dim], np.float32)
                    for _ in range(num_transformers)
                ]
                # run one iteration until done
                print(f"RepeatAfterMeEnv with {config['env_config']}")
                while not done:
                    action, state_out, _ = trainer.compute_single_action(
                        obs, state)
                    next_obs, reward, done, _ = env.step(action)
                    print(f"Obs: {obs}, Action: {action}, Reward: {reward}")
                    obs = next_obs
                    total_reward += reward
                    # Sliding window: append the newest state output as the
                    # last row and drop the oldest row, so each state array
                    # keeps `memory` rows.
                    state = [
                        np.concatenate([state[i], [state_out[i]]],
                                       axis=0)[1:]
                        for i in range(num_transformers)
                    ]
                print(f"Total reward in test episode: {total_reward}")

        # Run with Tune for auto env and trainer creation and TensorBoard.
        else:
            results = tune.run(args.run, config=config, stop=stop, verbose=2)

            if args.as_test:
                print("Checking if learning goals were achieved")
                check_learning_achieved(results, args.stop_reward)
    finally:
        ray.shutdown()
[RLlib] Better example scripts: Description --no-tune and --local-mode CLI options (#17038) 2021-07-27 05:25:48 +03:00			`"""`
			`Example of using an RL agent (default: PPO) with an AttentionNet model,`
			`which is useful for environments where state is important but not explicitly`
			`part of the observations.`

			`For example, in the "repeat after me" environment (default here), the agent`
			`needs to repeat an observation from n timesteps before.`
			`AttentionNet keeps state of previous observations and uses transformers to`
			`learn a policy that successfully repeats previous observations.`
			`Without attention, the RL agent only "sees" the last observation, not the one`
			`n timesteps ago and cannot learn to repeat this previous observation.`

			`AttentionNet paper: https://arxiv.org/abs/1506.07704`

			`This example script also shows how to train and test a PPO agent with an`
			`AttentionNet model manually, i.e., without using Tune.`

			`---`
			`Run this example with defaults (using Tune and AttentionNet on the "repeat`
			`after me" environment):`
			`$ python attention_net.py`
			`Then run again without attention:`
			`$ python attention_net.py --no-attention`
			`Compare the learning curve on TensorBoard:`
			`$ cd ~/ray-results/; tensorboard --logdir .`
			`There will be a huge difference between the version with and without attention!`

			`Other options for running this example:`
			`$ python attention_net.py --help`
			`"""`
[RLlib] Attention Net integration into ModelV2 and learning RL example. (#8371) 2020-05-18 17:26:40 +02:00			`import argparse`
[RLlib] Fix all example scripts to run on GPUs. (#11105) 2020-10-02 23:07:44 +02:00			`import os`
[RLlib] Attention Net integration into ModelV2 and learning RL example. (#8371) 2020-05-18 17:26:40 +02:00
[RLlib] Better example scripts: Description --no-tune and --local-mode CLI options (#17038) 2021-07-27 05:25:48 +03:00			`import numpy as np`

[RLlib] Attention Net integration into ModelV2 and learning RL example. (#8371) 2020-05-18 17:26:40 +02:00			`import ray`
			`from ray import tune`
[RLlib] Better example scripts: Description --no-tune and --local-mode CLI options (#17038) 2021-07-27 05:25:48 +03:00			`from ray.rllib.agents import ppo`
[RLlib] Attention Net integration into ModelV2 and learning RL example. (#8371) 2020-05-18 17:26:40 +02:00			`from ray.rllib.examples.env.look_and_push import LookAndPush, OneHot`
			`from ray.rllib.examples.env.repeat_after_me_env import RepeatAfterMeEnv`
			`from ray.rllib.examples.env.repeat_initial_obs_env import RepeatInitialObsEnv`
			`from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole`
[RLlib] Minor `rllib.utils` cleanup. (#8932) 2020-06-16 08:52:20 +02:00			`from ray.rllib.utils.framework import try_import_tf`
[RLlib] Attention Net integration into ModelV2 and learning RL example. (#8371) 2020-05-18 17:26:40 +02:00			`from ray.rllib.utils.test_utils import check_learning_achieved`
			`from ray.tune import registry`
[RLlib] Better example scripts: Description --no-tune and --local-mode CLI options (#17038) 2021-07-27 05:25:48 +03:00			`from ray.tune.logger import pretty_print`
[RLlib] Attention Net integration into ModelV2 and learning RL example. (#8371) 2020-05-18 17:26:40 +02:00
[RLlib] Tf2x preparation; part 2 (upgrading `try_import_tf()`). (#9136) * WIP. * Fixes. * LINT. * WIP. * WIP. * Fixes. * Fixes. * Fixes. * Fixes. * WIP. * Fixes. * Test * Fix. * Fixes and LINT. * Fixes and LINT. * LINT. 2020-06-30 10:13:20 +02:00			`tf1, tf, tfv = try_import_tf()`
[RLlib] Better example scripts: Description --no-tune and --local-mode CLI options (#17038) 2021-07-27 05:25:48 +03:00			`SUPPORTED_ENVS = [`
			`"RepeatAfterMeEnv", "RepeatInitialObsEnv", "LookAndPush",`
			`"StatelessCartPole"`
			`]`
[RLlib] Attention Net integration into ModelV2 and learning RL example. (#8371) 2020-05-18 17:26:40 +02:00

[RLlib] Better example scripts: Description --no-tune and --local-mode CLI options (#17038) 2021-07-27 05:25:48 +03:00			`def get_cli_args():`
			`"""Create CLI parser and return parsed arguments"""`
			`parser = argparse.ArgumentParser()`

			`# example-specific args`
			`parser.add_argument(`
			`"--no-attention",`
			`action="store_true",`
			`help="Do NOT use attention. For comparison: The agent will not learn.")`
			`parser.add_argument(`
			`"--env", choices=SUPPORTED_ENVS, default="RepeatAfterMeEnv")`

			`# general args`
			`parser.add_argument(`
			`"--run", default="PPO", help="The RLlib-registered algorithm to use.")`
			`parser.add_argument("--num-cpus", type=int, default=3)`
			`parser.add_argument(`
			`"--framework",`
			`choices=["tf", "tf2", "tfe", "torch"],`
			`default="tf",`
			`help="The DL framework specifier.")`
			`parser.add_argument(`
			`"--stop-iters",`
			`type=int,`
			`default=200,`
			`help="Number of iterations to train.")`
			`parser.add_argument(`
			`"--stop-timesteps",`
			`type=int,`
			`default=500000,`
			`help="Number of timesteps to train.")`
			`parser.add_argument(`
			`"--stop-reward",`
			`type=float,`
			`default=80.0,`
			`help="Reward at which we stop training.")`
			`parser.add_argument(`
			`"--as-test",`
			`action="store_true",`
			`help="Whether this script should be run as a test: --stop-reward must "`
			`"be achieved within --stop-timesteps AND --stop-iters.")`
			`parser.add_argument(`
			`"--no-tune",`
			`action="store_true",`
			`help="Run without Tune using a manual train loop instead. Here,"`
			`"there is no TensorBoard support.")`
			`parser.add_argument(`
			`"--local-mode",`
			`action="store_true",`
			`help="Init Ray in local mode for easier debugging.")`

[RLlib] Attention Net integration into ModelV2 and learning RL example. (#8371) 2020-05-18 17:26:40 +02:00			`args = parser.parse_args()`
[RLlib] Better example scripts: Description --no-tune and --local-mode CLI options (#17038) 2021-07-27 05:25:48 +03:00			`print(f"Running with following CLI args: {args}")`
			`return args`


			`if __name__ == "__main__":`
			`args = get_cli_args()`
[RLlib] Attention Net integration into ModelV2 and learning RL example. (#8371) 2020-05-18 17:26:40 +02:00
[RLlib] Better example scripts: Description --no-tune and --local-mode CLI options (#17038) 2021-07-27 05:25:48 +03:00			`ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)`
[RLlib] Attention Net integration into ModelV2 and learning RL example. (#8371) 2020-05-18 17:26:40 +02:00
[RLlib] Better example scripts: Description --no-tune and --local-mode CLI options (#17038) 2021-07-27 05:25:48 +03:00			`# register custom environments`
[RLlib] Attention Net integration into ModelV2 and learning RL example. (#8371) 2020-05-18 17:26:40 +02:00			`registry.register_env("RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c))`
			`registry.register_env("RepeatInitialObsEnv",`
			`lambda _: RepeatInitialObsEnv())`
			`registry.register_env("LookAndPush", lambda _: OneHot(LookAndPush()))`
			`registry.register_env("StatelessCartPole", lambda _: StatelessCartPole())`

[RLlib] Better example scripts: Description --no-tune and --local-mode CLI options (#17038) 2021-07-27 05:25:48 +03:00			`# main part: RLlib config with AttentionNet model`
[RLlib] Attention Net integration into ModelV2 and learning RL example. (#8371) 2020-05-18 17:26:40 +02:00			`config = {`
			`"env": args.env,`
[RLlib] Attention Nets: tf (#12753) 2020-12-21 02:22:32 +01:00			`# This env_config is only used for the RepeatAfterMeEnv env.`
[RLlib] Attention Net integration into ModelV2 and learning RL example. (#8371) 2020-05-18 17:26:40 +02:00			`"env_config": {`
			`"repeat_delay": 2,`
			`},`
			`"gamma": 0.99,`
[RLlib] Fix all example scripts to run on GPUs. (#11105) 2020-10-02 23:07:44 +02:00			# Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
			`"num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", 0)),`
[RLlib] Attention Net integration into ModelV2 and learning RL example. (#8371) 2020-05-18 17:26:40 +02:00			`"num_envs_per_worker": 20,`
			`"entropy_coeff": 0.001,`
[RLlib] Attention Nets: tf (#12753) 2020-12-21 02:22:32 +01:00			`"num_sgd_iter": 10,`
[RLlib] Attention Net integration into ModelV2 and learning RL example. (#8371) 2020-05-18 17:26:40 +02:00			`"vf_loss_coeff": 1e-5,`
			`"model": {`
[RLlib] Support native tf.keras.Models (part 2) - Default keras models for Vision/RNN/Attention. (#15273) 2021-04-30 19:26:30 +02:00			`# Attention net wrapping (for tf) can already use the native keras`
			`# model versions. For torch, this will have no effect.`
			`"_use_default_native_models": True,`
[RLlib] Better example scripts: Description --no-tune and --local-mode CLI options (#17038) 2021-07-27 05:25:48 +03:00			`"use_attention": not args.no_attention,`
[RLlib] Fix 2 flakey test cases. (#14892) 2021-03-29 17:20:29 +02:00			`"max_seq_len": 10,`
[RLlib] Support easy `use_attention=True` flag for using the GTrXL model. (#11698) 2021-01-01 14:06:23 -05:00			`"attention_num_transformer_units": 1,`
[RLlib] Fix 2 flakey test cases. (#14892) 2021-03-29 17:20:29 +02:00			`"attention_dim": 32,`
			`"attention_memory_inference": 10,`
			`"attention_memory_training": 10,`
			`"attention_num_heads": 1,`
[RLlib] Support easy `use_attention=True` flag for using the GTrXL model. (#11698) 2021-01-01 14:06:23 -05:00			`"attention_head_dim": 32,`
			`"attention_position_wise_mlp_dim": 32,`
[RLlib] Attention Net integration into ModelV2 and learning RL example. (#8371) 2020-05-18 17:26:40 +02:00			`},`
[RLlib] Attention net example script: Clarifications on how to use with Trainer.compute_action. (#14864) 2021-03-23 19:33:01 +01:00			`"framework": args.framework,`
[RLlib] Attention Net integration into ModelV2 and learning RL example. (#8371) 2020-05-18 17:26:40 +02:00			`}`

			`stop = {`
			`"training_iteration": args.stop_iters,`
			`"timesteps_total": args.stop_timesteps,`
			`"episode_reward_mean": args.stop_reward,`
			`}`

[RLlib] Better PolicyServer example (w/ or w/o tune) and add printing out actual listen port address in log-level=INFO. (#18254) 2021-08-31 22:03:23 +02:00			`# Manual training loop (no Ray tune).`
[RLlib] Better example scripts: Description --no-tune and --local-mode CLI options (#17038) 2021-07-27 05:25:48 +03:00			`if args.no_tune:`
			`# manual training loop using PPO and manually keeping track of state`
			`if args.run != "PPO":`
			`raise ValueError("Only support --run PPO with --no-tune.")`
			`ppo_config = ppo.DEFAULT_CONFIG.copy()`
			`ppo_config.update(config)`
			`trainer = ppo.PPOTrainer(config=ppo_config, env=args.env)`
			`# run manual training loop and print results after each iteration`
			`for _ in range(args.stop_iters):`
			`result = trainer.train()`
			`print(pretty_print(result))`
			`# stop training if the target train steps or reward are reached`
			`if result["timesteps_total"] >= args.stop_timesteps or \`
			`result["episode_reward_mean"] >= args.stop_reward:`
			`break`

[RLlib] Redo #17410: Example script: Remote worker envs with inference done on main node. (#17960) 2021-08-20 08:02:18 +02:00			`# Run manual test loop (only for RepeatAfterMe env).`
[RLlib] Better example scripts: Description --no-tune and --local-mode CLI options (#17038) 2021-07-27 05:25:48 +03:00			`if args.env == "RepeatAfterMeEnv":`
			`print("Finished training. Running manual test/inference loop.")`
			`# prepare env`
			`env = RepeatAfterMeEnv(config["env_config"])`
			`obs = env.reset()`
			`done = False`
			`total_reward = 0`
			`# start with all zeros as state`
			`num_transformers = config["model"][`
			`"attention_num_transformer_units"]`
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00			`attention_dim = config["model"]["attention_dim"]`
			`memory = config["model"]["attention_memory_inference"]`
[RLlib] Better example scripts: Description --no-tune and --local-mode CLI options (#17038) 2021-07-27 05:25:48 +03:00			`init_state = state = [`
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00			`np.zeros([memory, attention_dim], np.float32)`
[RLlib] Better example scripts: Description --no-tune and --local-mode CLI options (#17038) 2021-07-27 05:25:48 +03:00			`for _ in range(num_transformers)`
			`]`
			`# run one iteration until done`
			`print(f"RepeatAfterMeEnv with {config['env_config']}")`
			`while not done:`
			`action, state_out, _ = trainer.compute_single_action(`
			`obs, state)`
			`next_obs, reward, done, _ = env.step(action)`
			`print(f"Obs: {obs}, Action: {action}, Reward: {reward}")`
			`obs = next_obs`
			`total_reward += reward`
			`state = [`
			`np.concatenate([state[i], [state_out[i]]], axis=0)[1:]`
			`for i in range(num_transformers)`
			`]`
			`print(f"Total reward in test episode: {total_reward}")`

[RLlib] Redo #17410: Example script: Remote worker envs with inference done on main node. (#17960) 2021-08-20 08:02:18 +02:00			`# Run with Tune for auto env and trainer creation and TensorBoard.`
[RLlib] Better example scripts: Description --no-tune and --local-mode CLI options (#17038) 2021-07-27 05:25:48 +03:00			`else:`
			`results = tune.run(args.run, config=config, stop=stop, verbose=2)`

			`if args.as_test:`
			`print("Checking if learning goals were achieved")`
			`check_learning_achieved(results, args.stop_reward)`

[RLlib] Attention Net integration into ModelV2 and learning RL example. (#8371) 2020-05-18 17:26:40 +02:00			`ray.shutdown()`