# ray/rllib/examples/autoregressive_action_dist.py
"""
Example of specifying an autoregressive action distribution.

In an action space with multiple components (e.g., Tuple(a1, a2)), you might
want a2 to be sampled based on the sampled value of a1, i.e.,
a2_sampled ~ P(a2 | a1_sampled, obs). Normally, a1 and a2 would be sampled
independently.

To do this, you need both a custom model that implements the autoregressive
pattern and a custom action distribution class that leverages that model.
This example shows both.

Related paper: https://arxiv.org/abs/1903.11524

The example uses the CorrelatedActionsEnv, where the agent observes a random
number (0 or 1) and has to choose two actions, a1 and a2.
Action a1 should match the observation (+5 reward) and a2 should match a1
(+5 reward).
Since a2 should depend on a1, an autoregressive action dist makes sense.
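
A minimal sketch of the sampling pattern this example implements (the names
below, such as `encode`, `a1_head` and `a2_head`, are illustrative only and
not the actual API; see
ray/rllib/examples/models/autoregressive_action_model.py and
ray/rllib/examples/models/autoregressive_action_dist.py for the real classes):

    context = model.encode(obs)                      # shared obs embedding
    a1 = Categorical(a1_head(context)).sample()      # a1 ~ P(a1 | obs)
    a2 = Categorical(a2_head(context, a1)).sample()  # a2 ~ P(a2 | obs, a1)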

---
To better understand the environment, run one manual train iteration and a
test loop without Tune:
$ python autoregressive_action_dist.py --stop-iters 1 --no-tune

Run this example with defaults (using Tune and autoregressive action dist):
$ python autoregressive_action_dist.py

Then run again without autoregressive actions:
$ python autoregressive_action_dist.py --no-autoreg
# TODO: Why does this lead to better results than autoregressive actions?

Compare the learning curves on TensorBoard:
$ cd ~/ray-results/; tensorboard --logdir .

Other options for running this example:
$ python autoregressive_action_dist.py --help
"""
import argparse
import os

import ray
from ray import tune
from ray.rllib.agents import ppo
from ray.rllib.examples.env.correlated_actions_env import CorrelatedActionsEnv
from ray.rllib.examples.models.autoregressive_action_model import \
    AutoregressiveActionModel, TorchAutoregressiveActionModel
from ray.rllib.examples.models.autoregressive_action_dist import \
    BinaryAutoregressiveDistribution, TorchBinaryAutoregressiveDistribution
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.test_utils import check_learning_achieved
from ray.tune.logger import pretty_print


def get_cli_args():
    """Create CLI parser and return parsed arguments."""
    parser = argparse.ArgumentParser()

    # example-specific arg: disable autoregressive action dist
    parser.add_argument(
        "--no-autoreg",
        action="store_true",
        help="Do NOT use an autoregressive action distribution but normal, "
        "independently distributed actions.")

    # general args
    parser.add_argument(
        "--run",
        type=str,
        default="PPO",
        help="The RLlib-registered algorithm to use.")
    parser.add_argument(
        "--framework",
        choices=["tf", "tf2", "tfe", "torch"],
        default="tf",
        help="The DL framework specifier.")
    parser.add_argument("--num-cpus", type=int, default=0)
    parser.add_argument(
        "--as-test",
        action="store_true",
        help="Whether this script should be run as a test: --stop-reward must "
        "be achieved within --stop-timesteps AND --stop-iters.")
    parser.add_argument(
        "--stop-iters",
        type=int,
        default=200,
        help="Number of iterations to train.")
    parser.add_argument(
        "--stop-timesteps",
        type=int,
        default=100000,
        help="Number of timesteps to train.")
    parser.add_argument(
        "--stop-reward",
        type=float,
        default=200.0,
        help="Reward at which we stop training.")
    parser.add_argument(
        "--no-tune",
        action="store_true",
        help="Run without Tune, using a manual train loop instead. Here, "
        "there is no TensorBoard support.")
    parser.add_argument(
        "--local-mode",
        action="store_true",
        help="Init Ray in local mode for easier debugging.")

    args = parser.parse_args()
    print(f"Running with following CLI args: {args}")
    return args


if __name__ == "__main__":
    args = get_cli_args()
    ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)

    # main part: register and configure autoregressive action model and dist
    # here, tailored to the CorrelatedActionsEnv such that a2 depends on a1
    ModelCatalog.register_custom_model(
        "autoregressive_model", TorchAutoregressiveActionModel
        if args.framework == "torch" else AutoregressiveActionModel)
    ModelCatalog.register_custom_action_dist(
        "binary_autoreg_dist", TorchBinaryAutoregressiveDistribution
        if args.framework == "torch" else BinaryAutoregressiveDistribution)
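    # The names registered above can now be referenced as plain strings in the
    # model config below ("custom_model" / "custom_action_dist").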

    # standard config
    config = {
        "env": CorrelatedActionsEnv,
        "gamma": 0.5,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "framework": args.framework,
    }

    # use registered model and dist in config
    if not args.no_autoreg:
        config["model"] = {
            "custom_model": "autoregressive_model",
            "custom_action_dist": "binary_autoreg_dist",
        }
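    # With --no-autoreg, RLlib falls back to its default model and action
    # distribution, so a1 and a2 are sampled independently of each other.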

    # use stop conditions passed via CLI (or defaults)
    stop = {
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
    }
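    # Note: Tune stops a trial as soon as any one of these criteria is met;
    # the manual loop below mimics that behavior.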

    # manual training loop using PPO without tune.run()
    if args.no_tune:
        if args.run != "PPO":
            raise ValueError("Only --run PPO is supported with --no-tune.")
        ppo_config = ppo.DEFAULT_CONFIG.copy()
        ppo_config.update(config)
        trainer = ppo.PPOTrainer(config=ppo_config, env=CorrelatedActionsEnv)

        # run manual training loop and print results after each iteration
        for _ in range(args.stop_iters):
            result = trainer.train()
            print(pretty_print(result))
            # stop training if the target train steps or reward are reached
            if result["timesteps_total"] >= args.stop_timesteps or \
                    result["episode_reward_mean"] >= args.stop_reward:
                break

        # run manual test loop: one episode until done
        print("Finished training. Running manual test/inference loop.")
        # the env ignores its config argument, so just pass None here
        env = CorrelatedActionsEnv(None)
        obs = env.reset()
        done = False
        total_reward = 0
        while not done:
            # compute both actions (a1, a2) from the trained policy
            a1, a2 = trainer.compute_single_action(obs)
            next_obs, reward, done, _ = env.step((a1, a2))
            print(f"Obs: {obs}, Action: a1={a1} a2={a2}, Reward: {reward}")
            obs = next_obs
            total_reward += reward
        print(f"Total reward in test episode: {total_reward}")

    # run with Tune for auto env and trainer creation and TensorBoard
    else:
        results = tune.run(args.run, stop=stop, config=config, verbose=2)

        if args.as_test:
            print("Checking if learning goals were achieved")
            check_learning_achieved(results, args.stop_reward)

    ray.shutdown()