ray/rllib/examples/multiagent_cartpole.py

"""Simple example of setting up a multi-agent policy mapping.
Control the number of agents and policies via --num-agents and --num-policies.
This works with hundreds of agents and policies, but note that initializing
many TF policies will take some time.
Also, TF evals might slow down with large numbers of policies. To debug TF
execution, set the TF_TIMELINE_DIR environment variable.
"""
import argparse
import gym
import random

import ray
from ray import tune
from ray.rllib.models import Model, ModelCatalog
from ray.rllib.tests.test_multi_agent_env import MultiCartpole
from ray.tune.registry import register_env
from ray.rllib.utils import try_import_tf

tf = try_import_tf()

parser = argparse.ArgumentParser()
parser.add_argument("--num-agents", type=int, default=4)
parser.add_argument("--num-policies", type=int, default=2)
parser.add_argument("--num-iters", type=int, default=20)
parser.add_argument("--simple", action="store_true")


class CustomModel1(Model):
    def _build_layers_v2(self, input_dict, num_outputs, options):
        # Example of (optional) weight sharing between two different policies.
        # Here, we share the variables defined in the 'shared' variable scope
        # by entering it explicitly with tf.AUTO_REUSE. This creates the
        # variables for the 'fc1' layer in a global scope called 'shared'
        # outside of the policy's normal variable scope.
        with tf.variable_scope(
                tf.VariableScope(tf.AUTO_REUSE, "shared"),
                reuse=tf.AUTO_REUSE,
                auxiliary_name_scope=False):
            last_layer = tf.layers.dense(
                input_dict["obs"], 64, activation=tf.nn.relu, name="fc1")
        last_layer = tf.layers.dense(
            last_layer, 64, activation=tf.nn.relu, name="fc2")
        output = tf.layers.dense(
            last_layer, num_outputs, activation=None, name="fc_out")
        return output, last_layer


class CustomModel2(Model):
    def _build_layers_v2(self, input_dict, num_outputs, options):
        # Weights shared with CustomModel1
        with tf.variable_scope(
                tf.VariableScope(tf.AUTO_REUSE, "shared"),
                reuse=tf.AUTO_REUSE,
                auxiliary_name_scope=False):
            last_layer = tf.layers.dense(
                input_dict["obs"], 64, activation=tf.nn.relu, name="fc1")
        last_layer = tf.layers.dense(
            last_layer, 64, activation=tf.nn.relu, name="fc2")
        output = tf.layers.dense(
            last_layer, num_outputs, activation=None, name="fc_out")
        return output, last_layer


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    # Simple environment with `num_agents` independent cartpole entities
    register_env("multi_cartpole", lambda _: MultiCartpole(args.num_agents))
    ModelCatalog.register_custom_model("model1", CustomModel1)
    ModelCatalog.register_custom_model("model2", CustomModel2)
    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model)
    def gen_policy(i):
        config = {
            "model": {
                "custom_model": ["model1", "model2"][i % 2],
            },
            "gamma": random.choice([0.95, 0.99]),
        }
        # (policy_cls, obs_space, act_space, config); a policy_cls of None
        # falls back to the trainer's default policy class (here, PPO's).
        return (None, obs_space, act_space, config)

    # Setup PPO with an ensemble of `num_policies` different policies
    policies = {
        "policy_{}".format(i): gen_policy(i)
        for i in range(args.num_policies)
    }
    policy_ids = list(policies.keys())

    tune.run(
        "PPO",
        stop={"training_iteration": args.num_iters},
        config={
            "env": "multi_cartpole",
            "log_level": "DEBUG",
            "simple_optimizer": args.simple,
            "num_sgd_iter": 10,
            "multiagent": {
                "policies": policies,
                # Map each agent to a randomly chosen policy.
                "policy_mapping_fn": (
                    lambda agent_id: random.choice(policy_ids)),
            },
        },
    )