ray/rllib/examples/centralized_critic_2.py
"""An example of implementing a centralized critic by modifying the env.
The advantage of this approach is that it's very simple and you don't have to
change the algorithm at all -- just use an env wrapper and custom model.
However, it is a bit less principled in that you have to change the agent
observation spaces and the environment.
See also: centralized_critic.py for an alternative approach that instead
modifies the policy to add a centralized value function.
"""
import numpy as np
from gym.spaces import Box, Dict, Discrete
import argparse
from ray import tune
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.examples.twostep_game import TwoStepGame
from ray.rllib.models import ModelCatalog
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
parser = argparse.ArgumentParser()
parser.add_argument("--stop", type=int, default=100000)
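
# Example invocation (a usage sketch): `python centralized_critic_2.py
# --stop=100000`. Training also stops early once `episode_reward_mean`
# reaches 7.99, i.e. just below the optimal joint return of 8 in TwoStepGame.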


class CentralizedCriticModel(TFModelV2):
    """Multi-agent model that implements a centralized VF.

    It assumes the observation is a dict with 'own_obs' and 'opponent_obs', the
    former of which can be used for computing actions (i.e., decentralized
    execution), and the latter for optimization (i.e., centralized learning).

    This model has two parts:
    - An action model that looks at just 'own_obs' to compute actions
    - A value model that also looks at the 'opponent_obs' / 'opponent_action'
      to compute the value (it does this by using the 'obs_flat' tensor).
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        super(CentralizedCriticModel, self).__init__(
            obs_space, action_space, num_outputs, model_config, name)
        self.action_model = FullyConnectedNetwork(
            Box(low=0, high=1, shape=(6, )),  # one-hot encoded Discrete(6)
            action_space,
            num_outputs,
            model_config,
            name + "_action")
        self.register_variables(self.action_model.variables())
        self.value_model = FullyConnectedNetwork(obs_space, action_space, 1,
                                                 model_config, name + "_vf")
        self.register_variables(self.value_model.variables())

    def forward(self, input_dict, state, seq_lens):
        self._value_out, _ = self.value_model({
            "obs": input_dict["obs_flat"]
        }, state, seq_lens)
        return self.action_model({
            "obs": input_dict["obs"]["own_obs"]
        }, state, seq_lens)

    def value_function(self):
        return tf.reshape(self._value_out, [-1])
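

# How the pieces below fit together: RLlib flattens the Dict observation
# defined by GlobalObsTwoStepGame into the `obs_flat` tensor that the value
# model reads, with each Discrete field one-hot encoded
# (6 + 6 + 2 = 14 floats per agent). At rollout time the 'opponent_action'
# slot is only a placeholder (0); the fill_in_actions callback overwrites it
# in the collected batches before training.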


class GlobalObsTwoStepGame(MultiAgentEnv):
    action_space = Discrete(2)
    observation_space = Dict({
        "own_obs": Discrete(6),
        "opponent_obs": Discrete(6),
        "opponent_action": Discrete(2),
    })

    def __init__(self, env_config):
        self.env = TwoStepGame(env_config)

    def reset(self):
        obs_dict = self.env.reset()
        return self.to_global_obs(obs_dict)

    def step(self, action_dict):
        obs_dict, rewards, dones, infos = self.env.step(action_dict)
        return self.to_global_obs(obs_dict), rewards, dones, infos

    def to_global_obs(self, obs_dict):
        return {
            self.env.agent_1: {
                "own_obs": obs_dict[self.env.agent_1],
                "opponent_obs": obs_dict[self.env.agent_2],
                "opponent_action": 0,  # populated by fill_in_actions
            },
            self.env.agent_2: {
                "own_obs": obs_dict[self.env.agent_2],
                "opponent_obs": obs_dict[self.env.agent_1],
                "opponent_action": 0,  # populated by fill_in_actions
            },
        }
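

# fill_in_actions is registered below as the `on_postprocess_traj` callback,
# so it runs once per agent trajectory after postprocessing. Its `info` dict
# carries the agent's postprocessed batch ("post_batch"), the agent id, and
# the pre-postprocessing batches of all agents ("all_pre_batches"), which is
# where the opponent's actions are read from.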


def fill_in_actions(info):
    """Callback that saves opponent actions into the agent obs.

    If you don't care about opponent actions you can leave this out."""
    to_update = info["post_batch"][SampleBatch.CUR_OBS]
    my_id = info["agent_id"]
    other_id = 1 if my_id == 0 else 0
    action_encoder = ModelCatalog.get_preprocessor_for_space(Discrete(2))

    # set the opponent actions into the observation
    _, opponent_batch = info["all_pre_batches"][other_id]
    opponent_actions = np.array([
        action_encoder.transform(a)
        for a in opponent_batch[SampleBatch.ACTIONS]
    ])
    to_update[:, -2:] = opponent_actions
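

# The tune.run() config below ties everything together: GlobalObsTwoStepGame
# as the env, the custom model registered as "cc_model", and fill_in_actions
# as the trajectory-postprocessing callback. "complete_episodes" batching
# ensures each sampled batch holds whole episodes for both agents, so the
# opponent's action at every timestep is available when the callback fires.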


if __name__ == "__main__":
    args = parser.parse_args()

    ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel)

    tune.run(
        "PPO",
        stop={
            "timesteps_total": args.stop,
            "episode_reward_mean": 7.99,
        },
        config={
            "env": GlobalObsTwoStepGame,
            "batch_mode": "complete_episodes",
            "callbacks": {
                "on_postprocess_traj": fill_in_actions,
            },
            "num_workers": 0,
            "multiagent": {
                "policies": {
                    "pol1": (None, GlobalObsTwoStepGame.observation_space,
                             GlobalObsTwoStepGame.action_space, {}),
                    "pol2": (None, GlobalObsTwoStepGame.observation_space,
                             GlobalObsTwoStepGame.action_space, {}),
                },
                "policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
            },
            "model": {
                "custom_model": "cc_model",
            },
        })