ray/rllib/examples/env/coin_game_non_vectorized_env.py

##########
# Contribution by the Center on Long-Term Risk:
# https://github.com/longtermrisk/marltoolbox
##########
import copy
from collections.abc import Iterable
import gym
import logging
import numpy as np
from gym.spaces import Discrete
from gym.utils import seeding
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.utils import override
from typing import Dict, Optional
from ray.rllib.examples.env.utils.interfaces import InfoAccumulationInterface
logger = logging.getLogger(__name__)
class CoinGame(InfoAccumulationInterface, MultiAgentEnv, gym.Env):
"""
    Coin Game environment: a two-player grid world in which a single coin,
    either red or blue, is present at any time. The player that picks up a
    coin always gets +1; picking up a coin of the other player's color gives
    that other player -2.
"""
NAME = "CoinGame"
NUM_AGENTS = 2
NUM_ACTIONS = 4
ACTION_SPACE = Discrete(NUM_ACTIONS)
OBSERVATION_SPACE = None
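    # (row, col) offsets applied to a player's position for each of the four
    # discrete actions; movement wraps around the grid (modulo grid_size).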
MOVES = [
np.array([0, 1]),
np.array([0, -1]),
np.array([1, 0]),
np.array([-1, 0]),
]
def __init__(self, config: Optional[Dict] = None):
if config is None:
config = {}
self._validate_config(config)
self._load_config(config)
self.player_red_id, self.player_blue_id = self.players_ids
self.n_features = self.grid_size**2 * (2 * self.NUM_AGENTS)
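        # Observation: one-hot grid with 4 channels
        # [red player pos, blue player pos, red coin pos, blue coin pos];
        # the player and coin channels are swapped in the blue player's view.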
self.OBSERVATION_SPACE = gym.spaces.Box(
low=0,
high=1,
shape=(self.grid_size, self.grid_size, 4),
dtype="uint8")
self.step_count_in_current_episode = None
if self.output_additional_info:
self._init_info()
self.seed(seed=config.get("seed", None))
def _validate_config(self, config):
if "players_ids" in config:
assert isinstance(config["players_ids"], Iterable)
assert len(config["players_ids"]) == self.NUM_AGENTS
def _load_config(self, config):
self.players_ids = \
config.get("players_ids", ["player_red", "player_blue"])
self.max_steps = config.get("max_steps", 20)
self.grid_size = config.get("grid_size", 3)
self.output_additional_info = config.get("output_additional_info",
True)
self.asymmetric = config.get("asymmetric", False)
self.both_players_can_pick_the_same_coin = \
config.get("both_players_can_pick_the_same_coin", True)
@override(gym.Env)
def seed(self, seed=None):
"""Seed the PRNG of this space. """
self.np_random, seed = seeding.np_random(seed)
return [seed]
@override(gym.Env)
def reset(self):
self.step_count_in_current_episode = 0
if self.output_additional_info:
self._reset_info()
self._randomize_color_and_player_positions()
self._generate_coin()
obs = self._generate_observation()
return {self.player_red_id: obs[0], self.player_blue_id: obs[1]}
def _randomize_color_and_player_positions(self):
# Reset coin color and the players and coin positions
self.red_coin = self.np_random.randint(low=0, high=2)
self.red_pos = \
self.np_random.randint(low=0, high=self.grid_size, size=(2,))
self.blue_pos = \
self.np_random.randint(low=0, high=self.grid_size, size=(2,))
self.coin_pos = np.zeros(shape=(2, ), dtype=np.int8)
self._players_do_not_overlap_at_start()
def _players_do_not_overlap_at_start(self):
while self._same_pos(self.red_pos, self.blue_pos):
self.blue_pos = self.np_random.randint(self.grid_size, size=2)
def _generate_coin(self):
self._switch_between_coin_color_at_each_generation()
self._coin_position_different_from_players_positions()
def _switch_between_coin_color_at_each_generation(self):
self.red_coin = 1 - self.red_coin
    def _coin_position_different_from_players_positions(self):
        # Rejection sampling: redraw the coin position until it overlaps
        # neither player (success reaches NUM_AGENTS only in that case).
        success = 0
        while success < self.NUM_AGENTS:
            self.coin_pos = self.np_random.randint(self.grid_size, size=2)
            success = 1 - self._same_pos(self.red_pos, self.coin_pos)
            success += 1 - self._same_pos(self.blue_pos, self.coin_pos)
def _generate_observation(self):
obs = np.zeros((self.grid_size, self.grid_size, 4))
obs[self.red_pos[0], self.red_pos[1], 0] = 1
obs[self.blue_pos[0], self.blue_pos[1], 1] = 1
if self.red_coin:
obs[self.coin_pos[0], self.coin_pos[1], 2] = 1
else:
obs[self.coin_pos[0], self.coin_pos[1], 3] = 1
obs = self._get_obs_invariant_to_the_player_trained(obs)
return obs
@override(gym.Env)
def step(self, actions: Dict):
"""
        :param actions: Dict mapping each player id to its discrete action.
        :return: observations, rewards, done, info (dicts keyed by player id)
"""
actions = self._from_RLLib_API_to_list(actions)
self.step_count_in_current_episode += 1
self._move_players(actions)
reward_list, generate_new_coin = self._compute_reward()
if generate_new_coin:
self._generate_coin()
observations = self._generate_observation()
return self._to_RLLib_API(observations, reward_list)
def _same_pos(self, x, y):
return (x == y).all()
def _move_players(self, actions):
self.red_pos = (self.red_pos + self.MOVES[actions[0]]) % self.grid_size
self.blue_pos = (
self.blue_pos + self.MOVES[actions[1]]) % self.grid_size
def _compute_reward(self):
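        """
        Compute per-step rewards for both players.

        The player that picks up the coin always gets +1. When the picked
        coin's color is the other player's color, that other player gets -2.
        In the asymmetric variant, red gets an extra +3 for every coin it
        picks up.
        """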
reward_red = 0.0
reward_blue = 0.0
generate_new_coin = False
red_pick_any, red_pick_red, blue_pick_any, blue_pick_blue = \
False, False, False, False
red_first_if_both = None
if not self.both_players_can_pick_the_same_coin:
if self._same_pos(self.red_pos, self.coin_pos) and \
self._same_pos(self.blue_pos, self.coin_pos):
red_first_if_both = bool(self.np_random.randint(low=0, high=2))
if self.red_coin:
if self._same_pos(self.red_pos, self.coin_pos) and \
(red_first_if_both is None or red_first_if_both):
generate_new_coin = True
reward_red += 1
if self.asymmetric:
reward_red += 3
red_pick_any = True
red_pick_red = True
if self._same_pos(self.blue_pos, self.coin_pos) and \
(red_first_if_both is None or not red_first_if_both):
generate_new_coin = True
reward_red += -2
reward_blue += 1
blue_pick_any = True
else:
if self._same_pos(self.red_pos, self.coin_pos) and \
(red_first_if_both is None or red_first_if_both):
generate_new_coin = True
reward_red += 1
reward_blue += -2
if self.asymmetric:
reward_red += 3
red_pick_any = True
if self._same_pos(self.blue_pos, self.coin_pos) and \
(red_first_if_both is None or not red_first_if_both):
generate_new_coin = True
reward_blue += 1
blue_pick_blue = True
blue_pick_any = True
reward_list = [reward_red, reward_blue]
if self.output_additional_info:
self._accumulate_info(
red_pick_any=red_pick_any,
red_pick_red=red_pick_red,
blue_pick_any=blue_pick_any,
blue_pick_blue=blue_pick_blue)
return reward_list, generate_new_coin
def _from_RLLib_API_to_list(self, actions):
"""
        Format actions from a dict keyed by player id into a list ordered
        like self.players_ids.
"""
actions = [actions[player_id] for player_id in self.players_ids]
return actions
def _get_obs_invariant_to_the_player_trained(self, observation):
"""
We want to be able to use a policy trained as player 1,
for evaluation as player 2 and vice versa.
"""
# player_red_observation contains
# [Red pos, Blue pos, Red coin pos, Blue coin pos]
player_red_observation = observation
# After modification, player_blue_observation will contain
# [Blue pos, Red pos, Blue coin pos, Red coin pos]
player_blue_observation = copy.deepcopy(observation)
player_blue_observation[..., 0] = observation[..., 1]
player_blue_observation[..., 1] = observation[..., 0]
player_blue_observation[..., 2] = observation[..., 3]
player_blue_observation[..., 3] = observation[..., 2]
return [player_red_observation, player_blue_observation]
def _to_RLLib_API(self, observations, rewards):
state = {
self.player_red_id: observations[0],
self.player_blue_id: observations[1],
}
rewards = {
self.player_red_id: rewards[0],
self.player_blue_id: rewards[1],
}
epi_is_done = (self.step_count_in_current_episode >= self.max_steps)
if self.step_count_in_current_episode > self.max_steps:
logger.warning(
"step_count_in_current_episode > self.max_steps: "
f"{self.step_count_in_current_episode} > {self.max_steps}")
done = {
self.player_red_id: epi_is_done,
self.player_blue_id: epi_is_done,
"__all__": epi_is_done,
}
if epi_is_done and self.output_additional_info:
player_red_info, player_blue_info = self._get_episode_info()
info = {
self.player_red_id: player_red_info,
self.player_blue_id: player_blue_info,
}
else:
info = {}
return state, rewards, done, info
@override(InfoAccumulationInterface)
def _get_episode_info(self):
"""
Output the following information:
pick_speed is the fraction of steps during which the player picked a
coin.
pick_own_color is the fraction of coins picked by the player which have
the same color as the player.
"""
player_red_info, player_blue_info = {}, {}
if len(self.red_pick) > 0:
red_pick = sum(self.red_pick)
player_red_info["pick_speed"] = red_pick / len(self.red_pick)
if red_pick > 0:
player_red_info["pick_own_color"] = \
sum(self.red_pick_own) / red_pick
if len(self.blue_pick) > 0:
blue_pick = sum(self.blue_pick)
player_blue_info["pick_speed"] = blue_pick / len(self.blue_pick)
if blue_pick > 0:
player_blue_info["pick_own_color"] = \
sum(self.blue_pick_own) / blue_pick
return player_red_info, player_blue_info
@override(InfoAccumulationInterface)
def _reset_info(self):
self.red_pick.clear()
self.red_pick_own.clear()
self.blue_pick.clear()
self.blue_pick_own.clear()
@override(InfoAccumulationInterface)
def _accumulate_info(self, red_pick_any, red_pick_red, blue_pick_any,
blue_pick_blue):
self.red_pick.append(red_pick_any)
self.red_pick_own.append(red_pick_red)
self.blue_pick.append(blue_pick_any)
self.blue_pick_own.append(blue_pick_blue)
@override(InfoAccumulationInterface)
def _init_info(self):
self.red_pick = []
self.red_pick_own = []
self.blue_pick = []
self.blue_pick_own = []
class AsymCoinGame(CoinGame):
NAME = "AsymCoinGame"
def __init__(self, config: Optional[dict] = None):
if config is None:
config = {}
if "asymmetric" in config:
assert config["asymmetric"]
else:
config["asymmetric"] = True
super().__init__(config)
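

if __name__ == "__main__":
    # Minimal usage sketch: run one episode of CoinGame with uniformly
    # random actions. The config keys used here ("max_steps", "grid_size",
    # "seed") are the ones read in _load_config / __init__ above; the
    # printed values are illustrative only.
    env = CoinGame(config={"max_steps": 20, "grid_size": 3, "seed": 0})
    obs = env.reset()
    # The blue player's observation is the red player's observation with the
    # player channels (0, 1) and the coin channels (2, 3) swapped; see
    # _get_obs_invariant_to_the_player_trained.
    assert (obs[env.player_blue_id][..., 0] ==
            obs[env.player_red_id][..., 1]).all()

    done = {"__all__": False}
    total_rewards = {player_id: 0.0 for player_id in env.players_ids}
    while not done["__all__"]:
        actions = {
            player_id: env.ACTION_SPACE.sample()
            for player_id in env.players_ids
        }
        obs, rewards, done, info = env.step(actions)
        for player_id, reward in rewards.items():
            total_rewards[player_id] += reward

    # On the final step, info contains "pick_speed" (and "pick_own_color"
    # if at least one coin was picked) for each player.
    print("total rewards:", total_rewards)
    print("episode info:", info)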