from abc import ABCMeta
from collections import defaultdict
import logging
import numpy as np
import re
from typing import Any, DefaultDict, Dict

from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.examples.policy.random_policy import RandomPolicy
from ray.rllib.policy.policy import PolicySpec
from ray.rllib.utils.annotations import ExperimentalAPI, override
from ray.rllib.utils.numpy import softmax
from ray.rllib.utils.typing import PolicyID, AlgorithmConfigDict, ResultDict

logger = logging.getLogger(__name__)


@ExperimentalAPI
class LeagueBuilder(metaclass=ABCMeta):
    def __init__(self, trainer: Algorithm, trainer_config: AlgorithmConfigDict):
        """Initializes a LeagueBuilder instance.

        Args:
            trainer: The Algorithm object by which this league builder is used.
                Algorithm calls `build_league()` after each training step.
            trainer_config: The (not yet validated) config dict to be used on
                the Algorithm. Child classes of `LeagueBuilder` should
                preprocess this to add e.g. multiagent settings to this config.
        """
        self.trainer = trainer
        self.config = trainer_config

    def build_league(self, result: ResultDict) -> None:
        """Method containing league-building logic. Called after train step.

        Args:
            result: The most recent result dict with all necessary stats in
                it (e.g. episode rewards) to perform league building
                operations.
        """
        raise NotImplementedError

    def __getstate__(self) -> Dict[str, Any]:
        """Returns a state dict, mapping str keys to state variables.

        Returns:
            The current state dict of this LeagueBuilder.
        """
        return {}


@ExperimentalAPI
class NoLeagueBuilder(LeagueBuilder):
    """A LeagueBuilder that does nothing.

    Useful for simple, non-league-building multi-agent setups.
    See e.g.
    `rllib/tuned_examples/alpha_star/multi-agent-cart-pole-alpha-star.yaml`
    """

    def build_league(self, result: ResultDict) -> None:
        pass

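# A LeagueBuilder is normally not instantiated by hand but configured on the
# Algorithm. Rough usage sketch (illustrative only; it assumes AlphaStar's
# `league_builder_config` setting, whose exact name/keys may differ between
# RLlib versions):
#
#   from ray.rllib.algorithms.alpha_star import AlphaStarConfig
#
#   config = AlphaStarConfig().training(
#       league_builder_config={
#           # Class to use (any LeagueBuilder subclass, e.g. NoLeagueBuilder).
#           "type": AlphaStarLeagueBuilder,
#           # All remaining keys are passed as kwargs to that class' constructor.
#           "win_rate_threshold_for_new_snapshot": 0.8,
#       },
#   )
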
@ExperimentalAPI
class AlphaStarLeagueBuilder(LeagueBuilder):
    def __init__(
        self,
        trainer: Algorithm,
        trainer_config: AlgorithmConfigDict,
        num_random_policies: int = 2,
        num_learning_league_exploiters: int = 4,
        num_learning_main_exploiters: int = 4,
        win_rate_threshold_for_new_snapshot: float = 0.8,
        keep_new_snapshot_training_prob: float = 0.0,
        prob_league_exploiter_match: float = 0.33,
        prob_main_exploiter_match: float = 0.33,
        prob_main_exploiter_playing_against_learning_main: float = 0.5,
    ):
        """Initializes an AlphaStarLeagueBuilder instance.

        The following match types are possible:
        LE: A learning (not snapshot) league_exploiter vs any snapshot policy.
        ME: A learning (not snapshot) main exploiter vs any main.
        M: Main self-play (main vs main).

        Args:
            trainer: The Algorithm object by which this league builder is used.
                Algorithm calls `build_league()` after each training step to
                reconfigure the league structure (e.g. to add/remove policies).
            trainer_config: The (not yet validated) config dict to be used on
                the Algorithm. Child classes of `LeagueBuilder` should
                preprocess this to add e.g. multiagent settings to this config.
            num_random_policies: The number of random policies to add to the
                league. This must be an even number (including 0) as these
                will be evenly distributed amongst league- and main-exploiters.
            num_learning_league_exploiters: The number of initially learning
                league-exploiters to create.
            num_learning_main_exploiters: The number of initially learning
                main-exploiters to create.
            win_rate_threshold_for_new_snapshot: The win-rate to be achieved
                for a learning policy to be snapshotted (forked into `self`
                plus a new learning or non-learning copy of `self`).
            keep_new_snapshot_training_prob: The probability with which a new
                snapshot should keep training. Note that the policy from which
                this snapshot is taken will continue to train regardless.
            prob_league_exploiter_match: Probability of an episode becoming a
                league-exploiter vs snapshot match.
            prob_main_exploiter_match: Probability of an episode becoming a
                main-exploiter vs main match.
            prob_main_exploiter_playing_against_learning_main: Probability
                that a main-exploiter match is played against the (still
                training) main policy, as opposed to a frozen main snapshot.
        """
        super().__init__(trainer, trainer_config)

        self.win_rate_threshold_for_new_snapshot = win_rate_threshold_for_new_snapshot
        self.keep_new_snapshot_training_prob = keep_new_snapshot_training_prob
        self.prob_league_exploiter_match = prob_league_exploiter_match
        self.prob_main_exploiter_match = prob_main_exploiter_match
        self.prob_main_exploiter_playing_against_learning_main = (
            prob_main_exploiter_playing_against_learning_main
        )
        # Store the win rates for league overview printouts.
        self.win_rates: DefaultDict[PolicyID, float] = defaultdict(float)

        assert num_random_policies % 2 == 0, (
            "ERROR: `num_random_policies` must be an even number (we'll "
            "distribute these evenly amongst league- and main-exploiters)!"
        )

        # Build trainer's multiagent config.
        ma_config = self.config["multiagent"]
        # Make sure the multiagent config dict has no policies defined:
        assert not ma_config.get("policies"), (
            "ERROR: `config.multiagent.policies` should not be pre-defined! "
            "AlphaStarLeagueBuilder will construct this itself."
        )
        ma_config["policies"] = policies = {}

        self.main_policies = 1
        self.league_exploiters = (
            num_learning_league_exploiters + num_random_policies // 2
        )
        self.main_exploiters = num_learning_main_exploiters + num_random_policies // 2

        # Add 1 initial (learning) main policy.
        policies["main_0"] = PolicySpec()

        # Train all non-random policies that exist at beginning.
        ma_config["policies_to_train"] = ["main_0"]

        # Add random policies.
        i = -1
        for i in range(num_random_policies // 2):
            policies[f"league_exploiter_{i}"] = PolicySpec(policy_class=RandomPolicy)
            policies[f"main_exploiter_{i}"] = PolicySpec(policy_class=RandomPolicy)

        # Add initial (learning) league-exploiters.
        for j in range(num_learning_league_exploiters):
            pid = f"league_exploiter_{j + i + 1}"
            policies[pid] = PolicySpec()
            ma_config["policies_to_train"].append(pid)

        # Add initial (learning) main-exploiters.
        for j in range(num_learning_main_exploiters):
            pid = f"main_exploiter_{j + i + 1}"
            policies[pid] = PolicySpec()
            ma_config["policies_to_train"].append(pid)
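        # Illustration (derived from the constructor defaults above, assuming
        # no arguments were overridden): the initial league then contains
        #   main_0                                    -> learning
        #   league_exploiter_0, main_exploiter_0      -> RandomPolicy (frozen)
        #   league_exploiter_1 ... league_exploiter_4 -> learning
        #   main_exploiter_1 ... main_exploiter_4     -> learning
        # i.e. 11 policies in total, 9 of which are in `policies_to_train`.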
        # Build initial policy mapping function: main_0 vs main_exploiter_0.
        ma_config["policy_mapping_fn"] = (
            lambda aid, ep, worker, **kw: "main_0"
            if ep.episode_id % 2 == aid
            else "main_exploiter_0"
        )

    @override(LeagueBuilder)
    def build_league(self, result: ResultDict) -> None:
        local_worker = self.trainer.workers.local_worker()

        # If no evaluation results -> Use hist data gathered for training.
        if "evaluation" in result:
            hist_stats = result["evaluation"]["hist_stats"]
        else:
            hist_stats = result["hist_stats"]

        # TODO: Add example on how to use callable here, instead of updating
        #  policies_to_train via this simple set.
        trainable_policies = local_worker.get_policies_to_train()
        non_trainable_policies = (
            set(local_worker.policy_map.keys()) - trainable_policies
        )

        logger.info(f"League building after iter {self.trainer.iteration}:")

        # Calculate current win-rates.
        for policy_id, rew in hist_stats.items():
            mo = re.match("^policy_(.+)_reward$", policy_id)
            if mo is None:
                continue
            policy_id = mo.group(1)

            # Calculate this policy's win rate.
            won = 0
            for r in rew:
                if r > 0.0:  # win = 1.0; loss = -1.0
                    won += 1
            win_rate = won / len(rew)
            # TODO: This should probably be a running average
            #  (instead of hard-overriding it with the most recent data).
            self.win_rates[policy_id] = win_rate

            # Policy is a snapshot (frozen) -> Ignore.
            if policy_id not in trainable_policies:
                continue

            logger.info(f"\t{policy_id} win-rate={win_rate} -> ")

            # If win rate is good enough -> Snapshot current policy and decide
            # whether to freeze the new snapshot or not.
            if win_rate >= self.win_rate_threshold_for_new_snapshot:
                is_main = re.match("^main(_\\d+)?$", policy_id)

                # Probability that the new snapshot is trainable.
                keep_training_p = self.keep_new_snapshot_training_prob
                # For main, new snapshots are never trainable, for all others
                # use `config.keep_new_snapshot_training_prob` (default: 0.0!).
                keep_training = (
                    False
                    if is_main
                    else np.random.choice(
                        [True, False], p=[keep_training_p, 1.0 - keep_training_p]
                    )
                )
                # New league-exploiter policy.
                if policy_id.startswith("league_ex"):
                    new_pol_id = re.sub(
                        "_\\d+$", f"_{self.league_exploiters}", policy_id
                    )
                    self.league_exploiters += 1
                # New main-exploiter policy.
                elif policy_id.startswith("main_ex"):
                    new_pol_id = re.sub(
                        "_\\d+$", f"_{self.main_exploiters}", policy_id
                    )
                    self.main_exploiters += 1
                # New main policy snapshot.
                else:
                    new_pol_id = re.sub(
                        "_\\d+$", f"_{self.main_policies}", policy_id
                    )
                    self.main_policies += 1

                if keep_training:
                    trainable_policies.add(new_pol_id)
                else:
                    non_trainable_policies.add(new_pol_id)

                logger.info(
                    f"adding new opponents to the mix ({new_pol_id}; "
                    f"trainable={keep_training})."
                )

                num_main_policies = self.main_policies
                probs_match_types = [
                    self.prob_league_exploiter_match,
                    self.prob_main_exploiter_match,
                    1.0
                    - self.prob_league_exploiter_match
                    - self.prob_main_exploiter_match,
                ]
                prob_playing_learning_main = (
                    self.prob_main_exploiter_playing_against_learning_main
                )

                # Update our mapping function accordingly.
                def policy_mapping_fn(agent_id, episode, worker, **kwargs):
                    # Pick, whether this is:
                    # LE: league-exploiter vs snapshot.
                    # ME: main-exploiter vs (any) main.
                    # M: Learning main vs itself.
                    type_ = np.random.choice(["LE", "ME", "M"], p=probs_match_types)

                    # Learning league exploiter vs a snapshot.
                    # Opponent snapshots should be selected based on a
                    # win-rate-derived probability.
                    if type_ == "LE":
                        if episode.episode_id % 2 == agent_id:
                            league_exploiter = np.random.choice(
                                [
                                    p
                                    for p in trainable_policies
                                    if p.startswith("league_ex")
                                ]
                            )
                            logger.debug(
                                f"Episode {episode.episode_id}: AgentID "
                                f"{agent_id} played by {league_exploiter} (training)"
                            )
                            return league_exploiter
                        # Play against any non-trainable policy (excluding itself).
                        else:
                            all_opponents = list(non_trainable_policies)
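                            # Worked example (illustrative): for two frozen
                            # opponents with stored win-rates 0.9 and 0.1,
                            # softmax yields sampling probabilities of roughly
                            # 0.69 and 0.31, i.e. stronger snapshots are picked
                            # more often, but weaker ones still get matches.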
                            probs = softmax(
                                [
                                    worker.global_vars["win_rates"][pid]
                                    for pid in all_opponents
                                ]
                            )
                            opponent = np.random.choice(all_opponents, p=probs)
                            logger.debug(
                                f"Episode {episode.episode_id}: AgentID "
                                f"{agent_id} played by {opponent} (frozen)"
                            )
                            return opponent

                    # Learning main exploiter vs (learning main OR snapshot main).
                    elif type_ == "ME":
                        if episode.episode_id % 2 == agent_id:
                            main_exploiter = np.random.choice(
                                [
                                    p
                                    for p in trainable_policies
                                    if p.startswith("main_ex")
                                ]
                            )
                            logger.debug(
                                f"Episode {episode.episode_id}: AgentID "
                                f"{agent_id} played by {main_exploiter} (training)"
                            )
                            return main_exploiter
                        else:
                            # n% of the time, play against the learning main.
                            # Also always play against the learning main if no
                            # non-learning mains have been created yet.
                            if num_main_policies == 1 or (
                                np.random.random() < prob_playing_learning_main
                            ):
                                main = "main_0"
                                training = "training"
                            # 100-n% of the time, play against a non-learning
                            # main. Opponent main snapshots should be selected
                            # based on a win-rate-derived probability.
                            else:
                                all_opponents = [
                                    f"main_{p}"
                                    for p in list(range(1, num_main_policies))
                                ]
                                probs = softmax(
                                    [
                                        worker.global_vars["win_rates"][pid]
                                        for pid in all_opponents
                                    ]
                                )
                                main = np.random.choice(all_opponents, p=probs)
                                training = "frozen"

                            logger.debug(
                                f"Episode {episode.episode_id}: AgentID "
                                f"{agent_id} played by {main} ({training})"
                            )
                            return main

                    # Main policy: Self-play.
                    else:
                        logger.debug(
                            f"Episode {episode.episode_id}: main_0 vs main_0"
                        )
                        return "main_0"

                # Add and set the weights of the new polic(y/ies).
                state = self.trainer.get_policy(policy_id).get_state()
                self.trainer.add_policy(
                    policy_id=new_pol_id,
                    policy_cls=type(self.trainer.get_policy(policy_id)),
                    policy_state=state,
                    policy_mapping_fn=policy_mapping_fn,
                    policies_to_train=trainable_policies,
                )

            else:
                logger.info("not good enough; will keep learning ...")

    def __getstate__(self) -> Dict[str, Any]:
        return {
            "win_rates": self.win_rates,
            "main_policies": self.main_policies,
            "league_exploiters": self.league_exploiters,
            "main_exploiters": self.main_exploiters,
        }

    def __setstate__(self, state) -> None:
        self.win_rates = state["win_rates"]
        self.main_policies = state["main_policies"]
        self.league_exploiters = state["league_exploiters"]
        self.main_exploiters = state["main_exploiters"]
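
# Rough sketch of how an Algorithm is expected to drive a league builder
# (illustrative only; the actual call sites live inside AlphaStar's training
# loop, not in user code; `algo`, `algo_config` and `num_iterations` are
# placeholder names):
#
#   league_builder = AlphaStarLeagueBuilder(algo, algo_config)
#   for _ in range(num_iterations):
#       result = algo.train()
#       # Re-shape the league (possibly adding new snapshot policies).
#       league_builder.build_league(result)
#
#   # The builder's counters and win-rates can be checkpointed alongside the
#   # Algorithm via `__getstate__()` / `__setstate__()`.
#   saved = league_builder.__getstate__()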