
# 356 lines
# 16 KiB

from abc import ABCMeta
import logging
import numpy as np
import re
from ray.rllib.agents.trainer import Trainer
from ray.rllib.examples.policy.random_policy import RandomPolicy
from ray.rllib.policy.policy import PolicySpec
from ray.rllib.utils.annotations import ExperimentalAPI, override
from ray.rllib.utils.numpy import softmax
from ray.rllib.utils.typing import TrainerConfigDict, ResultDict
logger = logging.getLogger(__name__)
@ExperimentalAPI
class LeagueBuilder(metaclass=ABCMeta):
    """Base class for objects that manage a Trainer's league of policies.

    The owning Trainer calls `build_league()` after each training step;
    subclasses put their league-building logic into that method.
    """

    def __init__(self, trainer: Trainer, trainer_config: TrainerConfigDict):
        """Initializes a LeagueBuilder instance.

        Args:
            trainer: The Trainer object by which this league builder is used.
                Trainer calls `build_league()` after each training step.
            trainer_config: The (not yet validated) config dict to be
                used on the Trainer. Child classes of `LeagueBuilder`
                should preprocess this to add e.g. multiagent settings
                to this config.
        """
        self.trainer = trainer
        self.config = trainer_config

    def build_league(self, result: ResultDict) -> None:
        """Method containing league-building logic. Called after train step.

        Args:
            result: The most recent result dict with all necessary stats in
                it (e.g. episode rewards) to perform league building
                operations.

        Raises:
            NotImplementedError: Always. Subclasses must override this method.
        """
        raise NotImplementedError
@ExperimentalAPI
class NoLeagueBuilder(LeagueBuilder):
    """A LeagueBuilder that does nothing.

    Useful for simple, non-league-building multi-agent setups.
    """

    # NOTE(review): the original docstring ended with a "See e.g." pointer
    # whose target is missing from the reviewed copy of this file; restore
    # the reference from the upstream source.

    def build_league(self, result: ResultDict) -> None:
        """Intentionally does nothing; the league is left unchanged.

        Args:
            result: The most recent result dict (ignored).
        """
        pass
@ExperimentalAPI
class AlphaStarLeagueBuilder(LeagueBuilder):
    """League builder maintaining main, main-exploiter and league-exploiter
    policies.

    The initial league consists of one learning `main_0` policy plus
    `league_exploiter_<n>` and `main_exploiter_<n>` policies (random ones
    first, learning ones after). After each training step, every trainable
    policy whose win-rate reaches `win_rate_threshold_for_new_snapshot` is
    snapshot'd into a new policy and the policy mapping function is replaced
    so that the new policy takes part in future matches.
    """

    def __init__(
        self,
        trainer: Trainer,
        trainer_config: TrainerConfigDict,
        num_random_policies: int = 2,
        num_learning_league_exploiters: int = 4,
        num_learning_main_exploiters: int = 4,
        win_rate_threshold_for_new_snapshot: float = 0.8,
        keep_new_snapshot_training_prob: float = 0.0,
        prob_league_exploiter_match: float = 0.33,
        prob_main_exploiter_match: float = 0.33,
        prob_main_exploiter_playing_against_learning_main: float = 0.5,
    ):
        """Initializes a AlphaStarLeagueBuilder instance.

        Args:
            trainer: The Trainer object by which this league builder is used.
                Trainer calls `build_league()` after each training step.
            trainer_config: The (not yet validated) config dict to be
                used on the Trainer. Child classes of `LeagueBuilder`
                should preprocess this to add e.g. multiagent settings
                to this config.
            num_random_policies: The number of random policies to add to the
                league. This must be an even number (including 0) as these
                will be evenly distributed amongst league- and main- exploiters.
            num_learning_league_exploiters: The number of learning
                league-exploiters to create.
            num_learning_main_exploiters: The number of learning
                main-exploiters to create.
            win_rate_threshold_for_new_snapshot: The win-rate to be achieved
                for a learning policy to get snapshot'd (forked into `self` +
                a new learning or non-learning copy of `self`).
            keep_new_snapshot_training_prob: The probability with which a new
                snapshot should keep training. Note that the policy from which
                this snapshot is taken will continue to train regardless.
            prob_league_exploiter_match: Probability of an episode to become a
                league-exploiter vs snapshot match.
            prob_main_exploiter_match: Probability of an episode to become a
                main-exploiter vs main match.
            prob_main_exploiter_playing_against_learning_main: Probability of
                a main-exploiter vs (training!) main match.

        Raises:
            AssertionError: If `num_random_policies` is odd or if
                `config.multiagent.policies` is already populated.
        """
        super().__init__(trainer, trainer_config)

        self.win_rate_threshold_for_new_snapshot = win_rate_threshold_for_new_snapshot
        self.keep_new_snapshot_training_prob = keep_new_snapshot_training_prob
        self.prob_league_exploiter_match = prob_league_exploiter_match
        self.prob_main_exploiter_match = prob_main_exploiter_match
        self.prob_main_exploiter_playing_against_learning_main = (
            prob_main_exploiter_playing_against_learning_main
        )

        assert num_random_policies % 2 == 0, (
            "ERROR: `num_random_policies` must be even number (we'll distribute "
            "these evenly amongst league- and main-exploiters)!"
        )

        # Build trainer's multiagent config.
        ma_config = self.config["multiagent"]
        # Make sure the multiagent config dict has no policies defined:
        assert not ma_config.get("policies"), (
            "ERROR: `config.multiagent.policies` should not be pre-defined! "
            "AlphaStarLeagueBuilder will construct this itself."
        )
        ma_config["policies"] = policies = {}

        # Half of the random policies go to each exploiter type. Integer
        # division keeps the counters below ints: they are later formatted
        # into policy IDs, and a float would yield IDs like
        # `league_exploiter_5.0`.
        num_random_per_type = num_random_policies // 2

        # Running counters; each one is the index the next policy of that
        # type will receive.
        self.main_policies = 1
        self.league_exploiters = num_learning_league_exploiters + num_random_per_type
        self.main_exploiters = num_learning_main_exploiters + num_random_per_type

        # Add 1 initial (learning) main policy.
        policies["main_0"] = PolicySpec()

        # Train all non-random policies that exist at beginning.
        ma_config["policies_to_train"] = ["main_0"]

        # Add random policies (these occupy the lowest indices).
        for i in range(num_random_per_type):
            policies[f"league_exploiter_{i}"] = PolicySpec(policy_class=RandomPolicy)
            policies[f"main_exploiter_{i}"] = PolicySpec(policy_class=RandomPolicy)
        # Add initial (learning) league-exploiters.
        for j in range(num_learning_league_exploiters):
            pid = f"league_exploiter_{num_random_per_type + j}"
            policies[pid] = PolicySpec()
            ma_config["policies_to_train"].append(pid)
        # Add initial (learning) main-exploiters. Uses the main-exploiter
        # count (the league-exploiter count was used here by mistake).
        for j in range(num_learning_main_exploiters):
            pid = f"main_exploiter_{num_random_per_type + j}"
            policies[pid] = PolicySpec()
            ma_config["policies_to_train"].append(pid)

        # Initial policy mapping function: main_0 vs main_exploiter_0.
        ma_config["policy_mapping_fn"] = (
            lambda aid, ep, worker, **kw: "main_0"
            if ep.episode_id % 2 == aid
            else "main_exploiter_0"
        )

    @override(LeagueBuilder)
    def build_league(self, result: ResultDict) -> None:
        """Updates win-rates and snapshots policies that play well enough.

        For every `policy_<id>_reward` entry in the hist stats, the win-rate
        (share of episodes with reward > 0.0) is stored in
        `self.trainer.win_rates`. Each trainable policy whose win-rate reaches
        `self.win_rate_threshold_for_new_snapshot` is copied into a new policy
        and a new policy mapping function is installed on the trainer.

        Args:
            result: The most recent result dict. Hist stats are read from
                `result["evaluation"]["hist_stats"]` if an "evaluation" key
                is present, otherwise from `result["hist_stats"]`.
        """
        local_worker = self.trainer.workers.local_worker()

        # If no evaluation results -> Use hist data gathered for training.
        if "evaluation" in result:
            hist_stats = result["evaluation"]["hist_stats"]
        else:
            hist_stats = result["hist_stats"]

        trainable_policies = local_worker.get_policies_to_train()
        non_trainable_policies = (
            set(local_worker.policy_map.keys()) - trainable_policies
        )

        logger.info(f"League building after iter {self.trainer.iteration}:")

        # Calculate current win-rates.
        for stats_key, rewards in hist_stats.items():
            mo = re.match("^policy_(.+)_reward$", stats_key)
            if mo is None:
                continue
            policy_id = mo.group(1)

            # No episodes recorded for this policy -> No win-rate to compute
            # (and avoid a division by zero).
            if not rewards:
                continue

            # Calculate this policy's win rate (win = 1.0; loss = -1.0).
            won = sum(1 for r in rewards if r > 0.0)
            win_rate = won / len(rewards)
            # TODO: This should probably be a running average
            #  (instead of hard-overriding it with the most recent data).
            self.trainer.win_rates[policy_id] = win_rate

            # Policy is a snapshot (frozen) -> Ignore.
            if policy_id not in trainable_policies:
                continue

            # `Logger.info()` has no `end` argument (unlike `print()`), so
            # the status line is assembled here and emitted as one record.
            log_prefix = f"\t{policy_id} win-rate={win_rate} -> "

            if win_rate < self.win_rate_threshold_for_new_snapshot:
                logger.info(log_prefix + "not good enough; will keep learning ...")
                continue

            # Win rate is good enough -> Snapshot current policy and decide,
            # whether to freeze the new snapshot or not.
            is_main = re.match("^main(_\\d+)?$", policy_id)

            # Probability that the new snapshot is trainable.
            keep_training_p = self.keep_new_snapshot_training_prob
            # For main, new snapshots are never trainable, for all others
            # use `config.keep_new_snapshot_training_prob` (default: 0.0!).
            keep_training = (
                False
                if is_main
                else np.random.choice(
                    [True, False], p=[keep_training_p, 1.0 - keep_training_p]
                )
            )
            # New league-exploiter policy.
            if policy_id.startswith("league_ex"):
                new_pol_id = re.sub(
                    "_\\d+$", f"_{self.league_exploiters}", policy_id
                )
                self.league_exploiters += 1
            # New main-exploiter policy.
            elif policy_id.startswith("main_ex"):
                new_pol_id = re.sub("_\\d+$", f"_{self.main_exploiters}", policy_id)
                self.main_exploiters += 1
            # New main policy snapshot.
            else:
                new_pol_id = re.sub("_\\d+$", f"_{self.main_policies}", policy_id)
                self.main_policies += 1

            # Register the new policy in the set the mapping function below
            # draws from.
            if keep_training:
                trainable_policies.add(new_pol_id)
            else:
                non_trainable_policies.add(new_pol_id)

            logger.info(
                log_prefix + f"adding new opponents to the mix ({new_pol_id}; "
                f"trainable={keep_training})."
            )

            num_main_policies = self.main_policies
            # Probabilities for the match types ["LE", "ME", "M"].
            probs_match_types = [
                self.prob_league_exploiter_match,
                self.prob_main_exploiter_match,
                1.0
                - self.prob_league_exploiter_match
                - self.prob_main_exploiter_match,
            ]
            prob_playing_learning_main = (
                self.prob_main_exploiter_playing_against_learning_main
            )

            # Update our mapping function accordingly.
            def policy_mapping_fn(agent_id, episode, worker, **kwargs):
                # Pick, whether this is:
                # LE: league-exploiter vs snapshot.
                # ME: main-exploiter vs (any) main.
                # M: Learning main vs itself.
                type_ = np.random.choice(["LE", "ME", "M"], p=probs_match_types)

                # Learning league exploiter vs a snapshot.
                # Opponent snapshots should be selected based on a win-rate-
                # derived probability.
                if type_ == "LE":
                    if episode.episode_id % 2 == agent_id:
                        league_exploiter = np.random.choice(
                            [
                                p
                                for p in trainable_policies
                                if p.startswith("league_ex")
                            ]
                        )
                        logger.debug(
                            f"Episode {episode.episode_id}: AgentID "
                            f"{agent_id} played by {league_exploiter} (training)"
                        )
                        return league_exploiter
                    # Play against any non-trainable policy (excluding itself).
                    else:
                        all_opponents = list(non_trainable_policies)
                        # NOTE(review): the win-rate lookup sat on a line
                        # that is missing from the reviewed copy. It is
                        # assumed that `worker.global_vars["win_rates"]`
                        # mirrors `trainer.win_rates` -- confirm.
                        probs = softmax(
                            [
                                worker.global_vars["win_rates"][pid]
                                for pid in all_opponents
                            ]
                        )
                        opponent = np.random.choice(all_opponents, p=probs)
                        logger.debug(
                            f"Episode {episode.episode_id}: AgentID "
                            f"{agent_id} played by {opponent} (frozen)"
                        )
                        return opponent

                # Learning main exploiter vs (learning main OR snapshot main).
                elif type_ == "ME":
                    if episode.episode_id % 2 == agent_id:
                        main_exploiter = np.random.choice(
                            [
                                p
                                for p in trainable_policies
                                if p.startswith("main_ex")
                            ]
                        )
                        logger.debug(
                            f"Episode {episode.episode_id}: AgentID "
                            f"{agent_id} played by {main_exploiter} (training)"
                        )
                        return main_exploiter
                    else:
                        # n% of the time, play against the learning main.
                        # Also always play against learning main if no
                        # non-learning mains have been created yet.
                        if num_main_policies == 1 or (
                            np.random.random() < prob_playing_learning_main
                        ):
                            main = "main_0"
                            training = "training"
                        # 100-n% of the time, play against a non-learning
                        # main. Opponent main snapshots should be selected
                        # based on a win-rate-derived probability.
                        else:
                            all_opponents = [
                                f"main_{p}" for p in range(1, num_main_policies)
                            ]
                            # NOTE(review): same assumed win-rate lookup as
                            # in the "LE" branch above -- confirm.
                            probs = softmax(
                                [
                                    worker.global_vars["win_rates"][pid]
                                    for pid in all_opponents
                                ]
                            )
                            main = np.random.choice(all_opponents, p=probs)
                            training = "frozen"
                        logger.debug(
                            f"Episode {episode.episode_id}: AgentID "
                            f"{agent_id} played by {main} ({training})"
                        )
                        return main

                # Main policy: Self-play.
                else:
                    logger.debug(f"Episode {episode.episode_id}: main_0 vs main_0")
                    return "main_0"

            # Add and set the weights of the new polic(y/ies).
            # NOTE(review): the `add_policy()` call below was on lines that
            # are missing from the reviewed copy; its keyword arguments are
            # reconstructed -- confirm against `Trainer.add_policy()`.
            source_policy = self.trainer.get_policy(policy_id)
            state = source_policy.get_state()
            self.trainer.add_policy(
                policy_id=new_pol_id,
                policy_cls=type(source_policy),
                policy_state=state,
                policy_mapping_fn=policy_mapping_fn,
                policies_to_train=trainable_policies,
            )