ray/rllib/policy/policy_map.py

import os
import pickle
import threading
from collections import deque
from typing import TYPE_CHECKING, Callable, Dict, Optional, Set, Type
import gym
from ray.rllib.policy.policy import PolicySpec
from ray.rllib.utils.annotations import PublicAPI, override
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.policy import create_policy_for_framework
from ray.rllib.utils.tf_utils import get_tf_eager_cls_if_necessary
from ray.rllib.utils.threading import with_lock
from ray.rllib.utils.typing import (
    AlgorithmConfigDict,
    PartialAlgorithmConfigDict,
    PolicyID,
)
from ray.tune.utils.util import merge_dicts

if TYPE_CHECKING:
    from ray.rllib.policy.policy import Policy

tf1, tf, tfv = try_import_tf()


@PublicAPI
class PolicyMap(dict):
    """Maps policy IDs to Policy objects.

    Keeps at most `capacity` policies in memory and, when that capacity is
    reached, writes the least recently used ones to disk. This allows adding
    hundreds of policies to an Algorithm for league-based setups without
    running out of memory.
    """

    def __init__(
        self,
        worker_index: int,
        num_workers: int,
        capacity: Optional[int] = None,
        path: Optional[str] = None,
        policy_config: Optional[AlgorithmConfigDict] = None,
        session_creator: Optional[Callable[[], "tf1.Session"]] = None,
        seed: Optional[int] = None,
    ):
        """Initializes a PolicyMap instance.

        Args:
            worker_index: The worker index of the RolloutWorker this map
                resides in.
            num_workers: The total number of remote workers in the WorkerSet
                to which this map's RolloutWorker belongs.
            capacity: The maximum number of policies to hold in memory.
                The least recently used ones are written to disk/S3 and
                retrieved when needed.
            path: The path to store the policy pickle files to. Files
                will have the name: [policy_id].[worker_index].policy.pkl.
            policy_config: The Algorithm's base config dict.
            session_creator: An optional tf1.Session creation callable.
            seed: An optional seed (used to seed tf policies).
        """
        super().__init__()

        self.worker_index = worker_index
        self.num_workers = num_workers
        self.session_creator = session_creator
        self.seed = seed

        # The file extension for stashed policies (that are no longer available
        # in-memory but can be reinstated any time from storage).
        self.extension = f".{self.worker_index}.policy.pkl"

        # Dictionary of keys that may be looked up (cached or not).
        self.valid_keys: Set[str] = set()
        # The actual cache with the in-memory policy objects.
        self.cache: Dict[str, Policy] = {}
        # The doubly-linked list holding the currently in-memory objects.
        self.deque = deque(maxlen=capacity or 10)
        # The file path where to store overflowing policies.
        self.path = path or "."
        # The core config to use. Each single policy's config override is
        # added on top of this.
        self.policy_config: AlgorithmConfigDict = policy_config or {}
        # The original classes/obs+act spaces, and config overrides of the
        # Policies.
        self.policy_specs: Dict[PolicyID, PolicySpec] = {}

        # Lock used for locking some methods on the object-level.
        # This prevents possible race conditions when accessing the map
        # and the underlying structures, like self.deque and others.
        self._lock = threading.RLock()

    def create_policy(
        self,
        policy_id: PolicyID,
        policy_cls: Type["Policy"],
        observation_space: gym.Space,
        action_space: gym.Space,
        config_override: PartialAlgorithmConfigDict,
        merged_config: AlgorithmConfigDict,
    ) -> None:
        """Creates a new policy and stores it to the cache.

        Args:
            policy_id: The policy ID. This is the key under which
                the created policy will be stored in this map.
            policy_cls: The (original) policy class to use.
                This may still be altered in case tf-eager (and tracing)
                is used.
            observation_space: The observation space of the policy.
            action_space: The action space of the policy.
            config_override: The config override dict for this policy.
                This is the partial dict provided by the user.
            merged_config: The entire config (merged
                default config + `config_override`).
        """
        _class = get_tf_eager_cls_if_necessary(policy_cls, merged_config)

        self[policy_id] = create_policy_for_framework(
            policy_id,
            _class,
            merged_config,
            observation_space,
            action_space,
            self.worker_index,
            self.session_creator,
            self.seed,
        )

        # Store spec (class, obs-space, act-space, and config overrides) such
        # that the map will be able to reproduce on-the-fly added policies
        # from disk.
        self.policy_specs[policy_id] = PolicySpec(
            policy_class=policy_cls,
            observation_space=observation_space,
            action_space=action_space,
            config=config_override,
        )
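
    # A sketch of how `config_override` and `merged_config` relate
    # (`base_config` and `MyPolicyClass` are placeholders). This mirrors what
    # `_read_from_disk()` below does when re-creating a stashed policy:
    #
    #   override = {"gamma": 0.95}
    #   merged = merge_dicts(base_config, override)
    #   policy_map.create_policy(
    #       "new_policy", MyPolicyClass, obs_space, act_space, override, merged
    #   )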

    @with_lock
    @override(dict)
    def __getitem__(self, item):
        # Never seen this key -> Error.
        if item not in self.valid_keys:
            raise KeyError(f"PolicyID '{item}' not found in this PolicyMap!")

        # Item already in cache -> Rearrange deque (least recently used) and
        # return.
        if item in self.cache:
            self.deque.remove(item)
            self.deque.append(item)
        # Item not currently in cache -> Get from disk and - if at capacity -
        # remove leftmost one.
        else:
            self._read_from_disk(policy_id=item)

        return self.cache[item]

    @with_lock
    @override(dict)
    def __setitem__(self, key, value):
        # Item already in cache -> Rearrange deque (least recently used).
        if key in self.cache:
            self.deque.remove(key)
            self.deque.append(key)
            self.cache[key] = value
        # Item not currently in cache -> Store new value and - if at capacity -
        # remove leftmost one.
        else:
            # Cache at capacity -> Drop leftmost item.
            if len(self.deque) == self.deque.maxlen:
                self._stash_to_disk()
            self.deque.append(key)
            self.cache[key] = value

        self.valid_keys.add(key)

    @with_lock
    @override(dict)
    def __delitem__(self, key):
        # Make key invalid.
        self.valid_keys.remove(key)
        # Remove policy from memory if currently cached.
        if key in self.cache:
            policy = self.cache[key]
            self._close_session(policy)
            del self.cache[key]
        # Remove file associated with the policy, if it exists.
        filename = self.path + "/" + key + self.extension
        if os.path.isfile(filename):
            os.remove(filename)

    @override(dict)
    def __iter__(self):
        return iter(self.keys())

    @override(dict)
    def items(self):
        """Iterates over all policies, even the stashed-to-disk ones."""

        def gen():
            for key in self.valid_keys:
                yield (key, self[key])

        return gen()

    @override(dict)
    def keys(self):
        self._lock.acquire()
        ks = list(self.valid_keys)
        self._lock.release()

        def gen():
            for key in ks:
                yield key

        return gen()

    @override(dict)
    def values(self):
        self._lock.acquire()
        vs = [self[k] for k in self.valid_keys]
        self._lock.release()

        def gen():
            for value in vs:
                yield value

        return gen()

    @with_lock
    @override(dict)
    def update(self, __m, **kwargs):
        for k, v in __m.items():
            self[k] = v
        for k, v in kwargs.items():
            self[k] = v

    @with_lock
    @override(dict)
    def get(self, key):
        if key not in self.valid_keys:
            return None
        return self[key]

    @with_lock
    @override(dict)
    def __len__(self):
        """Returns number of all policies, including the stashed-to-disk ones."""
        return len(self.valid_keys)

    @with_lock
    @override(dict)
    def __contains__(self, item):
        return item in self.valid_keys

    def _stash_to_disk(self):
        """Writes the least-recently used policy to disk and rearranges cache.

        Also closes the session - if applicable - of the stashed policy.
        """
        # Get least recently used policy (all the way on the left in deque).
        delkey = self.deque.popleft()
        policy = self.cache[delkey]
        # Get its state for writing to disk.
        policy_state = policy.get_state()
        # Close the policy's tf session, if any.
        self._close_session(policy)
        # Remove from memory. This will clear the tf Graph as well.
        del self.cache[delkey]
        # Write state to disk.
        with open(self.path + "/" + delkey + self.extension, "wb") as f:
            pickle.dump(policy_state, file=f)
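
    # For example (illustrative values): with `path="/tmp/policies"`,
    # `worker_index=1`, and a stashed policy ID of "pol_7", the pickle file
    # written above would be "/tmp/policies/pol_7.1.policy.pkl".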

    def _read_from_disk(self, policy_id):
        """Reads a policy ID from disk and re-adds it to the cache."""
        # Make sure this policy ID is not in the cache right now.
        assert policy_id not in self.cache
        # Read policy state from disk.
        with open(self.path + "/" + policy_id + self.extension, "rb") as f:
            policy_state = pickle.load(f)
        # Get class and config override.
        merged_conf = merge_dicts(
            self.policy_config, self.policy_specs[policy_id].config
        )
        # Create policy object (from its spec: cls, obs-space, act-space,
        # config).
        self.create_policy(
            policy_id,
            self.policy_specs[policy_id].policy_class,
            self.policy_specs[policy_id].observation_space,
            self.policy_specs[policy_id].action_space,
            self.policy_specs[policy_id].config,
            merged_conf,
        )
        # Restore policy's state.
        policy = self[policy_id]
        policy.set_state(policy_state)

    def _close_session(self, policy):
        sess = policy.get_session()
        # Close the tf session, if any.
        if sess is not None:
            sess.close()