import collections
import logging
import numpy as np
from typing import Any, Dict, List, Optional, Tuple, TYPE_CHECKING, Union

import ray
from ray import ObjectRef
from ray.actor import ActorHandle
from ray.rllib.offline.estimators.off_policy_estimator import OffPolicyEstimate
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
from ray.rllib.utils.annotations import DeveloperAPI
from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY
from ray.rllib.utils.typing import GradInfoDict, LearnerStatsDict, ResultDict

if TYPE_CHECKING:
    from ray.rllib.evaluation.rollout_worker import RolloutWorker

logger = logging.getLogger(__name__)


RolloutMetrics = DeveloperAPI(
    collections.namedtuple(
        "RolloutMetrics",
        [
            "episode_length",
            "episode_reward",
            "agent_rewards",
            "custom_metrics",
            "perf_stats",
            "hist_data",
            "media",
        ],
    )
)
RolloutMetrics.__new__.__defaults__ = (0, 0, {}, {}, {}, {}, {})
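
# A minimal sketch (values hypothetical) of the metrics tuple a sampler emits
# for one finished episode; unset fields fall back to the defaults above:
#
#     m = RolloutMetrics(
#         episode_length=200,
#         episode_reward=195.5,
#         agent_rewards={("agent0", "default_policy"): 195.5},
#     )
#     assert m.custom_metrics == {} and m.media == {}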


def _extract_stats(stats: Dict, key: str) -> Dict[str, Any]:
    """Returns `stats[key]`, or collects `key` from per-policy sub-dicts.

    If `key` is not found at the top level, the stats are assumed to be
    multi-agent (one sub-dict per policy ID), and `key` is looked up in each
    sub-dict instead.
    """
    if key in stats:
        return stats[key]

    multiagent_stats = {}
    for k, v in stats.items():
        if isinstance(v, dict):
            if key in v:
                multiagent_stats[k] = v[key]

    return multiagent_stats
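
# For illustration (hypothetical inputs): a single-agent dict returns the
# nested stats directly, e.g.
#     _extract_stats({"learner_stats": {"vf_loss": 0.1}}, "learner_stats")
#     -> {"vf_loss": 0.1}
# while a multi-agent dict keyed by policy ID returns one entry per policy:
#     _extract_stats({"pol1": {"learner_stats": {"vf_loss": 0.1}}}, "learner_stats")
#     -> {"pol1": {"vf_loss": 0.1}}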


@DeveloperAPI
def get_learner_stats(grad_info: GradInfoDict) -> LearnerStatsDict:
    """Returns the optimization stats reported from the policy.

    Example:
        >>> grad_info = worker.learn_on_batch(samples)
        >>> print(grad_info)
        {"td_error": [...], "learner_stats": {"vf_loss": ..., ...}}
        >>> print(get_learner_stats(grad_info))
        {"vf_loss": ..., "policy_loss": ...}
    """
    if LEARNER_STATS_KEY in grad_info:
        return grad_info[LEARNER_STATS_KEY]

    multiagent_stats = {}
    for k, v in grad_info.items():
        if isinstance(v, dict):
            if LEARNER_STATS_KEY in v:
                multiagent_stats[k] = v[LEARNER_STATS_KEY]

    return multiagent_stats


@DeveloperAPI
def collect_metrics(
    local_worker: Optional["RolloutWorker"] = None,
    remote_workers: Optional[List[ActorHandle]] = None,
    to_be_collected: Optional[List[ObjectRef]] = None,
    timeout_seconds: int = 180,
    keep_custom_metrics: bool = False,
) -> ResultDict:
    """Gathers episode metrics from RolloutWorker instances."""
    if remote_workers is None:
        remote_workers = []
    if to_be_collected is None:
        to_be_collected = []

    episodes, to_be_collected = collect_episodes(
        local_worker, remote_workers, to_be_collected, timeout_seconds=timeout_seconds
    )
    # Everything gathered in this call counts as newly collected, so pass
    # `episodes` as both the smoothing-window list and the new-episodes list.
    metrics = summarize_episodes(
        episodes, episodes, keep_custom_metrics=keep_custom_metrics
    )
    return metrics
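
# A usage sketch (hedged): `workers` is assumed to be an already-built
# WorkerSet taken from an RLlib Trainer:
#
#     result = collect_metrics(
#         local_worker=workers.local_worker(),
#         remote_workers=workers.remote_workers(),
#         timeout_seconds=60,
#     )
#     print(result["episode_reward_mean"])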


@DeveloperAPI
def collect_episodes(
    local_worker: Optional["RolloutWorker"] = None,
    remote_workers: Optional[List[ActorHandle]] = None,
    to_be_collected: Optional[List[ObjectRef]] = None,
    timeout_seconds: int = 180,
) -> Tuple[List[Union[RolloutMetrics, OffPolicyEstimate]], List[ObjectRef]]:
    """Gathers new episode metrics tuples from the given RolloutWorkers.

    Args:
        local_worker: The local RolloutWorker (if any). By default, evaluation
            WorkerSets don't have a local worker anymore (not needed).
        remote_workers: List of ActorHandles pointing to remote RolloutWorkers.
        to_be_collected: ObjectRefs from a previous call that were still
            pending and should be awaited again.
        timeout_seconds: Max. time to wait for the remote metrics requests.

    Returns:
        A tuple of the collected metrics tuples and the ObjectRefs that are
        still pending after the timeout.
    """
    if remote_workers is None:
        remote_workers = []
    if to_be_collected is None:
        to_be_collected = []

    if remote_workers:
        pending = [
            a.apply.remote(lambda ev: ev.get_metrics()) for a in remote_workers
        ] + to_be_collected
        collected, to_be_collected = ray.wait(
            pending, num_returns=len(pending), timeout=timeout_seconds * 1.0
        )
        if pending and len(collected) == 0:
            logger.warning(
                "WARNING: collected no metrics in {} seconds".format(timeout_seconds)
            )
        metric_lists = ray.get(collected)
    else:
        metric_lists = []

    if local_worker:
        metric_lists.append(local_worker.get_metrics())

    episodes = []
    for metrics in metric_lists:
        episodes.extend(metrics)
    return episodes, to_be_collected
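
# A sketch of the intended polling pattern (caller-side, assumed): ObjectRefs
# that miss the timeout are fed back in on the next call, so their metrics are
# picked up later rather than dropped:
#
#     pending = []
#     while training:
#         episodes, pending = collect_episodes(
#             local_worker, remote_workers, pending, timeout_seconds=30
#         )
#         ...  # process `episodes`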


@DeveloperAPI
def summarize_episodes(
    episodes: List[Union[RolloutMetrics, OffPolicyEstimate]],
    new_episodes: Optional[List[Union[RolloutMetrics, OffPolicyEstimate]]] = None,
    keep_custom_metrics: bool = False,
) -> ResultDict:
    """Summarizes a set of episode metrics tuples.

    Args:
        episodes: List of most recent n episodes. This may include historical
            ones (not newly collected in this iteration) in order to achieve
            the size of the smoothing window.
        new_episodes: All the episodes that were completed in this iteration.
            Defaults to `episodes` itself.
        keep_custom_metrics: Whether to leave custom metrics as raw lists
            instead of reducing them to mean/min/max values.
    """
    if new_episodes is None:
        new_episodes = episodes

    episodes, _ = _partition(episodes)
    new_episodes, estimates = _partition(new_episodes)

    episode_rewards = []
    episode_lengths = []
    policy_rewards = collections.defaultdict(list)
    custom_metrics = collections.defaultdict(list)
    perf_stats = collections.defaultdict(list)
    hist_stats = collections.defaultdict(list)
    episode_media = collections.defaultdict(list)

    for episode in episodes:
        episode_lengths.append(episode.episode_length)
        episode_rewards.append(episode.episode_reward)
        for k, v in episode.custom_metrics.items():
            custom_metrics[k].append(v)
        for k, v in episode.perf_stats.items():
            perf_stats[k].append(v)
        for (_, policy_id), reward in episode.agent_rewards.items():
            if policy_id != DEFAULT_POLICY_ID:
                policy_rewards[policy_id].append(reward)
        for k, v in episode.hist_data.items():
            hist_stats[k] += v
        for k, v in episode.media.items():
            episode_media[k].append(v)

    if episode_rewards:
        min_reward = min(episode_rewards)
        max_reward = max(episode_rewards)
        avg_reward = np.mean(episode_rewards)
    else:
        min_reward = float("nan")
        max_reward = float("nan")
        avg_reward = float("nan")
    if episode_lengths:
        avg_length = np.mean(episode_lengths)
    else:
        avg_length = float("nan")

    # Show as histogram distributions.
    hist_stats["episode_reward"] = episode_rewards
    hist_stats["episode_lengths"] = episode_lengths

    policy_reward_min = {}
    policy_reward_mean = {}
    policy_reward_max = {}
    for policy_id, rewards in policy_rewards.copy().items():
        policy_reward_min[policy_id] = np.min(rewards)
        policy_reward_mean[policy_id] = np.mean(rewards)
        policy_reward_max[policy_id] = np.max(rewards)

        # Show as histogram distributions.
        hist_stats["policy_{}_reward".format(policy_id)] = rewards

    for k, v_list in custom_metrics.copy().items():
        # Drop values that contain any NaNs before aggregating.
        filt = [v for v in v_list if not np.any(np.isnan(v))]
        if keep_custom_metrics:
            custom_metrics[k] = filt
        else:
            custom_metrics[k + "_mean"] = np.mean(filt)
            if filt:
                custom_metrics[k + "_min"] = np.min(filt)
                custom_metrics[k + "_max"] = np.max(filt)
            else:
                custom_metrics[k + "_min"] = float("nan")
                custom_metrics[k + "_max"] = float("nan")
            del custom_metrics[k]
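
    # E.g. (hypothetical): custom_metrics {"dist": [1.0, 2.0, nan]} reduces to
    # {"dist_mean": 1.5, "dist_min": 1.0, "dist_max": 2.0}; with
    # keep_custom_metrics=True, the NaN-filtered raw list is kept instead.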

    for k, v_list in perf_stats.copy().items():
        perf_stats[k] = np.mean(v_list)

    estimators = collections.defaultdict(lambda: collections.defaultdict(list))
    for e in estimates:
        acc = estimators[e.estimator_name]
        for k, v in e.metrics.items():
            acc[k].append(v)
    for name, metrics in estimators.items():
        out = {}
        for k, v_list in metrics.items():
            out[k + "_mean"] = np.mean(v_list)
            out[k + "_std"] = np.std(v_list)
        estimators[name] = out

    return dict(
        episode_reward_max=max_reward,
        episode_reward_min=min_reward,
        episode_reward_mean=avg_reward,
        episode_len_mean=avg_length,
        episode_media=dict(episode_media),
        episodes_this_iter=len(new_episodes),
        policy_reward_min=policy_reward_min,
        policy_reward_max=policy_reward_max,
        policy_reward_mean=policy_reward_mean,
        custom_metrics=dict(custom_metrics),
        hist_stats=dict(hist_stats),
        sampler_perf=dict(perf_stats),
        off_policy_estimator=dict(estimators),
    )
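
# The ResultDict produced above looks roughly like this (values hypothetical):
#
#     {
#         "episode_reward_max": 200.0,
#         "episode_reward_min": 50.0,
#         "episode_reward_mean": 125.3,
#         "episode_len_mean": 180.1,
#         "episode_media": {},
#         "episodes_this_iter": 24,
#         "policy_reward_min": {}, "policy_reward_max": {}, "policy_reward_mean": {},
#         "custom_metrics": {},
#         "hist_stats": {"episode_reward": [...], "episode_lengths": [...]},
#         "sampler_perf": {},
#         "off_policy_estimator": {},
#     }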


def _partition(
    episodes: List[Union[RolloutMetrics, OffPolicyEstimate]],
) -> Tuple[List[RolloutMetrics], List[OffPolicyEstimate]]:
    """Divides metrics data into true rollouts vs off-policy estimates."""

    rollouts, estimates = [], []
    for e in episodes:
        if isinstance(e, RolloutMetrics):
            rollouts.append(e)
        elif isinstance(e, OffPolicyEstimate):
            estimates.append(e)
        else:
            raise ValueError("Unknown metric type: {}".format(e))
    return rollouts, estimates