"""Example of using RLlib's debug callbacks.
|
|
|
|
|
|
|
|
Here we use callbacks to track the average CartPole pole angle magnitude as a
|
|
|
|
custom metric.
|
|
|
|
"""
|
|
|
|
|
2020-04-17 02:06:42 +03:00
|
|
|
from typing import Dict
import argparse
import numpy as np
import os

import ray
from ray import tune
from ray.rllib.agents.callbacks import DefaultCallbacks
from ray.rllib.env import BaseEnv
from ray.rllib.evaluation import MultiAgentEpisode, RolloutWorker
from ray.rllib.policy import Policy
from ray.rllib.policy.sample_batch import SampleBatch

parser = argparse.ArgumentParser()
parser.add_argument("--torch", action="store_true")
parser.add_argument("--stop-iters", type=int, default=2000)

class MyCallbacks(DefaultCallbacks):
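    """Custom callbacks, hooked into training via the `callbacks` key of the
    `tune.run()` config below; RLlib invokes each overridden method at the
    corresponding point in the sampling/training loop."""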

    def on_episode_start(self, *, worker: RolloutWorker, base_env: BaseEnv,
                         policies: Dict[str, Policy],
                         episode: MultiAgentEpisode, env_index: int, **kwargs):
        # Make sure this episode has just been started (only initial obs
        # logged so far).
        assert episode.length == 0, \
            "ERROR: `on_episode_start()` callback should be called right " \
            "after env reset!"
        print("episode {} (env-idx={}) started.".format(
            episode.episode_id, env_index))
        episode.user_data["pole_angles"] = []
        episode.hist_data["pole_angles"] = []

    def on_episode_step(self, *, worker: RolloutWorker, base_env: BaseEnv,
                        episode: MultiAgentEpisode, env_index: int, **kwargs):
        # Make sure this episode is ongoing.
        assert episode.length > 0, \
            "ERROR: `on_episode_step()` callback should not be called right " \
            "after env reset!"
        pole_angle = abs(episode.last_observation_for()[2])
        raw_angle = abs(episode.last_raw_obs_for()[2])
        assert pole_angle == raw_angle
        episode.user_data["pole_angles"].append(pole_angle)

    def on_episode_end(self, *, worker: RolloutWorker, base_env: BaseEnv,
                       policies: Dict[str, Policy], episode: MultiAgentEpisode,
                       env_index: int, **kwargs):
        # Make sure this episode is really done.
        assert episode.batch_builder.policy_collectors[
            "default_policy"].buffers["dones"][-1], \
            "ERROR: `on_episode_end()` should only be called " \
            "after episode is done!"
        pole_angle = np.mean(episode.user_data["pole_angles"])
        print("episode {} (env-idx={}) ended with length {} and mean pole "
              "angle {}".format(episode.episode_id, env_index, episode.length,
                                pole_angle))
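        # Scalars stored in `custom_metrics` are aggregated across episodes;
        # lists stored in `hist_data` are collected under `hist_stats` in
        # the training results.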
        episode.custom_metrics["pole_angle"] = pole_angle
        episode.hist_data["pole_angles"] = episode.user_data["pole_angles"]

    def on_sample_end(self, *, worker: RolloutWorker, samples: SampleBatch,
                      **kwargs):
        print("returned sample batch of size {}".format(samples.count))

    def on_train_result(self, *, trainer, result: dict, **kwargs):
        print("trainer.train() result: {} -> {} episodes".format(
            trainer, result["episodes_this_iter"]))
        # You can mutate the result dict to add new fields to return.
        result["callback_ok"] = True

    def on_learn_on_batch(self, *, policy: Policy, train_batch: SampleBatch,
                          result: dict, **kwargs) -> None:
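        # Metrics written into `result` here end up per-policy under
        # `custom_metrics` (verified for the torch case in the `__main__`
        # block below).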
        result["sum_actions_in_train_batch"] = np.sum(train_batch["actions"])
        print("policy.learn_on_batch() result: {} -> sum actions: {}".format(
            policy, result["sum_actions_in_train_batch"]))

    def on_postprocess_trajectory(
            self, *, worker: RolloutWorker, episode: MultiAgentEpisode,
            agent_id: str, policy_id: str, policies: Dict[str, Policy],
            postprocessed_batch: SampleBatch,
            original_batches: Dict[str, SampleBatch], **kwargs):
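        # Count the postprocessed batches per episode; reported below as the
        # `num_batches` custom metric.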
        print("postprocessed {} steps".format(postprocessed_batch.count))
        if "num_batches" not in episode.custom_metrics:
            episode.custom_metrics["num_batches"] = 0
        episode.custom_metrics["num_batches"] += 1


if __name__ == "__main__":
    args = parser.parse_args()

    ray.init()
    trials = tune.run(
        "PG",
        stop={
            "training_iteration": args.stop_iters,
        },
        config={
            "env": "CartPole-v0",
            "num_envs_per_worker": 2,
            "callbacks": MyCallbacks,
            "framework": "torch" if args.torch else "tf",
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        }).trials

    # Verify episode-related custom metrics are there.
    custom_metrics = trials[0].last_result["custom_metrics"]
    print(custom_metrics)
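    # RLlib aggregates each scalar custom metric across episodes into
    # `_mean`, `_min` and `_max` entries.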
    assert "pole_angle_mean" in custom_metrics
    assert "pole_angle_min" in custom_metrics
    assert "pole_angle_max" in custom_metrics
    assert "num_batches_mean" in custom_metrics
    assert "callback_ok" in trials[0].last_result

    # Verify `on_learn_on_batch` custom metrics are there (per policy).
    if args.torch:
        info_custom_metrics = custom_metrics["default_policy"]
        print(info_custom_metrics)
        assert "sum_actions_in_train_batch" in info_custom_metrics