ray/rllib/algorithms/alpha_zero/alpha_zero_policy.py


import numpy as np

from ray.rllib.algorithms.alpha_zero.mcts import Node, RootParentNode
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.torch_policy import TorchPolicy
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY

torch, _ = try_import_torch()


class AlphaZeroPolicy(TorchPolicy):
    def __init__(
        self,
        observation_space,
        action_space,
        config,
        model,
        loss,
        action_distribution_class,
        mcts_creator,
        env_creator,
        **kwargs
    ):
        super().__init__(
            observation_space,
            action_space,
            config,
            model=model,
            loss=loss,
            action_distribution_class=action_distribution_class,
        )
        # we maintain an env copy in the policy that is used during mcts
        # simulations
        self.env_creator = env_creator
        self.mcts = mcts_creator()
        self.env = self.env_creator()
        self.env.reset()
        self.obs_space = observation_space
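
    # A minimal sketch (not executed) of how this constructor is expected to be
    # wired up, assuming `mcts_creator` and `env_creator` are zero-argument
    # callables (e.g. partials the algorithm builds from its config). The
    # concrete names below (`MyBoardGameEnv`, `mcts_config`, `env_config`) are
    # hypothetical placeholders, not part of this module:
    #
    #     mcts_creator = lambda: MCTS(model, mcts_config)   # assumed MCTS ctor
    #     env_creator = lambda: MyBoardGameEnv(env_config)
    #     policy = AlphaZeroPolicy(
    #         obs_space, act_space, config, model, loss_fn,
    #         action_dist_class, mcts_creator, env_creator,
    #     )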

    @override(TorchPolicy)
    def compute_actions(
        self,
        obs_batch,
        state_batches=None,
        prev_action_batch=None,
        prev_reward_batch=None,
        info_batch=None,
        episodes=None,
        **kwargs
    ):
        input_dict = {"obs": obs_batch}
        if prev_action_batch is not None:
            input_dict["prev_actions"] = prev_action_batch
        if prev_reward_batch is not None:
            input_dict["prev_rewards"] = prev_reward_batch

        return self.compute_actions_from_input_dict(
            input_dict=input_dict,
            episodes=episodes,
            state_batches=state_batches,
        )
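
    # Note (illustrative, not executed): `compute_actions` only repackages the
    # batched arguments into an input dict and delegates to
    # `compute_actions_from_input_dict`, where the MCTS search actually runs.
    # A hypothetical single-episode call might look like:
    #
    #     actions, _, extra_out = policy.compute_actions(
    #         obs_batch=np.stack([obs]), episodes=[episode]
    #     )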

    @override(Policy)
    def compute_actions_from_input_dict(
        self, input_dict, explore=None, timestep=None, episodes=None, **kwargs
    ):
        with torch.no_grad():
            actions = []
            for i, episode in enumerate(episodes):
                if episode.length == 0:
                    # if first time step of episode, get initial env state
                    env_state = episode.user_data["initial_state"]
                    # verify if env has been wrapped for ranked rewards
                    if self.env.__class__.__name__ == "RankedRewardsEnvWrapper":
                        # r2 env state contains also the rewards buffer state
                        env_state = {"env_state": env_state, "buffer_state": None}
                    # create tree root node
                    obs = self.env.set_state(env_state)
                    tree_node = Node(
                        state=env_state,
                        obs=obs,
                        reward=0,
                        done=False,
                        action=None,
                        parent=RootParentNode(env=self.env),
                        mcts=self.mcts,
                    )
                else:
                    # otherwise get last root node from previous time step
                    tree_node = episode.user_data["tree_node"]

                # run monte carlo simulations to compute the actions
                # and record the tree
                mcts_policy, action, tree_node = self.mcts.compute_action(tree_node)
                # record action
                actions.append(action)
                # store new node
                episode.user_data["tree_node"] = tree_node

                # store mcts policies vectors and current tree root node
                if episode.length == 0:
                    episode.user_data["mcts_policies"] = [mcts_policy]
                else:
                    episode.user_data["mcts_policies"].append(mcts_policy)

            return (
                np.array(actions),
                [],
                self.extra_action_out(
                    input_dict, kwargs.get("state_batches", []), self.model, None
                ),
            )
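
    # A minimal sketch of the MCTS contract the loop above relies on:
    # `self.mcts.compute_action(node)` is assumed to return a tuple
    # (tree_policy, action, next_root), where `tree_policy` is a probability
    # vector derived from visit counts and `next_root` is the child node kept
    # for tree reuse at the next timestep. A stand-in honoring that contract,
    # with hypothetical attribute names, could look like:
    #
    #     class UniformMCTS:
    #         def compute_action(self, node):
    #             n = node.action_space_size                  # assumed attribute
    #             tree_policy = np.full(n, 1.0 / n)
    #             action = int(np.random.choice(n, p=tree_policy))
    #             return tree_policy, action, node.get_child(action)  # assumed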

    @override(Policy)
    def postprocess_trajectory(
        self, sample_batch, other_agent_batches=None, episode=None
    ):
        # add mcts policies to sample batch
        sample_batch["mcts_policies"] = np.array(episode.user_data["mcts_policies"])[
            sample_batch["t"]
        ]
        # final episode reward corresponds to the value (if not discounted)
        # for all transitions in episode
        final_reward = sample_batch["rewards"][-1]
        # if r2 is enabled, then add the reward to the buffer and normalize it
        if self.env.__class__.__name__ == "RankedRewardsEnvWrapper":
            self.env.r2_buffer.add_reward(final_reward)
            final_reward = self.env.r2_buffer.normalize(final_reward)
        sample_batch["value_label"] = final_reward * np.ones_like(sample_batch["t"])
        return sample_batch
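
    # Worked sketch (not executed): with an episode of length 3 and
    # sample_batch["t"] == array([0, 1, 2]), indexing the stacked MCTS policies
    # by "t" aligns one policy vector per transition, and the scalar final
    # reward is broadcast as the value target for every step:
    #
    #     mcts_policies = np.array([[0.7, 0.3], [0.4, 0.6], [0.9, 0.1]])
    #     t = np.array([0, 1, 2])
    #     batch_policies = mcts_policies[t]            # shape (3, 2)
    #     value_label = 1.0 * np.ones_like(t)          # -> array([1., 1., 1.])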

    @override(TorchPolicy)
    def learn_on_batch(self, postprocessed_batch):
        train_batch = self._lazy_tensor_dict(postprocessed_batch)

        loss_out, policy_loss, value_loss = self._loss(
            self, self.model, self.dist_class, train_batch
        )
        self._optimizers[0].zero_grad()
        loss_out.backward()

        grad_process_info = self.extra_grad_process(self._optimizers[0], loss_out)
        self._optimizers[0].step()

        grad_info = self.extra_grad_info(train_batch)
        grad_info.update(grad_process_info)
        grad_info.update(
            {
                "total_loss": loss_out.detach().cpu().numpy(),
                "policy_loss": policy_loss.detach().cpu().numpy(),
                "value_loss": value_loss.detach().cpu().numpy(),
            }
        )

        return {LEARNER_STATS_KEY: grad_info}
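

# A minimal sketch of the loss contract assumed by `learn_on_batch`: the `loss`
# callable passed to the constructor (stored by TorchPolicy as `self._loss`) is
# expected to have the signature loss(policy, model, dist_class, train_batch)
# and to return a (total_loss, policy_loss, value_loss) tuple of scalar
# tensors. A hypothetical stand-in under those assumptions, not the actual
# AlphaZero loss shipped with RLlib:
#
#     def sketch_alpha_zero_loss(policy, model, dist_class, train_batch):
#         logits, _ = model(train_batch)
#         values = model.value_function()
#         log_probs = torch.log_softmax(logits, dim=-1)
#         policy_loss = -torch.mean(
#             torch.sum(train_batch["mcts_policies"] * log_probs, dim=-1)
#         )
#         value_loss = torch.mean((values - train_batch["value_label"]) ** 2)
#         return policy_loss + value_loss, policy_loss, value_loss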