ray/rllib/examples/rollout_worker_custom_workflow.py

"""Example of using rollout worker classes directly to implement training.

Instead of using the built-in Trainer classes provided by RLlib, here we define
a custom Policy class and manually coordinate distributed sample
collection and policy optimization.
"""

import argparse
import gym
import numpy as np

import ray
from ray import tune
from ray.rllib.evaluation import RolloutWorker
from ray.rllib.evaluation.metrics import collect_metrics
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch
from ray.tune.utils.placement_groups import PlacementGroupFactory

# Command-line options for this example script.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--gpu",
    action="store_true",
    help="Request one GPU in the trial's first resource bundle.")
parser.add_argument(
    "--num-iters",
    type=int,
    default=20,
    help="Number of iterations of the training loop.")
parser.add_argument(
    "--num-workers",
    type=int,
    default=2,
    help="Number of remote rollout workers to create.")
parser.add_argument(
    "--num-cpus",
    type=int,
    default=0,
    help="CPU count passed to ray.init(); 0 passes None instead.")


class CustomPolicy(Policy):
    """Bare-bones policy implemented directly on top of ``Policy``.

    It picks random actions and keeps a single scalar ``w`` as its only
    "weight". For a real policy, extending TF/TorchPolicy is likely more
    convenient.
    """

    def __init__(self, observation_space, action_space, config):
        super().__init__(observation_space, action_space, config)
        # This example policy does not use any deep-learning framework.
        self.config["framework"] = None
        # Placeholder parameter standing in for real model weights.
        self.w = 1.0

    def get_weights(self):
        """Return the policy state as a dict with the single key "w"."""
        return dict(w=self.w)

    def set_weights(self, weights):
        """Restore ``w`` from a dict as produced by ``get_weights``."""
        self.w = weights["w"]

    def update_some_value(self, w):
        """Overwrite ``w``; shows that arbitrary policy methods can be called."""
        self.w = w

    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        """Sample one random action per entry of ``obs_batch``.

        Returns:
            A 3-tuple: the array of sampled actions, an empty list and an
            empty dict.
        """
        sampled = [self.action_space.sample() for _ in obs_batch]
        actions = np.array(sampled)
        return actions, [], {}

    def learn_on_batch(self, samples):
        """Learning hook; put the actual update code here.

        Returns:
            An empty dict, since this example does not learn anything.
        """
        return {}


def training_workflow(config, reporter):
    """Manually coordinate sampling and policy updates across workers.

    Args:
        config: Dict providing "num_workers" (how many remote rollout
            workers to create) and "num_iters" (how many loop iterations
            to run).
        reporter: Callable invoked once per iteration with the collected
            worker metrics passed as keyword arguments.
    """
    # The local env is only needed to read the observation/action spaces,
    # so release it as soon as the local policy has been built.
    env = gym.make("CartPole-v0")
    try:
        policy = CustomPolicy(env.observation_space, env.action_space, {})
    finally:
        env.close()

    # Setup the remote rollout worker actors
    workers = [
        RolloutWorker.as_remote().remote(
            env_creator=lambda c: gym.make("CartPole-v0"), policy=CustomPolicy)
        for _ in range(config["num_workers"])
    ]

    for _ in range(config["num_iters"]):
        # Broadcast weights to the rollout workers. The weights are put
        # into the object store once and the same reference is sent to all.
        weights = ray.put({DEFAULT_POLICY_ID: policy.get_weights()})
        for w in workers:
            w.set_weights.remote(weights)

        # Gather a first batch of samples
        T1 = SampleBatch.concat_samples(
            ray.get([w.sample.remote() for w in workers]))

        # Update the remote policy replicas by calling a custom method
        new_value = policy.w * 2.0
        for w in workers:
            w.for_policy.remote(lambda p: p.update_some_value(new_value))

        # Gather a second batch of samples
        T2 = SampleBatch.concat_samples(
            ray.get([w.sample.remote() for w in workers]))

        # Improve the policy using the T1 batch
        policy.learn_on_batch(T1)

        # Do some arbitrary updates based on the T2 batch
        policy.update_some_value(sum(T2["rewards"]))

        reporter(**collect_metrics(remote_workers=workers))


if __name__ == "__main__":
    args = parser.parse_args()
    # The default --num-cpus of 0 is falsy, so ray.init receives None.
    ray.init(num_cpus=args.num_cpus or None)

    # One bundle for the workflow function itself (optionally with a GPU),
    # followed by one single-CPU bundle per rollout worker.
    # NOTE(review): assumes Tune assigns the first bundle to the trainable.
    head_bundle = {"CPU": 1, "GPU": 1 if args.gpu else 0}
    worker_bundles = [{"CPU": 1} for _ in range(args.num_workers)]
    trial_resources = PlacementGroupFactory([head_bundle] + worker_bundles)

    workflow_config = {
        "num_workers": args.num_workers,
        "num_iters": args.num_iters,
    }

    tune.run(
        training_workflow,
        resources_per_trial=trial_resources,
        config=workflow_config,
        verbose=1,
    )
[rllib] Rename PolicyEvaluator => RolloutWorker (#4820) 2019-06-03 06:49:24 +08:00			`"""Example of using rollout worker classes directly to implement training.`
[rllib] Add multi-agent examples for hand-coded policy, centralized VF (#4554) 2019-04-09 00:36:49 -07:00
			`Instead of using the built-in Trainer classes provided by RLlib, here we define`
[rllib] Rename PolicyGraph => Policy, move from evaluation/ to policy/ (#4819) This implements some of the renames proposed in #4813 We leave behind backwards-compatibility aliases for *PolicyGraph and SampleBatch. 2019-05-20 16:46:05 -07:00			`a custom Policy class and manually coordinate distributed sample`
[rllib] Add multi-agent examples for hand-coded policy, centralized VF (#4554) 2019-04-09 00:36:49 -07:00			`collection and policy optimization.`
			`"""`

			`import argparse`
			`import gym`
[RLlib] Remove TupleActions and support arbitrarily nested action spaces. (#8143) Deprecate TupleActions and support arbitrarily nested action spaces. Closes issue #8143. 2020-04-28 14:59:16 +02:00			`import numpy as np`
[rllib] Add multi-agent examples for hand-coded policy, centralized VF (#4554) 2019-04-09 00:36:49 -07:00
			`import ray`
			`from ray import tune`
[RLlib] Move all jenkins RLlib-tests into bazel (rllib/BUILD). (#7178) * commit * comment 2020-02-15 23:50:44 +01:00			`from ray.rllib.evaluation import RolloutWorker`
[rllib] Add multi-agent examples for hand-coded policy, centralized VF (#4554) 2019-04-09 00:36:49 -07:00			`from ray.rllib.evaluation.metrics import collect_metrics`
[RLlib] Examples folder restructuring (Model examples; final part). (#8278) - This PR completes any previously missing PyTorch Model counterparts to TFModels in examples/models. - It also makes sure, all example scripts in the rllib/examples folder are tested for both frameworks and learn the given task (this is often currently not checked) using a --as-test flag in connection with a --stop-reward. 2020-05-12 08:23:10 +02:00			`from ray.rllib.policy.policy import Policy`
[RLlib] BC/MARWIL/recurrent nets minor cleanups and bug fixes. (#13064) 2020-12-27 09:46:03 -05:00			`from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch`
[Tune] Remove legacy resources implementations in Runner and Executor. (#19773) 2021-11-12 12:33:39 -08:00			`from ray.tune.utils.placement_groups import PlacementGroupFactory`
[rllib] Add multi-agent examples for hand-coded policy, centralized VF (#4554) 2019-04-09 00:36:49 -07:00
			`parser = argparse.ArgumentParser()`
			`parser.add_argument("--gpu", action="store_true")`
			`parser.add_argument("--num-iters", type=int, default=20)`
			`parser.add_argument("--num-workers", type=int, default=2)`
[RLlib] Move all jenkins RLlib-tests into bazel (rllib/BUILD). (#7178) * commit * comment 2020-02-15 23:50:44 +01:00			`parser.add_argument("--num-cpus", type=int, default=0)`
[rllib] Add multi-agent examples for hand-coded policy, centralized VF (#4554) 2019-04-09 00:36:49 -07:00

[RLlib] Examples folder restructuring (Model examples; final part). (#8278) - This PR completes any previously missing PyTorch Model counterparts to TFModels in examples/models. - It also makes sure, all example scripts in the rllib/examples folder are tested for both frameworks and learn the given task (this is often currently not checked) using a --as-test flag in connection with a --stop-reward. 2020-05-12 08:23:10 +02:00			`class CustomPolicy(Policy):`
[rllib] Rename PolicyGraph => Policy, move from evaluation/ to policy/ (#4819) This implements some of the renames proposed in #4813 We leave behind backwards-compatibility aliases for *PolicyGraph and SampleBatch. 2019-05-20 16:46:05 -07:00			`"""Example of a custom policy written from scratch.`
[rllib] Add multi-agent examples for hand-coded policy, centralized VF (#4554) 2019-04-09 00:36:49 -07:00
[rllib] Rename PolicyGraph => Policy, move from evaluation/ to policy/ (#4819) This implements some of the renames proposed in #4813 We leave behind backwards-compatibility aliases for *PolicyGraph and SampleBatch. 2019-05-20 16:46:05 -07:00			`You might find it more convenient to extend TF/TorchPolicy instead`
[rllib] Add multi-agent examples for hand-coded policy, centralized VF (#4554) 2019-04-09 00:36:49 -07:00			`for a real policy.`
			`"""`

			`def __init__(self, observation_space, action_space, config):`
[RLlib] Policy.compute_log_likelihoods() and SAC refactor. (issue #7107) (#7124) * Exploration API (+EpsilonGreedy sub-class). * Exploration API (+EpsilonGreedy sub-class). * Cleanup/LINT. * Add `deterministic` to generic Trainer config (NOTE: this is still ignored by most Agents). * Add `error` option to deprecation_warning(). * WIP. * Bug fix: Get exploration-info for tf framework. Bug fix: Properly deprecate some DQN config keys. * WIP. * LINT. * WIP. * Split PerWorkerEpsilonGreedy out of EpsilonGreedy. Docstrings. * Fix bug in sampler.py in case Policy has self.exploration = None * Update rllib/agents/dqn/dqn.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * WIP. * Update rllib/agents/trainer.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * WIP. * Change requests. * LINT * In tune/utils/util.py::deep_update() Only keep deep_updat'ing if both original and value are dicts. If value is not a dict, set * Completely obsolete syn_replay_optimizer.py's parameters schedule_max_timesteps AND beta_annealing_fraction (replaced with prioritized_replay_beta_annealing_timesteps). * Update rllib/evaluation/worker_set.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * Review fixes. * Fix default value for DQN's exploration spec. * LINT * Fix recursion bug (wrong parent c'tor). * Do not pass timestep to get_exploration_info. * Update tf_policy.py * Fix some remaining issues with test cases and remove more deprecated DQN/APEX exploration configs. * Bug fix tf-action-dist * DDPG incompatibility bug fix with new DQN exploration handling (which is imported by DDPG). * Switch off exploration when getting action probs from off-policy-estimator's policy. * LINT * Fix test_checkpoint_restore.py. * Deprecate all SAC exploration (unused) configs. * Properly use `model.last_output()` everywhere. Instead of `model._last_output`. * WIP. * Take out set_epsilon from multi-agent-env test (not needed, decays anyway). * WIP. * Trigger re-test (flaky checkpoint-restore test). * WIP. 
* WIP. * Add test case for deterministic action sampling in PPO. * bug fix. * Added deterministic test cases for different Agents. * Fix problem with TupleActions in dynamic-tf-policy. * Separate supported_spaces tests so they can be run separately for easier debugging. * LINT. * Fix autoregressive_action_dist.py test case. * Re-test. * Fix. * Remove duplicate py_test rule from bazel. * LINT. * WIP. * WIP. * SAC fix. * SAC fix. * WIP. * WIP. * WIP. * FIX 2 examples tests. * WIP. * WIP. * WIP. * WIP. * WIP. * Fix. * LINT. * Renamed test file. * WIP. * Add unittest.main. * Make action_dist_class mandatory. * fix * FIX. * WIP. * WIP. * Fix. * Fix. * Fix explorations test case (contextlib cannot find its own nullcontext??). * Force torch to be installed for QMIX. * LINT. * Fix determine_tests_to_run.py. * Fix determine_tests_to_run.py. * WIP * Add Random exploration component to tests (fixed issue with "static-graph randomness" via py_function). * Add Random exploration component to tests (fixed issue with "static-graph randomness" via py_function). * Rename some stuff. * Rename some stuff. * WIP. * WIP. * Fix SAC. * Fix SAC. * Fix strange tf-error in ray core tests. * Fix strange ray-core tf-error in test_memory_scheduling test case. * Fix test_io.py. * LINT. * Update SAC yaml files' config. Co-authored-by: Eric Liang <ekhliang@gmail.com> 2020-02-22 23:19:49 +01:00			`super().__init__(observation_space, action_space, config)`
[RLlib] Fix all example scripts to run on GPUs. (#11105) 2020-10-02 23:07:44 +02:00			`self.config["framework"] = None`
[rllib] Add multi-agent examples for hand-coded policy, centralized VF (#4554) 2019-04-09 00:36:49 -07:00			`# example parameter`
			`self.w = 1.0`

			`def compute_actions(self,`
			`obs_batch,`
[RLlib] Policy-classes cleanup and torch/tf unification. (#6770) 2020-01-18 07:26:28 +01:00			`state_batches=None,`
[rllib] Add multi-agent examples for hand-coded policy, centralized VF (#4554) 2019-04-09 00:36:49 -07:00			`prev_action_batch=None,`
			`prev_reward_batch=None,`
			`info_batch=None,`
			`episodes=None,`
			`**kwargs):`
			`# return random actions`
ci: Redo `format.sh --all` script & backfill lint fixes (#9956) 2020-08-07 16:49:49 -07:00			`return np.array(`
			`[self.action_space.sample() for _ in obs_batch]), [], {}`
[rllib] Add multi-agent examples for hand-coded policy, centralized VF (#4554) 2019-04-09 00:36:49 -07:00
			`def learn_on_batch(self, samples):`
			`# implement your learning code here`
[rllib] Clean up concepts documentation and policy optimizer creation (#4592) 2019-04-12 21:03:26 -07:00			`return {}`
[rllib] Add multi-agent examples for hand-coded policy, centralized VF (#4554) 2019-04-09 00:36:49 -07:00
			`def update_some_value(self, w):`
			`# can also call other methods on policies`
			`self.w = w`

			`def get_weights(self):`
			`return {"w": self.w}`

			`def set_weights(self, weights):`
			`self.w = weights["w"]`


			`def training_workflow(config, reporter):`
			`# Setup policy and policy evaluation actors`
			`env = gym.make("CartPole-v0")`
			`policy = CustomPolicy(env.observation_space, env.action_space, {})`
			`workers = [`
[RLlib] MB-MPO cleanup (comments, docstrings, type annotations). (#11033) 2020-10-06 20:28:16 +02:00			`RolloutWorker.as_remote().remote(`
			`env_creator=lambda c: gym.make("CartPole-v0"), policy=CustomPolicy)`
[rllib] Add multi-agent examples for hand-coded policy, centralized VF (#4554) 2019-04-09 00:36:49 -07:00			`for _ in range(config["num_workers"])`
			`]`

			`for _ in range(config["num_iters"]):`
			`# Broadcast weights to the policy evaluation workers`
[RLlib] BC/MARWIL/recurrent nets minor cleanups and bug fixes. (#13064) 2020-12-27 09:46:03 -05:00			`weights = ray.put({DEFAULT_POLICY_ID: policy.get_weights()})`
[rllib] Add multi-agent examples for hand-coded policy, centralized VF (#4554) 2019-04-09 00:36:49 -07:00			`for w in workers:`
			`w.set_weights.remote(weights)`

			`# Gather a batch of samples`
			`T1 = SampleBatch.concat_samples(`
			`ray.get([w.sample.remote() for w in workers]))`

			`# Update the remote policy replicas and gather another batch of samples`
			`new_value = policy.w * 2.0`
			`for w in workers:`
			`w.for_policy.remote(lambda p: p.update_some_value(new_value))`

			`# Gather another batch of samples`
			`T2 = SampleBatch.concat_samples(`
			`ray.get([w.sample.remote() for w in workers]))`

			`# Improve the policy using the T1 batch`
			`policy.learn_on_batch(T1)`

			`# Do some arbitrary updates based on the T2 batch`
			`policy.update_some_value(sum(T2["rewards"]))`

[rllib] Rename PolicyEvaluator => RolloutWorker (#4820) 2019-06-03 06:49:24 +08:00			`reporter(**collect_metrics(remote_workers=workers))`
[rllib] Add multi-agent examples for hand-coded policy, centralized VF (#4554) 2019-04-09 00:36:49 -07:00

			`if __name__ == "__main__":`
			`args = parser.parse_args()`
[RLlib] Move all jenkins RLlib-tests into bazel (rllib/BUILD). (#7178) * commit * comment 2020-02-15 23:50:44 +01:00			`ray.init(num_cpus=args.num_cpus or None)`
[rllib] Add multi-agent examples for hand-coded policy, centralized VF (#4554) 2019-04-09 00:36:49 -07:00
			`tune.run(`
			`training_workflow,`
[Tune] Remove legacy resources implementations in Runner and Executor. (#19773) 2021-11-12 12:33:39 -08:00			`resources_per_trial=PlacementGroupFactory(([{`
			`"CPU": 1,`
			`"GPU": 1 if args.gpu else 0`
			`}] + [{`
			`"CPU": 1`
			`}] * args.num_workers)),`
[rllib] Add multi-agent examples for hand-coded policy, centralized VF (#4554) 2019-04-09 00:36:49 -07:00			`config={`
			`"num_workers": args.num_workers,`
			`"num_iters": args.num_iters,`
			`},`
[RLlib] Fix all example scripts to run on GPUs. (#11105) 2020-10-02 23:07:44 +02:00			`verbose=1,`
[rllib] Add multi-agent examples for hand-coded policy, centralized VF (#4554) 2019-04-09 00:36:49 -07:00			`)`