"""Example of running a custom hand-coded policy alongside trainable policies. This example has two policies: (1) a simple PG policy (2) a hand-coded policy that acts at random in the env (doesn't learn) In the console output, you can see the PG policy does much better than random: Result for PG_multi_cartpole_0: ... policy_reward_mean: pg_policy: 185.23 random: 21.255 ... """ import argparse import gym import ray from ray import tune from ray.rllib.policy import Policy from ray.rllib.tests.test_multi_agent_env import MultiCartpole from ray.tune.registry import register_env parser = argparse.ArgumentParser() parser.add_argument("--num-iters", type=int, default=20) class RandomPolicy(Policy): """Hand-coded policy that returns random actions.""" def compute_actions(self, obs_batch, state_batches=None, prev_action_batch=None, prev_reward_batch=None, info_batch=None, episodes=None, **kwargs): """Compute actions on a batch of observations.""" return [self.action_space.sample() for _ in obs_batch], [], {} def learn_on_batch(self, samples): """No learning.""" return {} if __name__ == "__main__": args = parser.parse_args() ray.init() # Simple environment with 4 independent cartpole entities register_env("multi_cartpole", lambda _: MultiCartpole(4)) single_env = gym.make("CartPole-v0") obs_space = single_env.observation_space act_space = single_env.action_space tune.run( "PG", stop={"training_iteration": args.num_iters}, config={ "env": "multi_cartpole", "multiagent": { "policies": { "pg_policy": (None, obs_space, act_space, {}), "random": (RandomPolicy, obs_space, act_space, {}), }, "policy_mapping_fn": ( lambda agent_id: ["pg_policy", "random"][agent_id % 2]), }, }, )