import random

import numpy as np

from ray.rllib.policy.policy import Policy
from ray.rllib.utils.annotations import override


class RandomPolicy(Policy):
    """Hand-coded policy that returns random actions."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @override(Policy)
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        **kwargs):
        # Alternatively, a numpy array would work here as well.
        # e.g.: np.array([random.choice([0, 1])] * len(obs_batch))
        return [self.action_space.sample() for _ in obs_batch], [], {}

    @override(Policy)
    def learn_on_batch(self, samples):
        """No learning."""
        return {}

    @override(Policy)
    def compute_log_likelihoods(self,
                                actions,
                                obs_batch,
                                state_batches=None,
                                prev_action_batch=None,
                                prev_reward_batch=None):
        return np.array([random.random()] * len(obs_batch))
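

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original example): constructing the
# policy directly and querying it for a batch of actions. This assumes the
# classic Policy.__init__(observation_space, action_space, config) signature
# and `gym` spaces; the exact constructor and space library (gym vs.
# gymnasium) vary across Ray/RLlib versions, so treat this as illustrative
# rather than version-guaranteed.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import gym

    obs_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(4,))
    act_space = gym.spaces.Discrete(2)

    policy = RandomPolicy(obs_space, act_space, {})

    # Build a fake batch of three observations and sample random actions.
    obs_batch = [obs_space.sample() for _ in range(3)]
    actions, state_outs, info = policy.compute_actions(obs_batch)
    print("sampled actions:", actions)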