import unittest

import gym
import numpy as np

import ray
from ray.rllib.agents.dqn import DQNTrainer
from ray.tune.registry import register_env


class TestReproducibility(unittest.TestCase):
    def testReproducingTrajectory(self):
        """Runs three short DQN trials and checks that equal seeds yield
        identical reward trajectories while different seeds diverge."""

        class PickLargest(gym.Env):
            """Toy single-step env: reward is the chosen entry of a random
            4-dim observation, so the episode ends after one action."""

            def __init__(self):
                self.observation_space = gym.spaces.Box(
                    low=float("-inf"), high=float("inf"), shape=(4, ))
                self.action_space = gym.spaces.Discrete(4)

            def reset(self, **kwargs):
                self.obs = np.random.randn(4)
                return self.obs

            def step(self, action):
                reward = self.obs[action]
                return self.obs, reward, True, {}

        def env_creator(env_config):
            return PickLargest()

        trajs = list()
        for trial in range(3):
            ray.init()
            register_env("PickLargest", env_creator)
            # Trials 0 and 1 share seed 666; trial 2 uses seed 999.
            agent = DQNTrainer(
                env="PickLargest",
                config={"seed": 666 if trial in [0, 1] else 999})
            trajectory = list()
            for _ in range(8):
                r = agent.train()
                trajectory.append(r["episode_reward_max"])
                trajectory.append(r["episode_reward_min"])
            trajs.append(trajectory)
            ray.shutdown()

        # trial0 and trial1 use the same seed and thus
        # should produce identical trajectories.
        all_same = True
        for v0, v1 in zip(trajs[0], trajs[1]):
            if v0 != v1:
                all_same = False
        self.assertTrue(all_same)

        # trial1 and trial2 use different seeds and thus
        # most rewards tend to be different.
        diff_cnt = 0
        for v1, v2 in zip(trajs[1], trajs[2]):
            if v1 != v2:
                diff_cnt += 1
        self.assertTrue(diff_cnt > 8)


if __name__ == "__main__":
    unittest.main(verbosity=2)