# Code in this file is copied and adapted from
# https://github.com/openai/evolution-strategies-starter.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import namedtuple
import numpy as np
import os
import pickle
import time

import ray
from ray.rllib import agent
from ray.tune.trial import Resources

from ray.rllib.es import optimizers
from ray.rllib.es import policies
from ray.rllib.es import tabular_logger as tlogger
from ray.rllib.es import utils


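# Each call to Worker.do_rollouts returns one Result. "noisy_returns" and
# "noisy_lengths" hold [positive, negative] pairs from antithetic weight
# perturbations, while "eval_returns" and "eval_lengths" come from the
# occasional unperturbed evaluation rollouts.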
Result = namedtuple("Result", [
    "noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
    "eval_returns", "eval_lengths"
])


DEFAULT_CONFIG = {
    "l2_coeff": 0.005,
    "noise_stdev": 0.02,
    "episodes_per_batch": 1000,
    "timesteps_per_batch": 10000,
    "eval_prob": 0.003,
    "return_proc_mode": "centered_rank",
    "num_workers": 10,
    "stepsize": 0.01,
    "observation_filter": "MeanStdFilter",
    "noise_size": 250000000,
    "env_config": {},
}


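# Instead of sampling a fresh perturbation vector for every rollout, a single
# large block of Gaussian noise is created once (with a fixed seed) and placed
# in the Ray object store. Workers then only need to report integer offsets
# into this block rather than shipping whole perturbation vectors around.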
@ray.remote
def create_shared_noise(count):
    """Create a large array of noise to be shared by all workers."""
    seed = 123
    noise = np.random.RandomState(seed).randn(count).astype(np.float32)
    return noise


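# Thin wrapper around the shared noise block. get(i, dim) returns the
# dim-long slice starting at offset i, and sample_index(dim) draws a start
# offset uniformly so that the slice stays within bounds.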
class SharedNoiseTable(object):
    def __init__(self, noise):
        self.noise = noise
        assert self.noise.dtype == np.float32

    def get(self, i, dim):
        return self.noise[i:i + dim]

    def sample_index(self, dim):
        return np.random.randint(0, len(self.noise) - dim + 1)


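# Ray actor that owns its own environment, session and policy copy. Each
# do_rollouts call keeps evaluating antithetic perturbations of the supplied
# weights (params + perturbation and params - perturbation) until at least
# one pair has been collected and min_task_runtime has elapsed, occasionally
# running an unperturbed evaluation rollout with probability
# config["eval_prob"].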
@ray.remote
class Worker(object):
    def __init__(self, registry, config, policy_params, env_creator, noise,
                 min_task_runtime=0.2):
        self.min_task_runtime = min_task_runtime
        self.config = config
        self.policy_params = policy_params
        self.noise = SharedNoiseTable(noise)

        self.env = env_creator(config["env_config"])
        from ray.rllib import models
        self.preprocessor = models.ModelCatalog.get_preprocessor(
            registry, self.env)

        self.sess = utils.make_session(single_threaded=True)
        self.policy = policies.GenericPolicy(
            registry, self.sess, self.env.action_space, self.preprocessor,
            config["observation_filter"], **policy_params)

    def rollout(self, timestep_limit, add_noise=True):
        rollout_rewards, rollout_length = policies.rollout(
            self.policy, self.env, timestep_limit=timestep_limit,
            add_noise=add_noise)
        return rollout_rewards, rollout_length

    def do_rollouts(self, params, timestep_limit=None):
        # Set the network weights.
        self.policy.set_weights(params)

        noise_indices, returns, sign_returns, lengths = [], [], [], []
        eval_returns, eval_lengths = [], []

        # Perform some rollouts with noise.
        task_tstart = time.time()
        while (len(noise_indices) == 0 or
               time.time() - task_tstart < self.min_task_runtime):

            if np.random.uniform() < self.config["eval_prob"]:
                # Do an evaluation run with no perturbation.
                self.policy.set_weights(params)
                rewards, length = self.rollout(timestep_limit, add_noise=False)
                eval_returns.append(rewards.sum())
                eval_lengths.append(length)
            else:
                # Do a regular run with parameter perturbations.
                noise_index = self.noise.sample_index(self.policy.num_params)

                perturbation = self.config["noise_stdev"] * self.noise.get(
                    noise_index, self.policy.num_params)

                # These two sampling steps could be done in parallel on
                # different actors, letting us update twice as frequently.
                self.policy.set_weights(params + perturbation)
                rewards_pos, lengths_pos = self.rollout(timestep_limit)

                self.policy.set_weights(params - perturbation)
                rewards_neg, lengths_neg = self.rollout(timestep_limit)

                noise_indices.append(noise_index)
                returns.append([rewards_pos.sum(), rewards_neg.sum()])
                sign_returns.append(
                    [np.sign(rewards_pos).sum(), np.sign(rewards_neg).sum()])
                lengths.append([lengths_pos, lengths_neg])

        return Result(
            noise_indices=noise_indices,
            noisy_returns=returns,
            sign_noisy_returns=sign_returns,
            noisy_lengths=lengths,
            eval_returns=eval_returns,
            eval_lengths=eval_lengths)


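# Tune-compatible agent driving the ES optimization loop: broadcast the
# current weights, collect noisy rollouts from the workers, turn the
# centered-rank processed returns into a gradient estimate and apply an Adam
# update.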
class ESAgent(agent.Agent):
    _agent_name = "ES"
    _default_config = DEFAULT_CONFIG
    _allow_unknown_subkeys = ["env_config"]

    @classmethod
    def default_resource_request(cls, config):
        cf = dict(cls._default_config, **config)
        return Resources(cpu=1, gpu=0, extra_cpu=cf["num_workers"])

    def _init(self):
        policy_params = {
            "action_noise_std": 0.01
        }

        env = self.env_creator(self.config["env_config"])
        from ray.rllib import models
        preprocessor = models.ModelCatalog.get_preprocessor(
            self.registry, env)

        self.sess = utils.make_session(single_threaded=False)
        self.policy = policies.GenericPolicy(
            self.registry, self.sess, env.action_space, preprocessor,
            self.config["observation_filter"], **policy_params)
        self.optimizer = optimizers.Adam(self.policy, self.config["stepsize"])

        # Create the shared noise table.
        print("Creating shared noise table.")
        noise_id = create_shared_noise.remote(self.config["noise_size"])
        self.noise = SharedNoiseTable(ray.get(noise_id))

        # Create the actors.
        print("Creating actors.")
        self.workers = [
            Worker.remote(
                self.registry, self.config, policy_params, self.env_creator,
                noise_id)
            for _ in range(self.config["num_workers"])]

        self.episodes_so_far = 0
        self.timesteps_so_far = 0
        self.tstart = time.time()

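    # Keep launching rounds of do_rollouts across all workers until both the
    # minimum episode count and the minimum timestep count for this iteration
    # have been reached.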
    def _collect_results(self, theta_id, min_episodes, min_timesteps):
        num_episodes, num_timesteps = 0, 0
        results = []
        while num_episodes < min_episodes or num_timesteps < min_timesteps:
            print(
                "Collected {} episodes {} timesteps so far this iter".format(
                    num_episodes, num_timesteps))
            rollout_ids = [worker.do_rollouts.remote(theta_id)
                           for worker in self.workers]
            # Get the results of the rollouts.
            for result in ray.get(rollout_ids):
                results.append(result)
                # Update the number of episodes and the number of timesteps
                # keeping in mind that result.noisy_lengths is a list of
                # lists, where the inner lists have length 2.
                num_episodes += sum(len(pair) for pair
                                    in result.noisy_lengths)
                num_timesteps += sum(sum(pair) for pair
                                     in result.noisy_lengths)
        return results, num_episodes, num_timesteps

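    # One ES iteration. Using the centered-rank processed returns R+_i and
    # R-_i for noise vectors eps_i, the gradient estimate is
    #     g = sum_i (R+_i - R-_i) * eps_i / noisy_returns.size,
    # and Adam is then applied with gradient -g + config["l2_coeff"] * theta,
    # i.e. gradient ascent on the estimated return with L2 weight decay.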
    def _train(self):
        config = self.config

        step_tstart = time.time()
        theta = self.policy.get_weights()
        assert theta.dtype == np.float32

        # Put the current policy weights in the object store.
        theta_id = ray.put(theta)
        # Use the actors to do rollouts, note that we pass in the ID of the
        # policy weights.
        results, num_episodes, num_timesteps = self._collect_results(
            theta_id,
            config["episodes_per_batch"],
            config["timesteps_per_batch"])

        all_noise_indices = []
        all_training_returns = []
        all_training_lengths = []
        all_eval_returns = []
        all_eval_lengths = []

        # Loop over the results.
        for result in results:
            all_eval_returns += result.eval_returns
            all_eval_lengths += result.eval_lengths

            all_noise_indices += result.noise_indices
            all_training_returns += result.noisy_returns
            all_training_lengths += result.noisy_lengths

        assert len(all_eval_returns) == len(all_eval_lengths)
        assert (len(all_noise_indices) == len(all_training_returns) ==
                len(all_training_lengths))

        self.episodes_so_far += num_episodes
        self.timesteps_so_far += num_timesteps

        # Assemble the results.
        eval_returns = np.array(all_eval_returns)
        eval_lengths = np.array(all_eval_lengths)
        noise_indices = np.array(all_noise_indices)
        noisy_returns = np.array(all_training_returns)
        noisy_lengths = np.array(all_training_lengths)

        # Process the returns.
        if config["return_proc_mode"] == "centered_rank":
            proc_noisy_returns = utils.compute_centered_ranks(noisy_returns)
        else:
            raise NotImplementedError(config["return_proc_mode"])

        # Compute and take a step.
        g, count = utils.batched_weighted_sum(
            proc_noisy_returns[:, 0] - proc_noisy_returns[:, 1],
            (self.noise.get(index, self.policy.num_params)
             for index in noise_indices),
            batch_size=500)
        g /= noisy_returns.size
        assert (
            g.shape == (self.policy.num_params,) and
            g.dtype == np.float32 and
            count == len(noise_indices))
        # Compute the new weights theta.
        theta, update_ratio = self.optimizer.update(
            -g + config["l2_coeff"] * theta)
        # Set the new weights in the local copy of the policy.
        self.policy.set_weights(theta)

        step_tend = time.time()
        tlogger.record_tabular("EvalEpRewMean", eval_returns.mean())
        tlogger.record_tabular("EvalEpRewStd", eval_returns.std())
        tlogger.record_tabular("EvalEpLenMean", eval_lengths.mean())

        tlogger.record_tabular("EpRewMean", noisy_returns.mean())
        tlogger.record_tabular("EpRewStd", noisy_returns.std())
        tlogger.record_tabular("EpLenMean", noisy_lengths.mean())

        tlogger.record_tabular("Norm", float(np.square(theta).sum()))
        tlogger.record_tabular("GradNorm", float(np.square(g).sum()))
        tlogger.record_tabular("UpdateRatio", float(update_ratio))

        tlogger.record_tabular("EpisodesThisIter", noisy_lengths.size)
        tlogger.record_tabular("EpisodesSoFar", self.episodes_so_far)
        tlogger.record_tabular("TimestepsThisIter", noisy_lengths.sum())
        tlogger.record_tabular("TimestepsSoFar", self.timesteps_so_far)

        tlogger.record_tabular("TimeElapsedThisIter", step_tend - step_tstart)
        tlogger.record_tabular("TimeElapsed", step_tend - self.tstart)
        tlogger.dump_tabular()

        info = {
            "weights_norm": np.square(theta).sum(),
            "grad_norm": np.square(g).sum(),
            "update_ratio": update_ratio,
            "episodes_this_iter": noisy_lengths.size,
            "episodes_so_far": self.episodes_so_far,
            "timesteps_this_iter": noisy_lengths.sum(),
            "timesteps_so_far": self.timesteps_so_far,
            "time_elapsed_this_iter": step_tend - step_tstart,
            "time_elapsed": step_tend - self.tstart
        }

        result = ray.tune.result.TrainingResult(
            episode_reward_mean=eval_returns.mean(),
            episode_len_mean=eval_lengths.mean(),
            timesteps_this_iter=noisy_lengths.sum(),
            info=info)

        return result

    def _stop(self):
        # workaround for https://github.com/ray-project/ray/issues/1516
        for w in self.workers:
            w.__ray_terminate__.remote()

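    # Checkpointing: _save pickles [weights, episodes_so_far,
    # timesteps_so_far] to a file named after the current training iteration,
    # and _restore loads the same triple back into the agent.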
    def _save(self, checkpoint_dir):
        checkpoint_path = os.path.join(
            checkpoint_dir, "checkpoint-{}".format(self.iteration))
        weights = self.policy.get_weights()
        objects = [
            weights,
            self.episodes_so_far,
            self.timesteps_so_far]
        pickle.dump(objects, open(checkpoint_path, "wb"))
        return checkpoint_path

    def _restore(self, checkpoint_path):
        objects = pickle.load(open(checkpoint_path, "rb"))
        self.policy.set_weights(objects[0])
        self.episodes_so_far = objects[1]
        self.timesteps_so_far = objects[2]

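    # Serve a single action from the current (unperturbed) policy weights;
    # update=False is passed so that serving does not update rollout-time
    # statistics (e.g. the observation filter).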
    def compute_action(self, observation):
        return self.policy.compute(observation, update=False)[0]