# Code in this file is copied and adapted from
# https://github.com/openai/evolution-strategies-starter.

import gym
import numpy as np

import ray
import ray.experimental.tf_utils
from ray.rllib.evaluation.sampler import _unbatch_tuple_actions
from ray.rllib.models import ModelCatalog
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.filter import get_filter
from ray.rllib.utils.framework import try_import_tf

tf = try_import_tf()


def rollout(policy, env, timestep_limit=None, add_noise=False, offset=0.0):
    """Do a rollout.

    If add_noise is True, the rollout will take noisy actions, with Gaussian
    noise added to each computed action. Otherwise, no action noise will be
    added.

    Args:
        policy (Policy): RLlib Policy from which to draw actions.
        env (gym.Env): Environment from which to draw rewards, done, and
            next state.
        timestep_limit (Optional[int]): Steps after which to end the rollout.
            If None, use `env.spec.max_episode_steps` or 999999.
        add_noise (bool): Indicates whether exploratory action noise should be
            added.
        offset (float): Value to subtract from the reward (e.g. survival bonus
            from humanoid).

    Returns:
        Tuple of (rewards, length): the per-step rewards as an np.float32
        array (offset already subtracted) and the number of timesteps taken.
    """
    max_timestep_limit = 999999
    env_timestep_limit = env.spec.max_episode_steps if (
        hasattr(env, "spec") and hasattr(env.spec, "max_episode_steps")) \
        else max_timestep_limit
    timestep_limit = (env_timestep_limit if timestep_limit is None else min(
        timestep_limit, env_timestep_limit))
    rewards = []
    t = 0
    observation = env.reset()
    for _ in range(timestep_limit or max_timestep_limit):
        ac = policy.compute_actions(
            observation, add_noise=add_noise, update=True)[0]
        observation, r, done, _ = env.step(ac)
        if offset != 0.0:
            r -= np.abs(offset)
        rewards.append(r)
        t += 1
        if done:
            break
    rewards = np.array(rewards, dtype=np.float32)
    return rewards, t
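

# Example usage (sketch, not part of the original file): `rollout` only needs
# an object exposing `compute_actions(obs, add_noise=..., update=...)` plus a
# Gym env, so a minimal dummy policy is enough to exercise it. The names
# below (`_RandomPolicy`, the CartPole env id) are illustrative assumptions.
#
#     class _RandomPolicy:
#         def __init__(self, action_space):
#             self.action_space = action_space
#
#         def compute_actions(self, obs, add_noise=False, update=True):
#             return [self.action_space.sample()]
#
#     env = gym.make("CartPole-v1")
#     rewards, length = rollout(_RandomPolicy(env.action_space), env)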


def make_session(single_threaded):
    if not single_threaded:
        return tf.Session()
    return tf.Session(
        config=tf.ConfigProto(
            inter_op_parallelism_threads=1, intra_op_parallelism_threads=1))
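

# Note (added, not from the original file): with `single_threaded=True` each
# TF session is limited to one inter-op and one intra-op thread, which is
# presumably meant to avoid thread oversubscription when many rollout workers
# share a machine.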


class ESTFPolicy:
    def __init__(self, obs_space, action_space, config):
        self.action_space = action_space
        self.action_noise_std = config["action_noise_std"]
        self.preprocessor = ModelCatalog.get_preprocessor_for_space(obs_space)
        self.observation_filter = get_filter(config["observation_filter"],
                                             self.preprocessor.shape)
        self.single_threaded = config.get("single_threaded", False)
        self.sess = make_session(single_threaded=self.single_threaded)
        self.inputs = tf.placeholder(tf.float32,
                                     [None] + list(self.preprocessor.shape))

        # Policy network: a deterministic action distribution on top of the
        # catalog model, sampled to produce actions for ES rollouts.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            self.action_space, config["model"], dist_type="deterministic")
        model = ModelCatalog.get_model({
            SampleBatch.CUR_OBS: self.inputs
        }, obs_space, action_space, dist_dim, config["model"])
        dist = dist_class(model.outputs, model)
        self.sampler = dist.sample()

        # Helper for reading/writing all policy variables as one flat vector
        # (used by ES to perturb and update the parameters).
        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
        self.sess.run(tf.global_variables_initializer())

    def compute_actions(self, observation, add_noise=False, update=True):
        observation = self.preprocessor.transform(observation)
        observation = self.observation_filter(observation[None], update=update)
        # Run the deterministic policy network on the (batched) observation.
        action = self.sess.run(
            self.sampler, feed_dict={self.inputs: observation})
        action = _unbatch_tuple_actions(action)
        # Optional exploration noise, only applied to continuous actions.
        if add_noise and isinstance(self.action_space, gym.spaces.Box):
            action += np.random.randn(*action.shape) * self.action_noise_std
        return action

    def set_flat_weights(self, x):
        self.variables.set_flat(x)

    def get_flat_weights(self):
        return self.variables.get_flat()
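

# Example usage (sketch, not from the original file): ES-style training
# perturbs the flat parameter vector exposed by `get_flat_weights` /
# `set_flat_weights` and scores each perturbation with `rollout`. The
# `policy`, `env`, and noise scale below are illustrative assumptions.
#
#     theta = policy.get_flat_weights()            # 1-D np.ndarray of params
#     perturbed = theta + 0.02 * np.random.randn(theta.size)
#     policy.set_flat_weights(perturbed)
#     rewards, length = rollout(policy, env, add_noise=False)
#     episode_return = rewards.sum()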