[rllib] add augmented random search (#2714)
* added ARS
* functioning ARS with regression test
* added regression tests for ARS
* fixed default config for ARS
* ARS code runs, now time to test
* ARS working and tested, changed std deviation of meanstd filter to initialize to 1
* pep8 fixes
* removed unused linear model
* address comments
* more fixing comments
* post yapf
* fixed support failure
* Update LICENSE
* Update policies.py
* Update test_supported_spaces.py
* Update policies.py
* Update LICENSE
* Update test_supported_spaces.py
* Update policies.py
* Update policies.py
* Update filter.py
parent 5fd44afb8a
commit 6201a6d1c7
11 changed files with 698 additions and 1 deletion
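For context, the sketch below shows roughly how the new agent is meant to be driven once this commit is in place. It is not part of the diff: it assumes the Agent base class of this Ray version accepts `config` and `env` keyword arguments and that gym's CartPole-v0 is installed; the overridden config values are arbitrary illustration.

import ray
from ray.rllib.agents.ars import ARSAgent, DEFAULT_CONFIG

ray.init()
# Override a few defaults (values chosen only for illustration).
config = dict(DEFAULT_CONFIG, num_workers=2, num_deltas=8, deltas_used=8)
agent = ARSAgent(config=config, env="CartPole-v0")
for i in range(5):
    result = agent.train()  # one ARS iteration: antithetic rollouts + parameter update
    print(i, result)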
LICENSE (27 additions)
@@ -243,3 +243,30 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
+
+--------------------------------------------------------------------------------
+Code in python/ray/rllib/ars is adapted from https://github.com/modestyachts/ARS
+
+Copyright (c) 2018, ARS contributors (Horia Mania, Aurelia Guy, Benjamin Recht)
+All rights reserved.
+
+Redistribution and use of ARS in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -17,9 +17,10 @@ from ray.rllib.evaluation.sample_batch import SampleBatch


def _register_all():
    for key in [
            "PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG", "APEX_DDPG",
-           "IMPALA", "A2C", "__fake", "__sigmoid_fake_data",
+           "IMPALA", "ARS", "A2C", "__fake", "__sigmoid_fake_data",
            "__parameter_tuning"
    ]:
        from ray.rllib.agents.agent import get_agent_class
@@ -393,6 +393,9 @@ def get_agent_class(alg):
    elif alg == "ES":
        from ray.rllib.agents import es
        return es.ESAgent
+   elif alg == "ARS":
+       from ray.rllib.agents import ars
+       return ars.ARSAgent
    elif alg == "DQN":
        from ray.rllib.agents import dqn
        return dqn.DQNAgent
python/ray/rllib/agents/ars/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from ray.rllib.agents.ars.ars import (ARSAgent, DEFAULT_CONFIG)

__all__ = ["ARSAgent", "DEFAULT_CONFIG"]
python/ray/rllib/agents/ars/ars.py (new file, 351 lines)
@@ -0,0 +1,351 @@
# Code in this file is copied and adapted from
# https://github.com/openai/evolution-strategies-starter and from
# https://github.com/modestyachts/ARS

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import namedtuple
import numpy as np
import os
import pickle
import time

import ray
from ray.rllib.agents import Agent, with_common_config
from ray.tune.trial import Resources

from ray.rllib.agents.ars import optimizers
from ray.rllib.agents.ars import policies
from ray.rllib.agents.es import tabular_logger as tlogger
from ray.rllib.agents.ars import utils

Result = namedtuple("Result", [
    "noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
    "eval_returns", "eval_lengths"
])

DEFAULT_CONFIG = with_common_config({
    'noise_stdev': 0.02,  # std deviation of parameter noise
    'num_deltas': 4,  # number of perturbations to try
    'deltas_used': 4,  # number of perturbations to keep in gradient estimate
    'num_workers': 2,
    'stepsize': 0.01,  # sgd step-size
    'observation_filter': "MeanStdFilter",
    'noise_size': 250000000,
    'eval_prob': 0.03,  # probability of evaluating the parameter rewards
    'env_config': {},
    'offset': 0,
    'policy_type': "LinearPolicy",  # ["LinearPolicy", "MLPPolicy"]
    "fcnet_hiddens": [32, 32],  # fcnet structure of MLPPolicy
})


@ray.remote
def create_shared_noise(count):
    """Create a large array of noise to be shared by all workers."""
    seed = 123
    noise = np.random.RandomState(seed).randn(count).astype(np.float32)
    return noise


class SharedNoiseTable(object):
    def __init__(self, noise):
        self.noise = noise
        assert self.noise.dtype == np.float32

    def get(self, i, dim):
        return self.noise[i:i + dim]

    def sample_index(self, dim):
        return np.random.randint(0, len(self.noise) - dim + 1)

    def get_delta(self, dim):
        idx = self.sample_index(dim)
        return idx, self.get(idx, dim)


@ray.remote
class Worker(object):
    def __init__(self,
                 config,
                 policy_params,
                 env_creator,
                 noise,
                 min_task_runtime=0.2):
        self.min_task_runtime = min_task_runtime
        self.config = config
        self.policy_params = policy_params
        self.noise = SharedNoiseTable(noise)

        self.env = env_creator(config["env_config"])
        from ray.rllib import models
        self.preprocessor = models.ModelCatalog.get_preprocessor(self.env)

        self.sess = utils.make_session(single_threaded=True)
        if config["policy_type"] == "LinearPolicy":
            self.policy = policies.LinearPolicy(
                self.sess, self.env.action_space, self.preprocessor,
                config["observation_filter"], **policy_params)
        else:
            self.policy = policies.MLPPolicy(
                self.sess, self.env.action_space, self.preprocessor,
                config["observation_filter"], config["fcnet_hiddens"],
                **policy_params)

    def rollout(self, timestep_limit, add_noise=False):
        rollout_rewards, rollout_length = policies.rollout(
            self.policy,
            self.env,
            timestep_limit=timestep_limit,
            add_noise=add_noise,
            offset=self.config['offset'])
        return rollout_rewards, rollout_length

    def do_rollouts(self, params, timestep_limit=None):
        # Set the network weights.
        self.policy.set_weights(params)

        noise_indices, returns, sign_returns, lengths = [], [], [], []
        eval_returns, eval_lengths = [], []

        # Perform some rollouts with noise.
        while (len(noise_indices) == 0):
            if np.random.uniform() < self.config["eval_prob"]:
                # Do an evaluation run with no perturbation.
                self.policy.set_weights(params)
                rewards, length = self.rollout(timestep_limit, add_noise=False)
                eval_returns.append(rewards.sum())
                eval_lengths.append(length)
            else:
                # Do a regular run with parameter perturbations.
                noise_index = self.noise.sample_index(self.policy.num_params)

                perturbation = self.config["noise_stdev"] * self.noise.get(
                    noise_index, self.policy.num_params)

                # These two sampling steps could be done in parallel on
                # different actors letting us update twice as frequently.
                self.policy.set_weights(params + perturbation)
                rewards_pos, lengths_pos = self.rollout(timestep_limit)

                self.policy.set_weights(params - perturbation)
                rewards_neg, lengths_neg = self.rollout(timestep_limit)

                noise_indices.append(noise_index)
                returns.append([rewards_pos.sum(), rewards_neg.sum()])
                sign_returns.append(
                    [np.sign(rewards_pos).sum(),
                     np.sign(rewards_neg).sum()])
                lengths.append([lengths_pos, lengths_neg])

        return Result(
            noise_indices=noise_indices,
            noisy_returns=returns,
            sign_noisy_returns=sign_returns,
            noisy_lengths=lengths,
            eval_returns=eval_returns,
            eval_lengths=eval_lengths)


class ARSAgent(Agent):
    """Large-scale implementation of Augmented Random Search in Ray."""

    _agent_name = "ARS"
    _default_config = DEFAULT_CONFIG

    @classmethod
    def default_resource_request(cls, config):
        cf = dict(cls._default_config, **config)
        return Resources(cpu=1, gpu=0, extra_cpu=cf["num_workers"])

    def _init(self):
        policy_params = {"action_noise_std": 0.0}

        # register the linear network
        utils.register_linear_network()

        env = self.env_creator(self.config["env_config"])
        from ray.rllib import models
        preprocessor = models.ModelCatalog.get_preprocessor(env)

        self.sess = utils.make_session(single_threaded=False)
        if self.config["policy_type"] == "LinearPolicy":
            self.policy = policies.LinearPolicy(
                self.sess, env.action_space, preprocessor,
                self.config["observation_filter"], **policy_params)
        else:
            self.policy = policies.MLPPolicy(
                self.sess, env.action_space, preprocessor,
                self.config["observation_filter"],
                self.config["fcnet_hiddens"], **policy_params)
        self.optimizer = optimizers.Adam(self.policy, self.config["stepsize"])

        self.deltas_used = self.config["deltas_used"]
        self.num_deltas = self.config["num_deltas"]

        # Create the shared noise table.
        print("Creating shared noise table.")
        noise_id = create_shared_noise.remote(self.config["noise_size"])
        self.noise = SharedNoiseTable(ray.get(noise_id))

        # Create the actors.
        print("Creating actors.")
        self.workers = [
            Worker.remote(self.config, policy_params, self.env_creator,
                          noise_id) for _ in range(self.config["num_workers"])
        ]

        self.episodes_so_far = 0
        self.timesteps_so_far = 0
        self.tstart = time.time()

    def _collect_results(self, theta_id, min_episodes):
        num_episodes, num_timesteps = 0, 0
        results = []
        while num_episodes < min_episodes:
            print("Collected {} episodes {} timesteps so far this iter".format(
                num_episodes, num_timesteps))
            rollout_ids = [
                worker.do_rollouts.remote(theta_id) for worker in self.workers
            ]
            # Get the results of the rollouts.
            for result in ray.get(rollout_ids):
                results.append(result)
                # Update the number of episodes and the number of timesteps
                # keeping in mind that result.noisy_lengths is a list of lists,
                # where the inner lists have length 2.
                num_episodes += sum(len(pair) for pair in result.noisy_lengths)
                num_timesteps += sum(
                    sum(pair) for pair in result.noisy_lengths)
        return results, num_episodes, num_timesteps

    def _train(self):
        config = self.config

        step_tstart = time.time()
        theta = self.policy.get_weights()
        assert theta.dtype == np.float32

        # Put the current policy weights in the object store.
        theta_id = ray.put(theta)
        # Use the actors to do rollouts, note that we pass in the ID of the
        # policy weights.
        results, num_episodes, num_timesteps = self._collect_results(
            theta_id, config["num_deltas"])

        all_noise_indices = []
        all_training_returns = []
        all_training_lengths = []
        all_eval_returns = []
        all_eval_lengths = []

        # Loop over the results.
        for result in results:
            all_eval_returns += result.eval_returns
            all_eval_lengths += result.eval_lengths

            all_noise_indices += result.noise_indices
            all_training_returns += result.noisy_returns
            all_training_lengths += result.noisy_lengths

        assert len(all_eval_returns) == len(all_eval_lengths)
        assert (len(all_noise_indices) == len(all_training_returns) ==
                len(all_training_lengths))

        self.episodes_so_far += num_episodes
        self.timesteps_so_far += num_timesteps

        # Assemble the results.
        eval_returns = np.array(all_eval_returns)
        eval_lengths = np.array(all_eval_lengths)
        noise_indices = np.array(all_noise_indices)
        noisy_returns = np.array(all_training_returns)
        noisy_lengths = np.array(all_training_lengths)

        # keep only the best returns
        # select top performing directions if deltas_used < num_deltas
        max_rewards = np.max(noisy_returns, axis=1)
        if self.deltas_used > self.num_deltas:
            self.deltas_used = self.num_deltas

        percentile = 100 * (1 - (self.deltas_used / self.num_deltas))
        idx = np.arange(max_rewards.size)[
            max_rewards >= np.percentile(max_rewards, percentile)]
        noise_idx = noise_indices[idx]
        noisy_returns = noisy_returns[idx, :]

        # Compute and take a step.
        g, count = utils.batched_weighted_sum(
            noisy_returns[:, 0] - noisy_returns[:, 1],
            (self.noise.get(index, self.policy.num_params)
             for index in noise_idx),
            batch_size=min(500, noisy_returns[:, 0].size))
        g /= noise_idx.size
        # scale the returns by their standard deviation
        if not np.isclose(np.std(noisy_returns), 0.0):
            g /= np.std(noisy_returns)
        assert (g.shape == (self.policy.num_params, )
                and g.dtype == np.float32)
        print('the number of policy params is, ', self.policy.num_params)
        # Compute the new weights theta.
        theta, update_ratio = self.optimizer.update(-g)
        # Set the new weights in the local copy of the policy.
        self.policy.set_weights(theta)

        step_tend = time.time()
        tlogger.record_tabular("EvalEpRewMean", eval_returns.mean())
        tlogger.record_tabular("EvalEpRewStd", eval_returns.std())
        tlogger.record_tabular("EvalEpLenMean", eval_lengths.mean())

        tlogger.record_tabular("NoisyEpRewMean", noisy_returns.mean())
        tlogger.record_tabular("NoisyEpRewStd", noisy_returns.std())
        tlogger.record_tabular("NoisyEpLenMean", noisy_lengths.mean())

        tlogger.record_tabular("WeightsNorm", float(np.square(theta).sum()))
        tlogger.record_tabular("WeightsStd", float(np.std(theta)))
        tlogger.record_tabular("Grad2Norm", float(np.sqrt(np.square(g).sum())))
        tlogger.record_tabular("UpdateRatio", float(update_ratio))
        tlogger.dump_tabular()

        info = {
            "weights_norm": np.square(theta).sum(),
            "grad_norm": np.square(g).sum(),
            "update_ratio": update_ratio,
            "episodes_this_iter": noisy_lengths.size,
            "episodes_so_far": self.episodes_so_far,
            "timesteps_so_far": self.timesteps_so_far,
            "time_elapsed_this_iter": step_tend - step_tstart,
            "time_elapsed": step_tend - self.tstart
        }

        result = dict(
            episode_reward_mean=eval_returns.mean(),
            episode_len_mean=eval_lengths.mean(),
            timesteps_this_iter=noisy_lengths.sum(),
            info=info)

        return result

    def _stop(self):
        # workaround for https://github.com/ray-project/ray/issues/1516
        for w in self.workers:
            w.__ray_terminate__.remote()

    def _save(self, checkpoint_dir):
        checkpoint_path = os.path.join(checkpoint_dir,
                                       "checkpoint-{}".format(self.iteration))
        weights = self.policy.get_weights()
        objects = [weights, self.episodes_so_far, self.timesteps_so_far]
        pickle.dump(objects, open(checkpoint_path, "wb"))
        return checkpoint_path

    def _restore(self, checkpoint_path):
        objects = pickle.load(open(checkpoint_path, "rb"))
        self.policy.set_weights(objects[0])
        self.episodes_so_far = objects[1]
        self.timesteps_so_far = objects[2]

    def compute_action(self, observation):
        return self.policy.compute(observation, update=True)[0]
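The gradient estimate in _train above is the ARS update: each direction is evaluated with an antithetic rollout pair, optionally only the top directions are kept, and the estimate is scaled by the standard deviation of the retained returns. The following is a small self-contained numpy sketch of that estimator, not part of the diff, using hypothetical toy returns and random deltas in place of the shared noise table:

import numpy as np

num_params = 3
# One row per direction: [return with +delta, return with -delta] (toy values).
noisy_returns = np.array([[1.0, 0.4], [0.2, 0.9], [0.7, 0.6]])
deltas = np.random.randn(len(noisy_returns), num_params).astype(np.float32)

# Keep only the top directions by max(r+, r-), mirroring deltas_used/num_deltas.
deltas_used = 2
max_rewards = noisy_returns.max(axis=1)
keep = np.argsort(max_rewards)[-deltas_used:]

# Weight each kept delta by (r+ - r-), average, and scale by the return std.
diffs = noisy_returns[keep, 0] - noisy_returns[keep, 1]
g = (diffs[:, None] * deltas[keep]).sum(axis=0) / deltas_used
g /= np.std(noisy_returns[keep])
print(g)  # direction the agent then hands to the Adam optimizer (as -g)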
python/ray/rllib/agents/ars/optimizers.py (new file, 56 lines)
@@ -0,0 +1,56 @@
# Code in this file is copied and adapted from
# https://github.com/openai/evolution-strategies-starter.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np


class Optimizer(object):
    def __init__(self, pi):
        self.pi = pi
        self.dim = pi.num_params
        self.t = 0

    def update(self, globalg):
        self.t += 1
        step = self._compute_step(globalg)
        theta = self.pi.get_weights()
        ratio = np.linalg.norm(step) / np.linalg.norm(theta)
        return theta + step, ratio

    def _compute_step(self, globalg):
        raise NotImplementedError


class SGD(Optimizer):
    def __init__(self, pi, stepsize, momentum=0.9):
        Optimizer.__init__(self, pi)
        self.v = np.zeros(self.dim, dtype=np.float32)
        self.stepsize, self.momentum = stepsize, momentum

    def _compute_step(self, globalg):
        self.v = self.momentum * self.v + (1. - self.momentum) * globalg
        step = -self.stepsize * self.v
        return step


class Adam(Optimizer):
    def __init__(self, pi, stepsize, beta1=0.9, beta2=0.999, epsilon=1e-08):
        Optimizer.__init__(self, pi)
        self.stepsize = stepsize
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = np.zeros(self.dim, dtype=np.float32)
        self.v = np.zeros(self.dim, dtype=np.float32)

    def _compute_step(self, globalg):
        a = self.stepsize * (np.sqrt(1 - self.beta2**self.t) /
                             (1 - self.beta1**self.t))
        self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
        self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
        step = -a * self.m / (np.sqrt(self.v) + self.epsilon)
        return step
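These optimizers only require an object exposing num_params and get_weights(), so they can be exercised in isolation. A quick sketch, not part of the diff, with a hypothetical stand-in policy object (FlatPolicy is invented here purely for illustration):

import numpy as np
from ray.rllib.agents.ars.optimizers import Adam

class FlatPolicy(object):
    """Stand-in for an ARS policy: just a flat float32 weight vector."""
    def __init__(self, num_params):
        self.num_params = num_params
        self.weights = np.ones(num_params, dtype=np.float32) * 0.1
    def get_weights(self):
        return self.weights
    def set_weights(self, w):
        self.weights = w

pi = FlatPolicy(4)
opt = Adam(pi, stepsize=0.01)
fake_grad = np.ones(4, dtype=np.float32)
# Same sign convention as ARSAgent._train, which calls optimizer.update(-g).
new_theta, update_ratio = opt.update(-fake_grad)
pi.set_weights(new_theta)
print(new_theta, update_ratio)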
python/ray/rllib/agents/ars/policies.py (new file, 136 lines)
@@ -0,0 +1,136 @@
# Code in this file is copied and adapted from
# https://github.com/openai/evolution-strategies-starter.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import gym
import numpy as np
import tensorflow as tf

import ray
from ray.rllib.utils.filter import get_filter
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.models import ModelCatalog


def rollout(policy, env, timestep_limit=None, add_noise=False, offset=0):
    """Do a rollout.

    If add_noise is True, the rollout will take noisy actions with
    noise drawn from that stream. Otherwise, no action noise will be added.

    Parameters
    ----------
    policy: tf object
        policy from which to draw actions
    env: GymEnv
        environment from which to draw rewards, done, and next state
    timestep_limit: int, optional
        steps after which to end the rollout
    add_noise: bool, optional
        indicates whether exploratory action noise should be added
    offset: int, optional
        value to subtract from the reward. For example, survival bonus
        from humanoid
    """
    env_timestep_limit = env.spec.max_episode_steps
    timestep_limit = (env_timestep_limit if timestep_limit is None else min(
        timestep_limit, env_timestep_limit))
    rews = []
    t = 0
    observation = env.reset()
    for _ in range(timestep_limit or 999999):
        ac = policy.compute(observation, add_noise=add_noise, update=True)[0]
        observation, rew, done, _ = env.step(ac)
        rew -= np.abs(offset)
        rews.append(rew)
        t += 1
        if done:
            break
    rews = np.array(rews, dtype=np.float32)
    return rews, t


class GenericPolicy(object):
    def __init__(self,
                 sess,
                 action_space,
                 preprocessor,
                 observation_filter,
                 action_noise_std,
                 options={}):

        if len(preprocessor.shape) > 1:
            raise UnsupportedSpaceException(
                "Observation space {} is not supported with ARS.".format(
                    preprocessor.shape))

        self.sess = sess
        self.action_space = action_space
        self.action_noise_std = action_noise_std
        self.preprocessor = preprocessor
        self.observation_filter = get_filter(observation_filter,
                                             self.preprocessor.shape)
        self.inputs = tf.placeholder(tf.float32,
                                     [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            action_space, dist_type="deterministic")

        model = ModelCatalog.get_model(self.inputs, dist_dim, options=options)
        dist = dist_class(model.outputs)
        self.sampler = dist.sample()

        self.variables = ray.experimental.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
        self.sess.run(tf.global_variables_initializer())

    def compute(self, observation, add_noise=False, update=True):
        observation = self.preprocessor.transform(observation)
        observation = self.observation_filter(observation[None], update=update)
        action = self.sess.run(
            self.sampler, feed_dict={self.inputs: observation})
        if add_noise and isinstance(self.action_space, gym.spaces.Box):
            action += np.random.randn(*action.shape) * self.action_noise_std
        return action

    def set_weights(self, x):
        self.variables.set_flat(x)

    def get_weights(self):
        return self.variables.get_flat()


class LinearPolicy(GenericPolicy):
    def __init__(self, sess, action_space, preprocessor, observation_filter,
                 action_noise_std):
        options = {"custom_model": "LinearNetwork"}
        GenericPolicy.__init__(
            self,
            sess,
            action_space,
            preprocessor,
            observation_filter,
            action_noise_std,
            options=options)


class MLPPolicy(GenericPolicy):
    def __init__(self, sess, action_space, preprocessor, observation_filter,
                 fcnet_hiddens, action_noise_std):
        options = {"fcnet_hiddens": fcnet_hiddens}
        GenericPolicy.__init__(
            self,
            sess,
            action_space,
            preprocessor,
            observation_filter,
            action_noise_std,
            options=options)
python/ray/rllib/agents/ars/utils.py (new file, 82 lines)
@@ -0,0 +1,82 @@
# Code in this file is copied and adapted from
# https://github.com/openai/evolution-strategies-starter.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf
from ray.rllib.models import ModelCatalog, Model
import tensorflow.contrib.slim as slim
from ray.rllib.models.misc import normc_initializer


def compute_ranks(x):
    """Returns ranks in [0, len(x))

    Note: This is different from scipy.stats.rankdata, which returns ranks in
    [1, len(x)].
    """
    assert x.ndim == 1
    ranks = np.empty(len(x), dtype=int)
    ranks[x.argsort()] = np.arange(len(x))
    return ranks


def compute_centered_ranks(x):
    y = compute_ranks(x.ravel()).reshape(x.shape).astype(np.float32)
    y /= (x.size - 1)
    y -= 0.5
    return y


def make_session(single_threaded):
    if not single_threaded:
        return tf.Session()
    return tf.Session(
        config=tf.ConfigProto(
            inter_op_parallelism_threads=1, intra_op_parallelism_threads=1))


def itergroups(items, group_size):
    assert group_size >= 1
    group = []
    for x in items:
        group.append(x)
        if len(group) == group_size:
            yield tuple(group)
            del group[:]
    if group:
        yield tuple(group)


def batched_weighted_sum(weights, vecs, batch_size):
    total = 0
    num_items_summed = 0
    for batch_weights, batch_vecs in zip(
            itergroups(weights, batch_size), itergroups(vecs, batch_size)):
        assert len(batch_weights) == len(batch_vecs) <= batch_size
        total += np.dot(
            np.asarray(batch_weights, dtype=np.float32),
            np.asarray(batch_vecs, dtype=np.float32))
        num_items_summed += len(batch_weights)
    return total, num_items_summed


class LinearNetwork(Model):
    """Generic linear network."""

    def _build_layers(self, inputs, num_outputs, _):
        with tf.name_scope("linear"):
            output = slim.fully_connected(
                inputs,
                num_outputs,
                weights_initializer=normc_initializer(0.01),
                activation_fn=None,
            )
        return output, inputs


def register_linear_network():
    ModelCatalog.register_custom_model("LinearNetwork", LinearNetwork)
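compute_centered_ranks and batched_weighted_sum are plain numpy helpers and can be sanity-checked on their own. A short sketch with toy values, not part of the diff:

import numpy as np
from ray.rllib.agents.ars.utils import compute_centered_ranks, batched_weighted_sum

returns = np.array([3.0, -1.0, 2.0, 0.5])
print(compute_centered_ranks(returns))  # ranks mapped into [-0.5, 0.5]

weights = np.array([1.0, 2.0, 3.0], dtype=np.float32)
vecs = [np.ones(4, dtype=np.float32) * i for i in range(3)]
total, n = batched_weighted_sum(weights, vecs, batch_size=2)
print(total, n)  # weighted sum of the vectors, and how many items were summed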
@@ -116,6 +116,13 @@ class ModelSupportedSpaces(unittest.TestCase):
            "episodes_per_batch": 1,
            "timesteps_per_batch": 1
        }, stats)
+       check_support(
+           "ARS", {
+               "num_workers": 1,
+               "noise_size": 10000000,
+               "num_deltas": 1,
+               "deltas_used": 1
+           }, stats)
        check_support("PG", {"num_workers": 1, "optimizer": {}}, stats)
        num_unexpected_errors = 0
        for (alg, a_name, o_name), stat in sorted(stats.items()):
@@ -0,0 +1,16 @@
cartpole-ars:
    env: CartPole-v0
    run: ARS
    stop:
        episode_reward_mean: 200
        time_total_s: 600
    config:
        noise_stdev: 0.02
        num_deltas: 50
        deltas_used: 25
        num_workers: 2
        stepsize: 0.01
        noise_size: 250000000
        eval_prob: 0.5
        policy_type: MLPPolicy
        fcnet_hiddens: [16, 16]
python/ray/rllib/tuned_examples/swimmer-ars.yaml (new file, 15 lines)
@@ -0,0 +1,15 @@
# can expect improvement to -140 reward in ~300-500k timesteps
pendulum-ars:
    env: Swimmer-v2
    run: ARS
    config:
        noise_stdev: 0.01
        num_deltas: 2
        deltas_used: 1
        num_workers: 1
        stepsize: 0.02
        noise_size: 250000000
        fcnet_hiddens: [32,32]
        policy_type: LinearPolicy
        eval_prob: 0.2
        offset: 0
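These tuned examples are ordinary Tune experiment specs, so they can also be launched from Python. A minimal sketch, not part of the diff, assuming PyYAML is installed and the path below points at the file in a checkout of the repo:

import yaml
import ray
from ray.tune import run_experiments

ray.init()
# Path is relative to a repo checkout; adjust as needed.
with open("python/ray/rllib/tuned_examples/swimmer-ars.yaml") as f:
    experiments = yaml.safe_load(f)  # maps experiment name -> env/run/config spec
run_experiments(experiments)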