Mirror of https://github.com/vale981/ray, synced 2025-03-07 02:51:39 -05:00

* Remove all __future__ imports from RLlib.
* Remove (object) again from tf_run_builder.py::TFRunBuilder.
* Fix 2x LINT warnings.
* Fix broken appo_policy import (must be appo_tf_policy).
* Remove future imports from all other ray files (not just RLlib).
* Remove future import blocks that contain `unicode_literals` as well.
* Revert appo_tf_policy.py to appo_policy.py (belongs to another PR).
* Add two empty lines before Schedule class.
* Put back __future__ imports into determine_tests_to_run.py; fails otherwise on a py2/print-related error.
52 lines
1.7 KiB
Python
# Code in this file is copied and adapted from
# https://github.com/openai/evolution-strategies-starter.

import numpy as np


class Optimizer:
    def __init__(self, pi):
        self.pi = pi
        self.dim = pi.num_params
        self.t = 0

    def update(self, globalg):
        self.t += 1
        step = self._compute_step(globalg)
        theta = self.pi.get_weights()
        # Relative magnitude of the update vs. the current weights.
        ratio = np.linalg.norm(step) / np.linalg.norm(theta)
        return theta + step, ratio

    def _compute_step(self, globalg):
        raise NotImplementedError


class SGD(Optimizer):
    def __init__(self, pi, stepsize, momentum=0.9):
        Optimizer.__init__(self, pi)
        self.v = np.zeros(self.dim, dtype=np.float32)
        self.stepsize, self.momentum = stepsize, momentum

    def _compute_step(self, globalg):
        # Exponential moving average of the gradient (momentum).
        self.v = self.momentum * self.v + (1. - self.momentum) * globalg
        step = -self.stepsize * self.v
        return step


class Adam(Optimizer):
    def __init__(self, pi, stepsize, beta1=0.9, beta2=0.999, epsilon=1e-08):
        Optimizer.__init__(self, pi)
        self.stepsize = stepsize
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = np.zeros(self.dim, dtype=np.float32)
        self.v = np.zeros(self.dim, dtype=np.float32)

    def _compute_step(self, globalg):
        # Bias-corrected effective step size for step t.
        a = self.stepsize * (np.sqrt(1 - self.beta2**self.t) /
                             (1 - self.beta1**self.t))
        # First and second moment estimates of the gradient.
        self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
        self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
        step = -a * self.m / (np.sqrt(self.v) + self.epsilon)
        return step
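For context, a minimal usage sketch (not part of the file): these optimizers only require a policy-like object exposing num_params and get_weights(). The DummyPolicy below is a hypothetical stand-in for illustration, and the random gradient is a placeholder for an ES gradient estimate.

# Hypothetical usage sketch, assuming the classes above are in scope.
import numpy as np

class DummyPolicy:
    # Stand-in for a policy: a flat parameter vector with the
    # attributes the optimizers expect.
    def __init__(self, num_params):
        self.num_params = num_params
        self._weights = np.ones(num_params, dtype=np.float32)

    def get_weights(self):
        return self._weights

    def set_weights(self, w):
        self._weights = w

pi = DummyPolicy(num_params=10)
opt = Adam(pi, stepsize=0.01)
grad = np.random.randn(10).astype(np.float32)  # placeholder gradient
theta, update_ratio = opt.update(grad)
pi.set_weights(theta)  # caller applies the returned weights
print(update_ratio)    # step norm relative to weight norm

Note that update() returns the new weight vector rather than mutating the policy in place, so the caller decides when and how to apply it; this keeps the optimizer decoupled from how weights are stored.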