Added functionality for retrieving variables from control dependencies (#220)

* Added test for retrieving variables from an optimizer

* Added comments to test

* Addressed comments

* Fixed travis bug

* Added fix to circular controls

* Added set for explored operations and duplicate prefix stripping

* Removed embedded ipython

* Removed prefix, use separate graph for each network

* Removed redundant imports

* Addressed comments and added separate graph to initializer

* fix typos

* get rid of prefix in documentation
This commit is contained in:
Wapaul1 2017-01-30 19:17:42 -08:00 committed by Philipp Moritz
parent 6703f7be6f
commit db7297865f
4 changed files with 84 additions and 36 deletions

View file

@ -72,21 +72,18 @@ b.assign(np.zeros(1)) # This adds a node to the graph every time you call it.
## Complete Example ## Complete Example
Putting this all together, we would first create the graph on each worker using Putting this all together, we would first create the graph on each worker using
environment variables. Within the environment variables, we would use the environment variables. Within the environment variables, we would use the
`get_weights` and `set_weights` methods of the `TensorFlowVariables` class. We `get_weights` and `set_weights` methods of the `TensorFlowVariables` class. We
would then use those methods to ship the weights (as a dictionary of variable would then use those methods to ship the weights (as a dictionary of variable
names mapping to tensorflow tensors) between the processes without shipping the names mapping to tensorflow tensors) between the processes without shipping the
actual TensorFlow graphs, which are much more complex Python objects. Note that actual TensorFlow graphs, which are much more complex Python objects. Note that
to avoid namespace collision with already created variables on the workers, we to avoid namespace collision with already created variables on the workers, we
use a variable_scope and a prefix in the environment variables and then pass use a separate graph for each network.
true to the prefix in `TensorFlowVariables` so it can properly decode the variable
names.
```python ```python
import tensorflow as tf import tensorflow as tf
import numpy as np import numpy as np
import ray import ray
import uuid
ray.init(num_workers=5) ray.init(num_workers=5)
@ -95,11 +92,8 @@ NUM_BATCHES = 1
NUM_ITERS = 201 NUM_ITERS = 201
def net_vars_initializer(): def net_vars_initializer():
# Prefix should be random so that there is no conflict with variable names in # Use a separate graph for each network.
# the cluster setting. with tf.Graph().as_default():
prefix = str(uuid.uuid1().hex)
# Use the tensorflow variable_scope to prefix all of the variables
with tf.variable_scope(prefix):
# Seed TensorFlow to make the script deterministic. # Seed TensorFlow to make the script deterministic.
tf.set_random_seed(0) tf.set_random_seed(0)
# Define the inputs. # Define the inputs.
@ -116,9 +110,8 @@ def net_vars_initializer():
# Define the weight initializer and session. # Define the weight initializer and session.
init = tf.global_variables_initializer() init = tf.global_variables_initializer()
sess = tf.Session() sess = tf.Session()
# Additional code for setting and getting the weights, and use a prefix # Additional code for setting and getting the weights
# so that the variable names can be converted between workers. variables = ray.experimental.TensorFlowVariables(loss, sess)
variables = ray.experimental.TensorFlowVariables(loss, sess, prefix=True)
# Return all of the data needed to use the network. # Return all of the data needed to use the network.
return variables, sess, train, loss, x_data, y_data, init return variables, sess, train, loss, x_data, y_data, init

View file

@ -61,7 +61,8 @@ class LinearModel(object):
return self.sess.run(self.cross_entropy_grads, feed_dict={self.x: xs, self.y_: ys}) return self.sess.run(self.cross_entropy_grads, feed_dict={self.x: xs, self.y_: ys})
def net_initialization(): def net_initialization():
return LinearModel([784,10]) with tf.Graph().as_default():
return LinearModel([784,10])
# By default, when an environment variable is used by a remote function, the # By default, when an environment variable is used by a remote function, the
# initialization code will be rerun at the end of the remote task to ensure # initialization code will be rerun at the end of the remote task to ensure

View file

@ -28,28 +28,41 @@ class TensorFlowVariables(object):
assignment_placeholders (List[tf.placeholders]): The nodes that weights get assignment_placeholders (List[tf.placeholders]): The nodes that weights get
passed to. passed to.
assignment_nodes (List[tf.Tensor]): The nodes that assign the weights. assignment_nodes (List[tf.Tensor]): The nodes that assign the weights.
prefix (Bool): Boolean for if there is a prefix on the variable names.
""" """
def __init__(self, loss, sess=None, prefix=False): def __init__(self, loss, sess=None):
"""Creates a TensorFlowVariables instance.""" """Creates a TensorFlowVariables instance."""
import tensorflow as tf import tensorflow as tf
self.sess = sess self.sess = sess
self.loss = loss self.loss = loss
self.prefix = prefix
queue = deque([loss]) queue = deque([loss])
variable_names = [] variable_names = []
explored_inputs = set([loss])
# We do a BFS on the dependency graph of the input function to find # We do a BFS on the dependency graph of the input function to find
# the variables. # the variables.
while len(queue) != 0: while len(queue) != 0:
op = queue.popleft().op tf_obj = queue.popleft()
queue.extend(op.inputs)
if op.node_def.op == "Variable": # The object put into the queue is not necessarily an operation, so we
variable_names.append(op.node_def.name) # want the op attribute to get the operation underlying the object.
# Only operations contain the inputs that we can explore.
if hasattr(tf_obj, "op"):
tf_obj = tf_obj.op
for input_op in tf_obj.inputs:
if input_op not in explored_inputs:
queue.append(input_op)
explored_inputs.add(input_op)
# Tensorflow control inputs can be circular, so we keep track of
# explored operations.
for control in tf_obj.control_inputs:
if control not in explored_inputs:
queue.append(control)
explored_inputs.add(control)
if tf_obj.node_def.op == "Variable":
variable_names.append(tf_obj.node_def.name)
self.variables = OrderedDict() self.variables = OrderedDict()
for v in [v for v in tf.global_variables() if v.op.node_def.name in variable_names]: for v in [v for v in tf.global_variables() if v.op.node_def.name in variable_names]:
name = v.op.node_def.name.split("/", 1 if prefix else 0)[-1] self.variables[v.op.node_def.name] = v
self.variables[name] = v
self.assignment_placeholders = dict() self.assignment_placeholders = dict()
self.assignment_nodes = [] self.assignment_nodes = []

View file

@ -17,32 +17,39 @@ def make_linear_network(w_name=None, b_name=None):
b = tf.Variable(tf.zeros([1]), name=b_name) b = tf.Variable(tf.zeros([1]), name=b_name)
y = w * x_data + b y = w * x_data + b
# Return the loss and weight initializer. # Return the loss and weight initializer.
return tf.reduce_mean(tf.square(y - y_data)), tf.global_variables_initializer() return tf.reduce_mean(tf.square(y - y_data)), tf.global_variables_initializer(), x_data, y_data
def net_vars_initializer(): def net_vars_initializer():
# Random prefix so variable names do not clash if we use nets with # Uses a separate graph for each network.
# the same name. with tf.Graph().as_default():
prefix = str(uuid.uuid1().hex)
# Use the tensorflow variable_scope to prefix all of the variables
with tf.variable_scope(prefix):
# Create the network. # Create the network.
loss, init = make_linear_network() loss, init, _, _ = make_linear_network()
sess = tf.Session() sess = tf.Session()
# Additional code for setting and getting the weights. # Additional code for setting and getting the weights.
variables = ray.experimental.TensorFlowVariables(loss, sess, prefix=True) variables = ray.experimental.TensorFlowVariables(loss, sess)
# Return all of the data needed to use the network. # Return all of the data needed to use the network.
return variables, init, sess return variables, init, sess
def net_vars_reinitializer(net_vars): def net_vars_reinitializer(net_vars):
return net_vars return net_vars
def train_vars_initializer():
# Almost the same as above, but now returns the placeholders and gradient.
with tf.Graph().as_default():
loss, init, x_data, y_data = make_linear_network()
sess = tf.Session()
variables = ray.experimental.TensorFlowVariables(loss, sess)
grad = tf.gradients(loss, list(variables.variables.values()))
return variables, init, sess, grad, [x_data, y_data]
class TensorFlowTest(unittest.TestCase): class TensorFlowTest(unittest.TestCase):
def testTensorFlowVariables(self): def testTensorFlowVariables(self):
ray.init(num_workers=2) ray.init(num_workers=2)
sess = tf.Session() sess = tf.Session()
loss, init = make_linear_network() loss, init, _, _ = make_linear_network()
sess.run(init) sess.run(init)
variables = ray.experimental.TensorFlowVariables(loss, sess) variables = ray.experimental.TensorFlowVariables(loss, sess)
@ -54,7 +61,7 @@ class TensorFlowTest(unittest.TestCase):
variables.set_weights(weights) variables.set_weights(weights)
self.assertEqual(weights, variables.get_weights()) self.assertEqual(weights, variables.get_weights())
loss2, init2 = make_linear_network("w", "b") loss2, init2, _, _ = make_linear_network("w", "b")
sess.run(init2) sess.run(init2)
variables2 = ray.experimental.TensorFlowVariables(loss2, sess) variables2 = ray.experimental.TensorFlowVariables(loss2, sess)
@ -148,7 +155,7 @@ class TensorFlowTest(unittest.TestCase):
# Create a network on the driver locally. # Create a network on the driver locally.
sess1 = tf.Session() sess1 = tf.Session()
loss1, init1 = make_linear_network() loss1, init1, _, _ = make_linear_network()
net_vars1 = ray.experimental.TensorFlowVariables(loss1, sess1) net_vars1 = ray.experimental.TensorFlowVariables(loss1, sess1)
sess1.run(init1) sess1.run(init1)
@ -170,5 +177,39 @@ class TensorFlowTest(unittest.TestCase):
ray.worker.cleanup() ray.worker.cleanup()
def testVariablesControlDependencies(self):
ray.init(num_workers=1)
# Creates a network and appends a momentum optimizer.
sess = tf.Session()
loss, init, _, _ = make_linear_network()
minimizer = tf.train.MomentumOptimizer(0.9, 0.9).minimize(loss)
net_vars = ray.experimental.TensorFlowVariables(minimizer, sess)
sess.run(init)
# Tests if all variables are properly retrieved, 2 variables and 2 momentum
# variables.
self.assertEqual(len(net_vars.variables.items()), 4)
ray.worker.cleanup()
def testRemoteTrainingStep(self):
ray.init(num_workers=1)
ray.env.net = ray.EnvironmentVariable(train_vars_initializer, net_vars_reinitializer)
@ray.remote
def training_step(weights):
variables, _, sess, grad, placeholders = ray.env.net
variables.set_weights(weights)
return sess.run(grad, feed_dict=dict(zip(placeholders, [[1]*100]*2)))
variables, init, sess, _, _ = ray.env.net
sess.run(init)
ray.get(training_step.remote(variables.get_weights()))
ray.worker.cleanup()
if __name__ == "__main__": if __name__ == "__main__":
unittest.main(verbosity=2) unittest.main(verbosity=2)