import os

from ray.rllib.utils.annotations import DeveloperAPI


@DeveloperAPI
class EvaluatorInterface:
    """This is the interface between policy optimizers and policy evaluation.

    See also: RolloutWorker
    """

    @DeveloperAPI
    def sample(self):
        """Returns a batch of experience sampled from this evaluator.

        This method must be implemented by subclasses.

        Returns:
            SampleBatch|MultiAgentBatch: A columnar batch of experiences
                (e.g., tensors), or a multi-agent batch.

        Examples:
            >>> print(ev.sample())
            SampleBatch({"obs": [1, 2, 3], "action": [0, 1, 0], ...})
        """

        raise NotImplementedError

    @DeveloperAPI
    def learn_on_batch(self, samples):
        """Update policies based on the given batch.

        This is equivalent to apply_gradients(compute_gradients(samples)),
        but can be optimized to avoid pulling gradients into CPU memory.

        Either this or the combination of compute/apply grads must be
        implemented by subclasses.

        Returns:
            info: dictionary of extra metadata from compute_gradients().

        Examples:
            >>> samples = ev.sample()
            >>> ev.learn_on_batch(samples)
        """

        grads, info = self.compute_gradients(samples)
        self.apply_gradients(grads)
        return info

    @DeveloperAPI
    def compute_gradients(self, samples):
        """Returns a gradient computed w.r.t. the specified samples.

        Either this or learn_on_batch() must be implemented by subclasses.

        Returns:
            (grads, info): A list of gradients that can be applied on a
                compatible evaluator. In the multi-agent case, returns a dict
                of gradients keyed by policy ids. An info dictionary of
                extra metadata is also returned.

        Examples:
            >>> samples = ev.sample()
            >>> grads, info = ev2.compute_gradients(samples)
        """

        raise NotImplementedError

    @DeveloperAPI
    def apply_gradients(self, grads):
        """Applies the given gradients to this evaluator's weights.

        Either this or learn_on_batch() must be implemented by subclasses.

        Examples:
            >>> samples = ev1.sample()
            >>> grads, info = ev2.compute_gradients(samples)
            >>> ev1.apply_gradients(grads)
        """

        raise NotImplementedError

    @DeveloperAPI
    def get_weights(self):
        """Returns the model weights of this Evaluator.

        This method must be implemented by subclasses.

        Returns:
            object: weights that can be set on a compatible evaluator.
            info: dictionary of extra metadata.

        Examples:
            >>> weights = ev1.get_weights()
        """

        raise NotImplementedError

    @DeveloperAPI
    def set_weights(self, weights):
        """Sets the model weights of this Evaluator.

        This method must be implemented by subclasses.

        Examples:
            >>> weights = ev1.get_weights()
            >>> ev2.set_weights(weights)
        """

        raise NotImplementedError

    @DeveloperAPI
    def get_host(self):
        """Returns the hostname of the process running this evaluator."""

        # Note: os.uname() is only available on POSIX systems.
        return os.uname()[1]

    @DeveloperAPI
    def apply(self, func, *args):
        """Apply the given function to this evaluator instance."""

        return func(self, *args)
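

# Illustrative sketch only (not part of the RLlib API): a minimal evaluator
# that satisfies this interface. It keeps a single scalar weight in place of a
# real model, implements sample(), compute_gradients(), apply_gradients(), and
# get_weights()/set_weights(), and relies on the default learn_on_batch()
# above. The class name _ScalarEvaluator and its toy batch format are
# hypothetical and only meant to show how the pieces fit together.
class _ScalarEvaluator(EvaluatorInterface):
    def __init__(self):
        # A single scalar weight stands in for the model parameters.
        self.w = 0.0

    def sample(self):
        # A toy "batch": target values the weight should move toward.
        return {"targets": [1.0, 2.0, 3.0]}

    def compute_gradients(self, samples):
        targets = samples["targets"]
        # Gradient of the loss 0.5 * mean((w - t) ** 2) with respect to w.
        grad = sum(self.w - t for t in targets) / len(targets)
        return [grad], {"num_samples": len(targets)}

    def apply_gradients(self, grads):
        # Plain gradient-descent step with a fixed step size.
        self.w -= 0.1 * grads[0]

    def get_weights(self):
        return {"w": self.w}

    def set_weights(self, weights):
        self.w = weights["w"]


# Example usage of the sketch above (the weight drifts toward the mean of the
# targets, and get_weights()/set_weights() sync two evaluators):
#
#     ev1, ev2 = _ScalarEvaluator(), _ScalarEvaluator()
#     for _ in range(100):
#         ev1.learn_on_batch(ev1.sample())
#     ev2.set_weights(ev1.get_weights())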