import logging

from ray.rllib.utils.annotations import DeveloperAPI
from ray.rllib.evaluation.metrics import collect_episodes, summarize_episodes

logger = logging.getLogger(__name__)


@DeveloperAPI
class PolicyOptimizer:
    """Policy optimizers encapsulate distributed RL optimization strategies.

    Policy optimizers serve as the "control plane" of algorithms.

    For example, AsyncOptimizer is used for A3C, and LocalMultiGPUOptimizer is
    used for PPO. These optimizers are all pluggable, and it is possible
    to mix and match as needed.

    Attributes:
        config (dict): The JSON configuration passed to this optimizer.
        workers (WorkerSet): The set of rollout workers to use.
        num_steps_trained (int): Number of timesteps trained on so far.
        num_steps_sampled (int): Number of timesteps sampled so far.
    """

    @DeveloperAPI
    def __init__(self, workers):
        """Create an optimizer instance.

        Args:
            workers (WorkerSet): The set of rollout workers to use.
        """
        self.workers = workers
        self.episode_history = []
        self.to_be_collected = []

        # Counters that should be updated by sub-classes.
        self.num_steps_trained = 0
        self.num_steps_sampled = 0

    @DeveloperAPI
    def step(self):
        """Takes a logical optimization step.

        This should run for long enough to minimize call overheads (i.e., at
        least a couple of seconds), but short enough to return control
        periodically to callers (i.e., at most a few tens of seconds).

        Returns:
            fetches (dict|None): Optional fetches from compute grads calls.
        """
        raise NotImplementedError

    @DeveloperAPI
    def stats(self):
        """Returns a dictionary of internal performance statistics."""
        return {
            "num_steps_trained": self.num_steps_trained,
            "num_steps_sampled": self.num_steps_sampled,
        }

    @DeveloperAPI
    def save(self):
        """Returns a serializable object representing the optimizer state."""
        return [self.num_steps_trained, self.num_steps_sampled]

    @DeveloperAPI
    def restore(self, data):
        """Restores optimizer state from the given data object."""
        self.num_steps_trained = data[0]
        self.num_steps_sampled = data[1]

    @DeveloperAPI
    def stop(self):
        """Release any resources used by this optimizer."""
        pass

    @DeveloperAPI
    def collect_metrics(self,
                        timeout_seconds,
                        min_history=100,
                        selected_workers=None):
        """Returns worker and optimizer stats.

        Arguments:
            timeout_seconds (int): Max wait time for a worker before
                dropping its results. This usually indicates a hung worker.
            min_history (int): Min history length to smooth results over.
            selected_workers (list): Override the list of remote workers
                to collect metrics from.

        Returns:
            res (dict): A training result dict from worker metrics with
                `info` replaced with stats from self.
        """
        episodes, self.to_be_collected = collect_episodes(
            self.workers.local_worker(),
            selected_workers or self.workers.remote_workers(),
            self.to_be_collected,
            timeout_seconds=timeout_seconds)
        orig_episodes = list(episodes)
        # Pad with recent history so that results are smoothed over up to
        # `min_history` episodes when enough history is available.
        missing = min_history - len(episodes)
        if missing > 0:
            episodes.extend(self.episode_history[-missing:])
            assert len(episodes) <= min_history
        self.episode_history.extend(orig_episodes)
        self.episode_history = self.episode_history[-min_history:]
        res = summarize_episodes(episodes, orig_episodes)
        res.update(info=self.stats())
        return res

    @DeveloperAPI
    def reset(self, remote_workers):
        """Called to change the set of remote workers being used."""
        self.workers.reset(remote_workers)

    @DeveloperAPI
    def foreach_worker(self, func):
        """Apply the given function to each worker instance."""
        return self.workers.foreach_worker(func)

    @DeveloperAPI
    def foreach_worker_with_index(self, func):
        """Apply the given function to each worker instance.

        The index will be passed as the second arg to the given function.
        """
        return self.workers.foreach_worker_with_index(func)
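

# ---------------------------------------------------------------------------
# Illustrative example (a minimal sketch, not part of the RLlib API): a
# PolicyOptimizer subclass showing how the interface above is typically
# filled in. It is loosely modeled on synchronous sampling optimizers such as
# SyncSamplesOptimizer; the class name below is hypothetical, and the exact
# training loop is an assumption for illustration only.
class _ExampleSyncOptimizer(PolicyOptimizer):
    """Sketch: sample from all workers, train locally, broadcast weights."""

    def step(self):
        import ray  # local import to keep this sketch self-contained

        if self.workers.remote_workers():
            # Broadcast the latest local weights before sampling.
            weights = ray.put(self.workers.local_worker().get_weights())
            for w in self.workers.remote_workers():
                w.set_weights.remote(weights)
            # Collect one batch of experience from each remote worker.
            batches = ray.get(
                [w.sample.remote() for w in self.workers.remote_workers()])
        else:
            # No remote workers were created; sample from the local worker.
            batches = [self.workers.local_worker().sample()]

        # Train the local policy on each collected batch and update the
        # counters that PolicyOptimizer expects subclasses to maintain.
        fetches = None
        for batch in batches:
            fetches = self.workers.local_worker().learn_on_batch(batch)
            self.num_steps_sampled += batch.count
            self.num_steps_trained += batch.count
        return fetches


# A trainer would then drive such an optimizer roughly as follows (assuming
# `workers` is an already-constructed WorkerSet):
#
#     optimizer = _ExampleSyncOptimizer(workers)
#     while not done:
#         fetches = optimizer.step()
#         result = optimizer.collect_metrics(timeout_seconds=180)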