from typing import Callable, Tuple, Optional, List, Dict, Any, TYPE_CHECKING

from ray.rllib.env.external_env import ExternalEnv
from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.env.vector_env import VectorEnv
from ray.rllib.utils.annotations import override, PublicAPI
from ray.rllib.utils.typing import AgentID, EnvID, EnvType, MultiAgentDict, \
    MultiEnvDict, PartialTrainerConfigDict

if TYPE_CHECKING:
    from ray.rllib.models.preprocessors import Preprocessor
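
# Sentinel value that a try_reset() call may return when a sub-env (e.g. a
# remote sub-env) cannot be reset synchronously; the actual reset observation
# then arrives through a later poll() call.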
ASYNC_RESET_RETURN = "async_reset_return"


@PublicAPI
class BaseEnv:
    """The lowest-level env interface used by RLlib for sampling.

    BaseEnv models multiple agents executing asynchronously in multiple
    environments. A call to poll() returns observations from ready agents
    keyed by their environment and agent ids, and actions for those agents
    can be sent back via send_actions().

    All other env types can be adapted to BaseEnv. RLlib handles these
    conversions internally in RolloutWorker, for example:

        gym.Env => rllib.VectorEnv => rllib.BaseEnv
        rllib.MultiAgentEnv => rllib.BaseEnv
        rllib.ExternalEnv => rllib.BaseEnv

    Attributes:
        action_space (gym.Space): Action space. This must be defined for
            single-agent envs. Multi-agent envs can set this to None.
        observation_space (gym.Space): Observation space. This must be defined
            for single-agent envs. Multi-agent envs can set this to None.

    Examples:
        >>> env = MyBaseEnv()
        >>> obs, rewards, dones, infos, off_policy_actions = env.poll()
        >>> print(obs)
        {
            "env_0": {
                "car_0": [2.4, 1.6],
                "car_1": [3.4, -3.2],
            },
            "env_1": {
                "car_0": [8.0, 4.1],
            },
            "env_2": {
                "car_0": [2.3, 3.3],
                "car_1": [1.4, -0.2],
                "car_3": [1.2, 0.1],
            },
        }
        >>> env.send_actions(
            actions={
                "env_0": {
                    "car_0": 0,
                    "car_1": 1,
                }, ...
            })
        >>> obs, rewards, dones, infos, off_policy_actions = env.poll()
        >>> print(obs)
        {
            "env_0": {
                "car_0": [4.1, 1.7],
                "car_1": [3.2, -4.2],
            }, ...
        }
        >>> print(dones)
        {
            "env_0": {
                "__all__": False,
                "car_0": False,
                "car_1": True,
            }, ...
        }
    """

    @staticmethod
    def to_base_env(
            env: EnvType,
            make_env: Callable[[int], EnvType] = None,
            num_envs: int = 1,
            remote_envs: bool = False,
            remote_env_batch_wait_ms: int = 0,
            policy_config: PartialTrainerConfigDict = None,
    ) -> "BaseEnv":
        """Wraps any env type as needed to expose the async interface."""
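
        # Summary of the conversions below: a MultiAgentEnv becomes a
        # _MultiAgentEnvToBaseEnv (or a RemoteVectorEnv if remote_envs=True),
        # an ExternalEnv becomes an _ExternalEnvToBaseEnv, a VectorEnv becomes
        # a _VectorEnvToBaseEnv, and any other (gym.Env-style) env is first
        # vectorized via VectorEnv.wrap() and then wrapped the same way.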

        from ray.rllib.env.remote_vector_env import RemoteVectorEnv
        if remote_envs and num_envs == 1:
            raise ValueError(
                "Remote envs only make sense to use if num_envs > 1 "
                "(i.e. vectorization is enabled).")

        if not isinstance(env, BaseEnv):
            if isinstance(env, MultiAgentEnv):
                if remote_envs:
                    env = RemoteVectorEnv(
                        make_env,
                        num_envs,
                        multiagent=True,
                        remote_env_batch_wait_ms=remote_env_batch_wait_ms)
                else:
                    env = _MultiAgentEnvToBaseEnv(
                        make_env=make_env,
                        existing_envs=[env],
                        num_envs=num_envs)
            elif isinstance(env, ExternalEnv):
                if num_envs != 1:
                    raise ValueError(
                        "External(MultiAgent)Env does not currently support "
                        "num_envs > 1. One way of solving this would be to "
                        "treat your Env as a MultiAgentEnv hosting only one "
                        "type of agent but with several copies.")
                env = _ExternalEnvToBaseEnv(env)
            elif isinstance(env, VectorEnv):
                env = _VectorEnvToBaseEnv(env)
            else:
                if remote_envs:
                    env = RemoteVectorEnv(
                        make_env,
                        num_envs,
                        multiagent=False,
                        remote_env_batch_wait_ms=remote_env_batch_wait_ms,
                        existing_envs=[env],
                    )
                else:
                    env = VectorEnv.wrap(
                        make_env=make_env,
                        existing_envs=[env],
                        num_envs=num_envs,
                        action_space=env.action_space,
                        observation_space=env.observation_space,
                        policy_config=policy_config,
                    )
                    env = _VectorEnvToBaseEnv(env)
        assert isinstance(env, BaseEnv), env
        return env

    @PublicAPI
    def poll(self) -> Tuple[MultiEnvDict, MultiEnvDict, MultiEnvDict,
                            MultiEnvDict, MultiEnvDict]:
        """Returns observations from ready agents.

        The returns are two-level dicts mapping from env_id to a dict of
        agent_id to values. The number of agents and envs can vary over time.

        Returns:
            obs (dict): New observations for each ready agent.
            rewards (dict): Reward values for each ready agent. If the
                episode is just started, the value will be None.
            dones (dict): Done values for each ready agent. The special key
                "__all__" is used to indicate env termination.
            infos (dict): Info values for each ready agent.
            off_policy_actions (dict): Agents may take off-policy actions.
                When that happens, there will be an entry in this dict that
                contains the taken action. There is no need to send_actions()
                for agents that have already chosen off-policy actions.
        """
        raise NotImplementedError

    @PublicAPI
    def send_actions(self, action_dict: MultiEnvDict) -> None:
        """Called to send actions back to running agents in this env.

        Actions should be sent for each ready agent that returned observations
        in the previous poll() call.

        Args:
            action_dict (dict): Action values keyed by env_id and agent_id.
        """
        raise NotImplementedError

    @PublicAPI
    def try_reset(self,
                  env_id: Optional[EnvID] = None) -> Optional[MultiAgentDict]:
        """Attempt to reset the sub-env with the given id or all sub-envs.

        If the environment does not support synchronous reset, None can be
        returned here.

        Args:
            env_id (Optional[int]): The sub-env ID if applicable. If None,
                reset the entire Env (i.e. all sub-envs).

        Returns:
            Optional[MultiAgentDict]: The reset (multi-agent) observation
                dict, or None if reset is not supported.
        """
        return None

    @PublicAPI
    def get_unwrapped(self) -> List[EnvType]:
        """Return a reference to the underlying gym envs, if any.

        Returns:
            envs (list): Underlying gym envs or [].
        """
        return []

    @PublicAPI
    def try_render(self, env_id: Optional[EnvID] = None) -> None:
        """Tries to render the environment.

        Args:
            env_id (Optional[int]): The sub-env ID if applicable. If None,
                renders the entire Env (i.e. all sub-envs).
        """
        # By default, do nothing.
        pass

    @PublicAPI
    def stop(self) -> None:
        """Releases all resources used."""
        for env in self.get_unwrapped():
            if hasattr(env, "close"):
                env.close()
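

# Example usage sketch (assumes the external `gym` package and its
# "CartPole-v0" env, neither of which is provided by this module):
#
#     import gym
#     base_env = BaseEnv.to_base_env(gym.make("CartPole-v0"))
#     obs, rewards, dones, infos, off_policy_actions = base_env.poll()
#     base_env.send_actions({0: {_DUMMY_AGENT_ID: 0}})
#
# For single-agent sub-envs, the inner dicts are keyed by _DUMMY_AGENT_ID
# (defined below); env ids are the integer indices of the sub-envs.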


# Fixed agent identifier when there is only the single agent in the env
_DUMMY_AGENT_ID = "agent0"


def _with_dummy_agent_id(env_id_to_values: Dict[EnvID, Any],
                         dummy_id: "AgentID" = _DUMMY_AGENT_ID
                         ) -> MultiEnvDict:
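    # E.g. {0: obs0, 1: obs1} -> {0: {"agent0": obs0}, 1: {"agent0": obs1}}.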
    return {k: {dummy_id: v} for (k, v) in env_id_to_values.items()}


class _ExternalEnvToBaseEnv(BaseEnv):
    """Internal adapter of ExternalEnv to BaseEnv."""

    def __init__(self,
                 external_env: ExternalEnv,
                 preprocessor: "Preprocessor" = None):
        self.external_env = external_env
        self.prep = preprocessor
        self.multiagent = issubclass(type(external_env), ExternalMultiAgentEnv)
        self.action_space = external_env.action_space
        if preprocessor:
            self.observation_space = preprocessor.observation_space
        else:
            self.observation_space = external_env.observation_space
        external_env.start()

    @override(BaseEnv)
    def poll(self) -> Tuple[MultiEnvDict, MultiEnvDict, MultiEnvDict,
                            MultiEnvDict, MultiEnvDict]:
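        # Block until at least one episode has produced new data (or until
        # the serving thread has stopped).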
        with self.external_env._results_avail_condition:
            results = self._poll()
            while len(results[0]) == 0:
                self.external_env._results_avail_condition.wait()
                results = self._poll()
                if not self.external_env.is_alive():
                    raise Exception("Serving thread has stopped.")
        limit = self.external_env._max_concurrent_episodes
        assert len(results[0]) < limit, \
            ("Too many concurrent episodes, were some leaked? This "
             "ExternalEnv was created with max_concurrent={}".format(limit))
        return results

    @override(BaseEnv)
    def send_actions(self, action_dict: MultiEnvDict) -> None:
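        # Forward the actions to the corresponding external episodes' action
        # queues.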
        if self.multiagent:
            for env_id, actions in action_dict.items():
                self.external_env._episodes[env_id].action_queue.put(actions)
        else:
            for env_id, action in action_dict.items():
                self.external_env._episodes[env_id].action_queue.put(
                    action[_DUMMY_AGENT_ID])

    def _poll(self) -> Tuple[MultiEnvDict, MultiEnvDict, MultiEnvDict,
                             MultiEnvDict, MultiEnvDict]:
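        # Gather any newly available data from each (still running) episode.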
        all_obs, all_rewards, all_dones, all_infos = {}, {}, {}, {}
        off_policy_actions = {}
        for eid, episode in self.external_env._episodes.copy().items():
            data = episode.get_data()
            cur_done = episode.cur_done_dict[
                "__all__"] if self.multiagent else episode.cur_done
            if cur_done:
                del self.external_env._episodes[eid]
            if data:
                if self.prep:
                    all_obs[eid] = self.prep.transform(data["obs"])
                else:
                    all_obs[eid] = data["obs"]
                all_rewards[eid] = data["reward"]
                all_dones[eid] = data["done"]
                all_infos[eid] = data["info"]
                if "off_policy_action" in data:
                    off_policy_actions[eid] = data["off_policy_action"]
        if self.multiagent:
            # Ensure a consistent set of keys
            # rely on all_obs having all possible keys for now.
            for eid, eid_dict in all_obs.items():
                for agent_id in eid_dict.keys():

                    def fix(d, zero_val):
                        if agent_id not in d[eid]:
                            d[eid][agent_id] = zero_val

                    fix(all_rewards, 0.0)
                    fix(all_dones, False)
                    fix(all_infos, {})
            return (all_obs, all_rewards, all_dones, all_infos,
                    off_policy_actions)
        else:
            return _with_dummy_agent_id(all_obs), \
                _with_dummy_agent_id(all_rewards), \
                _with_dummy_agent_id(all_dones, "__all__"), \
                _with_dummy_agent_id(all_infos), \
                _with_dummy_agent_id(off_policy_actions)


class _VectorEnvToBaseEnv(BaseEnv):
    """Internal adapter of VectorEnv to BaseEnv.

    We assume the caller will always send the full vector of actions in each
    call to send_actions(), and that they call reset_at() on all completed
    environments before calling send_actions().
    """

    def __init__(self, vector_env: VectorEnv):
        self.vector_env = vector_env
        self.action_space = vector_env.action_space
        self.observation_space = vector_env.observation_space
        self.num_envs = vector_env.num_envs
        self.new_obs = None  # lazily initialized
        self.cur_rewards = [None for _ in range(self.num_envs)]
        self.cur_dones = [False for _ in range(self.num_envs)]
        self.cur_infos = [None for _ in range(self.num_envs)]
|
|
|
|
|
    @override(BaseEnv)
    def poll(self) -> Tuple[MultiEnvDict, MultiEnvDict, MultiEnvDict,
                            MultiEnvDict, MultiEnvDict]:
        # On the first poll() call, reset all sub-envs to obtain the initial
        # observations.
        if self.new_obs is None:
            self.new_obs = self.vector_env.vector_reset()
        new_obs = dict(enumerate(self.new_obs))
        rewards = dict(enumerate(self.cur_rewards))
        dones = dict(enumerate(self.cur_dones))
        infos = dict(enumerate(self.cur_infos))
        self.new_obs = []
        self.cur_rewards = []
        self.cur_dones = []
        self.cur_infos = []
        return _with_dummy_agent_id(new_obs), \
            _with_dummy_agent_id(rewards), \
            _with_dummy_agent_id(dones, "__all__"), \
            _with_dummy_agent_id(infos), {}

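    # Illustrative sketch (not executed) of the shape of the dicts returned
    # by poll() above for a VectorEnv with two sub-envs; the values shown
    # are made up. Everything is keyed by env index first and then by the
    # dummy agent id; the trailing empty dict is the off-policy-actions
    # dict, which this adapter never populates:
    #
    #   obs     = {0: {_DUMMY_AGENT_ID: obs_0}, 1: {_DUMMY_AGENT_ID: obs_1}}
    #   rewards = {0: {_DUMMY_AGENT_ID: 1.0}, 1: {_DUMMY_AGENT_ID: 0.0}}
    #   dones   = {0: {"__all__": False}, 1: {"__all__": True}}
    #   infos   = {0: {_DUMMY_AGENT_ID: {}}, 1: {_DUMMY_AGENT_ID: {}}}
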
    @override(BaseEnv)
    def send_actions(self, action_dict: MultiEnvDict) -> None:
        # Flatten the per-env action dict back into a plain list and step
        # the underlying VectorEnv with it.
        action_vector = [None] * self.num_envs
        for i in range(self.num_envs):
            action_vector[i] = action_dict[i][_DUMMY_AGENT_ID]
        self.new_obs, self.cur_rewards, self.cur_dones, self.cur_infos = \
            self.vector_env.vector_step(action_vector)

    @override(BaseEnv)
    def try_reset(self, env_id: Optional[EnvID] = None) -> MultiAgentDict:
        assert env_id is None or isinstance(env_id, int)
        return {_DUMMY_AGENT_ID: self.vector_env.reset_at(env_id)}

    @override(BaseEnv)
    def get_unwrapped(self) -> List[EnvType]:
        return self.vector_env.get_unwrapped()

    @override(BaseEnv)
    def try_render(self, env_id: Optional[EnvID] = None) -> None:
        assert env_id is None or isinstance(env_id, int)
        return self.vector_env.try_render_at(env_id)

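# A minimal driving-loop sketch for the VectorEnv adapter defined above
# (comments only, not executed; `VectorEnvAdapter` is a stand-in name for
# that class and `my_vector_env` is a hypothetical VectorEnv). It follows
# the contract from the class docstring: reset all completed sub-envs via
# try_reset() first, then send a full vector of actions:
#
#   env = VectorEnvAdapter(my_vector_env)
#   obs, rewards, dones, infos, _ = env.poll()
#   for env_id, done in dones.items():
#       if done["__all__"]:
#           obs[env_id] = env.try_reset(env_id)
#   actions = {i: {_DUMMY_AGENT_ID: env.action_space.sample()} for i in obs}
#   env.send_actions(actions)
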
class _MultiAgentEnvToBaseEnv(BaseEnv):
    """Internal adapter of MultiAgentEnv to BaseEnv.

    This also supports vectorization if num_envs > 1.
    """

    def __init__(self, make_env: Callable[[int], EnvType],
                 existing_envs: List[MultiAgentEnv], num_envs: int):
        """Wrap existing multi-agent envs.

        Args:
            make_env (func|None): Factory that produces a new multi-agent
                env. Must be defined if the number of existing envs is
                less than num_envs.
            existing_envs (list): List of existing multi-agent envs.
            num_envs (int): Total number of multi-agent envs to run; missing
                ones are created via make_env.
        """
        self.make_env = make_env
        self.envs = existing_envs
        self.num_envs = num_envs
        self.dones = set()
        while len(self.envs) < self.num_envs:
            self.envs.append(self.make_env(len(self.envs)))
        for env in self.envs:
            assert isinstance(env, MultiAgentEnv)
        self.env_states = [_MultiAgentEnvState(env) for env in self.envs]

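    # Hypothetical construction sketch (not executed): vectorize a
    # user-defined `MyMultiAgentEnv` into 4 parallel copies, starting from
    # one pre-built instance; the remaining 3 are created through the
    # factory:
    #
    #   base_env = _MultiAgentEnvToBaseEnv(
    #       make_env=lambda idx: MyMultiAgentEnv(),
    #       existing_envs=[MyMultiAgentEnv()],
    #       num_envs=4)
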
    @override(BaseEnv)
    def poll(self) -> Tuple[MultiEnvDict, MultiEnvDict, MultiEnvDict,
                            MultiEnvDict, MultiEnvDict]:
        obs, rewards, dones, infos = {}, {}, {}, {}
        for i, env_state in enumerate(self.env_states):
            obs[i], rewards[i], dones[i], infos[i] = env_state.poll()
        return obs, rewards, dones, infos, {}

    @override(BaseEnv)
    def send_actions(self, action_dict: MultiEnvDict) -> None:
        for env_id, agent_dict in action_dict.items():
            if env_id in self.dones:
                raise ValueError("Env {} is already done".format(env_id))
            env = self.envs[env_id]
            obs, rewards, dones, infos = env.step(agent_dict)
            assert isinstance(obs, dict), "Not a multi-agent obs"
            assert isinstance(rewards, dict), "Not a multi-agent reward"
            assert isinstance(dones, dict), "Not a multi-agent done dict"
            assert isinstance(infos, dict), "Not a multi-agent info"
            if set(infos).difference(set(obs)):
                raise ValueError("Key set for infos must be a subset of obs: "
                                 "{} vs {}".format(infos.keys(), obs.keys()))
            if "__all__" not in dones:
                raise ValueError(
                    "In multi-agent environments, '__all__': True|False must "
                    "be included in the 'done' dict: got {}.".format(dones))
            if dones["__all__"]:
                self.dones.add(env_id)
            self.env_states[env_id].observe(obs, rewards, dones, infos)

    @override(BaseEnv)
    def try_reset(self,
                  env_id: Optional[EnvID] = None) -> Optional[MultiAgentDict]:
        obs = self.env_states[env_id].reset()
        assert isinstance(obs, dict), "Not a multi-agent obs"
        if obs is not None and env_id in self.dones:
            self.dones.remove(env_id)
        return obs

    @override(BaseEnv)
    def get_unwrapped(self) -> List[EnvType]:
        return [state.env for state in self.env_states]

    @override(BaseEnv)
    def try_render(self, env_id: Optional[EnvID] = None) -> None:
        if env_id is None:
            env_id = 0
        assert isinstance(env_id, int)
        return self.envs[env_id].render()

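# For reference, a hand-written sketch (values made up) of the step() return
# that send_actions() above accepts from each wrapped MultiAgentEnv:
# per-agent dicts, a "__all__" entry in the done dict, and info keys that
# are a subset of the obs keys:
#
#   obs     = {"agent_0": [0.1, 0.2], "agent_1": [0.3, 0.4]}
#   rewards = {"agent_0": 1.0, "agent_1": -1.0}
#   dones   = {"agent_0": False, "agent_1": False, "__all__": False}
#   infos   = {"agent_0": {}, "agent_1": {}}
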
class _MultiAgentEnvState:
    def __init__(self, env: MultiAgentEnv):
        assert isinstance(env, MultiAgentEnv)
        self.env = env
        self.initialized = False

    def poll(self) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict,
                            MultiAgentDict]:
        if not self.initialized:
            self.reset()
            self.initialized = True

        observations = self.last_obs
        rewards = {}
        dones = {"__all__": self.last_dones["__all__"]}
        infos = {}

        # If episode is done, release everything we have.
        if dones["__all__"]:
            rewards = self.last_rewards
            self.last_rewards = {}
            dones = self.last_dones
            self.last_dones = {}
            self.last_obs = {}
            infos = self.last_infos
            self.last_infos = {}
        # Only release those agents' rewards/dones/infos, whose
        # observations we have.
        else:
            for ag in observations.keys():
                if ag in self.last_rewards:
                    rewards[ag] = self.last_rewards[ag]
                    del self.last_rewards[ag]
                if ag in self.last_dones:
                    dones[ag] = self.last_dones[ag]
                    del self.last_dones[ag]
                if ag in self.last_infos:
                    infos[ag] = self.last_infos[ag]
                    del self.last_infos[ag]

        self.last_dones["__all__"] = False
        self.last_infos = {}
        return observations, rewards, dones, infos

    def observe(self, obs: MultiAgentDict, rewards: MultiAgentDict,
                dones: MultiAgentDict, infos: MultiAgentDict):
        self.last_obs = obs
        for ag, r in rewards.items():
            if ag in self.last_rewards:
                self.last_rewards[ag] += r
            else:
                self.last_rewards[ag] = r
        for ag, d in dones.items():
            if ag in self.last_dones:
                self.last_dones[ag] = self.last_dones[ag] or d
            else:
                self.last_dones[ag] = d
        self.last_infos = infos

    def reset(self) -> MultiAgentDict:
        self.last_obs = self.env.reset()
        self.last_rewards = {}
        self.last_dones = {"__all__": False}
        self.last_infos = {}
        return self.last_obs
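
# Buffering semantics of _MultiAgentEnvState, sketched on made-up values
# (`my_multi_agent_env` is hypothetical; the adapter above normally calls
# observe()/poll() itself). Rewards reported via observe() accumulate per
# agent until that agent appears in an observation again; only then does
# poll() release them:
#
#   state = _MultiAgentEnvState(my_multi_agent_env)
#   state.poll()  # First call resets the env and returns initial obs.
#   state.observe(obs={"a": 1}, rewards={"a": 0.5, "b": 0.2},
#                 dones={"__all__": False}, infos={"a": {}})
#   state.observe(obs={"a": 2}, rewards={"a": 0.5, "b": 0.3},
#                 dones={"__all__": False}, infos={"a": {}})
#   obs, rew, dones, infos = state.poll()
#   # -> rew == {"a": 1.0}; agent "b"'s accumulated 0.5 stays buffered until
#   #    "b" is observed or the episode ends with "__all__": True.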