from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.marwil.marwil_tf_policy import MARWILTFPolicy
from ray.rllib.execution.replay_ops import SimpleReplayBuffer, Replay, \
    StoreToReplayBuffer
from ray.rllib.execution.rollout_ops import ParallelRollouts, ConcatBatches
from ray.rllib.execution.concurrency_ops import Concurrently
from ray.rllib.execution.train_ops import TrainOneStep
from ray.rllib.execution.metric_ops import StandardMetricsReporting

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    # You should override this to point to an offline dataset (see agent.py).
    "input": "sampler",
    # Use importance sampling ("is") and weighted importance sampling ("wis")
    # estimators to evaluate the policy's reward on the offline data.
    "input_evaluation": ["is", "wis"],

    # If True, use the Generalized Advantage Estimator (GAE)
    # with a value function; see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,

    # Scaling of advantages in the exponential term of the loss
    # (see the note below this config dict).
    # When beta is 0.0, MARWIL reduces to plain imitation learning.
    "beta": 1.0,
    # Balances the value-estimation loss against the policy-optimization loss.
    "vf_coeff": 1.0,
    # If specified, clip the global norm of gradients by this amount.
    "grad_clip": None,
    # Whether to calculate cumulative rewards (returns) for the input batches.
    "postprocess_inputs": True,
    # Whether to roll out "complete_episodes" or "truncate_episodes".
    "batch_mode": "complete_episodes",
    # Learning rate for the Adam optimizer.
    "lr": 1e-4,
    # Number of timesteps collected for each SGD round.
    "train_batch_size": 2000,
    # Size of the replay buffer in batches (not timesteps!).
    "replay_buffer_size": 1000,
    # Number of steps to read before learning starts.
    "learning_starts": 0,
    # === Parallelism ===
    "num_workers": 0,
})
# __sphinx_doc_end__
# yapf: enable
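# Note on `beta` (an added sketch, not from the original source): beta scales
# the advantages inside MARWIL's exponential advantage weighting. Roughly,
# the policy part of the loss is
#
#     L_pi = -E[ exp(beta * A(s, a) / c) * log pi(a|s) ],
#
# where c is a running estimate of the advantage norm. With beta = 0.0 the
# weight is 1 for every action, i.e. plain imitation learning (behavior
# cloning); larger beta favors actions with higher estimated advantage.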


def get_policy_class(config):
    # Use the Torch policy if the "torch" framework was requested; otherwise
    # return None, in which case the trainer falls back to the default
    # TF policy (MARWILTFPolicy).
    if config["framework"] == "torch":
        from ray.rllib.agents.marwil.marwil_torch_policy import \
            MARWILTorchPolicy
        return MARWILTorchPolicy


def execution_plan(workers, config):
    # Collect batches from all rollout workers (or from the offline reader,
    # if `input` points to a dataset) in bulk-synchronous fashion.
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    replay_buffer = SimpleReplayBuffer(config["replay_buffer_size"])

    # Store every incoming batch in the bounded replay buffer.
    store_op = rollouts \
        .for_each(StoreToReplayBuffer(local_buffer=replay_buffer))

    # Sample batches back out of the buffer, concatenate them until at least
    # `train_batch_size` steps are collected, then do one SGD step.
    replay_op = Replay(local_buffer=replay_buffer) \
        .combine(
            ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"],
            )) \
        .for_each(TrainOneStep(workers))

    # Alternate between storing and training; only the training op's
    # results (output index 1) are emitted as iterator output.
    train_op = Concurrently(
        [store_op, replay_op], mode="round_robin", output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
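# Rough sketch (an assumption, not from this file) of how the plan above is
# consumed: build_trainer() stores the iterator returned by execution_plan()
# and each Trainer.train() call pulls one result dict from it, e.g.:
#
#     plan = execution_plan(workers, config)  # -> LocalIterator[dict]
#     result = next(plan)  # metrics for one store/replay/train round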


def validate_config(config):
    # Multi-GPU training is not supported for MARWIL yet.
    if config["num_gpus"] > 1:
        raise ValueError("`num_gpus` > 1 not yet supported for MARWIL!")


MARWILTrainer = build_trainer(
    name="MARWIL",
    default_config=DEFAULT_CONFIG,
    default_policy=MARWILTFPolicy,
    get_policy_class=get_policy_class,
    validate_config=validate_config,
    execution_plan=execution_plan)
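

if __name__ == "__main__":
    # Example usage: a minimal sketch, not part of the original module. The
    # dataset path below is a placeholder; generate offline rollout data
    # first (e.g., by running another trainer with the common `output`
    # config option set). The env here is only needed to supply the
    # observation and action spaces.
    import ray

    ray.init()
    config = DEFAULT_CONFIG.copy()
    config["input"] = "/tmp/cartpole-out"  # hypothetical offline data dir
    trainer = MARWILTrainer(config=config, env="CartPole-v0")
    for i in range(3):
        print("Iteration {}: {}".format(i, trainer.train()))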