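# ray/rllib/agents/cql/cql.py

"""CQL (Conservative Q-Learning) trainer, derived from SAC.

Defines CQL's default config, its config validation, its execution plan and
the `CQLTrainer` class. Training only replays data from a local replay
buffer, which `after_init()` fills once with the entire offline dataset.
"""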
import logging
import numpy as np
from typing import Optional, Type

from ray.rllib.agents.cql.cql_tf_policy import CQLTFPolicy
from ray.rllib.agents.cql.cql_torch_policy import CQLTorchPolicy
from ray.rllib.agents.sac.sac import SACTrainer, \
    DEFAULT_CONFIG as SAC_CONFIG
from ray.rllib.execution.metric_ops import StandardMetricsReporting
from ray.rllib.execution.replay_buffer import LocalReplayBuffer
from ray.rllib.execution.replay_ops import Replay
from ray.rllib.execution.train_ops import MultiGPUTrainOneStep, TrainOneStep, \
    UpdateTargetNetwork
from ray.rllib.offline.shuffled_input import ShuffledInput
from ray.rllib.policy.policy import LEARNER_STATS_KEY, Policy
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils import merge_dicts
from ray.rllib.utils.framework import try_import_tf, try_import_tfp
from ray.rllib.utils.typing import TrainerConfigDict

# Framework handles. `validate_config()` below treats `tfp is None` as
# "tensorflow_probability is not available" and guards `tf` the same way.
tf1, tf, tfv = try_import_tf()
tfp = try_import_tfp()
logger = logging.getLogger(__name__)
# Module-level handle to the replay buffer: assigned in `execution_plan()`
# and read by `after_init()`, which loads the offline dataset into it.
replay_buffer = None

# yapf: disable
# __sphinx_doc_begin__
# CQL's default config: SAC's default config merged with the CQL-specific
# settings below.
CQL_DEFAULT_CONFIG = merge_dicts(
    SAC_CONFIG, {
        # You should override this to point to an offline dataset.
        # `after_init()` only accepts a string containing "d4rl" or an input
        # reader of type `ShuffledInput` and raises a ValueError otherwise.
        "input": "sampler",
        # Switch off off-policy evaluation.
        "input_evaluation": [],
        # Number of iterations with Behavior Cloning Pretraining.
        "bc_iters": 20000,
        # CQL loss temperature.
        "temperature": 1.0,
        # Number of actions to sample for CQL loss.
        "num_actions": 10,
        # Whether to use the Lagrangian for Alpha Prime (in CQL loss).
        "lagrangian": False,
        # Lagrangian threshold.
        "lagrangian_thresh": 5.0,
        # Min Q weight multiplier.
        "min_q_weight": 5.0,
        # The replay buffer should be at least as large as the offline
        # dataset, because `after_init()` adds the entire dataset to it.
        "buffer_size": int(1e6),
    })
# __sphinx_doc_end__
# yapf: enable


def validate_config(config: TrainerConfigDict):
    """Validate the given trainer config for CQL and adjust it in place.

    Args:
        config: The trainer config dict. Its "simple_optimizer" entry is
            overwritten with True when the framework is "torch".

    Raises:
        ValueError: If `num_gpus` is larger than 1.
    """
    if config["num_gpus"] > 1:
        raise ValueError("`num_gpus` > 1 not yet supported for CQL!")

    # CQL-torch performs the optimizer steps inside the loss function.
    # Using the multi-GPU optimizer will therefore not work (see multi-GPU
    # check above) and we must use the simple optimizer for now.
    if config["simple_optimizer"] is not True and \
            config["framework"] == "torch":
        config["simple_optimizer"] = True

    if config["framework"] in ["tf", "tf2", "tfe"] and tfp is None:
        # Note the trailing space after the version: without it, the two
        # sentences of the message run together ("...2.5.0.Trying to ...").
        logger.warning(
            "You need `tensorflow_probability` in order to run CQL! "
            "Install it via `pip install tensorflow_probability`. Your "
            f"tf.__version__={tf.__version__ if tf else None}. "
            "Trying to import tfp results in the following error:")
        # Re-attempt the import with `error=True`; presumably this surfaces
        # the underlying import error to the user (confirm against
        # `try_import_tfp`).
        try_import_tfp(error=True)


def execution_plan(workers, config):
    """Build CQL's replay-only training pipeline.

    Creates the local replay buffer, publishes it through the module-level
    `replay_buffer` global (so that `after_init()` can load the offline
    dataset into it) and chains the ops: replay -> optional
    `before_learn_on_batch` hook -> train step -> priority update ->
    target network update.

    Args:
        workers: The worker set handed to the train and target-update ops.
        config: The trainer config dict.

    Returns:
        The iterator produced by `StandardMetricsReporting` (reporting by
        steps trained).
    """
    # Only forward the prioritization settings if prioritized replay is on.
    if config.get("prioritized_replay"):
        prio_args = {
            "prioritized_replay_alpha": config["prioritized_replay_alpha"],
            "prioritized_replay_beta": config["prioritized_replay_beta"],
            "prioritized_replay_eps": config["prioritized_replay_eps"],
        }
    else:
        prio_args = {}

    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config.get("replay_sequence_length", 1),
        replay_burn_in=config.get("burn_in", 0),
        replay_zero_init_states=config.get("zero_init_states", True),
        **prio_args)

    # Publish the buffer so that `after_init()` can fill it with the dataset.
    global replay_buffer
    replay_buffer = local_replay_buffer

    def update_prio(item):
        """Update the buffer's priorities from the train results.

        `item` is the (samples, info_dict) pair coming out of the train
        step; only `info_dict` is passed on to the next op.
        """
        samples, info_dict = item
        if config.get("prioritized_replay"):
            prio_dict = {}
            for policy_id, info in info_dict.items():
                # TODO(sven): This is currently structured differently for
                #  torch/tf. Clean up these results/info dicts across
                #  policies (note: fixing this in torch_policy.py will
                #  break e.g. DDPPO!).
                # Look into the learner stats only if there is no top-level
                # "td_error". A `dict.get()` default would be evaluated
                # eagerly and raise a KeyError for infos that carry
                # "td_error" but no LEARNER_STATS_KEY entry.
                if "td_error" in info:
                    td_error = info["td_error"]
                else:
                    td_error = info[LEARNER_STATS_KEY].get("td_error")
                policy_batch = samples.policy_batches[policy_id]
                policy_batch.set_get_interceptor(None)
                prio_dict[policy_id] = (policy_batch.get("batch_indexes"),
                                        td_error)
            local_replay_buffer.update_priorities(prio_dict)
        return info_dict

    # Read and train on experiences from the replay buffer. Every batch
    # returned from the Replay() iterator is passed to the train step op to
    # take a SGD step, and then we decide whether to update the target
    # network.
    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)

    if config["simple_optimizer"]:
        train_step_op = TrainOneStep(workers)
    else:
        train_step_op = MultiGPUTrainOneStep(
            workers=workers,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))

    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(lambda x: post_fn(x, workers, config)) \
        .for_each(train_step_op) \
        .for_each(update_prio) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    return StandardMetricsReporting(
        replay_op, workers, config, by_steps_trained=True)


def get_policy_class(config: TrainerConfigDict) -> Optional[Type[Policy]]:
    """Return the policy class override for the configured framework.

    Args:
        config: The trainer config dict; only "framework" is read.

    Returns:
        `CQLTorchPolicy` if the framework is "torch", otherwise None.
    """
    return CQLTorchPolicy if config["framework"] == "torch" else None


def after_init(trainer):
    """Load the entire offline dataset into the module-level replay buffer.

    Two kinds of offline input are handled:
    - a string `config["input"]` containing "d4rl": the input reader's
      `dataset` is added to the buffer as one batch;
    - an input reader of type `ShuffledInput`: every batch yielded by
      `reader.child.read_all_files()` is added to the buffer.

    Args:
        trainer: The trainer whose local worker's input reader provides
            the data.

    Raises:
        ValueError: If the input matches neither of the two cases above.
    """
    global replay_buffer
    input_reader = trainer.workers.local_worker().input_reader
    input_spec = trainer.config["input"]

    # D4RL: the reader already holds the complete dataset.
    if isinstance(input_spec, str) and "d4rl" in input_spec:
        replay_buffer.add_batch(input_reader.dataset)
        return

    # Anything other than a (shuffled) list of files is not supported.
    if not isinstance(input_reader, ShuffledInput):
        raise ValueError(
            "Unknown offline input! config['input'] must either be list of "
            "offline files (json) or a D4RL-specific InputReader specifier "
            "(e.g. 'd4rl.hopper-medium-v0').")

    # List of files: add each file's entire content to the buffer.
    batch_sizes = []
    for batch in input_reader.child.read_all_files():
        batch_sizes.append(len(batch))
        # Synthesize NEXT_OBS if the file does not provide it: shift the
        # observations by one step and pad the last step with zeros. That
        # last (made-up) next-obs must not be learned from, so the last
        # time step is force-marked as done.
        if SampleBatch.NEXT_OBS not in batch:
            observations = batch[SampleBatch.OBS]
            zero_pad = np.zeros_like(observations[0:1])
            batch[SampleBatch.NEXT_OBS] = np.concatenate(
                [observations[1:], zero_pad])
            batch[SampleBatch.DONES][-1] = True
        replay_buffer.add_batch(batch)

    num_batches = len(batch_sizes)
    total_timesteps = sum(batch_sizes)
    print(f"Loaded {num_batches} batches ({total_timesteps} ts) into the "
          f"replay buffer, which has capacity {replay_buffer.capacity}.")


# The CQL trainer: SACTrainer with the config, policies and hooks defined in
# this module swapped in.
CQLTrainer = SACTrainer.with_updates(
    name="CQL",
    default_config=CQL_DEFAULT_CONFIG,
    # Policies: `get_policy_class` returns the torch policy for
    # framework="torch" and None otherwise (presumably `default_policy` is
    # used in that case; confirm against `with_updates`).
    default_policy=CQLTFPolicy,
    get_policy_class=get_policy_class,
    # Hooks: config check, replay-only training pipeline and the one-time
    # loading of the offline dataset into the replay buffer.
    validate_config=validate_config,
    execution_plan=execution_plan,
    after_init=after_init,
)
[RLlib] New Offline RL Algorithm: CQL (based on SAC) (#13118) 2020-12-30 07:11:57 -08:00			`"""CQL (derived from SAC).`
			`"""`
[RLlib Testing] Lower `--smoke-test` "time_total_s" to make sure it doesn't time out. (#18670) 2021-09-16 18:22:23 +02:00			`import logging`
[RLlib] CQL loss fn fixes, MuJoCo + Pendulum benchmarks, offline-RL example script w/ json file. (#15603) Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-05-04 10:06:19 -07:00			`import numpy as np`
[RLlib] CQL iteration count fixes: Remove dummy buffer and unnecessary store op from exec_plan. (#16332) 2021-06-10 07:49:17 +02:00			`from typing import Optional, Type`
[RLlib] New Offline RL Algorithm: CQL (based on SAC) (#13118) 2020-12-30 07:11:57 -08:00
[RLlib] CQL TensorFlow support (#15841) 2021-05-18 11:10:46 +02:00			`from ray.rllib.agents.cql.cql_tf_policy import CQLTFPolicy`
[RLlib] CQL loss fn fixes, MuJoCo + Pendulum benchmarks, offline-RL example script w/ json file. (#15603) Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-05-04 10:06:19 -07:00			`from ray.rllib.agents.cql.cql_torch_policy import CQLTorchPolicy`
[RLlib] New Offline RL Algorithm: CQL (based on SAC) (#13118) 2020-12-30 07:11:57 -08:00			`from ray.rllib.agents.sac.sac import SACTrainer, \`
			`DEFAULT_CONFIG as SAC_CONFIG`
[RLlib] CQL for HalfCheetah-Random-v0 + Hopper-Random-v0 + CQL Bug Fixes (#14243) 2021-02-22 08:30:18 -08:00			`from ray.rllib.execution.metric_ops import StandardMetricsReporting`
			`from ray.rllib.execution.replay_buffer import LocalReplayBuffer`
			`from ray.rllib.execution.replay_ops import Replay`
[RLlib] Refactor: All tf static graph code should reside inside Policy class. (#17169) 2021-07-20 14:58:13 -04:00			`from ray.rllib.execution.train_ops import MultiGPUTrainOneStep, TrainOneStep, \`
[RLlib] CQL loss fn fixes, MuJoCo + Pendulum benchmarks, offline-RL example script w/ json file. (#15603) Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-05-04 10:06:19 -07:00			`UpdateTargetNetwork`
			`from ray.rllib.offline.shuffled_input import ShuffledInput`
			`from ray.rllib.policy.policy import LEARNER_STATS_KEY, Policy`
			`from ray.rllib.policy.sample_batch import SampleBatch`
			`from ray.rllib.utils import merge_dicts`
[RLlib Testing] Lower `--smoke-test` "time_total_s" to make sure it doesn't time out. (#18670) 2021-09-16 18:22:23 +02:00			`from ray.rllib.utils.framework import try_import_tf, try_import_tfp`
[RLlib] CQL loss fn fixes, MuJoCo + Pendulum benchmarks, offline-RL example script w/ json file. (#15603) Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-05-04 10:06:19 -07:00			`from ray.rllib.utils.typing import TrainerConfigDict`
[RLlib] CQL for HalfCheetah-Random-v0 + Hopper-Random-v0 + CQL Bug Fixes (#14243) 2021-02-22 08:30:18 -08:00
[RLlib Testing] Lower `--smoke-test` "time_total_s" to make sure it doesn't time out. (#18670) 2021-09-16 18:22:23 +02:00			`tf1, tf, tfv = try_import_tf()`
[RLlib] Bump tf version in ML docker to tf==2.5.0; add tfp to ML-docker. (#18544) 2021-09-15 08:46:37 +02:00			`tfp = try_import_tfp()`
[RLlib Testing] Lower `--smoke-test` "time_total_s" to make sure it doesn't time out. (#18670) 2021-09-16 18:22:23 +02:00			`logger = logging.getLogger(__name__)`
[RLlib] Bump tf version in ML docker to tf==2.5.0; add tfp to ML-docker. (#18544) 2021-09-15 08:46:37 +02:00			`replay_buffer = None`

[RLlib] New Offline RL Algorithm: CQL (based on SAC) (#13118) 2020-12-30 07:11:57 -08:00			`# yapf: disable`
			`# __sphinx_doc_begin__`
			`CQL_DEFAULT_CONFIG = merge_dicts(`
			`SAC_CONFIG, {`
			`# You should override this to point to an offline dataset.`
			`"input": "sampler",`
[RLlib] CQL BC loss fixes; PPO/PG/A2\|3C action normalization fixes (#16531) 2021-06-30 12:32:11 +02:00			`# Switch off off-policy evaluation.`
[RLlib] Support for D4RL + Semi-working CQL Benchmark (#13550) 2021-01-21 07:43:55 -08:00			`"input_evaluation": [],`
[RLlib] CQL loss fn fixes, MuJoCo + Pendulum benchmarks, offline-RL example script w/ json file. (#15603) Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-05-04 10:06:19 -07:00			`# Number of iterations with Behavior Cloning Pretraining.`
[RLlib] New Offline RL Algorithm: CQL (based on SAC) (#13118) 2020-12-30 07:11:57 -08:00			`"bc_iters": 20000,`
[RLlib] CQL BC loss fixes; PPO/PG/A2\|3C action normalization fixes (#16531) 2021-06-30 12:32:11 +02:00			`# CQL loss temperature.`
[RLlib] New Offline RL Algorithm: CQL (based on SAC) (#13118) 2020-12-30 07:11:57 -08:00			`"temperature": 1.0,`
[RLlib] CQL BC loss fixes; PPO/PG/A2\|3C action normalization fixes (#16531) 2021-06-30 12:32:11 +02:00			`# Number of actions to sample for CQL loss.`
[RLlib] New Offline RL Algorithm: CQL (based on SAC) (#13118) 2020-12-30 07:11:57 -08:00			`"num_actions": 10,`
[RLlib] CQL BC loss fixes; PPO/PG/A2\|3C action normalization fixes (#16531) 2021-06-30 12:32:11 +02:00			`# Whether to use the Lagrangian for Alpha Prime (in CQL loss).`
[RLlib] New Offline RL Algorithm: CQL (based on SAC) (#13118) 2020-12-30 07:11:57 -08:00			`"lagrangian": False,`
[RLlib] CQL BC loss fixes; PPO/PG/A2\|3C action normalization fixes (#16531) 2021-06-30 12:32:11 +02:00			`# Lagrangian threshold.`
[RLlib] New Offline RL Algorithm: CQL (based on SAC) (#13118) 2020-12-30 07:11:57 -08:00			`"lagrangian_thresh": 5.0,`
[RLlib] CQL BC loss fixes; PPO/PG/A2\|3C action normalization fixes (#16531) 2021-06-30 12:32:11 +02:00			`# Min Q weight multiplier.`
[RLlib] New Offline RL Algorithm: CQL (based on SAC) (#13118) 2020-12-30 07:11:57 -08:00			`"min_q_weight": 5.0,`
[RLlib] CQL BC loss fixes; PPO/PG/A2\|3C action normalization fixes (#16531) 2021-06-30 12:32:11 +02:00			`# Replay buffer should be larger or equal the size of the offline`
			`# dataset.`
[RLlib] CQL loss fn fixes, MuJoCo + Pendulum benchmarks, offline-RL example script w/ json file. (#15603) Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-05-04 10:06:19 -07:00			`"buffer_size": int(1e6),`
[RLlib] New Offline RL Algorithm: CQL (based on SAC) (#13118) 2020-12-30 07:11:57 -08:00			`})`
			`# __sphinx_doc_end__`
			`# yapf: enable`


			`def validate_config(config: TrainerConfigDict):`
[RLlib] Multi-GPU for tf-DQN/PG/A2C. (#13393) 2021-03-08 15:41:27 +01:00			`if config["num_gpus"] > 1:`
			raise ValueError("`num_gpus` > 1 not yet supported for CQL!")
[RLlib] New Offline RL Algorithm: CQL (based on SAC) (#13118) 2020-12-30 07:11:57 -08:00
[RLlib] Issue 17667: CQL-torch + GPU not working (due to simple_optimizer=False; must use simple optimizer!). (#17742) 2021-08-11 18:30:21 +02:00			`# CQL-torch performs the optimizer steps inside the loss function.`
			`# Using the multi-GPU optimizer will therefore not work (see multi-GPU`
			`# check above) and we must use the simple optimizer for now.`
			`if config["simple_optimizer"] is not True and \`
			`config["framework"] == "torch":`
			`config["simple_optimizer"] = True`

[RLlib] Bump tf version in ML docker to tf==2.5.0; add tfp to ML-docker. (#18544) 2021-09-15 08:46:37 +02:00			`if config["framework"] in ["tf", "tf2", "tfe"] and tfp is None:`
[RLlib Testing] Lower `--smoke-test` "time_total_s" to make sure it doesn't time out. (#18670) 2021-09-16 18:22:23 +02:00			`logger.warning(`
			"You need `tensorflow_probability` in order to run CQL! "
			"Install it via `pip install tensorflow_probability`. Your "
			`f"tf.__version__={tf.__version__ if tf else None}."`
			`"Trying to import tfp results in the following error:")`
			`try_import_tfp(error=True)`
[RLlib] CQL for HalfCheetah-Random-v0 + Hopper-Random-v0 + CQL Bug Fixes (#14243) 2021-02-22 08:30:18 -08:00

			`def execution_plan(workers, config):`
			`if config.get("prioritized_replay"):`
			`prio_args = {`
			`"prioritized_replay_alpha": config["prioritized_replay_alpha"],`
			`"prioritized_replay_beta": config["prioritized_replay_beta"],`
			`"prioritized_replay_eps": config["prioritized_replay_eps"],`
			`}`
			`else:`
			`prio_args = {}`

			`local_replay_buffer = LocalReplayBuffer(`
			`num_shards=1,`
			`learning_starts=config["learning_starts"],`
			`buffer_size=config["buffer_size"],`
			`replay_batch_size=config["train_batch_size"],`
			`replay_mode=config["multiagent"]["replay_mode"],`
[RLlib] CQL Documentation + Tests (#14531) 2021-03-11 09:51:39 -08:00			`replay_sequence_length=config.get("replay_sequence_length", 1),`
[RLlib] CQL loss fn fixes, MuJoCo + Pendulum benchmarks, offline-RL example script w/ json file. (#15603) Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-05-04 10:06:19 -07:00			`replay_burn_in=config.get("burn_in", 0),`
			`replay_zero_init_states=config.get("zero_init_states", True),`
[RLlib] CQL for HalfCheetah-Random-v0 + Hopper-Random-v0 + CQL Bug Fixes (#14243) 2021-02-22 08:30:18 -08:00			`**prio_args)`

			`global replay_buffer`
			`replay_buffer = local_replay_buffer`

			`def update_prio(item):`
			`samples, info_dict = item`
			`if config.get("prioritized_replay"):`
			`prio_dict = {}`
			`for policy_id, info in info_dict.items():`
[RLlib] CQL loss fn fixes, MuJoCo + Pendulum benchmarks, offline-RL example script w/ json file. (#15603) Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-05-04 10:06:19 -07:00			`# TODO(sven): This is currently structured differently for`
			`# torch/tf. Clean up these results/info dicts across`
			`# policies (note: fixing this in torch_policy.py will`
			`# break e.g. DDPPO!).`
[RLlib] CQL for HalfCheetah-Random-v0 + Hopper-Random-v0 + CQL Bug Fixes (#14243) 2021-02-22 08:30:18 -08:00			`td_error = info.get("td_error",`
			`info[LEARNER_STATS_KEY].get("td_error"))`
[RLlib] CQL loss fn fixes, MuJoCo + Pendulum benchmarks, offline-RL example script w/ json file. (#15603) Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-05-04 10:06:19 -07:00			`samples.policy_batches[policy_id].set_get_interceptor(None)`
[RLlib] CQL for HalfCheetah-Random-v0 + Hopper-Random-v0 + CQL Bug Fixes (#14243) 2021-02-22 08:30:18 -08:00			`prio_dict[policy_id] = (samples.policy_batches[policy_id]`
[RLlib] Remove all (already soft-deprecated) `SampleBatch.data` from code. (#15335) 2021-04-15 19:19:51 +02:00			`.get("batch_indexes"), td_error)`
[RLlib] CQL for HalfCheetah-Random-v0 + Hopper-Random-v0 + CQL Bug Fixes (#14243) 2021-02-22 08:30:18 -08:00			`local_replay_buffer.update_priorities(prio_dict)`
			`return info_dict`

[RLlib] CQL loss fn fixes, MuJoCo + Pendulum benchmarks, offline-RL example script w/ json file. (#15603) Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-05-04 10:06:19 -07:00			`# (2) Read and train on experiences from the replay buffer. Every batch`
			`# returned from the LocalReplay() iterator is passed to TrainOneStep to`
			`# take a SGD step, and then we decide whether to update the target network.`
[RLlib] CQL for HalfCheetah-Random-v0 + Hopper-Random-v0 + CQL Bug Fixes (#14243) 2021-02-22 08:30:18 -08:00			`post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)`
[RLlib] CQL loss fn fixes, MuJoCo + Pendulum benchmarks, offline-RL example script w/ json file. (#15603) Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-05-04 10:06:19 -07:00
			`if config["simple_optimizer"]:`
			`train_step_op = TrainOneStep(workers)`
			`else:`
[RLlib] Refactor: All tf static graph code should reside inside Policy class. (#17169) 2021-07-20 14:58:13 -04:00			`train_step_op = MultiGPUTrainOneStep(`
[RLlib] CQL loss fn fixes, MuJoCo + Pendulum benchmarks, offline-RL example script w/ json file. (#15603) Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-05-04 10:06:19 -07:00			`workers=workers,`
			`sgd_minibatch_size=config["train_batch_size"],`
			`num_sgd_iter=1,`
			`num_gpus=config["num_gpus"],`
			`shuffle_sequences=True,`
			`_fake_gpus=config["_fake_gpus"],`
			`framework=config.get("framework"))`

[RLlib] CQL for HalfCheetah-Random-v0 + Hopper-Random-v0 + CQL Bug Fixes (#14243) 2021-02-22 08:30:18 -08:00			`replay_op = Replay(local_buffer=local_replay_buffer) \`
			`.for_each(lambda x: post_fn(x, workers, config)) \`
[RLlib] CQL loss fn fixes, MuJoCo + Pendulum benchmarks, offline-RL example script w/ json file. (#15603) Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-05-04 10:06:19 -07:00			`.for_each(train_step_op) \`
[RLlib] CQL for HalfCheetah-Random-v0 + Hopper-Random-v0 + CQL Bug Fixes (#14243) 2021-02-22 08:30:18 -08:00			`.for_each(update_prio) \`
			`.for_each(UpdateTargetNetwork(`
			`workers, config["target_network_update_freq"]))`

[RLlib] CQL iteration count fixes: Remove dummy buffer and unnecessary store op from exec_plan. (#16332) 2021-06-10 07:49:17 +02:00			`return StandardMetricsReporting(`
			`replay_op, workers, config, by_steps_trained=True)`
[RLlib] CQL for HalfCheetah-Random-v0 + Hopper-Random-v0 + CQL Bug Fixes (#14243) 2021-02-22 08:30:18 -08:00

[RLlib] CQL loss fn fixes, MuJoCo + Pendulum benchmarks, offline-RL example script w/ json file. (#15603) Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-05-04 10:06:19 -07:00			`def get_policy_class(config: TrainerConfigDict) -> Optional[Type[Policy]]:`
			`if config["framework"] == "torch":`
			`return CQLTorchPolicy`


[RLlib] CQL for HalfCheetah-Random-v0 + Hopper-Random-v0 + CQL Bug Fixes (#14243) 2021-02-22 08:30:18 -08:00			`def after_init(trainer):`
			`# Add the entire dataset to Replay Buffer (global variable)`
			`global replay_buffer`
[RLlib] CQL loss fn fixes, MuJoCo + Pendulum benchmarks, offline-RL example script w/ json file. (#15603) Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-05-04 10:06:19 -07:00			`reader = trainer.workers.local_worker().input_reader`

			`# For d4rl, add the D4RLReaders' dataset to the buffer.`
[rllib] Enhancements to Input API for customizing offline datasets (#16957) Co-authored-by: Richard Liaw <rliaw@berkeley.edu> 2021-07-10 18:05:25 -04:00			`if isinstance(trainer.config["input"], str) and \`
			`"d4rl" in trainer.config["input"]:`
[RLlib] CQL loss fn fixes, MuJoCo + Pendulum benchmarks, offline-RL example script w/ json file. (#15603) Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-05-04 10:06:19 -07:00			`dataset = reader.dataset`
[RLlib] CQL Documentation + Tests (#14531) 2021-03-11 09:51:39 -08:00			`replay_buffer.add_batch(dataset)`
[RLlib] CQL loss fn fixes, MuJoCo + Pendulum benchmarks, offline-RL example script w/ json file. (#15603) Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-05-04 10:06:19 -07:00			`# For a list of files, add each file's entire content to the buffer.`
			`elif isinstance(reader, ShuffledInput):`
			`num_batches = 0`
			`total_timesteps = 0`
			`for batch in reader.child.read_all_files():`
			`num_batches += 1`
			`total_timesteps += len(batch)`
			`# Add NEXT_OBS if not available. This is slightly hacked`
			`# as for the very last time step, we will use next-obs=zeros`
			`# and therefore force-set DONE=True to avoid this missing`
			`# next-obs to cause learning problems.`
			`if SampleBatch.NEXT_OBS not in batch:`
			`obs = batch[SampleBatch.OBS]`
			`batch[SampleBatch.NEXT_OBS] = \`
			`np.concatenate([obs[1:], np.zeros_like(obs[0:1])])`
			`batch[SampleBatch.DONES][-1] = True`
			`replay_buffer.add_batch(batch)`
[RLlib] Replay buffers: Add config option to store contents in checkpoints. (#17999) 2021-08-31 12:21:49 +02:00			`print(f"Loaded {num_batches} batches ({total_timesteps} ts) into the "`
			`f"replay buffer, which has capacity {replay_buffer.capacity}.")`
[RLlib] CQL loss fn fixes, MuJoCo + Pendulum benchmarks, offline-RL example script w/ json file. (#15603) Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-05-04 10:06:19 -07:00			`else:`
			`raise ValueError(`
			`"Unknown offline input! config['input'] must either be list of "`
			`"offline files (json) or a D4RL-specific InputReader specifier "`
			`"(e.g. 'd4rl.hopper-medium-v0').")`
[RLlib] CQL for HalfCheetah-Random-v0 + Hopper-Random-v0 + CQL Bug Fixes (#14243) 2021-02-22 08:30:18 -08:00

[RLlib] New Offline RL Algorithm: CQL (based on SAC) (#13118) 2020-12-30 07:11:57 -08:00			`CQLTrainer = SACTrainer.with_updates(`
			`name="CQL",`
			`default_config=CQL_DEFAULT_CONFIG,`
			`validate_config=validate_config,`
[RLlib] CQL TensorFlow support (#15841) 2021-05-18 11:10:46 +02:00			`default_policy=CQLTFPolicy,`
[RLlib] New Offline RL Algorithm: CQL (based on SAC) (#13118) 2020-12-30 07:11:57 -08:00			`get_policy_class=get_policy_class,`
[RLlib] CQL for HalfCheetah-Random-v0 + Hopper-Random-v0 + CQL Bug Fixes (#14243) 2021-02-22 08:30:18 -08:00			`after_init=after_init,`
			`execution_plan=execution_plan,`
[RLlib] New Offline RL Algorithm: CQL (based on SAC) (#13118) 2020-12-30 07:11:57 -08:00			`)`