import logging

from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.contrib.bandits.agents.policy import BanditPolicy

logger = logging.getLogger(__name__)

# yapf: disable
# __sphinx_doc_begin__
TS_CONFIG = with_common_config({
    # No remote workers by default.
    "num_workers": 0,
    "use_pytorch": True,

    # Do online learning one step at a time.
    "rollout_fragment_length": 1,
    "train_batch_size": 1,

    # Bandits can't afford to do one timestep per iteration, as that is
    # extremely slow because of metrics-collection overhead. With this
    # setting, the agent is trained 100 times in one RLlib iteration.
    "timesteps_per_iteration": 100,

    "exploration_config": {
        "type": "ray.rllib.contrib.bandits.exploration.ThompsonSampling"
    }
})
# __sphinx_doc_end__
# yapf: enable


def get_stats(trainer):
    # Merge the environment metrics collected by the trainer with the
    # optimizer's learner statistics into a single stats dict.
    env_metrics = trainer.collect_metrics()
    stats = trainer.optimizer.stats()
    # Uncomment if the regret at each time step is needed.
    # stats.update({"all_regrets": trainer.get_policy().regrets})
    return dict(env_metrics, **stats)


LinTSTrainer = build_trainer(
    name="LinTS",
    default_config=TS_CONFIG,
    default_policy=BanditPolicy,
    collect_metrics_fn=get_stats)
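# ----------------------------------------------------------------------------
# Illustrative usage sketch (not part of the upstream module): it shows how
# the LinTSTrainer built above might be run with the default TS_CONFIG.
# It assumes a contextual-bandit environment class such as WheelBanditEnv is
# importable from ray.rllib.contrib.bandits.envs; that import and the
# iteration count below are assumptions for demonstration only.
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    import ray
    from ray.rllib.contrib.bandits.envs import WheelBanditEnv

    ray.init()
    # Uses TS_CONFIG (the default_config above) unchanged.
    trainer = LinTSTrainer(env=WheelBanditEnv)
    for i in range(5):
        result = trainer.train()
        print("Iteration {}: mean reward = {}".format(
            i, result["episode_reward_mean"]))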