ray/rllib/contrib/bandits/envs/parametric.py

import copy

import gym
import numpy as np
from gym import spaces

DEFAULT_RECO_CONFIG = {
    "num_users": 1,
    "num_items": 100,
    "feature_dim": 16,
    "slate_size": 1,
    "num_candidates": 25,
    "seed": 1
}


class ParametricItemRecoEnv(gym.Env):
    """A recommendation environment which generates items with visible features
     randomly (parametric actions).
    The environment can be configured to be multi-user, i.e. different model
    will be learned independently for each user.
    To enable slate recommendation, the `slate_size` config parameter can be
    set as > 1 .
    """

    def __init__(self, config=None):
        self.config = copy.copy(DEFAULT_RECO_CONFIG)
        if config is not None and type(config) == dict:
            self.config.update(config)

        self.num_users = self.config["num_users"]
        self.num_items = self.config["num_items"]
        self.feature_dim = self.config["feature_dim"]
        self.slate_size = self.config["slate_size"]
        self.num_candidates = self.config["num_candidates"]
        self.seed = self.config["seed"]

        assert self.num_candidates <= self.num_items,\
            "Size of candidate pool should be less than total no. of items"
        assert self.slate_size < self.num_candidates,\
            "Slate size should be less than no. of candidate items"

        self.action_space = self._def_action_space()
        self.observation_space = self._def_observation_space()

        self.current_user_id = 0
        self.item_pool = None
        self.item_pool_ids = None
        self.total_regret = 0

        self._init_embeddings()

    def _init_embeddings(self):
        self.item_embeddings = self._gen_normalized_embeddings(
            self.num_items, self.feature_dim)

        # These are latent user features that will be hidden from the learning
        # agent. They will be used for reward generation only
        self.user_embeddings = self._gen_normalized_embeddings(
            self.num_users, self.feature_dim)

    def _sample_user(self):
        self.current_user_id = np.random.randint(0, self.num_users)

    def _gen_item_pool(self):
        # Randomly generate a candidate list of items by sampling without
        # replacement
        self.item_pool_ids = np.random.choice(
            np.arange(self.num_items), self.num_candidates, replace=False)
        self.item_pool = self.item_embeddings[self.item_pool_ids]

    @staticmethod
    def _gen_normalized_embeddings(size, dim):
        embeddings = np.random.rand(size, dim)
        embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)
        return embeddings

    def _def_action_space(self):
        if self.slate_size == 1:
            return spaces.Discrete(self.num_candidates)
        else:
            return spaces.MultiDiscrete(
                [self.num_candidates] * self.slate_size)

    def _def_observation_space(self):
        # Embeddings for each item in the candidate pool
        item_obs_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(self.num_candidates, self.feature_dim))

        # Can be useful for collaborative filtering based agents
        item_ids_obs_space = spaces.MultiDiscrete(
            [self.num_items] * self.num_candidates)

        # Can be either binary (clicks) or continuous feedback (watch time)
        resp_space = spaces.Box(low=-1, high=1, shape=(self.slate_size, ))

        if self.num_users == 1:
            return spaces.Dict({
                "item": item_obs_space,
                "item_id": item_ids_obs_space,
                "response": resp_space
            })
        else:
            user_obs_space = spaces.Discrete(self.num_users)
            return spaces.Dict({
                "user": user_obs_space,
                "item": item_obs_space,
                "item_id": item_ids_obs_space,
                "response": resp_space
            })

    def step(self, action):
        # Action can be a single action or a slate depending on slate size
        assert self.action_space.contains(
            action
        ), "Action cannot be recognized. Please check the type and bounds."

        if self.slate_size == 1:
            scores = self.item_pool.dot(
                self.user_embeddings[self.current_user_id])
            reward = scores[action]
            regret = np.max(scores) - reward
            self.total_regret += regret

            info = {"regret": regret}

            self.current_user_id = np.random.randint(0, self.num_users)
            self._gen_item_pool()

            obs = {
                "item": self.item_pool,
                "item_id": self.item_pool_ids,
                "response": [reward]
            }
            if self.num_users > 1:
                obs["user"] = self.current_user_id
            return obs, reward, True, info
        else:
            # TODO(saurabh3949):Handle slate recommendation using a click model
            return None

    def reset(self):
        self._sample_user()
        self._gen_item_pool()
        obs = {
            "item": self.item_pool,
            "item_id": self.item_pool_ids,
            "response": [0] * self.slate_size
        }
        if self.num_users > 1:
            obs["user"] = self.current_user_id
        return obs

    def render(self, mode="human"):
        raise NotImplementedError
Contextual Bandit algorithms (WIP) (#7642) 2020-03-26 13:41:16 -07:00			`import copy`

			`import gym`
			`import numpy as np`
			`from gym import spaces`

			`DEFAULT_RECO_CONFIG = {`
			`"num_users": 1,`
			`"num_items": 100,`
			`"feature_dim": 16,`
			`"slate_size": 1,`
			`"num_candidates": 25,`
			`"seed": 1`
			`}`


			`class ParametricItemRecoEnv(gym.Env):`
			`"""A recommendation environment which generates items with visible features`
			`randomly (parametric actions).`
			`The environment can be configured to be multi-user, i.e. different model`
			`will be learned independently for each user.`
			To enable slate recommendation, the `slate_size` config parameter can be
			`set as > 1 .`
			`"""`

			`def __init__(self, config=None):`
			`self.config = copy.copy(DEFAULT_RECO_CONFIG)`
			`if config is not None and type(config) == dict:`
			`self.config.update(config)`

			`self.num_users = self.config["num_users"]`
			`self.num_items = self.config["num_items"]`
			`self.feature_dim = self.config["feature_dim"]`
			`self.slate_size = self.config["slate_size"]`
			`self.num_candidates = self.config["num_candidates"]`
			`self.seed = self.config["seed"]`

			`assert self.num_candidates <= self.num_items,\`
			`"Size of candidate pool should be less than total no. of items"`
			`assert self.slate_size < self.num_candidates,\`
			`"Slate size should be less than no. of candidate items"`

			`self.action_space = self._def_action_space()`
			`self.observation_space = self._def_observation_space()`

			`self.current_user_id = 0`
			`self.item_pool = None`
			`self.item_pool_ids = None`
			`self.total_regret = 0`

			`self._init_embeddings()`

			`def _init_embeddings(self):`
			`self.item_embeddings = self._gen_normalized_embeddings(`
			`self.num_items, self.feature_dim)`

			`# These are latent user features that will be hidden from the learning`
			`# agent. They will be used for reward generation only`
			`self.user_embeddings = self._gen_normalized_embeddings(`
			`self.num_users, self.feature_dim)`

			`def _sample_user(self):`
			`self.current_user_id = np.random.randint(0, self.num_users)`

			`def _gen_item_pool(self):`
			`# Randomly generate a candidate list of items by sampling without`
			`# replacement`
			`self.item_pool_ids = np.random.choice(`
			`np.arange(self.num_items), self.num_candidates, replace=False)`
			`self.item_pool = self.item_embeddings[self.item_pool_ids]`

			`@staticmethod`
			`def _gen_normalized_embeddings(size, dim):`
			`embeddings = np.random.rand(size, dim)`
			`embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)`
			`return embeddings`

			`def _def_action_space(self):`
			`if self.slate_size == 1:`
			`return spaces.Discrete(self.num_candidates)`
			`else:`
			`return spaces.MultiDiscrete(`
			`[self.num_candidates] * self.slate_size)`

			`def _def_observation_space(self):`
			`# Embeddings for each item in the candidate pool`
			`item_obs_space = spaces.Box(`
			`low=-np.inf,`
			`high=np.inf,`
			`shape=(self.num_candidates, self.feature_dim))`

			`# Can be useful for collaborative filtering based agents`
			`item_ids_obs_space = spaces.MultiDiscrete(`
			`[self.num_items] * self.num_candidates)`

			`# Can be either binary (clicks) or continuous feedback (watch time)`
			`resp_space = spaces.Box(low=-1, high=1, shape=(self.slate_size, ))`

			`if self.num_users == 1:`
			`return spaces.Dict({`
			`"item": item_obs_space,`
			`"item_id": item_ids_obs_space,`
			`"response": resp_space`
			`})`
			`else:`
			`user_obs_space = spaces.Discrete(self.num_users)`
			`return spaces.Dict({`
			`"user": user_obs_space,`
			`"item": item_obs_space,`
			`"item_id": item_ids_obs_space,`
			`"response": resp_space`
			`})`

			`def step(self, action):`
			`# Action can be a single action or a slate depending on slate size`
			`assert self.action_space.contains(`
			`action`
			`), "Action cannot be recognized. Please check the type and bounds."`

			`if self.slate_size == 1:`
			`scores = self.item_pool.dot(`
			`self.user_embeddings[self.current_user_id])`
			`reward = scores[action]`
			`regret = np.max(scores) - reward`
			`self.total_regret += regret`

			`info = {"regret": regret}`

			`self.current_user_id = np.random.randint(0, self.num_users)`
			`self._gen_item_pool()`

			`obs = {`
			`"item": self.item_pool,`
			`"item_id": self.item_pool_ids,`
			`"response": [reward]`
			`}`
			`if self.num_users > 1:`
			`obs["user"] = self.current_user_id`
			`return obs, reward, True, info`
			`else:`
			`# TODO(saurabh3949):Handle slate recommendation using a click model`
			`return None`

			`def reset(self):`
			`self._sample_user()`
			`self._gen_item_pool()`
			`obs = {`
			`"item": self.item_pool,`
			`"item_id": self.item_pool_ids,`
			`"response": [0] * self.slate_size`
			`}`
			`if self.num_users > 1:`
			`obs["user"] = self.current_user_id`
			`return obs`

			`def render(self, mode="human"):`
			`raise NotImplementedError`