Source code for rl_agents.agents.mab.ucbs

import numpy as np

from rl_agents.agents.mab.base import BaseMAB


class UCB(BaseMAB):
    r"""MAB agent following an Upper Confidence Bound policy.

    The UCB selects the action that maximizes the function given by:

    .. math:: f(i) = \mu_i + U_i,

    where :math:`\mu_i` is the average reward of arm :math:`i`, and
    :math:`U_i` is given by:

    .. math:: U_i = \sqrt{\frac{-\log{p}}{2 N_i} },

    where :math:`N_i` is the number of pulls made to arm :math:`i`.

    Parameters
    ----------
    n_arms : int
        Number of actions (arms) of the MAB.
    p : float
        Probability of the true value being above the estimate plus the bound.

    Attributes
    ----------
    means : numpy.array(float, ndim=1)
        Vector containing the average reward of each arm.
    trials : numpy.array(float, ndim=1)
        Vector containing the number of trials made to each arm.
    bounds : numpy.array(float, ndim=1)
        Vector containing the upper bounds of each arm.
    t : int
        Total trial counter.

    """

    def __init__(self, n_arms, p):
        self.p = p
        self.n_arms = n_arms
        self.means = np.zeros(self.n_arms)
        self.trials = np.zeros(self.n_arms)
        self.bounds = np.zeros(self.n_arms)
        self.t = 0

    def learn(self, a_idx, reward):
        """Learn from the interaction.

        Update the mean, the trial count and the bound of the pulled arm,
        and advance the total trial counter.

        Parameters
        ----------
        a_idx : int
            Index of the arm pulled (action taken).
        reward : float
            Reward received from the system after taking action a_idx.

        """
        # Running average: fold the new reward into the stored mean.
        self.means[a_idx] = (
            (self.means[a_idx] * self.trials[a_idx]) + reward
        ) / (self.trials[a_idx] + 1)
        self.trials[a_idx] += 1  # add trial
        # The denominator is (2 * N_i) as a whole; without the parentheses
        # the bound would be multiplied by N_i and grow with every pull.
        self.bounds[a_idx] = np.sqrt(
            -np.log(self.p) / (2 * self.trials[a_idx])
        )
        self.t += 1

    def predict(self):
        """Predict next action.

        Pulls each arm once, then chooses the arm that gives the best
        mean + bound.

        Returns
        -------
        int
            Index of chosen action.

        """
        # While t < n_arms the total counter doubles as the index of the
        # next arm to try.
        if self.t < self.n_arms:
            return self.t
        return np.argmax(self.means + self.bounds)


class UCB1(BaseMAB):
    r"""MAB agent following the UCB1 policy.

    The agent selects the action that maximizes the function given by:

    .. math:: f(i) = \mu_i + c \sqrt{\frac{\log{t}}{N_i}},

    where :math:`\mu_i` is the average reward of arm :math:`i`,
    :math:`N_i` is the number of pulls made to arm :math:`i` and
    :math:`t` is the total number of pulls made so far.

    Parameters
    ----------
    n_arms : int
        Number of actions (arms) of the MAB.
    c : float
        Multiplier applied to the exploration term (defaults to 4).

    Attributes
    ----------
    means : numpy.array(float, ndim=1)
        Vector containing the average reward of each arm.
    trials : numpy.array(float, ndim=1)
        Vector containing the number of trials made to each arm.
    bounds : numpy.array(float, ndim=1)
        Vector containing the exploration bonus of each arm.
    t : int
        Total trial counter.

    """

    def __init__(self, n_arms, c=4):
        self.n_arms = n_arms
        self.means = np.zeros(self.n_arms)
        self.trials = np.zeros(self.n_arms)
        self.bounds = np.zeros(self.n_arms)
        self.c = c
        self.t = 0

    def learn(self, a_idx, reward):
        """Learn from the interaction.

        Update the mean and trial count of the pulled arm, advance the
        total trial counter and refresh the bounds.

        Parameters
        ----------
        a_idx : int
            Index of the arm pulled (action taken).
        reward : float
            Reward received from the system after taking action a_idx.

        """
        # Running average: fold the new reward into the stored mean.
        self.means[a_idx] = (
            (self.means[a_idx] * self.trials[a_idx]) + reward
        ) / (self.trials[a_idx] + 1)
        self.trials[a_idx] += 1  # add trial
        self.t += 1
        # The bonus depends on the global counter t, so it must be refreshed
        # for every arm, not only the one just pulled; otherwise the bonus
        # of neglected arms never grows. Arms not pulled yet keep a bound
        # of 0, which avoids dividing by zero (predict() does not use the
        # bounds until every arm has been tried once).
        pulled = self.trials > 0
        self.bounds[pulled] = self.c * np.sqrt(
            np.log(self.t) / self.trials[pulled]
        )

    def predict(self):
        """Predict next action.

        Pulls each arm once, then chooses the arm that gives the best
        mean + bound.

        Returns
        -------
        int
            Index of chosen action.

        """
        # While t < n_arms the total counter doubles as the index of the
        # next arm to try.
        if self.t < self.n_arms:
            return self.t
        return np.argmax(self.means + self.bounds)


class UCB2(BaseMAB):
    r"""MAB agent following the UCB2 policy.

    After pulling each arm once, the agent works in epochs. At the start of
    an epoch it selects the arm that maximizes:

    .. math:: f(i) = \mu_i + a_{t, r_i},

    with

    .. math:: a_{t, r} = \sqrt{\frac{(1 + \alpha)
              \log(e t / \tau(r))}{2 \tau(r)}},
              \qquad \tau(r) = \lceil (1 + \alpha)^r \rceil,

    where :math:`\mu_i` is the average reward of arm :math:`i`, :math:`t`
    is the total number of pulls and :math:`r_i` is the number of epochs
    already started for arm :math:`i`. The selected arm is then pulled
    :math:`\tau(r_i + 1) - \tau(r_i)` times (at least once) before a new
    selection is made.

    Parameters
    ----------
    n_arms : int
        Number of actions (arms) of the MAB.
    alpha : float
        Epoch growth parameter used in :math:`\tau` and in the bound.

    Attributes
    ----------
    means : numpy.array(float, ndim=1)
        Vector containing the average reward of each arm.
    trials : numpy.array(float, ndim=1)
        Vector containing the number of trials made to each arm.
    bounds : numpy.array(float, ndim=1)
        Vector containing the exploration bonus of each arm.
    rj : numpy.array(float, ndim=1)
        Vector containing the number of epochs started for each arm.
    t : int
        Total trial counter.
    counter : int
        Number of pulls left in the current epoch.
    current : int
        Index of the arm played during the current epoch.

    """

    def __init__(self, n_arms, alpha):
        self.n_arms = n_arms
        self.means = np.zeros(self.n_arms)
        self.trials = np.zeros(self.n_arms)
        self.bounds = np.zeros(self.n_arms)
        self.rj = np.zeros(self.n_arms)
        self.alpha = alpha
        self.t = 0
        self.counter = 0
        self.current = 0

    def learn(self, a_idx, reward):
        """Learn from the interaction.

        Update the mean and trial count of the pulled arm, advance the
        total trial counter and refresh the bounds of every arm.

        Parameters
        ----------
        a_idx : int
            Index of the arm pulled (action taken).
        reward : float
            Reward received from the system after taking action a_idx.

        """
        # Running average: fold the new reward into the stored mean.
        self.means[a_idx] = (
            (self.means[a_idx] * self.trials[a_idx]) + reward
        ) / (self.trials[a_idx] + 1)
        self.trials[a_idx] += 1  # add trial
        self.t += 1
        # Epoch bookkeeping (rj, counter) is handled in predict(), where the
        # epoch is actually chosen; doing it here on every pull would
        # restart the epoch each step and never allow a new selection.
        self._refresh_bounds()

    def predict(self):
        """Predict next action.

        Pulls each arm once. Afterwards, when no epoch is in progress, it
        chooses the arm that gives the best mean + bound and starts a new
        epoch for it; otherwise it keeps playing the arm of the current
        epoch. Each call consumes one pull of the current epoch.

        Returns
        -------
        int
            Index of chosen action.

        """
        # While t < n_arms the total counter doubles as the index of the
        # next arm to try.
        if self.t < self.n_arms:
            return self.t
        if self.counter <= 0:
            chosen = np.argmax(self.means + self.bounds)
            epoch_len = self._tau(self.rj[chosen] + 1) - self._tau(
                self.rj[chosen]
            )
            # For small alpha consecutive tau values can coincide; play at
            # least one pull so that every selection yields an action.
            self.counter = max(1, int(epoch_len))
            self.rj[chosen] += 1
            self.current = chosen
        self.counter -= 1
        return self.current

    def _refresh_bounds(self):
        """Recompute the bound of every arm from the current t and rj."""
        tau = self._tau(self.rj)
        self.bounds = np.sqrt(
            (1 + self.alpha) * np.log(np.e * self.t / tau) / (2 * tau)
        )

    def _tau(self, rj):
        """Return ceil((1 + alpha) ** rj), for a scalar or an array rj."""
        return np.ceil((1 + self.alpha) ** rj)