Source code for rl_agents.agents.mab.egreedy

import numpy as np

from rl_agents.agents.mab.base import BaseMAB


[docs]class EpsilonGreedy(BaseMAB):
    r"""Epsilon-Greedy agent.

    The agent uses the epsilon-greedy approach to solve the Multi-Armed
    bandit problem.

    The parameter :math:`\epsilon` is used for the
    exploration-exploitation trade-off. With probability :math:`\epsilon`
    the agent selects a random action, otherwise it selects the action
    that has the best average reward.

    Parameters
    ----------
    n_arms : int
        Number of actions (arms) of the MAB.
    epsilon : float
        Probability of selecting a random action.

    Attributes
    ----------
    means : numpy.array(float, ndim=1)
        Vector containing the average reward of each arm.
    trials : numpy.array(float, ndim=1)
        Vector containing the number of trials made to each arm.

    """

    def __init__(self, n_arms, epsilon):
        self.epsilon = epsilon
        self.n_arms = n_arms
        self.means = np.zeros(self.n_arms)
        self.trials = np.zeros(self.n_arms)

[docs]    def learn(self, a_idx, reward):
        """Make `EpsilonGreedy` agent learn from the interaction.

        The `EpsilonGreedy` agent learns from its previous choice
        and the reward received from this action.
        Updates the means and the trials.

        Parameters
        ----------
        reward : float
            Reward received from the system after taking action a_idx.
        a_idx : int
            Index of the arm pulled (action taken).

        """
        self.means[a_idx] = (
            self.means[a_idx] * self.trials[a_idx] + reward
        ) / (self.trials[a_idx] + 1)
        self.trials[a_idx] += 1  # add trial

[docs]    def predict(self):
        r"""Predict next action.

        With probability :math:`\epsilon` the agent selects a random arm.
        With probability :math:`1 - \epsilon` the agent selects the arm that
        has the best average reward.

        Returns
        -------
        int
            Index of chosen action.

        """
        if np.random.rand() < self.epsilon:
            a = np.random.randint(low=0, high=self.n_arms)
        else:
            a = self.means.argmax()
        return a


[docs]class DecayEpsilon(BaseMAB):
    r"""Agent that follows an epsilon-decreasing policy.

    The agent uses the epsilon-greedy approach to solve the Multi-Armed
    bandit problem, but with a decay in the epsilon.

    The parameter :math:`\epsilon` is used for the exploration-exploitation
    trade-off. With probability :math:`\epsilon` the agent selects a
    random action, otherwise it selects the action that has the
    best average reward. After each interaction the epsilon is updated as
    epsilon = epsilon * decay.

    Parameters
    ----------
    n_arms : int
        Number of actions (arms) of the MAB.
    max_epsilon : float
        Initial epsilon.
    decay : float
        Decay of the epsilon.

    Attributes
    ----------
    epsilon : float
        Epsilon of the agent. Constantly updated as epsilon = epsilon*decay
    means : numpy.array(float, ndim=1)
        Vector containing the average reward of each arm.
    trials : numpy.array(float, ndim=1)
        Vector containing the number of trials made to each arm.

    """

    def __init__(self, n_arms, max_epsilon, decay):
        self.epsilon = max_epsilon
        self.n_arms = n_arms
        self.means = np.zeros(self.n_arms)
        self.trials = np.zeros(self.n_arms)
        self.decay = decay

[docs]    def learn(self, a_idx, reward):
        """Make the `DecayEpsilon` agent learn from the interaction.

        The MAB agent learns from its previous choice and the reward received
        from this action. Updates the means and the trials.

        Parameters
        ----------
        reward : float
            Reward received from the system after taking action a_idx.
        a_idx : int
            Index of the arm pulled (action taken).

        """
        self.means[a_idx] = (
            self.means[a_idx] * self.trials[a_idx] + reward
        ) / (self.trials[a_idx] + 1)
        self.trials[a_idx] += 1  # add trial

[docs]    def predict(self):
        r"""Predict next action and update epsilon.

        With probability :math:`\epsilon` the agent selects a random arm.
        With probability :math:`1 - \epsilon` the agent selects the arm that
        has the best average reward.

        Returns
        -------
        int
            Index of chosen action.

        """
        if np.random.rand() < self.epsilon:
            a_idx = np.random.randint(low=0, high=self.n_arms)
        else:
            a_idx = self.means.argmax()
        self.epsilon = self.epsilon * self.decay
        return a_idx