Source code for rl_agents.agents.mab.egreedy

import numpy as np

from rl_agents.agents.mab.base import BaseMAB


class EpsilonGreedy(BaseMAB):
    r"""Multi-armed bandit agent following a fixed epsilon-greedy policy.

    On every prediction the agent draws a uniform random number. If it
    falls below :math:`\epsilon` a uniformly random arm is chosen
    (exploration); otherwise the arm with the highest running average
    reward is chosen (exploitation).

    Parameters
    ----------
    n_arms : int
        Number of actions (arms) of the MAB.
    epsilon : float
        Probability of selecting a random action.

    Attributes
    ----------
    epsilon : float
        Exploration probability, stored as given.
    n_arms : int
        Number of arms, stored as given.
    means : numpy.ndarray
        1-D float vector holding the average reward observed for each arm.
    trials : numpy.ndarray
        1-D float vector holding how many times each arm has been pulled.
    """

    def __init__(self, n_arms, epsilon):
        self.epsilon = epsilon
        self.n_arms = n_arms
        # Both statistics start at zero: no pulls and no reward observed yet.
        self.trials = np.zeros(self.n_arms)
        self.means = np.zeros(self.n_arms)

    def learn(self, a_idx, reward):
        """Update the statistics of one arm with a newly observed reward.

        The running average of the arm is recomputed so that it includes
        ``reward``, and the pull counter of the arm is incremented.

        Parameters
        ----------
        a_idx : int
            Index of the arm pulled (action taken).
        reward : float
            Reward received from the system after taking action ``a_idx``.
        """
        pulls_so_far = self.trials[a_idx]
        # Rebuild the reward sum from the stored mean, then add the new one.
        reward_sum = self.means[a_idx] * pulls_so_far + reward
        self.means[a_idx] = reward_sum / (pulls_so_far + 1)
        self.trials[a_idx] += 1

    def predict(self):
        r"""Choose the next arm to pull.

        With probability :math:`\epsilon` a uniformly random arm is
        returned. With probability :math:`1 - \epsilon` the arm with the
        best average reward is returned; ``argmax`` resolves ties in
        favour of the lowest index.

        Returns
        -------
        int
            Index of chosen action.
        """
        if np.random.rand() < self.epsilon:
            # Explore: any arm, uniformly at random.
            return np.random.randint(low=0, high=self.n_arms)
        # Exploit: arm with the highest running mean.
        return self.means.argmax()
class DecayEpsilon(BaseMAB):
    r"""Multi-armed bandit agent following an epsilon-decreasing policy.

    The agent behaves like an epsilon-greedy agent: with probability
    :math:`\epsilon` it selects a random arm, otherwise it selects the
    arm with the best average reward. In addition, every call to
    :meth:`predict` multiplies the current epsilon by ``decay``.

    Parameters
    ----------
    n_arms : int
        Number of actions (arms) of the MAB.
    max_epsilon : float
        Initial epsilon.
    decay : float
        Factor applied to epsilon after each prediction.

    Attributes
    ----------
    epsilon : float
        Current epsilon of the agent; updated as
        ``epsilon = epsilon * decay`` in :meth:`predict`.
    n_arms : int
        Number of arms, stored as given.
    decay : float
        Decay factor, stored as given.
    means : numpy.ndarray
        1-D float vector holding the average reward observed for each arm.
    trials : numpy.ndarray
        1-D float vector holding how many times each arm has been pulled.
    """

    def __init__(self, n_arms, max_epsilon, decay):
        self.n_arms = n_arms
        self.decay = decay
        # Exploration starts at its maximum and shrinks with every predict().
        self.epsilon = max_epsilon
        self.trials = np.zeros(self.n_arms)
        self.means = np.zeros(self.n_arms)

    def learn(self, a_idx, reward):
        """Update the statistics of one arm with a newly observed reward.

        The running average of the arm is recomputed so that it includes
        ``reward``, and the pull counter of the arm is incremented.
        Epsilon is not touched here.

        Parameters
        ----------
        a_idx : int
            Index of the arm pulled (action taken).
        reward : float
            Reward received from the system after taking action ``a_idx``.
        """
        pulls_so_far = self.trials[a_idx]
        # Rebuild the reward sum from the stored mean, then add the new one.
        reward_sum = self.means[a_idx] * pulls_so_far + reward
        self.means[a_idx] = reward_sum / (pulls_so_far + 1)
        self.trials[a_idx] += 1

    def predict(self):
        r"""Choose the next arm to pull, then decay epsilon.

        With probability :math:`\epsilon` a uniformly random arm is
        chosen. With probability :math:`1 - \epsilon` the arm with the
        best average reward is chosen; ``argmax`` resolves ties in favour
        of the lowest index. After the choice is made, epsilon is
        multiplied by ``decay``.

        Returns
        -------
        int
            Index of chosen action.
        """
        chosen = (
            np.random.randint(low=0, high=self.n_arms)  # explore
            if np.random.rand() < self.epsilon
            else self.means.argmax()  # exploit
        )
        # Decay only after the arm was selected with the current epsilon.
        self.epsilon = self.epsilon * self.decay
        return chosen