Source code for bandit.bandit

"""
Bandit agents that implement various strategies.
"""

from abc import ABC, abstractmethod
from typing import List, Tuple, Union

import numpy as np

from bandit.environment import Environment


class BaseBandit(ABC):
    """
    Base class for all bandit agents.
    """

    def __init__(self, environment: Environment, values: List[float] = None):
        self.environment = environment
        if values is None:
            self.values = [0.0] * len(self.environment)
        else:
            self.values = values
        self.n_selections = np.zeros(len(self.environment), dtype=np.int32)
        self.reward_history = []
        self.choice_history = []

    def __len__(self):
        return len(self.choice_history)

    @abstractmethod
    def choose_action(self, *args, **kwargs) -> int:
        pass  # pragma: no cover
    def update_history_and_values(
        self, choice: int, reward: Union[float, int]
    ) -> None:
        """
        Update the histories and the value estimates. This base class assumes
        a sample-mean estimate for the values. Different strategies require
        overriding this method.

        Args:
            choice (int): choice of action taken
            reward (Union[float, int]): reward received
        """
        self.n_selections[choice] += 1
        self.values[choice] += float(reward - self.values[choice]) / (
            self.n_selections[choice]
        )
        self.choice_history.append(choice)
        self.reward_history.append(reward)
        return
    def action(self, i: int = None) -> float:
        """
        Take an action.

        Args:
            i (int): index of the action to take; if None, the bandit's
                strategy chooses via `choose_action`

        Returns:
            (float) reward of the taken action
        """
        choice = self.choose_action() if i is None else i
        reward = self.environment.action(choice)
        self.update_history_and_values(choice, reward)
        return reward
    @property
    def history(self) -> Tuple[List, List]:
        return (self.reward_history, self.choice_history)
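
The update in `update_history_and_values` is the usual incremental running-mean rule, Q_n = Q_{n-1} + (r_n - Q_{n-1}) / n. The snippet below is a standalone sanity check (not part of this module) showing that the rule reproduces the plain sample mean of the rewards seen for one arm:

# Standalone check, not part of bandit.bandit.
rewards = [1.0, 0.0, 1.0, 1.0]
q, n = 0.0, 0
for r in rewards:
    n += 1
    q += (r - q) / n          # same rule as update_history_and_values
assert abs(q - sum(rewards) / len(rewards)) < 1e-12   # both equal 0.75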
class CustomBandit(BaseBandit):
    """
    Wrapper around `BaseBandit` for creating custom bandit subclasses.
    """

    def choose_action(self, *args, **kwargs) -> int:
        raise NotImplementedError
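
One way `CustomBandit` could be used is to subclass it and override only `choose_action`. The sketch below is illustrative, not part of the library; the `SoftmaxBandit` name and `temperature` parameter are invented here. It implements softmax action selection over the current value estimates:

# Illustrative subclass, not part of bandit.bandit.
import numpy as np

class SoftmaxBandit(CustomBandit):
    def __init__(self, environment, temperature: float = 1.0, values=None):
        super().__init__(environment, values)
        self.temperature = temperature

    def choose_action(self, *args, **kwargs) -> int:
        # Convert the value estimates into softmax probabilities.
        prefs = np.asarray(self.values, dtype=float) / self.temperature
        prefs -= prefs.max()                          # numerical stability
        probs = np.exp(prefs) / np.exp(prefs).sum()
        return int(np.random.choice(len(probs), p=probs))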
class RandomBandit(BaseBandit):
    """
    A bandit with no strategy: every action is selected uniformly at random.
    """
    def choose_action(self, *args, **kwargs) -> int:
        """
        Choose a random action.

        Returns:
            (int) action choice
        """
        return np.random.randint(0, len(self.environment))
class GreedyBandit(BaseBandit):
    """
    Greedy bandit that always selects the action with the highest estimated
    value.
    """
    def choose_action(self, *args, **kwargs) -> int:
        """
        Choose the action with the highest value. In case of any ties,
        return a random selection.

        Returns:
            (int) action choice
        """
        return np.random.choice(
            np.where(self.values == np.max(self.values))[0]
        )
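
The tie-break above relies on `np.where` returning every index that attains the maximum, from which one is drawn uniformly. A standalone illustration (the values are arbitrary):

# Standalone illustration, not part of bandit.bandit.
import numpy as np

values = [0.5, 0.9, 0.9, 0.1]
ties = np.where(values == np.max(values))[0]   # array([1, 2])
choice = int(np.random.choice(ties))           # 1 or 2, with equal probability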
class EpsGreedyBandit(BaseBandit):
    """
    Epsilon-greedy bandit that makes a random choice 100*epsilon percent of
    the time for exploration and acts greedily the rest of the time.

    Args:
        eps (float): fraction of the time spent taking exploratory actions
    """

    def __init__(
        self, environment: Environment, eps: float, values: List[float] = None
    ):
        super().__init__(environment, values)
        self.eps = eps
    def choose_action(self, *args, **kwargs) -> int:
        """
        Choose a random action `100*self.eps` percent of the time and
        otherwise take greedy actions.

        Returns:
            (int) action choice
        """
        if np.random.rand() < self.eps:
            # random step
            return np.random.randint(len(self.environment), dtype=np.int32)
        else:
            # greedy step
            return np.random.choice(
                np.where(self.values == np.max(self.values))[0]
            )
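
A possible end-to-end run of `EpsGreedyBandit`. The `Environment([...])` constructor signature below is an assumption made for illustration (a list of per-arm reward parameters); consult `bandit.environment` for the actual interface:

# Hypothetical usage sketch, not part of bandit.bandit.
from bandit.environment import Environment

env = Environment([0.2, 0.5, 0.8])    # ASSUMED signature: one reward parameter per arm
agent = EpsGreedyBandit(env, eps=0.1)

for _ in range(1000):
    agent.action()                    # choose an arm, collect the reward, update estimates

rewards, choices = agent.history
print(agent.values)                   # estimates should approach the true arm values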