# Source code for bandit.bandit

"""Bandit agents that implement various strategies."""

from typing import List, Tuple, Union

from abc import ABC, abstractmethod

from bandit.environment import Environment

import numpy as np

class BaseBandit(ABC):
    """
    Base class for all bandit agents.

    Keeps a value estimate and a selection count for every action of the
    environment, together with the full history of choices and rewards.
    Subclasses provide the selection strategy by implementing
    `choose_action`.

    Args:
        environment (Environment): environment to act on; it is used through
            `len(environment)` and `environment.action(choice)`
        values (List[float]): initial value estimate of each action. When
            omitted, every action starts at `0.0`. The list is copied, so
            the caller's list is never modified by later updates.
    """

    def __init__(
        self,
        environment: Environment,
        values: Union[List[float], None] = None,
    ):
        self.environment = environment
        n_actions = len(self.environment)
        if values is None:
            self.values = [0.0] * n_actions
        else:
            # Copy: the estimates are updated in place on every action and
            # must not leak back into the list the caller handed in.
            self.values = list(values)
        self.n_selections = np.zeros(n_actions, dtype=np.int32)
        self.reward_history = []
        self.choice_history = []

    def __len__(self):
        """Return the number of actions taken so far."""
        return len(self.choice_history)

    @abstractmethod
    def choose_action(self, *args, **kwargs) -> int:
        """Return the index of the action to take next."""
        pass  # pragma: no cover

    def update_history_and_values(
        self, choice: int, reward: Union[float, int]
    ) -> None:
        """
        Update the histories and the value estimates.

        This base class assumes a sample mean estimate for the values.
        Different strategies require overwriting this function.

        Args:
            choice (int): choice of action taken
            reward (Union[float, int]): reward received
        """
        self.n_selections[choice] += 1
        # Incremental sample mean: move the estimate towards the new reward
        # by 1/n of the current error.
        self.values[choice] += float(reward - self.values[choice]) / (
            self.n_selections[choice]
        )
        self.choice_history.append(choice)
        self.reward_history.append(reward)

    def action(self, i: Union[int, None] = None) -> float:
        """
        Take an action.

        Args:
            i (int): action to take. When omitted, the action is selected
                by `choose_action`.

        Returns:
            (float) reward of the taken action
        """
        # Honour an explicitly requested action; only fall back to the
        # strategy when none was given.
        choice = self.choose_action() if i is None else i
        reward = self.environment.action(choice)
        self.update_history_and_values(choice, reward)
        return reward

    @property
    def history(self) -> Tuple[List, List]:
        """Return the `(reward_history, choice_history)` pair."""
        return (self.reward_history, self.choice_history)
class CustomBandit(BaseBandit):
    """
    Wrapper around the `BaseBandit` for creating custom bandit subclasses.

    `choose_action` is defined here but only raises, so a subclass is
    expected to override it with its own strategy.
    """

    def choose_action(self, *args, **kwargs) -> int:
        """Always raise `NotImplementedError`; subclasses override this."""
        raise NotImplementedError
class RandomBandit(BaseBandit):
    """
    A totally random bandit with no strategy.

    Actions are selected randomly.
    """

    def choose_action(self, *args, **kwargs) -> int:
        """
        Choose a random action.

        The index is drawn with `np.random.randint` from `0` (inclusive) to
        the number of actions in the environment (exclusive).

        Returns:
            (int) action choice
        """
        n_actions = len(self.environment)
        return np.random.randint(low=0, high=n_actions)
class GreedyBandit(BaseBandit):
    """
    Greedy bandit that always selects the optimally valued action.
    """

    def choose_action(self, *args, **kwargs) -> int:
        """
        Choose the action with the highest value.

        In case of any ties, return a random selection among the tied
        actions.

        Returns:
            (int) action choice
        """
        # Compare as an array explicitly instead of relying on a list being
        # compared element-wise against a numpy scalar.
        estimates = np.asarray(self.values)
        best_actions = np.flatnonzero(estimates == estimates.max())
        # np.random.choice yields a numpy integer; convert so the declared
        # `int` return type holds.
        return int(np.random.choice(best_actions))
class EpsGreedyBandit(BaseBandit):
    """
    Epsilon-Greedy bandit, that makes a random choice 100*epsilon percent of
    the time for exploration and acts greedily the rest of the time.

    Args:
        environment (Environment): environment to act on
        eps (float): fraction of time taking exploratory actions
        values (List[float]): initial value estimate of each action, passed
            on to `BaseBandit`
    """

    def __init__(
        self,
        environment: Environment,
        eps: float,
        values: Union[List[float], None] = None,
    ):
        super().__init__(environment, values)
        self.eps = eps

    def choose_action(self, *args, **kwargs) -> int:
        """
        Choose a random action `100*self.eps` percent of the time and
        otherwise take greedy actions.

        Greedy ties are broken by a random selection among the tied actions.

        Returns:
            (int) action choice
        """
        if np.random.rand() < self.eps:
            # random step; the draw itself is unchanged, only the numpy
            # integer it yields is converted to a plain int
            return int(
                np.random.randint(len(self.environment), dtype=np.int32)
            )
        # greedy step
        estimates = np.asarray(self.values)
        best_actions = np.flatnonzero(estimates == estimates.max())
        return int(np.random.choice(best_actions))