Source code for bandit.posse

"""
A gang of bandit agents for easily performing testing en masse.
"""

from typing import List, Type, Union

import numpy as np

from bandit.bandit import BaseBandit
from bandit.environment import Environment


class Posse:
    """
    A posse of bandits that all sample the same environment for the same
    number of steps.

    Args:
        environment (Environment): the environment that the bandits sample
        bandit_class (Type[BaseBandit]): the kind of bandit to create
        n_bandits (int): the number of bandits to create
        bandit_kwargs (dict): dictionary of arguments to pass to the bandits
    """

    def __init__(
        self,
        environment: Environment,
        bandit_class: Type[BaseBandit],
        n_bandits: int,
        **bandit_kwargs,
    ):
        self.environment: Environment = environment
        self.n_bandits: int = n_bandits
        # Instantiate `n_bandits` independent bandits that all share the
        # same environment and constructor arguments.
        self.bandits: List[BaseBandit] = [
            bandit_class(self.environment, **bandit_kwargs)
            for _ in range(n_bandits)
        ]
        self._n_actions_taken = 0
    def take_actions(self, n_actions: int) -> None:
        """
        Take `n_actions` actions for each bandit in the posse.

        Args:
            n_actions (int): number of actions to take
        """
        for _ in range(n_actions):
            for b in self.bandits:
                b.action()
        self._n_actions_taken += n_actions
        # Reset the cached histories to mark them stale; they are rebuilt
        # lazily by `_update_histories` the next time a statistic is computed.
        self.reward_histories = np.array([[]])
        self.choice_histories = np.array([[]])
    def __len__(self) -> int:
        return self._n_actions_taken

    @property
    def n_actions_taken(self) -> int:
        return self._n_actions_taken

    @property
    def len_env(self) -> int:
        return len(self.environment)

    @property
    def n_rewards(self) -> int:
        return self.len_env

    def _update_histories(self) -> None:
        """
        Rebuild the cached reward and choice histories from every bandit.
        """
        self.reward_histories = np.array(
            [b.reward_history for b in self.bandits]
        )
        self.choice_histories = np.array(
            [b.choice_history for b in self.bandits]
        )
    def mean_reward(self) -> np.ndarray:
        """
        Average reward at each time computed over all bandits.
        """
        if self.n_actions_taken > len(self.reward_histories[0]):
            self._update_histories()
        return np.mean(self.reward_histories, axis=0)
    def var_reward(self) -> np.ndarray:
        """
        Variance at each time of the reward computed over all bandits.
        """
        if self.n_actions_taken > len(self.reward_histories[0]):
            self._update_histories()
        return np.var(self.reward_histories, axis=0)
    def mean_best_choice(
        self,
        best_choice: Union[int, List, np.ndarray],
    ) -> np.ndarray:
        """
        Average of the best choice at each time computed over all bandits.

        Args:
            best_choice (Union[int, List[int], np.ndarray]): if int, the best
                choice for all times. If list or `np.ndarray`, the best choice
                at each time step.
        """
        if self.n_actions_taken > len(self.reward_histories[0]):
            self._update_histories()
        if type(best_choice) in [list, np.ndarray]:
            msg = (
                "len(best_choice) must equal the length of the bandits' "
                "choice history"
            )
            assert len(best_choice) == len(self.choice_histories[0]), msg
            where_best = self.choice_histories == np.asarray(
                best_choice, dtype=np.int32
            )
        elif np.issubdtype(type(best_choice), np.integer):
            where_best = self.choice_histories == best_choice
        else:
            msg = f"best_choice must be int, list, np.ndarray but {type(best_choice)} provided"  # noqa: E501
            raise TypeError(msg)
        return np.mean(where_best, axis=0)
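    # Illustrative sketch: `posse.mean_best_choice(0)` scores every time step
    # against the same arm, while passing a hypothetical per-step optimum,
    # e.g. `posse.mean_best_choice(best_per_step)` with `best_per_step` an
    # int array as long as the choice history, scores each step against its
    # own best arm.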
    def var_best_choice(
        self,
        best_choice: Union[int, List, np.ndarray],
    ) -> np.ndarray:
        """
        Variance of the best choice at each time computed over all bandits.

        Args:
            best_choice (Union[int, List[int], np.ndarray]): if int, the best
                choice for all times. If list or `np.ndarray`, the best choice
                at each time step.
        """
        if self.n_actions_taken > len(self.reward_histories[0]):
            self._update_histories()
        if type(best_choice) in [list, np.ndarray]:
            msg = (
                "len(best_choice) must equal the length of the bandits' "
                "choice history"
            )
            assert len(best_choice) == len(self.choice_histories[0]), msg
            where_best = self.choice_histories == np.asarray(
                best_choice, dtype=np.int32
            )
        elif np.issubdtype(type(best_choice), np.integer):
            where_best = self.choice_histories == best_choice
        else:
            msg = f"best_choice must be int, list, np.ndarray but {type(best_choice)} provided"  # noqa: E501
            raise TypeError(msg)
        return np.var(where_best, axis=0)
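
# Usage sketch. `SomeBandit` and the constructor arguments below are
# hypothetical placeholders; substitute the concrete `BaseBandit` subclass
# and `Environment` setup provided by your installation of the package.
#
#     env = Environment(...)                    # an Environment instance
#     posse = Posse(env, SomeBandit, n_bandits=100)
#     posse.take_actions(1000)                  # each bandit acts 1000 times
#     mean = posse.mean_reward()                # per-step average reward across bandits
#     var = posse.var_reward()                  # per-step reward variance across bandits
#     frac = posse.mean_best_choice(0)          # per-step fraction of bandits choosing arm 0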