Commit 31f188d

Implement epsilon-greedy, UCB, Thompson sampling, and gradient bandit algorithms with a comparison testbed
1 parent e7e7ad1 commit 31f188d

1 file changed: 226 additions & 0 deletions
@@ -0,0 +1,226 @@
import numpy as np
import matplotlib.pyplot as plt
from abc import ABC, abstractmethod

class BanditAlgorithm(ABC):
    """Base class for bandit algorithms"""

    def __init__(self, n_arms):
        self.n_arms = n_arms
        self.reset()

    def reset(self):
        self.counts = np.zeros(self.n_arms)
        self.rewards = np.zeros(self.n_arms)
        self.t = 0

    @abstractmethod
    def select_arm(self):
        pass

    def update(self, arm, reward):
        self.t += 1
        self.counts[arm] += 1
        self.rewards[arm] += reward

class EpsilonGreedy(BanditAlgorithm):
    """Epsilon-Greedy Algorithm"""

    def __init__(self, n_arms, epsilon=0.1):
        super().__init__(n_arms)
        self.epsilon = epsilon

    def select_arm(self):
        if np.random.random() < self.epsilon:
            # Explore: random arm
            return np.random.randint(self.n_arms)
        else:
            # Exploit: best arm so far
            avg_rewards = np.divide(self.rewards, self.counts,
                                    out=np.zeros_like(self.rewards),
                                    where=self.counts != 0)
            return np.argmax(avg_rewards)

class UCB(BanditAlgorithm):
    """Upper Confidence Bound Algorithm"""

    def __init__(self, n_arms, c=2.0):
        super().__init__(n_arms)
        self.c = c

    def select_arm(self):
        # If any arm hasn't been tried, try it
        if 0 in self.counts:
            return np.where(self.counts == 0)[0][0]

        # Calculate UCB values
        avg_rewards = self.rewards / self.counts
        confidence = self.c * np.sqrt(np.log(self.t) / self.counts)
        ucb_values = avg_rewards + confidence

        return np.argmax(ucb_values)

class ThompsonSampling(BanditAlgorithm):
    """Thompson Sampling (Beta-Bernoulli)"""

    def __init__(self, n_arms):
        super().__init__(n_arms)

    def reset(self):
        # Re-initialize the Beta priors as well, so repeated experiments start fresh
        super().reset()
        self.alpha = np.ones(self.n_arms)  # Prior successes
        self.beta = np.ones(self.n_arms)   # Prior failures

    def select_arm(self):
        # Sample from each arm's Beta posterior and play the arm with the largest sample
        samples = np.random.beta(self.alpha, self.beta)
        return np.argmax(samples)

    def update(self, arm, reward):
        super().update(arm, reward)
        # Update Beta parameters, treating reward > 0 as a Bernoulli success
        # (a simplification for the Gaussian testbed rewards)
        if reward > 0:
            self.alpha[arm] += 1
        else:
            self.beta[arm] += 1

class GradientBandit(BanditAlgorithm):
    """Gradient Bandit Algorithm"""

    def __init__(self, n_arms, alpha=0.1):
        super().__init__(n_arms)
        self.alpha = alpha  # Preference step size

    def reset(self):
        # Reset the preferences and reward baseline as well, so repeated experiments start fresh
        super().reset()
        self.preferences = np.zeros(self.n_arms)
        self.avg_reward = 0.0

    def select_arm(self):
        # Softmax over preferences to get action probabilities
        exp_prefs = np.exp(self.preferences - np.max(self.preferences))
        probs = exp_prefs / np.sum(exp_prefs)
        return np.random.choice(self.n_arms, p=probs)

    def update(self, arm, reward):
        super().update(arm, reward)

        # Update the incremental average reward (baseline)
        self.avg_reward += (reward - self.avg_reward) / self.t

        # Get action probabilities
        exp_prefs = np.exp(self.preferences - np.max(self.preferences))
        probs = exp_prefs / np.sum(exp_prefs)

        # Update preferences: raise the chosen arm and lower the others,
        # in proportion to how much the reward beats the baseline
        for a in range(self.n_arms):
            if a == arm:
                self.preferences[a] += self.alpha * (reward - self.avg_reward) * (1 - probs[a])
            else:
                self.preferences[a] -= self.alpha * (reward - self.avg_reward) * probs[a]

# Testbed for comparing algorithms
class BanditTestbed:
    """Environment for testing bandit algorithms"""

    def __init__(self, n_arms=10, true_rewards=None):
        self.n_arms = n_arms
        if true_rewards is None:
            self.true_rewards = np.random.normal(0, 1, n_arms)
        else:
            self.true_rewards = true_rewards
        self.optimal_arm = np.argmax(self.true_rewards)

    def get_reward(self, arm):
        """Get noisy reward for pulling an arm"""
        return np.random.normal(self.true_rewards[arm], 1)

    def run_experiment(self, algorithm, n_steps=1000):
        """Run bandit algorithm for n_steps"""
        algorithm.reset()
        rewards = []
        optimal_actions = []

        for _ in range(n_steps):
            arm = algorithm.select_arm()
            reward = self.get_reward(arm)
            algorithm.update(arm, reward)

            rewards.append(reward)
            optimal_actions.append(1 if arm == self.optimal_arm else 0)

        return np.array(rewards), np.array(optimal_actions)

# Example usage and comparison
def compare_algorithms():
    """Compare different bandit algorithms"""

    # Create testbed
    testbed = BanditTestbed(n_arms=10)

    # Initialize algorithms
    algorithms = {
        'ε-greedy (0.1)': EpsilonGreedy(10, epsilon=0.1),
        'ε-greedy (0.01)': EpsilonGreedy(10, epsilon=0.01),
        'UCB (c=2)': UCB(10, c=2),
        'Thompson Sampling': ThompsonSampling(10),
        'Gradient Bandit': GradientBandit(10, alpha=0.1)
    }

    n_steps = 2000
    n_runs = 100

    results = {}

    for name, algorithm in algorithms.items():
        print(f"Running {name}...")
        avg_rewards = np.zeros(n_steps)
        optimal_actions = np.zeros(n_steps)

        for _ in range(n_runs):
            rewards, optimal = testbed.run_experiment(algorithm, n_steps)
            avg_rewards += rewards
            optimal_actions += optimal

        avg_rewards /= n_runs
        optimal_actions /= n_runs

        results[name] = {
            'rewards': avg_rewards,
            'optimal_actions': optimal_actions
        }

    # Plot results
    plt.figure(figsize=(15, 5))

    # Average reward over time
    plt.subplot(1, 2, 1)
    for name, result in results.items():
        plt.plot(np.cumsum(result['rewards']) / np.arange(1, n_steps + 1),
                 label=name)
    plt.xlabel('Steps')
    plt.ylabel('Average Reward')
    plt.title('Average Reward vs Steps')
    plt.legend()
    plt.grid(True)

    # Percentage of optimal actions
    plt.subplot(1, 2, 2)
    for name, result in results.items():
        plt.plot(np.cumsum(result['optimal_actions']) / np.arange(1, n_steps + 1) * 100,
                 label=name)
    plt.xlabel('Steps')
    plt.ylabel('% Optimal Action')
    plt.title('Optimal Action Selection vs Steps')
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.show()

    return results

# Run the comparison
if __name__ == "__main__":
    results = compare_algorithms()

    # Print final performance
    print("\nFinal Performance (last 100 steps):")
    for name, result in results.items():
        avg_reward = np.mean(result['rewards'][-100:])
        optimal_pct = np.mean(result['optimal_actions'][-100:]) * 100
        print(f"{name:20s}: Avg Reward = {avg_reward:.3f}, Optimal = {optimal_pct:.1f}%")
