
Commit 11adba0

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent 31f188d commit 11adba0

1 file changed: +82 -67 lines changed

machine_learning/Multi-Armed Bandits .py

Lines changed: 82 additions & 67 deletions
The file after this commit:

import numpy as np  # reconstructed: line 1 is not visible in this commit view but is required throughout
import matplotlib.pyplot as plt
from abc import ABC, abstractmethod


class BanditAlgorithm(ABC):
    """Base class for bandit algorithms"""

    def __init__(self, n_arms):
        self.n_arms = n_arms
        self.reset()

    def reset(self):
        self.counts = np.zeros(self.n_arms)
        self.rewards = np.zeros(self.n_arms)
        self.t = 0

    @abstractmethod
    def select_arm(self):
        pass

    def update(self, arm, reward):
        self.t += 1
        self.counts[arm] += 1
        self.rewards[arm] += reward


class EpsilonGreedy(BanditAlgorithm):
    """Epsilon-Greedy Algorithm"""

    def __init__(self, n_arms, epsilon=0.1):
        super().__init__(n_arms)
        self.epsilon = epsilon

    def select_arm(self):
        if np.random.random() < self.epsilon:
            # Explore: random arm
            return np.random.randint(self.n_arms)
        else:
            # Exploit: best arm so far
            avg_rewards = np.divide(
                self.rewards,
                self.counts,
                out=np.zeros_like(self.rewards),
                where=self.counts != 0,
            )
            return np.argmax(avg_rewards)


class UCB(BanditAlgorithm):
    """Upper Confidence Bound Algorithm"""

    def __init__(self, n_arms, c=2.0):
        super().__init__(n_arms)
        self.c = c

    def select_arm(self):
        # If any arm hasn't been tried, try it
        if 0 in self.counts:
            return np.where(self.counts == 0)[0][0]

        # Calculate UCB values
        avg_rewards = self.rewards / self.counts
        confidence = self.c * np.sqrt(np.log(self.t) / self.counts)
        ucb_values = avg_rewards + confidence

        return np.argmax(ucb_values)


class ThompsonSampling(BanditAlgorithm):
    """Thompson Sampling (Beta-Bernoulli)"""

    def __init__(self, n_arms):
        super().__init__(n_arms)
        self.alpha = np.ones(n_arms)  # Prior successes
        self.beta = np.ones(n_arms)  # Prior failures

    def select_arm(self):
        # Sample from Beta distribution for each arm
        samples = np.random.beta(self.alpha, self.beta)
        return np.argmax(samples)

    def update(self, arm, reward):
        super().update(arm, reward)
        # Update Beta parameters
        if reward > 0:  # success (condition reconstructed; not visible in this commit view)
            self.alpha[arm] += 1
        else:
            self.beta[arm] += 1


class GradientBandit(BanditAlgorithm):
    """Gradient Bandit Algorithm"""

    def __init__(self, n_arms, alpha=0.1):
        super().__init__(n_arms)
        self.alpha = alpha
        self.preferences = np.zeros(n_arms)
        self.avg_reward = 0

    def select_arm(self):
        # Softmax to get probabilities
        exp_prefs = np.exp(self.preferences - np.max(self.preferences))
        probs = exp_prefs / np.sum(exp_prefs)
        return np.random.choice(self.n_arms, p=probs)

    def update(self, arm, reward):
        super().update(arm, reward)

        # Update average reward
        self.avg_reward += (reward - self.avg_reward) / self.t

        # Get action probabilities
        exp_prefs = np.exp(self.preferences - np.max(self.preferences))
        probs = exp_prefs / np.sum(exp_prefs)

        # Update preferences
        for a in range(self.n_arms):
            if a == arm:
                self.preferences[a] += (
                    self.alpha * (reward - self.avg_reward) * (1 - probs[a])
                )
            else:
                self.preferences[a] -= (
                    self.alpha * (reward - self.avg_reward) * probs[a]
                )


# Testbed for comparing algorithms
class BanditTestbed:
    """Environment for testing bandit algorithms"""

    def __init__(self, n_arms=10, true_rewards=None):
        self.n_arms = n_arms
        if true_rewards is None:
            self.true_rewards = np.random.normal(0, 1, n_arms)
        else:
            self.true_rewards = true_rewards
        self.optimal_arm = np.argmax(self.true_rewards)

    def get_reward(self, arm):
        """Get noisy reward for pulling an arm"""
        return np.random.normal(self.true_rewards[arm], 1)

    def run_experiment(self, algorithm, n_steps=1000):
        """Run bandit algorithm for n_steps"""
        algorithm.reset()
        rewards = []
        optimal_actions = []

        for _ in range(n_steps):
            arm = algorithm.select_arm()
            reward = self.get_reward(arm)
            algorithm.update(arm, reward)

            rewards.append(reward)
            optimal_actions.append(1 if arm == self.optimal_arm else 0)

        return np.array(rewards), np.array(optimal_actions)


# Example usage and comparison
def compare_algorithms():
    """Compare different bandit algorithms"""

    # Create testbed
    testbed = BanditTestbed(n_arms=10)

    # Initialize algorithms
    algorithms = {
        "ε-greedy (0.1)": EpsilonGreedy(10, epsilon=0.1),
        "ε-greedy (0.01)": EpsilonGreedy(10, epsilon=0.01),
        "UCB (c=2)": UCB(10, c=2),
        "Thompson Sampling": ThompsonSampling(10),
        "Gradient Bandit": GradientBandit(10, alpha=0.1),
    }

    n_steps = 2000
    n_runs = 100

    results = {}

    for name, algorithm in algorithms.items():
        print(f"Running {name}...")
        avg_rewards = np.zeros(n_steps)
        optimal_actions = np.zeros(n_steps)

        for run in range(n_runs):
            rewards, optimal = testbed.run_experiment(algorithm, n_steps)
            avg_rewards += rewards
            optimal_actions += optimal

        avg_rewards /= n_runs
        optimal_actions /= n_runs

        results[name] = {"rewards": avg_rewards, "optimal_actions": optimal_actions}

    # Plot results
    plt.figure(figsize=(15, 5))

    # Average reward over time
    plt.subplot(1, 2, 1)
    for name, result in results.items():
        plt.plot(np.cumsum(result["rewards"]) / np.arange(1, n_steps + 1), label=name)
    plt.xlabel("Steps")
    plt.ylabel("Average Reward")
    plt.title("Average Reward vs Steps")
    plt.legend()
    plt.grid(True)

    # Percentage of optimal actions
    plt.subplot(1, 2, 2)
    for name, result in results.items():
        plt.plot(
            np.cumsum(result["optimal_actions"]) / np.arange(1, n_steps + 1) * 100,
            label=name,
        )
    plt.xlabel("Steps")
    plt.ylabel("% Optimal Action")
    plt.title("Optimal Action Selection vs Steps")
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.show()

    return results


# Run the comparison
if __name__ == "__main__":
    results = compare_algorithms()

    # Print final performance
    print("\nFinal Performance (last 100 steps):")
    for name, result in results.items():
        avg_reward = np.mean(result["rewards"][-100:])
        optimal_pct = np.mean(result["optimal_actions"][-100:]) * 100
        print(
            f"{name:20s}: Avg Reward = {avg_reward:.3f}, Optimal = {optimal_pct:.1f}%"
        )
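
For a quick sanity check of the interfaces above, a minimal usage sketch (not part of the commit; it assumes the classes are available in the current module, and the agent choice, seed, and step count are illustrative): run one ε-greedy agent on a fresh 10-armed testbed and report its average reward and optimal-arm rate.

# Illustrative usage sketch; not part of the commit.
import numpy as np

np.random.seed(0)  # arbitrary seed for a repeatable run

testbed = BanditTestbed(n_arms=10)  # 10 arms with Gaussian true rewards
agent = EpsilonGreedy(n_arms=10, epsilon=0.1)
rewards, optimal = testbed.run_experiment(agent, n_steps=1000)

print(f"Mean reward over 1000 steps: {rewards.mean():.3f}")
print(f"Optimal arm rate, last 100 steps: {optimal[-100:].mean() * 100:.1f}%")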
