import matplotlib.pyplot as plt
from abc import ABC, abstractmethod


class BanditAlgorithm(ABC):
    """Base class for bandit algorithms"""

    def __init__(self, n_arms):
        self.n_arms = n_arms
        self.reset()

    def reset(self):
        self.counts = np.zeros(self.n_arms)
        self.rewards = np.zeros(self.n_arms)
        self.t = 0

    @abstractmethod
    def select_arm(self):
        pass

    def update(self, arm, reward):
        self.t += 1
        self.counts[arm] += 1
        self.rewards[arm] += reward


class EpsilonGreedy(BanditAlgorithm):
    """Epsilon-Greedy Algorithm"""

    def __init__(self, n_arms, epsilon=0.1):
        super().__init__(n_arms)
        self.epsilon = epsilon

    def select_arm(self):
        if np.random.random() < self.epsilon:
            # Explore: random arm
            return np.random.randint(self.n_arms)
        else:
            # Exploit: best arm so far (untried arms get an average of 0,
            # avoiding division by zero)
            avg_rewards = np.divide(
                self.rewards,
                self.counts,
                out=np.zeros_like(self.rewards),
                where=self.counts != 0,
            )
            return np.argmax(avg_rewards)
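
# EpsilonGreedy explores uniformly at random with probability epsilon and
# otherwise exploits the arm with the highest empirical mean reward; a smaller
# epsilon explores less but is slower to discover the best arm.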


class UCB(BanditAlgorithm):
    """Upper Confidence Bound Algorithm"""

    def __init__(self, n_arms, c=2.0):
        super().__init__(n_arms)
        self.c = c

    def select_arm(self):
        # If any arm hasn't been tried, try it
        if 0 in self.counts:
            return np.where(self.counts == 0)[0][0]

        # Calculate UCB values
        avg_rewards = self.rewards / self.counts
        confidence = self.c * np.sqrt(np.log(self.t) / self.counts)
        ucb_values = avg_rewards + confidence

        return np.argmax(ucb_values)
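
# UCB selects argmax_a [ Q(a) + c * sqrt(ln t / N(a)) ]: the confidence bonus
# shrinks as an arm is pulled more often, so uncertain arms keep getting
# revisited while clearly inferior arms are eventually abandoned.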


class ThompsonSampling(BanditAlgorithm):
    """Thompson Sampling (Beta-Bernoulli)"""

    def __init__(self, n_arms):
        super().__init__(n_arms)
        self.alpha = np.ones(n_arms)  # Prior successes
        self.beta = np.ones(n_arms)  # Prior failures

    def select_arm(self):
        # Sample from Beta distribution for each arm
        samples = np.random.beta(self.alpha, self.beta)
        return np.argmax(samples)

    def update(self, arm, reward):
        super().update(arm, reward)
        # Update Beta parameters (treat a positive reward as a success)
        if reward > 0:
            self.alpha[arm] += 1
        else:
            self.beta[arm] += 1
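
# ThompsonSampling keeps a Beta(alpha, beta) posterior over each arm's success
# probability and pulls the arm with the highest posterior sample. Because the
# testbed below emits Gaussian rewards rather than 0/1 outcomes, the update
# binarises the signal by counting any positive reward as a success.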


class GradientBandit(BanditAlgorithm):
    """Gradient Bandit Algorithm"""

    def __init__(self, n_arms, alpha=0.1):
        super().__init__(n_arms)
        self.alpha = alpha
        self.preferences = np.zeros(n_arms)
        self.avg_reward = 0

    def select_arm(self):
        # Softmax to get probabilities
        exp_prefs = np.exp(self.preferences - np.max(self.preferences))
        probs = exp_prefs / np.sum(exp_prefs)
        return np.random.choice(self.n_arms, p=probs)

    def update(self, arm, reward):
        super().update(arm, reward)

        # Update average reward
        self.avg_reward += (reward - self.avg_reward) / self.t

        # Get action probabilities
        exp_prefs = np.exp(self.preferences - np.max(self.preferences))
        probs = exp_prefs / np.sum(exp_prefs)

        # Update preferences
        for a in range(self.n_arms):
            if a == arm:
                self.preferences[a] += (
                    self.alpha * (reward - self.avg_reward) * (1 - probs[a])
                )
            else:
                self.preferences[a] -= (
                    self.alpha * (reward - self.avg_reward) * probs[a]
                )
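
# GradientBandit learns numerical preferences H(a) rather than value estimates:
#   H(a_t) <- H(a_t) + alpha * (R - R_bar) * (1 - pi(a_t))   for the chosen arm
#   H(a)   <- H(a)   - alpha * (R - R_bar) * pi(a)           for every other arm
# where pi is the softmax over preferences and R_bar, the running average
# reward, acts as a baseline that reduces the variance of the update.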


# Testbed for comparing algorithms
class BanditTestbed:
    """Environment for testing bandit algorithms"""

    def __init__(self, n_arms=10, true_rewards=None):
        self.n_arms = n_arms
        if true_rewards is None:
            self.true_rewards = np.random.normal(0, 1, n_arms)
        else:
            self.true_rewards = true_rewards
        self.optimal_arm = np.argmax(self.true_rewards)

    def get_reward(self, arm):
        """Get noisy reward for pulling an arm"""
        return np.random.normal(self.true_rewards[arm], 1)

    def run_experiment(self, algorithm, n_steps=1000):
        """Run bandit algorithm for n_steps"""
        algorithm.reset()
        rewards = []
        optimal_actions = []

        for _ in range(n_steps):
            arm = algorithm.select_arm()
            reward = self.get_reward(arm)
            algorithm.update(arm, reward)

            rewards.append(reward)
            optimal_actions.append(1 if arm == self.optimal_arm else 0)

        return np.array(rewards), np.array(optimal_actions)
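
# Minimal usage sketch (illustrative only, not part of the comparison below):
#
#   testbed = BanditTestbed(n_arms=10)
#   agent = UCB(10, c=2.0)
#   rewards, optimal = testbed.run_experiment(agent, n_steps=1000)
#   print(f"optimal-arm rate: {optimal.mean():.2%}")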


# Example usage and comparison
def compare_algorithms():
    """Compare different bandit algorithms"""

    # Create testbed
    testbed = BanditTestbed(n_arms=10)

    # Initialize algorithms
    algorithms = {
        "ε-greedy (0.1)": EpsilonGreedy(10, epsilon=0.1),
        "ε-greedy (0.01)": EpsilonGreedy(10, epsilon=0.01),
        "UCB (c=2)": UCB(10, c=2),
        "Thompson Sampling": ThompsonSampling(10),
        "Gradient Bandit": GradientBandit(10, alpha=0.1),
    }

    n_steps = 2000
    n_runs = 100

    results = {}

    for name, algorithm in algorithms.items():
        print(f"Running {name}...")
        avg_rewards = np.zeros(n_steps)
        optimal_actions = np.zeros(n_steps)

        for run in range(n_runs):
            rewards, optimal = testbed.run_experiment(algorithm, n_steps)
            avg_rewards += rewards
            optimal_actions += optimal

        avg_rewards /= n_runs
        optimal_actions /= n_runs

        results[name] = {"rewards": avg_rewards, "optimal_actions": optimal_actions}

    # Plot results
    plt.figure(figsize=(15, 5))

    # Average reward over time
    plt.subplot(1, 2, 1)
    for name, result in results.items():
        plt.plot(np.cumsum(result["rewards"]) / np.arange(1, n_steps + 1), label=name)
    plt.xlabel("Steps")
    plt.ylabel("Average Reward")
    plt.title("Average Reward vs Steps")
    plt.legend()
    plt.grid(True)

    # Percentage of optimal actions
    plt.subplot(1, 2, 2)
    for name, result in results.items():
        plt.plot(
            np.cumsum(result["optimal_actions"]) / np.arange(1, n_steps + 1) * 100,
            label=name,
        )
    plt.xlabel("Steps")
    plt.ylabel("% Optimal Action")
    plt.title("Optimal Action Selection vs Steps")
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.show()

    return results


# Run the comparison
if __name__ == "__main__":
    results = compare_algorithms()

    # Print final performance
    print("\nFinal Performance (last 100 steps):")
    for name, result in results.items():
        avg_reward = np.mean(result["rewards"][-100:])
        optimal_pct = np.mean(result["optimal_actions"][-100:]) * 100
        print(
            f"{name:20s}: Avg Reward = {avg_reward:.3f}, Optimal = {optimal_pct:.1f}%"
        )