bandit.go

package honu

import (
    "math"
    "math/rand"
)

// BanditStrategy specifies the methods required by an algorithm to compute
// multi-armed bandit probabilities for reinforcement learning. The basic
// mechanism allows you to initialize a strategy with n arms (or n choices).
// The Select() method will return a selected index based on the internal
// strategy, and the Update() method allows external callers to update the
// reward function for the selected arm.
type BanditStrategy interface {
    Init(nArms int)                 // Initialize the bandit with n choices
    Select() int                    // Selects an arm and returns the index of the choice
    Update(arm int, reward float64) // Update the given arm with a reward
    Counts() []uint64               // The frequency of each arm being selected
    Values() []float64              // The reward distributions for each arm
    Serialize() interface{}         // Return a JSON representation of the strategy
}
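
// exampleBanditLoop is a minimal usage sketch, not part of the original API:
// it drives any BanditStrategy through a fixed number of rounds against a
// hypothetical Bernoulli reward. The arm count and the reward probabilities
// are illustrative assumptions only.
func exampleBanditLoop(strategy BanditStrategy, rounds int) {
    // Initialize the strategy with three arms (an arbitrary choice).
    strategy.Init(3)

    for i := 0; i < rounds; i++ {
        // Ask the strategy which arm to pull next.
        arm := strategy.Select()

        // Hypothetical reward: higher-indexed arms pay off more often.
        reward := 0.0
        if rand.Float64() < 0.2+0.2*float64(arm) {
            reward = 1.0
        }

        // Report the observed reward so the strategy can learn from it.
        strategy.Update(arm, reward)
    }
}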

//===========================================================================
// Epsilon Greedy Multi-Armed Bandit
//===========================================================================

// EpsilonGreedy implements a reinforcement learning strategy such that a
// uniform random selection is made (exploration) with probability epsilon
// and the maximizing value is selected (exploitation) with probability
// 1-epsilon.
type EpsilonGreedy struct {
    Epsilon float64        // Probability of selecting an arm uniformly at random
    counts  []uint64       // Number of times each index was selected
    values  []float64      // Reward values conditioned by frequency
    history *BanditHistory // History of reward values per iteration
}

// Init the bandit with nArms number of possible choices, which are referred
// to by index in both the Counts and Values arrays.
func (b *EpsilonGreedy) Init(nArms int) {
    b.counts = make([]uint64, nArms, nArms)
    b.values = make([]float64, nArms, nArms)
    b.history = NewBanditHistory()
}

// Select the arm with the maximizing value with probability 1-epsilon,
// otherwise make a uniform random selection among all arms with probability
// epsilon.
func (b *EpsilonGreedy) Select() int {
    if rand.Float64() > b.Epsilon {
        // Exploit: select the maximal value from values.
        max := -1.0
        idx := -1
        // Find the index of the maximal value.
        for i, val := range b.values {
            if val > max {
                max = val
                idx = i
            }
        }
        return idx
    }

    // Explore: otherwise return any of the arms uniformly at random.
    return rand.Intn(len(b.values))
}

// Update the selected arm with the reward so that the strategy can learn the
// maximizing value (conditioned by the frequency of selection).
func (b *EpsilonGreedy) Update(arm int, reward float64) {
    // Update the selection frequency of the arm.
    b.counts[arm]++

    // Fold the reward into the running mean of rewards for the arm.
    n := float64(b.counts[arm])
    value := b.values[arm]
    b.values[arm] = ((n-1)/n)*value + (1/n)*reward

    // Record the selection and reward in the history.
    b.history.Update(arm, reward)
}
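
// A worked illustration (not from the original source) of the incremental
// mean above: if an arm is pulled twice with rewards 1.0 and then 0.0, the
// first update stores (0/1)*0 + (1/1)*1.0 = 1.0 and the second computes
// ((2-1)/2)*1.0 + (1/2)*0.0 = 0.5, the running average of every reward seen
// for that arm without storing the individual rewards.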

// Counts returns the frequency each arm was selected
func (b *EpsilonGreedy) Counts() []uint64 {
    return b.counts
}

// Values returns the reward distribution of each arm
func (b *EpsilonGreedy) Values() []float64 {
    return b.values
}

// Serialize the bandit strategy to dump to JSON.
func (b *EpsilonGreedy) Serialize() interface{} {
    data := make(map[string]interface{})
    data["strategy"] = "epsilon greedy"
    data["epsilon"] = b.Epsilon
    data["counts"] = b.counts
    data["values"] = b.values
    data["history"] = b.history
    return data
}
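
// A sketch (an assumption, not shown in this file) of dumping the serialized
// strategy with the standard library encoding/json package, where bandit is
// any value implementing BanditStrategy; Serialize returns a plain map, so
// json.Marshal can encode it directly:
//
//    data, err := json.Marshal(bandit.Serialize())
//    if err != nil {
//        // handle the encoding error
//    }
//    fmt.Println(string(data))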

//===========================================================================
// Annealing Epsilon Greedy Multi-Armed Bandit
//===========================================================================

// AnnealingEpsilonGreedy implements a reinforcement learning strategy whose
// epsilon starts large and decays as more selections are made, leading to an
// exploring strategy at the start and an increasing preference for
// exploitation over time.
type AnnealingEpsilonGreedy struct {
    counts  []uint64       // Number of times each index was selected
    values  []float64      // Reward values conditioned by frequency
    history *BanditHistory // History of reward values per iteration
}

// Init the bandit with nArms number of possible choices, which are referred
// to by index in both the Counts and Values arrays.
func (b *AnnealingEpsilonGreedy) Init(nArms int) {
    b.counts = make([]uint64, nArms, nArms)
    b.values = make([]float64, nArms, nArms)
    b.history = NewBanditHistory()
}

// Epsilon is computed from the current number of trials such that the more
// trials have occurred, the smaller epsilon is (on a log scale).
func (b *AnnealingEpsilonGreedy) Epsilon() float64 {
    // Compute epsilon based on the total number of trials.
    t := uint64(1)
    for _, i := range b.counts {
        t += i
    }

    // The more trials, the smaller epsilon is.
    return 1 / math.Log(float64(t)+0.0000001)
}
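
// An illustration (not from the original source) of this annealing schedule:
// with no selections yet t = 1, so epsilon = 1/ln(1.0000001) is enormous and
// the strategy explores almost surely; after 10 selections epsilon is about
// 1/ln(11) ≈ 0.42, after 1,000 about 0.14, and after 100,000 about 0.09, so
// exploration becomes increasingly rare.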

// Select the arm with the maximizing value with probability 1-epsilon,
// otherwise make a uniform random selection among all arms with probability
// epsilon, where epsilon is recomputed on every call using the annealing
// schedule above.
func (b *AnnealingEpsilonGreedy) Select() int {
    if rand.Float64() > b.Epsilon() {
        // Exploit: select the maximal value from values.
        max := -1.0
        idx := -1
        // Find the index of the maximal value.
        for i, val := range b.values {
            if val > max {
                max = val
                idx = i
            }
        }
        return idx
    }

    // Explore: otherwise return any of the arms uniformly at random.
    return rand.Intn(len(b.values))
}

// Update the selected arm with the reward so that the strategy can learn the
// maximizing value (conditioned by the frequency of selection).
func (b *AnnealingEpsilonGreedy) Update(arm int, reward float64) {
    // Update the selection frequency of the arm.
    b.counts[arm]++

    // Fold the reward into the running mean of rewards for the arm.
    n := float64(b.counts[arm])
    value := b.values[arm]
    b.values[arm] = ((n-1)/n)*value + (1/n)*reward

    // Record the selection and reward in the history.
    b.history.Update(arm, reward)
}

// Counts returns the frequency each arm was selected
func (b *AnnealingEpsilonGreedy) Counts() []uint64 {
    return b.counts
}

// Values returns the reward distribution of each arm
func (b *AnnealingEpsilonGreedy) Values() []float64 {
    return b.values
}

// Serialize the bandit strategy to dump to JSON.
func (b *AnnealingEpsilonGreedy) Serialize() interface{} {
    data := make(map[string]interface{})
    data["strategy"] = "annealing epsilon greedy"
    data["epsilon"] = b.Epsilon()
    data["counts"] = b.counts
    data["values"] = b.values
    data["history"] = b.history
    return data
}

//===========================================================================
// Uniform Strategy
//===========================================================================

// Uniform selects all arms with equal likelihood on every selection. While it
// tracks the frequency of selection and the reward values, this information
// does not affect the way it selects arms.
type Uniform struct {
    counts  []uint64       // Number of times each index was selected
    values  []float64      // Reward values conditioned by frequency
    history *BanditHistory // History of reward values per iteration
}

// Init the bandit with nArms number of possible choices, which are referred
// to by index in both the Counts and Values arrays.
func (b *Uniform) Init(nArms int) {
    b.counts = make([]uint64, nArms, nArms)
    b.values = make([]float64, nArms, nArms)
    b.history = NewBanditHistory()
}

// Select the arm with equal probability for each choice.
func (b *Uniform) Select() int {
    return rand.Intn(len(b.values))
}

// Update the selected arm with the reward so that the strategy can track the
// reward distribution (selection itself is unaffected).
func (b *Uniform) Update(arm int, reward float64) {
    // Update the selection frequency of the arm.
    b.counts[arm]++

    // Fold the reward into the running mean of rewards for the arm.
    n := float64(b.counts[arm])
    value := b.values[arm]
    b.values[arm] = ((n-1)/n)*value + (1/n)*reward

    // Record the selection and reward in the history.
    b.history.Update(arm, reward)
}

// Counts returns the frequency each arm was selected
func (b *Uniform) Counts() []uint64 {
    return b.counts
}

// Values returns the reward distribution of each arm
func (b *Uniform) Values() []float64 {
    return b.values
}

// Serialize the bandit strategy to dump to JSON.
func (b *Uniform) Serialize() interface{} {
    data := make(map[string]interface{})
    data["strategy"] = "uniform selection"
    data["counts"] = b.counts
    data["values"] = b.values
    data["history"] = b.history
    return data
}

//===========================================================================
// Bandit History - For Experimentation
//===========================================================================

// NewBanditHistory creates and returns a bandit history struct.
func NewBanditHistory() *BanditHistory {
    history := new(BanditHistory)
    history.Arms = make([]int, 0, 10000)
    history.Rewards = make([]float64, 0, 10000)
    return history
}

// BanditHistory tracks the selected arms and their rewards over time.
type BanditHistory struct {
    Arms    []int     `json:"arms"`    // selected arms per iteration
    Rewards []float64 `json:"rewards"` // reward values per iteration
}

// Update the history with the selected arm and its observed reward.
func (h *BanditHistory) Update(arm int, reward float64) {
    h.Arms = append(h.Arms, arm)
    h.Rewards = append(h.Rewards, reward)
}
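
// meanReward is a small sketch, not part of the original API: it shows one
// way the recorded history might be summarized during experimentation, here
// by returning the average reward observed across all iterations (0 if no
// rewards have been recorded yet).
func meanReward(h *BanditHistory) float64 {
    if len(h.Rewards) == 0 {
        return 0
    }

    // Sum every recorded reward and divide by the number of iterations.
    total := 0.0
    for _, r := range h.Rewards {
        total += r
    }
    return total / float64(len(h.Rewards))
}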