# -*- coding: utf-8 -*-
"""This module contiains code that is useful for training and evaulating
the perfomance of a model.
"""
import pathlib
import os
from inspect import getsource
import time
import random
import operator
import collections
import itertools
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import BaseCrossValidator
from keras.callbacks import Callback
from math import ceil
import keras.backend as K
from . import core
class CustomCV(BaseCrossValidator):
    def __init__(self, n_groups, train_min, stratas, always_include=None):
        #n_groups is the number of folds to make
        self.n_groups = n_groups
        #minimum number of observations per strata needed for train;
        #if there are fewer than this, include all of them in train
        self.train_min = train_min
        #always_include is a boolean array indicating observations to always keep in the train set;
        #useful when updating the model with new retailers, but not required
        #if always_include is not provided, default to all False
        if always_include is None:
            always_include = np.zeros(stratas.shape[0], dtype=bool)
        #store stratas and always_include in a pandas DataFrame
        #(should have the option to accept a DataFrame directly)
        self.df = pd.DataFrame({"strata": stratas, "always_include": always_include})
    def split(self, X=None, y=None, groups=None):
        #initialize empty folds;
        #indexes placed here indicate the observation should be in the test set for that fold
        fold_groups = [list() for i in range(self.n_groups)]
        #iterate through stratas, creating test folds
        for _, group in self.df.groupby("strata"):
            #exclude observations that are always_include
            group = group.loc[group["always_include"] == False]
            n_samples = group.shape[0]
            #if there are not enough observations for the strata, don't add any observations to the test folds
            if (n_samples - ceil(n_samples / self.n_groups)) < self.train_min:
                continue
            else:
                #sort so that the shortest folds receive the most new observations
                fold_groups.sort(key=lambda x: len(x))
                num_per_fold = np.ones(self.n_groups, dtype='int16')
                num_per_fold = num_per_fold * (n_samples // self.n_groups)
                #extras go to the beginning
                num_per_fold[:(n_samples % self.n_groups)] += 1
                idxs = np.random.permutation(group.index.values)
                start = 0
                for i, fold_length in enumerate(num_per_fold):
                    fold_groups[i].extend(idxs[start:start+fold_length])
                    start += fold_length
        for fold in fold_groups:
            test_index = np.array(fold)
            train_index = self.df.index.values[~self.df.index.isin(fold)]
            yield train_index, test_index
    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_groups
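
# Illustrative usage sketch (not from the original module): how CustomCV might be
# driven to produce stratified train/test splits. The toy strata labels below are
# made up purely to show the call pattern.
def _example_custom_cv():
    stratas = np.array(["food"] * 8 + ["fuel"] * 8)
    cv = CustomCV(n_groups=4, train_min=3, stratas=stratas)
    for train_index, test_index in cv.split():
        #each fold yields numpy arrays of row indexes into the original data
        print("train size:", len(train_index), "test size:", len(test_index))
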
class TimeStopper(Callback):
    #does early stopping based on elapsed time and validation f1
    def __init__(self,
                 validation_generator,
                 #desired_time defaults to a very high value so that, if not provided,
                 #the time limit is effectively ignored (could be improved)
                 desired_time=1e8,
                 patience=10):
        super(TimeStopper, self).__init__()
        self.desired_time = desired_time
        self.t0 = None
        self.elapsed = 0.0
        #if adding the option to monitor different stats, change here
        self.best_f1 = 0.
        self.since_best = 0
        self.patience = patience
        self.validation_generator = validation_generator
        self.labels = np.unique(validation_generator.y)
    def on_train_begin(self, logs=None):
        self.t0 = time.time()
    def on_epoch_end(self, epoch, logs=None):
        self.elapsed = time.time() - self.t0
        #add to logs
        logs["elapsed"] = self.elapsed
        #monitor f1 score on the validation data
        probs = self.model.predict_generator(generator=self.validation_generator,
                                             steps=len(self.validation_generator))
        #SHOULD ADD OPTION TO GET LOG LOSS
        #SHOULD HAVE ABILITY TO MONITOR:
        #LOG LOSS, OVERALL ACC, RECALL, PRECISION AND F1
        predicted = np.argmax(probs, axis=-1)
        precisions, recalls, f1_scores, supports = precision_recall_fscore_support(self.validation_generator.y,
                                                                                   predicted,
                                                                                   average=None,
                                                                                   labels=self.labels)
        #make into a DataFrame
        per_class_stats = pd.DataFrame({"class_int": self.labels, "recall": recalls, "precision": precisions,
                                        "f1_score": f1_scores, "support": supports})
        #calculate overall (macro-averaged) scores
        val_f1 = np.mean(f1_scores)
        val_recall = np.mean(recalls)
        val_precision = np.mean(precisions)
        #add to logs
        logs["per_class"] = per_class_stats
        logs["val_f1"] = val_f1
        logs["val_recall"] = val_recall
        logs["val_precision"] = val_precision
        #and print
        print(" - val_f1 {:.2%}".format(val_f1))
        print(" - val_recall {:.2%}".format(val_recall))
        print(" - val_precision {:.2%}".format(val_precision))
        if val_f1 > self.best_f1:
            self.since_best = 0
            self.best_f1 = val_f1
        else:
            self.since_best += 1
        #trigger early stopping
        if (self.elapsed >= self.desired_time) or (self.since_best >= self.patience):
            self.model.stop_training = True
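
# Illustrative usage sketch (not from the original module): wiring TimeStopper into
# a Keras training run. `model`, `train_generator` and `valid_generator` are
# placeholders; valid_generator stands in for a core.PadderSequence (or any
# generator exposing a `.y` attribute of integer labels).
def _example_time_stopper(model, train_generator, valid_generator):
    stopper = TimeStopper(validation_generator=valid_generator,
                          desired_time=30 * 60,  #stop after roughly 30 minutes...
                          patience=10)           #...or 10 epochs without an f1 improvement
    return model.fit_generator(generator=train_generator,
                               epochs=500,
                               callbacks=[stopper])
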
#helper for building a parameter grid to search over
def make_grid(args):
    #wrap scalar values so that every entry is iterable
    iterable_args = collections.defaultdict(list)
    for key, value in args.items():
        if isinstance(value, (tuple, list)):
            iterable_args[key].extend(value)
        else:
            iterable_args[key].append(value)
    #create grid:
    return [dict(zip(iterable_args, x)) for x in itertools.product(*iterable_args.values())]
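
# Illustrative sketch (not from the original module): make_grid expands a dict of
# possible values into the full cartesian product; scalar values are treated as a
# single option. The parameter names here are invented for the example.
def _example_make_grid():
    grid = make_grid({"embedding_dim": [50, 100], "dropout": 0.2})
    #grid == [{"embedding_dim": 50, "dropout": 0.2},
    #         {"embedding_dim": 100, "dropout": 0.2}]
    return grid
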
class NeuralValidator():
"""This class is for testing a certain set of parameters
Made into a class since will be used with both NeuralSearcher and
NeuralEvolver
"""
def __init__(self,
model_func,
model_params,
labels):
self.model_func = model_func
self.model_params = model_params
#this will be a list of dicts
#each dict will contain latest score for best iteratoin
self.scores = []
#labels is passed since NeuralClasser requires it
#this might be a good reason to split up Tester and Classer
self.labels = labels
#always require validation data
#and use TimeStopper (might want to rename)
    def test(self,
             #each of these should be a sequence of two elements: (texts, y)
             train_data,
             valid_data,
             #max_minutes defaults high so it is not a factor when not supplied
             #(not ideal, since this repeats the TimeStopper default;
             #note that NeuralSearcher passes this value in seconds)
             max_minutes=1e8,
             class_weights=None):
        classer = core.NeuralClasser(labels=self.labels,
                                     model_func=self.model_func,
                                     **self.model_params)
        validation_generator = core.PadderSequence(x=valid_data[0],
                                                   y=valid_data[1],
                                                   vectorizer=classer.vectorizer,
                                                   prevec=True)
        timestopper = TimeStopper(desired_time=max_minutes,
                                  validation_generator=validation_generator)
        history = classer.fit(x=train_data[0],
                              y=train_data[1],
                              epochs=500,
                              callbacks=[timestopper],
                              class_weights=class_weights)
        scores = dict()
        #now get some stats
        scores['best_epoch'], scores['best_f1'] = max(enumerate(history.history["val_f1"]),
                                                      key=operator.itemgetter(1))
        scores['best_time'] = history.history["elapsed"][scores['best_epoch']]
        scores['best_precision'] = history.history["val_precision"][scores['best_epoch']]
        scores['best_recall'] = history.history["val_recall"][scores['best_epoch']]
        #this will always exist in history (unless there are multiple losses?)
        scores['total_epochs'] = len(history.history["loss"])
        #so that we can later cross-reference the individual per-class files with the averaged file
        #WILL NEED TO INTEGRATE INTO SEARCHER
        #scores['best_per_class']["model_num"] = params_idx
        self.scores.append(scores)
        #not appending the per-class stats to scores, since they are easier to manage separately
        return scores, history.history["per_class"][scores['best_epoch']]
class NeuralSearcher():
    #Still need a way to pass arguments to the vectorizer
    def __init__(self,
                 #texts should be a list of tuples (an array or tuple may or may not work)
                 texts,
                 #y is the labels, integer encoded (done outside of this system)
                 y,
                 #labels is a pandas DataFrame with code and label
                 #(could be derived from y)
                 labels,
                 #function that returns a compiled Keras model
                 model_func,
                 #parameter values to iterate over
                 model_param_possibilites={},
                 n_folds=5,
                 train_min=3,
                 always_include=None,
                 class_weights=None,
                 verbose=False):
        self.texts = texts
        #don't need to encode here
        self.num_labels = labels.shape[0]
        self.labels = labels
        self.y = y
        self.n_folds = n_folds
        self.train_min = train_min
        self.cv_generator = CustomCV(n_folds, train_min, stratas=self.y, always_include=always_include)
        self.fold_indexes = [(train_indexes, valid_indexes) for train_indexes, valid_indexes in self.cv_generator.split()]
        self.class_weights = class_weights
        self.model_func = model_func
        #make the model arguments into a grid if a dictionary is given;
        #if not a dict, it should be a list or tuple of parameter dicts
        if isinstance(model_param_possibilites, dict):
            self.param_grid = make_grid(model_param_possibilites)
        elif isinstance(model_param_possibilites, (tuple, list)):
            self.param_grid = model_param_possibilites
        else:
            raise ValueError("""model_param_possibilites argument needs to be either a
                             dictionary (in which case a grid will be created) or an iterable
                             list or tuple of dictionaries of model parameters""")
        self.model_specs = [key for key in model_param_possibilites]
        self.verbose = verbose
        self.results = collections.defaultdict(list)
        self.per_class_results = []
    #minutes_per_fold is the maximum time that will be spent on a fold before early stopping is triggered
    def fit(self, minutes_per_fold=10, patience=25):
        #iterate through model params
        for params_idx, params in enumerate(self.param_grid):
            print("\n" + "*"*20)
            print("STARTING MODEL SEARCH {}/{}:".format(params_idx+1, len(self.param_grid)))
            #print model arguments
            for key, value in params.items():
                print(" {} : {}".format(key, value))
            #iterate through folds:
            for fold_idx, (train_index, valid_index) in enumerate(self.fold_indexes):
                print("\n" + "*"*20 + "\n")
                print("Starting fold {}/{}".format(fold_idx+1, self.n_folds))
                #need to call clear_session to prevent accumulation in memory
                K.clear_session()
                #use the NeuralValidator wrapper
                tester = NeuralValidator(labels=self.labels,
                                         model_func=self.model_func,
                                         model_params=params)
                scores, best_per_class = tester.test((self.texts[train_index], self.y[train_index]),
                                                     (self.texts[valid_index], self.y[valid_index]),
                                                     max_minutes=minutes_per_fold*60,
                                                     class_weights=self.class_weights)
                #some stats
                print("\nBest F1 score of {:.2%}".format(scores['best_f1']))
                print("Achieved on epoch {}/{} after {:0.0f} seconds".format(scores['best_epoch']+1,
                                                                             scores['total_epochs'],
                                                                             scores['best_time']))
                print("Corresponding to a recall of: {:.2%}".format(scores['best_recall']))
                print("And to a precision of: {:.2%}".format(scores['best_precision']))
                #save the model args to the results dict, which is output as a DataFrame later
                for key, value in params.items():
                    self.results[key].append(value)
                #save the scores as well
                for key, value in scores.items():
                    self.results[key].append(value)
                self.results["model_num"].append(params_idx)
                #add model_num to the per-class results so we can keep track of them
                best_per_class["model_num"] = params_idx
                self.per_class_results.append(best_per_class)
    def save_results(self, results_dir):
        time_finished = time.strftime("%Y%m%d_%Hh%Mm")
        out_dir = os.path.join(results_dir, "search_results_" + time_finished)
        pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)
        #save the model function's source code
        with open(os.path.join(out_dir, "model_string.txt"), "w") as f:
            f.write(getsource(self.model_func))
        #now the results averaged over folds, per set of params
        results = pd.DataFrame(self.results)
        #need to convert to string since otherwise None values are silently dropped
        results = results.astype(dtype={col: str for col in self.model_specs})
        model_args = self.model_specs + ["model_num"]
        results = results.groupby(model_args).mean()
        results.to_csv(os.path.join(out_dir, "averaged_results.csv"))
        #now the per-class results
        perclass_results = pd.concat(self.per_class_results).groupby(["class_int", "model_num"],
                                                                     as_index=False).mean()
        perclass_results = pd.merge(perclass_results,
                                    self.labels,
                                    left_on="class_int",
                                    right_index=True)
        for i in range(len(self.param_grid)):
            perclass_result = perclass_results[perclass_results["model_num"] == i]
            perclass_result.to_csv(os.path.join(out_dir, "perclass_result_" + str(i) + ".csv"), index=False)
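
# Illustrative usage sketch (not from the original module): a typical grid search
# run. `texts`, `y`, `labels` and `build_model` are placeholders the caller must
# supply; build_model is assumed to return a compiled Keras model and labels a
# DataFrame indexed by the integer class codes.
def _example_neural_searcher(texts, y, labels, build_model):
    searcher = NeuralSearcher(texts=texts,
                              y=y,
                              labels=labels,
                              model_func=build_model,
                              model_param_possibilites={"embedding_dim": [50, 100]},
                              n_folds=5)
    searcher.fit(minutes_per_fold=10)
    searcher.save_results("results")
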
class NeuralEvolver():
    def __init__(self,
                 texts,
                 y,
                 labels,
                 model_func,
                 possible_params,
                 train_min=3,
                 always_include=None,
                 n_folds=10,
                 population_per_fold=20,
                 retain=0.4,
                 random_select=0.1,
                 mutate_chance=0.05,
                 class_weights=None):
        """
        Should consider moving data such as texts and y out of this class,
        and maybe moving the creation of the CV splitter to the fit method.
        """
        self.texts = texts
        self.y = y
        self.n_folds = n_folds
        self.train_min = train_min
        self.cv_generator = CustomCV(n_folds, train_min, stratas=self.y, always_include=always_include)
        self.fold_indexes = [(train_indexes, valid_indexes) for train_indexes, valid_indexes in self.cv_generator.split()]
        #labels are needed since NeuralValidator requires them
        #(not actually used here; this dependency should be broken)
        self.labels = labels
        self.class_weights = class_weights
        self.model_func = model_func
        #make certain that the values are all iterable (list or tuple)
        self.possible_params = collections.defaultdict(list)
        for key, value in possible_params.items():
            if isinstance(value, (tuple, list)):
                self.possible_params[key].extend(value)
            else:
                self.possible_params[key].append(value)
        self.retain = retain
        self.random_select = random_select
        self.mutate_chance = mutate_chance
        self.population_per_fold = population_per_fold
    def __create_generation__(self, size):
        """Create a generation of randomly parameterized networks.
        Only needs to be called once, at the start of fit.
        Returns:
            pop (list): list of NeuralValidator objects, each wrapping a
                randomly chosen set of model params.
        """
        pop = []
        for _ in range(size):
            # Create a set of network parameters.
            model_params = {}
            for param, values in self.possible_params.items():
                model_params[param] = random.choice(values)
            # Add the network to our population.
            tester = NeuralValidator(labels=self.labels,
                                     model_func=self.model_func,
                                     model_params=model_params)
            pop.append(tester)
        return pop
    def __breed__(self, mother, father):
        """Make two children by combining the parameters of their parents.
        Args:
            mother (NeuralValidator): A wrapper for testing a model with a certain set of parameters
            father (NeuralValidator): A wrapper for testing a model with a certain set of parameters
        Returns:
            children (list): Two instantiated NeuralValidator objects
        """
        children = []
        for _ in range(2):
            child_params = {}
            # Loop through the parameters and pick params for the kid.
            for param, values in self.possible_params.items():
                # Randomly mutate some of the children's "genes".
                if self.mutate_chance > random.random():
                    child_params[param] = random.choice(values)
                else:
                    child_params[param] = random.choice(
                        [mother.model_params[param], father.model_params[param]]
                    )
            child = NeuralValidator(labels=self.labels,
                                    model_func=self.model_func,
                                    model_params=child_params)
            children.append(child)
        return children
    def fit(self, initial_pop=None):
        """This does the heavy lifting: it evolves the specified number of generations.
        initial_pop can be a list of instantiated NeuralValidator objects,
        up to a length of self.population_per_fold;
        if fewer are given, new specimens are generated to reach the desired size."""
        if initial_pop is None:
            initial_pop = []
        #start with the first generation
        specimens_needed = self.population_per_fold - len(initial_pop)
        generation = list(initial_pop) + self.__create_generation__(specimens_needed)
        for fold_idx, (train_index, valid_index) in enumerate(self.fold_indexes):
            print("\n" + "*"*20 + "\n")
            print("Starting fold (generation) {}/{}".format(fold_idx+1, self.n_folds))
            #now train...
            #a nice property of NeuralValidator is that it does not store weights,
            #but we still need to clear the session
            for network_idx, network in enumerate(generation):
                print("\n" + "*"*20)
                print("STARTING MODEL SEARCH {}/{}:".format(network_idx+1, self.population_per_fold))
                #print model arguments
                for key, value in network.model_params.items():
                    print(" {} : {}".format(key, value))
                _ = network.test((self.texts[train_index], self.y[train_index]),
                                 (self.texts[valid_index], self.y[valid_index]),
                                 class_weights=self.class_weights)
                K.clear_session()
            #don't evolve after the last fold (generation)
            if (fold_idx + 1) < len(self.fold_indexes):
                generation = self.__evolve__(generation)
        self.results = generation
    def __evolve__(self, generation):
        #sort the networks by their latest score (best_f1), best first
        graded = sorted(generation, key=lambda x: x.scores[-1]["best_f1"], reverse=True)
        # Get the number we want to keep for the next gen.
        retain_length = int(len(graded) * self.retain)
        # The parents are every network we want to keep.
        parents = graded[:retain_length]
        # For those we aren't keeping, randomly keep some anyway.
        for individual in graded[retain_length:]:
            if self.random_select > random.random():
                parents.append(individual)
        # Now find out how many spots we have left to fill.
        parents_length = len(parents)
        desired_length = len(generation) - parents_length
        children = []
        # Add children, which are bred from two remaining networks.
        while len(children) < desired_length:
            # Get a random mom and dad.
            male = random.randint(0, parents_length-1)
            female = random.randint(0, parents_length-1)
            # Assuming they aren't the same network...
            if male != female:
                male = parents[male]
                female = parents[female]
                # Breed them.
                babies = self.__breed__(male, female)
                # Add the children one at a time.
                for baby in babies:
                    # Don't grow larger than the desired length.
                    if len(children) < desired_length:
                        children.append(baby)
        parents.extend(children)
        return parents
    def save(self, out_path):
        results = []
        for network in self.results:
            result = pd.DataFrame(network.scores).mean()
            for key, value in network.model_params.items():
                result[key] = value
            results.append(result)
        pd.concat(results, axis=1).T.to_csv(out_path, index=False)
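
# Illustrative usage sketch (not from the original module): running the
# evolutionary search. As above, `texts`, `y`, `labels` and `build_model` are
# placeholders; possible_params lists the values each "gene" may take.
def _example_neural_evolver(texts, y, labels, build_model):
    evolver = NeuralEvolver(texts=texts,
                            y=y,
                            labels=labels,
                            model_func=build_model,
                            possible_params={"embedding_dim": [50, 100, 200],
                                             "dropout": [0.1, 0.3, 0.5]},
                            n_folds=10,
                            population_per_fold=20)
    evolver.fit()
    evolver.save("evolution_results.csv")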