blend_proba.py
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score
import numpy as np
import hashlib
import json
def blend_proba(clf, X_train, y, X_test, nfolds=5, save_preds="",
                save_test_only="", seed=300373, save_params="",
                clf_name="XX", generalizers_params=[], minimal_loss=0,
                return_score=False, minimizer="log_loss"):
    """Out-of-fold probability blending: fit `clf` on stratified folds of the
    train set, collect out-of-fold predicted probabilities as meta-features,
    then refit on the full train set to predict the test set."""
    print("\nBlending with classifier:\n\t%s" % clf)
    folds = list(StratifiedKFold(n_splits=nfolds, shuffle=True,
                                 random_state=seed).split(X_train, y))
    print(X_train.shape)
    # one column per class for the out-of-fold (meta) train predictions
    dataset_blend_train = np.zeros((X_train.shape[0], np.unique(y).shape[0]))
    # iterate over the folds: fit on the train split, predict the held-out split
    loss = 0
    for i, (train_index, test_index) in enumerate(folds):
        print("Train Fold %s/%s" % (i + 1, nfolds))
        fold_X_train = X_train[train_index]
        fold_y_train = y[train_index]
        fold_X_test = X_train[test_index]
        fold_y_test = y[test_index]
        clf.fit(fold_X_train, fold_y_train)
        fold_preds = clf.predict_proba(fold_X_test)
        print("Logistic loss: %s" % log_loss(fold_y_test, fold_preds))
        dataset_blend_train[test_index] = fold_preds
        if minimizer == "log_loss":
            loss += log_loss(fold_y_test, fold_preds)
        if minimizer == "accuracy":
            # note: despite the name, `loss` accumulates the accuracy score here
            fold_preds_a = np.argmax(fold_preds, axis=1)
            loss += accuracy_score(fold_y_test, fold_preds_a)
        # early exit: abandon the blend if the very first fold already exceeds
        # the requested threshold
        if minimal_loss > 0 and loss > minimal_loss and i == 0:
            return False, False
        fold_preds = np.argmax(fold_preds, axis=1)
        print("Accuracy: %s" % accuracy_score(fold_y_test, fold_preds))
    avg_loss = loss / float(i + 1)
    print("\nAverage:\t%s\n" % avg_loss)
    # predict the test set with a single fit on the full train set
    # (averaging the per-fold models would be more robust, but this is quicker)
    print("Test Fold 1/1")
    clf.fit(X_train, y)
    dataset_blend_test = clf.predict_proba(X_test)
    # derive a short classifier name when none was given
    if clf_name == "XX":
        clf_name = str(clf)[1:3]
    if len(save_preds) > 0:
        # hash the classifier parameters so repeated runs get distinct file names
        param_hash = hashlib.md5(str(clf.get_params()).encode("utf-8")).hexdigest()
        print("storing meta predictions at: %s" % save_preds)
        np.save("%s%s_%s_%s_train.npy" % (save_preds, clf_name, avg_loss, param_hash), dataset_blend_train)
        np.save("%s%s_%s_%s_test.npy" % (save_preds, clf_name, avg_loss, param_hash), dataset_blend_test)
    if len(save_test_only) > 0:
        param_hash = hashlib.md5(str(clf.get_params()).encode("utf-8")).hexdigest()
        print("storing meta predictions at: %s" % save_test_only)
        dataset_blend_test = clf.predict(X_test)
        np.savetxt("%s%s_%s_%s_test.txt" % (save_test_only, clf_name, avg_loss, param_hash), dataset_blend_test)
        d = {}
        d["stacker"] = clf.get_params()
        d["generalizers"] = generalizers_params
        with open("%s%s_%s_%s_params.json" % (save_test_only, clf_name, avg_loss, param_hash), "w") as f:
            json.dump(d, f)
    if len(save_params) > 0:
        param_hash = hashlib.md5(str(clf.get_params()).encode("utf-8")).hexdigest()
        d = {}
        d["name"] = clf_name
        # expand nested estimators into their own parameter dicts
        d["params"] = {k: (v.get_params() if "\n" in str(v) or "<" in str(v) else v)
                       for k, v in clf.get_params().items()}
        d["generalizers"] = generalizers_params
        with open("%s%s_%s_%s_params.json" % (save_params, clf_name, avg_loss, param_hash), "w") as f:
            json.dump(d, f)
    if np.unique(y).shape[0] == 2:  # binary classification: return only the positive-class probability
        if return_score:
            return dataset_blend_train[:, 1], dataset_blend_test[:, 1], avg_loss
        else:
            return dataset_blend_train[:, 1], dataset_blend_test[:, 1]
    else:
        if return_score:
            return dataset_blend_train, dataset_blend_test, avg_loss
        else:
            return dataset_blend_train, dataset_blend_test
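

# --- Usage sketch (not part of the original file) ---
# A minimal, hypothetical example of how blend_proba might be called to build
# first-level meta-features. The synthetic data and the RandomForestClassifier
# settings below are assumptions for illustration only.
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    # small synthetic binary-classification problem
    X, y = make_classification(n_samples=500, n_features=20, random_state=0)
    X_tr, X_te, y_tr = X[:400], X[400:], y[:400]

    rf = RandomForestClassifier(n_estimators=100, random_state=0)
    # out-of-fold train probabilities and full-fit test probabilities
    train_meta, test_meta = blend_proba(rf, X_tr, y_tr, X_te, nfolds=5,
                                        clf_name="RF")
    print(train_meta.shape, test_meta.shape)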