E. Cross-validation
Cross-validation is most useful for small datasets: if the model is fast to train, it is usually worth switching from a single train/test split to cross-validation.
Steps: grid search (hyperparameter optimization) on X_train, then cross-validation on the full X (a minimal sketch of the grid-search step follows below).
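A minimal sketch of that grid-search step, assuming an XGBRegressor and a placeholder parameter grid (neither comes from these notes; adapt both to the actual model):

from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

param_grid = {'n_estimators': [100, 300], 'max_depth': [3, 6]}  # placeholder values
grid = GridSearchCV(XGBRegressor(), param_grid, cv=5, scoring='neg_mean_absolute_error')
grid.fit(X_train, y_train)  # optimization on X_train only
grid.best_params_           # then cross-validate the tuned model on the full X, as in the loop below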
BEST SOLUTION: write an explicit for-loop so the scaler can be fit_transform-ed on X_fold_train and only transform-ed on X_fold_test; a further advantage is being able to store several metrics per fold.
# Manual K-fold loop: the scaler is refit on each training fold, and several metrics are stored
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold  # StratifiedKFold keeps class balance
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_error
from xgboost import XGBRegressor

# Set K
skf = StratifiedKFold(n_splits=10)  # no more than 10, to keep enough data in each test fold; for a continuous target such as y_log, use KFold instead

# To store train and test scores
r2_train_skf = []
r2_test_skf = []
mae_train_skf = []
mae_test_skf = []

for train_I, test_I in skf.split(X, y):  # X or X_encoded depending on the encoding strategy; the encoding itself can also be fit inside the loop (see the rare-label sketch after this block)
    # Get the fold samples by index
    X_fold_train = X_encoded.iloc[train_I, :]
    y_fold_train = y_log.iloc[train_I]
    X_fold_test = X_encoded.iloc[test_I, :]
    y_fold_test = y_log.iloc[test_I]

    # Normalization: a new scaler for each fold, fit on the training fold only
    minmaxscaler = MinMaxScaler()
    X_fold_train_scaled = minmaxscaler.fit_transform(X_fold_train)
    X_fold_test_scaled = minmaxscaler.transform(X_fold_test)

    # Model: XGBoost lets us specify eval_metric, an eval_set for validation, and early stopping
    # (in XGBoost >= 2.0 these arguments are passed to the XGBRegressor constructor instead of fit)
    model = XGBRegressor()
    model.fit(X_fold_train_scaled, y_fold_train,
              eval_metric='mae',
              eval_set=[(X_fold_test_scaled, y_fold_test)],
              early_stopping_rounds=10)

    # Predict on train and test
    y_fold_pred_train = model.predict(X_fold_train_scaled)
    y_fold_pred_test = model.predict(X_fold_test_scaled)

    # Store the scores of the model on X_fold_train and X_fold_test
    r2_train_skf.append(r2_score(y_fold_train, y_fold_pred_train))
    r2_test_skf.append(r2_score(y_fold_test, y_fold_pred_test))
    mae_train_skf.append(mean_absolute_error(y_fold_train, y_fold_pred_train))
    mae_test_skf.append(mean_absolute_error(y_fold_test, y_fold_pred_test))

    # Show progress
    if len(r2_train_skf) % 5 == 0:
        print("End of iteration {} ...".format(len(r2_train_skf)))

print("End of cross-validation")

# Then analyze mean and std (the standard deviation is preferred to describe variability because it is in the unit of the original variable)
mean_score_test = np.mean(r2_test_skf)
std_score_test = np.std(r2_test_skf)
round(mean_score_test, 2), round(std_score_test, 2)
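The loop comment above mentions refitting the categorical encoding inside each fold. A minimal sketch of that idea, using feature_engine's RareLabelEncoder in place of the RareEncoding mentioned above (an assumption, as is the cat_cols list of categorical columns):

from feature_engine.encoding import RareLabelEncoder  # assumption: feature_engine is installed

# Inside the for-loop, right after selecting X_fold_train / X_fold_test and before scaling:
rare_encoder = RareLabelEncoder(tol=0.01, n_categories=2, variables=cat_cols)  # cat_cols: hypothetical list of categorical columns
X_fold_train = rare_encoder.fit_transform(X_fold_train)  # learn the frequent categories on the training fold only
X_fold_test = rare_encoder.transform(X_fold_test)        # infrequent categories in the test fold become 'Rare'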
(VARIANT) Use cross_val_score to get the fold scores directly. Less flexible, since there is no per-fold control; see the sketch below.
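A minimal direct call, assuming a RandomForestRegressor as an illustrative estimator:

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

scores = cross_val_score(RandomForestRegressor(random_state=0), X, y,
                         cv=5, scoring='r2')  # one score per fold
scores.mean(), scores.std()

The trade-off is that any preprocessing has to go into a pipeline (next snippet) instead of being done by hand on each fold.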
Use a pipeline for cross-validation so the preprocessing is refit on each fold.
We define below a function get_score with argument n_estimators for the RF model.
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score

def get_score(n_estimators):
    my_pipeline = Pipeline(steps=[('preprocessor', SimpleImputer()),
                                  ('model', RandomForestRegressor(n_estimators=n_estimators, random_state=0))])
    scores = -1 * cross_val_score(my_pipeline, X, y,  # multiply by -1 since sklearn computes *negative* MAE
                                  cv=3,
                                  scoring='neg_mean_absolute_error')
    return scores.mean()  # we want a single score, so we take the mean of the K fold scores
# Tune the hyperparameter using cross-validation
results = {}  # dictionary to store the score for each value
n_estimators_range = list(range(50, 450, 50))  # n_estimators = 50, 100, 150, ..., 400
for i in n_estimators_range:
    results[i] = get_score(i)
# Plot the results
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(list(results.keys()), list(results.values()))
plt.show()
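To read the best value off the results dictionary rather than the plot, take the key with the smallest mean MAE (same names as above):

best_n_estimators = min(results, key=results.get)  # n_estimators with the lowest mean MAE
print(best_n_estimators, results[best_n_estimators])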