import pandas as pd
from sklearn.tree import DecisionTreeRegressor
import pickle
from multiprocessing import Pool
import matplotlib.pyplot as plt
from skopt import BayesSearchCV
from skopt.space import Real
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
import numpy as np
# %% Load data.
with open('raw/1_cs_training_normalized.pkl', 'rb') as f:
    x, y = pickle.load(f)
with open('raw/1_cs_testing_normalized.pkl', 'rb') as f:
    x_test, y_test = pickle.load(f)
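# Optional sanity check (a sketch, assuming x/x_test are the DataFrames and
# y/y_test the aligned target arrays produced by step 1 of this pipeline).
assert list(x.columns) == list(x_test.columns)
assert len(x) == len(y) and len(x_test) == len(y_test)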
# %% Find the range of optimal hyperparameter of random forest.
optimizer = DecisionTreeRegressor(random_state=324)
optimizer.fit(x, y.ravel())
path = optimizer.cost_complexity_pruning_path(x, y.ravel())


def ccp_depth(alpha):
    """Depth of the tree pruned with cost-complexity parameter alpha."""
    dt = DecisionTreeRegressor(ccp_alpha=alpha, random_state=324)
    dt.fit(x, y.ravel())
    return dt.tree_.max_depth


# NOTE: Pool() assumes the 'fork' start method (Linux); under 'spawn'
# (Windows/macOS) this call would need an `if __name__ == '__main__':` guard.
with Pool() as pool:
    depths = pool.map(ccp_depth, path.ccp_alphas)
# Drop the largest alpha, which prunes the tree down to a single root node.
path.ccp_alphas = path.ccp_alphas[:-1]
depths = depths[:-1]
# %% Visualize
f, ax = plt.subplots()
ax.plot(path.ccp_alphas, depths, drawstyle="steps-post", marker='o', markersize=3)
# Annotate only the first occurrence of each depth to keep the plot readable.
for i, (ccp_alpha, depth) in enumerate(zip(path.ccp_alphas, depths)):
    if depth not in depths[:i]:
        ax.annotate(f'({round(ccp_alpha, 4)}, {depth})', xy=(ccp_alpha, depth))
ax.set_xlabel("CCP alpha")
ax.set_ylabel("Tree depth")
f.savefig('results/2_ccp_alpha_depth.pdf')
plt.close(f)
# Result: based on the plot, search the random forest's max_depth in [2, 20] and ccp_alpha in [0, 0.0279].
# %% Optimize a random forest model to compute feature importance.
k_fold = KFold(shuffle=True, random_state=387)
optimizer = BayesSearchCV(
    RandomForestRegressor(n_jobs=-1, random_state=324, oob_score=True),
    # Hyperparameter space based on the decision tree results above.
    {
        'max_depth': (2, 20),
        'ccp_alpha': Real(0, 0.028, prior='uniform'),
    },
    # Pass the splitter itself rather than an exhausted split() generator.
    n_iter=32, n_jobs=-1, cv=k_fold, random_state=743,  # random_state seeds the search
)
optimizer.fit(x, y.ravel())
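# Quick look at what the search settled on (illustrative; prints only).
print('Best hyperparameters:', optimizer.best_params_)
print(f'Best CV R^2: {optimizer.best_score_:.4f}')
print(f'OOB R^2 of the refit model: {optimizer.best_estimator_.oob_score_:.4f}')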
# %% Feature importance.
feature_importance = pd.DataFrame({'Name': x.columns,
                                   'Importance': optimizer.best_estimator_.feature_importances_})
feature_importance.sort_values(by='Importance', ascending=False, inplace=True)
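# Illustrative peek at the ranking (head(10) is an arbitrary choice).
print(feature_importance.head(10).to_string(index=False))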
# %% Select top-k features
max_features = 50
n_splits = 5  # KFold's default; named so the r2 array and loops stay in sync
r2 = np.zeros(shape=(n_splits, max_features))
k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=387)
x_train, y_train, x_valid, y_valid = [], [], [], []
for idx_train, idx_valid in k_fold.split(x):
    x_train.append(x.values[idx_train])
    y_train.append(y[idx_train])
    x_valid.append(x.values[idx_valid])
    y_valid.append(y[idx_valid])
for i in range(max_features):
    k = i + 1
    # The sorted DataFrame keeps its original positional index, so the index
    # values of its top-k rows are column positions in x.
    top_k = feature_importance.index[:k].tolist()
    for j in range(n_splits):
        optimizer_k = RandomForestRegressor(n_jobs=-1, random_state=324, oob_score=True, **optimizer.best_params_)
        optimizer_k.fit(x_train[j][:, top_k], y_train[j].ravel())
        r2[j, i] = optimizer_k.score(x_valid[j][:, top_k], y_valid[j].ravel())
r2 = np.mean(r2, axis=0)
best_k = np.argmax(r2) + 1
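# Visualize mean validation R^2 against the number of retained features.
# (Sketch: the output filename follows this script's naming pattern but is
# not part of the original pipeline.)
f, ax = plt.subplots()
ax.plot(range(1, max_features + 1), r2, marker='o', markersize=3)
ax.axvline(best_k, linestyle='--', color='grey')
ax.set_xlabel('Number of top features')
ax.set_ylabel('Mean validation $R^2$')
f.savefig('results/2_top_k_r2.pdf')
plt.close(f)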
# %% Export.
with open('raw/2_feature_selection_model.pkl', 'wb') as f:
    pickle.dump(optimizer.best_estimator_, f)
with open('raw/2_feature_importance.pkl', 'wb') as f:
    pickle.dump(feature_importance, f)
top_best_k = feature_importance.index[:best_k].tolist()
with open('raw/2_feature_rf_training_normalized.pkl', 'wb') as f:
    pickle.dump([x.values[:, top_best_k], y], f)
with open('raw/2_feature_rf_testing_normalized.pkl', 'wb') as f:
    pickle.dump([x_test.values[:, top_best_k], y_test], f)