# -*- coding: utf-8 -*-
"""
Created on Fri Nov 15 10:52:00 2024
@author: charlesr
"""
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.linear_model import LinearRegression
import sys
# Directory containing Chapter_4.py; figures are also saved here.
# NOTE: './' is only a placeholder; set this to your own location.
path = './'
# Add the directory to the module search path
sys.path.insert(0, path)
# Import the Chapter 4 module (provides cluster_kmeans_top)
import Chapter_4 as four
# See https://stackoverflow.com/questions/69596239/how-to-avoid-memory-leak-when-dealing-with-kmeans-for-example-in-this-code-i-am
import warnings
warnings.filterwarnings('ignore')
plt.style.use("seaborn-v0_8")
#------------------------------------------------------------------------------
def gen_test_data(n_features = 100, n_informative = 25, n_redundant = 25,
n_samples = 10000, random_state = 0, scale = 0, is_clf = True):
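    """Generate a synthetic dataset with informative, redundant, and noise features.

    Redundant features are built as a randomly chosen informative feature plus
    Gaussian noise with standard deviation `scale`. Returns (X, y), where X is a
    DataFrame with columns named I_*, N_*, and R_*, and y is a Series: a class
    label if `is_clf` is True, otherwise a regression target.
    """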
# Generate a random dataset for a classification/regression problem
np.random.seed(random_state)
# Calculate the number of noise features
n_noise = n_features - n_informative - n_redundant
if is_clf:
# Use make_classification to construct informative and noise features
X, y = make_classification(n_samples = n_samples,
n_features = n_features - n_redundant,
n_informative = n_informative,
n_redundant = 0,
shuffle = False,
random_state = random_state)
        # Lopez de Prado likely avoids make_classification's n_redundant option because
        # those redundant features are linear combinations of several informative ones,
        # which is too hard for the clustering step to untangle
else:
# Use make_regression to construct informative and noise features
X, y = make_regression(n_samples = n_samples,
n_features = n_features - n_redundant,
n_informative = n_informative,
shuffle = False,
random_state = random_state)
# Add names for the informative features
cols = [f'I_{i}' for i in range(n_informative)]
# Add names for the noise features
cols += [f'N_{i}' for i in range(n_noise)]
# Convert results to a pandas data frame
X, y = pd.DataFrame(X, columns = cols), pd.Series(y)
# Randomly choose which features the redundant ones replicate
rep = np.random.choice(range(n_informative), size = n_redundant)
    for j, k in enumerate(rep):
# Redundant feature j is informative feature k plus random noise
X[f'R_{j}'] = X[f'I_{k}'] + np.random.normal(size = n_samples,
scale = scale)
return X, y
#------------------------------------------------------------------------------
def orthogonalize_clusters(df, clusters):
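    """Sequentially orthogonalize clusters of features.

    For each cluster after the first, every column is replaced by the residual of
    a linear regression of that column on all columns of the preceding clusters.
    Returns a modified copy of `df`.
    """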
# Make a deep copy of df
df_copy = df.copy()
# Get the values of the clusters
vals = list(clusters.values())
# Loop over the clusters
for i, responses in enumerate(vals):
if i != 0:
            # Get the explanatory columns
exp_vars = [col for j in range(i) for col in vals[j]]
# Loop over response columns
for response in responses:
# Get X and y
X = df_copy[exp_vars].values
y = df_copy[response].values
# Need to make sure X is a 2D matrix
if len(exp_vars) == 1:
X = X.reshape((-1, 1))
# Initialize and fit the linear regression
reg = LinearRegression().fit(X, y)
# Calculate the residuals
df_copy[response] = y - reg.predict(X)
return df_copy
#------------------------------------------------------------------------------
def feat_imp_MDI(clf, feat_names):
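    """Mean decrease impurity (MDI) feature importance for a bagged tree ensemble.

    Collects each estimator's feature_importances_, treats zeros as missing (the
    feature was never selected), and returns a DataFrame with the mean importance
    and its standard error, rescaled so the means sum to one.
    """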
# Feature importance based on in-sample mean impurity reduction
df = {i:tree.feature_importances_ for i, tree in enumerate(clf.estimators_)}
# Convert from dictionary to data frame
df = pd.DataFrame.from_dict(df, orient = 'index')
# Name the columns
df.columns = feat_names
    # Replace 0 with NaN: with max_features = 1, a zero importance means the
    # feature was never selected by that tree, not that it is unimportant
df = df.replace(0, np.nan)
    # Compute the mean importance and its standard error across trees
imp = pd.concat({'mean':df.mean(), 'std':df.std()/np.sqrt(df.shape[0])},
axis = 1)
    # Rescale so the mean importances sum to one
imp /= imp['mean'].sum()
return imp
#------------------------------------------------------------------------------
def feat_imp_MDA(clf, X, y, n_splits = None, cv = None):
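    """Mean decrease accuracy (MDA) importance for a classifier via column shuffling.

    For each cross-validation fold, compares the out-of-sample negative log-loss
    before and after shuffling each feature. Returns a DataFrame with the mean,
    standard error, and t-stat of the normalized loss degradation per feature.
    """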
if cv is None:
# Initialize k-folds constructor
cv_gen = KFold(n_splits = n_splits).split(X = X)
else:
cv_gen = cv.split(X = X)
# Initialize pandas objects to hold raw and shuffled log_loss scores
    score_raw = pd.Series(dtype = float)
    score_shuff = pd.DataFrame(columns = X.columns, dtype = float)
# Generate split
for fold, (train_idx, test_idx) in enumerate(cv_gen):
# Create training arrays
X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
# Create testing arrays
X_test, y_test = X.iloc[test_idx, :], y.iloc[test_idx]
# Fit the model using the training data
clf_fit = clf.fit(X = X_train, y = y_train)
# Use testing data to predict probabilities
probs = clf_fit.predict_proba(X_test)
# Record negative log-loss
score_raw.loc[fold] = -log_loss(y_test, probs, labels = clf.classes_)
for col in X.columns:
# Make a deep copy of X_test
X_shuff = X_test.copy()
# Shuffle column
np.random.shuffle(X_shuff[col].values)
# Predict the probabilities
probs_shuff = clf_fit.predict_proba(X_shuff)
# Calculate the score
score_shuff.loc[fold, col] = -log_loss(y_test, probs_shuff,
labels = clf.classes_)
# Subtract the raw score from the score after the shuffle
imp = score_shuff.sub(score_raw, axis = 0)
# Normalize by dividing by the shuffled score
imp = imp/score_shuff
# Compute the mean and std
imp = pd.concat({'mean':imp.mean(),
'std':imp.std()/np.sqrt(imp.shape[0])}, axis = 1)
# Calculate t-stat
imp.loc[imp['std'] != 0, 't-stat'] = imp.loc[imp['std'] != 0,
'mean']/imp.loc[imp['std'] != 0, 'std']
return imp
#------------------------------------------------------------------------------
def reg_feat_imp_MDA(reg, X, y, n_splits = None, p = 2, cv = None):
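    """MDA-style importance for a regressor via column shuffling.

    Uses the negative sum of |residual|**p as the score on each fold and measures
    how much it degrades when each feature is shuffled. Returns the mean, standard
    error, and t-stat per feature.
    """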
# Define penalty function
penalty_fun = lambda e: np.sum(np.abs(e)**p)
if cv is None:
# Initialize k-folds constructor
cv_gen = KFold(n_splits = n_splits).split(X = X)
else:
cv_gen = cv.split(X = X)
# Initialize pandas objects to hold raw and shuffled scores
    score_raw = pd.Series(dtype = float)
    score_shuff = pd.DataFrame(columns = X.columns, dtype = float)
# Generate split
for fold, (train_idx, test_idx) in enumerate(cv_gen):
# Create training arrays
X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
# Create testing arrays
X_test, y_test = X.iloc[test_idx, :], y.iloc[test_idx]
# Fit the model using the training data
reg_fit = reg.fit(X = X_train, y = y_train)
        # Use testing data to predict target values
y_pred = reg_fit.predict(X_test)
# Record score
score_raw.loc[fold] = -penalty_fun(y_test - y_pred)
for col in X.columns:
# Make a deep copy of X_test
X_shuff = X_test.copy()
            # Shuffle the column
np.random.shuffle(X_shuff[col].values)
# Predict values
y_shuff = reg_fit.predict(X_shuff)
# Calculate the score
score_shuff.loc[fold, col] = -penalty_fun(y_test - y_shuff)
# Subtract the raw score from the score after the shuffle
imp = score_shuff.sub(score_raw, axis = 0)
# Normalize by dividing by the shuffled score
imp = imp/score_shuff
# Compute the mean and std
imp = pd.concat({'mean':imp.mean(),
'std':imp.std()/np.sqrt(imp.shape[0])}, axis = 1)
# Calculate t-stat
imp.loc[imp['std'] != 0, 't-stat'] = imp.loc[imp['std'] != 0,
'mean']/imp.loc[imp['std'] != 0, 'std']
return imp
#------------------------------------------------------------------------------
def group_mean_std(df, clusters):
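    """Aggregate per-tree importances into per-cluster mean, standard error, and t-stat.

    For each cluster, sums the importances of its columns within each row of `df`
    before computing the statistics.
    """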
# Initialize data frame for output
out = pd.DataFrame(columns = ['mean', 'std'])
# Loop over clusters
for clst, col in clusters.items():
# Take the sum of the values for each cluster
temp = df[col].sum(axis = 1)
# Compute the mean value
out.loc[f'C_{clst}', 'mean'] = temp.mean()
# Compute the standard deviation
out.loc[f'C_{clst}', 'std'] = temp.std()/np.sqrt(temp.shape[0])
# Calculate t-stat
if temp.std() != 0:
out.loc[f'C_{clst}', 't-stat'] = out.loc[f'C_{clst}',
'mean']/out.loc[f'C_{clst}', 'std']
return out
#------------------------------------------------------------------------------
def feat_imp_MDI_clustered(clf, feat_names, clusters):
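    """Clustered MDI importance: per-tree importances summed within each cluster.

    Returns a DataFrame indexed by cluster (C_*) with mean, standard error, and
    t-stat, with the means rescaled to sum to one.
    """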
# Feature importance based on in-sample mean impurity reduction
df = {i:tree.feature_importances_ for i, tree in enumerate(clf.estimators_)}
# Convert dictionary to data frame
df = pd.DataFrame.from_dict(df, orient = 'index')
# Rename columns
df.columns = feat_names
# Replace 0 with np.nan
df = df.replace(0, np.nan)
    # Aggregate importances into per-cluster mean, standard error, and t-stat
imp = group_mean_std(df, clusters)
    # Normalize mean and std so the mean importances sum to one (leave t-stat unscaled)
    imp[['mean', 'std']] /= imp['mean'].sum()
return imp
#------------------------------------------------------------------------------
def feat_imp_MDA_clustered(clf, X, y, clusters, n_splits = None, cv = None):
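    """Clustered MDA importance for a classifier.

    Identical to feat_imp_MDA, except that all columns belonging to a cluster are
    shuffled together before rescoring, so importance is attributed to the cluster
    as a whole.
    """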
if cv is None:
# Initialize k-folds constructor
cv_gen = KFold(n_splits = n_splits).split(X = X)
else:
cv_gen = cv.split(X = X)
# Initialize pandas objects to hold raw and shuffled log_loss scores
    score_raw = pd.Series(dtype = float)
    score_shuff = pd.DataFrame(columns = list(clusters.keys()), dtype = float)
# Generate splits
for fold, (train_idx, test_idx) in enumerate(cv_gen):
# Create training arrays
X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
# Create testing arrays
X_test, y_test = X.iloc[test_idx, :], y.iloc[test_idx]
# Fit the model using training data
clf_fit = clf.fit(X = X_train, y = y_train)
# Use the fitted model to predict probabilities
probs = clf_fit.predict_proba(X_test)
        # Record negative log-loss
score_raw.loc[fold] = -log_loss(y_test, probs, labels = clf.classes_)
# Loop over clusters
for clst in clusters:
# Make a deep copy of X_test
X_shuff = X_test.copy()
# For each column in clst
for col in clusters[clst]:
# Shuffle col
np.random.shuffle(X_shuff[col].values)
# Predict the probabilities with shuffled results
probs_shuff = clf_fit.predict_proba(X_shuff)
            # Calculate the score
score_shuff.loc[fold, clst] = -log_loss(y_test, probs_shuff,
labels = clf.classes_)
# Subtract the raw score from the score after the shuffle
imp = score_shuff.sub(score_raw, axis = 0)
# Normalize by the shuffled scores
imp = imp/score_shuff
# Calculate the mean and std
imp = pd.concat({'mean':imp.mean(),
'std':imp.std()/np.sqrt(imp.shape[0])}, axis = 1)
# Calculate t-stat
imp.loc[imp['std'] != 0, 't-stat'] = imp.loc[imp['std'] != 0,
'mean']/imp.loc[imp['std'] != 0, 'std']
# Change the index name
imp.index = [f'C_{i}' for i in imp.index]
return imp
#------------------------------------------------------------------------------
def reg_feat_imp_MDA_clustered(reg, X, y, clusters, n_splits = 10, p = 2):
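    """Clustered MDA-style importance for a regressor.

    Shuffles all columns of a cluster together and measures the degradation of the
    negative sum of |residual|**p on each k-fold test set.
    """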
# Define penalty function
penalty_fun = lambda e: np.sum(np.abs(e)**p)
# Initialize k-folds constructor
cv_gen = KFold(n_splits = n_splits)
# Initialize pandas objects to hold raw and shuffled scores
    score_raw = pd.Series(dtype = float)
    score_shuff = pd.DataFrame(columns = list(clusters.keys()), dtype = float)
# Generate splits
for fold, (train_idx, test_idx) in enumerate(cv_gen.split(X = X)):
# Create training arrays
X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
# Create testing arrays
X_test, y_test = X.iloc[test_idx, :], y.iloc[test_idx]
# Fit the model using training data
reg_fit = reg.fit(X = X_train, y = y_train)
        # Use the fitted model to predict target values
y_pred = reg_fit.predict(X_test)
# Record score
score_raw.loc[fold] = -penalty_fun(y_test - y_pred)
# Loop over clusters
for clst in clusters:
# Make a deep copy of X_test
X_shuff = X_test.copy()
# For each column in clst
for col in clusters[clst]:
# Shuffle col
np.random.shuffle(X_shuff[col].values)
# Predict the values with shuffled results
y_shuff = reg_fit.predict(X_shuff)
# Calculate the score
score_shuff.loc[fold, clst] = -penalty_fun(y_test - y_shuff)
# Subtract the raw score from the score after the shuffle
imp = score_shuff.sub(score_raw, axis = 0)
# Normalize by the shuffled scores
imp = imp/score_shuff
# Calculate the mean and std
imp = pd.concat({'mean':imp.mean(),
'std':imp.std()/np.sqrt(imp.shape[0])}, axis = 1)
# Calculate t-stat
imp.loc[imp['std'] != 0, 't-stat'] = imp.loc[imp['std'] != 0,
'mean']/imp.loc[imp['std'] != 0, 'std']
# Change the index name
imp.index = [f'C_{i}' for i in imp.index]
return imp
#------------------------------------------------------------------------------
def plot_logistic_pvals(X, y, feat_names, alpha = 0.05, title = None,
filename = None, **kwargs):
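    """Fit a statsmodels Logit on (X, y) and plot the p-values as horizontal bars,
    with a dotted vertical line at the significance level `alpha`.
    """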
# Import logistic regression
from statsmodels.discrete.discrete_model import Logit
# Fit the regression
logit = Logit(y.values, X.values).fit()
# Record the p-values
p_vals = pd.Series(logit.pvalues, index = feat_names)
# Sort the p-values
p_vals = p_vals.sort_values(ascending = False)
# Initialize figure size of plot
plt.figure(figsize = (10, int(np.max([10, p_vals.shape[0]/4]))))
    # Plot horizontal bars of the p-values
ax = p_vals.plot(kind = 'barh', color = 'b', alpha = 0.25,
error_kw = {'ecolor':'r'})
# Set x-range
plt.xlim([-0.01 * np.max([p_vals.max(), alpha]),
1.01 * np.max([p_vals.max(), alpha])])
    # Draw vertical line at the significance level
plt.axvline(alpha, linewidth = 1, color = 'r', linestyle = 'dotted')
# Make the y-axis invisible
ax.get_yaxis().set_visible(False)
    # Place the feature name at the center of each bar
for bar, feature_name in zip(ax.patches, p_vals.index):
ax.text(bar.get_width()/2, bar.get_y() + bar.get_height()/2,
feature_name, ha = 'center', va = 'center', color = 'black')
if title is not None:
# Give plot title
plt.title(title)
# If filename is defined...
if filename is not None:
# ... save the plot
plt.savefig(filename, dpi = 100)
# Show the plot
plt.show()
# Close figure
plt.close()
#------------------------------------------------------------------------------
def plot_feat_importance(imp, method, title = None, filename = None, **kwargs):
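    """Plot mean feature importances as horizontal bars with standard-error bars.

    For MDI results, also draws a dotted vertical line at 1/n_features, the value
    expected if every feature were equally important.
    """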
# Plot mean imp bars with std
plt.figure(figsize = (10, int(np.max([10, imp.shape[0]/4]))))
# Sort features by mean
imp = imp.sort_values('mean', ascending = True)
    # Plot horizontal bars of mean importance with standard-error bars
ax = imp['mean'].plot(kind = 'barh', color = 'b', alpha = 0.25,
xerr = imp['std'], error_kw = {'ecolor':'r'})
# If MDI...
if method == 'MDI':
# ... set x-range
plt.xlim([-0.01 * imp[['mean', 'std']].sum(axis = 1).max(),
1.01 * imp[['mean', 'std']].sum(axis = 1).max()])
        # ... draw vertical line at the average feature importance
plt.axvline(1.0/imp.shape[0], linewidth = 1, color = 'r',
linestyle = 'dotted')
# Make the y-axis invisible
ax.get_yaxis().set_visible(False)
    # Place the feature name at the center of each bar
for bar, feature_name in zip(ax.patches, imp.index):
ax.text(bar.get_width()/2, bar.get_y() + bar.get_height()/2,
feature_name, ha = 'center', va = 'center', color = 'black')
if title is not None:
# Give plot title
plt.title(title)
# If filename is defined...
if filename is not None:
# ... save the plot
plt.savefig(filename, dpi = 100)
# Show the plot
plt.show()
# Close figure
plt.close()
#------------------------------------------------------------------------------
if __name__ == '__main__':
# Start the clock!
start_time = time.perf_counter()
    # Set random seed
np.random.seed(0)
# Generate the data; in this case we're doing a classification problem
X, y = gen_test_data(n_features = 40, n_informative = 5, n_redundant = 30,
n_samples = 10000, random_state = 0, scale = 0.5)
# Plot the p-values
plot_logistic_pvals(X, y, feat_names = X.columns,
title = '$p$-values\n $\\alpha = 5\%$',
filename = path + 'fig6.1.png')
# Initialize an ML model
clf = DecisionTreeClassifier(criterion = 'entropy', max_features = 1,
class_weight = 'balanced')
    # Using bagged decision trees to be consistent with the book, but other estimators work too
clf = BaggingClassifier(estimator = clf, n_estimators = 1000,
max_features = 1.0, max_samples = 1.0,
oob_score = False)
# Fit ML model
clf.fit(X, y)
# Calculate MDI feature importances
mdi = feat_imp_MDI(clf, feat_names = X.columns)
# Generate plot of feature importances
plot_feat_importance(mdi, method = 'MDI',
title = 'Feature Importance MDI',
filename = path + 'fig6.2.png')
# Calculate MDA feature importances
mda = feat_imp_MDA(clf, X, y, n_splits = 5)
# Generate plot of feature importances
plot_feat_importance(mda, method = 'MDA',
title = 'Feature Importance MDA',
filename = path + 'fig6.3.png')
# Calculate correlations between explanatory variables
corr0 = X.corr()
    # I'm skipping figure 6.4 because I don't want to use Seaborn
# Cluster results
corr1, clusters, silh = four.cluster_kmeans_top(corr0)
# Calculate clustered MDI feature importances
mdi_clustered = feat_imp_MDI_clustered(clf, feat_names = X.columns,
clusters = clusters)
# Generate plot of feature importances
plot_feat_importance(mdi_clustered, method = 'MDI',
title = 'Feature Importance MDI Clustered',
filename = path + 'fig6.5.png')
# Calculate clustered MDA feature importances
mda_clustered = feat_imp_MDA_clustered(clf, X, y, clusters, n_splits = 5)
# Generate plot of feature importances
plot_feat_importance(mda_clustered, method = 'MDA',
                         title = 'Feature Importance MDA Clustered',
filename = path + 'fig6.6.png')
    # Orthogonalize clusters
    X_prime = orthogonalize_clusters(X, clusters)
    # Fit classifier on orthogonalized clusters
clf.fit(X_prime, y)
# Calculate clustered MDI feature importances
mdi_prime_clustered = feat_imp_MDI_clustered(clf, feat_names = X_prime.columns,
clusters = clusters)
# Generate plot of feature importances
plot_feat_importance(mdi_prime_clustered, method = 'MDI',
title = 'Feature Importance MDI Clustered with Modified X')
# Calculate clustered MDA feature importances
mda_prime_clustered = feat_imp_MDA_clustered(clf, X_prime, y, clusters, n_splits = 5)
# Generate plot of feature importances
plot_feat_importance(mda_prime_clustered, method = 'MDA',
title = 'Feature Importance MDA Clustered with Modified X')
#--------------------------------------------------------------------------
# Generate the data; in this case we're doing a regression problem
X, y = gen_test_data(n_features = 40, n_informative = 5, n_redundant = 30,
n_samples = 10000, random_state = 0, scale = 0.5,
is_clf = False)
# Initialize regressor
reg = DecisionTreeRegressor(criterion = 'squared_error', max_features = 1)
    # We're doing bagged regression trees to be consistent with the classification problem above
reg = BaggingRegressor(estimator = reg, n_estimators = 1000,
max_features = 1.0, max_samples = 1.0,
oob_score = False)
# Fit the ML model
reg.fit(X, y)
# Calculate MDI feature importances
mdi_reg = feat_imp_MDI(reg, feat_names = X.columns)
# Generate plot of feature importances
plot_feat_importance(mdi_reg, method = 'MDI',
title = 'Regression Feature Importance MDI')
# Calculate MDA feature importances
mda_reg = reg_feat_imp_MDA(reg, X, y, n_splits = 5)
# Generate plot of feature importances
plot_feat_importance(mda_reg, method = 'MDA',
title = 'Regression Feature Importance MDA')
# Calculate correlations between explanatory variables
corr0 = X.corr()
# Cluster results
corr1, clusters, silh = four.cluster_kmeans_top(corr0)
# Calculate clustered MDI feature importances
mdi_reg_clustered = feat_imp_MDI_clustered(reg, feat_names = X.columns,
clusters = clusters)
# Generate plot of feature importances
plot_feat_importance(mdi_reg_clustered, method = 'MDI',
title = 'Regression Feature Importance MDI Clustered')
# Calculate clustered MDA feature importances
mda_reg_clustered = reg_feat_imp_MDA_clustered(reg, X, y, clusters, n_splits = 5)
# Generate plot of feature importances
plot_feat_importance(mda_reg_clustered, method = 'MDA',
                         title = 'Regression Feature Importance MDA Clustered')
    # Orthogonalize clusters
    X_prime = orthogonalize_clusters(X, clusters)
    # Fit regression on orthogonalized clusters
reg.fit(X_prime, y)
# Calculate clustered MDI feature importances
mdi_prime_clustered = feat_imp_MDI_clustered(reg, feat_names = X_prime.columns,
clusters = clusters)
# Generate plot of feature importances
plot_feat_importance(mdi_prime_clustered, method = 'MDI',
title = 'Regression Feature Importance MDI Clustered with Modified X')
# Calculate clustered MDA feature importances
mda_prime_clustered = reg_feat_imp_MDA_clustered(reg, X_prime, y, clusters = clusters,
n_splits = 5)
# Generate plot of feature importances
plot_feat_importance(mda_prime_clustered, method = 'MDA',
title = 'Regression Feature Importance MDA Clustered with Modified X')
print(f'This program took {(time.perf_counter() - start_time)/60:.2f} minutes.')