 import numpy as np
 import pandas as pd
 from boruta import BorutaPy
+from scipy.stats import randint, uniform
 from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
 from sklearn.feature_selection import SelectFromModel, mutual_info_regression
 from sklearn.impute import SimpleImputer
-from sklearn.model_selection import KFold, cross_val_score
+from sklearn.model_selection import KFold, RandomizedSearchCV, cross_val_score

 warnings.filterwarnings("ignore")

@@ -93,11 +94,14 @@ def boruta_selection(self, X, y):


 class KaggleSubmissionPipeline:
-    def __init__(self, model=None):
-        self.model = (
-            model
-            if model
-            else HistGradientBoostingRegressor(
+    def __init__(self, model=None, tune_hyperparameters=True):
+        self.tune_hyperparameters = tune_hyperparameters
+        if model is not None:
+            self.model = model
+        elif tune_hyperparameters:
+            self.model = self._get_tuned_model()
+        else:
+            self.model = HistGradientBoostingRegressor(
                 max_iter=1000,
                 learning_rate=0.1,
                 max_depth=None,
@@ -109,11 +113,34 @@ def __init__(self, model=None):
                 n_iter_no_change=20,
                 verbose=1,
             )
-        )
         self.cv_scores = []
         self.feature_selector = FeatureSelector()

-    def evaluate_feature_selection_methods(self, X_train, y_train, cv=5):
+    def _get_tuned_model(self):
+        """Perform hyperparameter tuning using RandomizedSearchCV"""
+        param_distributions = {
+            "learning_rate": uniform(0.01, 0.09),
+            "max_depth": randint(5, 20),
+            "min_samples_leaf": randint(10, 50),
+            "l2_regularization": uniform(0.5, 4.5),
+        }
+
+        base_model = HistGradientBoostingRegressor(
+            max_iter=1000, early_stopping=True, validation_fraction=0.1, n_iter_no_change=20, random_state=42, verbose=0
+        )
+
+        return RandomizedSearchCV(
+            base_model,
+            param_distributions,
+            n_iter=20,
+            cv=10,
+            scoring="neg_mean_absolute_error",
+            n_jobs=-1,
+            random_state=42,
+            verbose=2,
+        )
+
+    def evaluate_feature_selection_methods(self, X_train, y_train, cv=10):
         """Compare different feature selection methods"""
         results = {}
         methods = {
@@ -127,6 +154,13 @@ def evaluate_feature_selection_methods(self, X_train, y_train, cv=5):
             logger.info(f"\nEvaluating {name}...")
             try:
                 X_selected = method(X_train.copy(), y_train)
+                if self.tune_hyperparameters and hasattr(self.model, "fit"):
+                    logger.info("Performing hyperparameter tuning...")
+                    self.model.fit(X_selected, y_train)
+                    logger.info("Best parameters found:")
+                    logger.info(self.model.best_params_)
+                    logger.info(f"Best cross-validation MAE: {-self.model.best_score_:.4f}")
+
                 scores = self._get_cv_scores(X_selected, y_train, cv)
                 results[name] = {
                     "n_features": X_selected.shape[1],
@@ -168,7 +202,7 @@ def select_features(self, X_train, y_train, X_test, method="model_based"):
         X_test_selected = X_test[selected_features]
         return X_train_selected, X_test_selected

-    def validate_model(self, X_train, y_train, cv=5):
+    def validate_model(self, X_train, y_train, cv=10):
         """Perform cross-validation and print results"""
         scores = self._get_cv_scores(X_train, y_train, cv)
         self.cv_scores = scores
@@ -178,16 +212,30 @@ def validate_model(self, X_train, y_train, cv=5):
     def train_and_predict(self, X_train, y_train, X_test):
         """Train model and generate predictions"""
         logger.info("Training final model...")
-        self.model.fit(X_train, y_train)
-        predictions = self.model.predict(X_test)

-        # Calculate feature importances if available
-        if hasattr(self.model, "feature_importances_"):
-            importances = pd.DataFrame(
-                {"feature": X_train.columns, "importance": self.model.feature_importances_}
-            ).sort_values("importance", ascending=False)
-            logger.info("\nTop 10 most important features:")
-            logger.info(importances.head(10))
+        if self.tune_hyperparameters and hasattr(self.model, "best_estimator_"):
+            logger.info("Using best model from hyperparameter tuning...")
+            # Get the best model from tuning
+            best_model = self.model.best_estimator_
+            # Fit it on the full training data
+            best_model.fit(X_train, y_train)
+            predictions = best_model.predict(X_test)
+
+            # Log best parameters for reference
+            logger.info("Best parameters used:")
+            logger.info(self.model.best_params_)
+        else:
+            # If not tuning hyperparameters, use the base model
+            self.model.fit(X_train, y_train)
+            predictions = self.model.predict(X_test)
+
+        # Calculate feature importances if available
+        if hasattr(self.model, "feature_importances_"):
+            importances = pd.DataFrame(
+                {"feature": X_train.columns, "importance": self.model.feature_importances_}
+            ).sort_values("importance", ascending=False)
+            logger.info("\nTop 10 most important features:")
+            logger.info(importances.head(10))

         return predictions

@@ -208,14 +256,15 @@ def prepare_submission(
     model=None,
     evaluate_features=True,
     feature_method="model_based",
+    tune_hyperparameters=True,
 ):
-    """Complete pipeline with feature selection"""
+    """Complete pipeline with feature selection and hyperparameter tuning"""
     # Remove rows where target is NaN
     mask = ~y_train.isna()
     X_train = X_train[mask]
     y_train = y_train[mask]

-    pipeline = KaggleSubmissionPipeline(model)
+    pipeline = KaggleSubmissionPipeline(model, tune_hyperparameters=tune_hyperparameters)

     logger.info("Dataset information:")
     logger.info(f"Training data shape: {X_train.shape}")