Commit 802597a

try hyperparameter tuning
try hyperparameter tuning and increasing folds from 5 to 10
1 parent 1fd16c6 commit 802597a

2 files changed (+84, −32 lines)
examples/kaggle_pipeline.py

+69 −20

@@ -4,10 +4,11 @@
 import numpy as np
 import pandas as pd
 from boruta import BorutaPy
+from scipy.stats import randint, uniform
 from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
 from sklearn.feature_selection import SelectFromModel, mutual_info_regression
 from sklearn.impute import SimpleImputer
-from sklearn.model_selection import KFold, cross_val_score
+from sklearn.model_selection import KFold, RandomizedSearchCV, cross_val_score
 
 warnings.filterwarnings("ignore")
 
@@ -93,11 +94,14 @@ def boruta_selection(self, X, y):
 
 
 class KaggleSubmissionPipeline:
-    def __init__(self, model=None):
-        self.model = (
-            model
-            if model
-            else HistGradientBoostingRegressor(
+    def __init__(self, model=None, tune_hyperparameters=True):
+        self.tune_hyperparameters = tune_hyperparameters
+        if model is not None:
+            self.model = model
+        elif tune_hyperparameters:
+            self.model = self._get_tuned_model()
+        else:
+            self.model = HistGradientBoostingRegressor(
                 max_iter=1000,
                 learning_rate=0.1,
                 max_depth=None,

@@ -109,11 +113,34 @@ def __init__(self, model=None):
                 n_iter_no_change=20,
                 verbose=1,
             )
-        )
         self.cv_scores = []
         self.feature_selector = FeatureSelector()
 
-    def evaluate_feature_selection_methods(self, X_train, y_train, cv=5):
+    def _get_tuned_model(self):
+        """Perform hyperparameter tuning using RandomizedSearchCV"""
+        param_distributions = {
+            "learning_rate": uniform(0.01, 0.09),
+            "max_depth": randint(5, 20),
+            "min_samples_leaf": randint(10, 50),
+            "l2_regularization": uniform(0.5, 4.5),
+        }
+
+        base_model = HistGradientBoostingRegressor(
+            max_iter=1000, early_stopping=True, validation_fraction=0.1, n_iter_no_change=20, random_state=42, verbose=0
+        )
+
+        return RandomizedSearchCV(
+            base_model,
+            param_distributions,
+            n_iter=20,
+            cv=10,
+            scoring="neg_mean_absolute_error",
+            n_jobs=-1,
+            random_state=42,
+            verbose=2,
+        )
+
+    def evaluate_feature_selection_methods(self, X_train, y_train, cv=10):
         """Compare different feature selection methods"""
         results = {}
         methods = {
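A note on the search space defined in _get_tuned_model above: scipy.stats parameterises these distributions as (loc, scale) and (low, high), so the ranges actually sampled differ slightly from what the arguments suggest. A minimal sketch, not part of the commit, spelling out the effective ranges:

from scipy.stats import randint, uniform

# uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from low (inclusive) to high (exclusive).
param_distributions = {
    "learning_rate": uniform(0.01, 0.09),    # continuous values in [0.01, 0.10]
    "max_depth": randint(5, 20),             # integers 5..19
    "min_samples_leaf": randint(10, 50),     # integers 10..49
    "l2_regularization": uniform(0.5, 4.5),  # continuous values in [0.5, 5.0]
}

# RandomizedSearchCV draws one value per parameter for each of its n_iter candidates:
print({name: dist.rvs(random_state=0) for name, dist in param_distributions.items()})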
@@ -127,6 +154,13 @@ def evaluate_feature_selection_methods(self, X_train, y_train, cv=5):
             logger.info(f"\nEvaluating {name}...")
             try:
                 X_selected = method(X_train.copy(), y_train)
+                if self.tune_hyperparameters and hasattr(self.model, "fit"):
+                    logger.info("Performing hyperparameter tuning...")
+                    self.model.fit(X_selected, y_train)
+                    logger.info("Best parameters found:")
+                    logger.info(self.model.best_params_)
+                    logger.info(f"Best cross-validation MAE: {-self.model.best_score_:.4f}")
+
                 scores = self._get_cv_scores(X_selected, y_train, cv)
                 results[name] = {
                     "n_features": X_selected.shape[1],

@@ -168,7 +202,7 @@ def select_features(self, X_train, y_train, X_test, method="model_based"):
         X_test_selected = X_test[selected_features]
         return X_train_selected, X_test_selected
 
-    def validate_model(self, X_train, y_train, cv=5):
+    def validate_model(self, X_train, y_train, cv=10):
         """Perform cross-validation and print results"""
         scores = self._get_cv_scores(X_train, y_train, cv)
         self.cv_scores = scores
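Worth keeping in mind for the evaluate_feature_selection_methods change above: when tune_hyperparameters is True, self.model is the unfitted RandomizedSearchCV returned by _get_tuned_model, so the self.model.fit(X_selected, y_train) call runs the whole search inside the feature-selection loop. With n_iter=20 and cv=10 that is 200 candidate fits per method, plus a refit of the best configuration (scikit-learn's refit=True default), before _get_cv_scores runs its own cross-validation pass on top.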
@@ -178,16 +212,30 @@
     def train_and_predict(self, X_train, y_train, X_test):
         """Train model and generate predictions"""
         logger.info("Training final model...")
-        self.model.fit(X_train, y_train)
-        predictions = self.model.predict(X_test)
 
-        # Calculate feature importances if available
-        if hasattr(self.model, "feature_importances_"):
-            importances = pd.DataFrame(
-                {"feature": X_train.columns, "importance": self.model.feature_importances_}
-            ).sort_values("importance", ascending=False)
-            logger.info("\nTop 10 most important features:")
-            logger.info(importances.head(10))
+        if self.tune_hyperparameters and hasattr(self.model, "best_estimator_"):
+            logger.info("Using best model from hyperparameter tuning...")
+            # Get the best model from tuning
+            best_model = self.model.best_estimator_
+            # Fit it on the full training data
+            best_model.fit(X_train, y_train)
+            predictions = best_model.predict(X_test)
+
+            # Log best parameters for reference
+            logger.info("Best parameters used:")
+            logger.info(self.model.best_params_)
+        else:
+            # If not tuning hyperparameters, use the base model
+            self.model.fit(X_train, y_train)
+            predictions = self.model.predict(X_test)
+
+        # Calculate feature importances if available
+        if hasattr(self.model, "feature_importances_"):
+            importances = pd.DataFrame(
+                {"feature": X_train.columns, "importance": self.model.feature_importances_}
+            ).sort_values("importance", ascending=False)
+            logger.info("\nTop 10 most important features:")
+            logger.info(importances.head(10))
 
         return predictions
 
@@ -208,14 +256,15 @@ def prepare_submission(
     model=None,
     evaluate_features=True,
     feature_method="model_based",
+    tune_hyperparameters=True,
 ):
-    """Complete pipeline with feature selection"""
+    """Complete pipeline with feature selection and hyperparameter tuning"""
    # Remove rows where target is NaN
     mask = ~y_train.isna()
     X_train = X_train[mask]
     y_train = y_train[mask]
 
-    pipeline = KaggleSubmissionPipeline(model)
+    pipeline = KaggleSubmissionPipeline(model, tune_hyperparameters=tune_hyperparameters)
 
     logger.info("Dataset information:")
     logger.info(f"Training data shape: {X_train.shape}")
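For reference, a self-contained sketch of the flow this commit sets up: fit the RandomizedSearchCV, read best_params_ and the sign-flipped MAE, then refit best_estimator_ and predict, mirroring evaluate_feature_selection_methods and train_and_predict above. It uses synthetic data and smaller search settings; all names below are local to the sketch, not part of the pipeline code.

from scipy.stats import randint, uniform
from sklearn.datasets import make_regression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

# Toy regression problem standing in for the Kaggle training data.
X, y = make_regression(n_samples=400, n_features=8, noise=5.0, random_state=0)
X_train, X_test = X[:300], X[300:]
y_train = y[:300]

search = RandomizedSearchCV(
    HistGradientBoostingRegressor(max_iter=200, random_state=42),
    {"learning_rate": uniform(0.01, 0.09), "max_depth": randint(5, 20)},
    n_iter=5,                              # kept small for the sketch; the commit uses 20
    cv=3,                                  # the commit uses 10
    scoring="neg_mean_absolute_error",
    random_state=42,
)
search.fit(X_train, y_train)

print(search.best_params_)                         # what the pipeline logs as "Best parameters found"
print(f"best CV MAE: {-search.best_score_:.4f}")   # sign flipped back, as in the pipeline's logging

best_model = search.best_estimator_        # already refit on X_train because refit=True
best_model.fit(X_train, y_train)           # train_and_predict refits it on the final training set
predictions = best_model.predict(X_test)
print(predictions[:5])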

examples/kelmarsh_kaggle.py

+15 −12

@@ -396,11 +396,11 @@ def save_t1_detrend_dfs(assessment_inputs: AssessmentInputs) -> None:
 X_train = train_df.drop(columns=[target_column])
 X_test = test_df
 
-# First make a copy of the timestamp column to work with
+# make a copy of the timestamp column to work with
 timestamp_train = pd.to_datetime(X_train[("Timestamp", "Unnamed: 1_level_1")])
 timestamp_test = pd.to_datetime(X_test[("Timestamp", "Unnamed: 1_level_1")])
 
-# Create multiple time-based features
+# create time-based features
 time_features_train = pd.DataFrame(
     {
         ("Time", "hour"): timestamp_train.dt.hour,
@@ -467,21 +467,24 @@ def save_t1_detrend_dfs(assessment_inputs: AssessmentInputs) -> None:
 X_train = X_train[mask]
 y_train = y_train[mask]
 
-# First, evaluate all feature selection methods
-pipeline = prepare_submission(
-    X_train=X_train,
-    y_train=y_train,
-    X_test=X_test,
-    sample_submission_path=ANALYSIS_OUTPUT_DIR / "sample_submission.csv",
-    evaluate_features=True,
-)
+evaluate_features = False
+if evaluate_features:
+    pipeline = prepare_submission(
+        X_train=X_train,
+        y_train=y_train,
+        X_test=X_test,
+        sample_submission_path=DATA_DIR / "sample_submission.csv",
+        evaluate_features=True,
+    )
 
 # Then, use the best method for your final model
 pipeline = prepare_submission(
     X_train=X_train,
     y_train=y_train,
     X_test=X_test,
-    sample_submission_path=ANALYSIS_OUTPUT_DIR / "sample_submission.csv",
+    sample_submission_path=DATA_DIR / "sample_submission.csv",
+    output_path=ANALYSIS_OUTPUT_DIR / f"submission_{pd.Timestamp.now():%Y%m%d_%H%M%S}.csv",
     evaluate_features=False,
-    feature_method="model_based",  # or 'mutual_info' or 'boruta'
+    feature_method="all_features",  # or 'mutual_info' or 'model_based' or 'boruta'
+    tune_hyperparameters=True,  # enable hyperparameter tuning
 )
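One detail in the final call above: the new output_path embeds a timestamp so repeated runs don't overwrite earlier submissions. A small sketch of the filename pattern (the directory name here is a placeholder, not the project's actual constant):

from pathlib import Path
import pandas as pd

ANALYSIS_OUTPUT_DIR = Path("analysis_output")  # placeholder for the project's real output directory
output_path = ANALYSIS_OUTPUT_DIR / f"submission_{pd.Timestamp.now():%Y%m%d_%H%M%S}.csv"
print(output_path)  # e.g. analysis_output/submission_20250101_093045.csv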
