Skip to content

Commit

Permalink
Recursive feature elimination to deal with multicollinearity
Browse files Browse the repository at this point in the history
  • Loading branch information
EgorKraevTransferwise committed Oct 2, 2024
1 parent bb27da1 commit 3c0c9d7
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 3 deletions.
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [2024] [Wise PLC]
Copyright 2024 Wise PLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down
Empty file removed docs/bug.py
Empty file.
43 changes: 41 additions & 2 deletions shap_select/select.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,45 @@ def shap_features_to_significance(
return result_df_sorted


def iterative_shap_feature_reduction(
    shap_features: pd.DataFrame | dict[Any, pd.DataFrame],
    target: pd.Series,
    task: str,
) -> pd.DataFrame:
    """Rank features by repeatedly eliminating the one with the lowest t-value.

    On every pass the significance table is recomputed with
    ``shap_features_to_significance`` for the features that are still present.
    The row with the smallest ``"t-value"`` is recorded, and the feature named
    in that row's ``"feature name"`` field is dropped before the next pass.
    The loop stops as soon as no feature columns remain, so one row is
    recorded per pass.

    Args:
        shap_features: Either a single DataFrame with one column per feature,
            or a dict whose values are such DataFrames. The non-DataFrame
            branch relies on ``.items()`` / ``.values()``, so a plain list is
            not supported. The caller's object is never mutated; every pass
            works on a freshly built copy without the removed column.
        target: Forwarded unchanged to ``shap_features_to_significance``.
        task: Forwarded unchanged to ``shap_features_to_significance``.

    Returns:
        A DataFrame built from the recorded rows, sorted by ``"t-value"`` in
        descending order. Because ``reset_index()`` is called without
        ``drop=True``, the labels the rows carried in their originating
        significance tables are kept as an extra column.
    """
    collected_rows = []  # one recorded row per elimination pass

    while True:
        # Re-evaluate significance using only the features that are left.
        significance_df = shap_features_to_significance(shap_features, target, task)

        # The weakest feature of this pass is the one with the lowest t-value.
        weakest_row = significance_df.loc[significance_df["t-value"].idxmin()]
        collected_rows.append(weakest_row)

        # Drop that feature so the next pass is computed without it.
        feature_to_remove = weakest_row["feature name"]
        if isinstance(shap_features, pd.DataFrame):
            shap_features = shap_features.drop(columns=[feature_to_remove])
            remaining = len(shap_features.columns)
        else:
            shap_features = {
                key: frame.drop(columns=[feature_to_remove])
                for key, frame in shap_features.items()
            }
            # The same column is removed from every frame, so the first frame
            # is used to decide whether any features are left.
            # NOTE(review): assumes all frames share the same columns - confirm.
            remaining = len(next(iter(shap_features.values())).columns)

        if not remaining:
            break

    # Convert the collected rows back to a dataframe, strongest feature first.
    # NOTE(review): reset_index() (no drop=True) keeps the old row labels as
    # an extra column; left as-is in case callers rely on it.
    result_df = (
        pd.DataFrame(collected_rows)
        .sort_values(by="t-value", ascending=False)
        .reset_index()
    )

    return result_df


def shap_select(
tree_model: Any,
validation_df: pd.DataFrame,
Expand Down Expand Up @@ -270,8 +309,8 @@ def shap_select(
else:
shap_features = create_shap_features(tree_model, validation_df[feature_names])

# Compute statistical significance of each feature
significance_df = shap_features_to_significance(shap_features, target, task)
# Compute statistical significance of each feature, iteratively removing the least significant feature and re-evaluating the rest
significance_df = iterative_shap_feature_reduction(shap_features, target, task)

# Add 'Selected' column based on the threshold
significance_df["selected"] = (
Expand Down

0 comments on commit 3c0c9d7

Please sign in to comment.