Commit
Merge pull request #7 from transferwise/regularization
Add Regularization and tidy up
EgorKraevTransferwise authored Sep 25, 2024
2 parents 8acef93 + 3e51a12 commit b2b7af5
Showing 7 changed files with 308 additions and 337 deletions.
27 changes: 16 additions & 11 deletions README.md
@@ -1,13 +1,18 @@
# Repository created from the dev portal

Owner: data-scientists

Slack channels: #shap-select

## Table of Contents

- [Overview](#overview)

## Overview
`shap-select` implements a heuristic for fast feature selection on tabular regression and classification models.

The basic idea is to run a linear or logistic regression of the target on the Shapley values computed on the validation set, discard the features with negative coefficients, and rank/filter the rest by their statistical significance. For motivation and details, see the [example notebook](https://github.com/transferwise/shap-select/blob/main/docs/Quick%20feature%20selection%20through%20regression%20on%20Shapley%20values.ipynb).
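
To make the heuristic concrete, here is a minimal sketch of the idea (an editorial illustration, not the library's implementation; it assumes `xgboost`, `shap`, `statsmodels`, and scikit-learn, and all variable names are made up):

```python
import pandas as pd
import shap
import statsmodels.api as sm
import xgboost as xgb
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Fit a model on the training split only
X, y = make_regression(n_samples=2000, n_features=8, noise=10.0, random_state=0)
X = pd.DataFrame(X, columns=[f"x{i}" for i in range(8)])
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
model = xgb.XGBRegressor(n_estimators=100).fit(X_train, y_train)

# Shapley values computed on the held-out validation set
shap_df = pd.DataFrame(shap.TreeExplainer(model)(X_val).values, columns=X.columns)

# Regress the target on the Shapley values; negative coefficients are
# discarded, the rest are ranked/filtered by statistical significance
result = sm.OLS(y_val, shap_df).fit()
print(result.params.sort_values())
print(result.pvalues)
```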

A library for feature selection in gradient boosting models, using regression on feature Shapley values.
Earlier packages using Shapley values for feature selection exist; the advantages of this one are:
* Regression on the **validation set** to combat overfitting
* A single pass regression, not an iterative approach
* A single intuitive hyperparameter for feature selection: statistical significance
* Bonferroni correction for multiclass classification
## Usage
```python
from shap_select import shap_select
# Here model is any model supported by the shap library, fitted on a different (train) dataset
selected_features_df = shap_select(model, X_val, y_val, task="multiclass", threshold=0.05)
```
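
The returned dataframe has one row per feature, with columns `feature name`, `t-value`, `stat.significance`, `coefficient`, and `selected` (1 = selected, 0 = not significant, -1 = significant but with a negative coefficient, per the `select.py` changes below). A keep-list can then be extracted along these lines (an illustrative sketch):

```python
keep = selected_features_df.loc[
    selected_features_df["selected"] > 0, "feature name"
].tolist()
X_train_reduced = X_train[keep]  # retrain on the selected features only
```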
530 changes: 228 additions & 302 deletions docs/Quick feature selection through regression on Shapley values.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions docs/example.py
@@ -4,7 +4,7 @@
import xgboost as xgb
from sklearn.model_selection import train_test_split

from shap_select import score_features
from shap_select import shap_select

# Generate a dataset with 8 normally distributed features and a target based on a given formula
np.random.seed(42)
@@ -83,7 +83,7 @@


# Call the select_features function
selected_features_df, shap_features = score_features(
selected_features_df, shap_features = shap_select(
model, X_val, X.columns.tolist(), y_val
)

5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
pandas
scikit_learn
scipy
shap
statsmodels
2 changes: 1 addition & 1 deletion shap_select/__init__.py
@@ -1 +1 @@
from .select import score_features
from .select import shap_select
67 changes: 52 additions & 15 deletions shap_select/select.py
@@ -1,8 +1,10 @@
from typing import Any, Tuple, List, Dict

import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.genmod.families import Binomial
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats
import shap

@@ -22,7 +24,7 @@ def create_shap_features(
- pd.DataFrame: A DataFrame containing the SHAP values for each feature in the `validation_df`, where each column
corresponds to the SHAP values of a feature, and the rows match the index of the `validation_df`.
"""
explainer = shap.TreeExplainer(tree_model, model_output="raw")(validation_df)
explainer = shap.Explainer(tree_model, model_output="raw")(validation_df)
shap_values = explainer.values

if len(shap_values.shape) == 2:
@@ -64,10 +66,32 @@ def binary_classifier_significance(
"""

# Add a constant to the features for the intercept in logistic regression
shap_features_with_const = sm.add_constant(shap_features)

# Fit the logistic regression model
logit_model = sm.Logit(target, shap_features_with_const)
# Standardizing the features (Logistic regression with L1 regularization tends to
# work better with standardized data)
shap_features_scaled = pd.DataFrame(
data=StandardScaler().fit_transform(shap_features),
columns=shap_features.columns,
)
shap_features_with_const = sm.add_constant(shap_features_scaled)

# To avoid linear dependence of features, first do a pass with tiny L1-reg
# and throw away the zero coeffs
# Define the Logistic Regression model with L1 regularization
logistic_l1 = LogisticRegression(
penalty="l1", solver="liblinear", fit_intercept=False, C=1e6
) # C is the inverse of regularization strength
logistic_l1.fit(shap_features_with_const, target)

# Get the coefficients from the Logistic Regression model
# Logistic regression gives an array of shape (1, n_features), so we take [0]
coefficients = logistic_l1.coef_[0]
shap_features_filtered = sm.add_constant(shap_features).loc[
:, np.abs(coefficients) > 1e-6
]

# Fit the logistic regression model that will generate confidence intervals
logit_model = sm.Logit(target, shap_features_filtered)
result = logit_model.fit(disp=False)

# Extract the results
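
The tiny-L1 first pass above exists because `sm.Logit` needs a full-rank design matrix: with linearly dependent SHAP columns the Hessian can be singular and the fit can fail. A toy reproduction of the pattern (an editorial sketch, not part of this commit; assumes scikit-learn and statsmodels):

```python
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
x1 = rng.normal(size=500)
X = pd.DataFrame({"x1": x1, "x2": 2 * x1})  # x2 is perfectly collinear with x1
y = (x1 + rng.normal(scale=0.5, size=500) > 0).astype(int)

# A nearly-unregularized L1 fit typically zeroes out one of the dependent
# columns, so the follow-up sm.Logit fit is well-posed
l1 = LogisticRegression(penalty="l1", solver="liblinear", C=1e6, fit_intercept=False)
coefficients = l1.fit(X, y).coef_[0]
X_kept = X.loc[:, np.abs(coefficients) > 1e-6]
print(sm.Logit(y, X_kept).fit(disp=False).summary())
```

The `regression_significance` hunk below applies the same idea, with `Lasso(alpha=1e-6)` standing in for the L1 logistic pass before `sm.OLS`.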
@@ -154,8 +178,16 @@
- stderr: The standard error for each coefficient.
- stat.significance: The p-value (statistical significance) for each feature.
"""
# Fit the linear regression model
ols_model = sm.OLS(target, shap_features)

# To avoid collinearity of features, first do a pass with tiny L1-reg
# and throw away the zero coeffs
shap_features_scaled = StandardScaler().fit_transform(shap_features)
coefficients = Lasso(alpha=1e-6).fit(shap_features_scaled, target).coef_
shap_features_filtered = shap_features.loc[:, np.abs(coefficients) > 1e-6]

# Sadly regularized models tend to not produce confidence intervals, so
# Fit the linear regression model that will generate confidence intervals
ols_model = sm.OLS(target, shap_features_filtered)
result = ols_model.fit()

# Extract the results
@@ -219,14 +251,14 @@ def shap_features_to_significance(
return result_df_sorted


def score_features(
def shap_select(
tree_model: Any,
validation_df: pd.DataFrame,
feature_names: List[str],
target: pd.Series | str, # str is column name in validation_df
feature_names: List[str] | None = None,
task: str | None = None,
threshold: float = 0.05,
return_shap_features: bool = False,
return_extended_data: bool = False,
) -> pd.DataFrame | Tuple[pd.DataFrame, pd.DataFrame]:
"""
Select features based on their SHAP values and statistical significance.
@@ -238,7 +270,7 @@ def score_features(
- target (pd.Series | str): The target values, or the name of the target column in `validation_df`.
- task (str | None): The task type ('regression', 'binary', or 'multi'). If None, it is inferred automatically.
- threshold (float): Significance threshold to select features. Default is 0.05.
- return_shap_features (bool): Whether to also return the shapley values dataframe(s)
- return_extended_data (bool): Whether to also return the shapley values dataframe(s) and some extra columns
Returns:
- pd.DataFrame: A DataFrame containing the feature names, statistical significance, and a 'Selected' column
@@ -248,6 +280,9 @@ def score_features(
if isinstance(target, str):
target = validation_df[target]

if feature_names is None:
feature_names = validation_df.columns.tolist()

# Infer the task if not provided
if task is None:
if pd.api.types.is_numeric_dtype(target) and target.nunique() > 10:
@@ -269,12 +304,14 @@ def score_features(
significance_df = shap_features_to_significance(shap_features, target, task)

# Add 'Selected' column based on the threshold
significance_df["Selected"] = (
significance_df["selected"] = (
significance_df["stat.significance"] < threshold
).astype(int)
significance_df.loc[significance_df["t-value"] < 0, "Selected"] = -1
significance_df.loc[significance_df["t-value"] < 0, "selected"] = -1

if return_shap_features:
if return_extended_data:
return significance_df, shap_features
else:
return significance_df
return significance_df[
["feature name", "t-value", "stat.significance", "coefficient", "selected"]
]
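
Putting the signature change together: `target` may now be passed as a column name of `validation_df`, `feature_names` defaults to all of its columns, and `return_extended_data=True` returns the Shapley-values dataframe alongside the significance table. A usage sketch of the renamed API (the column name `"y"` is illustrative):

```python
significance_df, shap_features = shap_select(
    model,          # fitted tree model, as in the README above
    validation_df,  # held-out data containing features and target
    target="y",     # may also be a pd.Series
    task="binary",  # inferred automatically when omitted
    return_extended_data=True,
)
```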
10 changes: 4 additions & 6 deletions tests/test_regression.py
@@ -5,7 +5,7 @@
import xgboost as xgb
import catboost as cb
from sklearn.model_selection import train_test_split
from shap_select import score_features
from shap_select import shap_select


@pytest.fixture
@@ -239,16 +241,14 @@ def test_selected_column_values(model_type, data_fixture, task_type, request):
raise ValueError("Unsupported model type")

# Call the score_features function for the correct task (regression, binary, multiclass)
selected_features_df = score_features(
model, X_val, X_val.columns.tolist(), y_val, task=task_type
)
selected_features_df = shap_select(model, X_val, y_val, task=task_type)

# Check feature significance for all task types
selected_rows = selected_features_df[
selected_features_df["feature name"].isin(["x7", "x8", "x9"])
]
assert (
selected_rows["Selected"] <= 0
selected_rows["selected"] <= 0
).all(), (
"The Selected column must have negative or zero values for features x7, x8, x9"
)
@@ -257,5 +255,5 @@ def test_selected_column_values(model_type, data_fixture, task_type, request):
~selected_features_df["feature name"].isin(["x7", "x8", "x9", "const"])
]
assert (
other_features_rows["Selected"] > 0
other_features_rows["selected"] > 0
).all(), "The Selected column must have positive values for features other than x7, x8, x9"
