Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Regularization and tidy up #7

Merged
merged 5 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 16 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
# Repository created from the dev portal

Owner: data-scientists

Slack channels: #shap-select

## Table of Contents

- [Overview](#overview)

## Overview
`shap-select` implements a heuristic to do fast feature selection for tabular regression and classification models.

The basic idea is to run a linear or logistic regression of the target on the Shapley values computed on the validation set,
discard the features with negative coefficients, and rank/filter the remaining features according to their
statistical significance. For motivation and details, see the [example notebook](https://github.com/transferwise/shap-select/blob/main/docs/Quick%20feature%20selection%20through%20regression%20on%20Shapley%20values.ipynb).

A library for feature selection for gradient boosting models using regression on feature Shapley values
Earlier packages using Shapley values for feature selection exist; the advantages of this one are:
* Regression on the **validation set** to combat overfitting
* A single pass regression, not an iterative approach
* A single intuitive hyperparameter for feature selection: statistical significance
* Bonferroni correction for multiclass classification
## Usage
```python
from shap_select import shap_select
# Here model is any model supported by the shap library, fitted on a different (train) dataset
selected_features_df = shap_select(model, X_val, y_val, task="multiclass", threshold=0.05)
```
530 changes: 228 additions & 302 deletions docs/Quick feature selection through regression on Shapley values.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions docs/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import xgboost as xgb
from sklearn.model_selection import train_test_split

from shap_select import score_features
from shap_select import shap_select

# Generate a dataset with 8 normally distributed features and a target based on a given formula
np.random.seed(42)
Expand Down Expand Up @@ -83,7 +83,7 @@


# Call the shap_select function
selected_features_df, shap_features = score_features(
selected_features_df, shap_features = shap_select(
model, X_val, X.columns.tolist(), y_val
)

Expand Down
5 changes: 5 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pandas
scikit_learn
scipy
shap
statsmodels
2 changes: 1 addition & 1 deletion shap_select/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .select import score_features
from .select import shap_select
67 changes: 52 additions & 15 deletions shap_select/select.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from typing import Any, Tuple, List, Dict

import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.genmod.families import Binomial
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats
import shap

Expand All @@ -22,7 +24,7 @@ def create_shap_features(
- pd.DataFrame: A DataFrame containing the SHAP values for each feature in the `validation_df`, where each column
corresponds to the SHAP values of a feature, and the rows match the index of the `validation_df`.
"""
explainer = shap.TreeExplainer(tree_model, model_output="raw")(validation_df)
explainer = shap.Explainer(tree_model, model_output="raw")(validation_df)
shap_values = explainer.values

if len(shap_values.shape) == 2:
Expand Down Expand Up @@ -64,10 +66,32 @@ def binary_classifier_significance(
"""

# Add a constant to the features for the intercept in logistic regression
shap_features_with_const = sm.add_constant(shap_features)

# Fit the logistic regression model
logit_model = sm.Logit(target, shap_features_with_const)
# Standardizing the features (Logistic regression with L1 regularization tends to
# work better with standardized data)
shap_features_scaled = pd.DataFrame(
data=StandardScaler().fit_transform(shap_features),
columns=shap_features.columns,
)
shap_features_with_const = sm.add_constant(shap_features_scaled)

# To avoid linear dependence of features, first do a pass with tiny L1-reg
# and throw away the zero coeffs
# Define the Logistic Regression model with L1 regularization
logistic_l1 = LogisticRegression(
penalty="l1", solver="liblinear", fit_intercept=False, C=1e6
) # C is the inverse of regularization strength
logistic_l1.fit(shap_features_with_const, target)

# Get the coefficients from the Logistic Regression model
# Logistic regression gives an array of shape (1, n_features), so we take [0]
coefficients = logistic_l1.coef_[0]
shap_features_filtered = sm.add_constant(shap_features).loc[
:, np.abs(coefficients) > 1e-6
]

# Fit the logistic regression model that will generate confidence intervals
logit_model = sm.Logit(target, shap_features_filtered)
result = logit_model.fit(disp=False)

# Extract the results
Expand Down Expand Up @@ -154,8 +178,16 @@ def regression_significance(
- stderr: The standard error for each coefficient.
- stat.significance: The p-value (statistical significance) for each feature.
"""
# Fit the linear regression model
ols_model = sm.OLS(target, shap_features)

# To avoid collinearity of features, first do a pass with tiny L1-reg
# and throw away the zero coeffs
shap_features_scaled = StandardScaler().fit_transform(shap_features)
coefficients = Lasso(alpha=1e-6).fit(shap_features_scaled, target).coef_
shap_features_filtered = shap_features.loc[:, np.abs(coefficients) > 1e-6]

# Regularized models tend not to produce confidence intervals, so
# fit a plain linear regression on the filtered features to obtain them
ols_model = sm.OLS(target, shap_features_filtered)
result = ols_model.fit()

# Extract the results
Expand Down Expand Up @@ -219,14 +251,14 @@ def shap_features_to_significance(
return result_df_sorted


def score_features(
def shap_select(
tree_model: Any,
validation_df: pd.DataFrame,
feature_names: List[str],
target: pd.Series | str, # str is column name in validation_df
feature_names: List[str] | None = None,
task: str | None = None,
threshold: float = 0.05,
return_shap_features: bool = False,
return_extended_data: bool = False,
) -> pd.DataFrame | Tuple[pd.DataFrame, pd.DataFrame]:
"""
Select features based on their SHAP values and statistical significance.
Expand All @@ -238,7 +270,7 @@ def score_features(
- target (pd.Series | str): The target values, or the name of the target column in `validation_df`.
- task (str | None): The task type ('regression', 'binary', or 'multi'). If None, it is inferred automatically.
- threshold (float): Significance threshold to select features. Default is 0.05.
- return_shap_features (bool): Whether to also return the shapley values dataframe(s)
- return_extended_data (bool): Whether to also return the shapley values dataframe(s) and some extra columns

Returns:
- pd.DataFrame: A DataFrame containing the feature names, statistical significance, and a 'Selected' column
Expand All @@ -248,6 +280,9 @@ def score_features(
if isinstance(target, str):
target = validation_df[target]

if feature_names is None:
feature_names = validation_df.columns.tolist()

# Infer the task if not provided
if task is None:
if pd.api.types.is_numeric_dtype(target) and target.nunique() > 10:
Expand All @@ -269,12 +304,14 @@ def score_features(
significance_df = shap_features_to_significance(shap_features, target, task)

# Add 'Selected' column based on the threshold
significance_df["Selected"] = (
significance_df["selected"] = (
significance_df["stat.significance"] < threshold
).astype(int)
significance_df.loc[significance_df["t-value"] < 0, "Selected"] = -1
significance_df.loc[significance_df["t-value"] < 0, "selected"] = -1

if return_shap_features:
if return_extended_data:
return significance_df, shap_features
else:
return significance_df
return significance_df[
["feature name", "t-value", "stat.significance", "coefficient", "selected"]
]
10 changes: 4 additions & 6 deletions tests/test_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import xgboost as xgb
import catboost as cb
from sklearn.model_selection import train_test_split
from shap_select import score_features
from shap_select import shap_select


@pytest.fixture
Expand Down Expand Up @@ -239,16 +239,14 @@ def test_selected_column_values(model_type, data_fixture, task_type, request):
raise ValueError("Unsupported model type")

# Call the shap_select function for the correct task (regression, binary, multiclass)
selected_features_df = score_features(
model, X_val, X_val.columns.tolist(), y_val, task=task_type
)
selected_features_df = shap_select(model, X_val, y_val, task=task_type)

# Check feature significance for all task types
selected_rows = selected_features_df[
selected_features_df["feature name"].isin(["x7", "x8", "x9"])
]
assert (
selected_rows["Selected"] <= 0
selected_rows["selected"] <= 0
).all(), (
"The Selected column must have negative or zero values for features x7, x8, x9"
)
Expand All @@ -257,5 +255,5 @@ def test_selected_column_values(model_type, data_fixture, task_type, request):
~selected_features_df["feature name"].isin(["x7", "x8", "x9", "const"])
]
assert (
other_features_rows["Selected"] > 0
other_features_rows["selected"] > 0
).all(), "The Selected column must have positive values for features other than x7, x8, x9"
Loading