Commit
Merge pull request #7 from transferwise/regularization
Add Regularization and tidy up
EgorKraevTransferwise authored Sep 25, 2024
2 parents 8acef93 + 3e51a12 commit b2b7af5
Showing 7 changed files with 308 additions and 337 deletions.
27 changes: 16 additions & 11 deletions README.md
@@ -1,13 +1,18 @@
# Repository created from the dev portal

Owner: data-scientists

Slack channels: #shap-select

## Table of Contents

- [Overview](#overview)

## Overview
`shap-select` implements a heuristic for fast feature selection on tabular regression and classification models.

The basic idea is to run a linear or logistic regression of the target on the Shapley values computed on the validation set, discard the features with negative coefficients, and rank/filter the rest by their statistical significance. For motivation and details, see the [example notebook](https://github.com/transferwise/shap-select/blob/main/docs/Quick%20feature%20selection%20through%20regression%20on%20Shapley%20values.ipynb).
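
To make the heuristic concrete, here is a minimal sketch of the idea (an editorial illustration, not the library's implementation; it assumes `xgboost`, `shap`, `statsmodels`, and scikit-learn, and all variable names are made up):

```python
import pandas as pd
import shap
import statsmodels.api as sm
import xgboost as xgb
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Fit a model on the training split only
X, y = make_regression(n_samples=2000, n_features=8, noise=10.0, random_state=0)
X = pd.DataFrame(X, columns=[f"x{i}" for i in range(8)])
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
model = xgb.XGBRegressor(n_estimators=100).fit(X_train, y_train)

# Shapley values computed on the held-out validation set
shap_df = pd.DataFrame(shap.TreeExplainer(model)(X_val).values, columns=X.columns)

# Regress the target on the Shapley values; negative coefficients are
# discarded, the rest are ranked/filtered by statistical significance
result = sm.OLS(y_val, shap_df).fit()
print(result.params.sort_values())
print(result.pvalues)
```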

A library for feature selection in gradient boosting models, using regression on feature Shapley values.
Earlier packages using Shapley values for feature selection exist; the advantages of this one are:
* Regression on the **validation set** to combat overfitting
* A single pass regression, not an iterative approach
* A single intuitive hyperparameter for feature selection: statistical significance
* Bonferroni correction for multiclass classification
## Usage
```python
from shap_select import shap_select
# Here model is any model supported by the shap library, fitted on a different (train) dataset
selected_features_df = shap_select(model, X_val, y_val, task="multiclass", threshold=0.05)
```
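
The returned dataframe has one row per feature, with columns `feature name`, `t-value`, `stat.significance`, `coefficient`, and `selected` (1 = selected, 0 = not significant, -1 = significant but with a negative coefficient, per the `select.py` changes below). A keep-list can then be extracted along these lines (an illustrative sketch):

```python
keep = selected_features_df.loc[
    selected_features_df["selected"] > 0, "feature name"
].tolist()
X_train_reduced = X_train[keep]  # retrain on the selected features only
```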
530 changes: 228 additions & 302 deletions docs/Quick feature selection through regression on Shapley values.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions docs/example.py
@@ -4,7 +4,7 @@
import xgboost as xgb
from sklearn.model_selection import train_test_split

from shap_select import score_features
from shap_select import shap_select

# Generate a dataset with 8 normally distributed features and a target based on a given formula
np.random.seed(42)
@@ -83,7 +83,7 @@


# Call the select_features function
selected_features_df, shap_features = score_features(
selected_features_df, shap_features = shap_select(
model, X_val, X.columns.tolist(), y_val
)

5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
pandas
scikit_learn
scipy
shap
statsmodels
2 changes: 1 addition & 1 deletion shap_select/__init__.py
@@ -1 +1 @@
from .select import score_features
from .select import shap_select
67 changes: 52 additions & 15 deletions shap_select/select.py
@@ -1,8 +1,10 @@
from typing import Any, Tuple, List, Dict

import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.genmod.families import Binomial
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats
import shap

@@ -22,7 +24,7 @@ def create_shap_features(
- pd.DataFrame: A DataFrame containing the SHAP values for each feature in the `validation_df`, where each column
corresponds to the SHAP values of a feature, and the rows match the index of the `validation_df`.
"""
explainer = shap.TreeExplainer(tree_model, model_output="raw")(validation_df)
explainer = shap.Explainer(tree_model, model_output="raw")(validation_df)
shap_values = explainer.values

if len(shap_values.shape) == 2:
@@ -64,10 +66,32 @@ def binary_classifier_significance(
"""

# Add a constant to the features for the intercept in logistic regression
shap_features_with_const = sm.add_constant(shap_features)

# Fit the logistic regression model
logit_model = sm.Logit(target, shap_features_with_const)
# Standardizing the features (Logistic regression with L1 regularization tends to
# work better with standardized data)
shap_features_scaled = pd.DataFrame(
data=StandardScaler().fit_transform(shap_features),
columns=shap_features.columns,
)
shap_features_with_const = sm.add_constant(shap_features_scaled)

# To avoid linear dependence of features, first do a pass with tiny L1-reg
# and throw away the zero coeffs
# Define the Logistic Regression model with L1 regularization
logistic_l1 = LogisticRegression(
penalty="l1", solver="liblinear", fit_intercept=False, C=1e6
) # C is the inverse of regularization strength
logistic_l1.fit(shap_features_with_const, target)

# Get the coefficients from the Logistic Regression model
# Logistic regression gives an array of shape (1, n_features), so we take [0]
coefficients = logistic_l1.coef_[0]
shap_features_filtered = sm.add_constant(shap_features).loc[
:, np.abs(coefficients) > 1e-6
]

# Fit the logistic regression model that will generate confidence intervals
logit_model = sm.Logit(target, shap_features_filtered)
result = logit_model.fit(disp=False)

# Extract the results
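
The tiny-L1 first pass above exists because `sm.Logit` needs a full-rank design matrix: with linearly dependent SHAP columns the Hessian can be singular and the fit can fail. A toy reproduction of the pattern (an editorial sketch, not part of this commit; assumes scikit-learn and statsmodels):

```python
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
x1 = rng.normal(size=500)
X = pd.DataFrame({"x1": x1, "x2": 2 * x1})  # x2 is perfectly collinear with x1
y = (x1 + rng.normal(scale=0.5, size=500) > 0).astype(int)

# A nearly-unregularized L1 fit typically zeroes out one of the dependent
# columns, so the follow-up sm.Logit fit is well-posed
l1 = LogisticRegression(penalty="l1", solver="liblinear", C=1e6, fit_intercept=False)
coefficients = l1.fit(X, y).coef_[0]
X_kept = X.loc[:, np.abs(coefficients) > 1e-6]
print(sm.Logit(y, X_kept).fit(disp=False).summary())
```

The `regression_significance` hunk below applies the same idea, with `Lasso(alpha=1e-6)` standing in for the L1 logistic pass before `sm.OLS`.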
@@ -154,8 +178,16 @@
- stderr: The standard error for each coefficient.
- stat.significance: The p-value (statistical significance) for each feature.
"""
# Fit the linear regression model
ols_model = sm.OLS(target, shap_features)

# To avoid collinearity of features, first do a pass with tiny L1-reg
# and throw away the zero coeffs
shap_features_scaled = StandardScaler().fit_transform(shap_features)
coefficients = Lasso(alpha=1e-6).fit(shap_features_scaled, target).coef_
shap_features_filtered = shap_features.loc[:, np.abs(coefficients) > 1e-6]

# Sadly regularized models tend to not produce confidence intervals, so
# Fit the linear regression model that will generate confidence intervals
ols_model = sm.OLS(target, shap_features_filtered)
result = ols_model.fit()

# Extract the results
@@ -219,14 +251,14 @@ def shap_features_to_significance(
return result_df_sorted


def score_features(
def shap_select(
tree_model: Any,
validation_df: pd.DataFrame,
feature_names: List[str],
target: pd.Series | str, # str is column name in validation_df
feature_names: List[str] | None = None,
task: str | None = None,
threshold: float = 0.05,
return_shap_features: bool = False,
return_extended_data: bool = False,
) -> pd.DataFrame | Tuple[pd.DataFrame, pd.DataFrame]:
"""
Select features based on their SHAP values and statistical significance.
@@ -238,7 +270,7 @@ def score_features(
- target (pd.Series | str): The target values, or the name of the target column in `validation_df`.
- task (str | None): The task type ('regression', 'binary', or 'multi'). If None, it is inferred automatically.
- threshold (float): Significance threshold to select features. Default is 0.05.
- return_shap_features (bool): Whether to also return the shapley values dataframe(s)
- return_extended_data (bool): Whether to also return the shapley values dataframe(s) and some extra columns
Returns:
- pd.DataFrame: A DataFrame containing the feature names, statistical significance, and a 'Selected' column
@@ -248,6 +280,9 @@ def score_features(
if isinstance(target, str):
target = validation_df[target]

if feature_names is None:
feature_names = validation_df.columns.tolist()

# Infer the task if not provided
if task is None:
if pd.api.types.is_numeric_dtype(target) and target.nunique() > 10:
@@ -269,12 +304,14 @@ def score_features(
significance_df = shap_features_to_significance(shap_features, target, task)

# Add 'Selected' column based on the threshold
significance_df["Selected"] = (
significance_df["selected"] = (
significance_df["stat.significance"] < threshold
).astype(int)
significance_df.loc[significance_df["t-value"] < 0, "Selected"] = -1
significance_df.loc[significance_df["t-value"] < 0, "selected"] = -1

if return_shap_features:
if return_extended_data:
return significance_df, shap_features
else:
return significance_df
return significance_df[
["feature name", "t-value", "stat.significance", "coefficient", "selected"]
]
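
Putting the signature change together: `target` may now be passed as a column name of `validation_df`, `feature_names` defaults to all of its columns, and `return_extended_data=True` returns the Shapley-values dataframe alongside the significance table. A usage sketch of the renamed API (the column name `"y"` is illustrative):

```python
significance_df, shap_features = shap_select(
    model,          # fitted tree model, as in the README above
    validation_df,  # held-out data containing features and target
    target="y",     # may also be a pd.Series
    task="binary",  # inferred automatically when omitted
    return_extended_data=True,
)
```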
10 changes: 4 additions & 6 deletions tests/test_regression.py
@@ -5,7 +5,7 @@
import xgboost as xgb
import catboost as cb
from sklearn.model_selection import train_test_split
from shap_select import score_features
from shap_select import shap_select


@pytest.fixture
@@ -239,16 +241,14 @@ def test_selected_column_values(model_type, data_fixture, task_type, request):
raise ValueError("Unsupported model type")

# Call the score_features function for the correct task (regression, binary, multiclass)
selected_features_df = score_features(
model, X_val, X_val.columns.tolist(), y_val, task=task_type
)
selected_features_df = shap_select(model, X_val, y_val, task=task_type)

# Check feature significance for all task types
selected_rows = selected_features_df[
selected_features_df["feature name"].isin(["x7", "x8", "x9"])
]
assert (
selected_rows["Selected"] <= 0
selected_rows["selected"] <= 0
).all(), (
"The Selected column must have negative or zero values for features x7, x8, x9"
)
@@ -257,5 +255,5 @@ def test_selected_column_values(model_type, data_fixture, task_type, request):
~selected_features_df["feature name"].isin(["x7", "x8", "x9", "const"])
]
assert (
other_features_rows["Selected"] > 0
other_features_rows["selected"] > 0
).all(), "The Selected column must have positive values for features other than x7, x8, x9"
