From bb27da18a3c82610618972def7bb5937c2a6c083 Mon Sep 17 00:00:00 2001
From: "Egor.Kraev"
Date: Thu, 26 Sep 2024 17:22:36 +0100
Subject: [PATCH] Properly call statsmodels.fit_regularized()

---
 README.md             |  1 +
 docs/bug.py           |  0
 requirements.txt      |  1 +
 shap_select/select.py | 42 ++++++------------------------------------
 4 files changed, 8 insertions(+), 36 deletions(-)
 create mode 100644 docs/bug.py

diff --git a/README.md b/README.md
index 4610030..1933886 100644
--- a/README.md
+++ b/README.md
@@ -14,5 +14,6 @@ Earlier packages using Shapley values for feature selection exist, the advantage
 ```python
 from shap_select import shap_select
 # Here model is any model supported by the shap library, fitted on a different (train) dataset
+# Task can be regression, binary, or multiclass
 selected_features_df = shap_select(model, X_val, y_val, task="multiclass", threshold=0.05)
 ```
\ No newline at end of file
diff --git a/docs/bug.py b/docs/bug.py
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
index 4abb286..1aca3b7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ scikit_learn
 scipy
 shap
 statsmodels
+numpy
diff --git a/shap_select/select.py b/shap_select/select.py
index f803207..68993cb 100644
--- a/shap_select/select.py
+++ b/shap_select/select.py
@@ -66,33 +66,11 @@ def binary_classifier_significance(
     """
 
     # Add a constant to the features for the intercept in logistic regression
-
-    # Standardizing the features (Logistic regression with L1 regularization tends to
-    # work better with standardized data)
-    shap_features_scaled = pd.DataFrame(
-        data=StandardScaler().fit_transform(shap_features),
-        columns=shap_features.columns,
-    )
-    shap_features_with_const = sm.add_constant(shap_features_scaled)
-
-    # To avoid linear dependence of features, first do a pass with tiny L1-reg
-    # and throw away the zero coeffs
-    # Define the Logistic Regression model with L1 regularization
-    logistic_l1 = LogisticRegression(
-        penalty="l1", solver="liblinear", fit_intercept=False, C=1e6
-    )  # C is the inverse of regularization strength
-    logistic_l1.fit(shap_features_with_const, target)
-
-    # Get the coefficients from the Logistic Regression model
-    # Logistic regression gives an array of shape (1, n_features), so we take [0]
-    coefficients = logistic_l1.coef_[0]
-    shap_features_filtered = sm.add_constant(shap_features).loc[
-        :, np.abs(coefficients) > 1e-6
-    ]
+    shap_features_with_constant = sm.add_constant(shap_features)
 
     # Fit the logistic regression model that will generate confidence intervals
-    logit_model = sm.Logit(target, shap_features_filtered)
-    result = logit_model.fit(disp=False)
+    logit_model = sm.Logit(target, shap_features_with_constant)
+    result = logit_model.fit_regularized(disp=False, alpha=1e-6)
 
     # Extract the results
     summary_frame = result.summary2().tables[1]
@@ -178,17 +156,9 @@ def regression_significance(
     - stderr: The standard error for each coefficient.
     - stat.significance: The p-value (statistical significance) for each feature.
     """
-
-    # To avoid collinearity of features, first do a pass with tiny L1-reg
-    # and throw away the zero coeffs
-    shap_features_scaled = StandardScaler().fit_transform(shap_features)
-    coefficients = Lasso(alpha=1e-6).fit(shap_features_scaled, target).coef_
-    shap_features_filtered = shap_features.loc[:, np.abs(coefficients) > 1e-6]
-
-    # Sadly regularized models tend to not produce confidence intervals, so
     # Fit the linear regression model that will generate confidence intervals
-    ols_model = sm.OLS(target, shap_features_filtered)
-    result = ols_model.fit()
+    ols_model = sm.OLS(target, shap_features)
+    result = ols_model.fit_regularized(alpha=1e-6, refit=True)
 
     # Extract the results
     summary_frame = result.summary2().tables[1]
@@ -268,7 +238,7 @@ def shap_select(
     - validation_df (pd.DataFrame): Validation dataset containing the features.
     - feature_names (List[str]): A list of feature names used by the model.
     - target (pd.Series | str): The target values, or the name of the target column in `validation_df`.
-    - task (str | None): The task type ('regression', 'binary', or 'multi'). If None, it is inferred automatically.
+    - task (str | None): The task type ('regression', 'binary', or 'multiclass'). If None, it is inferred automatically.
     - threshold (float): Significance threshold to select features. Default is 0.05.
     - return_extended_data (bool): Whether to also return the shapley values dataframe(s) and some extra columns