From 12e96bd4b958da4902e8a46c1864ecdc30f0e7b3 Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Tue, 24 Sep 2024 19:52:36 +0100 Subject: [PATCH 1/4] add an l1 regularization stage before regression to kill collinearity --- ...through regression on Shapley values.ipynb | 8 ++-- docs/example.py | 4 +- requirements.txt | 5 ++ shap_select/__init__.py | 2 +- shap_select/select.py | 46 ++++++++++++++++--- tests/test_regression.py | 4 +- 6 files changed, 53 insertions(+), 16 deletions(-) create mode 100644 requirements.txt diff --git a/docs/Quick feature selection through regression on Shapley values.ipynb b/docs/Quick feature selection through regression on Shapley values.ipynb index 777d7e6..aeeefbf 100644 --- a/docs/Quick feature selection through regression on Shapley values.ipynb +++ b/docs/Quick feature selection through regression on Shapley values.ipynb @@ -427,14 +427,14 @@ "import os, sys\n", "\n", "try:\n", - " from shap_select import score_features\n", + " from shap_select import shap_select\n", "except ModuleNotFoundError:\n", " # If you're running shap_select from source\n", " root = os.path.realpath(\"..\")\n", " sys.path.append(root)\n", - " from shap_select import score_features\n", + " from shap_select import shap_select\n", "\n", - "selected_features_df = score_features(\n", + "selected_features_df = shap_select(\n", " model, X_val, X_val.columns.tolist(), y_val, task=\"regression\", threshold=0.05\n", ")\n", "\n", @@ -1469,7 +1469,7 @@ } ], "source": [ - "selected_features_df = score_features(\n", + "selected_features_df = shap_select(\n", " model, X_val, X_val.columns.tolist(), y_val, task=\"multiclass\", threshold=0.05\n", ")\n", "\n", diff --git a/docs/example.py b/docs/example.py index d446c38..0aa99b9 100644 --- a/docs/example.py +++ b/docs/example.py @@ -4,7 +4,7 @@ import xgboost as xgb from sklearn.model_selection import train_test_split -from shap_select import score_features +from shap_select import shap_select # Generate a dataset with 8 normally distributed features and a target based on a given formula np.random.seed(42) @@ -83,7 +83,7 @@ # Call the select_features function -selected_features_df, shap_features = score_features( +selected_features_df, shap_features = shap_select( model, X_val, X.columns.tolist(), y_val ) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4abb286 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +pandas +scikit_learn +scipy +shap +statsmodels diff --git a/shap_select/__init__.py b/shap_select/__init__.py index 9d26d4d..937bd85 100644 --- a/shap_select/__init__.py +++ b/shap_select/__init__.py @@ -1 +1 @@ -from .select import score_features +from .select import shap_select diff --git a/shap_select/select.py b/shap_select/select.py index 4a89c96..365eeef 100644 --- a/shap_select/select.py +++ b/shap_select/select.py @@ -1,8 +1,10 @@ from typing import Any, Tuple, List, Dict import pandas as pd +import numpy as np import statsmodels.api as sm -from statsmodels.genmod.families import Binomial +from sklearn.linear_model import Lasso, LogisticRegression +from sklearn.preprocessing import StandardScaler import scipy.stats as stats import shap @@ -64,10 +66,32 @@ def binary_classifier_significance( """ # Add a constant to the features for the intercept in logistic regression - shap_features_with_const = sm.add_constant(shap_features) - # Fit the logistic regression model - logit_model = sm.Logit(target, shap_features_with_const) + # Standardizing the features (Logistic regression with L1 regularization 
tends to + # work better with standardized data) + shap_features_scaled = pd.DataFrame( + data=StandardScaler().fit_transform(shap_features), + columns=shap_features.columns, + ) + shap_features_with_const = sm.add_constant(shap_features_scaled) + + # To avoid linear dependence of features, first do a pass with tiny L1-reg + # and throw away the zero coeffs + # Define the Logistic Regression model with L1 regularization + logistic_l1 = LogisticRegression( + penalty="l1", solver="liblinear", fit_intercept=False, C=1e6 + ) # C is the inverse of regularization strength + logistic_l1.fit(shap_features_with_const, target) + + # Get the coefficients from the Logistic Regression model + # Logistic regression gives an array of shape (1, n_features), so we take [0] + coefficients = logistic_l1.coef_[0] + shap_features_filtered = sm.add_constant(shap_features).loc[ + :, np.abs(coefficients) > 1e-6 + ] + + # Fit the logistic regression model that will generate confidence intervals + logit_model = sm.Logit(target, shap_features_filtered) result = logit_model.fit(disp=False) # Extract the results @@ -154,8 +178,16 @@ def regression_significance( - stderr: The standard error for each coefficient. - stat.significance: The p-value (statistical significance) for each feature. """ - # Fit the linear regression model - ols_model = sm.OLS(target, shap_features) + + # To avoid collinearity of features, first do a pass with tiny L1-reg + # and throw away the zero coeffs + shap_features_scaled = StandardScaler().fit_transform(shap_features) + coefficients = Lasso(alpha=1e-6).fit(shap_features_scaled, target).coef_ + shap_features_filtered = shap_features.loc[:, np.abs(coefficients) > 1e-6] + + # Sadly regularized models tend to not produce confidence intervals, so + # Fit the linear regression model that will generate confidence intervals + ols_model = sm.OLS(target, shap_features_filtered) result = ols_model.fit() # Extract the results @@ -219,7 +251,7 @@ def shap_features_to_significance( return result_df_sorted -def score_features( +def shap_select( tree_model: Any, validation_df: pd.DataFrame, feature_names: List[str], diff --git a/tests/test_regression.py b/tests/test_regression.py index 4b5d29e..a25568e 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -5,7 +5,7 @@ import xgboost as xgb import catboost as cb from sklearn.model_selection import train_test_split -from shap_select import score_features +from shap_select import shap_select @pytest.fixture @@ -239,7 +239,7 @@ def test_selected_column_values(model_type, data_fixture, task_type, request): raise ValueError("Unsupported model type") # Call the score_features function for the correct task (regression, binary, multiclass) - selected_features_df = score_features( + selected_features_df = shap_select( model, X_val, X_val.columns.tolist(), y_val, task=task_type ) From fc820b285e3ca5fdba11042c4b2becae39548d81 Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Tue, 24 Sep 2024 21:49:22 +0100 Subject: [PATCH 2/4] Support any model shap supports --- shap_select/select.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shap_select/select.py b/shap_select/select.py index 365eeef..fe0203e 100644 --- a/shap_select/select.py +++ b/shap_select/select.py @@ -24,7 +24,7 @@ def create_shap_features( - pd.DataFrame: A DataFrame containing the SHAP values for each feature in the `validation_df`, where each column corresponds to the SHAP values of a feature, and the rows match the index of the `validation_df`. 
""" - explainer = shap.TreeExplainer(tree_model, model_output="raw")(validation_df) + explainer = shap.Explainer(tree_model, model_output="raw")(validation_df) shap_values = explainer.values if len(shap_values.shape) == 2: From f2f4966bdb65e60b5b7ed4cf2520413a47b4ea3d Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Wed, 25 Sep 2024 09:16:09 +0100 Subject: [PATCH 3/4] Minor tidying-up --- README.md | 28 +- ...through regression on Shapley values.ipynb | 530 ++++++++---------- shap_select/select.py | 19 +- tests/test_regression.py | 8 +- 4 files changed, 261 insertions(+), 324 deletions(-) diff --git a/README.md b/README.md index e5ec68d..dc10600 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,21 @@ -# Repository created from the dev portal - -Owner: data-scientists - -Slack channels: #shap-select - -## Table of Contents - -- [Overview](#overview) +A library for fast feature selection for gradient boosting models using regression on feature Shapley values. +Unlike ## Overview +`shap-select` implements a heuristic to do fast feature selection for tabular regression and classification models. + +The basic idea is running a linear or logistic regression of the Shapley values on the target on the validation set, +discarding the features with negative coefficients, and ranking/filtering the rest according to their +statistical significance. For details, see the [example notebook](https://github.com/transferwise/shap-select/blob/main/docs/Quick%20feature%20selection%20through%20regression%20on%20Shapley%20values.ipynb) -A library for feature selection for gradient boosting models using regression on feature Shapley values +Earlier packages using Shapley values for feature selection exist, the advantages of this one are +* Regression on the **validation set** to combat overfitting +* A single pass regression calculation, not iterative +* A single intuitive hyperparameter for feature selection: statistical significance +* Bonferroni correction for multiclass classification +## Usage +```python +from shap_select import shap_select +# Here model is any model supported by the shap library, fitted on a different (train) dataset +selected_features_df = shap_select(model, X_val, y_val, task="multiclass", threshold=0.05) +``` \ No newline at end of file diff --git a/docs/Quick feature selection through regression on Shapley values.ipynb b/docs/Quick feature selection through regression on Shapley values.ipynb index aeeefbf..d441efb 100644 --- a/docs/Quick feature selection through regression on Shapley values.ipynb +++ b/docs/Quick feature selection through regression on Shapley values.ipynb @@ -25,14 +25,33 @@ { "cell_type": "code", "execution_count": 1, - "id": "348c2468", + "id": "51cd6a7d", "metadata": {}, "outputs": [], "source": [ + "import os, sys\n", + "from typing import List\n", + "\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "\n", + "try:\n", + " from shap_select import shap_select\n", + "except ModuleNotFoundError:\n", + " # If you're running shap_select from source\n", + " root = os.path.realpath(\"..\")\n", + " sys.path.append(root)\n", + " from shap_select import shap_select" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "348c2468", + "metadata": {}, + "outputs": [], + "source": [ "np.random.seed(42)\n", "n_samples = 100000\n", "\n", @@ -80,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "ec03ff2c", "metadata": {}, "outputs": [ @@ -186,268 +205,206 @@ }, { 
"cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "8f403fc5", "metadata": {}, + "outputs": [], + "source": [ + "selected_features_df = shap_select(model, X_val, y_val, task=\"regression\", threshold=0.05)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9fe28e6b", + "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "\n", + "
[styled DataFrame output; HTML markup not preserved. The removed output's table had columns: feature name, coefficient, stderr, stat.significance, t-value, closeness to 1.0, Selected. The added output's table reads:

   feature name    t-value  stat.significance  coefficient  selected
0            x5  20.211299           0.000000     1.052030          1
1            x4  18.315144           0.000000     0.952416          1
2            x3   6.835690           0.000000     1.098154          1
3            x2   6.457140           0.000000     1.044842          1
4            x1   5.530556           0.000000     0.917242          1
5            x6   2.390868           0.016827     1.497983          1
6            x7   0.901098           0.367558     2.865508          0
7            x8   0.563214           0.573302     1.933632          0
8            x9  -1.607814           0.107908    -4.537098         -1
]
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import os, sys\n", - "\n", - "try:\n", - " from shap_select import shap_select\n", - "except ModuleNotFoundError:\n", - " # If you're running shap_select from source\n", - " root = os.path.realpath(\"..\")\n", - " sys.path.append(root)\n", - " from shap_select import shap_select\n", - "\n", - "selected_features_df = shap_select(\n", - " model, X_val, X_val.columns.tolist(), y_val, task=\"regression\", threshold=0.05\n", - ")\n", - "\n", "# Let's color the output prettily\n", - "styled_df = selected_features_df.style.background_gradient(\n", - " cmap='coolwarm', subset=pd.IndexSlice[:, ['coefficient', \n", - " 'stderr', \n", - " 'stat.significance', \n", - " 't-value', \n", - " 'closeness to 1.0', \n", - " 'Selected']]\n", - ")\n", - "styled_df" + "def prettify(df: pd.DataFrame, exclude: List[str]):\n", + " styled_df = df.style.background_gradient(\n", + " cmap='coolwarm', subset=pd.IndexSlice[:, [c for i,c in enumerate(df.columns) if c not in exclude]]\n", + " )\n", + " return styled_df\n", + "\n", + "prettify(selected_features_df, exclude=[\"feature name\"])" ] }, { @@ -470,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "1412da7f", "metadata": {}, "outputs": [ @@ -1207,7 +1164,8 @@ "[714]\tvalid-mlogloss:0.03015\n", "[715]\tvalid-mlogloss:0.03018\n", "[716]\tvalid-mlogloss:0.03018\n", - "[717]\tvalid-mlogloss:0.03016\n" + "[717]\tvalid-mlogloss:0.03016\n", + "[718]\tvalid-mlogloss:0.03014\n" ] } ], @@ -1265,223 +1223,191 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "743d6988", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\EgorKraev\\miniconda3\\envs\\llm3.11\\Lib\\site-packages\\sklearn\\svm\\_base.py:1235: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", + " warnings.warn(\n" + ] + }, { "data": { "text/html": [ "\n", - "\n", + "
[styled DataFrame output; HTML markup not preserved. The removed output's table had columns: feature name, t-value, closeness to 1.0, coefficient, stat.significance, Selected. The added output's table reads:

   feature name    t-value  stat.significance  coefficient  selected
0            x4  25.927565           0.000000     1.559384          1
1            x5  25.874027           0.000000     1.571661          1
2            x6  25.782536           0.000000     1.561214          1
3            x2  21.367053           0.000000     1.753463          1
4            x3  21.330803           0.000000     1.792630          1
5            x1  12.835856           0.000000     2.197310          1
6            x7   0.773525           0.658817     1.901079          0
7            x9  -0.206328           1.745198    -0.317295         -1
8            x8  -0.636902           2.213717    -1.259370         -1
]
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 5, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "selected_features_df = shap_select(\n", - " model, X_val, X_val.columns.tolist(), y_val, task=\"multiclass\", threshold=0.05\n", - ")\n", + "selected_features_df = shap_select(model, X_val, y_val, task=\"multiclass\", threshold=0.05)\n", "\n", - "# Let's color the output prettily\n", - "styled_df = selected_features_df.style.background_gradient(\n", - " cmap='coolwarm', subset=pd.IndexSlice[:, ['coefficient', \n", - " 'stat.significance', \n", - " 't-value', \n", - " 'closeness to 1.0', \n", - " 'Selected']]\n", - ")\n", - "styled_df" + "prettify(selected_features_df, exclude=[\"feature name\"])" ] }, { diff --git a/shap_select/select.py b/shap_select/select.py index fe0203e..f803207 100644 --- a/shap_select/select.py +++ b/shap_select/select.py @@ -254,11 +254,11 @@ def shap_features_to_significance( def shap_select( tree_model: Any, validation_df: pd.DataFrame, - feature_names: List[str], target: pd.Series | str, # str is column name in validation_df + feature_names: List[str] | None = None, task: str | None = None, threshold: float = 0.05, - return_shap_features: bool = False, + return_extended_data: bool = False, ) -> pd.DataFrame | Tuple[pd.DataFrame, pd.DataFrame]: """ Select features based on their SHAP values and statistical significance. @@ -270,7 +270,7 @@ def shap_select( - target (pd.Series | str): The target values, or the name of the target column in `validation_df`. - task (str | None): The task type ('regression', 'binary', or 'multi'). If None, it is inferred automatically. - threshold (float): Significance threshold to select features. Default is 0.05. - - return_shap_features (bool): Whether to also return the shapley values dataframe(s) + - return_extended_data (bool): Whether to also return the shapley values dataframe(s) and some extra columns Returns: - pd.DataFrame: A DataFrame containing the feature names, statistical significance, and a 'Selected' column @@ -280,6 +280,9 @@ def shap_select( if isinstance(target, str): target = validation_df[target] + if feature_names is None: + feature_names = validation_df.columns.tolist() + # Infer the task if not provided if task is None: if pd.api.types.is_numeric_dtype(target) and target.nunique() > 10: @@ -301,12 +304,14 @@ def shap_select( significance_df = shap_features_to_significance(shap_features, target, task) # Add 'Selected' column based on the threshold - significance_df["Selected"] = ( + significance_df["selected"] = ( significance_df["stat.significance"] < threshold ).astype(int) - significance_df.loc[significance_df["t-value"] < 0, "Selected"] = -1 + significance_df.loc[significance_df["t-value"] < 0, "selected"] = -1 - if return_shap_features: + if return_extended_data: return significance_df, shap_features else: - return significance_df + return significance_df[ + ["feature name", "t-value", "stat.significance", "coefficient", "selected"] + ] diff --git a/tests/test_regression.py b/tests/test_regression.py index a25568e..d5ef6da 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -239,16 +239,14 @@ def test_selected_column_values(model_type, data_fixture, task_type, request): raise ValueError("Unsupported model type") # Call the score_features function for the correct task (regression, binary, multiclass) - selected_features_df = shap_select( - model, X_val, X_val.columns.tolist(), y_val, task=task_type - ) + selected_features_df = 
shap_select(model, X_val, y_val, task=task_type) # Check feature significance for all task types selected_rows = selected_features_df[ selected_features_df["feature name"].isin(["x7", "x8", "x9"]) ] assert ( - selected_rows["Selected"] <= 0 + selected_rows["selected"] <= 0 ).all(), ( "The Selected column must have negative or zero values for features x7, x8, x9" ) @@ -257,5 +255,5 @@ def test_selected_column_values(model_type, data_fixture, task_type, request): ~selected_features_df["feature name"].isin(["x7", "x8", "x9", "const"]) ] assert ( - other_features_rows["Selected"] > 0 + other_features_rows["selected"] > 0 ).all(), "The Selected column must have positive values for features other than x7, x8, x9" From 31cc53a2e08a79be3da0d4a0b65f735338b4cf77 Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Wed, 25 Sep 2024 09:21:40 +0100 Subject: [PATCH 4/4] Edit README.md --- README.md | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index dc10600..4610030 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,13 @@ -A library for fast feature selection for gradient boosting models using regression on feature Shapley values. -Unlike - ## Overview `shap-select` implements a heuristic to do fast feature selection for tabular regression and classification models. -The basic idea is running a linear or logistic regression of the Shapley values on the target on the validation set, +The basic idea is running a linear or logistic regression of the target on the Shapley values on the validation set, discarding the features with negative coefficients, and ranking/filtering the rest according to their -statistical significance. For details, see the [example notebook](https://github.com/transferwise/shap-select/blob/main/docs/Quick%20feature%20selection%20through%20regression%20on%20Shapley%20values.ipynb) +statistical significance. For motivation and details, see the [example notebook](https://github.com/transferwise/shap-select/blob/main/docs/Quick%20feature%20selection%20through%20regression%20on%20Shapley%20values.ipynb) Earlier packages using Shapley values for feature selection exist, the advantages of this one are * Regression on the **validation set** to combat overfitting -* A single pass regression calculation, not iterative +* A single pass regression, not an iterative approach * A single intuitive hyperparameter for feature selection: statistical significance * Bonferroni correction for multiclass classification ## Usage