From 12e96bd4b958da4902e8a46c1864ecdc30f0e7b3 Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Tue, 24 Sep 2024 19:52:36 +0100 Subject: [PATCH 1/4] add an l1 regularization stage before regression to kill collinearity --- ...through regression on Shapley values.ipynb | 8 ++-- docs/example.py | 4 +- requirements.txt | 5 ++ shap_select/__init__.py | 2 +- shap_select/select.py | 46 ++++++++++++++++--- tests/test_regression.py | 4 +- 6 files changed, 53 insertions(+), 16 deletions(-) create mode 100644 requirements.txt diff --git a/docs/Quick feature selection through regression on Shapley values.ipynb b/docs/Quick feature selection through regression on Shapley values.ipynb index 777d7e6..aeeefbf 100644 --- a/docs/Quick feature selection through regression on Shapley values.ipynb +++ b/docs/Quick feature selection through regression on Shapley values.ipynb @@ -427,14 +427,14 @@ "import os, sys\n", "\n", "try:\n", - " from shap_select import score_features\n", + " from shap_select import shap_select\n", "except ModuleNotFoundError:\n", " # If you're running shap_select from source\n", " root = os.path.realpath(\"..\")\n", " sys.path.append(root)\n", - " from shap_select import score_features\n", + " from shap_select import shap_select\n", "\n", - "selected_features_df = score_features(\n", + "selected_features_df = shap_select(\n", " model, X_val, X_val.columns.tolist(), y_val, task=\"regression\", threshold=0.05\n", ")\n", "\n", @@ -1469,7 +1469,7 @@ } ], "source": [ - "selected_features_df = score_features(\n", + "selected_features_df = shap_select(\n", " model, X_val, X_val.columns.tolist(), y_val, task=\"multiclass\", threshold=0.05\n", ")\n", "\n", diff --git a/docs/example.py b/docs/example.py index d446c38..0aa99b9 100644 --- a/docs/example.py +++ b/docs/example.py @@ -4,7 +4,7 @@ import xgboost as xgb from sklearn.model_selection import train_test_split -from shap_select import score_features +from shap_select import shap_select # Generate a dataset with 8 normally distributed features and a target based on a given formula np.random.seed(42) @@ -83,7 +83,7 @@ # Call the select_features function -selected_features_df, shap_features = score_features( +selected_features_df, shap_features = shap_select( model, X_val, X.columns.tolist(), y_val ) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4abb286 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +pandas +scikit_learn +scipy +shap +statsmodels diff --git a/shap_select/__init__.py b/shap_select/__init__.py index 9d26d4d..937bd85 100644 --- a/shap_select/__init__.py +++ b/shap_select/__init__.py @@ -1 +1 @@ -from .select import score_features +from .select import shap_select diff --git a/shap_select/select.py b/shap_select/select.py index 4a89c96..365eeef 100644 --- a/shap_select/select.py +++ b/shap_select/select.py @@ -1,8 +1,10 @@ from typing import Any, Tuple, List, Dict import pandas as pd +import numpy as np import statsmodels.api as sm -from statsmodels.genmod.families import Binomial +from sklearn.linear_model import Lasso, LogisticRegression +from sklearn.preprocessing import StandardScaler import scipy.stats as stats import shap @@ -64,10 +66,32 @@ def binary_classifier_significance( """ # Add a constant to the features for the intercept in logistic regression - shap_features_with_const = sm.add_constant(shap_features) - # Fit the logistic regression model - logit_model = sm.Logit(target, shap_features_with_const) + # Standardizing the features (Logistic regression with L1 regularization 
tends to + # work better with standardized data) + shap_features_scaled = pd.DataFrame( + data=StandardScaler().fit_transform(shap_features), + columns=shap_features.columns, + ) + shap_features_with_const = sm.add_constant(shap_features_scaled) + + # To avoid linear dependence of features, first do a pass with tiny L1-reg + # and throw away the zero coeffs + # Define the Logistic Regression model with L1 regularization + logistic_l1 = LogisticRegression( + penalty="l1", solver="liblinear", fit_intercept=False, C=1e6 + ) # C is the inverse of regularization strength + logistic_l1.fit(shap_features_with_const, target) + + # Get the coefficients from the Logistic Regression model + # Logistic regression gives an array of shape (1, n_features), so we take [0] + coefficients = logistic_l1.coef_[0] + shap_features_filtered = sm.add_constant(shap_features).loc[ + :, np.abs(coefficients) > 1e-6 + ] + + # Fit the logistic regression model that will generate confidence intervals + logit_model = sm.Logit(target, shap_features_filtered) result = logit_model.fit(disp=False) # Extract the results @@ -154,8 +178,16 @@ def regression_significance( - stderr: The standard error for each coefficient. - stat.significance: The p-value (statistical significance) for each feature. """ - # Fit the linear regression model - ols_model = sm.OLS(target, shap_features) + + # To avoid collinearity of features, first do a pass with tiny L1-reg + # and throw away the zero coeffs + shap_features_scaled = StandardScaler().fit_transform(shap_features) + coefficients = Lasso(alpha=1e-6).fit(shap_features_scaled, target).coef_ + shap_features_filtered = shap_features.loc[:, np.abs(coefficients) > 1e-6] + + # Sadly regularized models tend to not produce confidence intervals, so + # Fit the linear regression model that will generate confidence intervals + ols_model = sm.OLS(target, shap_features_filtered) result = ols_model.fit() # Extract the results @@ -219,7 +251,7 @@ def shap_features_to_significance( return result_df_sorted -def score_features( +def shap_select( tree_model: Any, validation_df: pd.DataFrame, feature_names: List[str], diff --git a/tests/test_regression.py b/tests/test_regression.py index 4b5d29e..a25568e 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -5,7 +5,7 @@ import xgboost as xgb import catboost as cb from sklearn.model_selection import train_test_split -from shap_select import score_features +from shap_select import shap_select @pytest.fixture @@ -239,7 +239,7 @@ def test_selected_column_values(model_type, data_fixture, task_type, request): raise ValueError("Unsupported model type") # Call the score_features function for the correct task (regression, binary, multiclass) - selected_features_df = score_features( + selected_features_df = shap_select( model, X_val, X_val.columns.tolist(), y_val, task=task_type ) From fc820b285e3ca5fdba11042c4b2becae39548d81 Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Tue, 24 Sep 2024 21:49:22 +0100 Subject: [PATCH 2/4] Support any model shap supports --- shap_select/select.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shap_select/select.py b/shap_select/select.py index 365eeef..fe0203e 100644 --- a/shap_select/select.py +++ b/shap_select/select.py @@ -24,7 +24,7 @@ def create_shap_features( - pd.DataFrame: A DataFrame containing the SHAP values for each feature in the `validation_df`, where each column corresponds to the SHAP values of a feature, and the rows match the index of the `validation_df`. 
""" - explainer = shap.TreeExplainer(tree_model, model_output="raw")(validation_df) + explainer = shap.Explainer(tree_model, model_output="raw")(validation_df) shap_values = explainer.values if len(shap_values.shape) == 2: From f2f4966bdb65e60b5b7ed4cf2520413a47b4ea3d Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Wed, 25 Sep 2024 09:16:09 +0100 Subject: [PATCH 3/4] Minor tidying-up --- README.md | 28 +- ...through regression on Shapley values.ipynb | 530 ++++++++---------- shap_select/select.py | 19 +- tests/test_regression.py | 8 +- 4 files changed, 261 insertions(+), 324 deletions(-) diff --git a/README.md b/README.md index e5ec68d..dc10600 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,21 @@ -# Repository created from the dev portal - -Owner: data-scientists - -Slack channels: #shap-select - -## Table of Contents - -- [Overview](#overview) +A library for fast feature selection for gradient boosting models using regression on feature Shapley values. +Unlike ## Overview +`shap-select` implements a heuristic to do fast feature selection for tabular regression and classification models. + +The basic idea is running a linear or logistic regression of the Shapley values on the target on the validation set, +discarding the features with negative coefficients, and ranking/filtering the rest according to their +statistical significance. For details, see the [example notebook](https://github.com/transferwise/shap-select/blob/main/docs/Quick%20feature%20selection%20through%20regression%20on%20Shapley%20values.ipynb) -A library for feature selection for gradient boosting models using regression on feature Shapley values +Earlier packages using Shapley values for feature selection exist, the advantages of this one are +* Regression on the **validation set** to combat overfitting +* A single pass regression calculation, not iterative +* A single intuitive hyperparameter for feature selection: statistical significance +* Bonferroni correction for multiclass classification +## Usage +```python +from shap_select import shap_select +# Here model is any model supported by the shap library, fitted on a different (train) dataset +selected_features_df = shap_select(model, X_val, y_val, task="multiclass", threshold=0.05) +``` \ No newline at end of file diff --git a/docs/Quick feature selection through regression on Shapley values.ipynb b/docs/Quick feature selection through regression on Shapley values.ipynb index aeeefbf..d441efb 100644 --- a/docs/Quick feature selection through regression on Shapley values.ipynb +++ b/docs/Quick feature selection through regression on Shapley values.ipynb @@ -25,14 +25,33 @@ { "cell_type": "code", "execution_count": 1, - "id": "348c2468", + "id": "51cd6a7d", "metadata": {}, "outputs": [], "source": [ + "import os, sys\n", + "from typing import List\n", + "\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "\n", + "try:\n", + " from shap_select import shap_select\n", + "except ModuleNotFoundError:\n", + " # If you're running shap_select from source\n", + " root = os.path.realpath(\"..\")\n", + " sys.path.append(root)\n", + " from shap_select import shap_select" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "348c2468", + "metadata": {}, + "outputs": [], + "source": [ "np.random.seed(42)\n", "n_samples = 100000\n", "\n", @@ -80,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "ec03ff2c", "metadata": {}, "outputs": [ @@ -186,268 +205,206 @@ }, { 
"cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "8f403fc5", "metadata": {}, + "outputs": [], + "source": [ + "selected_features_df = shap_select(model, X_val, y_val, task=\"regression\", threshold=0.05)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9fe28e6b", + "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "\n", + "
[styled DataFrame output; HTML markup not preserved. The removed output's table had columns: feature name, coefficient, stderr, stat.significance, t-value, closeness to 1.0, Selected. The added output's table reads:

   feature name    t-value  stat.significance  coefficient  selected
0            x5  20.211299           0.000000     1.052030          1
1            x4  18.315144           0.000000     0.952416          1
2            x3   6.835690           0.000000     1.098154          1
3            x2   6.457140           0.000000     1.044842          1
4            x1   5.530556           0.000000     0.917242          1
5            x6   2.390868           0.016827     1.497983          1
6            x7   0.901098           0.367558     2.865508          0
7            x8   0.563214           0.573302     1.933632          0
8            x9  -1.607814           0.107908    -4.537098         -1
]
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import os, sys\n", - "\n", - "try:\n", - " from shap_select import shap_select\n", - "except ModuleNotFoundError:\n", - " # If you're running shap_select from source\n", - " root = os.path.realpath(\"..\")\n", - " sys.path.append(root)\n", - " from shap_select import shap_select\n", - "\n", - "selected_features_df = shap_select(\n", - " model, X_val, X_val.columns.tolist(), y_val, task=\"regression\", threshold=0.05\n", - ")\n", - "\n", "# Let's color the output prettily\n", - "styled_df = selected_features_df.style.background_gradient(\n", - " cmap='coolwarm', subset=pd.IndexSlice[:, ['coefficient', \n", - " 'stderr', \n", - " 'stat.significance', \n", - " 't-value', \n", - " 'closeness to 1.0', \n", - " 'Selected']]\n", - ")\n", - "styled_df" + "def prettify(df: pd.DataFrame, exclude: List[str]):\n", + " styled_df = df.style.background_gradient(\n", + " cmap='coolwarm', subset=pd.IndexSlice[:, [c for i,c in enumerate(df.columns) if c not in exclude]]\n", + " )\n", + " return styled_df\n", + "\n", + "prettify(selected_features_df, exclude=[\"feature name\"])" ] }, { @@ -470,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "1412da7f", "metadata": {}, "outputs": [ @@ -1207,7 +1164,8 @@ "[714]\tvalid-mlogloss:0.03015\n", "[715]\tvalid-mlogloss:0.03018\n", "[716]\tvalid-mlogloss:0.03018\n", - "[717]\tvalid-mlogloss:0.03016\n" + "[717]\tvalid-mlogloss:0.03016\n", + "[718]\tvalid-mlogloss:0.03014\n" ] } ], @@ -1265,223 +1223,191 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "743d6988", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\EgorKraev\\miniconda3\\envs\\llm3.11\\Lib\\site-packages\\sklearn\\svm\\_base.py:1235: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", + " warnings.warn(\n" + ] + }, { "data": { "text/html": [ "\n", - "\n", + "
[styled DataFrame output; HTML markup not preserved. The removed output's table had columns: feature name, t-value, closeness to 1.0, coefficient, stat.significance, Selected. The added output's table reads:

   feature name    t-value  stat.significance  coefficient  selected
0            x4  25.927565           0.000000     1.559384          1
1            x5  25.874027           0.000000     1.571661          1
2            x6  25.782536           0.000000     1.561214          1
3            x2  21.367053           0.000000     1.753463          1
4            x3  21.330803           0.000000     1.792630          1
5            x1  12.835856           0.000000     2.197310          1
6            x7   0.773525           0.658817     1.901079          0
7            x9  -0.206328           1.745198    -0.317295         -1
8            x8  -0.636902           2.213717    -1.259370         -1
]
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 5, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "selected_features_df = shap_select(\n", - " model, X_val, X_val.columns.tolist(), y_val, task=\"multiclass\", threshold=0.05\n", - ")\n", + "selected_features_df = shap_select(model, X_val, y_val, task=\"multiclass\", threshold=0.05)\n", "\n", - "# Let's color the output prettily\n", - "styled_df = selected_features_df.style.background_gradient(\n", - " cmap='coolwarm', subset=pd.IndexSlice[:, ['coefficient', \n", - " 'stat.significance', \n", - " 't-value', \n", - " 'closeness to 1.0', \n", - " 'Selected']]\n", - ")\n", - "styled_df" + "prettify(selected_features_df, exclude=[\"feature name\"])" ] }, { diff --git a/shap_select/select.py b/shap_select/select.py index fe0203e..f803207 100644 --- a/shap_select/select.py +++ b/shap_select/select.py @@ -254,11 +254,11 @@ def shap_features_to_significance( def shap_select( tree_model: Any, validation_df: pd.DataFrame, - feature_names: List[str], target: pd.Series | str, # str is column name in validation_df + feature_names: List[str] | None = None, task: str | None = None, threshold: float = 0.05, - return_shap_features: bool = False, + return_extended_data: bool = False, ) -> pd.DataFrame | Tuple[pd.DataFrame, pd.DataFrame]: """ Select features based on their SHAP values and statistical significance. @@ -270,7 +270,7 @@ def shap_select( - target (pd.Series | str): The target values, or the name of the target column in `validation_df`. - task (str | None): The task type ('regression', 'binary', or 'multi'). If None, it is inferred automatically. - threshold (float): Significance threshold to select features. Default is 0.05. - - return_shap_features (bool): Whether to also return the shapley values dataframe(s) + - return_extended_data (bool): Whether to also return the shapley values dataframe(s) and some extra columns Returns: - pd.DataFrame: A DataFrame containing the feature names, statistical significance, and a 'Selected' column @@ -280,6 +280,9 @@ def shap_select( if isinstance(target, str): target = validation_df[target] + if feature_names is None: + feature_names = validation_df.columns.tolist() + # Infer the task if not provided if task is None: if pd.api.types.is_numeric_dtype(target) and target.nunique() > 10: @@ -301,12 +304,14 @@ def shap_select( significance_df = shap_features_to_significance(shap_features, target, task) # Add 'Selected' column based on the threshold - significance_df["Selected"] = ( + significance_df["selected"] = ( significance_df["stat.significance"] < threshold ).astype(int) - significance_df.loc[significance_df["t-value"] < 0, "Selected"] = -1 + significance_df.loc[significance_df["t-value"] < 0, "selected"] = -1 - if return_shap_features: + if return_extended_data: return significance_df, shap_features else: - return significance_df + return significance_df[ + ["feature name", "t-value", "stat.significance", "coefficient", "selected"] + ] diff --git a/tests/test_regression.py b/tests/test_regression.py index a25568e..d5ef6da 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -239,16 +239,14 @@ def test_selected_column_values(model_type, data_fixture, task_type, request): raise ValueError("Unsupported model type") # Call the score_features function for the correct task (regression, binary, multiclass) - selected_features_df = shap_select( - model, X_val, X_val.columns.tolist(), y_val, task=task_type - ) + selected_features_df = 
shap_select(model, X_val, y_val, task=task_type) # Check feature significance for all task types selected_rows = selected_features_df[ selected_features_df["feature name"].isin(["x7", "x8", "x9"]) ] assert ( - selected_rows["Selected"] <= 0 + selected_rows["selected"] <= 0 ).all(), ( "The Selected column must have negative or zero values for features x7, x8, x9" ) @@ -257,5 +255,5 @@ def test_selected_column_values(model_type, data_fixture, task_type, request): ~selected_features_df["feature name"].isin(["x7", "x8", "x9", "const"]) ] assert ( - other_features_rows["Selected"] > 0 + other_features_rows["selected"] > 0 ).all(), "The Selected column must have positive values for features other than x7, x8, x9" From 31cc53a2e08a79be3da0d4a0b65f735338b4cf77 Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Wed, 25 Sep 2024 09:21:40 +0100 Subject: [PATCH 4/4] Edit README.md --- README.md | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index dc10600..4610030 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,13 @@ -A library for fast feature selection for gradient boosting models using regression on feature Shapley values. -Unlike - ## Overview `shap-select` implements a heuristic to do fast feature selection for tabular regression and classification models. -The basic idea is running a linear or logistic regression of the Shapley values on the target on the validation set, +The basic idea is running a linear or logistic regression of the target on the Shapley values on the validation set, discarding the features with negative coefficients, and ranking/filtering the rest according to their -statistical significance. For details, see the [example notebook](https://github.com/transferwise/shap-select/blob/main/docs/Quick%20feature%20selection%20through%20regression%20on%20Shapley%20values.ipynb) +statistical significance. For motivation and details, see the [example notebook](https://github.com/transferwise/shap-select/blob/main/docs/Quick%20feature%20selection%20through%20regression%20on%20Shapley%20values.ipynb) Earlier packages using Shapley values for feature selection exist, the advantages of this one are * Regression on the **validation set** to combat overfitting -* A single pass regression calculation, not iterative +* A single pass regression, not an iterative approach * A single intuitive hyperparameter for feature selection: statistical significance * Bonferroni correction for multiclass classification ## Usage