diff --git a/shap_select/select.py b/shap_select/select.py
index be3398f..c733ed7 100644
--- a/shap_select/select.py
+++ b/shap_select/select.py
@@ -1,11 +1,14 @@
-from typing import Any, Tuple, List
+from typing import Any, Tuple, List, Dict
 
 import pandas as pd
 import statsmodels.api as sm
+import scipy.stats as stats
 import shap
 
 
-def create_shap_features(tree_model: Any, validation_df: pd.DataFrame) -> pd.DataFrame:
+def create_shap_features(
+    tree_model: Any, validation_df: pd.DataFrame, classes: List | None = None
+) -> pd.DataFrame | Dict[Any, pd.DataFrame]:
     """
     Generates SHAP (SHapley Additive exPlanations) values for a given tree-based model on a validation dataset.
 
@@ -18,13 +21,26 @@ def create_shap_features(tree_model: Any, validation_df: pd.Dat
     - pd.DataFrame: A DataFrame containing the SHAP values for each feature in the `validation_df`, where each
       column corresponds to the SHAP values of a feature, and the rows match the index of the `validation_df`.
     """
-    explainer = shap.TreeExplainer(tree_model, model_output="raw")
-    shap_values = explainer(validation_df).values
-
-    # Create a DataFrame with the SHAP values, with one column per feature
-    return pd.DataFrame(
-        shap_values, columns=validation_df.columns, index=validation_df.index
-    )
+    explanation = shap.TreeExplainer(tree_model, model_output="raw")(validation_df)
+    shap_values = explanation.values
+
+    if len(shap_values.shape) == 2:
+        assert (
+            classes is None
+        ), "Don't specify classes for binary classification or regression"
+        # Create a DataFrame with the SHAP values, with one column per feature
+        return pd.DataFrame(
+            shap_values, columns=validation_df.columns, index=validation_df.index
+        )
+    elif len(shap_values.shape) == 3:  # multiclass classification
+        out = {}
+        for i, c in enumerate(classes):
+            out[c] = pd.DataFrame(
+                shap_values[:, :, i],
+                columns=validation_df.columns,
+                index=validation_df.index,
+            )
+        return out
 
 
 def binary_classifier_significance(
@@ -66,12 +82,14 @@ def binary_classifier_significance(
             "t-value": summary_frame["Coef."] / summary_frame["Std.Err."],
         }
     ).reset_index(drop=True)
-    result_df["closeness to 1.0"] = closeness_to_one(result_df).abs()
+    result_df["closeness to 1.0"] = (result_df["coefficient"] - 1.0).abs()
    return result_df
 
 
 def multi_classifier_significance(
-    shap_features: pd.DataFrame, target: pd.Series
+    shap_features: Dict[Any, pd.DataFrame],
+    target: pd.Series,
+    return_individual_significances: bool = False,
 ) -> (pd.DataFrame, list):
     """
     Fits a binary logistic regression model for each unique class in the target, comparing each class against all others (one-vs-all).
@@ -85,25 +103,36 @@
     - A DataFrame with feature names and their maximum significance values across all binary classifications.
     - A list of DataFrames, one for each binary classification, containing feature names, coefficients, standard errors, and statistical significance.
""" - unique_classes = target.unique() significance_dfs = [] # Iterate through each class and perform binary classification (one-vs-all) - for cls in unique_classes: + for cls, feature_df in shap_features.items(): binary_target = (target == cls).astype(int) - significance_df = binary_classifier_significance(shap_features, binary_target) + significance_df = binary_classifier_significance(feature_df, binary_target) significance_dfs.append(significance_df) # Combine results into a single DataFrame with the max significance value for each feature combined_df = pd.concat(significance_dfs) max_significance_df = ( combined_df.groupby("feature name", as_index=False) - .agg({"stat.significance": "min", "t-value": "max", "closeness to 1.0": "min"}) + .agg( + { + "t-value": "max", + "closeness to 1.0": "min", + "coefficient": max, + } + ) .reset_index(drop=True) ) - max_significance_df.columns = ["feature name", "max significance value"] - return max_significance_df, significance_dfs + # Len(shap_features) multiplier is the Bonferroni correction + max_significance_df["stat.significance"] = max_significance_df["t-value"].apply( + lambda x: len(shap_features) * (1 - stats.norm.cdf(x)) + ) + if return_individual_significances: + return max_significance_df, significance_dfs + else: + return max_significance_df def regression_significance( @@ -141,7 +170,7 @@ def regression_significance( "t-value": summary_frame["Coef."] / summary_frame["Std.Err."], } ).reset_index(drop=True) - result_df["closeness to 1.0"] = closeness_to_one(result_df).abs() + result_df["closeness to 1.0"] = (result_df["coefficient"] - 1.0).abs() return result_df @@ -151,7 +180,9 @@ def closeness_to_one(df: pd.DataFrame) -> pd.Series: def shap_features_to_significance( - shap_features: pd.DataFrame, target: pd.Series, task: str | None = None + shap_features: pd.DataFrame | List[pd.DataFrame], + target: pd.Series, + task: str, ) -> pd.DataFrame: """ Determines the task (regression, binary, or multi-class classification) based on the target and calls the appropriate @@ -160,8 +191,7 @@ def shap_features_to_significance( Parameters: shap_features (pd.DataFrame): A DataFrame containing the features used for prediction. target (pd.Series): The target series for prediction (either continuous or categorical). - task (str | None): The type of task to perform. If None, the function will infer the task automatically. - The options are "regression", "binary", or "multi". + task (str): The type of task to perform: "regression", "binary", or "multiclass". Returns: pd.DataFrame: A DataFrame containing: @@ -170,27 +200,15 @@ def shap_features_to_significance( Sorted in descending order of significance (ascending p-value). 
""" - # Infer the task if not provided - if task is None: - if pd.api.types.is_numeric_dtype(target) and target.nunique() > 10: - task = "regression" - elif target.nunique() == 2: - task = "binary" - else: - task = "multi" - # Call the appropriate function based on the task if task == "regression": result_df = regression_significance(shap_features, target) elif task == "binary": result_df = binary_classifier_significance(shap_features, target) - elif task == "multi": - max_significance_df, _ = multi_classifier_significance(shap_features, target) - result_df = max_significance_df.rename( - columns={"max significance value": "stat.significance"} - ) + elif task == "multiclass": + result_df = multi_classifier_significance(shap_features, target) else: - raise ValueError("`task` must be 'regression', 'binary', 'multi' or None.") + raise ValueError("`task` must be 'regression', 'binary', 'multiclass' or None.") # Sort the result by statistical significance in ascending order (more significant features first) result_df_sorted = result_df.sort_values(by="t-value", ascending=False).reset_index( @@ -227,8 +245,22 @@ def score_features( if isinstance(target, str): target = validation_df[target] - # Generate SHAP values for the validation dataset - shap_features = create_shap_features(tree_model, validation_df[feature_names]) + # Infer the task if not provided + if task is None: + if pd.api.types.is_numeric_dtype(target) and target.nunique() > 10: + task = "regression" + elif target.nunique() == 2: + task = "binary" + else: + task = "multiclass" + + if task == "multiclass": + unique_classes = sorted(list(target.unique())) + shap_features = create_shap_features( + tree_model, validation_df[feature_names], unique_classes + ) + else: + shap_features = create_shap_features(tree_model, validation_df[feature_names]) # Compute statistical significance of each feature significance_df = shap_features_to_significance(shap_features, target, task) @@ -237,6 +269,6 @@ def score_features( significance_df["Selected"] = ( significance_df["stat.significance"] < threshold ).astype(int) - significance_df.loc[significance_df["coefficient"] < 0, "Selected"] = -1 + significance_df.loc[significance_df["t-value"] < 0, "Selected"] = -1 return significance_df, shap_features diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_regression.py b/tests/test_regression.py new file mode 100644 index 0000000..0a98eea --- /dev/null +++ b/tests/test_regression.py @@ -0,0 +1,261 @@ +import pytest +import numpy as np +import pandas as pd +import lightgbm as lgb +import xgboost as xgb +import catboost as cb +from sklearn.model_selection import train_test_split +from shap_select import score_features + + +@pytest.fixture +def generate_data_regression(): + np.random.seed(42) + n_samples = 100000 + + # Create 9 normally distributed features + X = pd.DataFrame( + { + "x1": np.random.normal(size=n_samples), + "x2": np.random.normal(size=n_samples), + "x3": np.random.normal(size=n_samples), + "x4": np.random.normal(size=n_samples), + "x5": np.random.normal(size=n_samples), + "x6": np.random.normal(size=n_samples), + "x7": np.random.normal(size=n_samples), + "x8": np.random.normal(size=n_samples), + "x9": np.random.normal(size=n_samples), + } + ) + + # Make all the features positive-ish + X += 3 + + # Define the target based on the formula y = x1 + x2*x3 + x4*x5*x6 + y = ( + 3 * X["x1"] + + X["x2"] * X["x3"] + + X["x4"] * X["x5"] * X["x6"] + + 10 * np.random.normal(size=n_samples) 
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_regression.py b/tests/test_regression.py
new file mode 100644
index 0000000..0a98eea
--- /dev/null
+++ b/tests/test_regression.py
@@ -0,0 +1,261 @@
+import pytest
+import numpy as np
+import pandas as pd
+import lightgbm as lgb
+import xgboost as xgb
+import catboost as cb
+from sklearn.model_selection import train_test_split
+from shap_select import score_features
+
+
+@pytest.fixture
+def generate_data_regression():
+    np.random.seed(42)
+    n_samples = 100000
+
+    # Create 9 normally distributed features
+    X = pd.DataFrame(
+        {
+            "x1": np.random.normal(size=n_samples),
+            "x2": np.random.normal(size=n_samples),
+            "x3": np.random.normal(size=n_samples),
+            "x4": np.random.normal(size=n_samples),
+            "x5": np.random.normal(size=n_samples),
+            "x6": np.random.normal(size=n_samples),
+            "x7": np.random.normal(size=n_samples),
+            "x8": np.random.normal(size=n_samples),
+            "x9": np.random.normal(size=n_samples),
+        }
+    )
+
+    # Make all the features positive-ish
+    X += 3
+
+    # Define the target based on the formula y = 3*x1 + x2*x3 + x4*x5*x6 + noise
+    y = (
+        3 * X["x1"]
+        + X["x2"] * X["x3"]
+        + X["x4"] * X["x5"] * X["x6"]
+        + 10 * np.random.normal(size=n_samples)  # lots of noise
+    )
+    X["x6"] *= 0.1
+    X["x6"] += np.random.normal(size=n_samples)
+
+    # Split the dataset into training and validation sets (10K validation rows)
+    X_train, X_val, y_train, y_val = train_test_split(
+        X, y, test_size=0.1, random_state=42
+    )
+
+    return X_train, X_val, y_train, y_val
+
+
+@pytest.fixture
+def generate_data_binary():
+    np.random.seed(42)
+    n_samples = 100000
+
+    # Create 6 normally distributed features
+    X = pd.DataFrame(
+        {
+            "x1": np.random.normal(size=n_samples),
+            "x2": np.random.normal(size=n_samples),
+            "x3": np.random.normal(size=n_samples),
+            # "x4": np.random.normal(size=n_samples),
+            # "x5": np.random.normal(size=n_samples),
+            # "x6": np.random.normal(size=n_samples),
+            "x7": np.random.normal(size=n_samples),
+            "x8": np.random.normal(size=n_samples),
+            "x9": np.random.normal(size=n_samples),
+        }
+    )
+
+    # Make all the features positive-ish
+    X += 3
+
+    # Create a binary target based on a threshold
+    y = (X["x1"] + X["x2"] * X["x3"] > 12).astype(int)
+
+    # Split the dataset into training and validation sets
+    X_train, X_val, y_train, y_val = train_test_split(
+        X, y, test_size=0.1, random_state=42
+    )
+
+    return X_train, X_val, y_train, y_val
+
+
+@pytest.fixture
+def generate_data_multiclass():
+    np.random.seed(42)
+    n_samples = 100000
+
+    # Create 6 normally distributed features
+    X = pd.DataFrame(
+        {
+            "x1": np.random.normal(size=n_samples),
+            "x2": np.random.normal(size=n_samples),
+            "x3": np.random.normal(size=n_samples),
+            # "x4": np.random.normal(size=n_samples),
+            # "x5": np.random.normal(size=n_samples),
+            # "x6": np.random.normal(size=n_samples),
+            "x7": np.random.normal(size=n_samples),
+            "x8": np.random.normal(size=n_samples),
+            "x9": np.random.normal(size=n_samples),
+        }
+    )
+
+    # Make all the features positive-ish
+    X += 3
+
+    # Create a multiclass target with 3 classes
+    y = pd.cut(
+        X["x1"] + X["x2"] * X["x3"],  # + X["x4"] * X["x5"] * X["x6"],
+        bins=3,
+        labels=[0, 1, 2],
+    ).astype(int)
+
+    # Split the dataset into training and validation sets
+    X_train, X_val, y_train, y_val = train_test_split(
+        X, y, test_size=0.1, random_state=42
+    )
+
+    return X_train, X_val, y_train, y_val
+
+
+def train_lightgbm(X_train, X_val, y_train, y_val, task_type):
+    """Train a LightGBM model based on the task type"""
+    train_data = lgb.Dataset(X_train, label=y_train)
+    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
+
+    if task_type == "binary":
+        params = {"objective": "binary", "metric": "binary_logloss", "verbose": -1}
+    elif task_type == "regression":
+        params = {"objective": "regression", "metric": "rmse", "verbose": -1}
+    elif task_type == "multiclass":
+        params = {
+            "objective": "multiclass",
+            "num_class": 3,
+            "metric": "multi_logloss",
+            "verbose": -1,
+        }
+    else:
+        raise ValueError(f"Unsupported task type: {task_type}")
+
+    model = lgb.train(
+        params,
+        train_data,
+        num_boost_round=1000,
+        valid_sets=[train_data, val_data],
+        valid_names=["train", "valid"],
+        callbacks=[lgb.early_stopping(stopping_rounds=50)],
+    )
+    return model
+
+
+def train_xgboost(X_train, X_val, y_train, y_val, task_type):
+    """Train an XGBoost model based on the task type"""
+    dtrain = xgb.DMatrix(X_train, label=y_train)
+    dval = xgb.DMatrix(X_val, label=y_val)
+
+    if task_type == "binary":
+        params = {
+            "objective": "binary:logistic",
+            "eval_metric": "logloss",
+            "verbosity": 0,
+        }
+    elif task_type == "regression":
+        params = {
+            "objective": "reg:squarederror",
+            "eval_metric": "rmse",
+            "verbosity": 0,
+        }
+    elif task_type == "multiclass":
+        params = {
+            "objective": "multi:softprob",
+            "num_class": 3,
+            "eval_metric": "mlogloss",
+            "verbosity": 0,
+        }
+    else:
+        raise ValueError(f"Unsupported task type: {task_type}")
+
+    evals = [(dval, "valid")]
+    model = xgb.train(
+        params, dtrain, num_boost_round=1000, evals=evals, early_stopping_rounds=50
+    )
+    return model
+
+
+def train_catboost(X_train, X_val, y_train, y_val, task_type):
+    """Train a CatBoost model based on the task type"""
+    if task_type == "binary":
+        model = cb.CatBoostClassifier(
+            iterations=1000,
+            loss_function="Logloss",
+            verbose=0,
+            early_stopping_rounds=50,
+        )
+    elif task_type == "regression":
+        model = cb.CatBoostRegressor(
+            iterations=1000, loss_function="RMSE", verbose=0, early_stopping_rounds=50
+        )
+    elif task_type == "multiclass":
+        model = cb.CatBoostClassifier(
+            iterations=1000,
+            loss_function="MultiClass",
+            verbose=0,
+            early_stopping_rounds=50,
+        )
+    else:
+        raise ValueError(f"Unsupported task type: {task_type}")
+
+    model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)
+    return model
+
+
+@pytest.mark.parametrize(
+    "model_type",
+    ["lightgbm", "xgboost", "catboost"],
+)
+@pytest.mark.parametrize(
+    "data_fixture, task_type",
+    [
+        ("generate_data_regression", "regression"),
+        ("generate_data_binary", "binary"),
+        ("generate_data_multiclass", "multiclass"),
+    ],
+)
+def test_selected_column_values(model_type, data_fixture, task_type, request):
+    """Parameterized test for regression, binary classification, and multiclass classification."""
+    X_train, X_val, y_train, y_val = request.getfixturevalue(data_fixture)
+
+    # Select the correct model to train
+    if model_type == "lightgbm":
+        model = train_lightgbm(X_train, X_val, y_train, y_val, task_type)
+    elif model_type == "xgboost":
+        model = train_xgboost(X_train, X_val, y_train, y_val, task_type)
+    elif model_type == "catboost":
+        model = train_catboost(X_train, X_val, y_train, y_val, task_type)
+    else:
+        raise ValueError("Unsupported model type")
+
+    # Call the score_features function for the correct task (regression, binary, multiclass)
+    selected_features_df, _ = score_features(
+        model, X_val, X_val.columns.tolist(), y_val, task=task_type
+    )
+
+    # Check feature significance for all task types
+    selected_rows = selected_features_df[
+        selected_features_df["feature name"].isin(["x7", "x8", "x9"])
+    ]
+    assert (
+        selected_rows["Selected"] <= 0
+    ).all(), (
+        "The Selected column must have negative or zero values for features x7, x8, x9"
+    )
+
+    other_features_rows = selected_features_df[
+        ~selected_features_df["feature name"].isin(["x7", "x8", "x9", "const"])
+    ]
+    assert (
+        other_features_rows["Selected"] > 0
+    ).all(), "The Selected column must have positive values for features other than x7, x8, x9"
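For reference, a minimal usage sketch of the API these tests exercise, showing how the Selected column is meant to be read; the model, frame, and variable names here are illustrative, not part of the diff:

    import lightgbm as lgb
    from shap_select import score_features

    # Any supported tree model works; LightGBM's sklearn API shown for brevity
    model = lgb.LGBMRegressor().fit(X_train, y_train)
    significance_df, shap_df = score_features(
        model, X_val, X_val.columns.tolist(), y_val, task="regression"
    )
    # Selected == 1: p-value below threshold; 0: not significant;
    # -1: negative t-value, i.e. the feature's SHAP values point away from the target
    keep = significance_df.loc[significance_df["Selected"] == 1, "feature name"]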