diff --git a/shap_select/select.py b/shap_select/select.py
index be3398f..c733ed7 100644
--- a/shap_select/select.py
+++ b/shap_select/select.py
@@ -1,11 +1,14 @@
-from typing import Any, Tuple, List
+from typing import Any, Tuple, List, Dict
 
 import pandas as pd
 import statsmodels.api as sm
+import scipy.stats as stats
 import shap
 
 
-def create_shap_features(tree_model: Any, validation_df: pd.DataFrame) -> pd.DataFrame:
+def create_shap_features(
+    tree_model: Any, validation_df: pd.DataFrame, classes: List | None = None
+) -> pd.DataFrame | Dict[Any, pd.DataFrame]:
     """
     Generates SHAP (SHapley Additive exPlanations) values for a given tree-based model on a validation dataset.
 
@@ -18,13 +21,26 @@ def create_shap_features(tree_model: Any, validation_df: pd.Dat
     - pd.DataFrame: A DataFrame containing the SHAP values for each feature in the `validation_df`, where each
       column corresponds to the SHAP values of a feature, and the rows match the index of the `validation_df`.
     """
-    explainer = shap.TreeExplainer(tree_model, model_output="raw")
-    shap_values = explainer(validation_df).values
-
-    # Create a DataFrame with the SHAP values, with one column per feature
-    return pd.DataFrame(
-        shap_values, columns=validation_df.columns, index=validation_df.index
-    )
+    explanation = shap.TreeExplainer(tree_model, model_output="raw")(validation_df)
+    shap_values = explanation.values
+
+    if len(shap_values.shape) == 2:
+        assert (
+            classes is None
+        ), "Don't specify classes for binary classification or regression"
+        # Create a DataFrame with the SHAP values, with one column per feature
+        return pd.DataFrame(
+            shap_values, columns=validation_df.columns, index=validation_df.index
+        )
+    elif len(shap_values.shape) == 3:  # multiclass classification
+        out = {}
+        for i, c in enumerate(classes):
+            out[c] = pd.DataFrame(
+                shap_values[:, :, i],
+                columns=validation_df.columns,
+                index=validation_df.index,
+            )
+        return out
 
 
 def binary_classifier_significance(
@@ -66,12 +82,14 @@ def binary_classifier_significance(
             "t-value": summary_frame["Coef."] / summary_frame["Std.Err."],
         }
     ).reset_index(drop=True)
-    result_df["closeness to 1.0"] = closeness_to_one(result_df).abs()
+    result_df["closeness to 1.0"] = (result_df["coefficient"] - 1.0).abs()
    return result_df
 
 
 def multi_classifier_significance(
-    shap_features: pd.DataFrame, target: pd.Series
+    shap_features: Dict[Any, pd.DataFrame],
+    target: pd.Series,
+    return_individual_significances: bool = False,
 ) -> (pd.DataFrame, list):
     """
     Fits a binary logistic regression model for each unique class in the target, comparing each class against all others (one-vs-all).
@@ -85,25 +103,36 @@
     - A DataFrame with feature names and their maximum significance values across all binary classifications.
     - A list of DataFrames, one for each binary classification, containing feature names, coefficients, standard errors, and statistical significance.
""" - unique_classes = target.unique() significance_dfs = [] # Iterate through each class and perform binary classification (one-vs-all) - for cls in unique_classes: + for cls, feature_df in shap_features.items(): binary_target = (target == cls).astype(int) - significance_df = binary_classifier_significance(shap_features, binary_target) + significance_df = binary_classifier_significance(feature_df, binary_target) significance_dfs.append(significance_df) # Combine results into a single DataFrame with the max significance value for each feature combined_df = pd.concat(significance_dfs) max_significance_df = ( combined_df.groupby("feature name", as_index=False) - .agg({"stat.significance": "min", "t-value": "max", "closeness to 1.0": "min"}) + .agg( + { + "t-value": "max", + "closeness to 1.0": "min", + "coefficient": max, + } + ) .reset_index(drop=True) ) - max_significance_df.columns = ["feature name", "max significance value"] - return max_significance_df, significance_dfs + # Len(shap_features) multiplier is the Bonferroni correction + max_significance_df["stat.significance"] = max_significance_df["t-value"].apply( + lambda x: len(shap_features) * (1 - stats.norm.cdf(x)) + ) + if return_individual_significances: + return max_significance_df, significance_dfs + else: + return max_significance_df def regression_significance( @@ -141,7 +170,7 @@ def regression_significance( "t-value": summary_frame["Coef."] / summary_frame["Std.Err."], } ).reset_index(drop=True) - result_df["closeness to 1.0"] = closeness_to_one(result_df).abs() + result_df["closeness to 1.0"] = (result_df["coefficient"] - 1.0).abs() return result_df @@ -151,7 +180,9 @@ def closeness_to_one(df: pd.DataFrame) -> pd.Series: def shap_features_to_significance( - shap_features: pd.DataFrame, target: pd.Series, task: str | None = None + shap_features: pd.DataFrame | List[pd.DataFrame], + target: pd.Series, + task: str, ) -> pd.DataFrame: """ Determines the task (regression, binary, or multi-class classification) based on the target and calls the appropriate @@ -160,8 +191,7 @@ def shap_features_to_significance( Parameters: shap_features (pd.DataFrame): A DataFrame containing the features used for prediction. target (pd.Series): The target series for prediction (either continuous or categorical). - task (str | None): The type of task to perform. If None, the function will infer the task automatically. - The options are "regression", "binary", or "multi". + task (str): The type of task to perform: "regression", "binary", or "multiclass". Returns: pd.DataFrame: A DataFrame containing: @@ -170,27 +200,15 @@ def shap_features_to_significance( Sorted in descending order of significance (ascending p-value). 
""" - # Infer the task if not provided - if task is None: - if pd.api.types.is_numeric_dtype(target) and target.nunique() > 10: - task = "regression" - elif target.nunique() == 2: - task = "binary" - else: - task = "multi" - # Call the appropriate function based on the task if task == "regression": result_df = regression_significance(shap_features, target) elif task == "binary": result_df = binary_classifier_significance(shap_features, target) - elif task == "multi": - max_significance_df, _ = multi_classifier_significance(shap_features, target) - result_df = max_significance_df.rename( - columns={"max significance value": "stat.significance"} - ) + elif task == "multiclass": + result_df = multi_classifier_significance(shap_features, target) else: - raise ValueError("`task` must be 'regression', 'binary', 'multi' or None.") + raise ValueError("`task` must be 'regression', 'binary', 'multiclass' or None.") # Sort the result by statistical significance in ascending order (more significant features first) result_df_sorted = result_df.sort_values(by="t-value", ascending=False).reset_index( @@ -227,8 +245,22 @@ def score_features( if isinstance(target, str): target = validation_df[target] - # Generate SHAP values for the validation dataset - shap_features = create_shap_features(tree_model, validation_df[feature_names]) + # Infer the task if not provided + if task is None: + if pd.api.types.is_numeric_dtype(target) and target.nunique() > 10: + task = "regression" + elif target.nunique() == 2: + task = "binary" + else: + task = "multiclass" + + if task == "multiclass": + unique_classes = sorted(list(target.unique())) + shap_features = create_shap_features( + tree_model, validation_df[feature_names], unique_classes + ) + else: + shap_features = create_shap_features(tree_model, validation_df[feature_names]) # Compute statistical significance of each feature significance_df = shap_features_to_significance(shap_features, target, task) @@ -237,6 +269,6 @@ def score_features( significance_df["Selected"] = ( significance_df["stat.significance"] < threshold ).astype(int) - significance_df.loc[significance_df["coefficient"] < 0, "Selected"] = -1 + significance_df.loc[significance_df["t-value"] < 0, "Selected"] = -1 return significance_df, shap_features diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_regression.py b/tests/test_regression.py new file mode 100644 index 0000000..0a98eea --- /dev/null +++ b/tests/test_regression.py @@ -0,0 +1,261 @@ +import pytest +import numpy as np +import pandas as pd +import lightgbm as lgb +import xgboost as xgb +import catboost as cb +from sklearn.model_selection import train_test_split +from shap_select import score_features + + +@pytest.fixture +def generate_data_regression(): + np.random.seed(42) + n_samples = 100000 + + # Create 9 normally distributed features + X = pd.DataFrame( + { + "x1": np.random.normal(size=n_samples), + "x2": np.random.normal(size=n_samples), + "x3": np.random.normal(size=n_samples), + "x4": np.random.normal(size=n_samples), + "x5": np.random.normal(size=n_samples), + "x6": np.random.normal(size=n_samples), + "x7": np.random.normal(size=n_samples), + "x8": np.random.normal(size=n_samples), + "x9": np.random.normal(size=n_samples), + } + ) + + # Make all the features positive-ish + X += 3 + + # Define the target based on the formula y = x1 + x2*x3 + x4*x5*x6 + y = ( + 3 * X["x1"] + + X["x2"] * X["x3"] + + X["x4"] * X["x5"] * X["x6"] + + 10 * np.random.normal(size=n_samples) 
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_regression.py b/tests/test_regression.py
new file mode 100644
index 0000000..0a98eea
--- /dev/null
+++ b/tests/test_regression.py
@@ -0,0 +1,261 @@
+import pytest
+import numpy as np
+import pandas as pd
+import lightgbm as lgb
+import xgboost as xgb
+import catboost as cb
+from sklearn.model_selection import train_test_split
+from shap_select import score_features
+
+
+@pytest.fixture
+def generate_data_regression():
+    np.random.seed(42)
+    n_samples = 100000
+
+    # Create 9 normally distributed features
+    X = pd.DataFrame(
+        {
+            "x1": np.random.normal(size=n_samples),
+            "x2": np.random.normal(size=n_samples),
+            "x3": np.random.normal(size=n_samples),
+            "x4": np.random.normal(size=n_samples),
+            "x5": np.random.normal(size=n_samples),
+            "x6": np.random.normal(size=n_samples),
+            "x7": np.random.normal(size=n_samples),
+            "x8": np.random.normal(size=n_samples),
+            "x9": np.random.normal(size=n_samples),
+        }
+    )
+
+    # Make all the features positive-ish
+    X += 3
+
+    # Define the target based on the formula y = 3*x1 + x2*x3 + x4*x5*x6 + noise
+    y = (
+        3 * X["x1"]
+        + X["x2"] * X["x3"]
+        + X["x4"] * X["x5"] * X["x6"]
+        + 10 * np.random.normal(size=n_samples)  # lots of noise
+    )
+    X["x6"] *= 0.1
+    X["x6"] += np.random.normal(size=n_samples)
+
+    # Split the dataset into training and validation sets (10K validation rows)
+    X_train, X_val, y_train, y_val = train_test_split(
+        X, y, test_size=0.1, random_state=42
+    )
+
+    return X_train, X_val, y_train, y_val
+
+
+@pytest.fixture
+def generate_data_binary():
+    np.random.seed(42)
+    n_samples = 100000
+
+    # Create 6 normally distributed features
+    X = pd.DataFrame(
+        {
+            "x1": np.random.normal(size=n_samples),
+            "x2": np.random.normal(size=n_samples),
+            "x3": np.random.normal(size=n_samples),
+            # "x4": np.random.normal(size=n_samples),
+            # "x5": np.random.normal(size=n_samples),
+            # "x6": np.random.normal(size=n_samples),
+            "x7": np.random.normal(size=n_samples),
+            "x8": np.random.normal(size=n_samples),
+            "x9": np.random.normal(size=n_samples),
+        }
+    )
+
+    # Make all the features positive-ish
+    X += 3
+
+    # Create a binary target based on a threshold
+    y = (X["x1"] + X["x2"] * X["x3"] > 12).astype(int)
+
+    # Split the dataset into training and validation sets
+    X_train, X_val, y_train, y_val = train_test_split(
+        X, y, test_size=0.1, random_state=42
+    )
+
+    return X_train, X_val, y_train, y_val
+
+
+@pytest.fixture
+def generate_data_multiclass():
+    np.random.seed(42)
+    n_samples = 100000
+
+    # Create 6 normally distributed features
+    X = pd.DataFrame(
+        {
+            "x1": np.random.normal(size=n_samples),
+            "x2": np.random.normal(size=n_samples),
+            "x3": np.random.normal(size=n_samples),
+            # "x4": np.random.normal(size=n_samples),
+            # "x5": np.random.normal(size=n_samples),
+            # "x6": np.random.normal(size=n_samples),
+            "x7": np.random.normal(size=n_samples),
+            "x8": np.random.normal(size=n_samples),
+            "x9": np.random.normal(size=n_samples),
+        }
+    )
+
+    # Make all the features positive-ish
+    X += 3
+
+    # Create a multiclass target with 3 classes
+    y = pd.cut(
+        X["x1"] + X["x2"] * X["x3"],  # + X["x4"] * X["x5"] * X["x6"],
+        bins=3,
+        labels=[0, 1, 2],
+    ).astype(int)
+
+    # Split the dataset into training and validation sets
+    X_train, X_val, y_train, y_val = train_test_split(
+        X, y, test_size=0.1, random_state=42
+    )
+
+    return X_train, X_val, y_train, y_val
+
+
+def train_lightgbm(X_train, X_val, y_train, y_val, task_type):
+    """Train a LightGBM model based on the task type"""
+    train_data = lgb.Dataset(X_train, label=y_train)
+    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
+
+    if task_type == "binary":
+        params = {"objective": "binary", "metric": "binary_logloss", "verbose": -1}
+    elif task_type == "regression":
+        params = {"objective": "regression", "metric": "rmse", "verbose": -1}
+    elif task_type == "multiclass":
+        params = {
+            "objective": "multiclass",
+            "num_class": 3,
+            "metric": "multi_logloss",
+            "verbose": -1,
+        }
+    else:
+        raise ValueError(f"Unsupported task type: {task_type}")
+
+    model = lgb.train(
+        params,
+        train_data,
+        num_boost_round=1000,
+        valid_sets=[train_data, val_data],
+        valid_names=["train", "valid"],
+        callbacks=[lgb.early_stopping(stopping_rounds=50)],
+    )
+    return model
+
+
+def train_xgboost(X_train, X_val, y_train, y_val, task_type):
+    """Train an XGBoost model based on the task type"""
+    dtrain = xgb.DMatrix(X_train, label=y_train)
+    dval = xgb.DMatrix(X_val, label=y_val)
+
+    if task_type == "binary":
+        params = {
+            "objective": "binary:logistic",
+            "eval_metric": "logloss",
+            "verbosity": 0,
+        }
+    elif task_type == "regression":
+        params = {
+            "objective": "reg:squarederror",
+            "eval_metric": "rmse",
+            "verbosity": 0,
+        }
+    elif task_type == "multiclass":
+        params = {
+            "objective": "multi:softprob",
+            "num_class": 3,
+            "eval_metric": "mlogloss",
+            "verbosity": 0,
+        }
+    else:
+        raise ValueError(f"Unsupported task type: {task_type}")
+
+    evals = [(dval, "valid")]
+    model = xgb.train(
+        params, dtrain, num_boost_round=1000, evals=evals, early_stopping_rounds=50
+    )
+    return model
+
+
+def train_catboost(X_train, X_val, y_train, y_val, task_type):
+    """Train a CatBoost model based on the task type"""
+    if task_type == "binary":
+        model = cb.CatBoostClassifier(
+            iterations=1000,
+            loss_function="Logloss",
+            verbose=0,
+            early_stopping_rounds=50,
+        )
+    elif task_type == "regression":
+        model = cb.CatBoostRegressor(
+            iterations=1000, loss_function="RMSE", verbose=0, early_stopping_rounds=50
+        )
+    elif task_type == "multiclass":
+        model = cb.CatBoostClassifier(
+            iterations=1000,
+            loss_function="MultiClass",
+            verbose=0,
+            early_stopping_rounds=50,
+        )
+    else:
+        raise ValueError(f"Unsupported task type: {task_type}")
+
+    model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)
+    return model
+
+
+@pytest.mark.parametrize(
+    "model_type",
+    ["lightgbm", "xgboost", "catboost"],
+)
+@pytest.mark.parametrize(
+    "data_fixture, task_type",
+    [
+        ("generate_data_regression", "regression"),
+        ("generate_data_binary", "binary"),
+        ("generate_data_multiclass", "multiclass"),
+    ],
+)
+def test_selected_column_values(model_type, data_fixture, task_type, request):
+    """Parameterized test for regression, binary classification, and multiclass classification."""
+    X_train, X_val, y_train, y_val = request.getfixturevalue(data_fixture)
+
+    # Select the correct model to train
+    if model_type == "lightgbm":
+        model = train_lightgbm(X_train, X_val, y_train, y_val, task_type)
+    elif model_type == "xgboost":
+        model = train_xgboost(X_train, X_val, y_train, y_val, task_type)
+    elif model_type == "catboost":
+        model = train_catboost(X_train, X_val, y_train, y_val, task_type)
+    else:
+        raise ValueError("Unsupported model type")
+
+    # Call the score_features function for the correct task (regression, binary, multiclass)
+    selected_features_df, _ = score_features(
+        model, X_val, X_val.columns.tolist(), y_val, task=task_type
+    )
+
+    # Check feature significance for all task types
+    selected_rows = selected_features_df[
+        selected_features_df["feature name"].isin(["x7", "x8", "x9"])
+    ]
+    assert (
+        selected_rows["Selected"] <= 0
+    ).all(), (
+        "The Selected column must have negative or zero values for features x7, x8, x9"
+    )
+
+    other_features_rows = selected_features_df[
+        ~selected_features_df["feature name"].isin(["x7", "x8", "x9", "const"])
+    ]
+    assert (
+        other_features_rows["Selected"] > 0
+    ).all(), "The Selected column must have positive values for features other than x7, x8, x9"
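For reference, a minimal usage sketch of the API these tests exercise, showing how the Selected column is meant to be read; the model, frame, and variable names here are illustrative, not part of the diff:

    import lightgbm as lgb
    from shap_select import score_features

    # Any supported tree model works; LightGBM's sklearn API shown for brevity
    model = lgb.LGBMRegressor().fit(X_train, y_train)
    significance_df, shap_df = score_features(
        model, X_val, X_val.columns.tolist(), y_val, task="regression"
    )
    # Selected == 1: p-value below threshold; 0: not significant;
    # -1: negative t-value, i.e. the feature's SHAP values point away from the target
    keep = significance_df.loc[significance_df["Selected"] == 1, "feature name"]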