diff --git a/README.md b/README.md
index 4610030..1933886 100644
--- a/README.md
+++ b/README.md
@@ -14,5 +14,6 @@ Earlier packages using Shapley values for feature selection exist, the advantage
 ```python
 from shap_select import shap_select
 # Here model is any model supported by the shap library, fitted on a different (train) dataset
+# Task can be regression, binary, or multiclass
 selected_features_df = shap_select(model, X_val, y_val, task="multiclass", threshold=0.05)
 ```
\ No newline at end of file
diff --git a/docs/bug.py b/docs/bug.py
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
index 4abb286..1aca3b7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ scikit_learn
 scipy
 shap
 statsmodels
+numpy
diff --git a/shap_select/select.py b/shap_select/select.py
index f803207..68993cb 100644
--- a/shap_select/select.py
+++ b/shap_select/select.py
@@ -66,33 +66,11 @@ def binary_classifier_significance(
     """
 
     # Add a constant to the features for the intercept in logistic regression
-
-    # Standardizing the features (Logistic regression with L1 regularization tends to
-    # work better with standardized data)
-    shap_features_scaled = pd.DataFrame(
-        data=StandardScaler().fit_transform(shap_features),
-        columns=shap_features.columns,
-    )
-    shap_features_with_const = sm.add_constant(shap_features_scaled)
-
-    # To avoid linear dependence of features, first do a pass with tiny L1-reg
-    # and throw away the zero coeffs
-    # Define the Logistic Regression model with L1 regularization
-    logistic_l1 = LogisticRegression(
-        penalty="l1", solver="liblinear", fit_intercept=False, C=1e6
-    )  # C is the inverse of regularization strength
-    logistic_l1.fit(shap_features_with_const, target)
-
-    # Get the coefficients from the Logistic Regression model
-    # Logistic regression gives an array of shape (1, n_features), so we take [0]
-    coefficients = logistic_l1.coef_[0]
-    shap_features_filtered = sm.add_constant(shap_features).loc[
-        :, np.abs(coefficients) > 1e-6
-    ]
+    shap_features_with_constant = sm.add_constant(shap_features)
 
     # Fit the logistic regression model that will generate confidence intervals
-    logit_model = sm.Logit(target, shap_features_filtered)
-    result = logit_model.fit(disp=False)
+    logit_model = sm.Logit(target, shap_features_with_constant)
+    result = logit_model.fit_regularized(disp=False, alpha=1e-6)
 
     # Extract the results
     summary_frame = result.summary2().tables[1]
@@ -178,17 +156,9 @@ def regression_significance(
         - stderr: The standard error for each coefficient.
         - stat.significance: The p-value (statistical significance) for each feature.
     """
-
-    # To avoid collinearity of features, first do a pass with tiny L1-reg
-    # and throw away the zero coeffs
-    shap_features_scaled = StandardScaler().fit_transform(shap_features)
-    coefficients = Lasso(alpha=1e-6).fit(shap_features_scaled, target).coef_
-    shap_features_filtered = shap_features.loc[:, np.abs(coefficients) > 1e-6]
-
-    # Sadly regularized models tend to not produce confidence intervals, so
     # Fit the linear regression model that will generate confidence intervals
-    ols_model = sm.OLS(target, shap_features_filtered)
-    result = ols_model.fit()
+    ols_model = sm.OLS(target, shap_features)
+    result = ols_model.fit_regularized(alpha=1e-6, refit=True)
 
     # Extract the results
     summary_frame = result.summary2().tables[1]
@@ -268,7 +238,7 @@ def shap_select(
     - validation_df (pd.DataFrame): Validation dataset containing the features.
     - feature_names (List[str]): A list of feature names used by the model.
     - target (pd.Series | str): The target values, or the name of the target column in `validation_df`.
-    - task (str | None): The task type ('regression', 'binary', or 'multi'). If None, it is inferred automatically.
+    - task (str | None): The task type ('regression', 'binary', or 'multiclass'). If None, it is inferred automatically.
     - threshold (float): Significance threshold to select features. Default is 0.05.
     - return_extended_data (bool): Whether to also return the shapley values dataframe(s) and some extra columns
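For context, here is a minimal, self-contained sketch of the approach the revised `select.py` takes for the binary case: fit a statsmodels `Logit` on the per-feature Shapley values with a tiny L1 penalty and read per-feature significance straight from the coefficient table, instead of running a separate scikit-learn L1 pre-filtering pass. The `shap_vals` and `target` data below are synthetic stand-ins, not part of the package.

```python
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Synthetic stand-ins: in shap_select these would be the per-feature Shapley
# values computed on a validation set and the corresponding binary target.
rng = np.random.default_rng(0)
shap_vals = pd.DataFrame(rng.normal(size=(1000, 3)), columns=["f1", "f2", "f3"])
target = (2 * shap_vals["f1"] - shap_vals["f2"] + rng.normal(size=1000) > 0).astype(int)

# A tiny L1 penalty keeps the fit well-behaved when the SHAP columns are
# (nearly) collinear, while statsmodels still reports per-coefficient p-values.
exog = sm.add_constant(shap_vals)
result = sm.Logit(target, exog).fit_regularized(disp=False, alpha=1e-6)
print(result.summary2().tables[1]["P>|z|"])  # significance of each feature
```

The regression path in the diff uses the analogous OLS call, `sm.OLS(target, shap_features).fit_regularized(alpha=1e-6, refit=True)`, where `refit=True` re-estimates an unpenalised OLS on the features that keep non-zero coefficients so that standard errors and p-values remain available.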