diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 424c196..5211a7a 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -4,9 +4,6 @@ on:
   pull_request:
     branches:
       - main
-  push:
-    branches:
-      - main
 
 jobs:
   test:
diff --git a/.gitignore b/.gitignore
index 7e0a4a4..46956c1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,4 +23,10 @@ hs_err_pid*
 build/
 out/
 .gradle/
-bin/
\ No newline at end of file
+bin/
+
+# Python cache files
+__pycache__/
+*.py[cod]
+*.pyo
+*.pyd
\ No newline at end of file
diff --git a/tests/test_regression.py b/tests/test_regression.py
index d5ef6da..e7dbee7 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -256,4 +256,4 @@ def test_selected_column_values(model_type, data_fixture, task_type, request):
     ]
     assert (
         other_features_rows["selected"] > 0
-    ).all(), "The Selected column must have positive values for features other than x7, x8, x9"
+    ).all(), "The Selected column must have positive values for features other than x7, x8, x9"
\ No newline at end of file
diff --git a/tests/test_shap_feature_generation.py b/tests/test_shap_feature_generation.py
new file mode 100644
index 0000000..179c21c
--- /dev/null
+++ b/tests/test_shap_feature_generation.py
@@ -0,0 +1,49 @@
+import pytest
+import pandas as pd
+import numpy as np
+from shap_select.select import create_shap_features
+import lightgbm as lgb
+
+
+@pytest.fixture
+def sample_data_binary():
+    """Generate sample data for binary classification."""
+    np.random.seed(42)
+    X = pd.DataFrame(np.random.normal(size=(100, 5)), columns=[f"x{i}" for i in range(5)])
+    y = (X["x0"] > 0).astype(int)
+    return X, y
+
+
+@pytest.fixture
+def sample_data_multiclass():
+    """Generate sample data for multiclass classification."""
+    np.random.seed(42)
+    X = pd.DataFrame(np.random.normal(size=(100, 5)), columns=[f"x{i}" for i in range(5)])
+    y = np.random.choice([0, 1, 2], size=100)
+    return X, y
+
+
+def test_shap_feature_generation_binary(sample_data_binary):
+    """Test SHAP feature generation for binary classification."""
+    X, y = sample_data_binary
+
+    model = lgb.LGBMClassifier()
+    model.fit(X, y)
+
+    shap_df = create_shap_features(model, X)
+    assert isinstance(shap_df, pd.DataFrame), "SHAP output should be a DataFrame"
+    assert shap_df.shape == X.shape, "SHAP output shape should match input data"
+    assert shap_df.isnull().sum().sum() == 0, "No missing values expected in SHAP output"
+
+
+def test_shap_feature_generation_multiclass(sample_data_multiclass):
+    """Test SHAP feature generation for multiclass classification."""
+    X, y = sample_data_multiclass
+
+    model = lgb.LGBMClassifier(objective="multiclass", num_class=3)
+    model.fit(X, y)
+
+    shap_df = create_shap_features(model, X, classes=[0, 1, 2])
+    assert isinstance(shap_df, dict), "SHAP output should be a dictionary for multiclass"
+    assert all(isinstance(v, pd.DataFrame) for v in shap_df.values()), "Each class should have a DataFrame"
+    assert shap_df[0].shape == X.shape, "SHAP output shape should match input data for each class"
diff --git a/tests/test_significance_calculation.py b/tests/test_significance_calculation.py
new file mode 100644
index 0000000..7d11db7
--- /dev/null
+++ b/tests/test_significance_calculation.py
@@ -0,0 +1,55 @@
+import pytest
+import pandas as pd
+import numpy as np
+from shap_select.select import binary_classifier_significance, regression_significance
+import statsmodels.api as sm
+
+
+@pytest.fixture
+def shap_features_binary():
+    """Generate sample SHAP values for binary classification."""
+    np.random.seed(42)
+    return pd.DataFrame(np.random.normal(size=(100, 5)), columns=[f"x{i}" for i in range(5)])
+
+
+@pytest.fixture
+def binary_target():
+    """Generate binary target."""
+    np.random.seed(42)
+    return pd.Series(np.random.choice([0, 1], size=100))
+
+
+def test_binary_classifier_significance(shap_features_binary, binary_target):
+    """Test significance calculation for binary classification."""
+    result_df = binary_classifier_significance(shap_features_binary, binary_target, alpha=1e-4)
+
+    assert "feature name" in result_df.columns, "Result should contain feature names"
+    assert "coefficient" in result_df.columns, "Result should contain coefficients"
+    assert "stat.significance" in result_df.columns, "Result should contain statistical significance"
+    assert result_df.shape[0] == shap_features_binary.shape[1], "Each feature should have a row in the output"
+    assert (result_df["stat.significance"] >= 0).all(), "All p-values should be non-negative"
+
+
+@pytest.fixture
+def shap_features_regression():
+    """Generate sample SHAP values for regression."""
+    np.random.seed(42)
+    return pd.DataFrame(np.random.normal(size=(100, 5)), columns=[f"x{i}" for i in range(5)])
+
+
+@pytest.fixture
+def regression_target():
+    """Generate regression target."""
+    np.random.seed(42)
+    return pd.Series(np.random.normal(size=100))
+
+
+def test_regression_significance(shap_features_regression, regression_target):
+    """Test significance calculation for regression."""
+    result_df = regression_significance(shap_features_regression, regression_target, alpha=1e-6)
+
+    assert "feature name" in result_df.columns, "Result should contain feature names"
+    assert "coefficient" in result_df.columns, "Result should contain coefficients"
+    assert "stat.significance" in result_df.columns, "Result should contain statistical significance"
+    assert result_df.shape[0] == shap_features_regression.shape[1], "Each feature should have a row in the output"
+    assert (result_df["stat.significance"] >= 0).all(), "All p-values should be non-negative"