From ba102654a8f794dfb7855b24b9a8dacf2ec8c50d Mon Sep 17 00:00:00 2001 From: bkoseoglu Date: Tue, 1 Oct 2024 16:49:16 +0100 Subject: [PATCH 1/3] add more unit tests --- tests/test_shap_feature_generation.py | 49 +++++++++++++++++++++++ tests/test_significance_calculation.py | 55 ++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 tests/test_shap_feature_generation.py create mode 100644 tests/test_significance_calculation.py diff --git a/tests/test_shap_feature_generation.py b/tests/test_shap_feature_generation.py new file mode 100644 index 0000000..bcd68e5 --- /dev/null +++ b/tests/test_shap_feature_generation.py @@ -0,0 +1,49 @@ +import pytest +import pandas as pd +import numpy as np +from shap_select import create_shap_features +import lightgbm as lgb + + +@pytest.fixture +def sample_data_binary(): + """Generate sample data for binary classification.""" + np.random.seed(42) + X = pd.DataFrame(np.random.normal(size=(100, 5)), columns=[f"x{i}" for i in range(5)]) + y = (X["x0"] > 0).astype(int) + return X, y + + +@pytest.fixture +def sample_data_multiclass(): + """Generate sample data for multiclass classification.""" + np.random.seed(42) + X = pd.DataFrame(np.random.normal(size=(100, 5)), columns=[f"x{i}" for i in range(5)]) + y = np.random.choice([0, 1, 2], size=100) + return X, y + + +def test_shap_feature_generation_binary(sample_data_binary): + """Test SHAP feature generation for binary classification.""" + X, y = sample_data_binary + + model = lgb.LGBMClassifier() + model.fit(X, y) + + shap_df = create_shap_features(model, X) + assert isinstance(shap_df, pd.DataFrame), "SHAP output should be a DataFrame" + assert shap_df.shape == X.shape, "SHAP output shape should match input data" + assert shap_df.isnull().sum().sum() == 0, "No missing values expected in SHAP output" + + +def test_shap_feature_generation_multiclass(sample_data_multiclass): + """Test SHAP feature generation for multiclass classification.""" + X, y = 
sample_data_multiclass + + model = lgb.LGBMClassifier(objective="multiclass", num_class=3) + model.fit(X, y) + + shap_df = create_shap_features(model, X, classes=[0, 1, 2]) + assert isinstance(shap_df, dict), "SHAP output should be a dictionary for multiclass" + assert all(isinstance(v, pd.DataFrame) for v in shap_df.values()), "Each class should have a DataFrame" + assert shap_df[0].shape == X.shape, "SHAP output shape should match input data for each class" diff --git a/tests/test_significance_calculation.py b/tests/test_significance_calculation.py new file mode 100644 index 0000000..0f5dffd --- /dev/null +++ b/tests/test_significance_calculation.py @@ -0,0 +1,55 @@ +import pytest +import pandas as pd +import numpy as np +from shap_select import binary_classifier_significance, regression_significance +import statsmodels.api as sm + + +@pytest.fixture +def shap_features_binary(): + """Generate sample SHAP values for binary classification.""" + np.random.seed(42) + return pd.DataFrame(np.random.normal(size=(100, 5)), columns=[f"x{i}" for i in range(5)]) + + +@pytest.fixture +def binary_target(): + """Generate binary target.""" + np.random.seed(42) + return pd.Series(np.random.choice([0, 1], size=100)) + + +def test_binary_classifier_significance(shap_features_binary, binary_target): + """Test significance calculation for binary classification.""" + result_df = binary_classifier_significance(shap_features_binary, binary_target, alpha=1e-4) + + assert "feature name" in result_df.columns, "Result should contain feature names" + assert "coefficient" in result_df.columns, "Result should contain coefficients" + assert "stat.significance" in result_df.columns, "Result should contain statistical significance" + assert result_df.shape[0] == shap_features_binary.shape[1], "Each feature should have a row in the output" + assert (result_df["stat.significance"] > 0).all(), "All p-values should be non-negative" + + +@pytest.fixture +def shap_features_regression(): + """Generate 
sample SHAP values for regression.""" + np.random.seed(42) + return pd.DataFrame(np.random.normal(size=(100, 5)), columns=[f"x{i}" for i in range(5)]) + + +@pytest.fixture +def regression_target(): + """Generate regression target.""" + np.random.seed(42) + return pd.Series(np.random.normal(size=100)) + + +def test_regression_significance(shap_features_regression, regression_target): + """Test significance calculation for regression.""" + result_df = regression_significance(shap_features_regression, regression_target, alpha=1e-6) + + assert "feature name" in result_df.columns, "Result should contain feature names" + assert "coefficient" in result_df.columns, "Result should contain coefficients" + assert "stat.significance" in result_df.columns, "Result should contain statistical significance" + assert result_df.shape[0] == shap_features_regression.shape[1], "Each feature should have a row in the output" + assert (result_df["stat.significance"] > 0).all(), "All p-values should be non-negative" From be17f045a74f5ca9fc6ffbe331b5bfe2a4c183f8 Mon Sep 17 00:00:00 2001 From: bkoseoglu Date: Tue, 1 Oct 2024 16:50:15 +0100 Subject: [PATCH 2/3] remove running tests on push to main --- .github/workflows/test.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 424c196..5211a7a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -4,9 +4,6 @@ on: pull_request: branches: - main - push: - branches: - - main jobs: test: From 68ef18332f2d00f58a8dbd28e720d435841dd604 Mon Sep 17 00:00:00 2001 From: bkoseoglu Date: Wed, 2 Oct 2024 12:04:26 +0100 Subject: [PATCH 3/3] refactor tests --- .gitignore | 8 +++++++- tests/test_regression.py | 2 +- tests/test_shap_feature_generation.py | 2 +- tests/test_significance_calculation.py | 2 +- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 7e0a4a4..46956c1 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,10 @@ 
hs_err_pid* build/ out/ .gradle/ -bin/ \ No newline at end of file +bin/ + +# Python cache files +__pycache__/ +*.py[cod] +*.pyo +*.pyd \ No newline at end of file diff --git a/tests/test_regression.py b/tests/test_regression.py index d5ef6da..e7dbee7 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -256,4 +256,4 @@ def test_selected_column_values(model_type, data_fixture, task_type, request): ] assert ( other_features_rows["selected"] > 0 - ).all(), "The Selected column must have positive values for features other than x7, x8, x9" + ).all(), "The Selected column must have positive values for features other than x7, x8, x9" \ No newline at end of file diff --git a/tests/test_shap_feature_generation.py b/tests/test_shap_feature_generation.py index bcd68e5..179c21c 100644 --- a/tests/test_shap_feature_generation.py +++ b/tests/test_shap_feature_generation.py @@ -1,7 +1,7 @@ import pytest import pandas as pd import numpy as np -from shap_select import create_shap_features +from shap_select.select import create_shap_features import lightgbm as lgb diff --git a/tests/test_significance_calculation.py b/tests/test_significance_calculation.py index 0f5dffd..7d11db7 100644 --- a/tests/test_significance_calculation.py +++ b/tests/test_significance_calculation.py @@ -1,7 +1,7 @@ import pytest import pandas as pd import numpy as np -from shap_select import binary_classifier_significance, regression_significance +from shap_select.select import binary_classifier_significance, regression_significance import statsmodels.api as sm