transferwise · bkoseoglu · Oct 2, 2024 · Oct 1, 2024 · Oct 1, 2024 · Oct 2, 2024
@@ -4,9 +4,6 @@ on:
   pull_request:
     branches:
       - main
-  push:
-    branches:
-      - main
 
 jobs:
   test:

@@ -23,4 +23,10 @@ hs_err_pid*
 build/
 out/
 .gradle/
-bin/
+bin/
+
+# Python cache files
+__pycache__/
+*.py[cod]
+*.pyo
+*.pyd
@@ -256,4 +256,4 @@ def test_selected_column_values(model_type, data_fixture, task_type, request):
     ]
     assert (
         other_features_rows["selected"] > 0
-    ).all(), "The Selected column must have positive values for features other than x7, x8, x9"
+    ).all(), "The Selected column must have positive values for features other than x7, x8, x9"
@@ -0,0 +1,49 @@
+import pytest
+import pandas as pd
+import numpy as np
+from shap_select.select import create_shap_features
+import lightgbm as lgb
+
+
+@pytest.fixture
+def sample_data_binary():
+    """Return a seeded ``(X, y)`` pair for binary classification tests.
+
+    ``X`` is a 100x5 frame of standard-normal draws with columns ``x0``..``x4``;
+    ``y`` is 1 where ``x0`` is positive and 0 otherwise.
+    """
+    np.random.seed(42)
+    column_names = [f"x{i}" for i in range(5)]
+    draws = np.random.normal(size=(100, 5))
+    features = pd.DataFrame(draws, columns=column_names)
+    target = (features["x0"] > 0).astype(int)
+    return features, target
+
+
+@pytest.fixture
+def sample_data_multiclass():
+    """Return a seeded ``(X, y)`` pair for multiclass classification tests.
+
+    ``X`` is a 100x5 frame of standard-normal draws with columns ``x0``..``x4``;
+    ``y`` holds 100 labels drawn from ``{0, 1, 2}`` independently of ``X``.
+    """
+    np.random.seed(42)
+    column_names = [f"x{i}" for i in range(5)]
+    # Draw the features before the labels so the seeded sequence is unchanged.
+    draws = np.random.normal(size=(100, 5))
+    features = pd.DataFrame(draws, columns=column_names)
+    labels = np.random.choice([0, 1, 2], size=100)
+    return features, labels
+
+
+def test_shap_feature_generation_binary(sample_data_binary):
+    """Check the SHAP frame produced for a fitted binary LightGBM classifier.
+
+    The value returned by ``create_shap_features`` must be a DataFrame with the
+    same shape as the input features and must contain no missing values.
+    """
+    features, target = sample_data_binary
+
+    classifier = lgb.LGBMClassifier()
+    classifier.fit(features, target)
+    shap_df = create_shap_features(classifier, features)
+
+    assert isinstance(shap_df, pd.DataFrame), "SHAP output should be a DataFrame"
+    assert shap_df.shape == features.shape, "SHAP output shape should match input data"
+    assert not shap_df.isnull().any().any(), "No missing values expected in SHAP output"
+
+
+def test_shap_feature_generation_multiclass(sample_data_multiclass):
+    """Check the per-class SHAP frames produced for a multiclass LightGBM model.
+
+    ``create_shap_features`` is expected to return a dict of DataFrames when
+    called with ``classes=[0, 1, 2]``; every frame must have the same shape as
+    the input features.
+    """
+    X, y = sample_data_multiclass
+
+    model = lgb.LGBMClassifier(objective="multiclass", num_class=3)
+    model.fit(X, y)
+
+    shap_df = create_shap_features(model, X, classes=[0, 1, 2])
+    assert isinstance(shap_df, dict), "SHAP output should be a dictionary for multiclass"
+    assert all(isinstance(v, pd.DataFrame) for v in shap_df.values()), "Each class should have a DataFrame"
+    # Class 0 must be present (the original check indexed it directly) ...
+    assert 0 in shap_df, "SHAP output should contain an entry for class 0"
+    # ... and the shape requirement is verified for every class, as the message states.
+    for class_label, class_frame in shap_df.items():
+        assert class_frame.shape == X.shape, (
+            f"SHAP output shape should match input data for each class (class {class_label})"
+        )
@@ -0,0 +1,55 @@
+import pytest
+import pandas as pd
+import numpy as np
+from shap_select.select import binary_classifier_significance, regression_significance
+import statsmodels.api as sm
+
+
+@pytest.fixture
+def shap_features_binary():
+    """Return a seeded 100x5 frame of standard-normal stand-in SHAP values.
+
+    Columns are named ``x0``..``x4``.
+    """
+    np.random.seed(42)
+    column_names = [f"x{i}" for i in range(5)]
+    draws = np.random.normal(size=(100, 5))
+    return pd.DataFrame(draws, columns=column_names)
+
+
+@pytest.fixture
+def binary_target():
+    """Return a seeded Series of 100 labels drawn from ``{0, 1}``."""
+    np.random.seed(42)
+    labels = np.random.choice([0, 1], size=100)
+    return pd.Series(labels)
+
+
+def test_binary_classifier_significance(shap_features_binary, binary_target):
+    """Check the result table returned by ``binary_classifier_significance``.
+
+    The table must expose the ``feature name``, ``coefficient`` and
+    ``stat.significance`` columns, contain one row per input feature, and hold
+    only non-negative significance values.
+    """
+    # NOTE(review): alpha is forwarded unchanged; its exact meaning is defined
+    # in shap_select.select, not in this test.
+    result_df = binary_classifier_significance(shap_features_binary, binary_target, alpha=1e-4)
+
+    assert "feature name" in result_df.columns, "Result should contain feature names"
+    assert "coefficient" in result_df.columns, "Result should contain coefficients"
+    assert "stat.significance" in result_df.columns, "Result should contain statistical significance"
+    assert result_df.shape[0] == shap_features_binary.shape[1], "Each feature should have a row in the output"
+    # ">= 0" matches the "non-negative" message; a p-value that underflows to
+    # exactly 0.0 is still valid and must not fail the test.
+    assert (result_df["stat.significance"] >= 0).all(), "All p-values should be non-negative"
+
+
+@pytest.fixture
+def shap_features_regression():
+    """Return a seeded 100x5 frame of standard-normal stand-in SHAP values.
+
+    Columns are named ``x0``..``x4``.
+    """
+    np.random.seed(42)
+    column_names = [f"x{i}" for i in range(5)]
+    draws = np.random.normal(size=(100, 5))
+    return pd.DataFrame(draws, columns=column_names)
+
+
+@pytest.fixture
+def regression_target():
+    """Return a seeded Series of 100 standard-normal target values."""
+    np.random.seed(42)
+    values = np.random.normal(size=100)
+    return pd.Series(values)
+
+
+def test_regression_significance(shap_features_regression, regression_target):
+    """Check the result table returned by ``regression_significance``.
+
+    The table must expose the ``feature name``, ``coefficient`` and
+    ``stat.significance`` columns, contain one row per input feature, and hold
+    only non-negative significance values.
+    """
+    # NOTE(review): alpha is forwarded unchanged; its exact meaning is defined
+    # in shap_select.select, not in this test.
+    result_df = regression_significance(shap_features_regression, regression_target, alpha=1e-6)
+
+    assert "feature name" in result_df.columns, "Result should contain feature names"
+    assert "coefficient" in result_df.columns, "Result should contain coefficients"
+    assert "stat.significance" in result_df.columns, "Result should contain statistical significance"
+    assert result_df.shape[0] == shap_features_regression.shape[1], "Each feature should have a row in the output"
+    # ">= 0" matches the "non-negative" message; a p-value that underflows to
+    # exactly 0.0 is still valid and must not fail the test.
+    assert (result_df["stat.significance"] >= 0).all(), "All p-values should be non-negative"
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,9 +4,6 @@ on: @@
       pull_request:
         branches:
           - main
-      push:
-        branches:
-          - main
     jobs:
       test:
@@ Expand Down @@