From 4ce606f2790ba6f80359f95564a61422243c9714 Mon Sep 17 00:00:00 2001
From: "Egor.Kraev"
Date: Thu, 19 Sep 2024 09:33:32 +0100
Subject: [PATCH] First decent cut of code, with example usage

---
 docs/example.py         | 91 +++++++++++++++++++++++++++++++++++++++++
 shap_select/__init__.py |  1 +
 shap_select/select.py   |  3 +-
 3 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 docs/example.py
 create mode 100644 shap_select/__init__.py

diff --git a/docs/example.py b/docs/example.py
new file mode 100644
index 0000000..d446c38
--- /dev/null
+++ b/docs/example.py
@@ -0,0 +1,91 @@
+import numpy as np
+import pandas as pd
+import lightgbm as lgb
+import xgboost as xgb
+from sklearn.model_selection import train_test_split
+
+from shap_select import score_features
+
+# Generate a dataset with 9 normally distributed features and a target based on a given formula
+np.random.seed(42)
+n_samples = 100000
+
+# Create 9 normally distributed features
+X = pd.DataFrame(
+    {
+        "x1": np.random.normal(size=n_samples),
+        "x2": np.random.normal(size=n_samples),
+        "x3": np.random.normal(size=n_samples),
+        "x4": np.random.normal(size=n_samples),
+        "x5": np.random.normal(size=n_samples),
+        "x6": np.random.normal(size=n_samples),
+        "x7": np.random.normal(size=n_samples),
+        "x8": np.random.normal(size=n_samples),
+        "x9": np.random.normal(size=n_samples),
+    }
+)
+
+# Make all the features positive-ish
+X += 3
+
+# Define the target based on the formula y = x1 + x2*x3 + x4*x5*x6, plus heavy noise
+y = (
+    X["x1"]
+    + X["x2"] * X["x3"]
+    + X["x4"] * X["x5"] * X["x6"]
+    + 10 * np.random.normal(size=n_samples)  # lots of noise
+)
+# Rescale x6 and add noise to it, so it's only a noisy proxy of the signal used in y
+X["x6"] *= 0.1
+X["x6"] += np.random.normal(size=n_samples)
+
+# Split the dataset into a training set (90K rows) and a validation set (10K rows)
+X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
+
+use_lightgbm = True
+stopping_rounds = 50
+
+if use_lightgbm:
+    # Train a LightGBM model on the training data
+    train_data = lgb.Dataset(X_train, label=y_train)
+    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
+    params = {"objective": "regression", "metric": "rmse", "verbose": -1}
+    model = lgb.train(
+        params,
+        train_data,
+        num_boost_round=1000,  # Max number of boosting rounds
+        valid_sets=[train_data, val_data],  # Training and validation sets
+        valid_names=["train", "valid"],  # Name the datasets
+        callbacks=[
+            lgb.early_stopping(stopping_rounds=stopping_rounds)
+        ],  # Stop if the validation score doesn't improve for `stopping_rounds` rounds
+    )
+else:
+    dtrain = xgb.DMatrix(X_train, label=y_train)
+    dval = xgb.DMatrix(X_val, label=y_val)
+
+    # Set parameters for XGBoost
+    params = {
+        "objective": "reg:squarederror",  # Regression task
+        "eval_metric": "rmse",  # Metric to evaluate
+        "verbosity": 0,  # Set to 0 to disable output
+    }
+
+    # Train the model with early stopping
+    evals = [(dval, "valid")]
+    model = xgb.train(
+        params,
+        dtrain,
+        num_boost_round=1000,  # Max number of boosting rounds
+        evals=evals,  # Evaluation set
+        early_stopping_rounds=stopping_rounds,  # Stop if validation RMSE doesn't improve for `stopping_rounds` rounds
+    )
+
+
+# Call the score_features function on the validation set
+selected_features_df, shap_features = score_features(
+    model, X_val, X.columns.tolist(), y_val
+)
+
+# Output the resulting DataFrame
+print(selected_features_df.head())
diff --git a/shap_select/__init__.py b/shap_select/__init__.py
new file mode 100644
index 0000000..9d26d4d
--- /dev/null
+++ b/shap_select/__init__.py
@@ -0,0 +1 @@
+from .select import score_features
diff --git a/shap_select/select.py b/shap_select/select.py
index 6be0001..be3398f 100644
--- a/shap_select/select.py
+++ b/shap_select/select.py
@@ -1,4 +1,5 @@
 from typing import Any, Tuple, List
+
 import pandas as pd
 import statsmodels.api as sm
 import shap
@@ -199,7 +200,7 @@ def shap_features_to_significance(
 
     return result_df_sorted
 
 
-def select_features(
+def score_features(
     tree_model: Any,
     validation_df: pd.DataFrame,
     feature_names: List[str],
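
For reviewers, a minimal standalone sketch of the renamed entry point, mirroring the call signature used in docs/example.py above: (model, validation frame, feature names, target). The dataset, model settings, and variable names here are illustrative only, and score_features is assumed to return the scored-features DataFrame plus the SHAP-values frame, as in that example.

    import lightgbm as lgb
    import numpy as np
    import pandas as pd

    from shap_select import score_features

    # Tiny synthetic regression problem (sizes and column names are illustrative)
    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(1000, 3)), columns=["a", "b", "c"])
    y = X["a"] + 2 * X["b"] + rng.normal(size=1000)

    # A small LightGBM model is enough to exercise the API
    model = lgb.train(
        {"objective": "regression", "verbose": -1},
        lgb.Dataset(X, label=y),
        num_boost_round=50,
    )

    # Same argument order as in docs/example.py
    scored_df, shap_df = score_features(model, X, X.columns.tolist(), y)
    print(scored_df.head())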