Skip to content

Commit

Permalink
ENH change standard deviation to standard error (#9)
Browse files Browse the repository at this point in the history
- compute_bias: change bias_stddev to bias_stderr
- plot_bias: use bias_stderr for errorbars
  • Loading branch information
lorentzenchr authored Jan 6, 2023
1 parent 62256fc commit e01a8cf
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 72 deletions.
166 changes: 105 additions & 61 deletions docs/examples/regression_on_workers_compensation.ipynb

Large diffs are not rendered by default.

17 changes: 14 additions & 3 deletions src/model_diagnostics/calibration/identification.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def compute_bias(
Observed values of the response variable.
For binary classification, y_obs is expected to be in the interval [0, 1].
y_pred : array-like of shape (n_obs) or (n_obs, n_models)
Predicted values of the conditional expectation of Y, :math:`E(Y|X)`.
Predicted values of the conditional expectation of Y, \(E(Y|X)\).
feature : array-like of shape (n_obs) or None
Some feature column.
functional : str
Expand All @@ -149,6 +149,11 @@ def compute_bias(
Returns
-------
df : pyarrow Table
The result table contains at least the columns:
- `bias_mean`: Mean of the bias
    - `bias_count`: Number of data rows
- `bias_stderr`: Standard error, i.e. standard deviation of `bias_mean`
Notes
-----
Expand Down Expand Up @@ -267,7 +272,13 @@ def compute_bias(
cnames[-1] = feature_name
df = df.rename_columns(cnames)

# Add p-value of 2-sided t-test.
# Add column standard error.
df = df.append_column(
"bias_stderr",
pc.divide(df.column("bias_stddev"), pc.sqrt(df.column("bias_count"))),
)

# Add column with p-value of 2-sided t-test.
s_ = df.column("bias_stddev").to_numpy()
p_value = np.full_like(s_, fill_value=np.nan)
mask: npt.ArrayLike = s_ > 0
Expand Down Expand Up @@ -297,7 +308,7 @@ def compute_bias(
col_selection.append(model_col_name)
if feature_name is not None and feature_name in df.column_names:
col_selection.append(feature_name)
col_selection += ["bias_mean", "bias_count", "bias_stddev", "p_value"]
col_selection += ["bias_mean", "bias_count", "bias_stderr", "p_value"]
df_list.append(df.select(col_selection))

return pa.concat_tables(df_list)
4 changes: 2 additions & 2 deletions src/model_diagnostics/calibration/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,14 +167,14 @@ def plot_bias(
min_max = pc.min_max(df[feature_name]).as_py()
ax.hlines(0, min_max["min"], min_max["max"], color="k", linestyles="dotted")
# bias plot
if df["bias_stddev"].null_count > 0:
if df["bias_stderr"].null_count > 0:

ax.plot(df[feature_name], df["bias_mean"], "o-")
else:
ax.errorbar(
df[feature_name],
df["bias_mean"],
yerr=df["bias_stddev"],
yerr=df["bias_stderr"],
fmt="o-",
capsize=4,
)
Expand Down
13 changes: 7 additions & 6 deletions src/model_diagnostics/calibration/tests/test_identification.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def test_compute_bias(feature, f_grouped):
"feature": f_grouped,
"bias_mean": [0.5, -1],
"bias_count": [2, 3],
"bias_stddev": np.sqrt([0.25 + 0.25, (1 + 1 + 0) / 2]),
"bias_stderr": np.sqrt([(0.25 + 0.25), (1 + 1 + 0) / 2]) / np.sqrt([2, 3]),
"p_value": [
ttest_1samp([1, 0], 0).pvalue,
ttest_1samp([0, -2, -1], 0).pvalue,
Expand All @@ -146,9 +146,9 @@ def test_compute_bias_feature_none():
)
df_expected = pa.table(
{
"bias_mean": [-0.4], # (1 + 0 + 0 - 2 - 1)/5
"bias_mean": [-0.4], # (1 + 0 + 0 - 2 - 1) / 5
"bias_count": [5],
"bias_stddev": [np.std([1, 0, 0, -2, -1], ddof=1)],
"bias_stderr": [np.std([1, 0, 0, -2, -1], ddof=1) / np.sqrt(5)],
"p_value": [
ttest_1samp([1, 0, 0, -2, -1], 0).pvalue,
],
Expand Down Expand Up @@ -181,8 +181,9 @@ def test_compute_bias_numerical_feature():
"feature": 0.045 + 0.1 * np.arange(10),
"bias_mean": 0.955 - 0.1 * np.arange(10),
"bias_count": n_steps * np.ones(n_bins, dtype=np.int64),
"bias_stddev": [
np.std(bias[n : n + n_steps], ddof=1) for n in range(0, n_obs, n_steps)
"bias_stderr": [
np.std(bias[n : n + n_steps], ddof=1) / np.sqrt(n_steps)
for n in range(0, n_obs, n_steps)
],
"p_value": [
ttest_1samp(bias[n : n + n_steps], 0).pvalue
Expand Down Expand Up @@ -211,7 +212,7 @@ def test_compute_bias_multiple_predictions():
"nice_feature": [1.0, 2, 1, 2],
"bias_mean": [0.0, -1, 2, 1],
"bias_count": [5] * 4,
"bias_stddev": [0.0] * 4,
"bias_stderr": [0.0] * 4,
"p_value": [np.nan] * 4,
}
)
Expand Down

0 comments on commit e01a8cf

Please sign in to comment.