From f775d39cc0ec605d05c966684dc95a1c0537a0c3 Mon Sep 17 00:00:00 2001
From: "Egor.Kraev" <egor.kraev@transferwise.com>
Date: Tue, 14 May 2024 10:32:05 +0100
Subject: [PATCH 1/2] Deprecate min_segments parameter

---
 wise_pizza/explain.py          |  8 +++---
 wise_pizza/slicer.py           |  5 ++--
 wise_pizza/solve/find_alpha.py | 17 +----------
 wise_pizza/utils.py            | 52 ++++++++++++++++++++++++++++------
 4 files changed, 52 insertions(+), 30 deletions(-)

diff --git a/wise_pizza/explain.py b/wise_pizza/explain.py
index 1f6e3b3..ab1066a 100644
--- a/wise_pizza/explain.py
+++ b/wise_pizza/explain.py
@@ -27,7 +27,7 @@ def explain_changes_in_average(
     total_name: str,
     size_name: str,
     min_segments: Optional[int] = None,
-    max_segments: int = 5,
+    max_segments: int = None,
     min_depth: int = 1,
     max_depth: int = 2,
     solver: str = "lasso",
@@ -124,7 +124,7 @@ def explain_changes_in_totals(
     total_name: str,
     size_name: str,
     min_segments: Optional[int] = None,
-    max_segments: int = 5,
+    max_segments: int = None,
     min_depth: int = 1,
     max_depth: int = 2,
     solver: str = "lasso",
@@ -271,7 +271,7 @@ def explain_levels(
     total_name: str,
     size_name: Optional[str] = None,
     min_segments: int = None,
-    max_segments: int = 10,
+    max_segments: int = None,
     min_depth: int = 1,
     max_depth: int = 2,
     solver="lasso",
@@ -353,7 +353,7 @@ def explain_timeseries(
     time_name: str,
     size_name: Optional[str] = None,
     min_segments: int = None,
-    max_segments: int = 5,
+    max_segments: int = None,
     min_depth: int = 1,
     max_depth: int = 2,
     solver: str = "omp",
diff --git a/wise_pizza/slicer.py b/wise_pizza/slicer.py
index de0b267..1033513 100644
--- a/wise_pizza/slicer.py
+++ b/wise_pizza/slicer.py
@@ -8,7 +8,8 @@
 import pandas as pd
 from scipy.sparse import csc_matrix, diags
 
-from wise_pizza.solve.find_alpha import clean_up_min_max, find_alpha
+from wise_pizza.solve.find_alpha import find_alpha
+from wise_pizza.utils import clean_up_min_max
 from wise_pizza.make_matrix import sparse_dummy_matrix
 from wise_pizza.cluster import make_clusters
 from wise_pizza.preselect import HeuristicSelector
@@ -98,7 +99,7 @@ def fit(
         weights: pd.Series = None,
         time_col: pd.Series = None,
         time_basis: pd.DataFrame = None,
-        min_segments: int = 10,
+        min_segments: int = None,
         max_segments: int = None,
         min_depth: int = 1,
         max_depth: int = 3,
diff --git a/wise_pizza/solve/find_alpha.py b/wise_pizza/solve/find_alpha.py
index ec454fe..8e05826 100644
--- a/wise_pizza/solve/find_alpha.py
+++ b/wise_pizza/solve/find_alpha.py
@@ -4,6 +4,7 @@
 from scipy.linalg import svd
 
 from wise_pizza.solve.solver import solve_lasso, solve_lp, solve_omp
+from wise_pizza.utils import clean_up_min_max
 
 
 def find_alpha(
@@ -219,19 +220,3 @@ def print_errors(a: np.ndarray):
         # fit_intercept=not use_proj
     )
     return reg, nonzeros
-
-
-def clean_up_min_max(min_nonzeros: int = None, max_nonzeros: int = None):
-    assert min_nonzeros is not None or max_nonzeros is not None
-    if max_nonzeros is None:
-        if min_nonzeros is None:
-            max_nonzeros = 5
-            min_nonzeros = 5
-        else:
-            max_nonzeros = min_nonzeros
-    else:
-        if min_nonzeros is None:
-            min_nonzeros = max_nonzeros
-
-    assert min_nonzeros <= max_nonzeros
-    return min_nonzeros, max_nonzeros
diff --git a/wise_pizza/utils.py b/wise_pizza/utils.py
index b4ad53a..296938c 100644
--- a/wise_pizza/utils.py
+++ b/wise_pizza/utils.py
@@ -1,3 +1,4 @@
+import logging
 from typing import List, Optional
 
 import numpy as np
@@ -120,7 +121,9 @@ def rel_error(x, y):
 
         if return_multiple:
             sd_size = SegmentData(
-                combined.rename(columns={"Change in totals": "Change from segment size"}),
+                combined.rename(
+                    columns={"Change in totals": "Change from segment size"}
+                ),
                 dimensions=dims,
                 segment_total="Change from segment size",
                 segment_size=weights,
@@ -137,7 +140,9 @@ def rel_error(x, y):
             combined["Change from"] = "Segment size"
             c2["Change from"] = "Segment average"
 
-            df = pd.concat([combined, c2])[dims + [weights, "Change in totals", "Change from"]]
+            df = pd.concat([combined, c2])[
+                dims + [weights, "Change in totals", "Change from"]
+            ]
             df_change_in_totals = np.array(df["Change in totals"], dtype=np.longdouble)
             combined_dtotals = np.array(combined["dtotals"], dtype=np.longdouble)
             df_change_in_totals_sum = np.nansum(df_change_in_totals)
@@ -160,7 +165,9 @@ def rel_error(x, y):
         combined[weights] = 1.0  # combined[totals + "_x"]
         combined[weights] = np.maximum(1.0, combined[weights])
         cols = (
-            dims + ["Change in totals", totals + "_x", totals + "_y"] + [c for c in combined.columns if "baseline" in c]
+            dims
+            + ["Change in totals", totals + "_x", totals + "_y"]
+            + [c for c in combined.columns if "baseline" in c]
         )
 
         return SegmentData(
@@ -205,16 +212,26 @@ def prepare_df(
 
     # replace NaN values in categorical columns with the column name + "_unknown"
     object_columns = list(new_df[dims].select_dtypes("object").columns)
-    new_df[object_columns] = new_df[object_columns].fillna(new_df[object_columns].apply(lambda x: x.name + "_unknown"))
+    new_df[object_columns] = new_df[object_columns].fillna(
+        new_df[object_columns].apply(lambda x: x.name + "_unknown")
+    )
     new_df[object_columns] = new_df[object_columns].astype(str)
 
     # Groupby all relevant dims to decrease the dataframe size, if possible
     group_dims = dims if time_name is None else dims + [time_name]
 
     if size_name is not None:
-        new_df = new_df.groupby(by=group_dims, observed=True)[[total_name, size_name]].sum().reset_index()
+        new_df = (
+            new_df.groupby(by=group_dims, observed=True)[[total_name, size_name]]
+            .sum()
+            .reset_index()
+        )
     else:
-        new_df = new_df.groupby(by=group_dims, observed=True)[[total_name]].sum().reset_index()
+        new_df = (
+            new_df.groupby(by=group_dims, observed=True)[[total_name]]
+            .sum()
+            .reset_index()
+        )
 
     return new_df
 
@@ -280,5 +297,24 @@ def prepare_df(
 #     new_df[object_columns] = new_df[object_columns].astype(str)
 #
 #     return new_df
-def almost_equals(x1, x2, eps: float=1e-6) -> bool:
-    return np.sum(np.abs(x1-x2))/np.mean(np.abs(x1+x2)) < eps
+def almost_equals(x1, x2, eps: float = 1e-6) -> bool:
+    return np.sum(np.abs(x1 - x2)) / np.mean(np.abs(x1 + x2)) < eps
+
+
+def clean_up_min_max(min_nonzeros: int = None, max_nonzeros: int = None):
+    if min_nonzeros is not None:
+        logging.info(
+            "min_segments parameter is deprecated, please use max_segments instead."
+        )
+    if max_nonzeros is None:
+        if min_nonzeros is None:
+            max_nonzeros = 5
+            min_nonzeros = 5
+        else:
+            max_nonzeros = min_nonzeros
+    else:
+        if min_nonzeros is None:
+            min_nonzeros = max_nonzeros
+
+    assert min_nonzeros <= max_nonzeros
+    return min_nonzeros, max_nonzeros

From 7e846f8ee2bf4fa38feb92a2d8c233e0c398aa74 Mon Sep 17 00:00:00 2001
From: "Egor.Kraev" <egor.kraev@transferwise.com>
Date: Tue, 14 May 2024 10:34:23 +0100
Subject: [PATCH 2/2] Make deprecation message a warning instead of info

---
 wise_pizza/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wise_pizza/utils.py b/wise_pizza/utils.py
index 296938c..6df4ac8 100644
--- a/wise_pizza/utils.py
+++ b/wise_pizza/utils.py
@@ -303,7 +303,7 @@ def almost_equals(x1, x2, eps: float = 1e-6) -> bool:
 
 def clean_up_min_max(min_nonzeros: int = None, max_nonzeros: int = None):
     if min_nonzeros is not None:
-        logging.info(
+        logging.warning(
             "min_segments parameter is deprecated, please use max_segments instead."
         )
     if max_nonzeros is None: