From 8c684edbe9db17fddb00540cca57500810fb7a29 Mon Sep 17 00:00:00 2001
From: "Egor.Kraev" <egor.kraev@transferwise.com>
Date: Thu, 14 Nov 2024 08:40:27 +0000
Subject: [PATCH] Fit time-series tree with weights and plot weight predictions

---
 tests/timeseries_wip_entrypoint.py |  2 +-
 wise_pizza/explain.py              |  5 ++--
 wise_pizza/plotting_time_tree.py   | 35 +++++++++++++++++++++------
 wise_pizza/slicer.py               | 38 +++++++++++++++++++-----------
 wise_pizza/solve/fitter.py         | 27 ++++++++++++++++-----
 wise_pizza/solve/tree.py           | 22 +++++++++++++----
 wise_pizza/time.py                 |  6 ++---
 7 files changed, 98 insertions(+), 37 deletions(-)
diff --git a/tests/timeseries_wip_entrypoint.py b/tests/timeseries_wip_entrypoint.py
index a38778c..145bf43 100644
--- a/tests/timeseries_wip_entrypoint.py
+++ b/tests/timeseries_wip_entrypoint.py
@@ -39,7 +39,7 @@
     time_name=time,
     verbose=False,
     solver="tree",
-    fit_sizes=False,
+    fit_sizes=True,
 )
 sf.plot(plot_is_static=False, height=1000, width=1000, average_name="VPC")
 print(sf.summary())
diff --git a/wise_pizza/explain.py b/wise_pizza/explain.py
index 23e1388..ffed482 100644
--- a/wise_pizza/explain.py
+++ b/wise_pizza/explain.py
@@ -441,7 +441,7 @@ def explain_timeseries(
 
         # Block-matrix basis with itself
         re_basis = time_basis.copy().rename(
-            {c: c + "_w" for c in time_basis.columns}, axis=1
+            {c: c + "_w" for c in time_basis.columns if c != "__time"}, axis=1
         )
         time_basis["chunk"] = "Average"
         re_basis["chunk"] = "Weights"
@@ -455,6 +455,7 @@ def explain_timeseries(
         groupby_dims = ["__time"]
 
     df2["_target"] = df2[total_name]
+    df2["__time"] = df2[time_name]
     df2["total_adjustment"] = 0.0
     avg_df = 0.0
     average = 0.0
@@ -468,7 +469,7 @@ def explain_timeseries(
     sf.avg_df = avg_df
     sf.time_values = df2[time_name].unique()
     sf.fit(
-        df2[dims],
+        df2[dims + groupby_dims],
         df2["_target"],
         time_col=df2[time_name],
         time_basis=time_basis,
diff --git a/wise_pizza/plotting_time_tree.py b/wise_pizza/plotting_time_tree.py
index 436a3db..6af8f74 100644
--- a/wise_pizza/plotting_time_tree.py
+++ b/wise_pizza/plotting_time_tree.py
@@ -72,10 +72,10 @@ def plot_time_from_tree(
 def preprocess_for_ts_plot(
     sf: SliceFinderPlottingInterface,
     average_name: str,
-    weight_sf: SliceFinderPlottingInterface | None = None,
 ) -> List[List[PlotData]]:
     out = []
     for row, s in enumerate(sf.segments):
+        print(row, s)
         this_df = pd.DataFrame(
             {
                 "time": sf.time,
@@ -84,18 +84,39 @@ def preprocess_for_ts_plot(
                 "pred_totals": sf.avg_prediction * sf.weights * s["dummy"],
             }
         )
+        if sf.weight_total_prediction is not None:
+            this_df["w_pred_totals"] = sf.weight_total_prediction * s["dummy"]
+
         time_df = this_df.groupby("time", as_index=False).sum()
+
         data1 = PlotData(
             regression=time_df["pred_totals"] / time_df["weights"],
             bars=time_df["totals"] / time_df["weights"],
             subtitle=f"{average_name} for <br> {s['segment']}",
         )
-        data2 = PlotData(
-            regression=time_df["pred_totals"],
-            bars=time_df["totals"],
-            subtitle=f"{sf.total_name} for <br> {s['segment']}",
-        )
-        out.append([data1, data2])
+
+        if sf.weight_total_prediction is None:
+            data2 = PlotData(
+                regression=time_df["pred_totals"],
+                bars=time_df["totals"],
+                subtitle=f"{sf.total_name} for <br> {s['segment']}",
+            )
+            out.append([data1, data2])
+        else:
+            data2 = PlotData(
+                # Predicted total = predicted weights * predicted average (pred_totals / weights)
+                regression=time_df["w_pred_totals"]
+                * time_df["pred_totals"]
+                / time_df["weights"],
+                bars=time_df["totals"],
+                subtitle=f"{sf.total_name} for <br> {s['segment']}",
+            )
+            data3 = PlotData(
+                regression=time_df["w_pred_totals"],
+                bars=time_df["weights"],
+                subtitle=f"{sf.size_name} for <br> {s['segment']}",
+            )
+            out.append([data3, data1, data2])
 
     return out
 
diff --git a/wise_pizza/slicer.py b/wise_pizza/slicer.py
index a9013dd..290e4f8 100644
--- a/wise_pizza/slicer.py
+++ b/wise_pizza/slicer.py
@@ -158,14 +158,15 @@ def fit(
         dim_df = dim_df.astype(str)
 
         dims = list(dim_df.columns)
+        if groupby_dims is not None:
+            dims = [d for d in dims if d not in groupby_dims]
         # sort the dataframe by dimension values,
         # making sure the other vectors stay aligned
         dim_df = dim_df.reset_index(drop=True)
         dim_df["totals"] = totals
         dim_df["weights"] = weights
 
-        if time_col is not None:
-            dim_df["__time"] = time_col
+        if groupby_dims is not None:
             dim_df = pd.merge(dim_df, time_basis, on=groupby_dims)
             sort_dims = dims + groupby_dims
         else:
@@ -174,6 +175,11 @@ def fit(
         dim_df = dim_df.sort_values(sort_dims)
         dim_df = dim_df[dim_df["weights"] > 0]
 
+        if len(groupby_dims) == 2:
+            source_df = dim_df[dim_df["chunk"] == "Average"]
+        else:
+            source_df = dim_df
+
         # Transform the time basis from table by date to matrices by dataset row
         if time_col is not None:
             self.basis_df = time_basis
@@ -184,12 +190,12 @@ def fit(
             #     # take all the values a nudge away from zero so we can divide by them later
             #     this_ts[np.abs(this_ts) < 1e-6 * max_val] = 1e-6 * max_val
             #     self.time_basis[c] = csc_matrix(this_ts)
-            self.time = dim_df["__time"].values
+            self.time = source_df["__time"].values
         # else:
         #     self.time_basis = None
 
-        self.weights = dim_df["weights"].values
-        self.totals = dim_df["totals"].values
+        self.weights = source_df["weights"].values
+        self.totals = source_df["totals"].values
 
         # While we still have weights and totals as part of the dataframe, let's produce clusters
         # of dimension values with similar outcomes
@@ -203,7 +209,7 @@ def fit(
                     "Ignoring cluster_values argument as tree solver makes its own clusters"
                 )
             if time_basis is None:
-                self.X, self.col_defs, self.cluster_names, _ = tree_solver(
+                self.X, self.col_defs, self.cluster_names, _, _ = tree_solver(
                     dim_df=dim_df,
                     dims=dims,
                     num_leaves=max_segments,
@@ -233,14 +239,18 @@ def fit(
                     time_fitter_model=time_fitter_model,
                     groupby_dims=groupby_dims,
                 )
-                self.X, self.col_defs, self.cluster_names, self.avg_prediction = (
-                    tree_solver(
-                        dim_df=dim_df,
-                        dims=dims,
-                        fitter=fitter,
-                        num_leaves=max_segments,
-                        max_depth=max_depth,
-                    )
+                (
+                    self.X,
+                    self.col_defs,
+                    self.cluster_names,
+                    self.avg_prediction,
+                    self.weight_total_prediction,
+                ) = tree_solver(
+                    dim_df=dim_df,
+                    dims=dims,
+                    fitter=fitter,
+                    num_leaves=max_segments,
+                    max_depth=max_depth,
                 )
             self.nonzeros = np.array(range(self.X.shape[1]))
 
diff --git a/wise_pizza/solve/fitter.py b/wise_pizza/solve/fitter.py
index e8a31fc..f002cdd 100644
--- a/wise_pizza/solve/fitter.py
+++ b/wise_pizza/solve/fitter.py
@@ -26,7 +26,11 @@ def fit_predict(self, X, y, sample_weight=None):
     def error(self, X, y, sample_weight=None):
         # Error is chosen so that it's minimized by the weighted mean of y
         # debug_plot(X, y, self.predict(X), sample_weight)
-        err = y - self.predict(X)
+        X = X.copy()
+        X["target"] = y
+        if hasattr(self, "dims"):
+            X = X.sort_values(self.dims + self.groupby_dims)
+        err = X["target"] - self.predict(X)
         errsq = err**2
         if sample_weight is not None:
             errsq *= sample_weight
@@ -64,6 +68,7 @@ def fit_predict(self, X, y, sample_weight=None):
 class AverageFitter(Fitter):
     def __init__(self):
         self.avg = None
+        self.groupby_dims = []
 
     def fit(self, X, y, sample_weight=None):
         y = np.array(y)
@@ -109,7 +114,7 @@ def fit(self, X, y, sample_weight=None):
 
     def predict(self, X):
         # predict straight away on the big table, it's row-wise anyway
-        return self.time_fitter.predict(X[self.groupby_dims + self.dims])
+        return self.time_fitter.predict(X[self.dims + self.groupby_dims])
 
 
 class TimeFitterLinearModel(TimeFitterModel):
@@ -137,13 +142,16 @@ def fit(self, X: pd.DataFrame, y, sample_weight=None):
             self.basis,
             on=self.groupby_dims,
         )
+        this_X = this_basis[self.basis_cols]
+        w = this_basis["weights"].values
+        w = w / w.max()
         self.reg = LinearRegression().fit(
-            X=this_basis[self.basis_cols],
-            y=this_basis["target"],
-            sample_weight=None if sample_weight is None else this_basis["weights"],
+            X=this_X,
+            y=this_basis["target"].values,
+            sample_weight=None if sample_weight is None else w,
         )
         ## testing code begins
-        # self.prediction = self.reg.predict(this_basis[self.basis_cols])
+        # self.prediction = self.reg.predict(this_X)
         # test = pd.DataFrame(
         #     {
         #         "time": this_basis[self.time_col],
@@ -151,7 +159,14 @@ def fit(self, X: pd.DataFrame, y, sample_weight=None):
         #         "prediction": self.prediction,
         #     }
         # )
+        # import matplotlib.pyplot as plt
+        #
+        # plt.plot(test["target"], label="target")
+        # plt.plot(test["prediction"], label="prediction")
+        # plt.legend()
+        # plt.show()
         # print("yay!")
+
         ## testing code ends
 
     def predict(self, X: pd.DataFrame):
diff --git a/wise_pizza/solve/tree.py b/wise_pizza/solve/tree.py
index 955a3e3..371b7b5 100644
--- a/wise_pizza/solve/tree.py
+++ b/wise_pizza/solve/tree.py
@@ -50,11 +50,25 @@ def tree_solver(
 
     # The convention in the calling code is first dims then time
     re_df = pd.concat([leaf.df for leaf in leaves]).sort_values(
-        dims if isinstance(fitter, AverageFitter) else dims + fitter.groupby_dims
+        dims + fitter.groupby_dims
     )
+
+    if len(fitter.groupby_dims) == 2:  # Time series with weights
+        re_df_w = re_df[re_df["chunk"] == "Weights"].copy()
+        re_df = re_df[re_df["chunk"] == "Average"]
+        w_total_prediction = (re_df_w["prediction"] * re_df_w["weights"]).values
+    else:
+        w_total_prediction = None
+
     X = pd.get_dummies(re_df["Segment_id"]).values
 
-    return csc_matrix(X), col_defs, cluster_names, re_df["prediction"].values
+    return (
+        csc_matrix(X),
+        col_defs,
+        cluster_names,
+        re_df["prediction"].values,
+        w_total_prediction,
+    )
 
 
 def error(x: np.ndarray, y: np.ndarray) -> float:
@@ -83,7 +97,7 @@ def __init__(
         max_depth: Optional[int] = None,
         dim_split: Optional[Dict[str, List]] = None,
     ):
-        self.df = df.copy().sort_values([time_col] + dims)
+        self.df = df.copy().sort_values(dims + fitter.groupby_dims)
         self.fitter = fitter
         self.dims = dims
         self.time_col = time_col
@@ -102,7 +116,7 @@ def depth(self):
 
     @property
     def error(self):
-        this_X = self.df[self.dims + ([] if self.time_col is None else [self.time_col])]
+        this_X = self.df[self.dims + self.fitter.groupby_dims]
         if self.model is None:
             self.model = copy.deepcopy(self.fitter)
             self.model.fit(
diff --git a/wise_pizza/time.py b/wise_pizza/time.py
index 148a379..e50e569 100644
--- a/wise_pizza/time.py
+++ b/wise_pizza/time.py
@@ -19,9 +19,9 @@ def create_time_basis(
     const = np.ones(len(t))
     linear = np.cumsum(const)
     linear -= linear.mean()  # now orthogonal to const
-    col_names = ["Slope"]
+    col_names = ["Intercept", "Slope"]
 
-    dummies = [linear]
+    dummies = [const, linear]
 
     if include_breaks:
         for i in range(1, len(t)):
@@ -47,7 +47,7 @@ def prune_time_basis(
     # from all the possible kinks, choose evenly spaced num_breaks ones
     for i in range(1, num_breaks + 1):
         chosen_cols.append(dtrend_cols[int(i * len(dtrend_cols) / (num_breaks + 1))])
-    pre_basis = time_basis[list(time_basis.columns[:2]) + chosen_cols].copy()
+    pre_basis = time_basis[["Intercept", "Slope"] + chosen_cols].copy()
     if solver != "tree":
         # TODO: fix this bug
         for c in chosen_cols: