From 8c684edbe9db17fddb00540cca57500810fb7a29 Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Thu, 14 Nov 2024 08:40:27 +0000 Subject: [PATCH] A pretty decent TS tree fit with weights --- tests/timeseries_wip_entrypoint.py | 2 +- wise_pizza/explain.py | 5 ++-- wise_pizza/plotting_time_tree.py | 35 +++++++++++++++++++++------ wise_pizza/slicer.py | 38 +++++++++++++++++++----------- wise_pizza/solve/fitter.py | 27 ++++++++++++++++----- wise_pizza/solve/tree.py | 22 +++++++++++++---- wise_pizza/time.py | 6 ++--- 7 files changed, 98 insertions(+), 37 deletions(-) diff --git a/tests/timeseries_wip_entrypoint.py b/tests/timeseries_wip_entrypoint.py index a38778c..145bf43 100644 --- a/tests/timeseries_wip_entrypoint.py +++ b/tests/timeseries_wip_entrypoint.py @@ -39,7 +39,7 @@ time_name=time, verbose=False, solver="tree", - fit_sizes=False, + fit_sizes=True, ) sf.plot(plot_is_static=False, height=1000, width=1000, average_name="VPC") print(sf.summary()) diff --git a/wise_pizza/explain.py b/wise_pizza/explain.py index 23e1388..ffed482 100644 --- a/wise_pizza/explain.py +++ b/wise_pizza/explain.py @@ -441,7 +441,7 @@ def explain_timeseries( # Block-matrix basis with itself re_basis = time_basis.copy().rename( - {c: c + "_w" for c in time_basis.columns}, axis=1 + {c: c + "_w" for c in time_basis.columns if c != "__time"}, axis=1 ) time_basis["chunk"] = "Average" re_basis["chunk"] = "Weights" @@ -455,6 +455,7 @@ def explain_timeseries( groupby_dims = ["__time"] df2["_target"] = df2[total_name] + df2["__time"] = df2[time_name] df2["total_adjustment"] = 0.0 avg_df = 0.0 average = 0.0 @@ -468,7 +469,7 @@ def explain_timeseries( sf.avg_df = avg_df sf.time_values = df2[time_name].unique() sf.fit( - df2[dims], + df2[dims + groupby_dims], df2["_target"], time_col=df2[time_name], time_basis=time_basis, diff --git a/wise_pizza/plotting_time_tree.py b/wise_pizza/plotting_time_tree.py index 436a3db..6af8f74 100644 --- a/wise_pizza/plotting_time_tree.py +++ b/wise_pizza/plotting_time_tree.py @@ -72,10 +72,10 @@ def plot_time_from_tree( def preprocess_for_ts_plot( sf: SliceFinderPlottingInterface, average_name: str, - weight_sf: SliceFinderPlottingInterface | None = None, ) -> List[List[PlotData]]: out = [] for row, s in enumerate(sf.segments): + print(row, s) this_df = pd.DataFrame( { "time": sf.time, @@ -84,18 +84,39 @@ def preprocess_for_ts_plot( "pred_totals": sf.avg_prediction * sf.weights * s["dummy"], } ) + if sf.weight_total_prediction is not None: + this_df["w_pred_totals"] = sf.weight_total_prediction * s["dummy"] + time_df = this_df.groupby("time", as_index=False).sum() + data1 = PlotData( regression=time_df["pred_totals"] / time_df["weights"], bars=time_df["totals"] / time_df["weights"], subtitle=f"{average_name} for
{s['segment']}", ) - data2 = PlotData( - regression=time_df["pred_totals"], - bars=time_df["totals"], - subtitle=f"{sf.total_name} for
{s['segment']}", - ) - out.append([data1, data2]) + + if sf.weight_total_prediction is None: + data2 = PlotData( + regression=time_df["pred_totals"], + bars=time_df["totals"], + subtitle=f"{sf.total_name} for
{s['segment']}", + ) + out.append([data1, data2]) + else: + data2 = PlotData( + # Use predictions for both avg and weights if available + regression=time_df["w_pred_totals"] + * time_df["pred_totals"] + / time_df["weights"], + bars=time_df["totals"], + subtitle=f"{sf.total_name} for
{s['segment']}", + ) + data3 = PlotData( + regression=time_df["w_pred_totals"], + bars=time_df["weights"], + subtitle=f"{sf.size_name} for
{s['segment']}", + ) + out.append([data3, data1, data2]) return out diff --git a/wise_pizza/slicer.py b/wise_pizza/slicer.py index a9013dd..290e4f8 100644 --- a/wise_pizza/slicer.py +++ b/wise_pizza/slicer.py @@ -158,14 +158,15 @@ def fit( dim_df = dim_df.astype(str) dims = list(dim_df.columns) + if groupby_dims is not None: + dims = [d for d in dims if d not in groupby_dims] # sort the dataframe by dimension values, # making sure the other vectors stay aligned dim_df = dim_df.reset_index(drop=True) dim_df["totals"] = totals dim_df["weights"] = weights - if time_col is not None: - dim_df["__time"] = time_col + if groupby_dims is not None: dim_df = pd.merge(dim_df, time_basis, on=groupby_dims) sort_dims = dims + groupby_dims else: @@ -174,6 +175,11 @@ def fit( dim_df = dim_df.sort_values(sort_dims) dim_df = dim_df[dim_df["weights"] > 0] + if len(groupby_dims) == 2: + source_df = dim_df[dim_df["chunk"] == "Average"] + else: + source_df = dim_df + # Transform the time basis from table by date to matrices by dataset row if time_col is not None: self.basis_df = time_basis @@ -184,12 +190,12 @@ def fit( # # take all the values a nudge away from zero so we can divide by them later # this_ts[np.abs(this_ts) < 1e-6 * max_val] = 1e-6 * max_val # self.time_basis[c] = csc_matrix(this_ts) - self.time = dim_df["__time"].values + self.time = source_df["__time"].values # else: # self.time_basis = None - self.weights = dim_df["weights"].values - self.totals = dim_df["totals"].values + self.weights = source_df["weights"].values + self.totals = source_df["totals"].values # While we still have weights and totals as part of the dataframe, let's produce clusters # of dimension values with similar outcomes @@ -203,7 +209,7 @@ def fit( "Ignoring cluster_values argument as tree solver makes its own clusters" ) if time_basis is None: - self.X, self.col_defs, self.cluster_names, _ = tree_solver( + self.X, self.col_defs, self.cluster_names, _, _ = tree_solver( dim_df=dim_df, dims=dims, num_leaves=max_segments, @@ -233,14 +239,18 @@ def fit( time_fitter_model=time_fitter_model, groupby_dims=groupby_dims, ) - self.X, self.col_defs, self.cluster_names, self.avg_prediction = ( - tree_solver( - dim_df=dim_df, - dims=dims, - fitter=fitter, - num_leaves=max_segments, - max_depth=max_depth, - ) + ( + self.X, + self.col_defs, + self.cluster_names, + self.avg_prediction, + self.weight_total_prediction, + ) = tree_solver( + dim_df=dim_df, + dims=dims, + fitter=fitter, + num_leaves=max_segments, + max_depth=max_depth, ) self.nonzeros = np.array(range(self.X.shape[1])) diff --git a/wise_pizza/solve/fitter.py b/wise_pizza/solve/fitter.py index e8a31fc..f002cdd 100644 --- a/wise_pizza/solve/fitter.py +++ b/wise_pizza/solve/fitter.py @@ -26,7 +26,11 @@ def fit_predict(self, X, y, sample_weight=None): def error(self, X, y, sample_weight=None): # Error is chosen so that it's minimized by the weighted mean of y # debug_plot(X, y, self.predict(X), sample_weight) - err = y - self.predict(X) + X = X.copy() + X["target"] = y + if hasattr(self, "dims"): + X = X.sort_values(self.dims + self.groupby_dims) + err = X["target"] - self.predict(X) errsq = err**2 if sample_weight is not None: errsq *= sample_weight @@ -64,6 +68,7 @@ def fit_predict(self, X, y, sample_weight=None): class AverageFitter(Fitter): def __init__(self): self.avg = None + self.groupby_dims = [] def fit(self, X, y, sample_weight=None): y = np.array(y) @@ -109,7 +114,7 @@ def fit(self, X, y, sample_weight=None): def predict(self, X): # predict straight away on the big table, it's row-wise anyway - return self.time_fitter.predict(X[self.groupby_dims + self.dims]) + return self.time_fitter.predict(X[self.dims + self.groupby_dims]) class TimeFitterLinearModel(TimeFitterModel): @@ -137,13 +142,16 @@ def fit(self, X: pd.DataFrame, y, sample_weight=None): self.basis, on=self.groupby_dims, ) + this_X = this_basis[self.basis_cols] + w = this_basis["weights"].values + w = w / w.max() self.reg = LinearRegression().fit( - X=this_basis[self.basis_cols], - y=this_basis["target"], - sample_weight=None if sample_weight is None else this_basis["weights"], + X=this_X, + y=this_basis["target"].values, + sample_weight=None if sample_weight is None else w, ) ## testing code begins - # self.prediction = self.reg.predict(this_basis[self.basis_cols]) + # self.prediction = self.reg.predict(this_X) # test = pd.DataFrame( # { # "time": this_basis[self.time_col], @@ -151,7 +159,14 @@ def fit(self, X: pd.DataFrame, y, sample_weight=None): # "prediction": self.prediction, # } # ) + # import matplotlib.pyplot as plt + # + # plt.plot(test["target"], label="target") + # plt.plot(test["prediction"], label="prediction") + # plt.legend() + # plt.show() # print("yay!") + ## testing code ends def predict(self, X: pd.DataFrame): diff --git a/wise_pizza/solve/tree.py b/wise_pizza/solve/tree.py index 955a3e3..371b7b5 100644 --- a/wise_pizza/solve/tree.py +++ b/wise_pizza/solve/tree.py @@ -50,11 +50,25 @@ def tree_solver( # The convention in the calling code is first dims then time re_df = pd.concat([leaf.df for leaf in leaves]).sort_values( - dims if isinstance(fitter, AverageFitter) else dims + fitter.groupby_dims + dims + fitter.groupby_dims ) + + if len(fitter.groupby_dims) == 2: # Time series with weights + re_df_w = re_df[re_df["chunk"] == "Weights"].copy() + re_df = re_df[re_df["chunk"] == "Average"] + w_total_prediction = (re_df_w["prediction"] * re_df_w["weights"]).values + else: + w_total_prediction = None + X = pd.get_dummies(re_df["Segment_id"]).values - return csc_matrix(X), col_defs, cluster_names, re_df["prediction"].values + return ( + csc_matrix(X), + col_defs, + cluster_names, + re_df["prediction"].values, + w_total_prediction, + ) def error(x: np.ndarray, y: np.ndarray) -> float: @@ -83,7 +97,7 @@ def __init__( max_depth: Optional[int] = None, dim_split: Optional[Dict[str, List]] = None, ): - self.df = df.copy().sort_values([time_col] + dims) + self.df = df.copy().sort_values(dims + fitter.groupby_dims) self.fitter = fitter self.dims = dims self.time_col = time_col @@ -102,7 +116,7 @@ def depth(self): @property def error(self): - this_X = self.df[self.dims + ([] if self.time_col is None else [self.time_col])] + this_X = self.df[self.dims + self.fitter.groupby_dims] if self.model is None: self.model = copy.deepcopy(self.fitter) self.model.fit( diff --git a/wise_pizza/time.py b/wise_pizza/time.py index 148a379..e50e569 100644 --- a/wise_pizza/time.py +++ b/wise_pizza/time.py @@ -19,9 +19,9 @@ def create_time_basis( const = np.ones(len(t)) linear = np.cumsum(const) linear -= linear.mean() # now orthogonal to const - col_names = ["Slope"] + col_names = ["Intercept", "Slope"] - dummies = [linear] + dummies = [const, linear] if include_breaks: for i in range(1, len(t)): @@ -47,7 +47,7 @@ def prune_time_basis( # from all the possible kinks, choose evenly spaced num_breaks ones for i in range(1, num_breaks + 1): chosen_cols.append(dtrend_cols[int(i * len(dtrend_cols) / (num_breaks + 1))]) - pre_basis = time_basis[list(time_basis.columns[:2]) + chosen_cols].copy() + pre_basis = time_basis[["Intercept", "Slope"] + chosen_cols].copy() if solver != "tree": # TODO: fix this bug for c in chosen_cols: