Skip to content

Commit

Permalink
A pretty decent TS tree fit with weights
Browse files (browse the repository at this point in the history)
  • Loading branch information
EgorKraevTransferwise committed Nov 14, 2024
1 parent 2dd348b commit 8c684ed
Show file tree
Hide file tree
Showing 7 changed files with 98 additions and 37 deletions.
2 changes: 1 addition & 1 deletion tests/timeseries_wip_entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
time_name=time,
verbose=False,
solver="tree",
fit_sizes=False,
fit_sizes=True,
)
sf.plot(plot_is_static=False, height=1000, width=1000, average_name="VPC")
print(sf.summary())
Expand Down
5 changes: 3 additions & 2 deletions wise_pizza/explain.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ def explain_timeseries(

# Block-matrix basis with itself
re_basis = time_basis.copy().rename(
{c: c + "_w" for c in time_basis.columns}, axis=1
{c: c + "_w" for c in time_basis.columns if c != "__time"}, axis=1
)
time_basis["chunk"] = "Average"
re_basis["chunk"] = "Weights"
Expand All @@ -455,6 +455,7 @@ def explain_timeseries(
groupby_dims = ["__time"]

df2["_target"] = df2[total_name]
df2["__time"] = df2[time_name]
df2["total_adjustment"] = 0.0
avg_df = 0.0
average = 0.0
Expand All @@ -468,7 +469,7 @@ def explain_timeseries(
sf.avg_df = avg_df
sf.time_values = df2[time_name].unique()
sf.fit(
df2[dims],
df2[dims + groupby_dims],
df2["_target"],
time_col=df2[time_name],
time_basis=time_basis,
Expand Down
35 changes: 28 additions & 7 deletions wise_pizza/plotting_time_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,10 @@ def plot_time_from_tree(
def preprocess_for_ts_plot(
sf: SliceFinderPlottingInterface,
average_name: str,
weight_sf: SliceFinderPlottingInterface | None = None,
) -> List[List[PlotData]]:
out = []
for row, s in enumerate(sf.segments):
print(row, s)
this_df = pd.DataFrame(
{
"time": sf.time,
Expand All @@ -84,18 +84,39 @@ def preprocess_for_ts_plot(
"pred_totals": sf.avg_prediction * sf.weights * s["dummy"],
}
)
if sf.weight_total_prediction is not None:
this_df["w_pred_totals"] = sf.weight_total_prediction * s["dummy"]

time_df = this_df.groupby("time", as_index=False).sum()

data1 = PlotData(
regression=time_df["pred_totals"] / time_df["weights"],
bars=time_df["totals"] / time_df["weights"],
subtitle=f"{average_name} for <br> {s['segment']}",
)
data2 = PlotData(
regression=time_df["pred_totals"],
bars=time_df["totals"],
subtitle=f"{sf.total_name} for <br> {s['segment']}",
)
out.append([data1, data2])

if sf.weight_total_prediction is None:
data2 = PlotData(
regression=time_df["pred_totals"],
bars=time_df["totals"],
subtitle=f"{sf.total_name} for <br> {s['segment']}",
)
out.append([data1, data2])
else:
data2 = PlotData(
# Use predictions for both avg and weights if available
regression=time_df["w_pred_totals"]
* time_df["pred_totals"]
/ time_df["weights"],
bars=time_df["totals"],
subtitle=f"{sf.total_name} for <br> {s['segment']}",
)
data3 = PlotData(
regression=time_df["w_pred_totals"],
bars=time_df["weights"],
subtitle=f"{sf.size_name} for <br> {s['segment']}",
)
out.append([data3, data1, data2])

return out

Expand Down
38 changes: 24 additions & 14 deletions wise_pizza/slicer.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,14 +158,15 @@ def fit(
dim_df = dim_df.astype(str)

dims = list(dim_df.columns)
if groupby_dims is not None:
dims = [d for d in dims if d not in groupby_dims]
# sort the dataframe by dimension values,
# making sure the other vectors stay aligned
dim_df = dim_df.reset_index(drop=True)
dim_df["totals"] = totals
dim_df["weights"] = weights

if time_col is not None:
dim_df["__time"] = time_col
if groupby_dims is not None:
dim_df = pd.merge(dim_df, time_basis, on=groupby_dims)
sort_dims = dims + groupby_dims
else:
Expand All @@ -174,6 +175,11 @@ def fit(
dim_df = dim_df.sort_values(sort_dims)
dim_df = dim_df[dim_df["weights"] > 0]

if len(groupby_dims) == 2:
source_df = dim_df[dim_df["chunk"] == "Average"]
else:
source_df = dim_df

# Transform the time basis from table by date to matrices by dataset row
if time_col is not None:
self.basis_df = time_basis
Expand All @@ -184,12 +190,12 @@ def fit(
# # take all the values a nudge away from zero so we can divide by them later
# this_ts[np.abs(this_ts) < 1e-6 * max_val] = 1e-6 * max_val
# self.time_basis[c] = csc_matrix(this_ts)
self.time = dim_df["__time"].values
self.time = source_df["__time"].values
# else:
# self.time_basis = None

self.weights = dim_df["weights"].values
self.totals = dim_df["totals"].values
self.weights = source_df["weights"].values
self.totals = source_df["totals"].values

# While we still have weights and totals as part of the dataframe, let's produce clusters
# of dimension values with similar outcomes
Expand All @@ -203,7 +209,7 @@ def fit(
"Ignoring cluster_values argument as tree solver makes its own clusters"
)
if time_basis is None:
self.X, self.col_defs, self.cluster_names, _ = tree_solver(
self.X, self.col_defs, self.cluster_names, _, _ = tree_solver(
dim_df=dim_df,
dims=dims,
num_leaves=max_segments,
Expand Down Expand Up @@ -233,14 +239,18 @@ def fit(
time_fitter_model=time_fitter_model,
groupby_dims=groupby_dims,
)
self.X, self.col_defs, self.cluster_names, self.avg_prediction = (
tree_solver(
dim_df=dim_df,
dims=dims,
fitter=fitter,
num_leaves=max_segments,
max_depth=max_depth,
)
(
self.X,
self.col_defs,
self.cluster_names,
self.avg_prediction,
self.weight_total_prediction,
) = tree_solver(
dim_df=dim_df,
dims=dims,
fitter=fitter,
num_leaves=max_segments,
max_depth=max_depth,
)
self.nonzeros = np.array(range(self.X.shape[1]))

Expand Down
27 changes: 21 additions & 6 deletions wise_pizza/solve/fitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,11 @@ def fit_predict(self, X, y, sample_weight=None):
def error(self, X, y, sample_weight=None):
# Error is chosen so that it's minimized by the weighted mean of y
# debug_plot(X, y, self.predict(X), sample_weight)
err = y - self.predict(X)
X = X.copy()
X["target"] = y
if hasattr(self, "dims"):
X = X.sort_values(self.dims + self.groupby_dims)
err = X["target"] - self.predict(X)
errsq = err**2
if sample_weight is not None:
errsq *= sample_weight
Expand Down Expand Up @@ -64,6 +68,7 @@ def fit_predict(self, X, y, sample_weight=None):
class AverageFitter(Fitter):
def __init__(self):
self.avg = None
self.groupby_dims = []

def fit(self, X, y, sample_weight=None):
y = np.array(y)
Expand Down Expand Up @@ -109,7 +114,7 @@ def fit(self, X, y, sample_weight=None):

def predict(self, X):
# predict straight away on the big table, it's row-wise anyway
return self.time_fitter.predict(X[self.groupby_dims + self.dims])
return self.time_fitter.predict(X[self.dims + self.groupby_dims])


class TimeFitterLinearModel(TimeFitterModel):
Expand Down Expand Up @@ -137,21 +142,31 @@ def fit(self, X: pd.DataFrame, y, sample_weight=None):
self.basis,
on=self.groupby_dims,
)
this_X = this_basis[self.basis_cols]
w = this_basis["weights"].values
w = w / w.max()
self.reg = LinearRegression().fit(
X=this_basis[self.basis_cols],
y=this_basis["target"],
sample_weight=None if sample_weight is None else this_basis["weights"],
X=this_X,
y=this_basis["target"].values,
sample_weight=None if sample_weight is None else w,
)
## testing code begins
# self.prediction = self.reg.predict(this_basis[self.basis_cols])
# self.prediction = self.reg.predict(this_X)
# test = pd.DataFrame(
# {
# "time": this_basis[self.time_col],
# "target": this_basis["target"],
# "prediction": self.prediction,
# }
# )
# import matplotlib.pyplot as plt
#
# plt.plot(test["target"], label="target")
# plt.plot(test["prediction"], label="prediction")
# plt.legend()
# plt.show()
# print("yay!")

## testing code ends

def predict(self, X: pd.DataFrame):
Expand Down
22 changes: 18 additions & 4 deletions wise_pizza/solve/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,25 @@ def tree_solver(

# The convention in the calling code is first dims then time
re_df = pd.concat([leaf.df for leaf in leaves]).sort_values(
dims if isinstance(fitter, AverageFitter) else dims + fitter.groupby_dims
dims + fitter.groupby_dims
)

if len(fitter.groupby_dims) == 2: # Time series with weights
re_df_w = re_df[re_df["chunk"] == "Weights"].copy()
re_df = re_df[re_df["chunk"] == "Average"]
w_total_prediction = (re_df_w["prediction"] * re_df_w["weights"]).values
else:
w_total_prediction = None

X = pd.get_dummies(re_df["Segment_id"]).values

return csc_matrix(X), col_defs, cluster_names, re_df["prediction"].values
return (
csc_matrix(X),
col_defs,
cluster_names,
re_df["prediction"].values,
w_total_prediction,
)


def error(x: np.ndarray, y: np.ndarray) -> float:
Expand Down Expand Up @@ -83,7 +97,7 @@ def __init__(
max_depth: Optional[int] = None,
dim_split: Optional[Dict[str, List]] = None,
):
self.df = df.copy().sort_values([time_col] + dims)
self.df = df.copy().sort_values(dims + fitter.groupby_dims)
self.fitter = fitter
self.dims = dims
self.time_col = time_col
Expand All @@ -102,7 +116,7 @@ def depth(self):

@property
def error(self):
this_X = self.df[self.dims + ([] if self.time_col is None else [self.time_col])]
this_X = self.df[self.dims + self.fitter.groupby_dims]
if self.model is None:
self.model = copy.deepcopy(self.fitter)
self.model.fit(
Expand Down
6 changes: 3 additions & 3 deletions wise_pizza/time.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ def create_time_basis(
const = np.ones(len(t))
linear = np.cumsum(const)
linear -= linear.mean() # now orthogonal to const
col_names = ["Slope"]
col_names = ["Intercept", "Slope"]

dummies = [linear]
dummies = [const, linear]

if include_breaks:
for i in range(1, len(t)):
Expand All @@ -47,7 +47,7 @@ def prune_time_basis(
# from all the possible kinks, choose evenly spaced num_breaks ones
for i in range(1, num_breaks + 1):
chosen_cols.append(dtrend_cols[int(i * len(dtrend_cols) / (num_breaks + 1))])
pre_basis = time_basis[list(time_basis.columns[:2]) + chosen_cols].copy()
pre_basis = time_basis[["Intercept", "Slope"] + chosen_cols].copy()
if solver != "tree":
# TODO: fix this bug
for c in chosen_cols:
Expand Down

0 comments on commit 8c684ed

Please sign in to comment.