Tweaks to time series fitting

EgorKraevTransferwise committed Nov 22, 2024
1 parent 8537719 commit 6d65136
Showing 10 changed files with 310 additions and 191 deletions.
270 changes: 138 additions & 132 deletions notebooks/Finding interesting segments in time series.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion tests/timeseries_wip_entrypoint.py
@@ -37,7 +37,7 @@
     time_name=time,
     verbose=False,
     solver="tree",
-    fit_sizes=False,
+    fit_sizes=True,
 )
 sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC")
 print(sf.summary())
47 changes: 47 additions & 0 deletions tests/timeseries_wip_entrypoint_2.py
@@ -0,0 +1,47 @@
+import os, sys
+import pandas as pd
+
+root_path = os.path.realpath("../..")
+print(root_path)
+
+# this assumes that all of the following files are checked out in the same directory
+sys.path.append(os.path.join(root_path, "wise-pizza"))
+
+# create data-related directories
+data_dir = os.path.realpath(os.path.join(root_path, "wise-pizza/data"))
+if not os.path.isdir(data_dir):
+    os.mkdir(data_dir)
+print(data_dir)
+
+from wise_pizza import explain_timeseries
+
+df = pd.read_csv(
+    os.path.join(data_dir, "volume_data_new.csv")
+)  # replace this variable with your data
+dims = [
+    "CUSTOMER_TYPE",
+    "STRATEGIC_PRODUCT",
+    "SOURCE_CURRENCY",
+    "TARGET_CURRENCY",
+    "PRODUCT_USE_CASE",
+    "REGION",
+    "TRANS_VOL_BUCKET",
+]  # dimensions to find segments
+totals = "VOLUME_GBP"  # value to analyze
+size = "NUM_CUSTOMERS"  # 'NUM_TRANSACTIONS'  # number of objects
+time = "ACTION_YM"
+sf = explain_timeseries(
+    df=df,
+    dims=dims,
+    max_segments=7,
+    max_depth=2,
+    total_name=totals,
+    size_name=size,
+    time_name=time,
+    verbose=False,
+    solver="tree",
+    fit_sizes=True,
+)
+sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC")
+print(sf.summary())
+print("yay!")
40 changes: 26 additions & 14 deletions wise_pizza/explain.py
@@ -367,16 +367,15 @@ def explain_timeseries(
     total_name: str,
     time_name: str,
     size_name: Optional[str] = None,
-    min_segments: int = None,
-    max_segments: int = None,
-    min_depth: int = 1,
+    num_segments: int = None,
     max_depth: int = 2,
     solver: str = "tree",
     verbose: bool = False,
     time_basis: Optional[pd.DataFrame] = None,
     fit_log_space: bool = False,
     fit_sizes: Optional[bool] = None,
     num_breaks: int = 2,
+    ignore_averages: bool = True,
     log_space_weight_sc: float = 0.5,
 ):
     assert (
@@ -450,16 +449,31 @@
         time_basis = (
             pd.concat([time_basis, re_basis], axis=0).fillna(0.0).reset_index(drop=True)
         )
-        print("yay!")
         groupby_dims = ["chunk", "__time"]
     else:
         groupby_dims = ["__time"]

-    df2["_target"] = df2[total_name]
     df2["__time"] = df2[time_name]
-    df2["total_adjustment"] = 0.0
-    avg_df = 0.0
-    average = 0.0

+    # Adds the column of the time average over each dimension combination
+    if ignore_averages:
+        df2, avg_df = add_average_over_time(
+            df2,
+            dims=dims,
+            total_name=total_name,
+            size_name=size_name,
+            time_name="__time",
+            groupby_dims=groupby_dims,
+            cartesian=False,
+        )
+    else:
+        df2["total_adjustment"] = 0.0
+        avg_df = None
+
+    # The join in the above function could have messed up the ordering
+    df2 = df2.sort_values(by=dims + groupby_dims)
+    average = df2[total_name].sum() / df2[size_name].sum()

     sf = SliceFinder()
     sf.global_average = average
@@ -468,16 +482,14 @@
     sf.time_name = time_name
     sf.y_adj = df2["total_adjustment"].values
     sf.avg_df = avg_df
-    sf.time_values = df2[time_name].unique()
+    sf.time_values = df2["__time"].unique()
     sf.fit(
-        df2[dims + groupby_dims],
-        df2["_target"],
-        time_col=df2[time_name],
+        df2[dims + groupby_dims + ["total_adjustment"]],
+        df2[total_name],
+        time_col=df2["__time"],
         time_basis=time_basis,
         weights=df2[size_name],
-        min_segments=min_segments,
-        max_segments=max_segments,
-        min_depth=min_depth,
+        max_segments=num_segments,
         max_depth=max_depth,
         solver=solver,
         verbose=verbose,
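
The new ignore_averages branch makes the fitter work on deviations from each segment's own time average: add_average_over_time attaches a total_adjustment column, and sf.fit receives it alongside the raw totals. A toy sketch of that semantics (made-up frame, not the library call):

    import pandas as pd

    df2 = pd.DataFrame(
        {
            "dim": ["a", "a", "b", "b"],
            "total": [10.0, 14.0, 2.0, 4.0],
            "size": [2.0, 2.0, 1.0, 1.0],
        }
    )
    g = df2.groupby("dim")
    # per-segment average rate over time: sum(total) / sum(size)
    seg_avg = g["total"].transform("sum") / g["size"].transform("sum")
    df2["total_adjustment"] = seg_avg * df2["size"]
    # the solver then effectively fits total - total_adjustment,
    # i.e. each segment's deviation from its own average level
    df2["deviation"] = df2["total"] - df2["total_adjustment"]
    print(df2)
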
4 changes: 4 additions & 0 deletions wise_pizza/plotting_time_tree.py
@@ -158,3 +158,7 @@ def simple_ts_plot(
         row=row_num,
         col=col_num,
     )
+    fig.update_layout(
+        xaxis=dict(autorange=True),
+        yaxis=dict(autorange=True)
+    )
13 changes: 9 additions & 4 deletions wise_pizza/slicer.py
@@ -143,6 +143,9 @@ def fit(
         group of segments from the same dimension with similar naive averages
         """
         dim_df = dim_df.copy()
+        if groupby_dims is None:
+            groupby_dims = []
+
         assert solver.lower() in ["lasso", "tree", "omp", "lp"]
         min_segments, max_segments = clean_up_min_max(min_segments, max_segments)
@@ -160,18 +163,20 @@
         assert np.sum(np.abs(totals[weights == 0])) == 0

         # Cast all dimension values to strings
-        dim_df = dim_df.astype(str)
+        for c in dim_df.columns:
+            if c not in groupby_dims + ["total_adjustment"]:
+                dim_df[c] = dim_df[c].astype(str)

         dims = list(dim_df.columns)
-        if groupby_dims is not None:
-            dims = [d for d in dims if d not in groupby_dims]
+        if groupby_dims:
+            dims = [d for d in dims if d not in groupby_dims + ["total_adjustment"]]
         # sort the dataframe by dimension values,
         # making sure the other vectors stay aligned
         dim_df = dim_df.reset_index(drop=True)
         dim_df["totals"] = totals
         dim_df["weights"] = weights

-        if groupby_dims is not None:
+        if groupby_dims:
             dim_df = pd.merge(dim_df, time_basis, on=groupby_dims)
             sort_dims = dims + groupby_dims
         else:
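
The selective cast matters because a blanket dim_df.astype(str) would also stringify total_adjustment and any time-basis columns, breaking arithmetic downstream. A small illustration with hypothetical values:

    import pandas as pd

    dim_df = pd.DataFrame(
        {"REGION": ["EMEA", 7], "total_adjustment": [0.5, 0.25]}
    )
    groupby_dims = []
    # cast only the true dimension columns to strings
    for c in dim_df.columns:
        if c not in groupby_dims + ["total_adjustment"]:
            dim_df[c] = dim_df[c].astype(str)

    assert dim_df["REGION"].dtype == object           # "EMEA", "7"
    assert dim_df["total_adjustment"].dtype == float  # still numeric
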
1 change: 0 additions & 1 deletion wise_pizza/solve/fitter.py
@@ -48,7 +48,6 @@ def debug_plot(X, y, y_pred, w):
     plt.plot(X_agg["y_pred"] / X_agg["weights"], label="y_pred")
     plt.legend()
     plt.show()
-    print("yay!")


 class TimeFitterModel(ABC):
57 changes: 43 additions & 14 deletions wise_pizza/solve/partition.py
@@ -42,9 +42,15 @@ def target_encoding_partitions(df: pd.DataFrame, dim: str, num_bins: int):
     return partitions


-def kmeans_partition(df: pd.DataFrame, dim: str, groupby_dims: List[str]):
+def kmeans_partition(
+    df: pd.DataFrame,
+    dim: str,
+    groupby_dims: List[str],
+    normalize_averages: bool = False,
+):
     assert len(df[dim].unique()) >= 3
-    # Get split candidates
+    # Get time profiles split by the dimension we are evaluating
     agg_df = df.groupby([dim] + groupby_dims, as_index=False).sum()
     agg_df["__avg"] = agg_df["totals"] / agg_df["weights"]
     pivot_df = agg_df.pivot(
@@ -57,16 +63,31 @@ def kmeans_partition(df: pd.DataFrame, dim: str, groupby_dims: List[str]):
         for chunk in ["Average", "Weights"]:
             this_df = pivot_df[pivot_df["chunk"] == chunk]
             nice_values = fill_gaps(this_df[value_cols].values)
-            if chunk == "Weights":
-                nice_values = (
-                    np.mean(nice_mats["Average"])
-                    * nice_values
-                    / np.sum(nice_values, axis=0, keepdims=True)
-                )
+
+            if normalize_averages:
+                # Normalize both subsegments separately: weights and averages
+                nice_values /= (
+                    np.linalg.norm(nice_values, ord=2, axis=0, keepdims=True) + 1e-6
+                )
+            else:
+                if chunk == "Weights":
+                    nice_values = (
+                        np.mean(nice_mats["Average"])
+                        * nice_values
+                        / (
+                            np.linalg.norm(nice_values, ord=2, axis=0, keepdims=True)
+                            + 1e-6
+                        )
+                    )
             nice_mats[chunk] = nice_values
         joint_mat = np.concatenate([nice_mats["Average"], nice_mats["Weights"]], axis=0)
     else:
-        joint_mat = fill_gaps(pivot_df[value_cols].values)
+        nice_values = fill_gaps(pivot_df[value_cols].values)
+        if normalize_averages:
+            nice_values /= (
+                np.linalg.norm(nice_values, ord=2, axis=0, keepdims=True) + 1e-6
+            )
+        joint_mat = nice_values

     weights = pivot_df[value_cols].T.sum(axis=1)
     vector_dict = {}
@@ -109,12 +130,20 @@ def weighted_kmeans_two_clusters(data_dict, tol=1e-4, max_iter=100, max_retries=
                 break

             # Update centroids with weighted averages
-            new_centroids = np.array(
-                [
-                    np.average(data[labels == i], axis=0, weights=weights[labels == i])
-                    for i in range(2)
-                ]
-            )
+            try:
+                new_centroids = np.array(
+                    [
+                        np.average(
+                            data[labels == i], axis=0, weights=weights[labels == i]
+                        )
+                        for i in range(2)
+                    ]
+                )
+            except ZeroDivisionError:
+                print(
+                    f"Zero division error detected on retry {retry + 1}, reinitializing centroids."
+                )
+                break

             # Check for convergence
             if np.linalg.norm(new_centroids - centroids) < tol:
@@ -140,7 +169,7 @@ def fill_gaps(x: np.ndarray, num_iter=50):
     nice_marg = interpolate_and_extrapolate(marg)
     tile_marg = np.tile(nice_marg, (x.shape[1], 1)).T
     tile_marg[nans] = np.nan
-    reg = np.nanmedian(x) * 1e-6
+    reg = np.nanmedian(x) * 1e-6 + 1e-6
     coeffs = (np.nansum(x * tile_marg, axis=0) + reg) / (
         np.nansum(tile_marg * tile_marg, axis=0) + reg
     )
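
np.average raises ZeroDivisionError when the selected weights sum to zero, which is exactly what the new try/except guards against when a cluster goes empty. A self-contained sketch of one weighted two-cluster update step (not the library's full retry loop):

    import numpy as np

    def weighted_update(data, weights, centroids):
        # assign each row to its nearest centroid
        dists = np.linalg.norm(data[:, None, :] - centroids[None, :, :], axis=2)
        labels = np.argmin(dists, axis=1)
        # weighted centroid update; np.average raises ZeroDivisionError
        # if a cluster's weights sum to zero (e.g. an empty cluster)
        new_centroids = np.array(
            [
                np.average(data[labels == i], axis=0, weights=weights[labels == i])
                for i in range(2)
            ]
        )
        return labels, new_centroids

    rng = np.random.default_rng(0)
    data = rng.normal(size=(20, 4))
    weights = rng.uniform(0.1, 1.0, size=20)
    labels, centroids = weighted_update(data, weights, data[:2].copy())
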
5 changes: 5 additions & 0 deletions wise_pizza/solve/tree.py
@@ -31,6 +31,7 @@ def tree_solver(
"""

df = dim_df.copy().reset_index(drop=True)
df["totals"] -= df["total_adjustment"]
df["__avg"] = df["totals"] / df["weights"]
df["__avg"] = df["__avg"].fillna(df["__avg"].mean())

@@ -56,6 +57,10 @@
     re_df = pd.concat([leaf.df for leaf in leaves]).sort_values(
         dims + fitter.groupby_dims
     )
+    # Put back the averages over time by segment
+    re_df["prediction"] += re_df["total_adjustment"] / re_df["weights"]
+
+    # re_df["totals"] += re_df["total_adjustment"]

     if len(fitter.groupby_dims) == 2:  # Time series with weights
         re_df_w = re_df[re_df["chunk"] == "Weights"].copy()
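
Subtracting total_adjustment from totals before the fit and adding total_adjustment / weights back onto the per-unit predictions are inverse operations; a toy check with made-up numbers:

    import numpy as np

    totals = np.array([10.0, 14.0, 2.0, 4.0])
    weights = np.array([2.0, 2.0, 1.0, 1.0])
    total_adjustment = np.array([12.0, 12.0, 3.0, 3.0])

    # the tree fits averages of the adjusted (de-meaned) totals
    dev_avg = (totals - total_adjustment) / weights
    # restoring the per-unit adjustment recovers the raw averages
    prediction = dev_avg + total_adjustment / weights
    assert np.allclose(prediction, totals / weights)
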
62 changes: 37 additions & 25 deletions wise_pizza/time.py
@@ -90,36 +90,48 @@ def add_average_over_time(
     total_name: str,
     size_name: str,
     time_name: str,
+    groupby_dims: List[str] = None,
     cartesian: bool = False,
 ) -> Tuple[pd.DataFrame, pd.DataFrame]:
-    avgs = df[dims + [total_name, size_name]].groupby(dims, as_index=False).sum()
-    avgs["avg"] = avgs[total_name] / avgs[size_name]
-    if cartesian:
-        # make sure that the cartesian product of dimension combinations x time is present,
-        # without changing the totals
-        times = df[[time_name]].groupby(time_name, as_index=False).sum()
-        times["key"] = 1
-        avgs["key"] = 1
-        cartesian_df = pd.merge(avgs, times, on="key").drop(columns=["key"])
-        joined = pd.merge(
-            df,
-            cartesian_df[dims + [time_name]],
-            on=dims + [time_name],
-            how="right",
-        )
-        joined[size_name] = joined[size_name].fillna(
-            np.nanmean(joined[size_name].values)
-        )
-        joined[total_name] = joined[total_name].fillna(0.0)
-        df = joined
-
-    avgs = df[dims + [total_name, size_name]].groupby(dims, as_index=False).sum()
-    avgs["avg"] = avgs[total_name] / avgs[size_name]
-    joined = pd.merge(df, avgs[dims + ["avg"]], on=dims)
+    groupby_dims = groupby_dims or [time_name]
+
+    # get the average of the total over time
+    group_dims = dims + [c for c in groupby_dims if c != time_name]
+    avgs = (
+        df[group_dims + [total_name, size_name]]
+        .groupby(group_dims, as_index=False)
+        .sum()
+    )
+
+    avgs["avg"] = avgs[total_name] / avgs[size_name]
+    # if cartesian:
+    #     # make sure that the cartesian product of dimension combinations x time is present,
+    #     # without changing the totals
+    #     times = df[[time_name]].groupby(time_name, as_index=False).sum()
+    #     times["key"] = 1
+    #     avgs["key"] = 1
+    #     cartesian_df = pd.merge(avgs, times, on="key").drop(columns=["key"])
+    #     joined = pd.merge(
+    #         df,
+    #         cartesian_df[dims + [time_name]],
+    #         on=dims + [time_name],
+    #         how="right",
+    #     )
+    #     joined[size_name] = joined[size_name].fillna(
+    #         np.nanmean(joined[size_name].values)
+    #     )
+    #     joined[total_name] = joined[total_name].fillna(0.0)
+    #     df = joined
+
+    # avgs = df[dims + [total_name, size_name]].groupby(dims, as_index=False).sum()
+    # avgs["avg"] = avgs[total_name] / avgs[size_name]
+
+    joined = pd.merge(df, avgs[group_dims + ["avg"]], on=group_dims)
+
     joined["total_adjustment"] = joined[size_name] * joined["avg"]
-    out = joined[dims + [total_name, size_name, time_name, "total_adjustment"]]
-    tmp = out[dims + [total_name, "total_adjustment"]].groupby(dims).sum()
+
+    out = joined[group_dims + [total_name, size_name, time_name, "total_adjustment"]]
+    tmp = out[group_dims + [total_name, "total_adjustment"]].groupby(dims).sum()
     assert (tmp[total_name] - tmp["total_adjustment"]).abs().sum() < 1e-6 * df[
         total_name
     ].abs().max()
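
The assert at the end enforces that, within each dims group, the adjustment column sums to the observed total, so subtracting it removes a segment's level without changing its group sum. A toy reproduction of that invariant on a synthetic frame:

    import pandas as pd

    df = pd.DataFrame(
        {
            "dim": ["a", "a", "b", "b"],
            "__time": [1, 2, 1, 2],
            "total": [10.0, 14.0, 2.0, 4.0],
            "size": [2.0, 2.0, 1.0, 1.0],
        }
    )
    g = df.groupby("dim")
    df["avg"] = g["total"].transform("sum") / g["size"].transform("sum")
    df["total_adjustment"] = df["size"] * df["avg"]

    tmp = df.groupby("dim")[["total", "total_adjustment"]].sum()
    # same check as the assert in add_average_over_time
    assert (tmp["total"] - tmp["total_adjustment"]).abs().sum() < 1e-6
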
