Add and fix tests for ts tree solver
EgorKraevTransferwise committed Nov 14, 2024
1 parent 92ab151 commit 0900f73
Showing 5 changed files with 103 additions and 61 deletions.
120 changes: 79 additions & 41 deletions tests/test_fit.py
@@ -115,6 +115,43 @@ def monthly_driver_data():
    )


def monthly_driver_ts_data():
    df = pd.read_csv(
        os.path.join(os.path.dirname(__file__), "../data", "synth_time_data.csv")
    )
    return SegmentData(
        data=df,
        dimensions=[
            "PRODUCT",
            "REGION",
            "SOURCE_CURRENCY",
            "TARGET_CURRENCY",
        ],
        segment_total="VOLUME",
        segment_size="ACTIVE_CUSTOMERS",
        time_col="DATE",
    )


@pytest.mark.parametrize("fit_sizes", [True, False])
def test_time_series_tree_solver(fit_sizes: bool):
    data = monthly_driver_ts_data()
    sf = explain_timeseries(
        df=data.data,
        dims=data.dimensions,
        max_segments=7,
        max_depth=2,
        total_name=data.segment_total,
        size_name=data.segment_size,
        time_name=data.time_col,
        verbose=False,
        solver="tree",
        fit_sizes=fit_sizes,
    )
    sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC")
    print(sf.summary())


def test_categorical():
    all_data = monthly_driver_data()
    df = all_data.data
@@ -201,46 +238,47 @@ def test_synthetic_template_tree(nan_percent: float):
print("yay!")


@pytest.mark.parametrize("nan_percent", [0.0, 1.0])
def test_synthetic_ts_template(nan_percent: float):
all_data = synthetic_ts_data(init_len=10000)

# Add some big trends to the data
# TODO: insert trend break patterns too
months = np.array(sorted(all_data.data[all_data.time_col].unique()))
basis = create_time_basis(months, baseline_dims=1)
joined = pd.merge(all_data.data, basis, left_on="TIME", right_index=True)
df = joined.drop(columns=basis.columns)

loc1 = (df["dim0"] == 0) & (df["dim1"] == 1)
loc2 = (df["dim1"] == 0) & (df["dim2"] == 1)

df.loc[loc1, "totals"] += 100 * joined.loc[loc1, "Slope"]
df.loc[loc2, "totals"] += 300 * joined.loc[loc2, "Slope"]

if nan_percent > 0:
df = values_to_nan(df, nan_percent)
sf = explain_timeseries(
df,
dims=all_data.dimensions,
total_name=all_data.segment_total,
time_name=all_data.time_col,
size_name=all_data.segment_size,
max_depth=2,
max_segments=5,
verbose=True,
)
print("***")
for s in sf.segments:
print(s)

plot_time(sf)

assert abs(sf.segments[0]["coef"] - 300) < 2
assert abs(sf.segments[1]["coef"] - 100) < 2

# sf.plot()
print("yay!")
# The old solvers for time series no longer work
# @pytest.mark.parametrize("nan_percent", [0.0, 1.0])
# def test_synthetic_ts_template(nan_percent: float):
#     all_data = synthetic_ts_data(init_len=10000)
#
#     # Add some big trends to the data
#     # TODO: insert trend break patterns too
#     months = np.array(sorted(all_data.data[all_data.time_col].unique()))
#     basis = create_time_basis(months, baseline_dims=1)
#     joined = pd.merge(all_data.data, basis, left_on="TIME", right_index=True)
#     df = joined.drop(columns=basis.columns)
#
#     loc1 = (df["dim0"] == 0) & (df["dim1"] == 1)
#     loc2 = (df["dim1"] == 0) & (df["dim2"] == 1)
#
#     df.loc[loc1, "totals"] += 100 * joined.loc[loc1, "Slope"]
#     df.loc[loc2, "totals"] += 300 * joined.loc[loc2, "Slope"]
#
#     if nan_percent > 0:
#         df = values_to_nan(df, nan_percent)
#     sf = explain_timeseries(
#         df,
#         dims=all_data.dimensions,
#         total_name=all_data.segment_total,
#         time_name=all_data.time_col,
#         size_name=all_data.segment_size,
#         max_depth=2,
#         max_segments=5,
#         verbose=True,
#     )
#     print("***")
#     for s in sf.segments:
#         print(s)
#
#     plot_time(sf)
#
#     assert abs(sf.segments[0]["coef"] - 300) < 2
#     assert abs(sf.segments[1]["coef"] - 100) < 2
#
#     # sf.plot()
#     print("yay!")


@pytest.mark.parametrize(
@@ -279,7 +317,7 @@ def test_deltas(
        max_depth=1,
        max_segments=10,
        solver=solver,
        cluster_values=cluster_values
        cluster_values=cluster_values,
    )
    # sf.plot(plot_is_static=plot_is_static)
    print("yay!")
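
The new test_time_series_tree_solver above exercises the tree solver end to end. For reference, a minimal standalone sketch of the same call pattern, assuming the synthetic CSV and column names from monthly_driver_ts_data() and that explain_timeseries can be imported from wise_pizza.explain:

import os

import pandas as pd

from wise_pizza.explain import explain_timeseries

# Assumes the working directory is the repository root, where data/synth_time_data.csv lives.
df = pd.read_csv(os.path.join("data", "synth_time_data.csv"))

sf = explain_timeseries(
    df=df,
    dims=["PRODUCT", "REGION", "SOURCE_CURRENCY", "TARGET_CURRENCY"],
    total_name="VOLUME",
    size_name="ACTIVE_CUSTOMERS",
    time_name="DATE",
    max_segments=7,
    max_depth=2,
    solver="tree",
    fit_sizes=True,  # the test parametrizes this over True and False
    verbose=False,
)
print(sf.summary())
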
3 changes: 1 addition & 2 deletions tests/timeseries_wip_entrypoint.py
@@ -1,5 +1,4 @@
import os, sys
import numpy as np
import pandas as pd

root_path = os.path.realpath("../..")
@@ -38,7 +37,7 @@
    time_name=time,
    verbose=False,
    solver="tree",
    fit_sizes=True,
    fit_sizes=False,
)
sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC")
print(sf.summary())
7 changes: 4 additions & 3 deletions wise_pizza/explain.py
@@ -371,16 +371,17 @@ def explain_timeseries(
    max_segments: int = None,
    min_depth: int = 1,
    max_depth: int = 2,
    solver: str = "omp",
    solver: str = "tree",
    verbose: bool = False,
    constrain_signs: bool = False,
    cluster_values: bool = False,
    time_basis: Optional[pd.DataFrame] = None,
    fit_log_space: bool = False,
    fit_sizes: Optional[bool] = None,
    num_breaks: int = 2,
    log_space_weight_sc: float = 0.5,
):
    assert (
        solver == "tree"
    ), "Only the tree solver is supported for time series at the moment"
    df = copy.copy(df)

    # replace NaN values in numeric columns with zeros
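
With this change the default solver becomes "tree" and any other value is rejected up front. A hedged sketch of what existing callers that still pass solver="omp" (the old default) would now see, assuming the package is importable and that the assert fires before any data processing, as the diff suggests:

import pandas as pd

from wise_pizza.explain import explain_timeseries

try:
    explain_timeseries(
        # Placeholder frame; the assert is reached before the data is touched.
        df=pd.DataFrame({"DATE": [], "VOLUME": [], "ACTIVE_CUSTOMERS": [], "PRODUCT": []}),
        dims=["PRODUCT"],
        total_name="VOLUME",
        size_name="ACTIVE_CUSTOMERS",
        time_name="DATE",
        solver="omp",  # anything other than "tree" now fails
    )
except AssertionError as err:
    print(err)  # "Only the tree solver is supported for time series at the moment"
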
3 changes: 1 addition & 2 deletions wise_pizza/slicer.py
@@ -180,7 +180,7 @@ def fit(
        dim_df = dim_df.sort_values(sort_dims)
        dim_df = dim_df[dim_df["weights"] > 0]

        if len(groupby_dims) == 2:
        if groupby_dims is not None and len(groupby_dims) == 2:
            source_df = dim_df[dim_df["chunk"] == "Average"]
        else:
            source_df = dim_df
@@ -282,7 +282,6 @@
            max_depth,
            force_dim=force_dim,
            clusters=clusters,
            time_basis=self.time_basis,
        )
        assert len(self.col_defs) == self.X.shape[1]
        self.min_depth = min_depth
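
The added None check matters because groupby_dims is presumably left as None for plain, non-time-series fits, where calling len(None) would raise a TypeError. A tiny illustrative sketch of the guard, not the library's code:

groupby_dims = None  # e.g. a plain, non-time-series fit with no groupby dims

# len(groupby_dims) on its own would raise TypeError: object of type 'NoneType' has no len()
if groupby_dims is not None and len(groupby_dims) == 2:
    print("time-series path: keep only the 'Average' chunk rows")
else:
    print("default path: use the full dataframe")
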
31 changes: 18 additions & 13 deletions wise_pizza/solve/partition.py
@@ -49,19 +49,24 @@ def kmeans_partition(df: pd.DataFrame, dim: str, groupby_dims: List[str]):
    pivot_df = agg_df.pivot(
        index=groupby_dims, columns=dim, values="__avg"
    ).reset_index()
    nice_mats = {}
    for chunk in ["Average", "Weights"]:
        this_df = pivot_df[pivot_df["chunk"] == chunk]
        value_cols = [c for c in this_df.columns if c not in groupby_dims]
        nice_values = fill_gaps(this_df[value_cols].values)
        if chunk == "Weights":
            nice_values = (
                np.mean(nice_mats["Average"])
                * nice_values
                / np.sum(nice_values, axis=0, keepdims=True)
            )
        nice_mats[chunk] = nice_values
    joint_mat = np.concatenate([nice_mats["Average"], nice_mats["Weights"]], axis=0)
    value_cols = [c for c in pivot_df.columns if c not in groupby_dims]

    if len(groupby_dims) == 2:
        nice_mats = {}
        for chunk in ["Average", "Weights"]:
            this_df = pivot_df[pivot_df["chunk"] == chunk]
            nice_values = fill_gaps(this_df[value_cols].values)
            if chunk == "Weights":
                nice_values = (
                    np.mean(nice_mats["Average"])
                    * nice_values
                    / np.sum(nice_values, axis=0, keepdims=True)
                )
            nice_mats[chunk] = nice_values
        joint_mat = np.concatenate([nice_mats["Average"], nice_mats["Weights"]], axis=0)
    else:
        joint_mat = fill_gaps(pivot_df[value_cols].values)

    weights = pivot_df[value_cols].T.sum(axis=1)
    vector_dict = {}
    for i, c in enumerate(value_cols):
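
The two-dimensional branch keeps the previous logic: the Weights block is rescaled so each of its columns sums to the mean of the Average block before the two are stacked, presumably to keep both blocks on a comparable scale for the k-means partition. A small numeric sketch of that rescaling with stand-in arrays:

import numpy as np

avg = np.array([[1.0, 2.0], [3.0, 4.0]])  # stand-in for nice_mats["Average"]
w = np.array([[10.0, 1.0], [30.0, 3.0]])  # stand-in for the raw Weights values

w_scaled = np.mean(avg) * w / np.sum(w, axis=0, keepdims=True)
print(w_scaled.sum(axis=0))  # each column now sums to avg.mean() == 2.5

joint_mat = np.concatenate([avg, w_scaled], axis=0)
print(joint_mat.shape)  # (4, 2): averages stacked on top of rescaled weights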
