Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tree solver for time series, fits both averages and jointly averages and weights #60

Merged
merged 10 commits into from
Nov 15, 2024
Merged
228 changes: 195 additions & 33 deletions notebooks/Finding interesting segments in time series.ipynb

Large diffs are not rendered by default.

120 changes: 79 additions & 41 deletions tests/test_fit.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,43 @@ def monthly_driver_data():
)


def monthly_driver_ts_data():
df = pd.read_csv(
os.path.join(os.path.dirname(__file__), "../data", "synth_time_data.csv")
)
return SegmentData(
data=df,
dimensions=[
"PRODUCT",
"REGION",
"SOURCE_CURRENCY",
"TARGET_CURRENCY",
],
segment_total="VOLUME",
segment_size="ACTIVE_CUSTOMERS",
time_col="DATE",
)


@pytest.mark.parametrize("fit_sizes", [True, False])
def test_time_series_tree_solver(fit_sizes: bool):
data = monthly_driver_ts_data()
sf = explain_timeseries(
df=data.data,
dims=data.dimensions,
max_segments=7,
max_depth=2,
total_name=data.segment_total,
size_name=data.segment_size,
time_name=data.time_col,
verbose=False,
solver="tree",
fit_sizes=fit_sizes,
)
sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC")
print(sf.summary())


def test_categorical():
all_data = monthly_driver_data()
df = all_data.data
Expand Down Expand Up @@ -201,46 +238,47 @@ def test_synthetic_template_tree(nan_percent: float):
print("yay!")


@pytest.mark.parametrize("nan_percent", [0.0, 1.0])
def test_synthetic_ts_template(nan_percent: float):
all_data = synthetic_ts_data(init_len=10000)

# Add some big trends to the data
# TODO: insert trend break patterns too
months = np.array(sorted(all_data.data[all_data.time_col].unique()))
basis = create_time_basis(months, baseline_dims=1)
joined = pd.merge(all_data.data, basis, left_on="TIME", right_index=True)
df = joined.drop(columns=basis.columns)

loc1 = (df["dim0"] == 0) & (df["dim1"] == 1)
loc2 = (df["dim1"] == 0) & (df["dim2"] == 1)

df.loc[loc1, "totals"] += 100 * joined.loc[loc1, "Slope"]
df.loc[loc2, "totals"] += 300 * joined.loc[loc2, "Slope"]

if nan_percent > 0:
df = values_to_nan(df, nan_percent)
sf = explain_timeseries(
df,
dims=all_data.dimensions,
total_name=all_data.segment_total,
time_name=all_data.time_col,
size_name=all_data.segment_size,
max_depth=2,
max_segments=5,
verbose=True,
)
print("***")
for s in sf.segments:
print(s)

plot_time(sf)

assert abs(sf.segments[0]["coef"] - 300) < 2
assert abs(sf.segments[1]["coef"] - 100) < 2

# sf.plot()
print("yay!")
# The old solvers for time series no longer work
# @pytest.mark.parametrize("nan_percent", [0.0, 1.0])
# def test_synthetic_ts_template(nan_percent: float):
# all_data = synthetic_ts_data(init_len=10000)
#
# # Add some big trends to the data
# # TODO: insert trend break patterns too
# months = np.array(sorted(all_data.data[all_data.time_col].unique()))
# basis = create_time_basis(months, baseline_dims=1)
# joined = pd.merge(all_data.data, basis, left_on="TIME", right_index=True)
# df = joined.drop(columns=basis.columns)
#
# loc1 = (df["dim0"] == 0) & (df["dim1"] == 1)
# loc2 = (df["dim1"] == 0) & (df["dim2"] == 1)
#
# df.loc[loc1, "totals"] += 100 * joined.loc[loc1, "Slope"]
# df.loc[loc2, "totals"] += 300 * joined.loc[loc2, "Slope"]
#
# if nan_percent > 0:
# df = values_to_nan(df, nan_percent)
# sf = explain_timeseries(
# df,
# dims=all_data.dimensions,
# total_name=all_data.segment_total,
# time_name=all_data.time_col,
# size_name=all_data.segment_size,
# max_depth=2,
# max_segments=5,
# verbose=True,
# )
# print("***")
# for s in sf.segments:
# print(s)
#
# plot_time(sf)
#
# assert abs(sf.segments[0]["coef"] - 300) < 2
# assert abs(sf.segments[1]["coef"] - 100) < 2
#
# # sf.plot()
# print("yay!")


@pytest.mark.parametrize(
Expand Down Expand Up @@ -279,7 +317,7 @@ def test_deltas(
max_depth=1,
max_segments=10,
solver=solver,
cluster_values=cluster_values
cluster_values=cluster_values,
)
# sf.plot(plot_is_static=plot_is_static)
print("yay!")
Expand Down
44 changes: 44 additions & 0 deletions tests/timeseries_wip_entrypoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import os, sys
import pandas as pd

root_path = os.path.realpath("../..")
print(root_path)

# this assumes that all of the following files are checked in the same directory
sys.path.append(os.path.join(root_path, "wise-pizza"))

# create data-related directories
data_dir = os.path.realpath(os.path.join(root_path, "wise-pizza/data"))
if not os.path.isdir(data_dir):
os.mkdir(data_dir)
print(data_dir)

from wise_pizza import explain_timeseries

df = pd.read_csv(
os.path.join(data_dir, "synth_time_data.csv")
) # replace this variable with your data
dims = [
"PRODUCT",
"REGION",
"SOURCE_CURRENCY",
"TARGET_CURRENCY",
] # dimensions to find segments
totals = "VOLUME" # value to analyze
size = "ACTIVE_CUSTOMERS" # number of objects
time = "DATE"
sf = explain_timeseries(
df=df,
dims=dims,
max_segments=7,
max_depth=2,
total_name=totals,
size_name=size,
time_name=time,
verbose=False,
solver="tree",
fit_sizes=False,
)
sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC")
print(sf.summary())
print("yay!")
11 changes: 9 additions & 2 deletions wise_pizza/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,15 @@ def nice_cluster_names(x: List[Dict[str, List[str]]]) -> Tuple[List[Dict], Dict]
for dim, clusters in cluster_strings.items():
reverse_cluster_names[dim] = {}
for i, c in enumerate(clusters):
cluster_names[f"{dim}_cluster_{i + 1}"] = c
reverse_cluster_names[dim][c] = f"{dim}_cluster_{i + 1}"
ugly_name = f"{dim}_cluster_{i + 1}"
nice_name = c.replace("@@", ";")
if len(nice_name) < 1.2 * len(ugly_name):
name = nice_name
else:
name = ugly_name

cluster_names[name] = c
reverse_cluster_names[dim][c] = name

col_defs = []
for xx in x:
Expand Down
Loading
Loading