Skip to content

Commit

Permalink
Merge pull request #37 from transferwise/time
Browse files Browse the repository at this point in the history
Time
  • Loading branch information
AlxdrPolyakov authored Feb 6, 2024
2 parents d0a7af2 + e2cc8c2 commit 3bf293a
Show file tree
Hide file tree
Showing 22 changed files with 147,771 additions and 157 deletions.
145,515 changes: 145,515 additions & 0 deletions data/synth_time_data.csv

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions notebooks/Finding interesting segments (continuous segments).ipynb

Large diffs are not rendered by default.

262 changes: 262 additions & 0 deletions notebooks/Finding interesting segments in time series.ipynb

Large diffs are not rendered by default.

103 changes: 83 additions & 20 deletions notebooks/Finding interesting segments.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ scipy>=1.8.0
tqdm
cloudpickle
pivottablejs
streamlit==1.28.0
streamlit==1.28.0
nbformat>=4.2.0
51 changes: 50 additions & 1 deletion tests/test_fit.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,17 @@
import pandas as pd
import pytest

from wise_pizza.data_sources.synthetic import synthetic_data
from wise_pizza.data_sources.synthetic import synthetic_data, synthetic_ts_data
from wise_pizza.explain import (
explain_changes_in_average,
explain_changes_in_totals,
explain_levels,
explain_timeseries,
)
from wise_pizza.segment_data import SegmentData
from wise_pizza.solver import solve_lasso, solve_lp
from wise_pizza.time import create_time_basis
from wise_pizza.plotting_time import plot_time

np.random.seed(42)

Expand Down Expand Up @@ -137,6 +140,10 @@ def test_categorical():
def test_synthetic_template(nan_percent: float):
all_data = synthetic_data(init_len=1000)
data = all_data.data

data.loc[(data["dim0"] == 0) & (data["dim1"] == 1), "totals"] += 100
data.loc[(data["dim1"] == 0) & (data["dim2"] == 1), "totals"] += 300

if nan_percent > 0:
data = values_to_nan(data, nan_percent)
sf = explain_levels(
Expand All @@ -160,6 +167,48 @@ def test_synthetic_template(nan_percent: float):
print("yay!")


@pytest.mark.parametrize("nan_percent", [0.0, 1.0])
def test_synthetic_ts_template(nan_percent: float):
    """End-to-end check of explain_timeseries on synthetic monthly data.

    Plants two segments with known linear-trend coefficients (300 and 100)
    and asserts the solver recovers them, with and without injected NaNs.
    """
    ts_data = synthetic_ts_data(init_len=10000)

    # Add some big trends to the data
    # TODO: insert trend break patterns too
    time_points = np.array(sorted(ts_data.data[ts_data.time_col].unique()))
    time_basis = create_time_basis(time_points, baseline_dims=1)
    merged = pd.merge(ts_data.data, time_basis, left_on="TIME", right_index=True)
    data = merged.drop(columns=time_basis.columns)

    mask_a = (data["dim0"] == 0) & (data["dim1"] == 1)
    mask_b = (data["dim1"] == 0) & (data["dim2"] == 1)

    # Inject known linear trends into two specific segments.
    data.loc[mask_a, "totals"] += 100 * merged.loc[mask_a, "Slope"]
    data.loc[mask_b, "totals"] += 300 * merged.loc[mask_b, "Slope"]

    if nan_percent > 0:
        data = values_to_nan(data, nan_percent)

    result = explain_timeseries(
        data,
        dims=ts_data.dimensions,
        total_name=ts_data.segment_total,
        time_name=ts_data.time_col,
        size_name=ts_data.segment_size,
        max_depth=2,
        min_segments=5,
        verbose=True,
    )

    print("***")
    for seg in result.segments:
        print(seg)

    plot_time(result)

    # The fitted coefficients of the top two segments should match the
    # planted trends (300 then 100) to within a small tolerance.
    assert abs(result.segments[0]["coef"] - 300) < 2
    assert abs(result.segments[1]["coef"] - 100) < 2

    # sf.plot()
    print("yay!")


@pytest.mark.parametrize(
"how, solver, plot_is_static, function, nan_percent, size_one_percent",
deltas_test_cases,
Expand Down
1 change: 1 addition & 0 deletions wise_pizza/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
explain_levels,
explain_changes_in_totals,
explain_changes_in_average,
explain_timeseries,
)
26 changes: 21 additions & 5 deletions wise_pizza/data_sources/synthetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
np.random.seed(42)


def synthetic_data(num_dims: int = 5, dim_values: int = 5, init_len=10000):
def synthetic_data(num_dims: int = 5, dim_values: int = 5, init_len=10000) -> SegmentData:
np.random.seed(42)
cols = {}
for dim in range(num_dims):
Expand All @@ -17,9 +17,25 @@ def synthetic_data(num_dims: int = 5, dim_values: int = 5, init_len=10000):
cols["totals"] = np.random.lognormal(0, 1, size=init_len)
dims = [k for k in cols.keys() if "dim" in k]

df = pd.DataFrame(cols).groupby(dims, as_index=False).sum()
# deduplicate dimension values
df = pd.DataFrame(cols).groupby(dims, as_index=False).sum().reset_index(drop=True)
return SegmentData(data=df, dimensions=dims, segment_total="totals")

df.loc[(df["dim0"] == 0) & (df["dim1"] == 1), "totals"] += 100
df.loc[(df["dim1"] == 0) & (df["dim2"] == 1), "totals"] += 300

return SegmentData(data=df, dimensions=dims, segment_total="totals")
def synthetic_ts_data(
    num_dims: int = 5, dim_values: int = 5, init_len: int = 10000, ts_len: int = 12
) -> SegmentData:
    """Create a synthetic monthly panel dataset for time-series tests.

    Builds a cross-section of segments via ``synthetic_data`` (with its row
    budget shrunk so the replicated panel totals roughly ``init_len`` rows),
    then stamps a copy of it for each of ``ts_len`` consecutive month-start
    dates, drawing fresh lognormal ``totals`` for every month.

    @param num_dims: Number of dimension columns to generate.
    @param dim_values: Number of distinct values per dimension.
    @param init_len: Approximate total row count of the resulting panel.
    @param ts_len: Number of monthly periods to generate.
    @return: A SegmentData whose ``data`` is sorted by all dimensions then
        time, and whose ``time_col`` is set to ``"TIME"``.
    """
    # Floor division (idiomatic, same result as int(a/b) for positive ints)
    # keeps the per-month cross-section small enough that the replicated
    # panel stays near init_len rows.
    pre_data = synthetic_data(num_dims, dim_values, init_len // ts_len)
    small_df = pre_data.data

    # One month-start timestamp per period, beginning January 2023.
    months = np.array(pd.date_range(start="2023-01-01", periods=ts_len, freq="MS"))

    dfs = []
    for m in months:
        this_df = small_df.copy()
        this_df["TIME"] = m
        # Fresh noise each month so totals vary over time; note this draws
        # from the module-level seeded RNG, so call order fixes the values.
        this_df["totals"] = np.random.lognormal(0, 1, size=len(this_df))
        dfs.append(this_df)

    df = pd.concat(dfs)
    pre_data.time_col = "TIME"

    # Deterministic row order: by every dimension, then by time.
    pre_data.data = df.sort_values(
        pre_data.dimensions + [pre_data.time_col]
    ).reset_index(drop=True)
    return pre_data
Loading

0 comments on commit 3bf293a

Please sign in to comment.