Skip to content

Commit

Permalink
Merge pull request #62 from transferwise/time_tweaks
Browse files Browse the repository at this point in the history
Small improvements to time series clustering functionality
  • Loading branch information
AlxdrPolyakov authored Nov 26, 2024
2 parents 21991a4 + 08d7dca commit 60c2f15
Show file tree
Hide file tree
Showing 12 changed files with 386 additions and 232 deletions.
286 changes: 134 additions & 152 deletions notebooks/Finding interesting segments in time series.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ scipy>=1.8.0
tqdm
cloudpickle
pivottablejs
streamlit==1.32.0
streamlit>=1.32.0
nbformat>=4.2.0
2 changes: 1 addition & 1 deletion tests/test_fit.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def test_time_series_tree_solver(fit_sizes: bool):
sf = explain_timeseries(
df=data.data,
dims=data.dimensions,
max_segments=7,
num_segments=7,
max_depth=2,
total_name=data.segment_total,
size_name=data.segment_size,
Expand Down
5 changes: 3 additions & 2 deletions tests/timeseries_wip_entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,15 @@
sf = explain_timeseries(
df=df,
dims=dims,
max_segments=7,
num_segments=7,
max_depth=2,
total_name=totals,
size_name=size,
time_name=time,
verbose=False,
solver="tree",
fit_sizes=False,
fit_sizes=True,
num_breaks=100,
)
sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC")
print(sf.summary())
Expand Down
47 changes: 47 additions & 0 deletions tests/timeseries_wip_entrypoint_2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""Work-in-progress entry point: run the time series explainer on a local CSV.

Expects a ``wise-pizza`` checkout two directories above the current working
directory, with the input file at ``wise-pizza/data/volume_data_new.csv``.
"""
import os
import sys

import pandas as pd

root_path = os.path.realpath("../..")
print(root_path)

# This assumes the wise-pizza checkout lives directly under root_path.
sys.path.append(os.path.join(root_path, "wise-pizza"))

# Create the data directory if needed; makedirs also creates missing parents
# and does not fail when the directory already exists.
data_dir = os.path.realpath(os.path.join(root_path, "wise-pizza/data"))
os.makedirs(data_dir, exist_ok=True)
print(data_dir)

# Imported only after sys.path has been extended above.
from wise_pizza import explain_timeseries  # noqa: E402

df = pd.read_csv(
    os.path.join(data_dir, "volume_data_new.csv")
)  # replace this variable with your data
dims = [
    "CUSTOMER_TYPE",
    "STRATEGIC_PRODUCT",
    "SOURCE_CURRENCY",
    "TARGET_CURRENCY",
    "PRODUCT_USE_CASE",
    "REGION",
    "TRANS_VOL_BUCKET",
]  # dimensions to find segments
totals = "VOLUME_GBP"  # value to analyze
size = "NUM_CUSTOMERS"  # 'NUM_TRANSACTIONS'  # number of objects
time = "ACTION_YM"
sf = explain_timeseries(
    df=df,
    dims=dims,
    # explain_timeseries takes num_segments (formerly max_segments).
    num_segments=7,
    max_depth=2,
    total_name=totals,
    size_name=size,
    time_name=time,
    verbose=False,
    solver="tree",
    fit_sizes=True,
)
sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC")
print(sf.summary())
print("yay!")
64 changes: 49 additions & 15 deletions wise_pizza/explain.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,18 +367,37 @@ def explain_timeseries(
total_name: str,
time_name: str,
size_name: Optional[str] = None,
min_segments: int = None,
max_segments: int = None,
min_depth: int = 1,
num_segments: int = None,
max_depth: int = 2,
solver: str = "tree",
verbose: bool = False,
time_basis: Optional[pd.DataFrame] = None,
fit_log_space: bool = False,
fit_sizes: Optional[bool] = None,
num_breaks: int = 2,
num_breaks: int = 3,
n_jobs: int = 10,
ignore_averages: bool = True,
log_space_weight_sc: float = 0.5,
):
"""
Split a time series panel dataset into segments that are as different as possible
:param df: A pandas DataFrame with the time series data
:param dims: Discrete dimensions to segment by
:param total_name: Name of the column containing totals
:param time_name: Name of the column containing the time values
:param num_segments: Number of segments to find
:param size_name: (Optional) Name of the column containing the size of the segment
:param max_depth: (Optional, defaults to 2) Maximum number of dimensions to constrain per segment
:param fit_sizes: (Optional) Whether to fit the sizes of the segments, or just the averages
:param n_jobs: (Optional, defaults to 10) Number of jobs to run in parallel when finding segments
:param num_breaks: (Optional, defaults to 3) Number of breaks in stylized time series used for comparing segments
:param ignore_averages: If set to True (recommended), the level (across time) of each segment is ignored when calculating similarity
:param time_basis: A DataFrame with the time basis to use. Only use if you know what you're doing.
:param solver: (Optional) The solver to use, currently only "tree" is supported
:param fit_log_space: Do not use
:param log_space_weight_sc: Do not use
:return:
"""
assert (
solver == "tree"
), "Only the tree solver is supported for time series at the moment"
Expand Down Expand Up @@ -450,16 +469,31 @@ def explain_timeseries(
time_basis = (
pd.concat([time_basis, re_basis], axis=0).fillna(0.0).reset_index(drop=True)
)
print("yay!")
groupby_dims = ["chunk", "__time"]
else:
groupby_dims = ["__time"]

df2["_target"] = df2[total_name]
df2["__time"] = df2[time_name]
df2["total_adjustment"] = 0.0
avg_df = 0.0
average = 0.0

# Adds the column of the time average over each dimension combination
if ignore_averages:
df2, avg_df = add_average_over_time(
df2,
dims=dims,
total_name=total_name,
size_name=size_name,
time_name="__time",
groupby_dims=groupby_dims,
cartesian=False,
)
else:
df2["total_adjustment"] = 0.0
avg_df = None

# The join in the above function could have messed up the ordering
df2 = df2.sort_values(by=dims + groupby_dims)
average = df2[total_name].sum() / df2[size_name].sum()

sf = SliceFinder()
sf.global_average = average
Expand All @@ -468,20 +502,20 @@ def explain_timeseries(
sf.time_name = time_name
sf.y_adj = df2["total_adjustment"].values
sf.avg_df = avg_df
sf.time_values = df2[time_name].unique()
sf.time_values = df2["__time"].unique()
sf.fit(
df2[dims + groupby_dims],
df2["_target"],
time_col=df2[time_name],
df2[dims + groupby_dims + ["total_adjustment"]],
df2[total_name],
time_col=df2["__time"],
time_basis=time_basis,
weights=df2[size_name],
min_segments=min_segments,
max_segments=max_segments,
min_depth=min_depth,
max_segments=num_segments,
max_depth=max_depth,
solver=solver,
verbose=verbose,
groupby_dims=groupby_dims,
cluster_values=False,
n_jobs=n_jobs,
)

# TODO: insert back the normalized bits?
Expand Down
3 changes: 2 additions & 1 deletion wise_pizza/plotting_time_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def preprocess_for_ts_plot(
) -> List[List[PlotData]]:
out = []
for row, s in enumerate(sf.segments):
print(row, s)
# print(row, s)
this_df = pd.DataFrame(
{
"time": sf.time,
Expand Down Expand Up @@ -158,3 +158,4 @@ def simple_ts_plot(
row=row_num,
col=col_num,
)
fig.update_layout(xaxis=dict(autorange=True), yaxis=dict(autorange=True))
23 changes: 17 additions & 6 deletions wise_pizza/slicer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def _summary(obj) -> str:
{
k: v
for k, v in s.items()
if k in ["segment", "total", "seg_size", "naive_avg"]
if k in ["segment", "total", "seg_size", "naive_avg", "impact"]
}
for s in obj.segments
],
Expand Down Expand Up @@ -124,6 +124,7 @@ def fit(
constrain_signs: bool = True,
cluster_values: bool = True,
groupby_dims: Optional[List[str]] = None,
n_jobs: int = 1,
):
"""
Function to fit slicer and find segments
Expand All @@ -143,6 +144,9 @@ def fit(
group of segments from the same dimension with similar naive averages
"""
dim_df = dim_df.copy()
if groupby_dims is None:
groupby_dims = []

assert solver.lower() in ["lasso", "tree", "omp", "lp"]
min_segments, max_segments = clean_up_min_max(min_segments, max_segments)
Expand All @@ -160,18 +164,20 @@ def fit(
assert np.sum(np.abs(totals[weights == 0])) == 0

# Cast all dimension values to strings
dim_df = dim_df.astype(str)
for c in dim_df.columns:
if c not in groupby_dims + ["total_adjustment"]:
dim_df[c] = dim_df[c].astype(str)

dims = list(dim_df.columns)
if groupby_dims is not None:
dims = [d for d in dims if d not in groupby_dims]
if groupby_dims:
dims = [d for d in dims if d not in groupby_dims + ["total_adjustment"]]
# sort the dataframe by dimension values,
# making sure the other vectors stay aligned
dim_df = dim_df.reset_index(drop=True)
dim_df["totals"] = totals
dim_df["weights"] = weights

if groupby_dims is not None:
if groupby_dims:
dim_df = pd.merge(dim_df, time_basis, on=groupby_dims)
sort_dims = dims + groupby_dims
else:
Expand Down Expand Up @@ -220,6 +226,8 @@ def fit(
num_leaves=max_segments,
max_depth=max_depth,
fitter=AverageFitter(),
n_jobs=n_jobs,
verbose=verbose,
)

Xw = csc_matrix(diags(self.weights) @ self.X)
Expand Down Expand Up @@ -256,6 +264,8 @@ def fit(
fitter=fitter,
num_leaves=max_segments,
max_depth=max_depth,
n_jobs=n_jobs,
verbose=verbose,
)
self.nonzeros = np.array(range(self.X.shape[1]))

Expand Down Expand Up @@ -420,7 +430,8 @@ def relevant_cluster_names(self):
relevant_clusters = {}
for s in self.segments:
for c in s["segment"].values():
if c in self.cluster_names:
if c in self.cluster_names and ";" not in c:
# Cluster names containing ";" are enumerations and don't need explanation
relevant_clusters[c] = self.cluster_names[c].replace("@@", ", ")
return relevant_clusters

Expand Down
1 change: 0 additions & 1 deletion wise_pizza/solve/fitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ def debug_plot(X, y, y_pred, w):
plt.plot(X_agg["y_pred"] / X_agg["weights"], label="y_pred")
plt.legend()
plt.show()
print("yay!")


class TimeFitterModel(ABC):
Expand Down
Loading

0 comments on commit 60c2f15

Please sign in to comment.