Merge pull request #62 from transferwise/time_tweaks

Small improvements to time series clustering functionality
transferwise · Nov 26, 2024 · 60c2f15 · 60c2f15
2 parents 21991a4 + 08d7dca
commit 60c2f15
Show file tree

Hide file tree

Showing 12 changed files with 386 additions and 232 deletions.
diff --git a/notebooks/Finding interesting segments in time series.ipynb b/notebooks/Finding interesting segments in time series.ipynb
diff --git a/requirements.txt b/requirements.txt
@@ -9,5 +9,5 @@ scipy>=1.8.0
 tqdm
 cloudpickle
 pivottablejs
-streamlit==1.32.0
+streamlit>=1.32.0
 nbformat>=4.2.0
diff --git a/tests/test_fit.py b/tests/test_fit.py
@@ -139,7 +139,7 @@ def test_time_series_tree_solver(fit_sizes: bool):
     sf = explain_timeseries(
         df=data.data,
         dims=data.dimensions,
-        max_segments=7,
+        num_segments=7,
         max_depth=2,
         total_name=data.segment_total,
         size_name=data.segment_size,

diff --git a/tests/timeseries_wip_entrypoint.py b/tests/timeseries_wip_entrypoint.py
@@ -30,14 +30,15 @@
 sf = explain_timeseries(
     df=df,
     dims=dims,
-    max_segments=7,
+    num_segments=7,
     max_depth=2,
     total_name=totals,
     size_name=size,
     time_name=time,
     verbose=False,
     solver="tree",
-    fit_sizes=False,
+    fit_sizes=True,
+    num_breaks=100,
 )
 sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC")
 print(sf.summary())

diff --git a/tests/timeseries_wip_entrypoint_2.py b/tests/timeseries_wip_entrypoint_2.py
@@ -0,0 +1,47 @@
+import os, sys
+import pandas as pd
+
+root_path = os.path.realpath("../..")
+print(root_path)
+
+# this assumes that all of the following files are checked in the same directory
+sys.path.append(os.path.join(root_path, "wise-pizza"))
+
+# create data-related directories
+data_dir = os.path.realpath(os.path.join(root_path, "wise-pizza/data"))
+if not os.path.isdir(data_dir):
+    os.mkdir(data_dir)
+print(data_dir)
+
+from wise_pizza import explain_timeseries
+
+df = pd.read_csv(
+    os.path.join(data_dir, "volume_data_new.csv")
+)  # replace this variable with your data
+dims = [
+    "CUSTOMER_TYPE",
+    "STRATEGIC_PRODUCT",
+    "SOURCE_CURRENCY",
+    "TARGET_CURRENCY",
+    "PRODUCT_USE_CASE",
+    "REGION",
+    "TRANS_VOL_BUCKET",
+]  # dimensions to find segments
+totals = "VOLUME_GBP"  # value to analyze
+size = "NUM_CUSTOMERS"  #'NUM_TRANSACTIONS'  # number of objects
+time = "ACTION_YM"
+sf = explain_timeseries(
+    df=df,
+    dims=dims,
+    num_segments=7,  # explain_timeseries no longer accepts max_segments
+    max_depth=2,
+    total_name=totals,
+    size_name=size,
+    time_name=time,
+    verbose=False,
+    solver="tree",
+    fit_sizes=True,
+)
+sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC")
+print(sf.summary())
+print("yay!")
diff --git a/wise_pizza/explain.py b/wise_pizza/explain.py
@@ -367,18 +367,37 @@ def explain_timeseries(
     total_name: str,
     time_name: str,
     size_name: Optional[str] = None,
-    min_segments: int = None,
-    max_segments: int = None,
-    min_depth: int = 1,
+    num_segments: int = None,
     max_depth: int = 2,
     solver: str = "tree",
     verbose: bool = False,
     time_basis: Optional[pd.DataFrame] = None,
     fit_log_space: bool = False,
     fit_sizes: Optional[bool] = None,
-    num_breaks: int = 2,
+    num_breaks: int = 3,
+    n_jobs: int = 10,
+    ignore_averages: bool = True,
     log_space_weight_sc: float = 0.5,
 ):
+    """
+    Split a time series panel dataset into segments that are as different as possible
+    :param df:  A pandas DataFrame with the time series data
+    :param dims: Discrete dimensions to segment by
+    :param total_name: Name of the column containing totals
+    :param time_name: Name of the column containing the time values
+    :param num_segments: Number of segments to find
+    :param size_name: (Optional) Name of the column containing the size of the segment
+    :param max_depth: (Optional, defaults to 2) Maximum number of dimensions to constrain per segment
+    :param fit_sizes: (Optional) Whether to fit the sizes of the segments, or just the averages
+    :param n_jobs: (Optional, defaults to 10) Number of jobs to run in parallel when finding segments
+    :param num_breaks: (Optional, defaults to 3) Number of breaks in stylized time series used for comparing segments
+    :param ignore_averages: If set to True (recommended), the level (across time) of each segment is ignored when calculating similarity
+    :param time_basis: A DataFrame with the time basis to use. Only use if you know what you're doing.
+    :param solver: (Optional) The solver to use, currently only "tree" is supported
+    :param fit_log_space: Do not use
+    :param log_space_weight_sc: Do not use
+    :return:
+    """
     assert (
         solver == "tree"
     ), "Only the tree solver is supported for time series at the moment"
@@ -450,16 +469,31 @@ def explain_timeseries(
         time_basis = (
             pd.concat([time_basis, re_basis], axis=0).fillna(0.0).reset_index(drop=True)
         )
-        print("yay!")
         groupby_dims = ["chunk", "__time"]
     else:
         groupby_dims = ["__time"]
 
     df2["_target"] = df2[total_name]
     df2["__time"] = df2[time_name]
-    df2["total_adjustment"] = 0.0
-    avg_df = 0.0
-    average = 0.0
+
+    # Adds the column of the time average over each dimension combination
+    if ignore_averages:
+        df2, avg_df = add_average_over_time(
+            df2,
+            dims=dims,
+            total_name=total_name,
+            size_name=size_name,
+            time_name="__time",
+            groupby_dims=groupby_dims,
+            cartesian=False,
+        )
+    else:
+        df2["total_adjustment"] = 0.0
+        avg_df = None
+
+    # The join in the above function could have messed up the ordering
+    df2 = df2.sort_values(by=dims + groupby_dims)
+    average = df2[total_name].sum() / df2[size_name].sum()
 
     sf = SliceFinder()
     sf.global_average = average
@@ -468,20 +502,20 @@ def explain_timeseries(
     sf.time_name = time_name
     sf.y_adj = df2["total_adjustment"].values
     sf.avg_df = avg_df
-    sf.time_values = df2[time_name].unique()
+    sf.time_values = df2["__time"].unique()
     sf.fit(
-        df2[dims + groupby_dims],
-        df2["_target"],
-        time_col=df2[time_name],
+        df2[dims + groupby_dims + ["total_adjustment"]],
+        df2[total_name],
+        time_col=df2["__time"],
         time_basis=time_basis,
         weights=df2[size_name],
-        min_segments=min_segments,
-        max_segments=max_segments,
-        min_depth=min_depth,
+        max_segments=num_segments,
         max_depth=max_depth,
         solver=solver,
         verbose=verbose,
         groupby_dims=groupby_dims,
+        cluster_values=False,
+        n_jobs=n_jobs,
     )
 
     # TODO: insert back the normalized bits?

diff --git a/wise_pizza/plotting_time_tree.py b/wise_pizza/plotting_time_tree.py
@@ -75,7 +75,7 @@ def preprocess_for_ts_plot(
 ) -> List[List[PlotData]]:
     out = []
     for row, s in enumerate(sf.segments):
-        print(row, s)
+        # print(row, s)
         this_df = pd.DataFrame(
             {
                 "time": sf.time,
@@ -158,3 +158,4 @@ def simple_ts_plot(
             row=row_num,
             col=col_num,
         )
+    fig.update_layout(xaxis=dict(autorange=True), yaxis=dict(autorange=True))
diff --git a/wise_pizza/slicer.py b/wise_pizza/slicer.py
@@ -27,7 +27,7 @@ def _summary(obj) -> str:
             {
                 k: v
                 for k, v in s.items()
-                if k in ["segment", "total", "seg_size", "naive_avg"]
+                if k in ["segment", "total", "seg_size", "naive_avg", "impact"]
             }
             for s in obj.segments
         ],
@@ -124,6 +124,7 @@ def fit(
         constrain_signs: bool = True,
         cluster_values: bool = True,
         groupby_dims: Optional[List[str]] = None,
+        n_jobs: int = 1,
     ):
         """
         Function to fit slicer and find segments
@@ -143,6 +144,9 @@ def fit(
         group of segments from the same dimension with similar naive averages
 
         """
+        dim_df = dim_df.copy()
+        if groupby_dims is None:
+            groupby_dims = []
 
         assert solver.lower() in ["lasso", "tree", "omp", "lp"]
         min_segments, max_segments = clean_up_min_max(min_segments, max_segments)
@@ -160,18 +164,20 @@ def fit(
         assert np.sum(np.abs(totals[weights == 0])) == 0
 
         # Cast all dimension values to strings
-        dim_df = dim_df.astype(str)
+        for c in dim_df.columns:
+            if c not in groupby_dims + ["total_adjustment"]:
+                dim_df[c] = dim_df[c].astype(str)
 
         dims = list(dim_df.columns)
-        if groupby_dims is not None:
-            dims = [d for d in dims if d not in groupby_dims]
+        if groupby_dims:
+            dims = [d for d in dims if d not in groupby_dims + ["total_adjustment"]]
         # sort the dataframe by dimension values,
         # making sure the other vectors stay aligned
         dim_df = dim_df.reset_index(drop=True)
         dim_df["totals"] = totals
         dim_df["weights"] = weights
 
-        if groupby_dims is not None:
+        if groupby_dims:
             dim_df = pd.merge(dim_df, time_basis, on=groupby_dims)
             sort_dims = dims + groupby_dims
         else:
@@ -220,6 +226,8 @@ def fit(
                     num_leaves=max_segments,
                     max_depth=max_depth,
                     fitter=AverageFitter(),
+                    n_jobs=n_jobs,
+                    verbose=verbose,
                 )
 
                 Xw = csc_matrix(diags(self.weights) @ self.X)
@@ -256,6 +264,8 @@ def fit(
                     fitter=fitter,
                     num_leaves=max_segments,
                     max_depth=max_depth,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
                 )
             self.nonzeros = np.array(range(self.X.shape[1]))
 
@@ -420,7 +430,8 @@ def relevant_cluster_names(self):
         relevant_clusters = {}
         for s in self.segments:
             for c in s["segment"].values():
-                if c in self.cluster_names:
+                if c in self.cluster_names and ";" not in c:
+                    # Cluster names containing ; are enumerations and don't need explanation
                     relevant_clusters[c] = self.cluster_names[c].replace("@@", ", ")
         return relevant_clusters
 

diff --git a/wise_pizza/solve/fitter.py b/wise_pizza/solve/fitter.py
@@ -48,7 +48,6 @@ def debug_plot(X, y, y_pred, w):
     plt.plot(X_agg["y_pred"] / X_agg["weights"], label="y_pred")
     plt.legend()
     plt.show()
-    print("yay!")
 
 
 class TimeFitterModel(ABC):