minor time series tweaks

transferwise · Nov 25, 2024 · f2caca5 · f2caca5
1 parent 9c2b224
commit f2caca5
Show file tree

Hide file tree

Showing 7 changed files with 149 additions and 137 deletions.
diff --git a/notebooks/Finding interesting segments in time series.ipynb b/notebooks/Finding interesting segments in time series.ipynb
diff --git a/tests/timeseries_wip_entrypoint.py b/tests/timeseries_wip_entrypoint.py
@@ -30,14 +30,15 @@
 sf = explain_timeseries(
     df=df,
     dims=dims,
-    max_segments=7,
+    num_segments=7,
     max_depth=2,
     total_name=totals,
     size_name=size,
     time_name=time,
     verbose=False,
     solver="tree",
     fit_sizes=True,
+    num_breaks=100,
 )
 sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC")
 print(sf.summary())

diff --git a/wise_pizza/explain.py b/wise_pizza/explain.py
@@ -374,10 +374,30 @@ def explain_timeseries(
     time_basis: Optional[pd.DataFrame] = None,
     fit_log_space: bool = False,
     fit_sizes: Optional[bool] = None,
-    num_breaks: int = 2,
+    num_breaks: int = 3,
+    n_jobs: int = 10,
     ignore_averages: bool = True,
     log_space_weight_sc: float = 0.5,
 ):
+    """
+    Split a time series panel dataset into segments that are as different as possible
+    :param df:  A pandas DataFrame with the time series data
+    :param dims: Discrete dimensions to segment by
+    :param total_name: Name of the column containing totals
+    :param time_name: Name of the column containing the time values
+    :param num_segments: Number of segments to find
+    :param size_name: (Optional) Name of the column containing the size of the segment
+    :param max_depth: (Optional, defaults to 2) Maximum number of dimensions to constrain per segment
+    :param fit_sizes: (Optional) Whether to fit the sizes of the segments, or just the averages
+    :param n_jobs: (Optional, defaults to 10) Number of jobs to run in parallel when finding segments
+    :param num_breaks: (Optional, defaults to 3) Number of breaks in stylized time series used for comparing segments
+    :param ignore_averages: If set to True (recommended), the level (across time) of each segment is ignored when calculating similarity
+    :param time_basis: A DataFrame with the time basis to use. Only use if you know what you're doing.
+    :param solver: (Optional) The solver to use; currently only "tree" is supported, and any other value fails the assertion below
+    :param fit_log_space: Do not use
+    :param log_space_weight_sc: Do not use
+    :return: The fitted result object (callers use its .plot() and .summary() methods)
+    """
     assert (
         solver == "tree"
     ), "Only the tree solver is supported for time series at the moment"
@@ -495,6 +515,7 @@ def explain_timeseries(
         verbose=verbose,
         groupby_dims=groupby_dims,
         cluster_values=False,
+        n_jobs=n_jobs,
     )
 
     # TODO: insert back the normalized bits?

diff --git a/wise_pizza/plotting_time_tree.py b/wise_pizza/plotting_time_tree.py
@@ -75,7 +75,7 @@ def preprocess_for_ts_plot(
 ) -> List[List[PlotData]]:
     out = []
     for row, s in enumerate(sf.segments):
-        print(row, s)
+        # print(row, s)
         this_df = pd.DataFrame(
             {
                 "time": sf.time,
@@ -158,7 +158,4 @@ def simple_ts_plot(
             row=row_num,
             col=col_num,
         )
-    fig.update_layout(
-        xaxis=dict(autorange=True),
-        yaxis=dict(autorange=True)
-    )
+    fig.update_layout(xaxis=dict(autorange=True), yaxis=dict(autorange=True))
diff --git a/wise_pizza/slicer.py b/wise_pizza/slicer.py
@@ -124,6 +124,7 @@ def fit(
         constrain_signs: bool = True,
         cluster_values: bool = True,
         groupby_dims: Optional[List[str]] = None,
+        n_jobs: int = 1,
     ):
         """
         Function to fit slicer and find segments
@@ -225,6 +226,8 @@ def fit(
                     num_leaves=max_segments,
                     max_depth=max_depth,
                     fitter=AverageFitter(),
+                    n_jobs=n_jobs,
+                    verbose=verbose,
                 )
 
                 Xw = csc_matrix(diags(self.weights) @ self.X)
@@ -261,6 +264,8 @@ def fit(
                     fitter=fitter,
                     num_leaves=max_segments,
                     max_depth=max_depth,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
                 )
             self.nonzeros = np.array(range(self.X.shape[1]))
 

diff --git a/wise_pizza/solve/tree.py b/wise_pizza/solve/tree.py
@@ -18,7 +18,8 @@ def tree_solver(
     fitter: Fitter,
     max_depth: Optional[int] = None,
     num_leaves: Optional[int] = None,
-    parallel_processes: int = 10,
+    n_jobs: int = 10,
+    verbose: bool = False,
 ):
     """
     Partition the data into segments using a greedy binary tree approach
@@ -27,6 +28,8 @@ def tree_solver(
     :param fitter: A model to fit on the chunks
     :param max_depth: max depth of the tree
     :param num_leaves: num leaves to generate
+    :param n_jobs: number of parallel jobs
+    :param verbose: print progress
     :return: Segment description, column definitions, and cluster names
     """
 
@@ -41,10 +44,10 @@ def tree_solver(
         dims=dims,
         time_col=None if isinstance(fitter, AverageFitter) else "__time",
         max_depth=max_depth,
-        parallel_processes=parallel_processes,
+        n_jobs=n_jobs,
     )
 
-    build_tree(root=root, num_leaves=num_leaves, max_depth=max_depth)
+    build_tree(root=root, num_leaves=num_leaves, max_depth=max_depth, verbose=verbose)
 
     leaves = get_leaves(root)
 
@@ -93,7 +96,7 @@ def __init__(
         time_col: str = None,
         max_depth: Optional[int] = None,
         dim_split: Optional[Dict[str, List]] = None,
-        parallel_processes: int = 10,
+        n_jobs: int = 10,
     ):
         self.df = df.copy().sort_values(dims + fitter.groupby_dims)
         self.fitter = fitter
@@ -107,7 +110,7 @@ def __init__(
         self.model = None
         # For dimension splitting candidates, hardwired for now
         self.num_bins = 10
-        self.parallel_processes = parallel_processes
+        self.parallel_processes = n_jobs
 
     @property
     def depth(self):
@@ -219,15 +222,23 @@ def get_best_subtree_result(
             return node2
 
 
-def build_tree(root: ModelNode, num_leaves: int, max_depth: Optional[int] = 1000):
+def build_tree(
+    root: ModelNode,
+    num_leaves: int,
+    max_depth: Optional[int] = 1000,
+    verbose: bool = False,
+):
     for i in range(num_leaves - 1):
-        print(f"Adding node {i+1}...")
+        if verbose:
+            print(f"Adding node {i+1}...")
         best_node = get_best_subtree_result(root, max_depth)
         if best_node.error_improvement > 0:
             best_node.children = best_node._best_submodels
-            print("Done!")
+            if verbose:
+                print("Done!")
         else:
-            print("No more improvement, stopping")
+            if verbose:
+                print("No more improvement, stopping")
             break
 
 

diff --git a/wise_pizza/time.py b/wise_pizza/time.py
@@ -45,6 +45,7 @@ def prune_time_basis(
     dtrend_cols = [t for t in time_basis.columns if "dtrend" in t]
     chosen_cols = []
     # from all the possible kinks, choose evenly spaced num_breaks ones
+    num_breaks = min(num_breaks, len(dtrend_cols) - 1)
     for i in range(1, num_breaks + 1):
         chosen_cols.append(dtrend_cols[int(i * len(dtrend_cols) / (num_breaks + 1))])
     pre_basis = time_basis[["Intercept", "Slope"] + chosen_cols].copy()