transferwise · EgorKraevTransferwise · May 7, 2024 · Feb 16, 2024 · Feb 16, 2024 · Mar 20, 2024
@@ -14,7 +14,7 @@
     explain_timeseries,
 )
 from wise_pizza.segment_data import SegmentData
-from wise_pizza.solver import solve_lasso, solve_lp
+from wise_pizza.solve.solver import solve_lasso, solve_lp
 from wise_pizza.time import create_time_basis
 from wise_pizza.plotting_time import plot_time
 
@@ -136,9 +136,9 @@ def test_categorical():
     print("yay!")
 
 
-@pytest.mark.parametrize("nan_percent", [0.0, 1.0])
-def test_synthetic_template(nan_percent: float):
-    all_data = synthetic_data(init_len=1000)
+@pytest.mark.parametrize("nan_percent, clustering", [[0.0, False], [1.0, False]])
+def test_synthetic_template(nan_percent: float, clustering: bool):
+    all_data = synthetic_data(init_len=10000, dim_values=5)
     data = all_data.data
 
     data.loc[(data["dim0"] == 0) & (data["dim1"] == 1), "totals"] += 100
@@ -155,6 +155,7 @@ def test_synthetic_template(nan_percent: float):
         min_segments=5,
         verbose=1,
         solver="lp",
+        cluster_values=clustering,
     )
     print("***")
     for s in sf.segments:
@@ -167,6 +168,38 @@ def test_synthetic_template(nan_percent: float):
     print("yay!")
 
 
+@pytest.mark.parametrize("nan_percent", [0.0, 1.0])
+def test_synthetic_template_tree(nan_percent: float):
+    """Smoke-test ``explain_levels`` with ``solver="tree"`` on synthetic data.
+
+    Rows matching two dimension-value combinations get their totals raised
+    (by 200 and by 300), the data optionally goes through ``values_to_nan``,
+    and the segments found are printed. No assertions are made yet.
+    """
+    synthetic = synthetic_data(init_len=1000)
+    frame = synthetic.data
+
+    # Raise the totals of two specific dimension-value combinations
+    first_combo = (frame["dim0"] == 0) & (frame["dim1"] == 1)
+    frame.loc[first_combo, "totals"] += 200
+    second_combo = (frame["dim1"] == 0) & (frame["dim2"] == 1)
+    frame.loc[second_combo, "totals"] += 300
+
+    if nan_percent > 0:
+        frame = values_to_nan(frame, nan_percent)
+
+    finder = explain_levels(
+        frame,
+        dims=synthetic.dimensions,
+        total_name=synthetic.segment_total,
+        size_name=synthetic.segment_size,
+        max_depth=2,
+        min_segments=5,
+        verbose=1,
+        solver="tree",
+    )
+    print("***")
+    for segment in finder.segments:
+        print(segment)
+
+    # TODO: insert appropriate asserts
+    # assert abs(finder.segments[0]["coef"] - 300) < 2
+    # assert abs(finder.segments[1]["coef"] - 100) < 2
+
+    # finder.plot()
+    print("yay!")
+
+
 @pytest.mark.parametrize("nan_percent", [0.0, 1.0])
 def test_synthetic_ts_template(nan_percent: float):
     all_data = synthetic_ts_data(init_len=10000)

@@ -1,3 +1,6 @@
+from typing import List, Dict, Tuple
+from collections import defaultdict
+
 import numpy as np
 import pandas as pd
 from sklearn.preprocessing import PowerTransformer
@@ -18,17 +21,27 @@ def guided_kmeans(X: np.ndarray, power_transform: bool = True) -> np.ndarray:
         X = X.values
 
     if power_transform:
-        if len(X[X > 0] > 1):
-            X[X > 0] = PowerTransformer(standardize=False).fit_transform(X[X > 0].reshape(-1, 1)).reshape(-1)
-        if len(X[X < 0] > 1):
-            X[X < 0] = -PowerTransformer(standardize=False).fit_transform(-X[X < 0].reshape(-1, 1)).reshape(-1)
+        if len(X[X > 0]) > 1:
+            X[X > 0] = (
+                PowerTransformer(standardize=False)
+                .fit_transform(X[X > 0].reshape(-1, 1))
+                .reshape(-1)
+            )
+        if len(X[X < 0]) > 1:
+            X[X < 0] = (
+                -PowerTransformer(standardize=False)
+                .fit_transform(-X[X < 0].reshape(-1, 1))
+                .reshape(-1)
+            )
 
     best_score = -1
     best_labels = None
     best_n = -1
     # If we allow 2 clusters, it almost always just splits positive vs negative - boring!
     for n_clusters in range(3, int(len(X) / 2) + 1):
-        cluster_labels = KMeans(n_clusters=n_clusters, init="k-means++", n_init=10).fit_predict(X)
+        cluster_labels = KMeans(
+            n_clusters=n_clusters, init="k-means++", n_init=10
+        ).fit_predict(X)
         score = silhouette_score(X, cluster_labels)
         # print(n_clusters, score)
         if score > best_score:
@@ -45,3 +58,55 @@ def to_matrix(labels: np.ndarray) -> np.ndarray:
     for i in labels.unique():
         out[labels == i, i] = 1.0
     return out
+
+
+def make_clusters(dim_df: pd.DataFrame, dims: List[str]):
+    """Build named clusters of dimension values for each eligible dimension.
+
+    For every dimension in dims with at least 6 distinct values, "totals" and
+    "weights" are summed per value, the totals/weights ratio is handed to
+    guided_kmeans, and values that received the same label are joined
+    with "@@".
+
+    @param dim_df: Dataframe holding the dimension columns plus "totals" and "weights"
+    @param dims: Names of the dimension columns to consider
+    @return: Dict mapping "<dim>_cluster_<n>" to the "@@"-joined values of that
+    cluster; joined strings without "@@" (single-value clusters) are left out
+    """
+    named = {}
+    for dim in dims:
+        # otherwise what's the point in clustering?
+        if len(dim_df[dim].unique()) < 6:
+            continue
+
+        per_value = dim_df[[dim, "totals", "weights"]].groupby(dim, as_index=False).sum()
+        per_value["avg"] = per_value["totals"] / per_value["weights"]
+        labels, _ = guided_kmeans(per_value["avg"])
+        per_value["cluster"] = labels
+
+        joined = (
+            per_value[["cluster", dim]]
+            .groupby("cluster")
+            .agg({dim: lambda members: "@@".join(members)})
+            .values.reshape(-1)
+        )
+        # a cluster with only one element has no "@@" separator in it
+        multi_value = [c for c in joined if "@@" in c]
+        # create short cluster names, numbered from 1 within each dimension
+        named.update(
+            {f"{dim}_cluster_{n}": c for n, c in enumerate(multi_value, start=1)}
+        )
+    return named
+
+
+def nice_cluster_names(x: List[Dict[str, List[str]]]) -> Tuple[List[Dict], Dict]:
+    """Replace multi-value dimension entries with short, reproducible cluster names.
+
+    @param x: Segment definitions, each mapping a dimension name to the list
+    of values it covers
+    @return: Tuple (col_defs, cluster_names). col_defs mirrors x with every
+    multi-value list replaced by a "<dim>_cluster_<n>" name and every
+    single-value list replaced by its only value; cluster_names maps each
+    generated name to the "@@"-joined values it stands for
+    """
+    # first pass: collect the distinct multi-value clusters per dimension
+    cluster_strings = defaultdict(set)
+    for xx in x:
+        for dim, v in xx.items():
+            if len(v) > 1:
+                cluster_strings[dim].add("@@".join(v))
+
+    cluster_names = {}
+    reverse_cluster_names = {}
+    for dim, clusters in cluster_strings.items():
+        reverse_cluster_names[dim] = {}
+        # A set of strings iterates in a hash-seed-dependent order, so sort
+        # it to keep the cluster numbering identical from run to run
+        for i, c in enumerate(sorted(clusters), start=1):
+            name = f"{dim}_cluster_{i}"
+            cluster_names[name] = c
+            reverse_cluster_names[dim][c] = name
+
+    # second pass: rewrite each definition using the generated names
+    col_defs = []
+    for xx in x:
+        col_defs.append(
+            {
+                dim: reverse_cluster_names[dim]["@@".join(v)] if len(v) > 1 else v[0]
+                for dim, v in xx.items()
+            }
+        )
+
+    return col_defs, cluster_names
@@ -361,6 +361,7 @@ def explain_timeseries(
     max_depth: int = 2,
     solver: str = "omp",
     verbose: bool = False,
+    constrain_signs: bool = False,
     cluster_values: bool = False,
     time_basis: Optional[pd.DataFrame] = None,
     fit_log_space: bool = False,
@@ -388,7 +389,10 @@ def explain_timeseries(
             fit_sizes = True
 
     if fit_log_space:
-        tf = LogTransform(offset=1, weight_pow_sc=log_space_weight_sc)
+        tf = LogTransform(
+            offset=1,
+            weight_pow_sc=log_space_weight_sc,
+        )
     else:
         tf = IdentityTransform()
 
@@ -415,6 +419,7 @@ def explain_timeseries(
             max_depth=max_depth,
             solver=solver,
             verbose=verbose,
+            constrain_signs=constrain_signs,
             cluster_values=cluster_values,
             time_basis=time_basis,
         )
@@ -441,6 +446,7 @@ def explain_timeseries(
         max_depth=max_depth,
         solver=solver,
         verbose=verbose,
+        constrain_signs=constrain_signs,
         cluster_values=cluster_values,
         time_basis=time_basis,
     )
@@ -477,6 +483,7 @@ def explain_timeseries(
         max_depth=max_depth,
         solver=solver,
         verbose=verbose,
+        constrain_signs=constrain_signs,
         cluster_values=cluster_values,
         time_basis=time_basis,
     )

@@ -8,12 +8,14 @@
 import pandas as pd
 from scipy.sparse import csc_matrix, diags
 
-from wise_pizza.find_alpha import clean_up_min_max, find_alpha
+from wise_pizza.solve.find_alpha import clean_up_min_max, find_alpha
 from wise_pizza.make_matrix import sparse_dummy_matrix
-from wise_pizza.cluster import guided_kmeans
+from wise_pizza.cluster import make_clusters
 from wise_pizza.preselect import HeuristicSelector
 from wise_pizza.time import extend_dataframe
 from wise_pizza.slicer_facades import SliceFinderPredictFacade
+from wise_pizza.solve.tree import tree_solver
+from wise_pizza.solve.solver import solve_lasso
 
 
 def _summary(obj) -> str:
@@ -116,7 +118,7 @@ def fit(
         @param max_segments: Maximum number of segments to find, defaults to min_segments
         @param min_depth: Minimum number of dimension to constrain in segment definition
         @param max_depth: Maximum number of dimension to constrain in segment definition
-        @param solver: If this equals to "lp" uses the LP solver, else uses the (recommended) Lasso solver
+        @param solver: Valid values are "lasso" (default), "tree" (for non-overlapping segments), "omp", or "lp"
         @param verbose: If set to a truish value, lots of debug info is printed to console
         @param force_dim: To add dim
         @param force_add_up: To force add up
@@ -125,6 +127,8 @@ def fit(
         group of segments from the same dimension with similar naive averages
 
         """
+
+        assert solver.lower() in ["lasso", "tree", "omp", "lp"]
         min_segments, max_segments = clean_up_min_max(min_segments, max_segments)
         if verbose is not None:
             self.verbose = verbose
@@ -139,12 +143,16 @@ def fit(
         assert min(weights) >= 0
         assert np.sum(np.abs(totals[weights == 0])) == 0
 
+        # Cast all dimension values to strings
+        dim_df = dim_df.astype(str)
+
         dims = list(dim_df.columns)
         # sort the dataframe by dimension values,
         # making sure the other vectors stay aligned
         dim_df = dim_df.reset_index(drop=True)
         dim_df["totals"] = totals
         dim_df["weights"] = weights
+
         if time_col is not None:
             dim_df["__time"] = time_col
             dim_df = pd.merge(dim_df, time_basis, left_on="__time", right_index=True)
@@ -176,70 +184,73 @@ def fit(
         # of dimension values with similar outcomes
         clusters = defaultdict(list)
         self.cluster_names = {}
-        if cluster_values:
-            for dim in dims:
-                if (
-                    len(dim_df[dim].unique()) >= 6
-                ):  # otherwise what's the point in clustering?
-                    grouped_df = (
-                        dim_df[[dim, "totals", "weights"]]
-                        .groupby(dim, as_index=False)
-                        .sum()
-                    )
-                    grouped_df["avg"] = grouped_df["totals"] / grouped_df["weights"]
-                    grouped_df["cluster"], _ = guided_kmeans(grouped_df["avg"])
-                    pre_clusters = (
-                        grouped_df[["cluster", dim]]
-                        .groupby("cluster")
-                        .agg({dim: lambda x: "@@".join(x)})
-                        .values
-                    )
-                    # filter out clusters with only one element
-                    these_clusters = [c for c in pre_clusters.reshape(-1) if "@@" in c]
-                    # create short cluster names
-                    for i, c in enumerate(these_clusters):
-                        self.cluster_names[f"{dim}_cluster_{i+1}"] = c
+
+        if solver == "tree":
+            if cluster_values:
+                warnings.warn(
+                    "Ignoring cluster_values argument as tree solver makes its own clusters"
+                )
+            self.X, self.col_defs, self.cluster_names = tree_solver(
+                dim_df=dim_df,
+                dims=dims,
+                time_basis=self.time_basis,
+                num_leaves=max_segments,
+            )
+            self.nonzeros = np.array(range(self.X.shape[1]))
+            Xw = csc_matrix(diags(self.weights) @ self.X)
+            self.reg = solve_lasso(
+                Xw.toarray(),
+                self.totals,
+                alpha=1e-5,
+                verbose=self.verbose,
+                fit_intercept=False,
+            )
+            print("")
+        else:
+            if cluster_values:
+                self.cluster_names = make_clusters(dim_df, dims)
+                for dim in dims:
                     clusters[dim] = [
                         c for c in self.cluster_names.keys() if c.startswith(dim)
                     ]
 
-        dim_df = dim_df[dims]  # if time_col is None else dims + ["__time"]]
-        self.dim_df = dim_df
-
-        # lazy calculation of the dummy matrix (calculation can be very slow)
-        if (
-            list(dim_df.columns) != self.dims
-            or max_depth != self.max_depth
-            or self.X is not None
-            and len(dim_df) != self.X.shape[1]
-        ):
-            self.X, self.col_defs = self._init_mat(
-                dim_df,
-                min_depth,
-                max_depth,
-                force_dim=force_dim,
-                clusters=clusters,
-                time_basis=self.time_basis,
+            dim_df = dim_df[dims]  # if time_col is None else dims + ["__time"]]
+            self.dim_df = dim_df
+            # lazy calculation of the dummy matrix (calculation can be very slow)
+            if (
+                list(dim_df.columns) != self.dims
+                or max_depth != self.max_depth
+                or self.X is not None
+                and len(dim_df) != self.X.shape[1]
+            ):
+                self.X, self.col_defs = self._init_mat(
+                    dim_df,
+                    min_depth,
+                    max_depth,
+                    force_dim=force_dim,
+                    clusters=clusters,
+                    time_basis=self.time_basis,
+                )
+                assert len(self.col_defs) == self.X.shape[1]
+                self.min_depth = min_depth
+                self.max_depth = max_depth
+                self.dims = list(dim_df.columns)
+
+            Xw = csc_matrix(diags(self.weights) @ self.X)
+
+            if self.verbose:
+                print("Starting solve!")
+            self.reg, self.nonzeros = find_alpha(
+                Xw,
+                self.totals,
+                max_nonzeros=max_segments,
+                solver=solver,
+                min_nonzeros=min_segments,
+                verbose=self.verbose,
+                adding_up_regularizer=force_add_up,
+                constrain_signs=constrain_signs,
             )
-            assert len(self.col_defs) == self.X.shape[1]
-            self.min_depth = min_depth
-            self.max_depth = max_depth
-            self.dims = list(dim_df.columns)
 
-        Xw = csc_matrix(diags(self.weights) @ self.X)
-
-        if self.verbose:
-            print("Starting solve!")
-        self.reg, self.nonzeros = find_alpha(
-            Xw,
-            self.totals,
-            max_nonzeros=max_segments,
-            solver=solver,
-            min_nonzeros=min_segments,
-            verbose=self.verbose,
-            adding_up_regularizer=force_add_up,
-            constrain_signs=constrain_signs,
-        )
         if self.verbose:
             print("Solver done!!")