transferwise · May 7, 2024 · Feb 16, 2024 · Feb 16, 2024 · Mar 20, 2024
@@ -361,6 +361,7 @@ def explain_timeseries(
     max_depth: int = 2,
     solver: str = "omp",
     verbose: bool = False,
+    constrain_signs: bool = False,
     cluster_values: bool = False,
     time_basis: Optional[pd.DataFrame] = None,
     fit_log_space: bool = False,
@@ -388,7 +389,10 @@ def explain_timeseries(
             fit_sizes = True
 
     if fit_log_space:
-        tf = LogTransform(offset=1, weight_pow_sc=log_space_weight_sc)
+        tf = LogTransform(
+            offset=1,
+            weight_pow_sc=log_space_weight_sc,
+        )
     else:
         tf = IdentityTransform()
 
@@ -415,6 +419,7 @@ def explain_timeseries(
             max_depth=max_depth,
             solver=solver,
             verbose=verbose,
+            constrain_signs=constrain_signs,
             cluster_values=cluster_values,
             time_basis=time_basis,
         )
@@ -441,6 +446,7 @@ def explain_timeseries(
         max_depth=max_depth,
         solver=solver,
         verbose=verbose,
+        constrain_signs=constrain_signs,
         cluster_values=cluster_values,
         time_basis=time_basis,
     )
@@ -477,6 +483,7 @@ def explain_timeseries(
         max_depth=max_depth,
         solver=solver,
         verbose=verbose,
+        constrain_signs=constrain_signs,
         cluster_values=cluster_values,
         time_basis=time_basis,
     )

@@ -14,6 +14,7 @@
 from wise_pizza.preselect import HeuristicSelector
 from wise_pizza.time import extend_dataframe
 from wise_pizza.slicer_facades import SliceFinderPredictFacade
+from wise_pizza.tree import tree_solver
 
 
 def _summary(obj) -> str:
@@ -116,7 +117,7 @@ def fit(
         @param max_segments: Maximum number of segments to find, defaults to min_segments
         @param min_depth: Minimum number of dimension to constrain in segment definition
         @param max_depth: Maximum number of dimension to constrain in segment definition
-        @param solver: If this equals to "lp" uses the LP solver, else uses the (recommended) Lasso solver
+        @param solver: Valid values are "lasso" (default), "tree" (for non-overlapping segments), "omp", or "lp"
         @param verbose: If set to a truish value, lots of debug info is printed to console
         @param force_dim: To add dim
         @param force_add_up: To force add up
@@ -125,6 +126,8 @@ def fit(
         group of segments from the same dimension with similar naive averages
 
         """
+
+        assert solver.lower() in ["lasso", "tree", "omp", "lp"]
         min_segments, max_segments = clean_up_min_max(min_segments, max_segments)
         if verbose is not None:
             self.verbose = verbose
@@ -139,12 +142,16 @@ def fit(
         assert min(weights) >= 0
         assert np.sum(np.abs(totals[weights == 0])) == 0
 
+        # Cast all dimension values to strings
+        dim_df = dim_df.astype(str)
+
         dims = list(dim_df.columns)
         # sort the dataframe by dimension values,
         # making sure the other vectors stay aligned
         dim_df = dim_df.reset_index(drop=True)
         dim_df["totals"] = totals
         dim_df["weights"] = weights
+
         if time_col is not None:
             dim_df["__time"] = time_col
             dim_df = pd.merge(dim_df, time_basis, left_on="__time", right_index=True)
@@ -176,6 +183,14 @@ def fit(
         # of dimension values with similar outcomes
         clusters = defaultdict(list)
         self.cluster_names = {}
+
+        if solver == "tree":
+            if cluster_values:
+                warnings.warn(
+                    "Ignoring cluster_values argument as it's irrelevant for tree solver"
+                )
+                cluster_values = False
+
         if cluster_values:
             for dim in dims:
                 if (
@@ -206,40 +221,49 @@ def fit(
         dim_df = dim_df[dims]  # if time_col is None else dims + ["__time"]]
         self.dim_df = dim_df
 
-        # lazy calculation of the dummy matrix (calculation can be very slow)
-        if (
-            list(dim_df.columns) != self.dims
-            or max_depth != self.max_depth
-            or self.X is not None
-            and len(dim_df) != self.X.shape[1]
-        ):
-            self.X, self.col_defs = self._init_mat(
-                dim_df,
-                min_depth,
-                max_depth,
-                force_dim=force_dim,
-                clusters=clusters,
-                time_basis=self.time_basis,
+        if solver == "tree":
+            self.X, self.reg, self.col_defs = tree_solver(
+                self.dim_df, self.weights, self.totals, self.time_basis
             )
-            assert len(self.col_defs) == self.X.shape[1]
-            self.min_depth = min_depth
-            self.max_depth = max_depth
-            self.dims = list(dim_df.columns)
+            self.nonzeros = np.array(range(self.X.shape[0])) == 1.0
+            Xw = csc_matrix(diags(self.weights) @ self.X)
+        else:
 
-        Xw = csc_matrix(diags(self.weights) @ self.X)
+            # lazy calculation of the dummy matrix (calculation can be very slow)
+            if (
+                list(dim_df.columns) != self.dims
+                or max_depth != self.max_depth
+                or self.X is not None
+                and len(dim_df) != self.X.shape[1]
+            ):
+                self.X, self.col_defs = self._init_mat(
+                    dim_df,
+                    min_depth,
+                    max_depth,
+                    force_dim=force_dim,
+                    clusters=clusters,
+                    time_basis=self.time_basis,
+                )
+                assert len(self.col_defs) == self.X.shape[1]
+                self.min_depth = min_depth
+                self.max_depth = max_depth
+                self.dims = list(dim_df.columns)
+
+            Xw = csc_matrix(diags(self.weights) @ self.X)
+
+            if self.verbose:
+                print("Starting solve!")
+            self.reg, self.nonzeros = find_alpha(
+                Xw,
+                self.totals,
+                max_nonzeros=max_segments,
+                solver=solver,
+                min_nonzeros=min_segments,
+                verbose=self.verbose,
+                adding_up_regularizer=force_add_up,
+                constrain_signs=constrain_signs,
+            )
 
-        if self.verbose:
-            print("Starting solve!")
-        self.reg, self.nonzeros = find_alpha(
-            Xw,
-            self.totals,
-            max_nonzeros=max_segments,
-            solver=solver,
-            min_nonzeros=min_segments,
-            verbose=self.verbose,
-            adding_up_regularizer=force_add_up,
-            constrain_signs=constrain_signs,
-        )
         if self.verbose:
             print("Solver done!!")
 

@@ -39,7 +39,7 @@ def inverse_transform_totals_weights(
         w = self.inverse_transform_weight(t_w, t_mean)
         return mean * w, w
 
-    def test_transforms(self, total, weights, eps=1e-6):
+    def test_transforms(self, total, weights, eps=1e-4):
         mean = total / weights
         t_mean = self.transform_mean(mean)
         assert almost_equals(mean, self.inverse_transform_mean(t_mean), eps)
@@ -71,19 +71,28 @@ def inverse_transform_weight(self, w: np.ndarray, x: np.ndarray) -> np.ndarray:
 
 class LogTransform(TransformWithWeights):
     def __init__(
-        self, offset: float, weight_pow_sc: float = 0.1, max_inverse: float = 1e6
+        self, offset: float, weight_pow_sc: float = 0.1, cap_inverse: bool = True
     ):
+        """Store the settings of the log transform.
+
+        @param offset: Constant added to the input before the logarithm is taken
+        @param weight_pow_sc: Stored unchanged on the instance
+        @param cap_inverse: If True, inverse_transform_mean is capped using
+        max_inverse, which starts at 0.0 here and is raised by transform_mean
+        """
         self.offset = offset
         self.weight_pow_sc = weight_pow_sc
-        self.max_inverse = max_inverse
+        self.cap_inverse = cap_inverse
+        if cap_inverse:
+            # Running cap: starts at zero and is raised inside transform_mean
+            self.max_inverse = 0.0
+        else:
+            # No cap is tracked when capping is switched off
+            self.max_inverse = None
 
     def transform_mean(self, x: np.ndarray) -> np.ndarray:
+        """Return log(offset + x).
+
+        Side effect: when cap_inverse is set, max_inverse is raised to at least
+        twice the largest value in x, so the cap applied by
+        inverse_transform_mean follows the largest input seen so far.
+        """
+        if self.cap_inverse:
+            self.max_inverse = np.maximum(self.max_inverse, 2 * x.max())
         return np.log(self.offset + x)
 
     def inverse_transform_mean(self, x: np.ndarray) -> np.ndarray:
-        return np.maximum(
-            0.0, np.exp(np.minimum(x, np.log(self.max_inverse))) - self.offset
-        )
+        """Map log-space means back to the original scale, floored at zero.
+
+        When cap_inverse is set, x is first clamped from above at
+        log(max_inverse), so the exponential cannot exceed the cap recorded
+        by transform_mean. Otherwise the plain inverse exp(x) - offset is used.
+
+        @param x: Values in log space
+        @return: Array of back-transformed values, never below 0.0
+        """
+        if self.cap_inverse:
+            return np.maximum(
+                0.0, np.exp(np.minimum(x, np.log(self.max_inverse))) - self.offset
+            )
+        # The uncapped result must be returned as well; without the explicit
+        # return this branch would hand None back to the caller.
+        return np.maximum(0.0, np.exp(x) - self.offset)
 
     def transform_weight(self, w: np.ndarray, mean: np.ndarray) -> np.ndarray:
         # pure math would give weight_pow_sc = 1, but then

@@ -0,0 +1,27 @@
+from typing import Optional
+
+import numpy as np
+import pandas as pd
+
+
+def tree_solver(
+    dim_df: pd.DataFrame,
+    weights: np.ndarray,
+    totals: np.ndarray,
+    time_basis: np.ndarray,
+    max_depth: int = 3,
+    num_leaves: Optional[int] = None,
+):
+    """Tree-based segment solver -- placeholder, not implemented yet.
+
+    Once implemented this is meant to return the triple (X, reg, col_defs).
+    Until then it fails explicitly instead of returning undefined names.
+
+    @param dim_df: Dataframe whose columns are the candidate split dimensions
+    @param weights: Per-row weights
+    @param totals: Per-row totals
+    @param time_basis: Time basis that totals/weights is to be regressed on
+    @param max_depth: Maximum depth of the tree
+    @param num_leaves: Optional target number of leaves; see the plan below
+    @raises NotImplementedError: always, until the algorithm is filled in
+    """
+    # TODO: fill in
+    # Build the tree as follows:
+    # 1. Start with a single node containing the whole dataset.
+    # 2. At each node, find the best split by looping over all dimensions; for
+    #    each dimension, decide which values go to the left and right subtrees
+    #    by regressing totals/weights on the time basis in both subsets
+    #    separately and minimising the total squared error.
+    # 3. The best (node, dimension) combination is the next one to be split.
+    # 4. If expanding the best node would exceed the maximum depth:
+    #    - if num_leaves is None: stop;
+    #    - otherwise expand the best node that does not exceed max_depth,
+    #      until num_leaves is reached.
+
+    raise NotImplementedError(
+        "tree_solver is a placeholder; expected to return (X, reg, col_defs)"
+    )