Merge pull request #23 from transferwise/cluster

Consider clusters of segments with similar naive averages as segment candidates in their own right
transferwise · Dec 6, 2023 · 9587cda · 9587cda
2 parents a2de61f + 8400ec3
commit 9587cda
Show file tree

Hide file tree

Showing 10 changed files with 497 additions and 295 deletions.
diff --git a/README.md b/README.md
@@ -121,6 +121,12 @@ sf1 = explain_changes_in_average(
 
 ![plot](https://github.com/transferwise/wise-pizza/blob/main/docs/explain_changes_in_average(totals).png?raw=True)
 
+***In addition to single-value slices, consider slices that consist of a
+    group of segments from the same dimension with similar naive averages***
+To enable this, pass the `cluster_values=True` parameter.
+
+![plot](https://github.com/transferwise/wise-pizza/blob/main/docs/cluster_values.png?raw=True)
+
 And then you can visualize differences:
 
 ```Python
@@ -132,6 +138,12 @@ And check segments:
 ```Python
 sf.segments
 ```
+
+If you use `cluster_values=True`, you can also check the relevant cluster names:
+```Python
+sf.relevant_cluster_names
+```
+
 Please see the full example [here](https://github.com/transferwise/wise-pizza/blob/main/notebooks/Finding%20interesting%20segments.ipynb)
 
 ## For Developers

diff --git a/docs/cluster_values.png b/docs/cluster_values.png
diff --git a/notebooks/Finding interesting segments (continuous segments).ipynb b/notebooks/Finding interesting segments (continuous segments).ipynb
diff --git a/notebooks/Finding interesting segments.ipynb b/notebooks/Finding interesting segments.ipynb
diff --git a/tests/test_fit.py b/tests/test_fit.py
@@ -129,6 +129,7 @@ def test_categorical():
     for s in sf.segments:
         print(s)
     print(sf.summary())
+    print(sf.relevant_cluster_names)
     print("yay!")
 
 

diff --git a/wise_pizza/cluster.py b/wise_pizza/cluster.py
@@ -0,0 +1,47 @@
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import PowerTransformer
+from sklearn.cluster import KMeans, kmeans_plusplus
+from sklearn.metrics import silhouette_score
+
+
def guided_kmeans(X: np.ndarray, power_transform: bool = True) -> tuple:
    """
    Cluster segment averages to calculate aggregated segments
    @param X: Segment mean minus global mean, for each dimension value. May be a
    2-D numpy array, a pandas Series (treated as a single column) or a DataFrame.
    The input is copied and never modified in place
    @param power_transform: Do we power transform before clustering
    @return: A tuple (labels, values). `values` is the (optionally transformed)
    float copy of X that was clustered. `labels` holds the cluster label of each
    row for the cluster count with the best silhouette score, or None when X has
    fewer than 6 rows, because then there is no candidate cluster count to try
    """
    if isinstance(X, pd.Series):
        X = X.values.reshape(-1, 1)
    elif isinstance(X, pd.DataFrame):
        X = X.values

    # Work on a float copy: the transform below assigns into the array, which
    # would otherwise overwrite the caller's data (and truncate on integer input)
    X = np.array(X, dtype=float)

    if power_transform:
        # Positive and negative values are transformed separately so that the
        # sign of every value is preserved; a side with a single value is left as is
        positive = X > 0
        if positive.sum() > 1:
            X[positive] = (
                PowerTransformer(standardize=False)
                .fit_transform(X[positive].reshape(-1, 1))
                .reshape(-1)
            )
        negative = X < 0
        if negative.sum() > 1:
            X[negative] = -(
                PowerTransformer(standardize=False)
                .fit_transform(-X[negative].reshape(-1, 1))
                .reshape(-1)
            )

    best_score = -1
    best_labels = None
    # If we allow 2 clusters, it almost always just splits positive vs negative - boring!
    for n_clusters in range(3, int(len(X) / 2) + 1):
        cluster_labels = KMeans(n_clusters=n_clusters, init="k-means++", n_init=10).fit_predict(X)
        score = silhouette_score(X, cluster_labels)
        if score > best_score:
            best_score = score
            best_labels = cluster_labels

    return best_labels, X
+
+
def to_matrix(labels: np.ndarray) -> np.ndarray:
    """
    Turn a vector of cluster labels into a one-hot membership matrix
    @param labels: One label per row; a numpy array, pandas Series or any sequence
    @return: A float array of shape (number of rows, number of distinct labels)
    with 1.0 at [row, column of that row's label] and 0.0 elsewhere. Columns
    follow the sorted distinct labels, so for labels 0..k-1 column i is label i
    """
    values = np.asarray(labels).reshape(-1)
    # return_inverse gives, for every row, the position of its label among the
    # sorted distinct labels - a valid column index even if labels have gaps
    distinct, column_of_row = np.unique(values, return_inverse=True)
    column_of_row = np.asarray(column_of_row).reshape(-1)

    out = np.zeros((len(values), len(distinct)))
    out[np.arange(len(values)), column_of_row] = 1.0
    return out
diff --git a/wise_pizza/explain.py b/wise_pizza/explain.py
@@ -25,6 +25,7 @@ def explain_changes_in_average(
     how: str = "totals",
     force_add_up: bool = False,
     constrain_signs: bool = True,
+    cluster_values: bool = False,
     verbose: int = 0,
 ):
     """
@@ -47,6 +48,8 @@ def explain_changes_in_average(
     to the difference between dataset totals
     @param constrain_signs: Whether to constrain weights of segments to have the same
     sign as naive segment averages
+    @param cluster_values: In addition to single-value slices, consider slices that consist of a
+    group of segments from the same dimension with similar naive averages
     @param verbose: If set to a truish value, lots of debug info is printed to console
     @return: A fitted object
     """
@@ -85,6 +88,7 @@ def explain_changes_in_average(
         how=how,
         force_add_up=force_add_up,
         constrain_signs=constrain_signs,
+        cluster_values=cluster_values,
         verbose=verbose,
     )
 
@@ -118,6 +122,7 @@ def explain_changes_in_totals(
     how: str = "totals",
     force_add_up: bool = False,
     constrain_signs: bool = True,
+    cluster_values: bool=False,
     verbose: int = 0,
 ):
     """
@@ -140,6 +145,8 @@ def explain_changes_in_totals(
     to the difference between dataset totals
     @param constrain_signs: Whether to constrain weights of segments to have the same
     sign as naive segment averages
+    @param cluster_values: In addition to single-value slices, consider slices that consist of a
+    group of segments from the same dimension with similar naive averages
     @param verbose: If set to a truish value, lots of debug info is printed to console
     @return: A fitted object
     """
@@ -180,6 +187,7 @@ def explain_changes_in_totals(
             solver=solver,
             force_add_up=force_add_up,
             constrain_signs=constrain_signs,
+            cluster_values=cluster_values,
             verbose=verbose,
         )
 
@@ -194,20 +202,22 @@ def explain_changes_in_totals(
             solver=solver,
             force_add_up=force_add_up,
             constrain_signs=constrain_signs,
+            cluster_values=cluster_values,
             verbose=verbose,
         )
 
         sf_size.final_size = final_size
         sf_avg.final_size = final_size
         sp = SlicerPair(sf_size, sf_avg)
-        sp.plot = (
-            lambda plot_is_static=False, width=2000, height=500: plot_split_segments(
-                sp.s1,
-                sp.s2,
-                plot_is_static=plot_is_static,
-                width=width,
-                height=height,
-            )
+        sp.plot = lambda plot_is_static=False, width=2000, height=500, cluster_key_width=180, cluster_value_width=318: plot_split_segments(
+            sp.s1,
+            sp.s2,
+            plot_is_static=plot_is_static,
+            width=width,
+            height=height,
+            cluster_values=cluster_values,
+            cluster_key_width=cluster_key_width,
+            cluster_value_width=cluster_value_width
         )
         return sp
 
@@ -226,14 +236,21 @@ def explain_changes_in_totals(
             force_dim="Change from" if how == "force_dim" else None,
             force_add_up=force_add_up,
             constrain_signs=constrain_signs,
+            cluster_values=cluster_values,
             verbose=verbose,
         )
 
         sf.pre_total = df1[total_name].sum()
         sf.post_total = df2[total_name].sum()
 
-        sf.plot = lambda plot_is_static=False, width=1000, height=1000: plot_waterfall(
-            sf, plot_is_static=plot_is_static, width=width, height=height
+        sf.plot = lambda plot_is_static=False, width=1000, height=1000, cluster_key_width=180, cluster_value_width=318: plot_waterfall(
+            sf,
+            plot_is_static=plot_is_static,
+            width=width,
+            height=height,
+            cluster_values=cluster_values,
+            cluster_key_width=cluster_key_width,
+            cluster_value_width=cluster_value_width
         )
         sf.task = "changes in totals"
         return sf
@@ -252,6 +269,7 @@ def explain_levels(
     verbose=0,
     force_add_up: bool = False,
     constrain_signs: bool = True,
+    cluster_values: bool=False
 ):
     """
     Find segments whose average is most different from the global one
@@ -266,8 +284,9 @@ def explain_levels(
     @param solver: If this equals to "lp" uses the LP solver, else uses the (recommended) Lasso solver
     @param verbose: If set to a truish value, lots of debug info is printed to console
     @param force_add_up: Force the contributions of chosen segments to add up to zero
-    @param constrain_signs: Whether to constrain weights of segments to have the same
-    sign as naive segment averages
+    @param constrain_signs: Whether to constrain weights of segments to have the same sign as naive segment averages
+    @param cluster_values: In addition to single-value slices, consider slices that consist of a
+    group of segments from the same dimension with similar naive averages
     @return: A fitted object
     """
     df = copy.copy(df)
@@ -297,15 +316,23 @@ def explain_levels(
         verbose=verbose,
         force_add_up=force_add_up,
         constrain_signs=constrain_signs,
+        cluster_values=cluster_values
     )
 
     for s in sf.segments:
         s["naive_avg"] += average
         s["total"] += average * s["seg_size"]
     # print(average)
     sf.reg.intercept_ = average
-    sf.plot = lambda plot_is_static=False, width=2000, height=500, return_fig=False: plot_segments(
-        sf, plot_is_static=plot_is_static, width=width, height=height, return_fig=return_fig
+    sf.plot = lambda plot_is_static=False, width=2000, height=500, return_fig=False, cluster_key_width=180, cluster_value_width=318: plot_segments(
+        sf,
+        plot_is_static=plot_is_static,
+        width=width,
+        height=height,
+        return_fig=return_fig,
+        cluster_values=cluster_values,
+        cluster_key_width=cluster_key_width,
+        cluster_value_width=cluster_value_width
     )
     sf.task = "levels"
     return sf
diff --git a/wise_pizza/make_matrix.py b/wise_pizza/make_matrix.py
@@ -1,5 +1,6 @@
 import itertools
-from typing import Optional, List, Dict
+from typing import Optional, List, Dict, Sequence
+from collections import defaultdict
 
 import numpy as np
 import scipy
@@ -99,6 +100,8 @@ def sparse_dummy_matrix(
     max_depth: int = 2,
     verbose=0,
     force_dim: Optional[str] = None,
+    clusters: Optional[Dict[str, Sequence[str]]] = None,
+    cluster_names: Optional[Dict[str,str]] = None
 ):
     # generate a sparse dummy matrix based on all the combinations
     # TODO: do a  nested sparse regression fit to form groups of dim values, pos, neg, null
@@ -109,6 +112,9 @@ def sparse_dummy_matrix(
         assert force_dim in dim_df.columns
         dims = [c for c in dim_df.columns if c != force_dim]
 
+    if clusters is None:
+        clusters = defaultdict(list)
+
     # drop dimensions with only one value, for clarity
     dims = [d for d in dims if len(dim_df[d].unique()) > 1]
 
@@ -124,7 +130,12 @@ def sparse_dummy_matrix(
         this_mat, these_defs = join_to_sparse(dim_df, d, verbose=verbose)
         dummy_cache[d] = {this_def: this_mat[:, i : i + 1] for i, this_def in enumerate(these_defs)}
 
+    # TODO: maps dimension names to dimension values
+    dims_dict = {dim: list(dim_df[dim].unique()) + list(clusters[dim]) for dim in dim_df.columns}
+
+    # Go over all possible depths
     for num_dims in tqdm(dims_range) if verbose else dims_range:
+        # for each depth, sample the possible dimension combinations
         for these_dims in itertools.combinations(dims, num_dims):
             if num_dims == 1 and these_dims[0] == "Change from":
                 continue
@@ -133,9 +144,55 @@ def sparse_dummy_matrix(
             else:
                 used_dims = [force_dim] + list(these_dims)
 
-            these_defs = segment_defs(dim_df, used_dims, verbose=verbose)
-            this_mat = construct_dummies(these_defs, dummy_cache)
+            segment_constraints = segment_defs_new(dims_dict, used_dims)
+            this_mat, these_defs = construct_dummies_new(used_dims, segment_constraints, dummy_cache, cluster_names)
+
+            # these_defs = segment_defs(dim_df, used_dims, verbose=verbose)
+            # this_mat = construct_dummies(these_defs, dummy_cache)
             mats.append(this_mat)
             defs += these_defs
     mat = hstack(mats)
     return mat, defs
+
+
def segment_defs_new(dims_dict: Dict[str, Sequence[str]], used_dims: List[str]) -> np.ndarray:
    """
    Look at all possible combinations of dimension values for the chosen dimensions
    @param dims_dict: Maps each dimension name to the candidate values of that dimension
    @param used_dims: Names of the dimensions to combine, at least one
    @return: An object array of shape (number of combinations, len(used_dims));
    column j holds a value of used_dims[j]. The first dimension varies fastest
    and the last one slowest. Values keep their original Python type
    """
    if len(used_dims) == 0:
        raise ValueError("used_dims must contain at least one dimension")

    value_lists = [list(dims_dict[dim]) for dim in used_dims]
    n_rows = 1
    for values in value_lists:
        n_rows *= len(values)

    # dtype=object keeps each value as it is; a plain np.array() would turn
    # e.g. integer values into strings as soon as one string value is present
    out = np.empty((n_rows, len(used_dims)), dtype=object)

    # itertools.product varies its last argument fastest, so the dimensions are
    # fed in reverse order and every combination is flipped back when stored
    for row, combo in enumerate(itertools.product(*reversed(value_lists))):
        for col, value in enumerate(reversed(combo)):
            out[row, col] = value
    return out
+
+
def construct_dummies_new(
    used_dims: List[str],
    segment_defs: np.ndarray,
    cache: Dict[str, Dict[str, np.ndarray]],
    cluster_names: Optional[Dict[str, str]] = None,
) -> tuple:
    """
    Build the dummy (indicator) columns for a set of candidate segments
    @param used_dims: Names of the dimensions that define each segment
    @param segment_defs: One row per candidate segment; entry i of a row is the
    value (or cluster name) for used_dims[i]
    @param cache: For each dimension, maps a dimension value to its dummy column
    @param cluster_names: Maps a cluster name to its member values joined by "@@".
    Only consulted for string values that are not found in the cache
    @return: A tuple (matrix, segments): the kept dummy columns stacked
    horizontally, and for each kept column a dict mapping dimension to value.
    Candidates whose dummy column sums to zero are dropped
    """
    if cluster_names is None:
        cluster_names = {}

    dummies = []
    segments = []
    for sgdf in segment_defs:
        tmp = None
        for i, d in enumerate(used_dims):
            value = sgdf[i]
            dim_cache = cache[d]
            if isinstance(value, str) and value not in dim_cache:
                # a group of multiple values from that dim
                if value not in cluster_names:
                    raise KeyError(
                        f"{value!r} is neither a value of dimension {d!r} nor a known cluster name"
                    )
                members = cluster_names[value].split("@@")
                # the cluster's dummy is the sum of its members' dummies
                this_dummy = dim_cache[members[0]]
                for member in members[1:]:
                    this_dummy = this_dummy + dim_cache[member]
            else:
                this_dummy = dim_cache[value]

            # element-wise product = rows that satisfy every dimension constraint
            tmp = this_dummy if tmp is None else tmp.multiply(this_dummy)

        if tmp.sum() > 0:
            dummies.append(tmp)
            segments.append(dict(zip(used_dims, sgdf)))
    return hstack(dummies), segments