Skip to content

Commit

Permalink
Merge pull request #23 from transferwise/cluster
Browse files Browse the repository at this point in the history
Consider clusters of segments with similar naive averages as segment candidates in their own right
  • Loading branch information
AlxdrPolyakov authored Dec 6, 2023
2 parents a2de61f + 8400ec3 commit 9587cda
Show file tree
Hide file tree
Showing 10 changed files with 497 additions and 295 deletions.
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,12 @@ sf1 = explain_changes_in_average(

![plot](https://github.com/transferwise/wise-pizza/blob/main/docs/explain_changes_in_average(totals).png?raw=True)

***In addition to single-value slices, you can consider slices that consist of a
group of segments from the same dimension with similar naive averages.***
To enable this, pass the `cluster_values=True` parameter.

![plot](https://github.com/transferwise/wise-pizza/blob/main/docs/cluster_values.png?raw=True)

And then you can visualize differences:

```Python
Expand All @@ -132,6 +138,12 @@ And check segments:
```Python
sf.segments
```

If you use `cluster_values=True`, you can also inspect the relevant cluster names:
```Python
sf.relevant_cluster_names
```

Please see the full example [here](https://github.com/transferwise/wise-pizza/blob/main/notebooks/Finding%20interesting%20segments.ipynb)

## For Developers
Expand Down
Binary file added docs/cluster_values.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
191 changes: 68 additions & 123 deletions notebooks/Finding interesting segments (continuous segments).ipynb

Large diffs are not rendered by default.

207 changes: 93 additions & 114 deletions notebooks/Finding interesting segments.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/test_fit.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ def test_categorical():
for s in sf.segments:
print(s)
print(sf.summary())
print(sf.relevant_cluster_names)
print("yay!")


Expand Down
47 changes: 47 additions & 0 deletions wise_pizza/cluster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer
from sklearn.cluster import KMeans, kmeans_plusplus
from sklearn.metrics import silhouette_score


def guided_kmeans(X: np.ndarray, power_transform: bool = True) -> tuple:
    """
    Cluster segment averages to calculate aggregated segments.

    Every cluster count from 3 up to half the number of rows is tried with KMeans,
    and the labelling with the highest silhouette score is kept.

    @param X: Segment mean minus global mean, for each dimension value. May be a
    pandas Series, a pandas DataFrame or a numpy array; a 1-D input is treated as
    a single column. The input is never modified.
    @param power_transform: Do we power transform before clustering
    @return: A tuple (labels, values): the cluster label of each row, and the
    (possibly power-transformed) values that were clustered. labels is None when
    no cluster count could be tried, i.e. when there are fewer than 6 rows.
    """
    if isinstance(X, (pd.Series, pd.DataFrame)):
        X = X.values
    # Work on a float copy: the transform below assigns into the array, and the
    # caller's data (or the Series' backing buffer) must not change under them.
    X = np.array(X, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    if power_transform:
        # Positive and negative values are transformed separately (negatives are
        # mirrored, transformed, and mirrored back) so that signs are preserved.
        # A side with fewer than two values is left untouched.
        positive = X > 0
        if positive.sum() > 1:
            X[positive] = (
                PowerTransformer(standardize=False)
                .fit_transform(X[positive].reshape(-1, 1))
                .reshape(-1)
            )
        negative = X < 0
        if negative.sum() > 1:
            X[negative] = -(
                PowerTransformer(standardize=False)
                .fit_transform(-X[negative].reshape(-1, 1))
                .reshape(-1)
            )

    best_score = -1
    best_labels = None
    # If we allow 2 clusters, it almost always just splits positive vs negative - boring!
    for n_clusters in range(3, len(X) // 2 + 1):
        cluster_labels = KMeans(
            n_clusters=n_clusters, init="k-means++", n_init=10
        ).fit_predict(X)
        # With many duplicate values KMeans can return fewer distinct labels than
        # requested; the silhouette score is undefined for a single label.
        if len(np.unique(cluster_labels)) < 2:
            continue
        score = silhouette_score(X, cluster_labels)
        if score > best_score:
            best_score = score
            best_labels = cluster_labels

    return best_labels, X


def to_matrix(labels: np.ndarray) -> np.ndarray:
    """
    One-hot encode a vector of cluster labels.

    @param labels: Cluster label of each row; a numpy array or a pandas Series
    @return: A float array of shape (len(labels), number of distinct labels) with
    a 1.0 in the column of each row's label. Columns follow the sorted distinct
    labels, so for labels 0..k-1 the column index equals the label.
    """
    flat_labels = np.asarray(labels).reshape(-1)
    # return_inverse gives, for every row, the position of its label among the
    # sorted distinct labels - this also works for non-contiguous labels.
    distinct, column_of_row = np.unique(flat_labels, return_inverse=True)
    out = np.zeros((len(flat_labels), len(distinct)))
    out[np.arange(len(flat_labels)), column_of_row.reshape(-1)] = 1.0
    return out
55 changes: 41 additions & 14 deletions wise_pizza/explain.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def explain_changes_in_average(
how: str = "totals",
force_add_up: bool = False,
constrain_signs: bool = True,
cluster_values: bool = False,
verbose: int = 0,
):
"""
Expand All @@ -47,6 +48,8 @@ def explain_changes_in_average(
to the difference between dataset totals
@param constrain_signs: Whether to constrain weights of segments to have the same
sign as naive segment averages
@param cluster_values: In addition to single-value slices, consider slices that consist of a
group of segments from the same dimension with similar naive averages
@param verbose: If set to a truish value, lots of debug info is printed to console
@return: A fitted object
"""
Expand Down Expand Up @@ -85,6 +88,7 @@ def explain_changes_in_average(
how=how,
force_add_up=force_add_up,
constrain_signs=constrain_signs,
cluster_values=cluster_values,
verbose=verbose,
)

Expand Down Expand Up @@ -118,6 +122,7 @@ def explain_changes_in_totals(
how: str = "totals",
force_add_up: bool = False,
constrain_signs: bool = True,
cluster_values: bool=False,
verbose: int = 0,
):
"""
Expand All @@ -140,6 +145,8 @@ def explain_changes_in_totals(
to the difference between dataset totals
@param constrain_signs: Whether to constrain weights of segments to have the same
sign as naive segment averages
@param cluster_values: In addition to single-value slices, consider slices that consist of a
group of segments from the same dimension with similar naive averages
@param verbose: If set to a truish value, lots of debug info is printed to console
@return: A fitted object
"""
Expand Down Expand Up @@ -180,6 +187,7 @@ def explain_changes_in_totals(
solver=solver,
force_add_up=force_add_up,
constrain_signs=constrain_signs,
cluster_values=cluster_values,
verbose=verbose,
)

Expand All @@ -194,20 +202,22 @@ def explain_changes_in_totals(
solver=solver,
force_add_up=force_add_up,
constrain_signs=constrain_signs,
cluster_values=cluster_values,
verbose=verbose,
)

sf_size.final_size = final_size
sf_avg.final_size = final_size
sp = SlicerPair(sf_size, sf_avg)
sp.plot = (
lambda plot_is_static=False, width=2000, height=500: plot_split_segments(
sp.s1,
sp.s2,
plot_is_static=plot_is_static,
width=width,
height=height,
)
sp.plot = lambda plot_is_static=False, width=2000, height=500, cluster_key_width=180, cluster_value_width=318: plot_split_segments(
sp.s1,
sp.s2,
plot_is_static=plot_is_static,
width=width,
height=height,
cluster_values=cluster_values,
cluster_key_width=cluster_key_width,
cluster_value_width=cluster_value_width
)
return sp

Expand All @@ -226,14 +236,21 @@ def explain_changes_in_totals(
force_dim="Change from" if how == "force_dim" else None,
force_add_up=force_add_up,
constrain_signs=constrain_signs,
cluster_values=cluster_values,
verbose=verbose,
)

sf.pre_total = df1[total_name].sum()
sf.post_total = df2[total_name].sum()

sf.plot = lambda plot_is_static=False, width=1000, height=1000: plot_waterfall(
sf, plot_is_static=plot_is_static, width=width, height=height
sf.plot = lambda plot_is_static=False, width=1000, height=1000, cluster_key_width=180, cluster_value_width=318: plot_waterfall(
sf,
plot_is_static=plot_is_static,
width=width,
height=height,
cluster_values=cluster_values,
cluster_key_width=cluster_key_width,
cluster_value_width=cluster_value_width
)
sf.task = "changes in totals"
return sf
Expand All @@ -252,6 +269,7 @@ def explain_levels(
verbose=0,
force_add_up: bool = False,
constrain_signs: bool = True,
cluster_values: bool=False
):
"""
Find segments whose average is most different from the global one
Expand All @@ -266,8 +284,9 @@ def explain_levels(
@param solver: If this equals to "lp" uses the LP solver, else uses the (recommended) Lasso solver
@param verbose: If set to a truish value, lots of debug info is printed to console
@param force_add_up: Force the contributions of chosen segments to add up to zero
@param constrain_signs: Whether to constrain weights of segments to have the same
sign as naive segment averages
@param constrain_signs: Whether to constrain weights of segments to have the same sign as naive segment averages
@param cluster_values: In addition to single-value slices, consider slices that consist of a
group of segments from the same dimension with similar naive averages
@return: A fitted object
"""
df = copy.copy(df)
Expand Down Expand Up @@ -297,15 +316,23 @@ def explain_levels(
verbose=verbose,
force_add_up=force_add_up,
constrain_signs=constrain_signs,
cluster_values=cluster_values
)

for s in sf.segments:
s["naive_avg"] += average
s["total"] += average * s["seg_size"]
# print(average)
sf.reg.intercept_ = average
sf.plot = lambda plot_is_static=False, width=2000, height=500, return_fig=False: plot_segments(
sf, plot_is_static=plot_is_static, width=width, height=height, return_fig=return_fig
sf.plot = lambda plot_is_static=False, width=2000, height=500, return_fig=False, cluster_key_width=180, cluster_value_width=318: plot_segments(
sf,
plot_is_static=plot_is_static,
width=width,
height=height,
return_fig=return_fig,
cluster_values=cluster_values,
cluster_key_width=cluster_key_width,
cluster_value_width=cluster_value_width
)
sf.task = "levels"
return sf
63 changes: 60 additions & 3 deletions wise_pizza/make_matrix.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import itertools
from typing import Optional, List, Dict
from typing import Optional, List, Dict, Sequence
from collections import defaultdict

import numpy as np
import scipy
Expand Down Expand Up @@ -99,6 +100,8 @@ def sparse_dummy_matrix(
max_depth: int = 2,
verbose=0,
force_dim: Optional[str] = None,
clusters: Optional[Dict[str, Sequence[str]]] = None,
cluster_names: Optional[Dict[str,str]] = None
):
# generate a sparse dummy matrix based on all the combinations
# TODO: do a nested sparse regression fit to form groups of dim values, pos, neg, null
Expand All @@ -109,6 +112,9 @@ def sparse_dummy_matrix(
assert force_dim in dim_df.columns
dims = [c for c in dim_df.columns if c != force_dim]

if clusters is None:
clusters = defaultdict(list)

# drop dimensions with only one value, for clarity
dims = [d for d in dims if len(dim_df[d].unique()) > 1]

Expand All @@ -124,7 +130,12 @@ def sparse_dummy_matrix(
this_mat, these_defs = join_to_sparse(dim_df, d, verbose=verbose)
dummy_cache[d] = {this_def: this_mat[:, i : i + 1] for i, this_def in enumerate(these_defs)}

# TODO: maps dimension names to dimension values
dims_dict = {dim: list(dim_df[dim].unique()) + list(clusters[dim]) for dim in dim_df.columns}

# Go over all possible depths
for num_dims in tqdm(dims_range) if verbose else dims_range:
# for each depth, sample the possible dimension combinations
for these_dims in itertools.combinations(dims, num_dims):
if num_dims == 1 and these_dims[0] == "Change from":
continue
Expand All @@ -133,9 +144,55 @@ def sparse_dummy_matrix(
else:
used_dims = [force_dim] + list(these_dims)

these_defs = segment_defs(dim_df, used_dims, verbose=verbose)
this_mat = construct_dummies(these_defs, dummy_cache)
segment_constraints = segment_defs_new(dims_dict, used_dims)
this_mat, these_defs = construct_dummies_new(used_dims, segment_constraints, dummy_cache, cluster_names)

# these_defs = segment_defs(dim_df, used_dims, verbose=verbose)
# this_mat = construct_dummies(these_defs, dummy_cache)
mats.append(this_mat)
defs += these_defs
mat = hstack(mats)
return mat, defs


def segment_defs_new(dims_dict: Dict[str, Sequence[str]], used_dims: List[str]) -> np.ndarray:
    """
    Build all combinations of dimension values for the chosen dimensions.

    @param dims_dict: Maps a dimension name to the values to consider for it
    @param used_dims: The dimensions to combine, in column order
    @return: An object array with one row per combination and one column per
    entry of used_dims. The first dimension varies fastest, the last slowest.
    Values are stored unchanged (no conversion to a common string dtype), so they
    can still be used as lookup keys afterwards.
    """

    def _as_column(dim: str) -> np.ndarray:
        # Fill an object array element by element: np.array() on a mix of
        # numbers and strings would silently turn everything into strings.
        values = list(dims_dict[dim])
        column = np.empty((len(values), 1), dtype=object)
        for row, value in enumerate(values):
            column[row, 0] = value
        return column

    out = _as_column(used_dims[0])
    for dim in used_dims[1:]:
        column = _as_column(dim)
        n_previous = len(out)
        # Repeat the combinations built so far once per new value, and pair each
        # copy with a single value of the new dimension.
        out = np.concatenate(
            [np.tile(out, (len(column), 1)), np.repeat(column, n_previous, axis=0)],
            axis=1,
        )
    return out


def construct_dummies_new(
    used_dims: List[str],
    segment_defs: np.ndarray,
    cache: Dict[str, Dict[str, np.ndarray]],
    cluster_names: Optional[Dict[str, str]] = None,
) -> tuple:
    """
    Build the dummy (indicator) columns for a set of segment definitions.

    @param used_dims: The dimensions that the columns of segment_defs refer to
    @param segment_defs: One row per candidate segment, one value per used dimension
    @param cache: For each dimension, the single-column dummy of each of its values
    @param cluster_names: Maps a cluster name to its member values joined by "@@".
    A string value that is not a key of cache[dim] is treated as a cluster name.
    @return: A tuple (matrix, segments): the horizontally stacked dummies of all
    segments with a positive sum, and the matching list of {dimension: value}
    dicts. Stacking is delegated to hstack, including the case where no segment
    qualifies.
    @raise KeyError: If a string value is neither in the cache nor a known cluster name
    """
    # A cluster's dummy depends only on (dimension, cluster name), so build it once
    # instead of re-summing its members for every segment definition.
    cluster_dummies = {}

    def _dummy_for(dim, value):
        known = cache[dim]
        if not (isinstance(value, str) and value not in known):
            return known[value]

        # a group of multiple values from that dim
        key = (dim, value)
        if key not in cluster_dummies:
            if cluster_names is None or value not in cluster_names:
                raise KeyError(
                    f"{value!r} is neither a value of dimension {dim!r} nor a known cluster name"
                )
            combined = 0
            for member in cluster_names[value].split("@@"):
                combined = combined + known[member]
            cluster_dummies[key] = combined
        return cluster_dummies[key]

    dummies = []
    segments = []
    for sgdf in segment_defs:
        tmp = None
        for i, d in enumerate(used_dims):
            this_dummy = _dummy_for(d, sgdf[i])
            # A row belongs to the segment only if it matches on every dimension
            tmp = this_dummy if tmp is None else tmp.multiply(this_dummy)
        # Drop combinations that match no rows at all
        if tmp.sum() > 0:
            dummies.append(tmp)
            segments.append(dict(zip(used_dims, sgdf)))
    return hstack(dummies), segments
Loading

0 comments on commit 9587cda

Please sign in to comment.