Consider clusters of segments with similar naive averages as segment candidates in their own right #23

Merged · 8 commits · Dec 6, 2023
12 changes: 12 additions & 0 deletions README.md
@@ -121,6 +121,12 @@ sf1 = explain_changes_in_average(

![plot](https://github.com/transferwise/wise-pizza/blob/main/docs/explain_changes_in_average(totals).png?raw=True)

***In addition to single-value slices, you can consider slices that consist of a
group of segments from the same dimension with similar naive averages.***
For that, use the cluster_values=True parameter, as in the sketch below.
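A minimal sketch (df_pre, df_post and the column names here are placeholders; the keyword arguments follow the earlier examples in this README):

```Python
sf1 = explain_changes_in_average(
    df_pre,                       # hypothetical "before" DataFrame
    df_post,                      # hypothetical "after" DataFrame
    dims=["country", "product"],  # hypothetical dimension columns
    total_name="volume",          # hypothetical measure column
    size_name="num_users",        # hypothetical size column
    cluster_values=True,          # also consider clusters of values with similar naive averages
)
```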

![plot](https://github.com/transferwise/wise-pizza/blob/main/docs/cluster_values.png?raw=True)

And then you can visualize differences:

```Python
…
```

@@ -132,6 +138,12 @@
And check segments:
```Python
sf.segments
```

If you use cluster_values=True, you can also check the names of the relevant value clusters:
```Python
sf.relevant_cluster_names
```

Please see the full example [here](https://github.com/transferwise/wise-pizza/blob/main/notebooks/Finding%20interesting%20segments.ipynb)

## For Developers
Binary file added docs/cluster_values.png
191 changes: 68 additions & 123 deletions notebooks/Finding interesting segments (continuous segments).ipynb

Large diffs are not rendered by default.

207 changes: 93 additions & 114 deletions notebooks/Finding interesting segments.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/test_fit.py
@@ -129,6 +129,7 @@ def test_categorical():
    for s in sf.segments:
        print(s)
    print(sf.summary())
    print(sf.relevant_cluster_names)
    print("yay!")


47 changes: 47 additions & 0 deletions wise_pizza/cluster.py
@@ -0,0 +1,47 @@
from typing import Tuple

import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer
from sklearn.cluster import KMeans, kmeans_plusplus
from sklearn.metrics import silhouette_score


def guided_kmeans(X: np.ndarray, power_transform: bool = True) -> Tuple[np.ndarray, np.ndarray]:
    """
    Cluster segment averages to calculate aggregated segments
    @param X: Segment mean minus global mean, for each dimension value
    @param power_transform: Do we power transform before clustering
    @return: cluster labels and the transformed values
    """
    if isinstance(X, pd.Series):
        X = X.values.reshape(-1, 1)
    elif isinstance(X, pd.DataFrame):
        X = X.values

    if power_transform:
        # power-transform positive and negative values separately, if there is more than one of each
        if len(X[X > 0]) > 1:
            X[X > 0] = PowerTransformer(standardize=False).fit_transform(X[X > 0].reshape(-1, 1)).reshape(-1)
        if len(X[X < 0]) > 1:
            X[X < 0] = -PowerTransformer(standardize=False).fit_transform(-X[X < 0].reshape(-1, 1)).reshape(-1)

    best_score = -1
    best_labels = None
    best_n = -1
    # If we allow 2 clusters, it almost always just splits positive vs negative - boring!
    for n_clusters in range(3, int(len(X) / 2) + 1):
        cluster_labels = KMeans(n_clusters=n_clusters, init="k-means++", n_init=10).fit_predict(X)
        score = silhouette_score(X, cluster_labels)
        # print(n_clusters, score)
        if score > best_score:
            best_score = score
            best_labels = cluster_labels
            best_n = n_clusters
**Review comment (Collaborator):** Unlikely that it will cause problems, but we define `best_n` only in the `if` statement; I suggest adding `best_n = None` before the `for` loop.

**Reply (Author):** Will do.

    # print(best_n)
    return best_labels, X


def to_matrix(labels: np.ndarray) -> np.ndarray:
    # one-hot encode integer cluster labels into an indicator matrix
    out = np.zeros((len(labels), len(np.unique(labels))))
    for i in np.unique(labels):
        out[labels == i, i] = 1.0
    return out
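A minimal usage sketch for these helpers, on synthetic data invented for illustration (not from this diff):

```Python
import numpy as np
from wise_pizza.cluster import guided_kmeans, to_matrix

np.random.seed(0)
# Synthetic "segment average minus global average" values for one dimension
deltas = np.concatenate([
    np.random.normal(-2.0, 0.1, 5),  # below-average segment values
    np.random.normal(0.0, 0.1, 5),   # near-average segment values
    np.random.normal(2.0, 0.1, 5),   # above-average segment values
]).reshape(-1, 1)

labels, transformed = guided_kmeans(deltas)
dummies = to_matrix(labels)  # one indicator column per discovered cluster
print(labels, dummies.shape)
```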
55 changes: 41 additions & 14 deletions wise_pizza/explain.py
@@ -25,6 +25,7 @@ def explain_changes_in_average(
    how: str = "totals",
    force_add_up: bool = False,
    constrain_signs: bool = True,
    cluster_values: bool = False,
    verbose: int = 0,
):
    """
@@ -47,6 +48,8 @@
        to the difference between dataset totals
    @param constrain_signs: Whether to constrain weights of segments to have the same
        sign as naive segment averages
    @param cluster_values: In addition to single-value slices, consider slices that consist of a
        group of segments from the same dimension with similar naive averages
    @param verbose: If set to a truish value, lots of debug info is printed to console
    @return: A fitted object
    """
@@ -85,6 +88,7 @@
        how=how,
        force_add_up=force_add_up,
        constrain_signs=constrain_signs,
        cluster_values=cluster_values,
        verbose=verbose,
    )

@@ -118,6 +122,7 @@ def explain_changes_in_totals(
    how: str = "totals",
    force_add_up: bool = False,
    constrain_signs: bool = True,
    cluster_values: bool = False,
    verbose: int = 0,
):
    """
@@ -140,6 +145,8 @@
        to the difference between dataset totals
    @param constrain_signs: Whether to constrain weights of segments to have the same
        sign as naive segment averages
    @param cluster_values: In addition to single-value slices, consider slices that consist of a
        group of segments from the same dimension with similar naive averages
    @param verbose: If set to a truish value, lots of debug info is printed to console
    @return: A fitted object
    """
@@ -180,6 +187,7 @@
        solver=solver,
        force_add_up=force_add_up,
        constrain_signs=constrain_signs,
        cluster_values=cluster_values,
        verbose=verbose,
    )

@@ -194,20 +202,22 @@ def explain_changes_in_totals(
        solver=solver,
        force_add_up=force_add_up,
        constrain_signs=constrain_signs,
        cluster_values=cluster_values,
        verbose=verbose,
    )

    sf_size.final_size = final_size
    sf_avg.final_size = final_size
    sp = SlicerPair(sf_size, sf_avg)
    sp.plot = (
        lambda plot_is_static=False, width=2000, height=500: plot_split_segments(
            sp.s1,
            sp.s2,
            plot_is_static=plot_is_static,
            width=width,
            height=height,
        )
    )
    sp.plot = lambda plot_is_static=False, width=2000, height=500, cluster_key_width=180, cluster_value_width=318: plot_split_segments(
        sp.s1,
        sp.s2,
        plot_is_static=plot_is_static,
        width=width,
        height=height,
        cluster_values=cluster_values,
        cluster_key_width=cluster_key_width,
        cluster_value_width=cluster_value_width,
    )
    return sp

@@ -226,14 +236,21 @@
        force_dim="Change from" if how == "force_dim" else None,
        force_add_up=force_add_up,
        constrain_signs=constrain_signs,
        cluster_values=cluster_values,
**Review comment (Collaborator):** How about adding the cluster_values parameter to explain_changes_in_average as well? It calls explain_changes_in_totals every time anyway, but this would let someone explicitly pass cluster_values=False.

**Reply (Author):** Nice catch, will fix.
        verbose=verbose,
    )

    sf.pre_total = df1[total_name].sum()
    sf.post_total = df2[total_name].sum()

    sf.plot = lambda plot_is_static=False, width=1000, height=1000: plot_waterfall(
        sf, plot_is_static=plot_is_static, width=width, height=height
    )
    sf.plot = lambda plot_is_static=False, width=1000, height=1000, cluster_key_width=180, cluster_value_width=318: plot_waterfall(
        sf,
        plot_is_static=plot_is_static,
        width=width,
        height=height,
        cluster_values=cluster_values,
        cluster_key_width=cluster_key_width,
        cluster_value_width=cluster_value_width,
    )
    sf.task = "changes in totals"
    return sf
@@ -252,6 +269,7 @@ def explain_levels(
    verbose=0,
    force_add_up: bool = False,
    constrain_signs: bool = True,
    cluster_values: bool = False,
):
    """
    Find segments whose average is most different from the global one
@@ -266,8 +284,9 @@
    @param solver: If this equals "lp", uses the LP solver; else uses the (recommended) Lasso solver
    @param verbose: If set to a truish value, lots of debug info is printed to console
    @param force_add_up: Force the contributions of chosen segments to add up to zero
    @param constrain_signs: Whether to constrain weights of segments to have the same sign as naive segment averages
    @param cluster_values: In addition to single-value slices, consider slices that consist of a
        group of segments from the same dimension with similar naive averages
    @return: A fitted object
    """
    df = copy.copy(df)
@@ -297,15 +316,23 @@
        verbose=verbose,
        force_add_up=force_add_up,
        constrain_signs=constrain_signs,
        cluster_values=cluster_values,
    )

    for s in sf.segments:
        s["naive_avg"] += average
        s["total"] += average * s["seg_size"]
    # print(average)
    sf.reg.intercept_ = average
    sf.plot = lambda plot_is_static=False, width=2000, height=500, return_fig=False: plot_segments(
        sf, plot_is_static=plot_is_static, width=width, height=height, return_fig=return_fig
    )
    sf.plot = lambda plot_is_static=False, width=2000, height=500, return_fig=False, cluster_key_width=180, cluster_value_width=318: plot_segments(
        sf,
        plot_is_static=plot_is_static,
        width=width,
        height=height,
        return_fig=return_fig,
        cluster_values=cluster_values,
        cluster_key_width=cluster_key_width,
        cluster_value_width=cluster_value_width,
    )
    sf.task = "levels"
    return sf
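Taken together, a minimal usage sketch of the new parameter (the DataFrame df and the column names are placeholders, not from this diff; the package-level import is assumed from the project README):

```Python
from wise_pizza import explain_levels

sf = explain_levels(
    df,                           # hypothetical input DataFrame
    dims=["country", "product"],  # hypothetical dimension columns
    total_name="volume",          # hypothetical measure column
    size_name="num_users",        # hypothetical size column
    cluster_values=True,          # also consider clusters of values with similar naive averages
)
print(sf.relevant_cluster_names)  # which value clusters ended up in the chosen segments
sf.plot(cluster_key_width=180, cluster_value_width=318)  # the new width arguments added above
```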
63 changes: 60 additions & 3 deletions wise_pizza/make_matrix.py
@@ -1,5 +1,6 @@
import itertools
from typing import Optional, List, Dict, Sequence, Tuple
from collections import defaultdict

import numpy as np
import scipy
@@ -99,6 +100,8 @@ def sparse_dummy_matrix(
    max_depth: int = 2,
    verbose=0,
    force_dim: Optional[str] = None,
    clusters: Optional[Dict[str, Sequence[str]]] = None,
    cluster_names: Optional[Dict[str, str]] = None,
):
    # generate a sparse dummy matrix based on all the combinations
    # TODO: do a nested sparse regression fit to form groups of dim values, pos, neg, null

@@ -109,6 +112,9 @@
        assert force_dim in dim_df.columns
        dims = [c for c in dim_df.columns if c != force_dim]

    if clusters is None:
        clusters = defaultdict(list)

    # drop dimensions with only one value, for clarity
    dims = [d for d in dims if len(dim_df[d].unique()) > 1]

@@ -124,7 +130,12 @@
        this_mat, these_defs = join_to_sparse(dim_df, d, verbose=verbose)
        dummy_cache[d] = {this_def: this_mat[:, i : i + 1] for i, this_def in enumerate(these_defs)}

    # map each dimension name to its values, plus any cluster pseudo-values for that dimension
    dims_dict = {dim: list(dim_df[dim].unique()) + list(clusters[dim]) for dim in dim_df.columns}

    # Go over all possible depths
    for num_dims in tqdm(dims_range) if verbose else dims_range:
        # for each depth, enumerate the possible dimension combinations
        for these_dims in itertools.combinations(dims, num_dims):
            if num_dims == 1 and these_dims[0] == "Change from":
                continue
@@ -133,9 +144,55 @@
            else:
                used_dims = [force_dim] + list(these_dims)

            segment_constraints = segment_defs_new(dims_dict, used_dims)
            this_mat, these_defs = construct_dummies_new(used_dims, segment_constraints, dummy_cache, cluster_names)

            # these_defs = segment_defs(dim_df, used_dims, verbose=verbose)
            # this_mat = construct_dummies(these_defs, dummy_cache)
            mats.append(this_mat)
            defs += these_defs
    mat = hstack(mats)
    return mat, defs


def segment_defs_new(dims_dict: Dict[str, Sequence[str]], used_dims: List[str]) -> np.ndarray:
    # Look at all possible combinations of dimension values for the chosen dimensions,
    # one combination per row of the returned array
    if len(used_dims) == 1:
        return np.array(dims_dict[used_dims[0]]).reshape(-1, 1)
    else:
        tmp = segment_defs_new(dims_dict, used_dims[:-1])
        this_dim_values = np.array(dims_dict[used_dims[-1]])
        repeated_values = np.tile(this_dim_values.reshape(-1, 1), len(tmp)).reshape(-1, 1)
        pre_out = np.tile(tmp, (len(this_dim_values), 1))
        out = np.concatenate([pre_out, repeated_values], axis=1)
        return out
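As a quick illustration, this is what the recursion produces for two hypothetical dimensions (the values, including the cluster pseudo-value "plan_cluster_1", are invented):

```Python
dims_dict = {
    "country": ["DE", "FR"],
    "plan": ["basic", "pro", "plan_cluster_1"],  # last entry: hypothetical cluster pseudo-value
}
print(segment_defs_new(dims_dict, ["country", "plan"]))
# [['DE' 'basic']
#  ['FR' 'basic']
#  ['DE' 'pro']
#  ['FR' 'pro']
#  ['DE' 'plan_cluster_1']
#  ['FR' 'plan_cluster_1']]
```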


def construct_dummies_new(
    used_dims: List[str],
    segment_defs: np.ndarray,
    cache: Dict[str, Dict[str, np.ndarray]],
    cluster_names: Optional[Dict[str, str]] = None,
) -> Tuple[scipy.sparse.csc_matrix, List[Dict[str, str]]]:
    dummies = []
    segments = []
    for sgdf in segment_defs:
        tmp = None
        for i, d in enumerate(used_dims):
            if isinstance(sgdf[i], str) and sgdf[i] not in cache[d]:  # a group of multiple values from that dim
                # a cluster pseudo-value: sum the dummy columns of its member values
                sub_values = cluster_names[sgdf[i]].split("@@")
                this_dummy = 0
                for val in sub_values:
                    this_dummy += cache[d][val]
            else:
                this_dummy = cache[d][sgdf[i]]

            if tmp is None:
                tmp = this_dummy
            else:
                tmp = tmp.multiply(this_dummy)
        # keep only segments that actually contain data
        if tmp.sum() > 0:
            dummies.append(tmp)
            segments.append(dict(zip(used_dims, sgdf)))
    return hstack(dummies), segments
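To make the "@@" expansion concrete, here is a tiny sketch; the dimension values, the cluster name, and the three-row dummy columns are all invented for illustration:

```Python
import numpy as np
from scipy.sparse import csc_matrix

# Dummy columns for one dimension, one column per value (hypothetical data)
cache = {
    "country": {
        "DE": csc_matrix(np.array([[1], [0], [0]])),
        "FR": csc_matrix(np.array([[0], [1], [0]])),
        "ES": csc_matrix(np.array([[0], [0], [1]])),
    }
}

# A cluster pseudo-value encodes its members joined by "@@"
cluster_names = {"country_cluster_1": "DE@@FR"}

# Expanding the pseudo-value sums the member columns: rows in DE or FR
# get a 1, so the cluster acts as a single combined slice
combined = sum(cache["country"][v] for v in cluster_names["country_cluster_1"].split("@@"))
print(combined.toarray().ravel())  # [1 1 0]
```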