Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consider clusters of segments with similar naive averages as segment candidates in their own right #23

Merged
merged 8 commits into from
Dec 6, 2023
56 changes: 56 additions & 0 deletions wise_pizza/cluster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer
from sklearn.cluster import KMeans, kmeans_plusplus
from sklearn.metrics import silhouette_score


def guided_kmeans(X: np.ndarray, power_transform: bool = True) -> tuple:
    """
    Cluster segment averages to calculate aggregated segments
    @param X: Segment mean minus global mean, for each dimension value
    @param power_transform: Do we power transform before clustering
    @return: cluster labels and the transformed values
    """
    if isinstance(X, pd.Series):
        X = X.values.reshape(-1, 1)
    elif isinstance(X, pd.DataFrame):
        X = X.values

    if power_transform:
        # Transform the positive and the negative values separately so the
        # two sign groups don't get mixed up by a single transform.
        # Fixed: the original tested `len(X[X > 0] > 1)` — the length of a
        # boolean array — which made the `> 1` comparison dead code.
        if len(X[X > 0]) > 1:
            X[X > 0] = (
                PowerTransformer(standardize=False)
                .fit_transform(X[X > 0].reshape(-1, 1))
                .reshape(-1)
            )
        if len(X[X < 0]) > 1:
            X[X < 0] = (
                -PowerTransformer(standardize=False)
                .fit_transform(-X[X < 0].reshape(-1, 1))
                .reshape(-1)
            )

    best_score = -1
    best_labels = None
    best_n = None  # bound up-front so it exists even if no iteration wins
    # If we allow 2 clusters, it almost always just splits positive vs negative - boring!
    for n_clusters in range(3, 10):
        cluster_labels = KMeans(
            n_clusters=n_clusters, init="k-means++", n_init=10
        ).fit_predict(X)
        score = silhouette_score(X, cluster_labels)
        if score > best_score:
            best_score = score
            best_labels = cluster_labels
            best_n = n_clusters
    # Debug prints removed; best_n is kept for future use/inspection.
    return best_labels, X


def to_matrix(labels: np.ndarray) -> np.ndarray:
    """
    One-hot encode cluster labels into a dense indicator matrix.
    @param labels: Cluster labels (np.ndarray or pd.Series)
    @return: Matrix of shape (len(labels), n_unique); column j indicates
        membership in the j-th (sorted) unique label
    """
    labels = np.asarray(labels)
    # np.unique works for both ndarrays and Series; the original called
    # labels.unique(), which only exists on pandas objects despite the
    # np.ndarray annotation, and used the raw label value as the column
    # index (assuming labels are exactly 0..k-1).
    uniques = np.unique(labels)
    out = np.zeros((len(labels), len(uniques)))
    for col, lab in enumerate(uniques):
        out[labels == lab, col] = 1.0
    return out
17 changes: 7 additions & 10 deletions wise_pizza/explain.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,14 +200,12 @@ def explain_changes_in_totals(
sf_size.final_size = final_size
sf_avg.final_size = final_size
sp = SlicerPair(sf_size, sf_avg)
sp.plot = (
lambda plot_is_static=False, width=2000, height=500: plot_split_segments(
sp.s1,
sp.s2,
plot_is_static=plot_is_static,
width=width,
height=height,
)
sp.plot = lambda plot_is_static=False, width=2000, height=500: plot_split_segments(
sp.s1,
sp.s2,
plot_is_static=plot_is_static,
width=width,
height=height,
)
return sp

Expand Down Expand Up @@ -266,8 +264,7 @@ def explain_levels(
@param solver: If this equals to "lp" uses the LP solver, else uses the (recommended) Lasso solver
@param verbose: If set to a truish value, lots of debug info is printed to console
@param force_add_up: Force the contributions of chosen segments to add up to zero
@param constrain_signs: Whether to constrain weights of segments to have the same
sign as naive segment averages
@param constrain_signs: Whether to constrain weights of segments to have the same sign as naive segment averages
@return: A fitted object
"""
df = copy.copy(df)
Expand Down
39 changes: 36 additions & 3 deletions wise_pizza/make_matrix.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import itertools
from typing import Optional, List, Dict
from typing import Optional, List, Dict, Sequence

import numpy as np
import scipy
Expand Down Expand Up @@ -133,9 +133,42 @@ def sparse_dummy_matrix(
else:
used_dims = [force_dim] + list(these_dims)

these_defs = segment_defs(dim_df, used_dims, verbose=verbose)
this_mat = construct_dummies(these_defs, dummy_cache)
segment_constraints = segment_defs_new(dims_dict, used_dims)
this_mat, these_defs = construct_dummies_new(used_dims, segment_constraints, dummy_cache)

# these_defs = segment_defs(dim_df, used_dims, verbose=verbose)
# this_mat = construct_dummies(these_defs, dummy_cache)
mats.append(this_mat)
defs += these_defs
mat = hstack(mats)
return mat, defs


def segment_defs_new(dims_dict: Dict[str, Sequence[str]], used_dims: List[str]) -> np.ndarray:
    """
    Enumerate the cartesian product of values across the used dimensions.
    @param dims_dict: Mapping from dimension name to its possible values
    @param used_dims: The dimensions to combine, in order
    @return: Array of shape (n_combinations, len(used_dims)); each row is
        one value combination, columns ordered as used_dims
    """
    if len(used_dims) == 1:
        return np.array(dims_dict[used_dims[0]]).reshape(-1, 1)
    else:
        # Recursively build combinations over all but the last dimension...
        tmp = segment_defs_new(dims_dict, used_dims[:-1])
        this_dim_values = np.array(dims_dict[used_dims[-1]])
        # ...then pair every prefix row with every value of the last dim.
        # np.repeat keeps each prefix row adjacent while the last-dim values
        # cycle via np.tile — the original tiled BOTH sides, which only
        # covers all pairs when the lengths happen to be coprime.
        repeated_values = np.tile(this_dim_values, len(tmp)).reshape(-1, 1)
        pre_out = np.repeat(tmp, len(this_dim_values), axis=0)
        # np.concatenate takes a sequence of arrays plus an explicit axis;
        # the original passed the second array as the axis argument.
        out = np.concatenate([pre_out, repeated_values], axis=1)
        return out


def construct_dummies_new(
    used_dims: List[str], segment_defs: np.ndarray, cache: Dict[str, Dict[str, np.ndarray]]
) -> scipy.sparse.csc_matrix:
    # Build one sparse dummy column per candidate segment by intersecting
    # (elementwise-multiplying) the cached per-dimension indicator vectors.
    # NOTE(review): despite the annotation, this returns a PAIR
    # (sparse matrix, list of segment-definition dicts) — the annotation
    # should be a Tuple; confirm and fix upstream.
    # NOTE(review): cache values appear to be sparse vectors (they support
    # .multiply and .sum) rather than plain np.ndarray — TODO confirm.
    dummies = []
    segments = []
    for sgdf in segment_defs:
        tmp = None
        for i, d in enumerate(used_dims):
            if tmp is None:
                tmp = cache[d][sgdf[i]]
            else:
                # Intersection of indicators: nonzero only where all dims match.
                tmp = tmp.multiply(cache[d][sgdf[i]])
        # Skip segments that match no rows at all.
        if tmp.sum() > 0:
            dummies.append(tmp)
            segments.append(dict(zip(used_dims, sgdf)))
    return hstack(dummies), segments