-
Notifications
You must be signed in to change notification settings - Fork 12
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Consider clusters of segments with similar naive averages as segment candidates in their own right #23
Consider clusters of segments with similar naive averages as segment candidates in their own right #23
Changes from all commits
d379775
b0debad
4092fc1
7300020
d9496e5
51ddd47
445509b
8400ec3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
import numpy as np | ||
import pandas as pd | ||
from sklearn.preprocessing import PowerTransformer | ||
from sklearn.cluster import KMeans, kmeans_plusplus | ||
from sklearn.metrics import silhouette_score | ||
|
||
|
||
def guided_kmeans(X: np.ndarray, power_transform: bool = True) -> "tuple[np.ndarray | None, np.ndarray]":
    """
    Cluster segment averages to calculate aggregated segments

    Positive and negative values are (optionally) power-transformed separately,
    then KMeans is run for every cluster count from 3 up to half the number of
    rows, and the labelling with the highest silhouette score is kept.

    @param X: Segment mean minus global mean, for each dimension value. May be a
    numpy array, a pandas Series (treated as a single column) or a DataFrame.
    The input is never modified; a float copy is transformed instead.
    @param power_transform: Do we power transform before clustering
    @return: cluster labels and the transformed values. The labels are None when
    no candidate cluster count could be scored (e.g. fewer than 6 rows).
    """
    if isinstance(X, pd.Series):
        X = X.values.reshape(-1, 1)
    elif isinstance(X, pd.DataFrame):
        X = X.values

    # Work on a float copy: the transform below writes into X, which must neither
    # leak back into the caller's data nor be truncated by an integer dtype.
    X = np.array(X, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    if power_transform:
        positive = X > 0
        negative = X < 0
        # Fitting a power transform needs more than one value on that side of zero.
        if positive.sum() > 1:
            X[positive] = PowerTransformer(standardize=False).fit_transform(X[positive].reshape(-1, 1)).reshape(-1)
        if negative.sum() > 1:
            # Mirror the negatives so both sides are transformed the same way.
            X[negative] = -PowerTransformer(standardize=False).fit_transform(-X[negative].reshape(-1, 1)).reshape(-1)

    best_score = -1
    best_labels = None
    # If we allow 2 clusters, it almost always just splits positive vs negative - boring!
    for n_clusters in range(3, int(len(X) / 2) + 1):
        cluster_labels = KMeans(n_clusters=n_clusters, init="k-means++", n_init=10).fit_predict(X)
        # With many duplicate points KMeans can return fewer distinct labels than
        # requested; silhouette_score raises unless there are at least two.
        if len(np.unique(cluster_labels)) < 2:
            continue
        score = silhouette_score(X, cluster_labels)
        if score > best_score:
            best_score = score
            best_labels = cluster_labels

    return best_labels, X
|
||
|
||
def to_matrix(labels: np.ndarray) -> np.ndarray:
    """
    One-hot encode cluster labels

    @param labels: One label per row; a numpy array or a pandas Series
    @return: A float matrix of shape (number of labels, number of distinct labels)
    with a 1.0 in row r at the column of row r's label. Columns follow the sorted
    order of the distinct labels, so labels 0..k-1 map to columns 0..k-1.
    """
    flat = np.asarray(labels).ravel()
    # np.unique works for plain arrays (which have no .unique() method) and maps
    # each label to its column position, so labels need not be contiguous from 0.
    distinct, column = np.unique(flat, return_inverse=True)
    out = np.zeros((len(flat), len(distinct)))
    out[np.arange(len(flat)), column] = 1.0
    return out
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,7 @@ def explain_changes_in_average( | |
how: str = "totals", | ||
force_add_up: bool = False, | ||
constrain_signs: bool = True, | ||
cluster_values: bool = False, | ||
verbose: int = 0, | ||
): | ||
""" | ||
|
@@ -47,6 +48,8 @@ def explain_changes_in_average( | |
to the difference between dataset totals | ||
@param constrain_signs: Whether to constrain weights of segments to have the same | ||
sign as naive segment averages | ||
@param cluster_values: In addition to single-value slices, consider slices that consist of a | ||
group of segments from the same dimension with similar naive averages | ||
@param verbose: If set to a truish value, lots of debug info is printed to console | ||
@return: A fitted object | ||
""" | ||
|
@@ -85,6 +88,7 @@ def explain_changes_in_average( | |
how=how, | ||
force_add_up=force_add_up, | ||
constrain_signs=constrain_signs, | ||
cluster_values=cluster_values, | ||
verbose=verbose, | ||
) | ||
|
||
|
@@ -118,6 +122,7 @@ def explain_changes_in_totals( | |
how: str = "totals", | ||
force_add_up: bool = False, | ||
constrain_signs: bool = True, | ||
cluster_values: bool=False, | ||
verbose: int = 0, | ||
): | ||
""" | ||
|
@@ -140,6 +145,8 @@ def explain_changes_in_totals( | |
to the difference between dataset totals | ||
@param constrain_signs: Whether to constrain weights of segments to have the same | ||
sign as naive segment averages | ||
@param cluster_values: In addition to single-value slices, consider slices that consist of a | ||
group of segments from the same dimension with similar naive averages | ||
@param verbose: If set to a truish value, lots of debug info is printed to console | ||
@return: A fitted object | ||
""" | ||
|
@@ -180,6 +187,7 @@ def explain_changes_in_totals( | |
solver=solver, | ||
force_add_up=force_add_up, | ||
constrain_signs=constrain_signs, | ||
cluster_values=cluster_values, | ||
verbose=verbose, | ||
) | ||
|
||
|
@@ -194,20 +202,22 @@ def explain_changes_in_totals( | |
solver=solver, | ||
force_add_up=force_add_up, | ||
constrain_signs=constrain_signs, | ||
cluster_values=cluster_values, | ||
verbose=verbose, | ||
) | ||
|
||
sf_size.final_size = final_size | ||
sf_avg.final_size = final_size | ||
sp = SlicerPair(sf_size, sf_avg) | ||
sp.plot = ( | ||
lambda plot_is_static=False, width=2000, height=500: plot_split_segments( | ||
sp.s1, | ||
sp.s2, | ||
plot_is_static=plot_is_static, | ||
width=width, | ||
height=height, | ||
) | ||
sp.plot = lambda plot_is_static=False, width=2000, height=500, cluster_key_width=180, cluster_value_width=318: plot_split_segments( | ||
sp.s1, | ||
sp.s2, | ||
plot_is_static=plot_is_static, | ||
width=width, | ||
height=height, | ||
cluster_values=cluster_values, | ||
cluster_key_width=cluster_key_width, | ||
cluster_value_width=cluster_value_width | ||
) | ||
return sp | ||
|
||
|
@@ -226,14 +236,21 @@ def explain_changes_in_totals( | |
force_dim="Change from" if how == "force_dim" else None, | ||
force_add_up=force_add_up, | ||
constrain_signs=constrain_signs, | ||
cluster_values=cluster_values, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How about also adding the cluster_values parameter to explain_changes_in_average? Of course, we always call explain_changes_in_totals from there, but just in case someone wants to pass cluster_values=False. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nice catch, will fix |
||
verbose=verbose, | ||
) | ||
|
||
sf.pre_total = df1[total_name].sum() | ||
sf.post_total = df2[total_name].sum() | ||
|
||
sf.plot = lambda plot_is_static=False, width=1000, height=1000: plot_waterfall( | ||
sf, plot_is_static=plot_is_static, width=width, height=height | ||
sf.plot = lambda plot_is_static=False, width=1000, height=1000, cluster_key_width=180, cluster_value_width=318: plot_waterfall( | ||
sf, | ||
plot_is_static=plot_is_static, | ||
width=width, | ||
height=height, | ||
cluster_values=cluster_values, | ||
cluster_key_width=cluster_key_width, | ||
cluster_value_width=cluster_value_width | ||
) | ||
sf.task = "changes in totals" | ||
return sf | ||
|
@@ -252,6 +269,7 @@ def explain_levels( | |
verbose=0, | ||
force_add_up: bool = False, | ||
constrain_signs: bool = True, | ||
cluster_values: bool=False | ||
): | ||
""" | ||
Find segments whose average is most different from the global one | ||
|
@@ -266,8 +284,9 @@ def explain_levels( | |
@param solver: If this equals to "lp" uses the LP solver, else uses the (recommended) Lasso solver | ||
@param verbose: If set to a truish value, lots of debug info is printed to console | ||
@param force_add_up: Force the contributions of chosen segments to add up to zero | ||
@param constrain_signs: Whether to constrain weights of segments to have the same | ||
sign as naive segment averages | ||
@param constrain_signs: Whether to constrain weights of segments to have the same sign as naive segment averages | ||
@param cluster_values: In addition to single-value slices, consider slices that consist of a | ||
group of segments from the same dimension with similar naive averages | ||
@return: A fitted object | ||
""" | ||
df = copy.copy(df) | ||
|
@@ -297,15 +316,23 @@ def explain_levels( | |
verbose=verbose, | ||
force_add_up=force_add_up, | ||
constrain_signs=constrain_signs, | ||
cluster_values=cluster_values | ||
) | ||
|
||
for s in sf.segments: | ||
s["naive_avg"] += average | ||
s["total"] += average * s["seg_size"] | ||
# print(average) | ||
sf.reg.intercept_ = average | ||
sf.plot = lambda plot_is_static=False, width=2000, height=500, return_fig=False: plot_segments( | ||
sf, plot_is_static=plot_is_static, width=width, height=height, return_fig=return_fig | ||
sf.plot = lambda plot_is_static=False, width=2000, height=500, return_fig=False, cluster_key_width=180, cluster_value_width=318: plot_segments( | ||
sf, | ||
plot_is_static=plot_is_static, | ||
width=width, | ||
height=height, | ||
return_fig=return_fig, | ||
cluster_values=cluster_values, | ||
cluster_key_width=cluster_key_width, | ||
cluster_value_width=cluster_value_width | ||
) | ||
sf.task = "levels" | ||
return sf |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is unlikely to cause problems, but best_n is only defined inside the if statement; I suggest adding best_n = None before the "for" loop.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will do