From b5ff932dedb4ef36662d1743f04f45d003cb7308 Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Mon, 6 May 2024 10:38:35 +0100 Subject: [PATCH] Refactor for tree solver, all tests pass --- wise_pizza/cluster.py | 41 ++++++++++++++++++++++++++++++++++++++--- wise_pizza/slicer.py | 30 +++++------------------------- 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/wise_pizza/cluster.py b/wise_pizza/cluster.py index f4e185f..d8e3251 100644 --- a/wise_pizza/cluster.py +++ b/wise_pizza/cluster.py @@ -1,3 +1,5 @@ +from typing import List + import numpy as np import pandas as pd from sklearn.preprocessing import PowerTransformer @@ -19,16 +21,26 @@ def guided_kmeans(X: np.ndarray, power_transform: bool = True) -> np.ndarray: if power_transform: if len(X[X > 0] > 1): - X[X > 0] = PowerTransformer(standardize=False).fit_transform(X[X > 0].reshape(-1, 1)).reshape(-1) + X[X > 0] = ( + PowerTransformer(standardize=False) + .fit_transform(X[X > 0].reshape(-1, 1)) + .reshape(-1) + ) if len(X[X < 0] > 1): - X[X < 0] = -PowerTransformer(standardize=False).fit_transform(-X[X < 0].reshape(-1, 1)).reshape(-1) + X[X < 0] = ( + -PowerTransformer(standardize=False) + .fit_transform(-X[X < 0].reshape(-1, 1)) + .reshape(-1) + ) best_score = -1 best_labels = None best_n = -1 # If we allow 2 clusters, it almost always just splits positive vs negative - boring! for n_clusters in range(3, int(len(X) / 2) + 1): - cluster_labels = KMeans(n_clusters=n_clusters, init="k-means++", n_init=10).fit_predict(X) + cluster_labels = KMeans( + n_clusters=n_clusters, init="k-means++", n_init=10 + ).fit_predict(X) score = silhouette_score(X, cluster_labels) # print(n_clusters, score) if score > best_score: @@ -45,3 +57,26 @@ def to_matrix(labels: np.ndarray) -> np.ndarray: for i in labels.unique(): out[labels == i, i] = 1.0 return out + + +def make_clusters(dim_df: pd.DataFrame, dims: List[str]): + cluster_names = {} + for dim in dims: + if len(dim_df[dim].unique()) >= 6: # otherwise what's the point in clustering? + grouped_df = ( + dim_df[[dim, "totals", "weights"]].groupby(dim, as_index=False).sum() + ) + grouped_df["avg"] = grouped_df["totals"] / grouped_df["weights"] + grouped_df["cluster"], _ = guided_kmeans(grouped_df["avg"]) + pre_clusters = ( + grouped_df[["cluster", dim]] + .groupby("cluster") + .agg({dim: lambda x: "@@".join(x)}) + .values + ) + # filter out clusters with only one element + these_clusters = [c for c in pre_clusters.reshape(-1) if "@@" in c] + # create short cluster names + for i, c in enumerate(these_clusters): + cluster_names[f"{dim}_cluster_{i + 1}"] = c + return cluster_names diff --git a/wise_pizza/slicer.py b/wise_pizza/slicer.py index efbf8ea..1495b67 100644 --- a/wise_pizza/slicer.py +++ b/wise_pizza/slicer.py @@ -10,7 +10,7 @@ from wise_pizza.solve.find_alpha import clean_up_min_max, find_alpha from wise_pizza.make_matrix import sparse_dummy_matrix -from wise_pizza.cluster import guided_kmeans +from wise_pizza.cluster import make_clusters from wise_pizza.preselect import HeuristicSelector from wise_pizza.time import extend_dataframe from wise_pizza.slicer_facades import SliceFinderPredictFacade @@ -192,31 +192,11 @@ def fit( cluster_values = False if cluster_values: + self.cluster_names = make_clusters(dim_df, dims) for dim in dims: - if ( - len(dim_df[dim].unique()) >= 6 - ): # otherwise what's the point in clustering? - grouped_df = ( - dim_df[[dim, "totals", "weights"]] - .groupby(dim, as_index=False) - .sum() - ) - grouped_df["avg"] = grouped_df["totals"] / grouped_df["weights"] - grouped_df["cluster"], _ = guided_kmeans(grouped_df["avg"]) - pre_clusters = ( - grouped_df[["cluster", dim]] - .groupby("cluster") - .agg({dim: lambda x: "@@".join(x)}) - .values - ) - # filter out clusters with only one element - these_clusters = [c for c in pre_clusters.reshape(-1) if "@@" in c] - # create short cluster names - for i, c in enumerate(these_clusters): - self.cluster_names[f"{dim}_cluster_{i+1}"] = c - clusters[dim] = [ - c for c in self.cluster_names.keys() if c.startswith(dim) - ] + clusters[dim] = [ + c for c in self.cluster_names.keys() if c.startswith(dim) + ] dim_df = dim_df[dims] # if time_col is None else dims + ["__time"]] self.dim_df = dim_df