A tree-based (non-overlapping) solver #46

Merged: 10 commits, May 7, 2024
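In brief: this PR adds a "tree" option to the solver argument, which builds non-overlapping segments. A minimal usage sketch, modeled on the new test below (the synthetic data frame and the top-level import path are assumptions, not part of this diff):

```python
import numpy as np
import pandas as pd

from wise_pizza import explain_levels  # import path is an assumption

# Hand-made synthetic data with two dimensions, totals, and weights
rng = np.random.default_rng(0)
df = pd.DataFrame(
    {
        "dim0": rng.integers(0, 5, 1000).astype(str),
        "dim1": rng.integers(0, 5, 1000).astype(str),
        "totals": rng.normal(10.0, 1.0, 1000),
        "weights": np.ones(1000),
    }
)

sf = explain_levels(
    df,
    dims=["dim0", "dim1"],
    total_name="totals",
    size_name="weights",
    max_depth=2,
    min_segments=5,
    solver="tree",  # new in this PR: tree-based, non-overlapping segments
)
for s in sf.segments:
    print(s)
```
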
45 changes: 39 additions & 6 deletions tests/test_fit.py
@@ -14,7 +14,7 @@
     explain_timeseries,
 )
 from wise_pizza.segment_data import SegmentData
-from wise_pizza.solver import solve_lasso, solve_lp
+from wise_pizza.solve.solver import solve_lasso, solve_lp
 from wise_pizza.time import create_time_basis
 from wise_pizza.plotting_time import plot_time

@@ -33,7 +33,7 @@
 # Too long, delete some values for quick starts, e.g. by deleting the parameters in nan_percent, size_one_percent
 deltas_test_values = [
     ("totals", "split_fits", "force_dim", "extra_dim"),  # how
-    ("lp", "lasso"),  # solver
+    ("lp", "lasso", "tree"),  # solver
     (True,),  # plot_is_static
     (explain_changes_in_average, explain_changes_in_totals),  # function
     (0.0, 90.0),  # nan_percent
@@ -44,7 +44,7 @@

 # possible values for explain_levels
 levels_test_values = [
-    ("lp", "lasso"),  # solver
+    ("lp", "lasso", "tree"),  # solver
     (0.0, 90.0),  # nan_percent
     (0.0, 90.0),  # size_one_percent
 ]
@@ -136,9 +136,9 @@ def test_categorical():
     print("yay!")


-@pytest.mark.parametrize("nan_percent", [0.0, 1.0])
-def test_synthetic_template(nan_percent: float):
-    all_data = synthetic_data(init_len=1000)
+@pytest.mark.parametrize("nan_percent, clustering", [[0.0, False], [1.0, False]])
+def test_synthetic_template(nan_percent: float, clustering: bool):
+    all_data = synthetic_data(init_len=10000, dim_values=5)
     data = all_data.data

     data.loc[(data["dim0"] == 0) & (data["dim1"] == 1), "totals"] += 100
@@ -155,6 +155,7 @@ def test_synthetic_template(nan_percent: float):
         min_segments=5,
         verbose=1,
         solver="lp",
+        cluster_values=clustering,
     )
     print("***")
     for s in sf.segments:
@@ -167,6 +168,38 @@
     print("yay!")


+@pytest.mark.parametrize("nan_percent", [0.0, 1.0])
+def test_synthetic_template_tree(nan_percent: float):
+    all_data = synthetic_data(init_len=1000)
+    data = all_data.data
+
+    data.loc[(data["dim0"] == 0) & (data["dim1"] == 1), "totals"] += 200
+    data.loc[(data["dim1"] == 0) & (data["dim2"] == 1), "totals"] += 300
+
+    if nan_percent > 0:
+        data = values_to_nan(data, nan_percent)
+    sf = explain_levels(
+        data,
+        dims=all_data.dimensions,
+        total_name=all_data.segment_total,
+        size_name=all_data.segment_size,
+        max_depth=2,
+        min_segments=5,
+        verbose=1,
+        solver="tree",
+    )
+    print("***")
+    for s in sf.segments:
+        print(s)
+
+    # TODO: insert appropriate asserts
+    # assert abs(sf.segments[0]["coef"] - 300) < 2
+    # assert abs(sf.segments[1]["coef"] - 100) < 2
+
+    # sf.plot()
+    print("yay!")
+
+
 @pytest.mark.parametrize("nan_percent", [0.0, 1.0])
 def test_synthetic_ts_template(nan_percent: float):
     all_data = synthetic_ts_data(init_len=10000)
75 changes: 70 additions & 5 deletions wise_pizza/cluster.py
@@ -1,3 +1,6 @@
+from typing import List, Dict, Tuple
+from collections import defaultdict
+
 import numpy as np
 import pandas as pd
 from sklearn.preprocessing import PowerTransformer
@@ -18,17 +21,27 @@ def guided_kmeans(X: np.ndarray, power_transform: bool = True) -> np.ndarray:
         X = X.values

     if power_transform:
-        if len(X[X > 0] > 1):
-            X[X > 0] = PowerTransformer(standardize=False).fit_transform(X[X > 0].reshape(-1, 1)).reshape(-1)
-        if len(X[X < 0] > 1):
-            X[X < 0] = -PowerTransformer(standardize=False).fit_transform(-X[X < 0].reshape(-1, 1)).reshape(-1)
+        if len(X[X > 0]) > 1:
+            X[X > 0] = (
+                PowerTransformer(standardize=False)
+                .fit_transform(X[X > 0].reshape(-1, 1))
+                .reshape(-1)
+            )
+        if len(X[X < 0]) > 1:
+            X[X < 0] = (
+                -PowerTransformer(standardize=False)
+                .fit_transform(-X[X < 0].reshape(-1, 1))
+                .reshape(-1)
+            )

     best_score = -1
     best_labels = None
     best_n = -1
     # If we allow 2 clusters, it almost always just splits positive vs negative - boring!
     for n_clusters in range(3, int(len(X) / 2) + 1):
-        cluster_labels = KMeans(n_clusters=n_clusters, init="k-means++", n_init=10).fit_predict(X)
+        cluster_labels = KMeans(
+            n_clusters=n_clusters, init="k-means++", n_init=10
+        ).fit_predict(X)
         score = silhouette_score(X, cluster_labels)
         # print(n_clusters, score)
         if score > best_score:
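
Worth noting on the hunk above: besides the black-style reformatting, it fixes an operator-precedence bug. In the old code the closing parenthesis sat in the wrong place, so `len(X[X > 0] > 1)` took the length of a boolean comparison array, which is truthy whenever any positive value exists; the intent was to require more than one positive value before fitting the PowerTransformer. A tiny sketch of the difference:

```python
import numpy as np

X = np.array([-2.0, 5.0])  # exactly one positive entry

# Old check: len() wraps the comparison, so it just counts positive entries
# (1 here, which is truthy), and the transform would run on a single sample.
print(len(X[X > 0] > 1))   # -> 1

# Fixed check: compare the count of positive entries against 1.
print(len(X[X > 0]) > 1)   # -> False, the transform is correctly skipped
```
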
@@ -45,3 +58,55 @@ def to_matrix(labels: np.ndarray) -> np.ndarray:
     for i in labels.unique():
         out[labels == i, i] = 1.0
     return out
+
+
+def make_clusters(dim_df: pd.DataFrame, dims: List[str]):
+    cluster_names = {}
+    for dim in dims:
+        if len(dim_df[dim].unique()) >= 6:  # otherwise what's the point in clustering?
+            grouped_df = (
+                dim_df[[dim, "totals", "weights"]].groupby(dim, as_index=False).sum()
+            )
+            grouped_df["avg"] = grouped_df["totals"] / grouped_df["weights"]
+            grouped_df["cluster"], _ = guided_kmeans(grouped_df["avg"])
+            pre_clusters = (
+                grouped_df[["cluster", dim]]
+                .groupby("cluster")
+                .agg({dim: lambda x: "@@".join(x)})
+                .values
+            )
+            # filter out clusters with only one element
+            these_clusters = [c for c in pre_clusters.reshape(-1) if "@@" in c]
+            # create short cluster names
+            for i, c in enumerate(these_clusters):
+                cluster_names[f"{dim}_cluster_{i + 1}"] = c
+    return cluster_names
+
+
+def nice_cluster_names(x: List[Dict[str, List[str]]]) -> Tuple[List[Dict], Dict]:
+    # first pass just populate cluster names
+    cluster_strings = defaultdict(set)
+    for xx in x:
+        for dim, v in xx.items():
+            if len(v) > 1:
+                cluster_strings[dim].add("@@".join(v))
+
+    cluster_names = {}
+    reverse_cluster_names = {}
+    for dim, clusters in cluster_strings.items():
+        reverse_cluster_names[dim] = {}
+        for i, c in enumerate(clusters):
+            cluster_names[f"{dim}_cluster_{i + 1}"] = c
+            reverse_cluster_names[dim][c] = f"{dim}_cluster_{i + 1}"
+
+    col_defs = []
+    for xx in x:
+        this_def = {}
+        for dim, v in xx.items():
+            if len(v) > 1:
+                this_def[dim] = reverse_cluster_names[dim]["@@".join(v)]
+            else:
+                this_def[dim] = v[0]
+        col_defs.append(this_def)
+
+    return col_defs, cluster_names
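
To illustrate the new helpers, a small sketch of `nice_cluster_names` on hand-made input (the import path follows this diff; dimension names and values are illustrative):

```python
from wise_pizza.cluster import nice_cluster_names

# Each dict maps a dimension to the values a segment covers; multi-value
# entries are replaced by a short generated cluster name.
segments = [
    {"region": ["DE", "FR"], "channel": ["web"]},
    {"region": ["US"]},
]
col_defs, cluster_names = nice_cluster_names(segments)
print(col_defs)
# [{'region': 'region_cluster_1', 'channel': 'web'}, {'region': 'US'}]
print(cluster_names)
# {'region_cluster_1': 'DE@@FR'}
```
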
9 changes: 8 additions & 1 deletion wise_pizza/explain.py
@@ -361,6 +361,7 @@ def explain_timeseries(
     max_depth: int = 2,
     solver: str = "omp",
     verbose: bool = False,
+    constrain_signs: bool = False,
     cluster_values: bool = False,
     time_basis: Optional[pd.DataFrame] = None,
     fit_log_space: bool = False,
@@ -388,7 +389,10 @@
         fit_sizes = True

     if fit_log_space:
-        tf = LogTransform(offset=1, weight_pow_sc=log_space_weight_sc)
+        tf = LogTransform(
+            offset=1,
+            weight_pow_sc=log_space_weight_sc,
+        )
     else:
         tf = IdentityTransform()

@@ -415,6 +419,7 @@
         max_depth=max_depth,
         solver=solver,
         verbose=verbose,
+        constrain_signs=constrain_signs,
         cluster_values=cluster_values,
         time_basis=time_basis,
     )
@@ -441,6 +446,7 @@
         max_depth=max_depth,
         solver=solver,
         verbose=verbose,
+        constrain_signs=constrain_signs,
         cluster_values=cluster_values,
         time_basis=time_basis,
     )
@@ -477,6 +483,7 @@
         max_depth=max_depth,
         solver=solver,
         verbose=verbose,
+        constrain_signs=constrain_signs,
         cluster_values=cluster_values,
         time_basis=time_basis,
     )
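
The net effect of this file's changes is that `constrain_signs` is now accepted by `explain_timeseries` and forwarded to each of the three inner fit calls. A hedged call sketch (only the keyword arguments visible in this diff are taken from the source; the data frame and the dims/total_name/size_name arguments are assumptions modeled on the tests):

```python
sf = explain_timeseries(
    df,                      # a pandas DataFrame with totals, weights, and a time column
    dims=["dim0", "dim1"],   # illustrative
    total_name="totals",     # illustrative
    size_name="weights",     # illustrative
    max_depth=2,
    solver="omp",
    constrain_signs=True,    # newly exposed and forwarded to the inner fits
    cluster_values=False,
    fit_log_space=False,
)
```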