Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

A tree-based (non-overlapping) solver #46

Merged
merged 10 commits into from
May 7, 2024
Merged
41 changes: 37 additions & 4 deletions tests/test_fit.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
explain_timeseries,
)
from wise_pizza.segment_data import SegmentData
from wise_pizza.solver import solve_lasso, solve_lp
from wise_pizza.solve.solver import solve_lasso, solve_lp
from wise_pizza.time import create_time_basis
from wise_pizza.plotting_time import plot_time

Expand Down Expand Up @@ -136,9 +136,9 @@ def test_categorical():
print("yay!")


@pytest.mark.parametrize("nan_percent", [0.0, 1.0])
def test_synthetic_template(nan_percent: float):
all_data = synthetic_data(init_len=1000)
@pytest.mark.parametrize("nan_percent, clustering", [[0.0, False], [1.0, False]])
def test_synthetic_template(nan_percent: float, clustering: bool):
all_data = synthetic_data(init_len=10000, dim_values=5)
data = all_data.data

data.loc[(data["dim0"] == 0) & (data["dim1"] == 1), "totals"] += 100
Expand All @@ -155,6 +155,7 @@ def test_synthetic_template(nan_percent: float):
min_segments=5,
verbose=1,
solver="lp",
cluster_values=clustering,
)
print("***")
for s in sf.segments:
Expand All @@ -167,6 +168,38 @@ def test_synthetic_template(nan_percent: float):
print("yay!")


@pytest.mark.parametrize("nan_percent", [0.0, 1.0])
def test_synthetic_template_tree(nan_percent: float):
    """Smoke-test ``explain_levels`` with ``solver="tree"`` on synthetic data.

    Two offsets are added to ``totals`` on known dimension combinations,
    optionally some values are replaced by NaN, and the fitted segments are
    printed. No assertions are made yet (see the TODO below).

    @param nan_percent: Passed to ``values_to_nan`` when greater than zero
    """
    synth = synthetic_data(init_len=1000)
    frame = synth.data

    # Inject two known effects so the solver has something to find
    first_effect = (frame["dim0"] == 0) & (frame["dim1"] == 1)
    second_effect = (frame["dim1"] == 0) & (frame["dim2"] == 1)
    frame.loc[first_effect, "totals"] += 200
    frame.loc[second_effect, "totals"] += 300

    if nan_percent > 0:
        frame = values_to_nan(frame, nan_percent)

    sf = explain_levels(
        frame,
        dims=synth.dimensions,
        total_name=synth.segment_total,
        size_name=synth.segment_size,
        max_depth=2,
        min_segments=5,
        verbose=1,
        solver="tree",
    )

    print("***")
    for segment in sf.segments:
        print(segment)

    # TODO: insert appropriate asserts
    # NOTE(review): the draft checks below expect 300 and 100, while this test
    # injects 200 and 300 - reconcile the values before enabling them.
    # assert abs(sf.segments[0]["coef"] - 300) < 2
    # assert abs(sf.segments[1]["coef"] - 100) < 2

    # sf.plot()
    print("yay!")


@pytest.mark.parametrize("nan_percent", [0.0, 1.0])
def test_synthetic_ts_template(nan_percent: float):
all_data = synthetic_ts_data(init_len=10000)
Expand Down
75 changes: 70 additions & 5 deletions wise_pizza/cluster.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from typing import List, Dict, Tuple
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer
Expand All @@ -18,17 +21,27 @@ def guided_kmeans(X: np.ndarray, power_transform: bool = True) -> np.ndarray:
X = X.values

if power_transform:
if len(X[X > 0] > 1):
X[X > 0] = PowerTransformer(standardize=False).fit_transform(X[X > 0].reshape(-1, 1)).reshape(-1)
if len(X[X < 0] > 1):
X[X < 0] = -PowerTransformer(standardize=False).fit_transform(-X[X < 0].reshape(-1, 1)).reshape(-1)
if len(X[X > 0]) > 1:
X[X > 0] = (
PowerTransformer(standardize=False)
.fit_transform(X[X > 0].reshape(-1, 1))
.reshape(-1)
)
if len(X[X < 0]) > 1:
X[X < 0] = (
-PowerTransformer(standardize=False)
.fit_transform(-X[X < 0].reshape(-1, 1))
.reshape(-1)
)

best_score = -1
best_labels = None
best_n = -1
# If we allow 2 clusters, it almost always just splits positive vs negative - boring!
for n_clusters in range(3, int(len(X) / 2) + 1):
cluster_labels = KMeans(n_clusters=n_clusters, init="k-means++", n_init=10).fit_predict(X)
cluster_labels = KMeans(
n_clusters=n_clusters, init="k-means++", n_init=10
).fit_predict(X)
score = silhouette_score(X, cluster_labels)
# print(n_clusters, score)
if score > best_score:
Expand All @@ -45,3 +58,55 @@ def to_matrix(labels: np.ndarray) -> np.ndarray:
for i in labels.unique():
out[labels == i, i] = 1.0
return out


def make_clusters(dim_df: pd.DataFrame, dims: List[str]):
    """Group similar-looking values of each dimension into named clusters.

    For every dimension in ``dims`` with at least six distinct values, the
    ``totals`` and ``weights`` columns are summed per dimension value, the
    ratio totals/weights is passed to ``guided_kmeans``, and the values that
    share a cluster label are joined with ``"@@"``.

    @param dim_df: Frame with the dimension columns plus ``totals`` and ``weights``
    @param dims: Names of the dimension columns to consider
    @return: Dict mapping ``"<dim>_cluster_<k>"`` (k counted from 1 within each
    dimension) to the ``"@@"``-joined member values; clusters that contain a
    single value are left out
    """

    def _join_members(members):
        # Cluster members are encoded as one "@@"-separated string
        return "@@".join(members)

    names_by_cluster = {}
    for dim in dims:
        # With fewer than six distinct values there is no point in clustering
        if len(dim_df[dim].unique()) < 6:
            continue

        per_value = (
            dim_df[[dim, "totals", "weights"]].groupby(dim, as_index=False).sum()
        )
        per_value["avg"] = per_value["totals"] / per_value["weights"]
        labels, _ = guided_kmeans(per_value["avg"])
        per_value["cluster"] = labels

        joined = (
            per_value[["cluster", dim]]
            .groupby("cluster")
            .agg({dim: _join_members})
            .to_numpy()
            .ravel()
        )
        # A joined string without the separator is a one-element cluster: drop it
        multi_member = [members for members in joined if "@@" in members]

        # Short names are numbered from 1 within each dimension
        names_by_cluster.update(
            {
                f"{dim}_cluster_{k}": members
                for k, members in enumerate(multi_member, start=1)
            }
        )
    return names_by_cluster


def nice_cluster_names(x: List[Dict[str, List[str]]]) -> Tuple[List[Dict], Dict]:
    """Replace multi-value dimension entries with short, reproducible cluster names.

    Each element of ``x`` maps a dimension name to the list of values it
    covers. A list with more than one value is treated as a cluster: its
    values are joined with ``"@@"`` and the cluster gets the short name
    ``"<dim>_cluster_<k>"``. Numbering starts at 1 within each dimension and
    follows the order in which distinct clusters are first seen in ``x``, so
    the same input always produces the same names.

    @param x: Segment definitions, each a dict of dimension name -> list of values
    @return: Tuple ``(col_defs, cluster_names)``. ``col_defs`` mirrors ``x``
    with every value list replaced by either its single value or its short
    cluster name; ``cluster_names`` maps each short name to the
    ``"@@"``-joined values of that cluster
    """
    # First pass: collect the distinct cluster strings per dimension.
    # A dict is used as an insertion-ordered set; iterating a plain set of
    # strings follows hash order, which varies between interpreter runs and
    # would make the numbering below non-reproducible.
    cluster_strings: Dict[str, Dict[str, None]] = defaultdict(dict)
    for xx in x:
        for dim, v in xx.items():
            if len(v) > 1:
                cluster_strings[dim]["@@".join(v)] = None

    # Second pass: assign short names, keeping both lookup directions.
    cluster_names: Dict[str, str] = {}
    reverse_cluster_names: Dict[str, Dict[str, str]] = {}
    for dim, clusters in cluster_strings.items():
        reverse_cluster_names[dim] = {}
        for i, c in enumerate(clusters, start=1):
            short_name = f"{dim}_cluster_{i}"
            cluster_names[short_name] = c
            reverse_cluster_names[dim][c] = short_name

    # Third pass: rewrite each definition using the short names.
    col_defs = []
    for xx in x:
        this_def = {}
        for dim, v in xx.items():
            if len(v) > 1:
                this_def[dim] = reverse_cluster_names[dim]["@@".join(v)]
            else:
                # Single-value entries are kept as the bare value
                this_def[dim] = v[0]
        col_defs.append(this_def)

    return col_defs, cluster_names
9 changes: 8 additions & 1 deletion wise_pizza/explain.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,7 @@ def explain_timeseries(
max_depth: int = 2,
solver: str = "omp",
verbose: bool = False,
constrain_signs: bool = False,
cluster_values: bool = False,
time_basis: Optional[pd.DataFrame] = None,
fit_log_space: bool = False,
Expand Down Expand Up @@ -388,7 +389,10 @@ def explain_timeseries(
fit_sizes = True

if fit_log_space:
tf = LogTransform(offset=1, weight_pow_sc=log_space_weight_sc)
tf = LogTransform(
offset=1,
weight_pow_sc=log_space_weight_sc,
)
else:
tf = IdentityTransform()

Expand All @@ -415,6 +419,7 @@ def explain_timeseries(
max_depth=max_depth,
solver=solver,
verbose=verbose,
constrain_signs=constrain_signs,
cluster_values=cluster_values,
time_basis=time_basis,
)
Expand All @@ -441,6 +446,7 @@ def explain_timeseries(
max_depth=max_depth,
solver=solver,
verbose=verbose,
constrain_signs=constrain_signs,
cluster_values=cluster_values,
time_basis=time_basis,
)
Expand Down Expand Up @@ -477,6 +483,7 @@ def explain_timeseries(
max_depth=max_depth,
solver=solver,
verbose=verbose,
constrain_signs=constrain_signs,
cluster_values=cluster_values,
time_basis=time_basis,
)
Expand Down
133 changes: 72 additions & 61 deletions wise_pizza/slicer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@
import pandas as pd
from scipy.sparse import csc_matrix, diags

from wise_pizza.find_alpha import clean_up_min_max, find_alpha
from wise_pizza.solve.find_alpha import clean_up_min_max, find_alpha
from wise_pizza.make_matrix import sparse_dummy_matrix
from wise_pizza.cluster import guided_kmeans
from wise_pizza.cluster import make_clusters
from wise_pizza.preselect import HeuristicSelector
from wise_pizza.time import extend_dataframe
from wise_pizza.slicer_facades import SliceFinderPredictFacade
from wise_pizza.solve.tree import tree_solver
from wise_pizza.solve.solver import solve_lasso


def _summary(obj) -> str:
Expand Down Expand Up @@ -116,7 +118,7 @@ def fit(
@param max_segments: Maximum number of segments to find, defaults to min_segments
@param min_depth: Minimum number of dimension to constrain in segment definition
@param max_depth: Maximum number of dimension to constrain in segment definition
@param solver: If this equals to "lp" uses the LP solver, else uses the (recommended) Lasso solver
@param solver: Valid values are "lasso" (default), "tree" (for non-overlapping segments), "omp", or "lp"
@param verbose: If set to a truish value, lots of debug info is printed to console
@param force_dim: To add dim
@param force_add_up: To force add up
Expand All @@ -125,6 +127,8 @@ def fit(
group of segments from the same dimension with similar naive averages

"""

assert solver.lower() in ["lasso", "tree", "omp", "lp"]
min_segments, max_segments = clean_up_min_max(min_segments, max_segments)
if verbose is not None:
self.verbose = verbose
Expand All @@ -139,12 +143,16 @@ def fit(
assert min(weights) >= 0
assert np.sum(np.abs(totals[weights == 0])) == 0

# Cast all dimension values to strings
dim_df = dim_df.astype(str)

dims = list(dim_df.columns)
# sort the dataframe by dimension values,
# making sure the other vectors stay aligned
dim_df = dim_df.reset_index(drop=True)
dim_df["totals"] = totals
dim_df["weights"] = weights

if time_col is not None:
dim_df["__time"] = time_col
dim_df = pd.merge(dim_df, time_basis, left_on="__time", right_index=True)
Expand Down Expand Up @@ -176,70 +184,73 @@ def fit(
# of dimension values with similar outcomes
clusters = defaultdict(list)
self.cluster_names = {}
if cluster_values:
for dim in dims:
if (
len(dim_df[dim].unique()) >= 6
): # otherwise what's the point in clustering?
grouped_df = (
dim_df[[dim, "totals", "weights"]]
.groupby(dim, as_index=False)
.sum()
)
grouped_df["avg"] = grouped_df["totals"] / grouped_df["weights"]
grouped_df["cluster"], _ = guided_kmeans(grouped_df["avg"])
pre_clusters = (
grouped_df[["cluster", dim]]
.groupby("cluster")
.agg({dim: lambda x: "@@".join(x)})
.values
)
# filter out clusters with only one element
these_clusters = [c for c in pre_clusters.reshape(-1) if "@@" in c]
# create short cluster names
for i, c in enumerate(these_clusters):
self.cluster_names[f"{dim}_cluster_{i+1}"] = c

if solver == "tree":
if cluster_values:
warnings.warn(
"Ignoring cluster_values argument as tree solver makes its own clusters"
)
self.X, self.col_defs, self.cluster_names = tree_solver(
dim_df=dim_df,
dims=dims,
time_basis=self.time_basis,
num_leaves=max_segments,
)
self.nonzeros = np.array(range(self.X.shape[1]))
Xw = csc_matrix(diags(self.weights) @ self.X)
self.reg = solve_lasso(
Xw.toarray(),
self.totals,
alpha=1e-5,
verbose=self.verbose,
fit_intercept=False,
)
print("")
else:
if cluster_values:
self.cluster_names = make_clusters(dim_df, dims)
for dim in dims:
clusters[dim] = [
c for c in self.cluster_names.keys() if c.startswith(dim)
]

dim_df = dim_df[dims] # if time_col is None else dims + ["__time"]]
self.dim_df = dim_df

# lazy calculation of the dummy matrix (calculation can be very slow)
if (
list(dim_df.columns) != self.dims
or max_depth != self.max_depth
or self.X is not None
and len(dim_df) != self.X.shape[1]
):
self.X, self.col_defs = self._init_mat(
dim_df,
min_depth,
max_depth,
force_dim=force_dim,
clusters=clusters,
time_basis=self.time_basis,
dim_df = dim_df[dims] # if time_col is None else dims + ["__time"]]
self.dim_df = dim_df
# lazy calculation of the dummy matrix (calculation can be very slow)
if (
list(dim_df.columns) != self.dims
or max_depth != self.max_depth
or self.X is not None
and len(dim_df) != self.X.shape[1]
):
self.X, self.col_defs = self._init_mat(
dim_df,
min_depth,
max_depth,
force_dim=force_dim,
clusters=clusters,
time_basis=self.time_basis,
)
assert len(self.col_defs) == self.X.shape[1]
self.min_depth = min_depth
self.max_depth = max_depth
self.dims = list(dim_df.columns)

Xw = csc_matrix(diags(self.weights) @ self.X)

if self.verbose:
print("Starting solve!")
self.reg, self.nonzeros = find_alpha(
Xw,
self.totals,
max_nonzeros=max_segments,
solver=solver,
min_nonzeros=min_segments,
verbose=self.verbose,
adding_up_regularizer=force_add_up,
constrain_signs=constrain_signs,
)
assert len(self.col_defs) == self.X.shape[1]
self.min_depth = min_depth
self.max_depth = max_depth
self.dims = list(dim_df.columns)

Xw = csc_matrix(diags(self.weights) @ self.X)

if self.verbose:
print("Starting solve!")
self.reg, self.nonzeros = find_alpha(
Xw,
self.totals,
max_nonzeros=max_segments,
solver=solver,
min_nonzeros=min_segments,
verbose=self.verbose,
adding_up_regularizer=force_add_up,
constrain_signs=constrain_signs,
)
if self.verbose:
print("Solver done!!")

Expand Down
Empty file added wise_pizza/solve/__init__.py
Empty file.
Loading
Loading