From 56fd3adb712a3b30362e8485e5bb227895d20897 Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Mon, 6 May 2024 14:13:39 +0100 Subject: [PATCH] An apparently decent cut of tree (non-overlapping) solver for wise-pizza --- tests/test_fit.py | 39 +++++++++++- wise_pizza/cluster.py | 36 ++++++++++- wise_pizza/slicer.py | 43 +++++++------ wise_pizza/solve/fitter.py | 40 ++++++++++++ wise_pizza/solve/tree.py | 86 +++++++++++++++----------- wise_pizza/solve/weighted_quantiles.py | 18 ++++-- 6 files changed, 198 insertions(+), 64 deletions(-) create mode 100644 wise_pizza/solve/fitter.py diff --git a/tests/test_fit.py b/tests/test_fit.py index 67354fc..6a175be 100644 --- a/tests/test_fit.py +++ b/tests/test_fit.py @@ -136,9 +136,9 @@ def test_categorical(): print("yay!") -@pytest.mark.parametrize("nan_percent", [0.0, 1.0]) -def test_synthetic_template(nan_percent: float): - all_data = synthetic_data(init_len=1000) +@pytest.mark.parametrize("nan_percent, clustering", [[0.0, False], [1.0, False]]) +def test_synthetic_template(nan_percent: float, clustering: bool): + all_data = synthetic_data(init_len=10000, dim_values=5) data = all_data.data data.loc[(data["dim0"] == 0) & (data["dim1"] == 1), "totals"] += 100 @@ -155,6 +155,7 @@ def test_synthetic_template(nan_percent: float): min_segments=5, verbose=1, solver="lp", + cluster_values=clustering, ) print("***") for s in sf.segments: @@ -167,6 +168,38 @@ def test_synthetic_template(nan_percent: float): print("yay!") +@pytest.mark.parametrize("nan_percent", [0.0, 1.0]) +def test_synthetic_template_tree(nan_percent: float): + all_data = synthetic_data(init_len=1000) + data = all_data.data + + data.loc[(data["dim0"] == 0) & (data["dim1"] == 1), "totals"] += 200 + data.loc[(data["dim1"] == 0) & (data["dim2"] == 1), "totals"] += 300 + + if nan_percent > 0: + data = values_to_nan(data, nan_percent) + sf = explain_levels( + data, + dims=all_data.dimensions, + total_name=all_data.segment_total, + size_name=all_data.segment_size, + max_depth=2, + min_segments=5, + verbose=1, + solver="tree", + ) + print("***") + for s in sf.segments: + print(s) + + # TODO: insert approppriate asserts + # assert abs(sf.segments[0]["coef"] - 300) < 2 + # assert abs(sf.segments[1]["coef"] - 100) < 2 + + # sf.plot() + print("yay!") + + @pytest.mark.parametrize("nan_percent", [0.0, 1.0]) def test_synthetic_ts_template(nan_percent: float): all_data = synthetic_ts_data(init_len=10000) diff --git a/wise_pizza/cluster.py b/wise_pizza/cluster.py index d8e3251..6090af7 100644 --- a/wise_pizza/cluster.py +++ b/wise_pizza/cluster.py @@ -1,4 +1,5 @@ -from typing import List +from typing import List, Dict, Tuple +from collections import defaultdict import numpy as np import pandas as pd @@ -20,13 +21,13 @@ def guided_kmeans(X: np.ndarray, power_transform: bool = True) -> np.ndarray: X = X.values if power_transform: - if len(X[X > 0] > 1): + if len(X[X > 0]) > 1: X[X > 0] = ( PowerTransformer(standardize=False) .fit_transform(X[X > 0].reshape(-1, 1)) .reshape(-1) ) - if len(X[X < 0] > 1): + if len(X[X < 0]) > 1: X[X < 0] = ( -PowerTransformer(standardize=False) .fit_transform(-X[X < 0].reshape(-1, 1)) @@ -80,3 +81,32 @@ def make_clusters(dim_df: pd.DataFrame, dims: List[str]): for i, c in enumerate(these_clusters): cluster_names[f"{dim}_cluster_{i + 1}"] = c return cluster_names + + +def nice_cluster_names(x: List[Dict[str, List[str]]]) -> Tuple[List[Dict], Dict]: + # first pass just populate cluster names + cluster_strings = defaultdict(set) + for xx in x: + for dim, v in xx.items(): + if len(v) > 1: + cluster_strings[dim].add("@@".join(v)) + + cluster_names = {} + reverse_cluster_names = {} + for dim, clusters in cluster_strings.items(): + reverse_cluster_names[dim] = {} + for i, c in enumerate(clusters): + cluster_names[f"{dim}_cluster_{i + 1}"] = c + reverse_cluster_names[dim][c] = f"{dim}_cluster_{i + 1}" + + col_defs = [] + for xx in x: + this_def = {} + for dim, v in xx.items(): + if len(v) > 1: + this_def[dim] = reverse_cluster_names[dim]["@@".join(v)] + else: + this_def[dim] = v[0] + col_defs.append(this_def) + + return col_defs, cluster_names diff --git a/wise_pizza/slicer.py b/wise_pizza/slicer.py index 1495b67..10ac069 100644 --- a/wise_pizza/slicer.py +++ b/wise_pizza/slicer.py @@ -15,6 +15,7 @@ from wise_pizza.time import extend_dataframe from wise_pizza.slicer_facades import SliceFinderPredictFacade from wise_pizza.solve.tree import tree_solver +from wise_pizza.solve.solver import solve_lasso def _summary(obj) -> str: @@ -187,28 +188,34 @@ def fit( if solver == "tree": if cluster_values: warnings.warn( - "Ignoring cluster_values argument as it's irrelevant for tree solver" + "Ignoring cluster_values argument as tree solver makes its own clusters" ) - cluster_values = False - - if cluster_values: - self.cluster_names = make_clusters(dim_df, dims) - for dim in dims: - clusters[dim] = [ - c for c in self.cluster_names.keys() if c.startswith(dim) - ] - - dim_df = dim_df[dims] # if time_col is None else dims + ["__time"]] - self.dim_df = dim_df - - if solver == "tree": - self.X, self.reg, self.col_defs = tree_solver( - self.dim_df, self.weights, self.totals, self.time_basis + self.X, self.col_defs, self.cluster_names = tree_solver( + dim_df=dim_df, + dims=dims, + time_basis=self.time_basis, + num_leaves=max_segments, ) - self.nonzeros = np.array(range(self.X.shape[0])) == 1.0 + self.nonzeros = np.array(range(self.X.shape[1])) Xw = csc_matrix(diags(self.weights) @ self.X) + self.reg = solve_lasso( + Xw.toarray(), + self.totals, + alpha=1e-5, + verbose=self.verbose, + fit_intercept=False, + ) + print("") else: - + if cluster_values: + self.cluster_names = make_clusters(dim_df, dims) + for dim in dims: + clusters[dim] = [ + c for c in self.cluster_names.keys() if c.startswith(dim) + ] + + dim_df = dim_df[dims] # if time_col is None else dims + ["__time"]] + self.dim_df = dim_df # lazy calculation of the dummy matrix (calculation can be very slow) if ( list(dim_df.columns) != self.dims diff --git a/wise_pizza/solve/fitter.py b/wise_pizza/solve/fitter.py new file mode 100644 index 0000000..4f44254 --- /dev/null +++ b/wise_pizza/solve/fitter.py @@ -0,0 +1,40 @@ +from typing import List +from abc import ABC, abstractmethod + +import numpy as np + + +class Fitter(ABC): + @abstractmethod + def fit(self, X, y, sample_weight=None): + pass + + @abstractmethod + def predict(self, X): + pass + + def fit_predict(self, X, y, sample_weight=None): + self.fit(X, y, sample_weight) + return self.predict(X) + + def error(self, X, y, sample_weight=None): + err = y - self.predict(X) + if sample_weight is not None: + err *= sample_weight + return np.nansum(err**2) + + +class AverageFitter(Fitter): + def __init__(self): + self.avg = None + + def fit(self, X, y, sample_weight=None): + y = np.array(y) + sample_weight = np.array(sample_weight) + if sample_weight is None: + self.avg = np.nanmean(y) + else: + self.avg = np.nansum(y * sample_weight) / np.nansum(sample_weight) + + def predict(self, X): + return np.full(X.shape[0], self.avg) diff --git a/wise_pizza/solve/tree.py b/wise_pizza/solve/tree.py index 561d1ce..39ea3e5 100644 --- a/wise_pizza/solve/tree.py +++ b/wise_pizza/solve/tree.py @@ -3,58 +3,68 @@ import numpy as np import pandas as pd -import category_encoders as ce +from scipy.sparse import csc_matrix from .weighted_quantiles import weighted_quantiles +from .fitter import AverageFitter, Fitter +from wise_pizza.cluster import nice_cluster_names def tree_solver( dim_df: pd.DataFrame, - weights: np.ndarray, - totals: np.ndarray, - time_basis: Optional[np.ndarray] = None, + dims: List[str], + time_basis: Optional[pd.DataFrame] = None, max_depth: int = 3, num_leaves: Optional[int] = None, ): if time_basis is None: fitter = AverageFitter() else: - fitter = TimeFitter() + raise NotImplementedError("Time fitter not yet implemented") + # fitter = TimeFitter(dims, list(time_basis.columns)) df = dim_df.copy().reset_index(drop=True) - df["__weight"] = weights - df["__total"] = totals - df["__avg"] = totals / weights - df["__avg"] = df["__avg"].fillna(df["__avg"].nanmean()) - for i, vec in enumerate(time_basis.T): - df[f"__time_{i}"] = vec + df["__avg"] = df["totals"] / df["weights"] + df["__avg"] = df["__avg"].fillna(df["__avg"].mean()) - root = ModelNode(df=df, fitter=fitter, dims=dim_df.columns) + root = ModelNode(df=df, fitter=fitter, dims=dims) build_tree(root=root, num_leaves=num_leaves, max_depth=max_depth) - segments = [] - col_defs = [] - for seg in get_leaves(root): - segments.append(tidy_segment(seg)) - return col_defs + leaves = get_leaves(root) + + col_defs, cluster_names = nice_cluster_names([leaf.dim_split for leaf in leaves]) + + for l, leaf in enumerate(leaves): + leaf.df["Segment_id"] = l + + re_df = pd.concat([leaf.df for leaf in leaves]).sort_values(dims) + X = pd.get_dummies(re_df["Segment_id"]).values + + return csc_matrix(X), col_defs, cluster_names def error(x: np.ndarray, y: np.ndarray) -> float: return np.sum((x - y) ** 2) -def encode_map(X, y) -> Dict: - encoder = ce.TargetEncoder() - encoder.fit(X, y) - return encoder.mapping +def target_encode(df: pd.DataFrame, dim: str) -> dict: + df = df[[dim, "totals", "weights"]] + agg = df.groupby(dim, as_index=False).sum() + agg["__avg"] = agg["totals"] / agg["weights"] + agg["__avg"] = agg["__avg"].fillna(agg["__avg"].mean()) + enc_map = {k: v for k, v in zip(agg[dim], agg["__avg"])} + + if np.isnan(np.array(list(enc_map.values()))).any(): + raise ValueError("NaNs in encoded values") + return enc_map class ModelNode: def __init__( self, df: pd.DataFrame, - fitter: "Fitter", + fitter: Fitter, dims: List[str], dim_split: Optional[Dict[str, List]] = None, depth: int = 0, @@ -63,6 +73,7 @@ def __init__( self.fitter = fitter self.dims = dims self._best_submodels = None + self._error_improvement = None self.children = None self.dim_split = dim_split or {} self.depth = depth @@ -74,28 +85,36 @@ def error(self): self.model = copy.deepcopy(self.fitter) self.model.fit( X=self.df[self.dims], - y=self.df["__total"], - sample_weight=self.df["__weight"], + y=self.df["totals"], + sample_weight=self.df["weights"], ) - return self.model.error(self.df) + return self.model.error( + self.df[self.dims], self.df["__avg"], self.df["weights"] + ) @property def error_improvement(self): if self._best_submodels is None: best_error = float("inf") for dim in self.dims: - enc_map = encode_map(self.df[dim], self.df["__avg"]) - self.df[dim + "_encoded"] = self.df[dim].map(encode_map) - + if len(self.df[dim].unique()) == 1: + continue + enc_map = target_encode(self.df, dim) + self.df[dim + "_encoded"] = self.df[dim].apply(lambda x: enc_map[x]) + if np.any(np.isnan(self.df[dim + "_encoded"])): # pragma: no cover + raise ValueError("NaNs in encoded values") # Get split candidates for brute force search deciles = np.array([q / 10.0 for q in range(1, 10)]) + splits = weighted_quantiles( - self.df[dim + "_encoded"], deciles, self.df["__weight"] + self.df[dim + "_encoded"], deciles, self.df["weights"] ) - for split in splits(self.df[dim + "_encoded"], self.df["__weight"]): + for split in np.unique(splits): left = self.df[self.df[dim + "_encoded"] < split] right = self.df[self.df[dim + "_encoded"] >= split] + if len(left) == 0 or len(right) == 0: + continue dim_values1 = [k for k, v in enc_map.items() if v < split] dim_values2 = [k for k, v in enc_map.items() if v >= split] left_candidate = ModelNode( @@ -147,7 +166,7 @@ def get_best_subtree_result( def build_tree(root: ModelNode, num_leaves: int, max_depth: Optional[int] = 1000): # TODO: modify this to also accept max_depth - for _ in range(num_leaves): + for _ in range(num_leaves - 1): best_node = get_best_subtree_result(root, max_depth) if best_node.error_improvement > 0: best_node.children = best_node._best_submodels @@ -160,8 +179,3 @@ def get_leaves(node: ModelNode) -> List[ModelNode]: return [node] else: return get_leaves(node.children[0]) + get_leaves(node.children[1]) - - -class Model: - def error(self, df: pd.DataFrame) -> float: - return error(self.predict(df), df[self.target_name]) diff --git a/wise_pizza/solve/weighted_quantiles.py b/wise_pizza/solve/weighted_quantiles.py index c8eb408..5fef17a 100644 --- a/wise_pizza/solve/weighted_quantiles.py +++ b/wise_pizza/solve/weighted_quantiles.py @@ -3,9 +3,19 @@ def weighted_quantiles(values, quantiles, sample_weight): """Compute the weighted quantile of a 1D numpy array.""" - sorter = np.argsort(values) - sorted_values = np.array(values)[sorter] - sorted_weights = np.array(sample_weight)[sorter] + values_ = np.array(values) + sample_weight_ = np.array(sample_weight) + nice = ~np.isnan(values) & ~np.isnan(sample_weight) + if np.any(~nice): + raise ValueError("Data contains NaNs") + sorter = np.argsort(values_) + sorted_values = values_[sorter] + sorted_weights = sample_weight_[sorter] w_quantiles = np.cumsum(sorted_weights) - 0.5 * sorted_weights w_quantiles /= np.sum(sorted_weights) - return np.interp(quantiles, w_quantiles, sorted_values) + + try: + return np.interp(quantiles, w_quantiles, sorted_values) + except Exception as e: + print(e) + raise e