Skip to content

Commit

Permalink
An apparently decent cut of tree (non-overlapping) solver for wise-pizza
Browse files Browse the repository at this point in the history
  • Loading branch information
EgorKraevTransferwise committed May 6, 2024
1 parent b5ff932 commit 56fd3ad
Show file tree
Hide file tree
Showing 6 changed files with 198 additions and 64 deletions.
39 changes: 36 additions & 3 deletions tests/test_fit.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,9 +136,9 @@ def test_categorical():
print("yay!")


@pytest.mark.parametrize("nan_percent", [0.0, 1.0])
def test_synthetic_template(nan_percent: float):
all_data = synthetic_data(init_len=1000)
@pytest.mark.parametrize("nan_percent, clustering", [[0.0, False], [1.0, False]])
def test_synthetic_template(nan_percent: float, clustering: bool):
all_data = synthetic_data(init_len=10000, dim_values=5)
data = all_data.data

data.loc[(data["dim0"] == 0) & (data["dim1"] == 1), "totals"] += 100
Expand All @@ -155,6 +155,7 @@ def test_synthetic_template(nan_percent: float):
min_segments=5,
verbose=1,
solver="lp",
cluster_values=clustering,
)
print("***")
for s in sf.segments:
Expand All @@ -167,6 +168,38 @@ def test_synthetic_template(nan_percent: float):
print("yay!")


@pytest.mark.parametrize("nan_percent", [0.0, 1.0])
def test_synthetic_template_tree(nan_percent: float):
    """Smoke-test the 'tree' solver on synthetic data, with and without NaNs.

    Injects two known effects into specific dimension intersections, then
    checks that explain_levels runs end-to-end with solver="tree".  Segment
    values are only printed for now; exact asserts are still TODO.
    """
    all_data = synthetic_data(init_len=1000)
    data = all_data.data

    # Boost totals for two known intersections so the solver has real
    # segments to discover.
    data.loc[(data["dim0"] == 0) & (data["dim1"] == 1), "totals"] += 200
    data.loc[(data["dim1"] == 0) & (data["dim2"] == 1), "totals"] += 300

    if nan_percent > 0:
        data = values_to_nan(data, nan_percent)
    sf = explain_levels(
        data,
        dims=all_data.dimensions,
        total_name=all_data.segment_total,
        size_name=all_data.segment_size,
        max_depth=2,
        min_segments=5,
        verbose=1,
        solver="tree",
    )
    print("***")
    for s in sf.segments:
        print(s)

    # TODO: insert appropriate asserts
    # assert abs(sf.segments[0]["coef"] - 300) < 2
    # assert abs(sf.segments[1]["coef"] - 100) < 2

    # sf.plot()
    print("yay!")


@pytest.mark.parametrize("nan_percent", [0.0, 1.0])
def test_synthetic_ts_template(nan_percent: float):
all_data = synthetic_ts_data(init_len=10000)
Expand Down
36 changes: 33 additions & 3 deletions wise_pizza/cluster.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List
from typing import List, Dict, Tuple
from collections import defaultdict

import numpy as np
import pandas as pd
Expand All @@ -20,13 +21,13 @@ def guided_kmeans(X: np.ndarray, power_transform: bool = True) -> np.ndarray:
X = X.values

if power_transform:
if len(X[X > 0] > 1):
if len(X[X > 0]) > 1:
X[X > 0] = (
PowerTransformer(standardize=False)
.fit_transform(X[X > 0].reshape(-1, 1))
.reshape(-1)
)
if len(X[X < 0] > 1):
if len(X[X < 0]) > 1:
X[X < 0] = (
-PowerTransformer(standardize=False)
.fit_transform(-X[X < 0].reshape(-1, 1))
Expand Down Expand Up @@ -80,3 +81,32 @@ def make_clusters(dim_df: pd.DataFrame, dims: List[str]):
for i, c in enumerate(these_clusters):
cluster_names[f"{dim}_cluster_{i + 1}"] = c
return cluster_names


def nice_cluster_names(x: List[Dict[str, List[str]]]) -> Tuple[List[Dict], Dict]:
    """Assign stable, readable names to multi-value dimension clusters.

    Args:
        x: one dict per segment, mapping dimension name -> list of values
           that the segment takes in that dimension.

    Returns:
        col_defs: same structure as ``x``, but each multi-value list is
            replaced by its cluster name; single values are kept as-is.
        cluster_names: mapping of cluster name -> "@@"-joined value string.
    """
    # First pass: collect the distinct multi-value clusters per dimension.
    cluster_strings = defaultdict(set)
    for segment in x:
        for dim, values in segment.items():
            if len(values) > 1:
                cluster_strings[dim].add("@@".join(values))

    cluster_names = {}
    reverse_cluster_names = {}
    for dim, clusters in cluster_strings.items():
        reverse_cluster_names[dim] = {}
        # Sort so cluster numbering is deterministic across runs
        # (set iteration order is arbitrary).
        for i, c in enumerate(sorted(clusters)):
            cluster_names[f"{dim}_cluster_{i + 1}"] = c
            reverse_cluster_names[dim][c] = f"{dim}_cluster_{i + 1}"

    # Second pass: rewrite each segment definition using the cluster names.
    col_defs = []
    for segment in x:
        this_def = {}
        for dim, values in segment.items():
            if len(values) > 1:
                this_def[dim] = reverse_cluster_names[dim]["@@".join(values)]
            else:
                this_def[dim] = values[0]
        col_defs.append(this_def)

    return col_defs, cluster_names
43 changes: 25 additions & 18 deletions wise_pizza/slicer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from wise_pizza.time import extend_dataframe
from wise_pizza.slicer_facades import SliceFinderPredictFacade
from wise_pizza.solve.tree import tree_solver
from wise_pizza.solve.solver import solve_lasso


def _summary(obj) -> str:
Expand Down Expand Up @@ -187,28 +188,34 @@ def fit(
if solver == "tree":
if cluster_values:
warnings.warn(
"Ignoring cluster_values argument as it's irrelevant for tree solver"
"Ignoring cluster_values argument as tree solver makes its own clusters"
)
cluster_values = False

if cluster_values:
self.cluster_names = make_clusters(dim_df, dims)
for dim in dims:
clusters[dim] = [
c for c in self.cluster_names.keys() if c.startswith(dim)
]

dim_df = dim_df[dims] # if time_col is None else dims + ["__time"]]
self.dim_df = dim_df

if solver == "tree":
self.X, self.reg, self.col_defs = tree_solver(
self.dim_df, self.weights, self.totals, self.time_basis
self.X, self.col_defs, self.cluster_names = tree_solver(
dim_df=dim_df,
dims=dims,
time_basis=self.time_basis,
num_leaves=max_segments,
)
self.nonzeros = np.array(range(self.X.shape[0])) == 1.0
self.nonzeros = np.array(range(self.X.shape[1]))
Xw = csc_matrix(diags(self.weights) @ self.X)
self.reg = solve_lasso(
Xw.toarray(),
self.totals,
alpha=1e-5,
verbose=self.verbose,
fit_intercept=False,
)
print("")
else:

if cluster_values:
self.cluster_names = make_clusters(dim_df, dims)
for dim in dims:
clusters[dim] = [
c for c in self.cluster_names.keys() if c.startswith(dim)
]

dim_df = dim_df[dims] # if time_col is None else dims + ["__time"]]
self.dim_df = dim_df
# lazy calculation of the dummy matrix (calculation can be very slow)
if (
list(dim_df.columns) != self.dims
Expand Down
40 changes: 40 additions & 0 deletions wise_pizza/solve/fitter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from typing import List
from abc import ABC, abstractmethod

import numpy as np


class Fitter(ABC):
    """Minimal model interface used by the tree solver's nodes."""

    @abstractmethod
    def fit(self, X, y, sample_weight=None):
        """Fit the model on X, y with optional sample weights."""
        pass

    @abstractmethod
    def predict(self, X):
        """Predict targets for X."""
        pass

    def fit_predict(self, X, y, sample_weight=None):
        """Convenience: fit on (X, y) and return in-sample predictions."""
        self.fit(X, y, sample_weight)
        return self.predict(X)

    def error(self, X, y, sample_weight=None):
        """Sum of squared residuals, ignoring NaNs.

        NOTE(review): weights multiply the residual *before* squaring, so
        the effective weight is sample_weight**2 — confirm this is intended.
        """
        err = y - self.predict(X)
        if sample_weight is not None:
            err *= sample_weight
        return np.nansum(err**2)


class AverageFitter(Fitter):
    """Fitter that predicts the (weighted) mean of the target, ignoring X."""

    def __init__(self):
        # Cached (weighted) mean of the target; set by fit().
        self.avg = None

    def fit(self, X, y, sample_weight=None):
        """Store the NaN-ignoring (weighted) mean of y.

        Bug fix: the None check must happen *before* converting
        sample_weight to an array — np.array(None) is not None, so the
        unweighted branch was unreachable and the weighted branch then
        failed when multiplying y by an object-dtype array.
        """
        y = np.array(y)
        if sample_weight is None:
            self.avg = np.nanmean(y)
        else:
            sample_weight = np.array(sample_weight)
            self.avg = np.nansum(y * sample_weight) / np.nansum(sample_weight)

    def predict(self, X):
        """Return the stored average for every row of X."""
        return np.full(X.shape[0], self.avg)
86 changes: 50 additions & 36 deletions wise_pizza/solve/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,58 +3,68 @@

import numpy as np
import pandas as pd
import category_encoders as ce
from scipy.sparse import csc_matrix

from .weighted_quantiles import weighted_quantiles
from .fitter import AverageFitter, Fitter
from wise_pizza.cluster import nice_cluster_names


def tree_solver(
    dim_df: pd.DataFrame,
    dims: List[str],
    time_basis: Optional[pd.DataFrame] = None,
    max_depth: int = 3,
    num_leaves: Optional[int] = None,
):
    """Find non-overlapping segments by recursively splitting on dimensions.

    Args:
        dim_df: dataframe containing the dimension columns plus "totals" and
            "weights" columns (the fitted target is totals / weights).
        dims: names of the dimension columns to split on.
        time_basis: time-series basis; not supported yet.
        max_depth: maximum depth of the split tree.
        num_leaves: number of leaves (segments) to grow; must be provided.

    Returns:
        Tuple of (sparse one-hot segment membership matrix, per-segment
        column definitions, cluster-name mapping).
    """
    if time_basis is None:
        fitter = AverageFitter()
    else:
        raise NotImplementedError("Time fitter not yet implemented")
        # fitter = TimeFitter(dims, list(time_basis.columns))

    if num_leaves is None:
        # build_tree iterates range(num_leaves - 1); a None default would
        # crash there with a confusing TypeError, so fail fast instead.
        raise ValueError("num_leaves must be provided for the tree solver")

    df = dim_df.copy().reset_index(drop=True)
    # Target per row; rows with zero weight get the overall mean.
    df["__avg"] = df["totals"] / df["weights"]
    df["__avg"] = df["__avg"].fillna(df["__avg"].mean())

    root = ModelNode(df=df, fitter=fitter, dims=dims)

    build_tree(root=root, num_leaves=num_leaves, max_depth=max_depth)

    leaves = get_leaves(root)

    # Human-readable names for the multi-value clusters each leaf defines.
    col_defs, cluster_names = nice_cluster_names([leaf.dim_split for leaf in leaves])

    for i, leaf in enumerate(leaves):
        leaf.df["Segment_id"] = i

    # Reassemble rows in dimension order and one-hot encode leaf membership.
    re_df = pd.concat([leaf.df for leaf in leaves]).sort_values(dims)
    X = pd.get_dummies(re_df["Segment_id"]).values

    return csc_matrix(X), col_defs, cluster_names


def error(x: np.ndarray, y: np.ndarray) -> float:
    """Sum of squared element-wise differences between x and y."""
    residuals = x - y
    return np.sum(residuals * residuals)


def encode_map(X, y) -> Dict:
encoder = ce.TargetEncoder()
encoder.fit(X, y)
return encoder.mapping
def target_encode(df: pd.DataFrame, dim: str) -> dict:
    """Map each level of `dim` to its weighted average target (totals/weights)."""
    grouped = df[[dim, "totals", "weights"]].groupby(dim, as_index=False).sum()
    avgs = grouped["totals"] / grouped["weights"]
    # Levels with zero total weight fall back to the mean of the other levels.
    avgs = avgs.fillna(avgs.mean())
    enc_map = dict(zip(grouped[dim], avgs))

    if any(np.isnan(v) for v in enc_map.values()):
        raise ValueError("NaNs in encoded values")
    return enc_map


class ModelNode:
def __init__(
self,
df: pd.DataFrame,
fitter: "Fitter",
fitter: Fitter,
dims: List[str],
dim_split: Optional[Dict[str, List]] = None,
depth: int = 0,
Expand All @@ -63,6 +73,7 @@ def __init__(
self.fitter = fitter
self.dims = dims
self._best_submodels = None
self._error_improvement = None
self.children = None
self.dim_split = dim_split or {}
self.depth = depth
Expand All @@ -74,28 +85,36 @@ def error(self):
self.model = copy.deepcopy(self.fitter)
self.model.fit(
X=self.df[self.dims],
y=self.df["__total"],
sample_weight=self.df["__weight"],
y=self.df["totals"],
sample_weight=self.df["weights"],
)
return self.model.error(self.df)
return self.model.error(
self.df[self.dims], self.df["__avg"], self.df["weights"]
)

@property
def error_improvement(self):
if self._best_submodels is None:
best_error = float("inf")
for dim in self.dims:
enc_map = encode_map(self.df[dim], self.df["__avg"])
self.df[dim + "_encoded"] = self.df[dim].map(encode_map)

if len(self.df[dim].unique()) == 1:
continue
enc_map = target_encode(self.df, dim)
self.df[dim + "_encoded"] = self.df[dim].apply(lambda x: enc_map[x])
if np.any(np.isnan(self.df[dim + "_encoded"])): # pragma: no cover
raise ValueError("NaNs in encoded values")
# Get split candidates for brute force search
deciles = np.array([q / 10.0 for q in range(1, 10)])

splits = weighted_quantiles(
self.df[dim + "_encoded"], deciles, self.df["__weight"]
self.df[dim + "_encoded"], deciles, self.df["weights"]
)

for split in splits(self.df[dim + "_encoded"], self.df["__weight"]):
for split in np.unique(splits):
left = self.df[self.df[dim + "_encoded"] < split]
right = self.df[self.df[dim + "_encoded"] >= split]
if len(left) == 0 or len(right) == 0:
continue
dim_values1 = [k for k, v in enc_map.items() if v < split]
dim_values2 = [k for k, v in enc_map.items() if v >= split]
left_candidate = ModelNode(
Expand Down Expand Up @@ -147,7 +166,7 @@ def get_best_subtree_result(

def build_tree(root: ModelNode, num_leaves: int, max_depth: Optional[int] = 1000):
# TODO: modify this to also accept max_depth
for _ in range(num_leaves):
for _ in range(num_leaves - 1):
best_node = get_best_subtree_result(root, max_depth)
if best_node.error_improvement > 0:
best_node.children = best_node._best_submodels
Expand All @@ -160,8 +179,3 @@ def get_leaves(node: ModelNode) -> List[ModelNode]:
return [node]
else:
return get_leaves(node.children[0]) + get_leaves(node.children[1])


class Model:
def error(self, df: pd.DataFrame) -> float:
return error(self.predict(df), df[self.target_name])
Loading

0 comments on commit 56fd3ad

Please sign in to comment.