Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose the force_signs argument for time series too #43

Merged
3 commits merged into from
May 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion wise_pizza/explain.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,7 @@ def explain_timeseries(
max_depth: int = 2,
solver: str = "omp",
verbose: bool = False,
constrain_signs: bool = False,
cluster_values: bool = False,
time_basis: Optional[pd.DataFrame] = None,
fit_log_space: bool = False,
Expand Down Expand Up @@ -388,7 +389,10 @@ def explain_timeseries(
fit_sizes = True

if fit_log_space:
tf = LogTransform(offset=1, weight_pow_sc=log_space_weight_sc)
tf = LogTransform(
offset=1,
weight_pow_sc=log_space_weight_sc,
)
else:
tf = IdentityTransform()

Expand All @@ -415,6 +419,7 @@ def explain_timeseries(
max_depth=max_depth,
solver=solver,
verbose=verbose,
constrain_signs=constrain_signs,
cluster_values=cluster_values,
time_basis=time_basis,
)
Expand All @@ -441,6 +446,7 @@ def explain_timeseries(
max_depth=max_depth,
solver=solver,
verbose=verbose,
constrain_signs=constrain_signs,
cluster_values=cluster_values,
time_basis=time_basis,
)
Expand Down Expand Up @@ -477,6 +483,7 @@ def explain_timeseries(
max_depth=max_depth,
solver=solver,
verbose=verbose,
constrain_signs=constrain_signs,
cluster_values=cluster_values,
time_basis=time_basis,
)
Expand Down
88 changes: 56 additions & 32 deletions wise_pizza/slicer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from wise_pizza.preselect import HeuristicSelector
from wise_pizza.time import extend_dataframe
from wise_pizza.slicer_facades import SliceFinderPredictFacade
from wise_pizza.tree import tree_solver


def _summary(obj) -> str:
Expand Down Expand Up @@ -116,7 +117,7 @@ def fit(
@param max_segments: Maximum number of segments to find, defaults to min_segments
@param min_depth: Minimum number of dimension to constrain in segment definition
@param max_depth: Maximum number of dimension to constrain in segment definition
@param solver: If this equals to "lp" uses the LP solver, else uses the (recommended) Lasso solver
@param solver: Valid values are "lasso" (default), "tree" (for non-overlapping segments), "omp", or "lp"
@param verbose: If set to a truish value, lots of debug info is printed to console
@param force_dim: To add dim
@param force_add_up: To force add up
Expand All @@ -125,6 +126,8 @@ def fit(
group of segments from the same dimension with similar naive averages

"""

assert solver.lower() in ["lasso", "tree", "omp", "lp"]
min_segments, max_segments = clean_up_min_max(min_segments, max_segments)
if verbose is not None:
self.verbose = verbose
Expand All @@ -139,12 +142,16 @@ def fit(
assert min(weights) >= 0
assert np.sum(np.abs(totals[weights == 0])) == 0

# Cast all dimension values to strings
dim_df = dim_df.astype(str)

dims = list(dim_df.columns)
# sort the dataframe by dimension values,
# making sure the other vectors stay aligned
dim_df = dim_df.reset_index(drop=True)
dim_df["totals"] = totals
dim_df["weights"] = weights

if time_col is not None:
dim_df["__time"] = time_col
dim_df = pd.merge(dim_df, time_basis, left_on="__time", right_index=True)
Expand Down Expand Up @@ -176,6 +183,14 @@ def fit(
# of dimension values with similar outcomes
clusters = defaultdict(list)
self.cluster_names = {}

if solver == "tree":
if cluster_values:
warnings.warn(
"Ignoring cluster_values argument as it's irrelevant for tree solver"
)
cluster_values = False

if cluster_values:
for dim in dims:
if (
Expand Down Expand Up @@ -206,40 +221,49 @@ def fit(
dim_df = dim_df[dims] # if time_col is None else dims + ["__time"]]
self.dim_df = dim_df

# lazy calculation of the dummy matrix (calculation can be very slow)
if (
list(dim_df.columns) != self.dims
or max_depth != self.max_depth
or self.X is not None
and len(dim_df) != self.X.shape[1]
):
self.X, self.col_defs = self._init_mat(
dim_df,
min_depth,
max_depth,
force_dim=force_dim,
clusters=clusters,
time_basis=self.time_basis,
if solver == "tree":
self.X, self.reg, self.col_defs = tree_solver(
self.dim_df, self.weights, self.totals, self.time_basis
)
assert len(self.col_defs) == self.X.shape[1]
self.min_depth = min_depth
self.max_depth = max_depth
self.dims = list(dim_df.columns)
self.nonzeros = np.array(range(self.X.shape[0])) == 1.0
Xw = csc_matrix(diags(self.weights) @ self.X)
else:

Xw = csc_matrix(diags(self.weights) @ self.X)
# lazy calculation of the dummy matrix (calculation can be very slow)
if (
list(dim_df.columns) != self.dims
or max_depth != self.max_depth
or self.X is not None
and len(dim_df) != self.X.shape[1]
):
self.X, self.col_defs = self._init_mat(
dim_df,
min_depth,
max_depth,
force_dim=force_dim,
clusters=clusters,
time_basis=self.time_basis,
)
assert len(self.col_defs) == self.X.shape[1]
self.min_depth = min_depth
self.max_depth = max_depth
self.dims = list(dim_df.columns)

Xw = csc_matrix(diags(self.weights) @ self.X)

if self.verbose:
print("Starting solve!")
self.reg, self.nonzeros = find_alpha(
Xw,
self.totals,
max_nonzeros=max_segments,
solver=solver,
min_nonzeros=min_segments,
verbose=self.verbose,
adding_up_regularizer=force_add_up,
constrain_signs=constrain_signs,
)

if self.verbose:
print("Starting solve!")
self.reg, self.nonzeros = find_alpha(
Xw,
self.totals,
max_nonzeros=max_segments,
solver=solver,
min_nonzeros=min_segments,
verbose=self.verbose,
adding_up_regularizer=force_add_up,
constrain_signs=constrain_signs,
)
if self.verbose:
print("Solver done!!")

Expand Down
21 changes: 15 additions & 6 deletions wise_pizza/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def inverse_transform_totals_weights(
w = self.inverse_transform_weight(t_w, t_mean)
return mean * w, w

def test_transforms(self, total, weights, eps=1e-6):
def test_transforms(self, total, weights, eps=1e-4):
mean = total / weights
t_mean = self.transform_mean(mean)
assert almost_equals(mean, self.inverse_transform_mean(t_mean), eps)
Expand Down Expand Up @@ -71,19 +71,28 @@ def inverse_transform_weight(self, w: np.ndarray, x: np.ndarray) -> np.ndarray:

class LogTransform(TransformWithWeights):
def __init__(
self, offset: float, weight_pow_sc: float = 0.1, max_inverse: float = 1e6
self, offset: float, weight_pow_sc: float = 0.1, cap_inverse: bool = True
):
self.offset = offset
self.weight_pow_sc = weight_pow_sc
self.max_inverse = max_inverse
self.cap_inverse = cap_inverse
if cap_inverse:
self.max_inverse = 0.0
else:
self.max_inverse = None

def transform_mean(self, x: np.ndarray) -> np.ndarray:
if self.cap_inverse:
self.max_inverse = np.maximum(self.max_inverse, 2 * x.max())
return np.log(self.offset + x)

def inverse_transform_mean(self, x: np.ndarray) -> np.ndarray:
    """Map log-space means back to the original scale, floored at zero.

    Inverts ``transform_mean`` (``log(offset + x)``) by computing
    ``exp(x) - offset`` and clipping negative results to ``0.0``.

    @param x: Values in log space
    @return: Values on the original scale, element-wise >= 0
    """
    if self.cap_inverse:
        # Clamp the exponent at log(max_inverse), which limits the result
        # to at most ``max_inverse - offset``.
        # NOTE(review): max_inverse starts at 0.0 and is only raised inside
        # transform_mean; calling this first gives log(0) = -inf and an
        # all-zero result -- confirm that is intended.
        return np.maximum(
            0.0, np.exp(np.minimum(x, np.log(self.max_inverse))) - self.offset
        )
    # The uncapped branch previously computed this value without returning
    # it, so the method silently returned None when cap_inverse was False.
    return np.maximum(0.0, np.exp(x) - self.offset)

def transform_weight(self, w: np.ndarray, mean: np.ndarray) -> np.ndarray:
# pure math would give weight_pow_sc = 1, but then
Expand Down
27 changes: 27 additions & 0 deletions wise_pizza/tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from typing import Optional

import numpy as np
import pandas as pd


def tree_solver(
    dim_df: pd.DataFrame,
    weights: np.ndarray,
    totals: np.ndarray,
    time_basis: np.ndarray,
    max_depth: int = 3,
    num_leaves: Optional[int] = None,
):
    """Placeholder for the tree-based solver (not implemented yet).

    The finished function is meant to return a ``(X, reg, col_defs)`` triple.
    Until the algorithm below is written, calling it raises
    ``NotImplementedError`` instead of failing with a ``NameError`` on the
    undefined return values.

    @param dim_df: Dataframe of dimension values
    @param weights: Segment weights
    @param totals: Segment totals
    @param time_basis: Time basis used for the per-node regressions
    @param max_depth: Maximum depth of the tree
    @param num_leaves: Optional target number of leaves
    @raise NotImplementedError: always, until the solver is implemented
    """
    # TODO: fill in
    # Build a tree in the following fashion:
    # 1. Start with a single node containing the whole dataset
    # 2. At each node, find the best split by looping over all dimensions,
    #    for each dimension solving the problem of which values to take in
    #    the left and right subtrees, by running a regression of
    #    totals/weights on time basis in both subsets separately
    #    and optimizing the total squared error.
    #    The best combination of (node, dimension) is the next one due to
    #    be split.
    # If expanding the best node would exceed maximum depth:
    #    If num_leaves is None: stop
    #    If it's not, expand the best node that would not exceed max_depth,
    #    until num_leaves is reached
    raise NotImplementedError("tree_solver is not implemented yet")
Loading