diff --git a/wise_pizza/explain.py b/wise_pizza/explain.py
index 1f6e3b3..ab1066a 100644
--- a/wise_pizza/explain.py
+++ b/wise_pizza/explain.py
@@ -27,7 +27,7 @@ def explain_changes_in_average(
     total_name: str,
     size_name: str,
     min_segments: Optional[int] = None,
-    max_segments: int = 5,
+    max_segments: int = None,
     min_depth: int = 1,
     max_depth: int = 2,
     solver: str = "lasso",
@@ -124,7 +124,7 @@ def explain_changes_in_totals(
     total_name: str,
     size_name: str,
     min_segments: Optional[int] = None,
-    max_segments: int = 5,
+    max_segments: int = None,
     min_depth: int = 1,
     max_depth: int = 2,
     solver: str = "lasso",
@@ -271,7 +271,7 @@ def explain_levels(
     total_name: str,
     size_name: Optional[str] = None,
     min_segments: int = None,
-    max_segments: int = 10,
+    max_segments: int = None,
     min_depth: int = 1,
     max_depth: int = 2,
     solver="lasso",
@@ -353,7 +353,7 @@ def explain_timeseries(
     time_name: str,
     size_name: Optional[str] = None,
     min_segments: int = None,
-    max_segments: int = 5,
+    max_segments: int = None,
     min_depth: int = 1,
     max_depth: int = 2,
     solver: str = "omp",
diff --git a/wise_pizza/slicer.py b/wise_pizza/slicer.py
index de0b267..1033513 100644
--- a/wise_pizza/slicer.py
+++ b/wise_pizza/slicer.py
@@ -8,7 +8,8 @@
 import pandas as pd
 from scipy.sparse import csc_matrix, diags
 
-from wise_pizza.solve.find_alpha import clean_up_min_max, find_alpha
+from wise_pizza.solve.find_alpha import find_alpha
+from wise_pizza.utils import clean_up_min_max
 from wise_pizza.make_matrix import sparse_dummy_matrix
 from wise_pizza.cluster import make_clusters
 from wise_pizza.preselect import HeuristicSelector
@@ -98,7 +99,7 @@ def fit(
         weights: pd.Series = None,
         time_col: pd.Series = None,
         time_basis: pd.DataFrame = None,
-        min_segments: int = 10,
+        min_segments: int = None,
         max_segments: int = None,
         min_depth: int = 1,
         max_depth: int = 3,
diff --git a/wise_pizza/solve/find_alpha.py b/wise_pizza/solve/find_alpha.py
index ec454fe..8e05826 100644
--- a/wise_pizza/solve/find_alpha.py
+++ b/wise_pizza/solve/find_alpha.py
@@ -4,6 +4,7 @@
 from scipy.linalg import svd
 
 from wise_pizza.solve.solver import solve_lasso, solve_lp, solve_omp
+from wise_pizza.utils import clean_up_min_max
 
 
 def find_alpha(
@@ -219,19 +220,3 @@ def print_errors(a: np.ndarray):
         # fit_intercept=not use_proj
     )
     return reg, nonzeros
-
-
-def clean_up_min_max(min_nonzeros: int = None, max_nonzeros: int = None):
-    assert min_nonzeros is not None or max_nonzeros is not None
-    if max_nonzeros is None:
-        if min_nonzeros is None:
-            max_nonzeros = 5
-            min_nonzeros = 5
-        else:
-            max_nonzeros = min_nonzeros
-    else:
-        if min_nonzeros is None:
-            min_nonzeros = max_nonzeros
-
-    assert min_nonzeros <= max_nonzeros
-    return min_nonzeros, max_nonzeros
diff --git a/wise_pizza/utils.py b/wise_pizza/utils.py
index b4ad53a..6df4ac8 100644
--- a/wise_pizza/utils.py
+++ b/wise_pizza/utils.py
@@ -1,3 +1,4 @@
+import logging
 from typing import List, Optional
 
 import numpy as np
@@ -120,7 +121,9 @@ def rel_error(x, y):
 
     if return_multiple:
         sd_size = SegmentData(
-            combined.rename(columns={"Change in totals": "Change from segment size"}),
+            combined.rename(
+                columns={"Change in totals": "Change from segment size"}
+            ),
             dimensions=dims,
             segment_total="Change from segment size",
             segment_size=weights,
@@ -137,7 +140,9 @@ def rel_error(x, y):
         combined["Change from"] = "Segment size"
         c2["Change from"] = "Segment average"
 
-        df = pd.concat([combined, c2])[dims + [weights, "Change in totals", "Change from"]]
+        df = pd.concat([combined, c2])[
+            dims + [weights, "Change in totals", "Change from"]
"Change from"] + ] df_change_in_totals = np.array(df["Change in totals"], dtype=np.longdouble) combined_dtotals = np.array(combined["dtotals"], dtype=np.longdouble) df_change_in_totals_sum = np.nansum(df_change_in_totals) @@ -160,7 +165,9 @@ def rel_error(x, y): combined[weights] = 1.0 # combined[totals + "_x"] combined[weights] = np.maximum(1.0, combined[weights]) cols = ( - dims + ["Change in totals", totals + "_x", totals + "_y"] + [c for c in combined.columns if "baseline" in c] + dims + + ["Change in totals", totals + "_x", totals + "_y"] + + [c for c in combined.columns if "baseline" in c] ) return SegmentData( @@ -205,16 +212,26 @@ def prepare_df( # replace NaN values in categorical columns with the column name + "_unknown" object_columns = list(new_df[dims].select_dtypes("object").columns) - new_df[object_columns] = new_df[object_columns].fillna(new_df[object_columns].apply(lambda x: x.name + "_unknown")) + new_df[object_columns] = new_df[object_columns].fillna( + new_df[object_columns].apply(lambda x: x.name + "_unknown") + ) new_df[object_columns] = new_df[object_columns].astype(str) # Groupby all relevant dims to decrease the dataframe size, if possible group_dims = dims if time_name is None else dims + [time_name] if size_name is not None: - new_df = new_df.groupby(by=group_dims, observed=True)[[total_name, size_name]].sum().reset_index() + new_df = ( + new_df.groupby(by=group_dims, observed=True)[[total_name, size_name]] + .sum() + .reset_index() + ) else: - new_df = new_df.groupby(by=group_dims, observed=True)[[total_name]].sum().reset_index() + new_df = ( + new_df.groupby(by=group_dims, observed=True)[[total_name]] + .sum() + .reset_index() + ) return new_df @@ -280,5 +297,24 @@ def prepare_df( # new_df[object_columns] = new_df[object_columns].astype(str) # # return new_df -def almost_equals(x1, x2, eps: float=1e-6) -> bool: - return np.sum(np.abs(x1-x2))/np.mean(np.abs(x1+x2)) < eps +def almost_equals(x1, x2, eps: float = 1e-6) -> bool: + return np.sum(np.abs(x1 - x2)) / np.mean(np.abs(x1 + x2)) < eps + + +def clean_up_min_max(min_nonzeros: int = None, max_nonzeros: int = None): + if min_nonzeros is not None: + logging.warning( + "min_segments parameter is deprecated, please use max_nonzeros instead." + ) + if max_nonzeros is None: + if min_nonzeros is None: + max_nonzeros = 5 + min_nonzeros = 5 + else: + max_nonzeros = min_nonzeros + else: + if min_nonzeros is None: + min_nonzeros = max_nonzeros + + assert min_nonzeros <= max_nonzeros + return min_nonzeros, max_nonzeros