From 8537719b0fc0e00469e36330ca7f3279cc05f3bb Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Mon, 18 Nov 2024 13:26:30 +0000 Subject: [PATCH 1/5] Parallelize tree solver dimension search --- wise_pizza/explain.py | 1 + wise_pizza/slicer.py | 5 +++-- wise_pizza/solve/partition.py | 11 ++++++----- wise_pizza/solve/tree.py | 26 ++++++++++++++++++++++---- 4 files changed, 32 insertions(+), 11 deletions(-) diff --git a/wise_pizza/explain.py b/wise_pizza/explain.py index 6b35f48..f6c07b5 100644 --- a/wise_pizza/explain.py +++ b/wise_pizza/explain.py @@ -482,6 +482,7 @@ def explain_timeseries( solver=solver, verbose=verbose, groupby_dims=groupby_dims, + cluster_values=False, ) # TODO: insert back the normalized bits? diff --git a/wise_pizza/slicer.py b/wise_pizza/slicer.py index 75e9ae2..a916a61 100644 --- a/wise_pizza/slicer.py +++ b/wise_pizza/slicer.py @@ -27,7 +27,7 @@ def _summary(obj) -> str: { k: v for k, v in s.items() - if k in ["segment", "total", "seg_size", "naive_avg"] + if k in ["segment", "total", "seg_size", "naive_avg", "impact"] } for s in obj.segments ], @@ -420,7 +420,8 @@ def relevant_cluster_names(self): relevant_clusters = {} for s in self.segments: for c in s["segment"].values(): - if c in self.cluster_names: + if c in self.cluster_names and ";" not in c: + # Then cluster names containing ; are snumerations, don't need explanation relevant_clusters[c] = self.cluster_names[c].replace("@@", ", ") return relevant_clusters diff --git a/wise_pizza/solve/partition.py b/wise_pizza/solve/partition.py index dca2f35..ab820fe 100644 --- a/wise_pizza/solve/partition.py +++ b/wise_pizza/solve/partition.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd + from .weighted_quantiles import weighted_quantiles @@ -73,8 +74,10 @@ def kmeans_partition(df: pd.DataFrame, dim: str, groupby_dims: List[str]): vector_dict[c] = (weights.loc[c], joint_mat[:, i]) cluster1, cluster2 = weighted_kmeans_two_clusters(vector_dict) - - return [(cluster1, cluster2)] + if cluster1 is None: + return [] + else: + return [(cluster1, cluster2)] def weighted_kmeans_two_clusters(data_dict, tol=1e-4, max_iter=100, max_retries=10): @@ -124,9 +127,7 @@ def weighted_kmeans_two_clusters(data_dict, tol=1e-4, max_iter=100, max_retries= centroids = new_centroids - raise ValueError( - "Failed to find a valid clustering with non-empty clusters after maximum retries." 
- ) + return None, None def fill_gaps(x: np.ndarray, num_iter=50): diff --git a/wise_pizza/solve/tree.py b/wise_pizza/solve/tree.py index 8fec440..6292a17 100644 --- a/wise_pizza/solve/tree.py +++ b/wise_pizza/solve/tree.py @@ -4,6 +4,7 @@ import numpy as np import pandas as pd from scipy.sparse import csc_matrix +from joblib import Parallel, delayed from .fitter import AverageFitter, Fitter, TimeFitterModel, TimeFitter @@ -17,6 +18,7 @@ def tree_solver( fitter: Fitter, max_depth: Optional[int] = None, num_leaves: Optional[int] = None, + parallel_processes: int = 10, ): """ Partition the data into segments using a greedy binary tree approach @@ -38,6 +40,7 @@ def tree_solver( dims=dims, time_col=None if isinstance(fitter, AverageFitter) else "__time", max_depth=max_depth, + parallel_processes=parallel_processes, ) build_tree(root=root, num_leaves=num_leaves, max_depth=max_depth) @@ -85,6 +88,7 @@ def __init__( time_col: str = None, max_depth: Optional[int] = None, dim_split: Optional[Dict[str, List]] = None, + parallel_processes: int = 10, ): self.df = df.copy().sort_values(dims + fitter.groupby_dims) self.fitter = fitter @@ -98,6 +102,7 @@ def __init__( self.model = None # For dimension splitting candidates, hardwired for now self.num_bins = 10 + self.parallel_processes = parallel_processes @property def depth(self): @@ -134,9 +139,9 @@ def error_improvement(self): else: iter_dims = self.dims - for dim in iter_dims: + def error_improvement_for_dim(dim): if len(self.df[dim].unique()) == 1: - continue + return float("inf"), (None, None) elif len(self.df[dim].unique()) == 2: vals = self.df[dim].unique() @@ -150,7 +155,11 @@ def error_improvement(self): partitions = kmeans_partition( self.df, dim, self.fitter.groupby_dims ) + if len(partitions) == 0: + return float("inf"), (None, None) + best_error = float("inf") + candidates = (None, None) for dim_values1, dim_values2 in partitions: left = self.df[self.df[dim].isin(dim_values1)] right = self.df[self.df[dim].isin(dim_values2)] @@ -174,8 +183,17 @@ def error_improvement(self): err = left_candidate.error + right_candidate.error if err < best_error: best_error = err - self._error_improvement = self.error - best_error - self._best_submodels = (left_candidate, right_candidate) + candidates = (left_candidate, right_candidate) + return best_error, candidates + + results = Parallel(n_jobs=self.parallel_processes)( + delayed(error_improvement_for_dim)(i) for i in iter_dims + ) + for err, candidates in results: + if err < best_error: + best_error = err + self._best_submodels = candidates + self._error_improvement = self.error - best_error return self._error_improvement From 6d651364eb9e5a27d44de107f9e2a92809880b95 Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Fri, 22 Nov 2024 09:10:58 +0000 Subject: [PATCH 2/5] Tweaks to time series fitting --- ... 
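Aside on the pattern introduced in wise_pizza/solve/tree.py above: candidate splits are now scored once per dimension via joblib's Parallel/delayed, and the best (error, candidates) pair is reduced afterwards on the main process. Below is a minimal, self-contained sketch of that fan-out/reduce pattern; it uses a toy scoring function and made-up column names ("country", "product", "y") rather than the real Fitter machinery, so treat it as an illustration of the joblib idiom only.

# Illustrative sketch of the Parallel/delayed per-dimension search, not the patched code.
import numpy as np
import pandas as pd
from joblib import Parallel, delayed


def split_error_for_dim(df: pd.DataFrame, dim: str) -> tuple:
    """Toy stand-in for error_improvement_for_dim: score a crude two-way split of `dim`."""
    values = df[dim].unique()
    if len(values) < 2:
        return float("inf"), (None, None)  # nothing to split on
    left_vals = values[: len(values) // 2]
    left = df[df[dim].isin(left_vals)]
    right = df[~df[dim].isin(left_vals)]
    # toy "error": within-group variance of the target, weighted by group size
    err = left["y"].var() * len(left) + right["y"].var() * len(right)
    return err, (list(left_vals), [v for v in values if v not in left_vals])


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    df = pd.DataFrame(
        {
            "country": rng.choice(["DE", "FR", "GB", "US"], size=200),
            "product": rng.choice(["Spend", "Credit", "Transfer"], size=200),
            "y": rng.normal(size=200),
        }
    )
    dims = ["country", "product"]
    # evaluate all dimensions concurrently, then keep the best split on the main process
    results = Parallel(n_jobs=2)(delayed(split_error_for_dim)(df, d) for d in dims)
    best_err, best_split = min(results, key=lambda r: r[0])
    print(best_err, best_split)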
interesting segments in time series.ipynb | 270 +++++++++--------- tests/timeseries_wip_entrypoint.py | 2 +- tests/timeseries_wip_entrypoint_2.py | 47 +++ wise_pizza/explain.py | 40 ++- wise_pizza/plotting_time_tree.py | 4 + wise_pizza/slicer.py | 13 +- wise_pizza/solve/fitter.py | 1 - wise_pizza/solve/partition.py | 57 +++- wise_pizza/solve/tree.py | 5 + wise_pizza/time.py | 62 ++-- 10 files changed, 310 insertions(+), 191 deletions(-) create mode 100644 tests/timeseries_wip_entrypoint_2.py diff --git a/notebooks/Finding interesting segments in time series.ipynb b/notebooks/Finding interesting segments in time series.ipynb index 7838f7a..cc1ad30 100644 --- a/notebooks/Finding interesting segments in time series.ipynb +++ b/notebooks/Finding interesting segments in time series.ipynb @@ -50,7 +50,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\EgorKraev\\AppData\\Local\\Temp\\ipykernel_29848\\3308931027.py:2: DeprecationWarning:\n", + "C:\\Users\\EgorKraev\\AppData\\Local\\Temp\\ipykernel_12296\\3308931027.py:2: DeprecationWarning:\n", "\n", "Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython.display\n", "\n" @@ -150,6 +150,8 @@ "source": [ "# Finding the juiciest slices\n", "\n", + "The most important choice you have to make here is whether you just want to look at time series behavior for the averages, or also to that of the weights - this is controlled by the `fit_sizes` parameter. `max_depth` works as usual, controlling the maximal number of dimensions any segment can constrain.\n", + "\n", "**explain_timeseries**: Find the most unusual segments in the timeseries\n", "\n", "- `df`: Dataset\n", @@ -182,29 +184,12 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 13, "id": "0d57a44a", "metadata": { "scrolled": false }, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "yay!\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\EgorKraev\\Documents\\Code\\wise-pizza\\wise_pizza\\slicer.py:213: UserWarning:\n", - "\n", - "Ignoring cluster_values argument as tree solver makes its own clusters\n", - "\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -221,50 +206,21 @@ "Done!\n", "Adding node 6...\n", "Done!\n", - "0 {'segment': {'SOURCE_CURRENCY': 'SOURCE_CURRENCY_cluster_2', 'PRODUCT': 'Credit;Spend'}, 'index': 1, 'orig_i': 1, 'total': 8215853399.18479, 'seg_size': 28914190.0, 'naive_avg': 284.14606804426444, 'dummy': array([1, 1, 1, ..., 0, 0, 0])}\n", - "1 {'segment': {'SOURCE_CURRENCY': 'SOURCE_CURRENCY_cluster_4'}, 'index': 5, 'orig_i': 5, 'total': 5407855256.179745, 'seg_size': 39528930.0, 'naive_avg': 136.807529477265, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", - "2 {'segment': {'SOURCE_CURRENCY': 'PGK;SHP', 'PRODUCT': 'Credit;Spend'}, 'index': 4, 'orig_i': 4, 'total': 1496636816.867681, 'seg_size': 4120685.0, 'naive_avg': 363.20097674723525, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", - "3 {'segment': {'SOURCE_CURRENCY': 'CZK', 'PRODUCT': 'Spend'}, 'index': 3, 'orig_i': 3, 'total': 1253650889.0440717, 'seg_size': 2018425.0, 'naive_avg': 621.1035282678681, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", - "4 {'segment': {'SOURCE_CURRENCY': 'SOURCE_CURRENCY_cluster_3'}, 'index': 6, 'orig_i': 6, 'total': 930638724.9999521, 'seg_size': 12992130.0, 'naive_avg': 71.63095851103338, 'dummy': array([0, 0, 0, ..., 1, 1, 1])}\n", - "5 {'segment': {'SOURCE_CURRENCY': 'CZK', 'PRODUCT': 'Credit'}, 'index': 2, 'orig_i': 2, 
'total': 801995917.6628782, 'seg_size': 1962900.0, 'naive_avg': 408.577063356706, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", - "6 {'segment': {'SOURCE_CURRENCY': 'SOURCE_CURRENCY_cluster_5', 'PRODUCT': 'Transfer'}, 'index': 0, 'orig_i': 0, 'total': 168988001.49668851, 'seg_size': 7289425.0, 'naive_avg': 23.18262434920292, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n" + "0 {'segment': {'PRODUCT': 'Spend', 'TARGET_CURRENCY': 'TARGET_CURRENCY_cluster_2'}, 'index': 5, 'orig_i': 5, 'total': 22349930640.964485, 'seg_size': 58287375.0, 'naive_avg': 383.44376704156747, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", + "1 {'segment': {'PRODUCT': 'Credit', 'SOURCE_CURRENCY': 'SOURCE_CURRENCY_cluster_1'}, 'index': 0, 'orig_i': 0, 'total': 9418280996.490507, 'seg_size': 23444615.0, 'naive_avg': 401.7247029431069, 'dummy': array([1, 1, 1, ..., 0, 0, 0])}\n", + "2 {'segment': {'PRODUCT': 'Spend', 'TARGET_CURRENCY': 'BRL;CLP;MAD;NZD;XOF'}, 'index': 2, 'orig_i': 2, 'total': 1765844363.260038, 'seg_size': 2330795.0, 'naive_avg': 757.6146178707427, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", + "3 {'segment': {'PRODUCT': 'Transfer', 'TARGET_CURRENCY': 'TARGET_CURRENCY_cluster_2'}, 'index': 6, 'orig_i': 6, 'total': 1760778920.4430783, 'seg_size': 8148355.0, 'naive_avg': 216.09010903956423, 'dummy': array([0, 0, 0, ..., 1, 1, 1])}\n", + "4 {'segment': {'PRODUCT': 'Transfer', 'TARGET_CURRENCY': 'TARGET_CURRENCY_cluster_1'}, 'index': 4, 'orig_i': 4, 'total': 702549273.7995473, 'seg_size': 3479735.0, 'naive_avg': 201.89734959689383, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", + "5 {'segment': {'PRODUCT': 'Spend', 'TARGET_CURRENCY': 'TARGET_CURRENCY_cluster_3'}, 'index': 3, 'orig_i': 3, 'total': 408786271.34145737, 'seg_size': 815440.0, 'naive_avg': 501.30760244954547, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", + "6 {'segment': {'PRODUCT': 'Credit', 'SOURCE_CURRENCY': 'UYU'}, 'index': 1, 'orig_i': 1, 'total': 145067544.57249302, 'seg_size': 320370.0, 'naive_avg': 452.81251232166875, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n" ] }, { "data": { "text/html": [ - " \n", - " " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'SOURCE_CURRENCY_cluster_1': 'AFN, ARS, AUD, BDT, BGN, BHD, BMD, BRL, BSD, '\n", + " 'BWP, CDF, CHF, CNY, COP, CVE, EUR, GEL, GNF, '\n", + " 'HKD, HRK, INR, ISK, JOD, KGS, KMF, LSL, LYD, '\n", + " 'MKD, MUR, MVR, MWK, MXN, NOK, NZD, OMR, PEN, '\n", + " 'PGK, PHP, PLN, PYG, SBD, SEK, SGD, SHP, SRD, '\n", + " 'SZL, TMT, TND, UGX, USD, VND, XAF, XOF',\n", + " 'TARGET_CURRENCY_cluster_1': 'COP, GEL, NPR, PHP, RON, SGD, USD, VND, ZMW',\n", + " 'TARGET_CURRENCY_cluster_2': 'AED, ARS, AUD, BDT, BRL, BWP, CAD, CHF, CLP, '\n", + " 'CNY, CRC, CZK, DKK, EGP, EUR, GBP, GHS, HKD, '\n", + " 'HRK, HUF, IDR, ILS, INR, JPY, KES, KRW, LKR, '\n", + " 'MAD, MXN, MYR, NGN, NOK, NZD, PEN, PKR, PLN, '\n", + " 'RUB, SEK, THB, TRY, TZS, UAH, UGX, UYU, XOF, '\n", + " 'ZAR'}\n" ] } ], "source": [ - "import json, pprint\n", - "pprint.pprint(json.loads(sf.summary()))" + "# And here is a run that jointly segments by the trends in the averages and the segment sizes\n", + "\n", + "sf = explain_timeseries(\n", + " df=df,\n", + " dims=dims,\n", + " num_segments=7,\n", + " max_depth=2,\n", + " total_name=totals,\n", + " size_name=size,\n", + " time_name=time,\n", + " verbose=False,\n", + " solver=\"tree\",\n", + " fit_sizes=True,\n", + ")\n", + "sf.plot(plot_is_static=False, height=1500, width=1000, average_name=\"VPC\")\n", + "pprint(sf.relevant_cluster_names)" ] }, { @@ -402,9 +408,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:wise-pizza3.10]", + "display_name": "Python [conda env:wise-pizza3.11]", "language": "python", - "name": "conda-env-wise-pizza3.10-py" + "name": "conda-env-wise-pizza3.11-py" }, "language_info": { "codemirror_mode": { @@ -416,7 +422,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.15" + "version": "3.11.10" } }, "nbformat": 4, diff --git a/tests/timeseries_wip_entrypoint.py b/tests/timeseries_wip_entrypoint.py index fffb4fc..43dff46 100644 --- a/tests/timeseries_wip_entrypoint.py +++ b/tests/timeseries_wip_entrypoint.py @@ -37,7 +37,7 @@ time_name=time, verbose=False, solver="tree", - fit_sizes=False, + fit_sizes=True, ) sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC") print(sf.summary()) diff --git a/tests/timeseries_wip_entrypoint_2.py b/tests/timeseries_wip_entrypoint_2.py new file mode 100644 index 0000000..809092f --- /dev/null +++ b/tests/timeseries_wip_entrypoint_2.py @@ -0,0 +1,47 @@ +import os, sys +import pandas as pd + +root_path = os.path.realpath("../..") +print(root_path) + +# this assumes that all of the following files are checked in the same directory +sys.path.append(os.path.join(root_path, "wise-pizza")) + +# create data-related directories +data_dir = os.path.realpath(os.path.join(root_path, "wise-pizza/data")) +if not os.path.isdir(data_dir): + os.mkdir(data_dir) +print(data_dir) + +from wise_pizza import explain_timeseries + +df = pd.read_csv( + os.path.join(data_dir, "volume_data_new.csv") +) # replace this variable with your data +dims = [ + "CUSTOMER_TYPE", + "STRATEGIC_PRODUCT", + "SOURCE_CURRENCY", + "TARGET_CURRENCY", + "PRODUCT_USE_CASE", + "REGION", + "TRANS_VOL_BUCKET", +] # dimensions to find segments +totals = "VOLUME_GBP" # value to analyze +size = "NUM_CUSTOMERS" #'NUM_TRANSACTIONS' # number of objects +time = "ACTION_YM" +sf = explain_timeseries( + df=df, + dims=dims, + max_segments=7, + max_depth=2, + total_name=totals, + 
size_name=size, + time_name=time, + verbose=False, + solver="tree", + fit_sizes=True, +) +sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC") +print(sf.summary()) +print("yay!") diff --git a/wise_pizza/explain.py b/wise_pizza/explain.py index f6c07b5..90bed7d 100644 --- a/wise_pizza/explain.py +++ b/wise_pizza/explain.py @@ -367,9 +367,7 @@ def explain_timeseries( total_name: str, time_name: str, size_name: Optional[str] = None, - min_segments: int = None, - max_segments: int = None, - min_depth: int = 1, + num_segments: int = None, max_depth: int = 2, solver: str = "tree", verbose: bool = False, @@ -377,6 +375,7 @@ def explain_timeseries( fit_log_space: bool = False, fit_sizes: Optional[bool] = None, num_breaks: int = 2, + ignore_averages: bool = True, log_space_weight_sc: float = 0.5, ): assert ( @@ -450,16 +449,31 @@ def explain_timeseries( time_basis = ( pd.concat([time_basis, re_basis], axis=0).fillna(0.0).reset_index(drop=True) ) - print("yay!") groupby_dims = ["chunk", "__time"] else: groupby_dims = ["__time"] df2["_target"] = df2[total_name] df2["__time"] = df2[time_name] - df2["total_adjustment"] = 0.0 - avg_df = 0.0 - average = 0.0 + + # Adds the column of the time average over each dimension combination + if ignore_averages: + df2, avg_df = add_average_over_time( + df2, + dims=dims, + total_name=total_name, + size_name=size_name, + time_name="__time", + groupby_dims=groupby_dims, + cartesian=False, + ) + else: + df2["total_adjustment"] = 0.0 + avg_df = None + + # The join in the above function could have messed up the ordering + df2 = df2.sort_values(by=dims + groupby_dims) + average = df2[total_name].sum() / df2[size_name].sum() sf = SliceFinder() sf.global_average = average @@ -468,16 +482,14 @@ def explain_timeseries( sf.time_name = time_name sf.y_adj = df2["total_adjustment"].values sf.avg_df = avg_df - sf.time_values = df2[time_name].unique() + sf.time_values = df2["__time"].unique() sf.fit( - df2[dims + groupby_dims], - df2["_target"], - time_col=df2[time_name], + df2[dims + groupby_dims + ["total_adjustment"]], + df2[total_name], + time_col=df2["__time"], time_basis=time_basis, weights=df2[size_name], - min_segments=min_segments, - max_segments=max_segments, - min_depth=min_depth, + max_segments=num_segments, max_depth=max_depth, solver=solver, verbose=verbose, diff --git a/wise_pizza/plotting_time_tree.py b/wise_pizza/plotting_time_tree.py index 04efa5a..ef0fa76 100644 --- a/wise_pizza/plotting_time_tree.py +++ b/wise_pizza/plotting_time_tree.py @@ -158,3 +158,7 @@ def simple_ts_plot( row=row_num, col=col_num, ) + fig.update_layout( + xaxis=dict(autorange=True), + yaxis=dict(autorange=True) + ) \ No newline at end of file diff --git a/wise_pizza/slicer.py b/wise_pizza/slicer.py index a916a61..34ff3d4 100644 --- a/wise_pizza/slicer.py +++ b/wise_pizza/slicer.py @@ -143,6 +143,9 @@ def fit( group of segments from the same dimension with similar naive averages """ + dim_df = dim_df.copy() + if groupby_dims is None: + groupby_dims = [] assert solver.lower() in ["lasso", "tree", "omp", "lp"] min_segments, max_segments = clean_up_min_max(min_segments, max_segments) @@ -160,18 +163,20 @@ def fit( assert np.sum(np.abs(totals[weights == 0])) == 0 # Cast all dimension values to strings - dim_df = dim_df.astype(str) + for c in dim_df.columns: + if c not in groupby_dims + ["total_adjustment"]: + dim_df[c] = dim_df[c].astype(str) dims = list(dim_df.columns) - if groupby_dims is not None: - dims = [d for d in dims if d not in groupby_dims] + if groupby_dims: + 
dims = [d for d in dims if d not in groupby_dims + ["total_adjustment"]] # sort the dataframe by dimension values, # making sure the other vectors stay aligned dim_df = dim_df.reset_index(drop=True) dim_df["totals"] = totals dim_df["weights"] = weights - if groupby_dims is not None: + if groupby_dims: dim_df = pd.merge(dim_df, time_basis, on=groupby_dims) sort_dims = dims + groupby_dims else: diff --git a/wise_pizza/solve/fitter.py b/wise_pizza/solve/fitter.py index f002cdd..a1b3c64 100644 --- a/wise_pizza/solve/fitter.py +++ b/wise_pizza/solve/fitter.py @@ -48,7 +48,6 @@ def debug_plot(X, y, y_pred, w): plt.plot(X_agg["y_pred"] / X_agg["weights"], label="y_pred") plt.legend() plt.show() - print("yay!") class TimeFitterModel(ABC): diff --git a/wise_pizza/solve/partition.py b/wise_pizza/solve/partition.py index ab820fe..5ea3381 100644 --- a/wise_pizza/solve/partition.py +++ b/wise_pizza/solve/partition.py @@ -42,9 +42,15 @@ def target_encoding_partitions(df: pd.DataFrame, dim: str, num_bins: int): return partitions -def kmeans_partition(df: pd.DataFrame, dim: str, groupby_dims: List[str]): +def kmeans_partition( + df: pd.DataFrame, + dim: str, + groupby_dims: List[str], + normalize_averages: bool = False, +): assert len(df[dim].unique()) >= 3 # Get split candidates + # Get time profiles split by the dimension we are evaluating agg_df = df.groupby([dim] + groupby_dims, as_index=False).sum() agg_df["__avg"] = agg_df["totals"] / agg_df["weights"] pivot_df = agg_df.pivot( @@ -57,16 +63,31 @@ def kmeans_partition(df: pd.DataFrame, dim: str, groupby_dims: List[str]): for chunk in ["Average", "Weights"]: this_df = pivot_df[pivot_df["chunk"] == chunk] nice_values = fill_gaps(this_df[value_cols].values) - if chunk == "Weights": - nice_values = ( - np.mean(nice_mats["Average"]) - * nice_values - / np.sum(nice_values, axis=0, keepdims=True) + + if normalize_averages: + # Normalize both subsegments separately: weights and averages + nice_values /= ( + np.linalg.norm(nice_values, ord=2, axis=0, keepdims=True) + 1e-6 ) + else: + if chunk == "Weights": + nice_values = ( + np.mean(nice_mats["Average"]) + * nice_values + / ( + np.linalg.norm(nice_values, ord=2, axis=0, keepdims=True) + + 1e-6 + ) + ) nice_mats[chunk] = nice_values joint_mat = np.concatenate([nice_mats["Average"], nice_mats["Weights"]], axis=0) else: - joint_mat = fill_gaps(pivot_df[value_cols].values) + nice_values = fill_gaps(pivot_df[value_cols].values) + if normalize_averages: + nice_values /= ( + np.linalg.norm(nice_values, ord=2, axis=0, keepdims=True) + 1e-6 + ) + joint_mat = nice_values weights = pivot_df[value_cols].T.sum(axis=1) vector_dict = {} @@ -109,12 +130,20 @@ def weighted_kmeans_two_clusters(data_dict, tol=1e-4, max_iter=100, max_retries= break # Update centroids with weighted averages - new_centroids = np.array( - [ - np.average(data[labels == i], axis=0, weights=weights[labels == i]) - for i in range(2) - ] - ) + try: + new_centroids = np.array( + [ + np.average( + data[labels == i], axis=0, weights=weights[labels == i] + ) + for i in range(2) + ] + ) + except ZeroDivisionError: + print( + f"Zero division error detected on retry {retry + 1}, reinitializing centroids." 
+ ) + break # Check for convergence if np.linalg.norm(new_centroids - centroids) < tol: @@ -140,7 +169,7 @@ def fill_gaps(x: np.ndarray, num_iter=50): nice_marg = interpolate_and_extrapolate(marg) tile_marg = np.tile(nice_marg, (x.shape[1], 1)).T tile_marg[nans] = np.nan - reg = np.nanmedian(x) * 1e-6 + reg = np.nanmedian(x) * 1e-6 + 1e-6 coeffs = (np.nansum(x * tile_marg, axis=0) + reg) / ( np.nansum(tile_marg * tile_marg, axis=0) + reg ) diff --git a/wise_pizza/solve/tree.py b/wise_pizza/solve/tree.py index 6292a17..c5cb15f 100644 --- a/wise_pizza/solve/tree.py +++ b/wise_pizza/solve/tree.py @@ -31,6 +31,7 @@ def tree_solver( """ df = dim_df.copy().reset_index(drop=True) + df["totals"] -= df["total_adjustment"] df["__avg"] = df["totals"] / df["weights"] df["__avg"] = df["__avg"].fillna(df["__avg"].mean()) @@ -56,6 +57,10 @@ def tree_solver( re_df = pd.concat([leaf.df for leaf in leaves]).sort_values( dims + fitter.groupby_dims ) + # Put back the averages over time by segment + re_df["prediction"] += re_df["total_adjustment"] / re_df["weights"] + + # re_df["totals"] += re_df["total_adjustment"] if len(fitter.groupby_dims) == 2: # Time series with weights re_df_w = re_df[re_df["chunk"] == "Weights"].copy() diff --git a/wise_pizza/time.py b/wise_pizza/time.py index e50e569..1c4b8cc 100644 --- a/wise_pizza/time.py +++ b/wise_pizza/time.py @@ -90,36 +90,48 @@ def add_average_over_time( total_name: str, size_name: str, time_name: str, + groupby_dims: List[str] = None, cartesian: bool = False, ) -> Tuple[pd.DataFrame, pd.DataFrame]: - avgs = df[dims + [total_name, size_name]].groupby(dims, as_index=False).sum() - avgs["avg"] = avgs[total_name] / avgs[size_name] - if cartesian: - # make sure that the cartesian product of dimension combinations x time is present, - # without changing the totals - times = df[[time_name]].groupby(time_name, as_index=False).sum() - times["key"] = 1 - avgs["key"] = 1 - cartesian_df = pd.merge(avgs, times, on="key").drop(columns=["key"]) - joined = pd.merge( - df, - cartesian_df[dims + [time_name]], - on=dims + [time_name], - how="right", - ) - joined[size_name] = joined[size_name].fillna( - np.nanmean(joined[size_name].values) - ) - joined[total_name] = joined[total_name].fillna(0.0) - df = joined - - avgs = df[dims + [total_name, size_name]].groupby(dims, as_index=False).sum() + groupby_dims = groupby_dims or [time_name] + + # get the average of the total over time + group_dims = dims + [c for c in groupby_dims if c != time_name] + avgs = ( + df[group_dims + [total_name, size_name]] + .groupby(group_dims, as_index=False) + .sum() + ) + avgs["avg"] = avgs[total_name] / avgs[size_name] - joined = pd.merge(df, avgs[dims + ["avg"]], on=dims) + # if cartesian: + # # make sure that the cartesian product of dimension combinations x time is present, + # # without changing the totals + # times = df[[time_name]].groupby(time_name, as_index=False).sum() + # times["key"] = 1 + # avgs["key"] = 1 + # cartesian_df = pd.merge(avgs, times, on="key").drop(columns=["key"]) + # joined = pd.merge( + # df, + # cartesian_df[dims + [time_name]], + # on=dims + [time_name], + # how="right", + # ) + # joined[size_name] = joined[size_name].fillna( + # np.nanmean(joined[size_name].values) + # ) + # joined[total_name] = joined[total_name].fillna(0.0) + # df = joined + + # avgs = df[dims + [total_name, size_name]].groupby(dims, as_index=False).sum() + # avgs["avg"] = avgs[total_name] / avgs[size_name] + + joined = pd.merge(df, avgs[group_dims + ["avg"]], on=group_dims) joined["total_adjustment"] 
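Aside on weighted_kmeans_two_clusters in wise_pizza/solve/partition.py above: the routine retries with fresh centroids whenever one cluster goes empty and, after the patch, signals failure by returning (None, None) instead of raising, so kmeans_partition can simply yield no partitions. The sketch below mirrors that behaviour in a simplified, standalone form with made-up profile data; it is not the library implementation, just an illustration of weighted two-cluster k-means with retry-and-give-up semantics.

# Simplified weighted two-cluster k-means, returning (None, None) on failure.
import numpy as np


def weighted_two_means(vectors: dict, tol: float = 1e-4, max_iter: int = 100, max_retries: int = 10):
    """vectors maps label -> (weight, 1-D profile); returns two lists of labels, or (None, None)."""
    labels_list = list(vectors.keys())
    weights = np.array([vectors[k][0] for k in labels_list], dtype=float)
    data = np.stack([vectors[k][1] for k in labels_list])
    rng = np.random.default_rng(0)
    for _ in range(max_retries):
        centroids = data[rng.choice(len(data), size=2, replace=False)]
        for _ in range(max_iter):
            # assign each profile to the nearest centroid
            dists = np.linalg.norm(data[:, None, :] - centroids[None, :, :], axis=2)
            assign = np.argmin(dists, axis=1)
            if len(np.unique(assign)) < 2:
                break  # one cluster went empty: reinitialize centroids and retry
            # weighted centroid update
            new_centroids = np.array(
                [
                    np.average(data[assign == i], axis=0, weights=weights[assign == i])
                    for i in range(2)
                ]
            )
            if np.linalg.norm(new_centroids - centroids) < tol:
                cluster1 = [labels_list[i] for i in range(len(labels_list)) if assign[i] == 0]
                cluster2 = [labels_list[i] for i in range(len(labels_list)) if assign[i] == 1]
                return cluster1, cluster2
            centroids = new_centroids
    return None, None  # caller treats this as "no usable partition"


# toy usage: two clearly separated groups of profiles
profiles = {f"low_{i}": (1.0, np.zeros(5) + 0.1 * i) for i in range(3)}
profiles.update({f"high_{i}": (1.0, np.ones(5) * 10 + 0.1 * i) for i in range(3)})
print(weighted_two_means(profiles))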
= joined[size_name] * joined["avg"] - out = joined[dims + [total_name, size_name, time_name, "total_adjustment"]] - tmp = out[dims + [total_name, "total_adjustment"]].groupby(dims).sum() + + out = joined[group_dims + [total_name, size_name, time_name, "total_adjustment"]] + tmp = out[group_dims + [total_name, "total_adjustment"]].groupby(dims).sum() assert (tmp[total_name] - tmp["total_adjustment"]).abs().sum() < 1e-6 * df[ total_name ].abs().max() From f2caca51bf8d716e445d7fce6ceb811d6786a792 Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Mon, 25 Nov 2024 13:13:50 +0000 Subject: [PATCH 3/5] minor time series tweaks --- ... interesting segments in time series.ipynb | 218 ++++++++---------- tests/timeseries_wip_entrypoint.py | 3 +- wise_pizza/explain.py | 23 +- wise_pizza/plotting_time_tree.py | 7 +- wise_pizza/slicer.py | 5 + wise_pizza/solve/tree.py | 29 ++- wise_pizza/time.py | 1 + 7 files changed, 149 insertions(+), 137 deletions(-) diff --git a/notebooks/Finding interesting segments in time series.ipynb b/notebooks/Finding interesting segments in time series.ipynb index cc1ad30..f717f1f 100644 --- a/notebooks/Finding interesting segments in time series.ipynb +++ b/notebooks/Finding interesting segments in time series.ipynb @@ -42,20 +42,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 10, "id": "961bc9d1", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\EgorKraev\\AppData\\Local\\Temp\\ipykernel_12296\\3308931027.py:2: DeprecationWarning:\n", - "\n", - "Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython.display\n", - "\n" - ] - }, { "data": { "text/html": [ @@ -71,7 +61,8 @@ ], "source": [ "# this makes the notebook expand to full width of the browser window\n", - "from IPython.core.display import display, HTML\n", + "from IPython.core.display import HTML\n", + "from IPython.display import display\n", "display(HTML(\"\"))" ] }, @@ -152,28 +143,24 @@ "\n", "The most important choice you have to make here is whether you just want to look at time series behavior for the averages, or also to that of the weights - this is controlled by the `fit_sizes` parameter. 
`max_depth` works as usual, controlling the maximal number of dimensions any segment can constrain.\n", "\n", - "**explain_timeseries**: Find the most unusual segments in the timeseries\n", + "**explain_timeseries**: \n", + "\n", + "This function divides a time series panel dataset into segments that are as distinct as possible.\n", + "\n", + "Parameters:\n", + "\n", + "- **df**: A pandas DataFrame with the time series data.\n", + "- **dims**: Discrete dimensions to segment by.\n", + "- **total_name**: Name of the column containing totals.\n", + "- **time_name**: Name of the column containing the time values.\n", + "- **num_segments**: Number of segments to find.\n", + "- **size_name** (Optional): Name of the column containing the size of the segment.\n", + "- **max_depth** (Optional, defaults to 2): Maximum number of dimensions to constrain per segment.\n", + "- **fit_sizes** (Optional): Whether to fit the sizes of the segments or just the averages.\n", + "- **n_jobs** (Optional, defaults to 10): Number of jobs to run in parallel when finding segments.\n", + "- **num_breaks** (Optional, defaults to 3): Number of breaks in the stylized time series used for comparing segments.\n", + "\n", "\n", - "- `df`: Dataset\n", - "- `dims`: List of discrete dimensions\n", - "- `total_name`: Name of column that contains totals per segment\n", - "- `size_name`: Name of column containing segment sizes\n", - "- `min_segments`: Minimum number of segments to find\n", - "- `max_segments`: Maximum number of segments to find, defaults to min_segments\n", - "- `min_depth`: Minimum number of dimension to constrain in segment definition\n", - "- `max_depth`: Maximum number of dimension to constrain in segment definition\n", - "- `solver`: If this equals to \"lp\" uses the LP solver, else uses the (recommended) Lasso solver\n", - " - `\"lasso\"`: Lasso-based finder of unusual segments\n", - " - `\"lp\"`: LP-based finder of unusual segments\n", - "- `cluster_values`: In addition to single-value slices, consider slices that consist of a\n", - " group of segments from the same dimension with similar naive averages\n", - " - `True`: to use cluster values, you can them using `sf.relevant_cluster_names`\n", - " - `False`: to use simple segments\n", - "- `verbose`: If set to a truish value, lots of debug info is printed to console, also you can check progressbar\n", - " - `True`: to get info\n", - " - `False`: to get result without info\n", - " \n", - " \n", "- Use `.plot()` to see the plot after fitting:\n", " - `plot_is_static`: static (True) or dynamic (False) plotly result\n", " - `True`: to get static plots (Doesn't work on all platforms yet)\n", @@ -184,43 +171,47 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 7, "id": "0d57a44a", "metadata": { "scrolled": false }, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Adding node 1...\n", - "Done!\n", - "Adding node 2...\n", - "Done!\n", - "Adding node 3...\n", - "Done!\n", - "Adding node 4...\n", - "Done!\n", - "Adding node 5...\n", - "Done!\n", - "Adding node 6...\n", - "Done!\n", - "0 {'segment': {'PRODUCT': 'Spend', 'TARGET_CURRENCY': 'TARGET_CURRENCY_cluster_2'}, 'index': 5, 'orig_i': 5, 'total': 22349930640.964485, 'seg_size': 58287375.0, 'naive_avg': 383.44376704156747, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", - "1 {'segment': {'PRODUCT': 'Credit', 'SOURCE_CURRENCY': 'SOURCE_CURRENCY_cluster_1'}, 'index': 0, 'orig_i': 0, 'total': 9418280996.490507, 'seg_size': 23444615.0, 'naive_avg': 401.7247029431069, 
'dummy': array([1, 1, 1, ..., 0, 0, 0])}\n", - "2 {'segment': {'PRODUCT': 'Spend', 'TARGET_CURRENCY': 'BRL;CLP;MAD;NZD;XOF'}, 'index': 2, 'orig_i': 2, 'total': 1765844363.260038, 'seg_size': 2330795.0, 'naive_avg': 757.6146178707427, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", - "3 {'segment': {'PRODUCT': 'Transfer', 'TARGET_CURRENCY': 'TARGET_CURRENCY_cluster_2'}, 'index': 6, 'orig_i': 6, 'total': 1760778920.4430783, 'seg_size': 8148355.0, 'naive_avg': 216.09010903956423, 'dummy': array([0, 0, 0, ..., 1, 1, 1])}\n", - "4 {'segment': {'PRODUCT': 'Transfer', 'TARGET_CURRENCY': 'TARGET_CURRENCY_cluster_1'}, 'index': 4, 'orig_i': 4, 'total': 702549273.7995473, 'seg_size': 3479735.0, 'naive_avg': 201.89734959689383, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", - "5 {'segment': {'PRODUCT': 'Spend', 'TARGET_CURRENCY': 'TARGET_CURRENCY_cluster_3'}, 'index': 3, 'orig_i': 3, 'total': 408786271.34145737, 'seg_size': 815440.0, 'naive_avg': 501.30760244954547, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", - "6 {'segment': {'PRODUCT': 'Credit', 'SOURCE_CURRENCY': 'UYU'}, 'index': 1, 'orig_i': 1, 'total': 145067544.57249302, 'seg_size': 320370.0, 'naive_avg': 452.81251232166875, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n" - ] + "data": { + "text/html": [ + " \n", + " " + ] + }, + "metadata": {}, + "output_type": "display_data" }, { "data": { "text/html": [ - "
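Aside: a synthetic end-to-end usage sketch of the post-patch explain_timeseries signature as documented in the notebook cell and test scripts above (num_segments in place of min/max_segments, the tree solver, and fit_sizes). The data, column names, and parameter values below are invented for illustration; only the argument names and the .plot()/.summary() calls are taken from the patches.

# Synthetic usage sketch of explain_timeseries, not part of the patch.
import numpy as np
import pandas as pd
from wise_pizza import explain_timeseries

rng = np.random.default_rng(42)
months = pd.date_range("2023-01-01", periods=12, freq="MS")
rows = []
for product in ["Spend", "Credit", "Transfer"]:
    for region in ["EU", "US", "APAC"]:
        base = rng.uniform(50, 500)
        for i, month in enumerate(months):
            size = rng.integers(1_000, 10_000)
            # give one segment a distinctive trend so there is something to find
            trend = 1.0 + 0.05 * i if (product, region) == ("Spend", "EU") else 1.0
            rows.append(
                {
                    "PRODUCT": product,
                    "REGION": region,
                    "ACTION_YM": month,
                    "NUM_CUSTOMERS": size,
                    "VOLUME_GBP": base * trend * size,
                }
            )
df = pd.DataFrame(rows)

sf = explain_timeseries(
    df=df,
    dims=["PRODUCT", "REGION"],
    total_name="VOLUME_GBP",
    size_name="NUM_CUSTOMERS",
    time_name="ACTION_YM",
    num_segments=5,
    max_depth=2,
    solver="tree",
    fit_sizes=True,
    verbose=False,
)
sf.plot(plot_is_static=False, height=1500, width=1000, average_name="avg")
print(sf.summary())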