diff --git a/notebooks/Finding interesting segments in time series.ipynb b/notebooks/Finding interesting segments in time series.ipynb index 7838f7a..cc1ad30 100644 --- a/notebooks/Finding interesting segments in time series.ipynb +++ b/notebooks/Finding interesting segments in time series.ipynb @@ -50,7 +50,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\EgorKraev\\AppData\\Local\\Temp\\ipykernel_29848\\3308931027.py:2: DeprecationWarning:\n", + "C:\\Users\\EgorKraev\\AppData\\Local\\Temp\\ipykernel_12296\\3308931027.py:2: DeprecationWarning:\n", "\n", "Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython.display\n", "\n" @@ -150,6 +150,8 @@ "source": [ "# Finding the juiciest slices\n", "\n", + "The most important choice you have to make here is whether you just want to look at time series behavior for the averages, or also to that of the weights - this is controlled by the `fit_sizes` parameter. `max_depth` works as usual, controlling the maximal number of dimensions any segment can constrain.\n", + "\n", "**explain_timeseries**: Find the most unusual segments in the timeseries\n", "\n", "- `df`: Dataset\n", @@ -182,29 +184,12 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 13, "id": "0d57a44a", "metadata": { "scrolled": false }, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "yay!\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\EgorKraev\\Documents\\Code\\wise-pizza\\wise_pizza\\slicer.py:213: UserWarning:\n", - "\n", - "Ignoring cluster_values argument as tree solver makes its own clusters\n", - "\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -221,50 +206,21 @@ "Done!\n", "Adding node 6...\n", "Done!\n", - "0 {'segment': {'SOURCE_CURRENCY': 'SOURCE_CURRENCY_cluster_2', 'PRODUCT': 'Credit;Spend'}, 'index': 1, 'orig_i': 1, 'total': 8215853399.18479, 'seg_size': 28914190.0, 'naive_avg': 284.14606804426444, 'dummy': array([1, 1, 1, ..., 0, 0, 0])}\n", - "1 {'segment': {'SOURCE_CURRENCY': 'SOURCE_CURRENCY_cluster_4'}, 'index': 5, 'orig_i': 5, 'total': 5407855256.179745, 'seg_size': 39528930.0, 'naive_avg': 136.807529477265, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", - "2 {'segment': {'SOURCE_CURRENCY': 'PGK;SHP', 'PRODUCT': 'Credit;Spend'}, 'index': 4, 'orig_i': 4, 'total': 1496636816.867681, 'seg_size': 4120685.0, 'naive_avg': 363.20097674723525, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", - "3 {'segment': {'SOURCE_CURRENCY': 'CZK', 'PRODUCT': 'Spend'}, 'index': 3, 'orig_i': 3, 'total': 1253650889.0440717, 'seg_size': 2018425.0, 'naive_avg': 621.1035282678681, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", - "4 {'segment': {'SOURCE_CURRENCY': 'SOURCE_CURRENCY_cluster_3'}, 'index': 6, 'orig_i': 6, 'total': 930638724.9999521, 'seg_size': 12992130.0, 'naive_avg': 71.63095851103338, 'dummy': array([0, 0, 0, ..., 1, 1, 1])}\n", - "5 {'segment': {'SOURCE_CURRENCY': 'CZK', 'PRODUCT': 'Credit'}, 'index': 2, 'orig_i': 2, 'total': 801995917.6628782, 'seg_size': 1962900.0, 'naive_avg': 408.577063356706, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", - "6 {'segment': {'SOURCE_CURRENCY': 'SOURCE_CURRENCY_cluster_5', 'PRODUCT': 'Transfer'}, 'index': 0, 'orig_i': 0, 'total': 168988001.49668851, 'seg_size': 7289425.0, 'naive_avg': 23.18262434920292, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n" + "0 {'segment': {'PRODUCT': 'Spend', 'TARGET_CURRENCY': 'TARGET_CURRENCY_cluster_2'}, 'index': 5, 'orig_i': 5, 'total': 
22349930640.964485, 'seg_size': 58287375.0, 'naive_avg': 383.44376704156747, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", + "1 {'segment': {'PRODUCT': 'Credit', 'SOURCE_CURRENCY': 'SOURCE_CURRENCY_cluster_1'}, 'index': 0, 'orig_i': 0, 'total': 9418280996.490507, 'seg_size': 23444615.0, 'naive_avg': 401.7247029431069, 'dummy': array([1, 1, 1, ..., 0, 0, 0])}\n", + "2 {'segment': {'PRODUCT': 'Spend', 'TARGET_CURRENCY': 'BRL;CLP;MAD;NZD;XOF'}, 'index': 2, 'orig_i': 2, 'total': 1765844363.260038, 'seg_size': 2330795.0, 'naive_avg': 757.6146178707427, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", + "3 {'segment': {'PRODUCT': 'Transfer', 'TARGET_CURRENCY': 'TARGET_CURRENCY_cluster_2'}, 'index': 6, 'orig_i': 6, 'total': 1760778920.4430783, 'seg_size': 8148355.0, 'naive_avg': 216.09010903956423, 'dummy': array([0, 0, 0, ..., 1, 1, 1])}\n", + "4 {'segment': {'PRODUCT': 'Transfer', 'TARGET_CURRENCY': 'TARGET_CURRENCY_cluster_1'}, 'index': 4, 'orig_i': 4, 'total': 702549273.7995473, 'seg_size': 3479735.0, 'naive_avg': 201.89734959689383, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", + "5 {'segment': {'PRODUCT': 'Spend', 'TARGET_CURRENCY': 'TARGET_CURRENCY_cluster_3'}, 'index': 3, 'orig_i': 3, 'total': 408786271.34145737, 'seg_size': 815440.0, 'naive_avg': 501.30760244954547, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n", + "6 {'segment': {'PRODUCT': 'Credit', 'SOURCE_CURRENCY': 'UYU'}, 'index': 1, 'orig_i': 1, 'total': 145067544.57249302, 'seg_size': 320370.0, 'naive_avg': 452.81251232166875, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n" ] }, { "data": { "text/html": [ - " \n", - " " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'SOURCE_CURRENCY_cluster_1': 'AFN, ARS, AUD, BDT, BGN, BHD, BMD, BRL, BSD, '\n", + " 'BWP, CDF, CHF, CNY, COP, CVE, EUR, GEL, GNF, '\n", + " 'HKD, HRK, INR, ISK, JOD, KGS, KMF, LSL, LYD, '\n", + " 'MKD, MUR, MVR, MWK, MXN, NOK, NZD, OMR, PEN, '\n", + " 'PGK, PHP, PLN, PYG, SBD, SEK, SGD, SHP, SRD, '\n", + " 'SZL, TMT, TND, UGX, USD, VND, XAF, XOF',\n", + " 'TARGET_CURRENCY_cluster_1': 'COP, GEL, NPR, PHP, RON, SGD, USD, VND, ZMW',\n", + " 'TARGET_CURRENCY_cluster_2': 'AED, ARS, AUD, BDT, BRL, BWP, CAD, CHF, CLP, '\n", + " 'CNY, CRC, CZK, DKK, EGP, EUR, GBP, GHS, HKD, '\n", + " 'HRK, HUF, IDR, ILS, INR, JPY, KES, KRW, LKR, '\n", + " 'MAD, MXN, MYR, NGN, NOK, NZD, PEN, PKR, PLN, '\n", + " 'RUB, SEK, THB, TRY, TZS, UAH, UGX, UYU, XOF, '\n", + " 'ZAR'}\n" ] } ], "source": [ - "import json, pprint\n", - "pprint.pprint(json.loads(sf.summary()))" + "# And here is a run that jointly segments by the trends in the averages and the segment sizes\n", + "\n", + "sf = explain_timeseries(\n", + " df=df,\n", + " dims=dims,\n", + " num_segments=7,\n", + " max_depth=2,\n", + " total_name=totals,\n", + " size_name=size,\n", + " time_name=time,\n", + " verbose=False,\n", + " solver=\"tree\",\n", + " fit_sizes=True,\n", + ")\n", + "sf.plot(plot_is_static=False, height=1500, width=1000, average_name=\"VPC\")\n", + "pprint(sf.relevant_cluster_names)" ] }, { @@ -402,9 +408,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:wise-pizza3.10]", + "display_name": "Python [conda env:wise-pizza3.11]", "language": "python", - "name": "conda-env-wise-pizza3.10-py" + "name": "conda-env-wise-pizza3.11-py" }, "language_info": { "codemirror_mode": { @@ -416,7 +422,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.15" + "version": "3.11.10" } }, "nbformat": 4, diff --git a/tests/timeseries_wip_entrypoint.py b/tests/timeseries_wip_entrypoint.py index fffb4fc..43dff46 100644 --- a/tests/timeseries_wip_entrypoint.py +++ b/tests/timeseries_wip_entrypoint.py @@ -37,7 +37,7 @@ time_name=time, verbose=False, solver="tree", - fit_sizes=False, + fit_sizes=True, ) sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC") print(sf.summary()) diff --git a/tests/timeseries_wip_entrypoint_2.py b/tests/timeseries_wip_entrypoint_2.py new file mode 100644 index 0000000..809092f --- /dev/null +++ b/tests/timeseries_wip_entrypoint_2.py @@ -0,0 +1,47 @@ +import os, sys +import pandas as pd + +root_path = os.path.realpath("../..") +print(root_path) + +# this assumes that all of the following files are checked in the same directory +sys.path.append(os.path.join(root_path, "wise-pizza")) + +# create data-related directories +data_dir = os.path.realpath(os.path.join(root_path, "wise-pizza/data")) +if not os.path.isdir(data_dir): + os.mkdir(data_dir) +print(data_dir) + +from wise_pizza import explain_timeseries + +df = pd.read_csv( + os.path.join(data_dir, "volume_data_new.csv") +) # replace this variable with your data +dims = [ + "CUSTOMER_TYPE", + "STRATEGIC_PRODUCT", + "SOURCE_CURRENCY", + "TARGET_CURRENCY", + "PRODUCT_USE_CASE", + "REGION", + "TRANS_VOL_BUCKET", +] # dimensions to find segments +totals = "VOLUME_GBP" # value to analyze +size = "NUM_CUSTOMERS" #'NUM_TRANSACTIONS' # number of objects +time = "ACTION_YM" +sf = explain_timeseries( + df=df, + dims=dims, + max_segments=7, + max_depth=2, + total_name=totals, + 
size_name=size, + time_name=time, + verbose=False, + solver="tree", + fit_sizes=True, +) +sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC") +print(sf.summary()) +print("yay!") diff --git a/wise_pizza/explain.py b/wise_pizza/explain.py index f6c07b5..90bed7d 100644 --- a/wise_pizza/explain.py +++ b/wise_pizza/explain.py @@ -367,9 +367,7 @@ def explain_timeseries( total_name: str, time_name: str, size_name: Optional[str] = None, - min_segments: int = None, - max_segments: int = None, - min_depth: int = 1, + num_segments: int = None, max_depth: int = 2, solver: str = "tree", verbose: bool = False, @@ -377,6 +375,7 @@ def explain_timeseries( fit_log_space: bool = False, fit_sizes: Optional[bool] = None, num_breaks: int = 2, + ignore_averages: bool = True, log_space_weight_sc: float = 0.5, ): assert ( @@ -450,16 +449,31 @@ def explain_timeseries( time_basis = ( pd.concat([time_basis, re_basis], axis=0).fillna(0.0).reset_index(drop=True) ) - print("yay!") groupby_dims = ["chunk", "__time"] else: groupby_dims = ["__time"] df2["_target"] = df2[total_name] df2["__time"] = df2[time_name] - df2["total_adjustment"] = 0.0 - avg_df = 0.0 - average = 0.0 + + # Adds the column of the time average over each dimension combination + if ignore_averages: + df2, avg_df = add_average_over_time( + df2, + dims=dims, + total_name=total_name, + size_name=size_name, + time_name="__time", + groupby_dims=groupby_dims, + cartesian=False, + ) + else: + df2["total_adjustment"] = 0.0 + avg_df = None + + # The join in the above function could have messed up the ordering + df2 = df2.sort_values(by=dims + groupby_dims) + average = df2[total_name].sum() / df2[size_name].sum() sf = SliceFinder() sf.global_average = average @@ -468,16 +482,14 @@ def explain_timeseries( sf.time_name = time_name sf.y_adj = df2["total_adjustment"].values sf.avg_df = avg_df - sf.time_values = df2[time_name].unique() + sf.time_values = df2["__time"].unique() sf.fit( - df2[dims + groupby_dims], - df2["_target"], - time_col=df2[time_name], + df2[dims + groupby_dims + ["total_adjustment"]], + df2[total_name], + time_col=df2["__time"], time_basis=time_basis, weights=df2[size_name], - min_segments=min_segments, - max_segments=max_segments, - min_depth=min_depth, + max_segments=num_segments, max_depth=max_depth, solver=solver, verbose=verbose, diff --git a/wise_pizza/plotting_time_tree.py b/wise_pizza/plotting_time_tree.py index 04efa5a..ef0fa76 100644 --- a/wise_pizza/plotting_time_tree.py +++ b/wise_pizza/plotting_time_tree.py @@ -158,3 +158,7 @@ def simple_ts_plot( row=row_num, col=col_num, ) + fig.update_layout( + xaxis=dict(autorange=True), + yaxis=dict(autorange=True) + ) \ No newline at end of file diff --git a/wise_pizza/slicer.py b/wise_pizza/slicer.py index a916a61..34ff3d4 100644 --- a/wise_pizza/slicer.py +++ b/wise_pizza/slicer.py @@ -143,6 +143,9 @@ def fit( group of segments from the same dimension with similar naive averages """ + dim_df = dim_df.copy() + if groupby_dims is None: + groupby_dims = [] assert solver.lower() in ["lasso", "tree", "omp", "lp"] min_segments, max_segments = clean_up_min_max(min_segments, max_segments) @@ -160,18 +163,20 @@ def fit( assert np.sum(np.abs(totals[weights == 0])) == 0 # Cast all dimension values to strings - dim_df = dim_df.astype(str) + for c in dim_df.columns: + if c not in groupby_dims + ["total_adjustment"]: + dim_df[c] = dim_df[c].astype(str) dims = list(dim_df.columns) - if groupby_dims is not None: - dims = [d for d in dims if d not in groupby_dims] + if groupby_dims: + 
dims = [d for d in dims if d not in groupby_dims + ["total_adjustment"]] # sort the dataframe by dimension values, # making sure the other vectors stay aligned dim_df = dim_df.reset_index(drop=True) dim_df["totals"] = totals dim_df["weights"] = weights - if groupby_dims is not None: + if groupby_dims: dim_df = pd.merge(dim_df, time_basis, on=groupby_dims) sort_dims = dims + groupby_dims else: diff --git a/wise_pizza/solve/fitter.py b/wise_pizza/solve/fitter.py index f002cdd..a1b3c64 100644 --- a/wise_pizza/solve/fitter.py +++ b/wise_pizza/solve/fitter.py @@ -48,7 +48,6 @@ def debug_plot(X, y, y_pred, w): plt.plot(X_agg["y_pred"] / X_agg["weights"], label="y_pred") plt.legend() plt.show() - print("yay!") class TimeFitterModel(ABC): diff --git a/wise_pizza/solve/partition.py b/wise_pizza/solve/partition.py index ab820fe..5ea3381 100644 --- a/wise_pizza/solve/partition.py +++ b/wise_pizza/solve/partition.py @@ -42,9 +42,15 @@ def target_encoding_partitions(df: pd.DataFrame, dim: str, num_bins: int): return partitions -def kmeans_partition(df: pd.DataFrame, dim: str, groupby_dims: List[str]): +def kmeans_partition( + df: pd.DataFrame, + dim: str, + groupby_dims: List[str], + normalize_averages: bool = False, +): assert len(df[dim].unique()) >= 3 # Get split candidates + # Get time profiles split by the dimension we are evaluating agg_df = df.groupby([dim] + groupby_dims, as_index=False).sum() agg_df["__avg"] = agg_df["totals"] / agg_df["weights"] pivot_df = agg_df.pivot( @@ -57,16 +63,31 @@ def kmeans_partition(df: pd.DataFrame, dim: str, groupby_dims: List[str]): for chunk in ["Average", "Weights"]: this_df = pivot_df[pivot_df["chunk"] == chunk] nice_values = fill_gaps(this_df[value_cols].values) - if chunk == "Weights": - nice_values = ( - np.mean(nice_mats["Average"]) - * nice_values - / np.sum(nice_values, axis=0, keepdims=True) + + if normalize_averages: + # Normalize both subsegments separately: weights and averages + nice_values /= ( + np.linalg.norm(nice_values, ord=2, axis=0, keepdims=True) + 1e-6 ) + else: + if chunk == "Weights": + nice_values = ( + np.mean(nice_mats["Average"]) + * nice_values + / ( + np.linalg.norm(nice_values, ord=2, axis=0, keepdims=True) + + 1e-6 + ) + ) nice_mats[chunk] = nice_values joint_mat = np.concatenate([nice_mats["Average"], nice_mats["Weights"]], axis=0) else: - joint_mat = fill_gaps(pivot_df[value_cols].values) + nice_values = fill_gaps(pivot_df[value_cols].values) + if normalize_averages: + nice_values /= ( + np.linalg.norm(nice_values, ord=2, axis=0, keepdims=True) + 1e-6 + ) + joint_mat = nice_values weights = pivot_df[value_cols].T.sum(axis=1) vector_dict = {} @@ -109,12 +130,20 @@ def weighted_kmeans_two_clusters(data_dict, tol=1e-4, max_iter=100, max_retries= break # Update centroids with weighted averages - new_centroids = np.array( - [ - np.average(data[labels == i], axis=0, weights=weights[labels == i]) - for i in range(2) - ] - ) + try: + new_centroids = np.array( + [ + np.average( + data[labels == i], axis=0, weights=weights[labels == i] + ) + for i in range(2) + ] + ) + except ZeroDivisionError: + print( + f"Zero division error detected on retry {retry + 1}, reinitializing centroids." 
+ ) + break # Check for convergence if np.linalg.norm(new_centroids - centroids) < tol: @@ -140,7 +169,7 @@ def fill_gaps(x: np.ndarray, num_iter=50): nice_marg = interpolate_and_extrapolate(marg) tile_marg = np.tile(nice_marg, (x.shape[1], 1)).T tile_marg[nans] = np.nan - reg = np.nanmedian(x) * 1e-6 + reg = np.nanmedian(x) * 1e-6 + 1e-6 coeffs = (np.nansum(x * tile_marg, axis=0) + reg) / ( np.nansum(tile_marg * tile_marg, axis=0) + reg ) diff --git a/wise_pizza/solve/tree.py b/wise_pizza/solve/tree.py index 6292a17..c5cb15f 100644 --- a/wise_pizza/solve/tree.py +++ b/wise_pizza/solve/tree.py @@ -31,6 +31,7 @@ def tree_solver( """ df = dim_df.copy().reset_index(drop=True) + df["totals"] -= df["total_adjustment"] df["__avg"] = df["totals"] / df["weights"] df["__avg"] = df["__avg"].fillna(df["__avg"].mean()) @@ -56,6 +57,10 @@ def tree_solver( re_df = pd.concat([leaf.df for leaf in leaves]).sort_values( dims + fitter.groupby_dims ) + # Put back the averages over time by segment + re_df["prediction"] += re_df["total_adjustment"] / re_df["weights"] + + # re_df["totals"] += re_df["total_adjustment"] if len(fitter.groupby_dims) == 2: # Time series with weights re_df_w = re_df[re_df["chunk"] == "Weights"].copy() diff --git a/wise_pizza/time.py b/wise_pizza/time.py index e50e569..1c4b8cc 100644 --- a/wise_pizza/time.py +++ b/wise_pizza/time.py @@ -90,36 +90,48 @@ def add_average_over_time( total_name: str, size_name: str, time_name: str, + groupby_dims: List[str] = None, cartesian: bool = False, ) -> Tuple[pd.DataFrame, pd.DataFrame]: - avgs = df[dims + [total_name, size_name]].groupby(dims, as_index=False).sum() - avgs["avg"] = avgs[total_name] / avgs[size_name] - if cartesian: - # make sure that the cartesian product of dimension combinations x time is present, - # without changing the totals - times = df[[time_name]].groupby(time_name, as_index=False).sum() - times["key"] = 1 - avgs["key"] = 1 - cartesian_df = pd.merge(avgs, times, on="key").drop(columns=["key"]) - joined = pd.merge( - df, - cartesian_df[dims + [time_name]], - on=dims + [time_name], - how="right", - ) - joined[size_name] = joined[size_name].fillna( - np.nanmean(joined[size_name].values) - ) - joined[total_name] = joined[total_name].fillna(0.0) - df = joined - - avgs = df[dims + [total_name, size_name]].groupby(dims, as_index=False).sum() + groupby_dims = groupby_dims or [time_name] + + # get the average of the total over time + group_dims = dims + [c for c in groupby_dims if c != time_name] + avgs = ( + df[group_dims + [total_name, size_name]] + .groupby(group_dims, as_index=False) + .sum() + ) + avgs["avg"] = avgs[total_name] / avgs[size_name] - joined = pd.merge(df, avgs[dims + ["avg"]], on=dims) + # if cartesian: + # # make sure that the cartesian product of dimension combinations x time is present, + # # without changing the totals + # times = df[[time_name]].groupby(time_name, as_index=False).sum() + # times["key"] = 1 + # avgs["key"] = 1 + # cartesian_df = pd.merge(avgs, times, on="key").drop(columns=["key"]) + # joined = pd.merge( + # df, + # cartesian_df[dims + [time_name]], + # on=dims + [time_name], + # how="right", + # ) + # joined[size_name] = joined[size_name].fillna( + # np.nanmean(joined[size_name].values) + # ) + # joined[total_name] = joined[total_name].fillna(0.0) + # df = joined + + # avgs = df[dims + [total_name, size_name]].groupby(dims, as_index=False).sum() + # avgs["avg"] = avgs[total_name] / avgs[size_name] + + joined = pd.merge(df, avgs[group_dims + ["avg"]], on=group_dims) joined["total_adjustment"] 
= joined[size_name] * joined["avg"]
-    out = joined[dims + [total_name, size_name, time_name, "total_adjustment"]]
-    tmp = out[dims + [total_name, "total_adjustment"]].groupby(dims).sum()
+
+    out = joined[group_dims + [total_name, size_name, time_name, "total_adjustment"]]
+    tmp = out[group_dims + [total_name, "total_adjustment"]].groupby(dims).sum()
     assert (tmp[total_name] - tmp["total_adjustment"]).abs().sum() < 1e-6 * df[
         total_name
     ].abs().max()