diff --git a/notebooks/Finding interesting segments in time series.ipynb b/notebooks/Finding interesting segments in time series.ipynb
index 7838f7a..cc1ad30 100644
--- a/notebooks/Finding interesting segments in time series.ipynb
+++ b/notebooks/Finding interesting segments in time series.ipynb
@@ -50,7 +50,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "C:\\Users\\EgorKraev\\AppData\\Local\\Temp\\ipykernel_29848\\3308931027.py:2: DeprecationWarning:\n",
+ "C:\\Users\\EgorKraev\\AppData\\Local\\Temp\\ipykernel_12296\\3308931027.py:2: DeprecationWarning:\n",
"\n",
"Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython.display\n",
"\n"
@@ -150,6 +150,8 @@
"source": [
"# Finding the juiciest slices\n",
"\n",
+ "The most important choice you have to make here is whether you just want to look at time series behavior for the averages, or also to that of the weights - this is controlled by the `fit_sizes` parameter. `max_depth` works as usual, controlling the maximal number of dimensions any segment can constrain.\n",
+ "\n",
"**explain_timeseries**: Find the most unusual segments in the timeseries\n",
"\n",
"- `df`: Dataset\n",
@@ -182,29 +184,12 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 13,
"id": "0d57a44a",
"metadata": {
"scrolled": false
},
"outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "yay!\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "C:\\Users\\EgorKraev\\Documents\\Code\\wise-pizza\\wise_pizza\\slicer.py:213: UserWarning:\n",
- "\n",
- "Ignoring cluster_values argument as tree solver makes its own clusters\n",
- "\n"
- ]
- },
{
"name": "stdout",
"output_type": "stream",
@@ -221,50 +206,21 @@
"Done!\n",
"Adding node 6...\n",
"Done!\n",
- "0 {'segment': {'SOURCE_CURRENCY': 'SOURCE_CURRENCY_cluster_2', 'PRODUCT': 'Credit;Spend'}, 'index': 1, 'orig_i': 1, 'total': 8215853399.18479, 'seg_size': 28914190.0, 'naive_avg': 284.14606804426444, 'dummy': array([1, 1, 1, ..., 0, 0, 0])}\n",
- "1 {'segment': {'SOURCE_CURRENCY': 'SOURCE_CURRENCY_cluster_4'}, 'index': 5, 'orig_i': 5, 'total': 5407855256.179745, 'seg_size': 39528930.0, 'naive_avg': 136.807529477265, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n",
- "2 {'segment': {'SOURCE_CURRENCY': 'PGK;SHP', 'PRODUCT': 'Credit;Spend'}, 'index': 4, 'orig_i': 4, 'total': 1496636816.867681, 'seg_size': 4120685.0, 'naive_avg': 363.20097674723525, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n",
- "3 {'segment': {'SOURCE_CURRENCY': 'CZK', 'PRODUCT': 'Spend'}, 'index': 3, 'orig_i': 3, 'total': 1253650889.0440717, 'seg_size': 2018425.0, 'naive_avg': 621.1035282678681, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n",
- "4 {'segment': {'SOURCE_CURRENCY': 'SOURCE_CURRENCY_cluster_3'}, 'index': 6, 'orig_i': 6, 'total': 930638724.9999521, 'seg_size': 12992130.0, 'naive_avg': 71.63095851103338, 'dummy': array([0, 0, 0, ..., 1, 1, 1])}\n",
- "5 {'segment': {'SOURCE_CURRENCY': 'CZK', 'PRODUCT': 'Credit'}, 'index': 2, 'orig_i': 2, 'total': 801995917.6628782, 'seg_size': 1962900.0, 'naive_avg': 408.577063356706, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n",
- "6 {'segment': {'SOURCE_CURRENCY': 'SOURCE_CURRENCY_cluster_5', 'PRODUCT': 'Transfer'}, 'index': 0, 'orig_i': 0, 'total': 168988001.49668851, 'seg_size': 7289425.0, 'naive_avg': 23.18262434920292, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n"
+ "0 {'segment': {'PRODUCT': 'Spend', 'TARGET_CURRENCY': 'TARGET_CURRENCY_cluster_2'}, 'index': 5, 'orig_i': 5, 'total': 22349930640.964485, 'seg_size': 58287375.0, 'naive_avg': 383.44376704156747, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n",
+ "1 {'segment': {'PRODUCT': 'Credit', 'SOURCE_CURRENCY': 'SOURCE_CURRENCY_cluster_1'}, 'index': 0, 'orig_i': 0, 'total': 9418280996.490507, 'seg_size': 23444615.0, 'naive_avg': 401.7247029431069, 'dummy': array([1, 1, 1, ..., 0, 0, 0])}\n",
+ "2 {'segment': {'PRODUCT': 'Spend', 'TARGET_CURRENCY': 'BRL;CLP;MAD;NZD;XOF'}, 'index': 2, 'orig_i': 2, 'total': 1765844363.260038, 'seg_size': 2330795.0, 'naive_avg': 757.6146178707427, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n",
+ "3 {'segment': {'PRODUCT': 'Transfer', 'TARGET_CURRENCY': 'TARGET_CURRENCY_cluster_2'}, 'index': 6, 'orig_i': 6, 'total': 1760778920.4430783, 'seg_size': 8148355.0, 'naive_avg': 216.09010903956423, 'dummy': array([0, 0, 0, ..., 1, 1, 1])}\n",
+ "4 {'segment': {'PRODUCT': 'Transfer', 'TARGET_CURRENCY': 'TARGET_CURRENCY_cluster_1'}, 'index': 4, 'orig_i': 4, 'total': 702549273.7995473, 'seg_size': 3479735.0, 'naive_avg': 201.89734959689383, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n",
+ "5 {'segment': {'PRODUCT': 'Spend', 'TARGET_CURRENCY': 'TARGET_CURRENCY_cluster_3'}, 'index': 3, 'orig_i': 3, 'total': 408786271.34145737, 'seg_size': 815440.0, 'naive_avg': 501.30760244954547, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n",
+ "6 {'segment': {'PRODUCT': 'Credit', 'SOURCE_CURRENCY': 'UYU'}, 'index': 1, 'orig_i': 1, 'total': 145067544.57249302, 'seg_size': 320370.0, 'naive_avg': 452.81251232166875, 'dummy': array([0, 0, 0, ..., 0, 0, 0])}\n"
]
},
{
"data": {
"text/html": [
- " \n",
- " "
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "
"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'SOURCE_CURRENCY_cluster_1': 'AFN, ARS, AUD, BDT, BGN, BHD, BMD, BRL, BSD, '\n",
+ " 'BWP, CDF, CHF, CNY, COP, CVE, EUR, GEL, GNF, '\n",
+ " 'HKD, HRK, INR, ISK, JOD, KGS, KMF, LSL, LYD, '\n",
+ " 'MKD, MUR, MVR, MWK, MXN, NOK, NZD, OMR, PEN, '\n",
+ " 'PGK, PHP, PLN, PYG, SBD, SEK, SGD, SHP, SRD, '\n",
+ " 'SZL, TMT, TND, UGX, USD, VND, XAF, XOF',\n",
+ " 'TARGET_CURRENCY_cluster_1': 'COP, GEL, NPR, PHP, RON, SGD, USD, VND, ZMW',\n",
+ " 'TARGET_CURRENCY_cluster_2': 'AED, ARS, AUD, BDT, BRL, BWP, CAD, CHF, CLP, '\n",
+ " 'CNY, CRC, CZK, DKK, EGP, EUR, GBP, GHS, HKD, '\n",
+ " 'HRK, HUF, IDR, ILS, INR, JPY, KES, KRW, LKR, '\n",
+ " 'MAD, MXN, MYR, NGN, NOK, NZD, PEN, PKR, PLN, '\n",
+ " 'RUB, SEK, THB, TRY, TZS, UAH, UGX, UYU, XOF, '\n",
+ " 'ZAR'}\n"
]
}
],
"source": [
- "import json, pprint\n",
- "pprint.pprint(json.loads(sf.summary()))"
+ "# And here is a run that jointly segments by the trends in the averages and the segment sizes\n",
+ "\n",
+ "sf = explain_timeseries(\n",
+ " df=df,\n",
+ " dims=dims,\n",
+ " num_segments=7,\n",
+ " max_depth=2,\n",
+ " total_name=totals,\n",
+ " size_name=size,\n",
+ " time_name=time,\n",
+ " verbose=False,\n",
+ " solver=\"tree\",\n",
+ " fit_sizes=True,\n",
+ ")\n",
+ "sf.plot(plot_is_static=False, height=1500, width=1000, average_name=\"VPC\")\n",
+ "pprint(sf.relevant_cluster_names)"
]
},
{
@@ -402,9 +408,9 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python [conda env:wise-pizza3.10]",
+ "display_name": "Python [conda env:wise-pizza3.11]",
"language": "python",
- "name": "conda-env-wise-pizza3.10-py"
+ "name": "conda-env-wise-pizza3.11-py"
},
"language_info": {
"codemirror_mode": {
@@ -416,7 +422,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.15"
+ "version": "3.11.10"
}
},
"nbformat": 4,
diff --git a/tests/timeseries_wip_entrypoint.py b/tests/timeseries_wip_entrypoint.py
index fffb4fc..43dff46 100644
--- a/tests/timeseries_wip_entrypoint.py
+++ b/tests/timeseries_wip_entrypoint.py
@@ -37,7 +37,7 @@
time_name=time,
verbose=False,
solver="tree",
- fit_sizes=False,
+ fit_sizes=True,
)
sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC")
print(sf.summary())
diff --git a/tests/timeseries_wip_entrypoint_2.py b/tests/timeseries_wip_entrypoint_2.py
new file mode 100644
index 0000000..809092f
--- /dev/null
+++ b/tests/timeseries_wip_entrypoint_2.py
@@ -0,0 +1,47 @@
+import os, sys
+import pandas as pd
+
+root_path = os.path.realpath("../..")
+print(root_path)
+
+# this assumes that all of the following files are checked out in the same directory
+sys.path.append(os.path.join(root_path, "wise-pizza"))
+
+# create data-related directories
+data_dir = os.path.realpath(os.path.join(root_path, "wise-pizza/data"))
+if not os.path.isdir(data_dir):
+ os.mkdir(data_dir)
+print(data_dir)
+
+from wise_pizza import explain_timeseries
+
+df = pd.read_csv(
+ os.path.join(data_dir, "volume_data_new.csv")
+) # replace this variable with your data
+dims = [
+ "CUSTOMER_TYPE",
+ "STRATEGIC_PRODUCT",
+ "SOURCE_CURRENCY",
+ "TARGET_CURRENCY",
+ "PRODUCT_USE_CASE",
+ "REGION",
+ "TRANS_VOL_BUCKET",
+] # dimensions to find segments
+totals = "VOLUME_GBP" # value to analyze
+size = "NUM_CUSTOMERS" #'NUM_TRANSACTIONS' # number of objects
+time = "ACTION_YM"
+sf = explain_timeseries(
+ df=df,
+ dims=dims,
+ num_segments=7,
+ max_depth=2,
+ total_name=totals,
+ size_name=size,
+ time_name=time,
+ verbose=False,
+ solver="tree",
+ fit_sizes=True,
+)
+sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC")
+print(sf.summary())
+print("yay!")
diff --git a/wise_pizza/explain.py b/wise_pizza/explain.py
index f6c07b5..90bed7d 100644
--- a/wise_pizza/explain.py
+++ b/wise_pizza/explain.py
@@ -367,9 +367,7 @@ def explain_timeseries(
total_name: str,
time_name: str,
size_name: Optional[str] = None,
- min_segments: int = None,
- max_segments: int = None,
- min_depth: int = 1,
+ num_segments: int = None,
max_depth: int = 2,
solver: str = "tree",
verbose: bool = False,
@@ -377,6 +375,7 @@ def explain_timeseries(
fit_log_space: bool = False,
fit_sizes: Optional[bool] = None,
num_breaks: int = 2,
+ ignore_averages: bool = True,  # if True, explain deviations from each segment's average over time
log_space_weight_sc: float = 0.5,
):
assert (
@@ -450,16 +449,31 @@ def explain_timeseries(
time_basis = (
pd.concat([time_basis, re_basis], axis=0).fillna(0.0).reset_index(drop=True)
)
- print("yay!")
groupby_dims = ["chunk", "__time"]
else:
groupby_dims = ["__time"]
df2["_target"] = df2[total_name]
df2["__time"] = df2[time_name]
- df2["total_adjustment"] = 0.0
- avg_df = 0.0
- average = 0.0
+
+ # Add a column with the average over time for each dimension combination
+ if ignore_averages:
+ df2, avg_df = add_average_over_time(
+ df2,
+ dims=dims,
+ total_name=total_name,
+ size_name=size_name,
+ time_name="__time",
+ groupby_dims=groupby_dims,
+ cartesian=False,
+ )
+ else:
+ df2["total_adjustment"] = 0.0
+ avg_df = None
+
+ # The merge inside add_average_over_time may have changed the row ordering, so restore it
+ df2 = df2.sort_values(by=dims + groupby_dims)
+ average = df2[total_name].sum() / df2[size_name].sum()
sf = SliceFinder()
sf.global_average = average
@@ -468,16 +482,14 @@ def explain_timeseries(
sf.time_name = time_name
sf.y_adj = df2["total_adjustment"].values
sf.avg_df = avg_df
- sf.time_values = df2[time_name].unique()
+ sf.time_values = df2["__time"].unique()
sf.fit(
- df2[dims + groupby_dims],
- df2["_target"],
- time_col=df2[time_name],
+ df2[dims + groupby_dims + ["total_adjustment"]],
+ df2[total_name],
+ time_col=df2["__time"],
time_basis=time_basis,
weights=df2[size_name],
- min_segments=min_segments,
- max_segments=max_segments,
- min_depth=min_depth,
+ max_segments=num_segments,
max_depth=max_depth,
solver=solver,
verbose=verbose,
diff --git a/wise_pizza/plotting_time_tree.py b/wise_pizza/plotting_time_tree.py
index 04efa5a..ef0fa76 100644
--- a/wise_pizza/plotting_time_tree.py
+++ b/wise_pizza/plotting_time_tree.py
@@ -158,3 +158,7 @@ def simple_ts_plot(
row=row_num,
col=col_num,
)
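+ # Re-enable autorange so the axes rescale to fit the traces added above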
+ fig.update_layout(
+ xaxis=dict(autorange=True),
+ yaxis=dict(autorange=True)
+ )
\ No newline at end of file
diff --git a/wise_pizza/slicer.py b/wise_pizza/slicer.py
index a916a61..34ff3d4 100644
--- a/wise_pizza/slicer.py
+++ b/wise_pizza/slicer.py
@@ -143,6 +143,9 @@ def fit(
group of segments from the same dimension with similar naive averages
"""
+ dim_df = dim_df.copy()
+ if groupby_dims is None:
+ groupby_dims = []
assert solver.lower() in ["lasso", "tree", "omp", "lp"]
min_segments, max_segments = clean_up_min_max(min_segments, max_segments)
@@ -160,18 +163,20 @@ def fit(
assert np.sum(np.abs(totals[weights == 0])) == 0
# Cast all dimension values to strings
- dim_df = dim_df.astype(str)
+ for c in dim_df.columns:
+ if c not in groupby_dims + ["total_adjustment"]:
+ dim_df[c] = dim_df[c].astype(str)
dims = list(dim_df.columns)
- if groupby_dims is not None:
- dims = [d for d in dims if d not in groupby_dims]
+ if groupby_dims:
+ dims = [d for d in dims if d not in groupby_dims + ["total_adjustment"]]
# sort the dataframe by dimension values,
# making sure the other vectors stay aligned
dim_df = dim_df.reset_index(drop=True)
dim_df["totals"] = totals
dim_df["weights"] = weights
- if groupby_dims is not None:
+ if groupby_dims:
dim_df = pd.merge(dim_df, time_basis, on=groupby_dims)
sort_dims = dims + groupby_dims
else:
diff --git a/wise_pizza/solve/fitter.py b/wise_pizza/solve/fitter.py
index f002cdd..a1b3c64 100644
--- a/wise_pizza/solve/fitter.py
+++ b/wise_pizza/solve/fitter.py
@@ -48,7 +48,6 @@ def debug_plot(X, y, y_pred, w):
plt.plot(X_agg["y_pred"] / X_agg["weights"], label="y_pred")
plt.legend()
plt.show()
- print("yay!")
class TimeFitterModel(ABC):
diff --git a/wise_pizza/solve/partition.py b/wise_pizza/solve/partition.py
index ab820fe..5ea3381 100644
--- a/wise_pizza/solve/partition.py
+++ b/wise_pizza/solve/partition.py
@@ -42,9 +42,15 @@ def target_encoding_partitions(df: pd.DataFrame, dim: str, num_bins: int):
return partitions
-def kmeans_partition(df: pd.DataFrame, dim: str, groupby_dims: List[str]):
+def kmeans_partition(
+ df: pd.DataFrame,
+ dim: str,
+ groupby_dims: List[str],
+ normalize_averages: bool = False,
+):
assert len(df[dim].unique()) >= 3
# Get split candidates
+ # Get time profiles split by the dimension we are evaluating
agg_df = df.groupby([dim] + groupby_dims, as_index=False).sum()
agg_df["__avg"] = agg_df["totals"] / agg_df["weights"]
pivot_df = agg_df.pivot(
@@ -57,16 +63,31 @@ def kmeans_partition(df: pd.DataFrame, dim: str, groupby_dims: List[str]):
for chunk in ["Average", "Weights"]:
this_df = pivot_df[pivot_df["chunk"] == chunk]
nice_values = fill_gaps(this_df[value_cols].values)
- if chunk == "Weights":
- nice_values = (
- np.mean(nice_mats["Average"])
- * nice_values
- / np.sum(nice_values, axis=0, keepdims=True)
+
+ if normalize_averages:
+ # Normalize both subsegments separately: weights and averages
+ nice_values /= (
+ np.linalg.norm(nice_values, ord=2, axis=0, keepdims=True) + 1e-6
)
+ else:
+ if chunk == "Weights":
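+ # Rescale the weights' time profile to the overall magnitude of the averages so both parts are comparable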
+ nice_values = (
+ np.mean(nice_mats["Average"])
+ * nice_values
+ / (
+ np.linalg.norm(nice_values, ord=2, axis=0, keepdims=True)
+ + 1e-6
+ )
+ )
nice_mats[chunk] = nice_values
joint_mat = np.concatenate([nice_mats["Average"], nice_mats["Weights"]], axis=0)
else:
- joint_mat = fill_gaps(pivot_df[value_cols].values)
+ nice_values = fill_gaps(pivot_df[value_cols].values)
+ if normalize_averages:
+ nice_values /= (
+ np.linalg.norm(nice_values, ord=2, axis=0, keepdims=True) + 1e-6
+ )
+ joint_mat = nice_values
weights = pivot_df[value_cols].T.sum(axis=1)
vector_dict = {}
@@ -109,12 +130,20 @@ def weighted_kmeans_two_clusters(data_dict, tol=1e-4, max_iter=100, max_retries=
break
# Update centroids with weighted averages
- new_centroids = np.array(
- [
- np.average(data[labels == i], axis=0, weights=weights[labels == i])
- for i in range(2)
- ]
- )
+ try:
+ new_centroids = np.array(
+ [
+ np.average(
+ data[labels == i], axis=0, weights=weights[labels == i]
+ )
+ for i in range(2)
+ ]
+ )
+ except ZeroDivisionError:
+ print(
+ f"Zero division error detected on retry {retry + 1}, reinitializing centroids."
+ )
+ break
# Check for convergence
if np.linalg.norm(new_centroids - centroids) < tol:
@@ -140,7 +169,7 @@ def fill_gaps(x: np.ndarray, num_iter=50):
nice_marg = interpolate_and_extrapolate(marg)
tile_marg = np.tile(nice_marg, (x.shape[1], 1)).T
tile_marg[nans] = np.nan
- reg = np.nanmedian(x) * 1e-6
+ reg = np.nanmedian(x) * 1e-6 + 1e-6
coeffs = (np.nansum(x * tile_marg, axis=0) + reg) / (
np.nansum(tile_marg * tile_marg, axis=0) + reg
)
diff --git a/wise_pizza/solve/tree.py b/wise_pizza/solve/tree.py
index 6292a17..c5cb15f 100644
--- a/wise_pizza/solve/tree.py
+++ b/wise_pizza/solve/tree.py
@@ -31,6 +31,7 @@ def tree_solver(
"""
df = dim_df.copy().reset_index(drop=True)
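+ # Subtract each row's average-over-time contribution, so the tree fits deviations from it (it is added back to the predictions below)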
+ df["totals"] -= df["total_adjustment"]
df["__avg"] = df["totals"] / df["weights"]
df["__avg"] = df["__avg"].fillna(df["__avg"].mean())
@@ -56,6 +57,10 @@ def tree_solver(
re_df = pd.concat([leaf.df for leaf in leaves]).sort_values(
dims + fitter.groupby_dims
)
+ # Put back the averages over time by segment
+ re_df["prediction"] += re_df["total_adjustment"] / re_df["weights"]
+
+ # re_df["totals"] += re_df["total_adjustment"]
if len(fitter.groupby_dims) == 2: # Time series with weights
re_df_w = re_df[re_df["chunk"] == "Weights"].copy()
diff --git a/wise_pizza/time.py b/wise_pizza/time.py
index e50e569..1c4b8cc 100644
--- a/wise_pizza/time.py
+++ b/wise_pizza/time.py
@@ -90,36 +90,48 @@ def add_average_over_time(
total_name: str,
size_name: str,
time_name: str,
+ groupby_dims: List[str] = None,
cartesian: bool = False,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
- avgs = df[dims + [total_name, size_name]].groupby(dims, as_index=False).sum()
- avgs["avg"] = avgs[total_name] / avgs[size_name]
- if cartesian:
- # make sure that the cartesian product of dimension combinations x time is present,
- # without changing the totals
- times = df[[time_name]].groupby(time_name, as_index=False).sum()
- times["key"] = 1
- avgs["key"] = 1
- cartesian_df = pd.merge(avgs, times, on="key").drop(columns=["key"])
- joined = pd.merge(
- df,
- cartesian_df[dims + [time_name]],
- on=dims + [time_name],
- how="right",
- )
- joined[size_name] = joined[size_name].fillna(
- np.nanmean(joined[size_name].values)
- )
- joined[total_name] = joined[total_name].fillna(0.0)
- df = joined
-
- avgs = df[dims + [total_name, size_name]].groupby(dims, as_index=False).sum()
+ groupby_dims = groupby_dims or [time_name]
+
+ # get the average of the total over time
+ group_dims = dims + [c for c in groupby_dims if c != time_name]
+ avgs = (
+ df[group_dims + [total_name, size_name]]
+ .groupby(group_dims, as_index=False)
+ .sum()
+ )
+
avgs["avg"] = avgs[total_name] / avgs[size_name]
- joined = pd.merge(df, avgs[dims + ["avg"]], on=dims)
+ # if cartesian:
+ # # make sure that the cartesian product of dimension combinations x time is present,
+ # # without changing the totals
+ # times = df[[time_name]].groupby(time_name, as_index=False).sum()
+ # times["key"] = 1
+ # avgs["key"] = 1
+ # cartesian_df = pd.merge(avgs, times, on="key").drop(columns=["key"])
+ # joined = pd.merge(
+ # df,
+ # cartesian_df[dims + [time_name]],
+ # on=dims + [time_name],
+ # how="right",
+ # )
+ # joined[size_name] = joined[size_name].fillna(
+ # np.nanmean(joined[size_name].values)
+ # )
+ # joined[total_name] = joined[total_name].fillna(0.0)
+ # df = joined
+
+ # avgs = df[dims + [total_name, size_name]].groupby(dims, as_index=False).sum()
+ # avgs["avg"] = avgs[total_name] / avgs[size_name]
+
+ joined = pd.merge(df, avgs[group_dims + ["avg"]], on=group_dims)
joined["total_adjustment"] = joined[size_name] * joined["avg"]
- out = joined[dims + [total_name, size_name, time_name, "total_adjustment"]]
- tmp = out[dims + [total_name, "total_adjustment"]].groupby(dims).sum()
+
+ out = joined[group_dims + [total_name, size_name, time_name, "total_adjustment"]]
+ tmp = out[group_dims + [total_name, "total_adjustment"]].groupby(group_dims).sum()
assert (tmp[total_name] - tmp["total_adjustment"]).abs().sum() < 1e-6 * df[
total_name
].abs().max()