transferwise · AlxdrPolyakov · Nov 15, 2024 · Nov 13, 2024 · Nov 13, 2024 · Nov 14, 2024
@@ -115,6 +115,43 @@ def monthly_driver_data():
     )
 
 
+def monthly_driver_ts_data():
+    df = pd.read_csv(
+        os.path.join(os.path.dirname(__file__), "../data", "synth_time_data.csv")
+    )
+    return SegmentData(
+        data=df,
+        dimensions=[
+            "PRODUCT",
+            "REGION",
+            "SOURCE_CURRENCY",
+            "TARGET_CURRENCY",
+        ],
+        segment_total="VOLUME",
+        segment_size="ACTIVE_CUSTOMERS",
+        time_col="DATE",
+    )
+
+
+@pytest.mark.parametrize("fit_sizes", [True, False])
+def test_time_series_tree_solver(fit_sizes: bool):
+    data = monthly_driver_ts_data()
+    sf = explain_timeseries(
+        df=data.data,
+        dims=data.dimensions,
+        max_segments=7,
+        max_depth=2,
+        total_name=data.segment_total,
+        size_name=data.segment_size,
+        time_name=data.time_col,
+        verbose=False,
+        solver="tree",
+        fit_sizes=fit_sizes,
+    )
+    sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC")
+    print(sf.summary())
+
+
 def test_categorical():
     all_data = monthly_driver_data()
     df = all_data.data
@@ -201,46 +238,47 @@ def test_synthetic_template_tree(nan_percent: float):
     print("yay!")
 
 
-@pytest.mark.parametrize("nan_percent", [0.0, 1.0])
-def test_synthetic_ts_template(nan_percent: float):
-    all_data = synthetic_ts_data(init_len=10000)
-
-    # Add some big trends to the data
-    # TODO: insert trend break patterns too
-    months = np.array(sorted(all_data.data[all_data.time_col].unique()))
-    basis = create_time_basis(months, baseline_dims=1)
-    joined = pd.merge(all_data.data, basis, left_on="TIME", right_index=True)
-    df = joined.drop(columns=basis.columns)
-
-    loc1 = (df["dim0"] == 0) & (df["dim1"] == 1)
-    loc2 = (df["dim1"] == 0) & (df["dim2"] == 1)
-
-    df.loc[loc1, "totals"] += 100 * joined.loc[loc1, "Slope"]
-    df.loc[loc2, "totals"] += 300 * joined.loc[loc2, "Slope"]
-
-    if nan_percent > 0:
-        df = values_to_nan(df, nan_percent)
-    sf = explain_timeseries(
-        df,
-        dims=all_data.dimensions,
-        total_name=all_data.segment_total,
-        time_name=all_data.time_col,
-        size_name=all_data.segment_size,
-        max_depth=2,
-        max_segments=5,
-        verbose=True,
-    )
-    print("***")
-    for s in sf.segments:
-        print(s)
-
-    plot_time(sf)
-
-    assert abs(sf.segments[0]["coef"] - 300) < 2
-    assert abs(sf.segments[1]["coef"] - 100) < 2
-
-    # sf.plot()
-    print("yay!")
+# The old solvers for time series no longer work
+# @pytest.mark.parametrize("nan_percent", [0.0, 1.0])
+# def test_synthetic_ts_template(nan_percent: float):
+#     all_data = synthetic_ts_data(init_len=10000)
+#
+#     # Add some big trends to the data
+#     # TODO: insert trend break patterns too
+#     months = np.array(sorted(all_data.data[all_data.time_col].unique()))
+#     basis = create_time_basis(months, baseline_dims=1)
+#     joined = pd.merge(all_data.data, basis, left_on="TIME", right_index=True)
+#     df = joined.drop(columns=basis.columns)
+#
+#     loc1 = (df["dim0"] == 0) & (df["dim1"] == 1)
+#     loc2 = (df["dim1"] == 0) & (df["dim2"] == 1)
+#
+#     df.loc[loc1, "totals"] += 100 * joined.loc[loc1, "Slope"]
+#     df.loc[loc2, "totals"] += 300 * joined.loc[loc2, "Slope"]
+#
+#     if nan_percent > 0:
+#         df = values_to_nan(df, nan_percent)
+#     sf = explain_timeseries(
+#         df,
+#         dims=all_data.dimensions,
+#         total_name=all_data.segment_total,
+#         time_name=all_data.time_col,
+#         size_name=all_data.segment_size,
+#         max_depth=2,
+#         max_segments=5,
+#         verbose=True,
+#     )
+#     print("***")
+#     for s in sf.segments:
+#         print(s)
+#
+#     plot_time(sf)
+#
+#     assert abs(sf.segments[0]["coef"] - 300) < 2
+#     assert abs(sf.segments[1]["coef"] - 100) < 2
+#
+#     # sf.plot()
+#     print("yay!")
 
 
 @pytest.mark.parametrize(
@@ -279,7 +317,7 @@ def test_deltas(
         max_depth=1,
         max_segments=10,
         solver=solver,
-        cluster_values=cluster_values
+        cluster_values=cluster_values,
     )
     # sf.plot(plot_is_static=plot_is_static)
     print("yay!")

@@ -0,0 +1,44 @@
+import os, sys
+import pandas as pd
+
+root_path = os.path.realpath("../..")
+print(root_path)
+
+# this assumes that all of the following files are checked in the same directory
+sys.path.append(os.path.join(root_path, "wise-pizza"))
+
+# create data-related directories
+data_dir = os.path.realpath(os.path.join(root_path, "wise-pizza/data"))
+if not os.path.isdir(data_dir):
+    os.mkdir(data_dir)
+print(data_dir)
+
+from wise_pizza import explain_timeseries
+
+df = pd.read_csv(
+    os.path.join(data_dir, "synth_time_data.csv")
+)  # replace this variable with your data
+dims = [
+    "PRODUCT",
+    "REGION",
+    "SOURCE_CURRENCY",
+    "TARGET_CURRENCY",
+]  # dimensions to find segments
+totals = "VOLUME"  # value to analyze
+size = "ACTIVE_CUSTOMERS"  # number of objects
+time = "DATE"
+sf = explain_timeseries(
+    df=df,
+    dims=dims,
+    max_segments=7,
+    max_depth=2,
+    total_name=totals,
+    size_name=size,
+    time_name=time,
+    verbose=False,
+    solver="tree",
+    fit_sizes=False,
+)
+sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC")
+print(sf.summary())
+print("yay!")
@@ -96,8 +96,15 @@ def nice_cluster_names(x: List[Dict[str, List[str]]]) -> Tuple[List[Dict], Dict]
     for dim, clusters in cluster_strings.items():
         reverse_cluster_names[dim] = {}
         for i, c in enumerate(clusters):
-            cluster_names[f"{dim}_cluster_{i + 1}"] = c
-            reverse_cluster_names[dim][c] = f"{dim}_cluster_{i + 1}"
+            ugly_name = f"{dim}_cluster_{i + 1}"
+            nice_name = c.replace("@@", ";")
+            if len(nice_name) < 1.2 * len(ugly_name):
+                name = nice_name
+            else:
+                name = ugly_name
+
+            cluster_names[name] = c
+            reverse_cluster_names[dim][c] = name
 
     col_defs = []
     for xx in x: