Skip to content

Commit

Permalink
minor time series tweaks
Browse files (browse the repository at this point in the history)
  • Loading branch information
EgorKraevTransferwise committed Nov 25, 2024
1 parent 9c2b224 commit f2caca5
Show file tree
Hide file tree
Showing 7 changed files with 149 additions and 137 deletions.
218 changes: 97 additions & 121 deletions notebooks/Finding interesting segments in time series.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion tests/timeseries_wip_entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,15 @@
sf = explain_timeseries(
df=df,
dims=dims,
max_segments=7,
num_segments=7,
max_depth=2,
total_name=totals,
size_name=size,
time_name=time,
verbose=False,
solver="tree",
fit_sizes=True,
num_breaks=100,
)
sf.plot(plot_is_static=False, height=1500, width=1000, average_name="VPC")
print(sf.summary())
Expand Down
23 changes: 22 additions & 1 deletion wise_pizza/explain.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,10 +374,30 @@ def explain_timeseries(
time_basis: Optional[pd.DataFrame] = None,
fit_log_space: bool = False,
fit_sizes: Optional[bool] = None,
num_breaks: int = 2,
num_breaks: int = 3,
n_jobs: int = 10,
ignore_averages: bool = True,
log_space_weight_sc: float = 0.5,
):
"""
Split a time series panel dataset into segments that are as different as possible
:param df: A pandas DataFrame with the time series data
:param dims: Discrete dimensions to segment by
:param total_name: Name of the column containing totals
:param time_name: Name of the column containing the time values
:param num_segments: Number of segments to find
:param size_name: (Optional) Name of the column containing the size of the segment
:param max_depth: (Optional, defaults to 2) Maximum number of dimensions to constrain per segment
:param fit_sizes: (Optional) Whether to fit the sizes of the segments, or just the averages
:param n_jobs: (Optional, defaults to 10) Number of jobs to run in parallel when finding segments
:param num_breaks: (Optional, defaults to 3) Number of breaks in stylized time series used for comparing segments
:param ignore_averages: If set to True (recommended), the level (across time) of each segment is ignored when calculating similarity
:param time_basis: A DataFrame with the time basis to use. Only use if you know what you're doing.
:param solver: (Optional) The solver to use; currently only "tree" is supported
:param fit_log_space: Do not use
:param log_space_weight_sc: Do not use
:return: The fitted segment-finder object; callers can invoke .plot() and .summary() on it
"""
assert (
solver == "tree"
), "Only the tree solver is supported for time series at the moment"
Expand Down Expand Up @@ -495,6 +515,7 @@ def explain_timeseries(
verbose=verbose,
groupby_dims=groupby_dims,
cluster_values=False,
n_jobs=n_jobs,
)

# TODO: insert back the normalized bits?
Expand Down
7 changes: 2 additions & 5 deletions wise_pizza/plotting_time_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def preprocess_for_ts_plot(
) -> List[List[PlotData]]:
out = []
for row, s in enumerate(sf.segments):
print(row, s)
# print(row, s)
this_df = pd.DataFrame(
{
"time": sf.time,
Expand Down Expand Up @@ -158,7 +158,4 @@ def simple_ts_plot(
row=row_num,
col=col_num,
)
fig.update_layout(
xaxis=dict(autorange=True),
yaxis=dict(autorange=True)
)
fig.update_layout(xaxis=dict(autorange=True), yaxis=dict(autorange=True))
5 changes: 5 additions & 0 deletions wise_pizza/slicer.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def fit(
constrain_signs: bool = True,
cluster_values: bool = True,
groupby_dims: Optional[List[str]] = None,
n_jobs: int = 1,
):
"""
Function to fit slicer and find segments
Expand Down Expand Up @@ -225,6 +226,8 @@ def fit(
num_leaves=max_segments,
max_depth=max_depth,
fitter=AverageFitter(),
n_jobs=n_jobs,
verbose=verbose,
)

Xw = csc_matrix(diags(self.weights) @ self.X)
Expand Down Expand Up @@ -261,6 +264,8 @@ def fit(
fitter=fitter,
num_leaves=max_segments,
max_depth=max_depth,
n_jobs=n_jobs,
verbose=verbose,
)
self.nonzeros = np.array(range(self.X.shape[1]))

Expand Down
29 changes: 20 additions & 9 deletions wise_pizza/solve/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ def tree_solver(
fitter: Fitter,
max_depth: Optional[int] = None,
num_leaves: Optional[int] = None,
parallel_processes: int = 10,
n_jobs: int = 10,
verbose: bool = False,
):
"""
Partition the data into segments using a greedy binary tree approach
Expand All @@ -27,6 +28,8 @@ def tree_solver(
:param fitter: A model to fit on the chunks
:param max_depth: max depth of the tree
:param num_leaves: num leaves to generate
:param n_jobs: number of parallel jobs
:param verbose: print progress
:return: Segment description, column definitions, and cluster names
"""

Expand All @@ -41,10 +44,10 @@ def tree_solver(
dims=dims,
time_col=None if isinstance(fitter, AverageFitter) else "__time",
max_depth=max_depth,
parallel_processes=parallel_processes,
n_jobs=n_jobs,
)

build_tree(root=root, num_leaves=num_leaves, max_depth=max_depth)
build_tree(root=root, num_leaves=num_leaves, max_depth=max_depth, verbose=verbose)

leaves = get_leaves(root)

Expand Down Expand Up @@ -93,7 +96,7 @@ def __init__(
time_col: str = None,
max_depth: Optional[int] = None,
dim_split: Optional[Dict[str, List]] = None,
parallel_processes: int = 10,
n_jobs: int = 10,
):
self.df = df.copy().sort_values(dims + fitter.groupby_dims)
self.fitter = fitter
Expand All @@ -107,7 +110,7 @@ def __init__(
self.model = None
# For dimension splitting candidates, hardwired for now
self.num_bins = 10
self.parallel_processes = parallel_processes
self.parallel_processes = n_jobs

@property
def depth(self):
Expand Down Expand Up @@ -219,15 +222,23 @@ def get_best_subtree_result(
return node2


def build_tree(root: ModelNode, num_leaves: int, max_depth: Optional[int] = 1000):
def build_tree(
root: ModelNode,
num_leaves: int,
max_depth: Optional[int] = 1000,
verbose: bool = False,
):
for i in range(num_leaves - 1):
print(f"Adding node {i+1}...")
if verbose:
print(f"Adding node {i+1}...")
best_node = get_best_subtree_result(root, max_depth)
if best_node.error_improvement > 0:
best_node.children = best_node._best_submodels
print("Done!")
if verbose:
print("Done!")
else:
print("No more improvement, stopping")
if verbose:
print("No more improvement, stopping")
break


Expand Down
1 change: 1 addition & 0 deletions wise_pizza/time.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def prune_time_basis(
dtrend_cols = [t for t in time_basis.columns if "dtrend" in t]
chosen_cols = []
# From all the candidate kinks (the "dtrend" columns), choose num_breaks evenly spaced ones
num_breaks = min(num_breaks, len(dtrend_cols) - 1)
for i in range(1, num_breaks + 1):
chosen_cols.append(dtrend_cols[int(i * len(dtrend_cols) / (num_breaks + 1))])
pre_basis = time_basis[["Intercept", "Slope"] + chosen_cols].copy()
Expand Down

0 comments on commit f2caca5

Please sign in to comment.