Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new functionality to calculate timestamps for dataframe splits based on user-defined percentages #63

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions bibmon/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
from ._sklearn_regressor import sklearnRegressor
from ._preprocess import PreProcess
from ._load_data import load_tennessee_eastman, load_real_data
from ._bibmon_tools import train_val_test_split, complete_analysis, comparative_table, spearmanr_dendrogram, create_df_with_dates, create_df_with_noise, align_dfs_by_rows
from ._bibmon_tools import train_val_test_split, complete_analysis, comparative_table, spearmanr_dendrogram, create_df_with_dates, create_df_with_noise, align_dfs_by_rows, calculate_timestamps

__all__ = ['Autoencoder','PCA','ESN','SBM',
'sklearnRegressor', 'PreProcess',
'load_tennessee_eastman', 'load_real_data',
'train_val_test_split', 'complete_analysis', 'comparative_table',
'spearmanr_dendrogram', 'create_df_with_dates',
'create_df_with_noise', 'align_dfs_by_rows']
'create_df_with_noise', 'align_dfs_by_rows', 'calculate_timestamps']
79 changes: 79 additions & 0 deletions bibmon/_bibmon_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,85 @@ def spearmanr_dendrogram(df, figsize = (18,8)):

###############################################################################

def calculate_timestamps(data, train_pct, validation_pct, test_pct,
                         preserve_periods=False, time_tolerance='30min'):
    """
    Calculates the timestamps that split a dataframe into train,
    validation, and test sets according to the desired fractions,
    optionally keeping continuous observation periods intact.

    Parameters
    ----------
    data: pandas.DataFrame
        The dataset to be split. Its index must be a DatetimeIndex and
        it must contain at least one row.
    train_pct: float
        Fraction of the rows for the training set (between 0 and 1).
    validation_pct: float
        Fraction of the rows for the validation set (between 0 and 1).
    test_pct: float
        Fraction of the rows for the test set (between 0 and 1).
        Only used to validate that the three fractions add up to 1;
        the test set always runs to the last row.
    preserve_periods: bool, optional
        If True, the train and validation sets end on the last row of
        a continuous observation period instead of at an exact row
        count. The boundary is the last period whose cumulative row
        count does not exceed the target; if not even the first period
        fits, the end of the first period is used.
    time_tolerance: string, optional
        Largest gap between consecutive rows that is still considered
        part of the same period. Only gaps strictly greater than this
        value start a new period.
        Example: '30min' means gaps of up to 30 minutes are ignored.

    Returns
    -------
    start_train: pandas.Timestamp
        Timestamp for the start of the training set (first row).
    end_train: pandas.Timestamp
        Timestamp for the end of the training set.
    end_validation: pandas.Timestamp
        Timestamp for the end of the validation set.
    end_test: pandas.Timestamp
        Timestamp for the end of the test set (last row).

    Raises
    ------
    ValueError
        If the fractions do not add up to 1, if any fraction is
        negative, if the index is not a DatetimeIndex, or if the
        dataframe has no rows.
    """

    if round(train_pct + validation_pct + test_pct, 5) != 1:
        raise ValueError("Train, validation, and test percentages must add up to 1.")

    if any(pct < 0 for pct in (train_pct, validation_pct, test_pct)):
        raise ValueError("Train, validation, and test percentages must be between 0 and 1.")

    if not isinstance(data.index, pd.DatetimeIndex):
        raise ValueError("The dataframe index must be a DatetimeIndex.")

    total_rows = len(data)
    if total_rows == 0:
        raise ValueError("The dataframe must contain at least one row.")

    # Rounding before truncating removes floating-point noise such as
    # 100 * 0.29 == 28.999999999999996, which would otherwise drop a row.
    # The min() guards against the 5-decimal tolerance of the sum check
    # pushing a count past the number of rows on very large dataframes.
    train_count = min(int(round(total_rows * train_pct, 9)), total_rows)
    validation_count = int(round(total_rows * validation_pct, 9))
    train_validation_count = min(train_count + validation_count, total_rows)

    start_train = data.index[0]
    end_test = data.index[-1]

    if not preserve_periods:
        # max(..., 0) keeps a zero-row split on the first row instead of
        # letting index -1 silently wrap around to the last row.
        end_train = data.index[max(train_count - 1, 0)]
        end_validation = data.index[max(train_validation_count - 1, 0)]
        return start_train, end_train, end_validation, end_test

    # Label every row with the ordinal of the period it belongs to: the
    # label increases by one each time the gap to the previous row
    # exceeds the tolerance (the first row has no gap and stays at 0).
    gaps = data.index.to_series().diff()
    period_labels = (gaps > pd.Timedelta(time_tolerance)).cumsum()

    # Number of rows contained in all periods up to and including each one.
    rows_through_period = period_labels.value_counts().sort_index().cumsum()

    end_train = _end_of_last_fitting_period(
        data.index, rows_through_period, train_count)
    end_validation = _end_of_last_fitting_period(
        data.index, rows_through_period, train_validation_count)

    return start_train, end_train, end_validation, end_test


def _end_of_last_fitting_period(index, rows_through_period, max_rows):
    """
    Returns the timestamp of the last row of the last period whose
    cumulative row count does not exceed max_rows, falling back to the
    end of the first period when none fits.
    """
    fitting = rows_through_period[rows_through_period <= max_rows]
    if fitting.empty:
        rows_kept = rows_through_period.iloc[0]
    else:
        rows_kept = fitting.iloc[-1]
    # Period labels never decrease along the rows, so the rows of the
    # selected periods are exactly the first `rows_kept` rows.
    return index[int(rows_kept) - 1]

###############################################################################

def train_val_test_split (data, start_train, end_train,
end_validation, end_test,
tags_X = None, tags_Y = None):
Expand Down
47 changes: 46 additions & 1 deletion test/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import bibmon
import pandas as pd
import pytest

def test_complete_analysis():

Expand Down Expand Up @@ -60,4 +61,48 @@ def test_complete_analysis():
fault_start = '2018-01-02 06:00:00',
fault_end = '2018-01-02 09:00:00')

model.plot_importances()
model.plot_importances()

def test_calculate_timestamps_with_preserve_periods():
    """
    Checks that split boundaries land on the end of a continuous
    observation period when preserve_periods is enabled.
    """
    # Two 50-row hourly blocks. The first block ends on
    # 2021-01-03 01:00, so the second must start well after that for a
    # real gap (larger than the 1h tolerance) to exist between them.
    first_block = pd.date_range('2021-01-01', periods=50, freq='h')
    second_block = pd.date_range('2021-01-05', periods=50, freq='h')
    df = pd.DataFrame({
        'var1': range(100),
        'var2': range(100, 200)
    }, index=first_block.append(second_block))

    start_train, end_train, end_validation, end_test = bibmon.calculate_timestamps(
        df, train_pct=0.6, validation_pct=0.2, test_pct=0.2,
        preserve_periods=True, time_tolerance='1h'
    )

    # Targets of 60 and 80 rows both fall inside the second block, so
    # both boundaries snap back to the last row of the first block.
    assert start_train == df.index[0], "Train start is not the first row"
    assert end_train == df.index[49], "Train end does not preserve periods correctly"
    assert end_validation == df.index[49], "Validation end does not preserve periods correctly"
    assert end_test == df.index[99], "Test end is not the last row"

def test_calculate_timestamps_invalid_percentages():
    """
    Fractions that add up to more than 1 must be rejected with a
    ValueError.
    """
    hourly_index = pd.date_range('2021-01-01', periods=10, freq='h')
    columns = {'var1': range(10), 'var2': range(10, 20)}
    frame = pd.DataFrame(columns, index=hourly_index)

    expected_message = "Train, validation, and test percentages must add up to 1"

    # 0.5 + 0.3 + 0.3 == 1.1, which violates the sum constraint.
    with pytest.raises(ValueError, match=expected_message):
        bibmon.calculate_timestamps(
            frame, train_pct=0.5, validation_pct=0.3, test_pct=0.3)

def test_calculate_timestamps_non_datetime_index():
    """
    A dataframe whose index is not a DatetimeIndex must be rejected
    with a ValueError.
    """
    # No index is supplied, so pandas assigns its default integer index.
    columns = {'var1': range(10), 'var2': range(10, 20)}
    frame = pd.DataFrame(columns)

    expected_message = "The dataframe index must be a DatetimeIndex"

    with pytest.raises(ValueError, match=expected_message):
        bibmon.calculate_timestamps(
            frame, train_pct=0.6, validation_pct=0.2, test_pct=0.2)

def test_calculate_timestamps_small_time_tolerance():
    """
    With a one-minute tolerance, a gap of a few minutes between two
    blocks of rows must be treated as a period boundary.
    """
    # Two blocks of 50 one-minute samples; the second starts at 01:00,
    # after the first block has already ended.
    early_block = pd.date_range('2021-01-01', periods=50, freq='1min')
    late_block = pd.date_range('2021-01-01 01:00:00', periods=50, freq='1min')
    frame = pd.DataFrame(
        {'var1': range(100), 'var2': range(100, 200)},
        index=early_block.append(late_block),
    )

    boundaries = bibmon.calculate_timestamps(
        frame,
        train_pct=0.6,
        validation_pct=0.2,
        test_pct=0.2,
        preserve_periods=True,
        time_tolerance='1min',
    )
    start_train, end_train, end_validation, end_test = boundaries

    # The train boundary must fall on the last row of the first block.
    assert end_train == frame.index[49], "Train end timestamp is incorrect"