petrobras · WladRamos · Oct 19, 2024 · Oct 19, 2024
diff --git a/bibmon/__init__.py b/bibmon/__init__.py
@@ -5,11 +5,11 @@
 from ._sklearn_regressor import sklearnRegressor
 from ._preprocess import PreProcess
 from ._load_data import load_tennessee_eastman, load_real_data
-from ._bibmon_tools import train_val_test_split, complete_analysis, comparative_table, spearmanr_dendrogram, create_df_with_dates, create_df_with_noise, align_dfs_by_rows
+from ._bibmon_tools import train_val_test_split, complete_analysis, comparative_table, spearmanr_dendrogram, create_df_with_dates, create_df_with_noise, align_dfs_by_rows, calculate_timestamps
 
 __all__ = ['Autoencoder','PCA','ESN','SBM',
 	   'sklearnRegressor', 'PreProcess',
            'load_tennessee_eastman', 'load_real_data', 
            'train_val_test_split', 'complete_analysis', 'comparative_table',
 	       'spearmanr_dendrogram', 'create_df_with_dates',
-           'create_df_with_noise', 'align_dfs_by_rows']
+           'create_df_with_noise', 'align_dfs_by_rows', 'calculate_timestamps']
diff --git a/bibmon/_bibmon_tools.py b/bibmon/_bibmon_tools.py
@@ -116,6 +116,85 @@ def spearmanr_dendrogram(df, figsize = (18,8)):
 
 ###############################################################################
 
+def calculate_timestamps(data, train_pct, validation_pct, test_pct, preserve_periods=False, time_tolerance='30min'):
+    """
+    Calculates the timestamps that split the dataframe into train, validation, and test sets
+    according to the given fractions, optionally keeping continuous observation periods whole.
+
+    Parameters
+    ----------
+    data: pandas.DataFrame
+        The dataset to be split; its index must be a pandas.DatetimeIndex (ValueError otherwise).
+    train_pct: float
+        Fraction of rows for the training set (the three fractions must sum to 1, else ValueError).
+    validation_pct: float
+        Fraction of rows for the validation set.
+    test_pct: float
+        Fraction of rows for the test set (only used in the sum check; test ends at the last row).
+    preserve_periods: bool, optional
+        If True, end_train and end_validation are placed on the last row of a continuous period.
+    time_tolerance: string, optional
+        Largest gap between consecutive rows that still counts as the same period.
+        Example: '30min' means only gaps longer than 30 minutes start a new period.
+
+    Returns
+    -------
+    start_train: element of data.index
+        First index value (start of the training set).
+    end_train: element of data.index
+        Index value of the last training row.
+    end_validation: element of data.index
+        Index value of the last validation row.
+    end_test: element of data.index
+        Last index value (end of the test set).
+    """
+
+    if round(train_pct + validation_pct + test_pct, 5) != 1:  # rounding to 5 decimals absorbs float error; individual ranges are not checked
+        raise ValueError("Train, validation, and test percentages must add up to 1.")
+
+    if not isinstance(data.index, pd.DatetimeIndex):  # the gap computation below needs datetime index values
+        raise ValueError("The dataframe index must be a DatetimeIndex.")
+
+    total_rows = len(data)
+
+    train_count = int(total_rows * train_pct)  # int() truncates, so train may get one row fewer than the exact share
+    validation_count = int(total_rows * validation_pct)  # truncated the same way
+    test_count = total_rows - train_count - validation_count  # NOTE(review): never read below; test implicitly takes the remaining rows
+
+    if preserve_periods:
+        time_diff = data.index.to_series().diff()  # gap between each row and the previous one
+        period_starts = (time_diff > pd.Timedelta(time_tolerance)).cumsum()  # despite the name, a per-row period id that increments at every gap > tolerance
+
+        period_sizes = period_starts.value_counts().sort_index()  # number of rows in each period, ordered by period id
+        cumulative_counts = period_sizes.cumsum()  # rows contained up to and including each period
+
+        train_period_indices = cumulative_counts[cumulative_counts <= train_count].index  # periods that fit entirely within the first train_count rows
+        if not train_period_indices.empty:
+            train_period_index = train_period_indices[-1]  # last period that still fits
+        else:
+            train_period_index = cumulative_counts.index[0]  # fallback: the first period alone exceeds train_count, so it is taken whole
+        end_train = data[period_starts <= train_period_index].index[-1]  # last row of the last training period
+
+        cumulative_train_validation_count = train_count + validation_count
+        validation_period_indices = cumulative_counts[cumulative_counts <= cumulative_train_validation_count].index  # periods that fit within train + validation rows
+        if not validation_period_indices.empty:
+            validation_period_index = validation_period_indices[-1]
+        else:
+            validation_period_index = cumulative_counts.index[0]  # same fallback as for the training boundary
+        end_validation = data[period_starts <= validation_period_index].index[-1]  # may equal end_train when no further period fits
+
+        end_test = data.index[-1]
+        start_train = data.index[0]
+    else:
+        start_train = data.index[0]  # NOTE(review): an empty dataframe raises IndexError here
+        end_train = data.index[train_count - 1]  # NOTE(review): train_count == 0 yields index[-1], i.e. the last row
+        end_validation = data.index[train_count + validation_count - 1]  # NOTE(review): same wrap-around when both counts are 0
+        end_test = data.index[-1]
+
+    return start_train, end_train, end_validation, end_test
+
+###############################################################################
+
 def train_val_test_split (data, start_train, end_train, 
                           end_validation, end_test, 
                           tags_X = None, tags_Y = None):

diff --git a/test/test_tools.py b/test/test_tools.py
@@ -8,6 +8,7 @@
 
 import bibmon
 import pandas as pd
+import pytest
 
 def test_complete_analysis():
 
@@ -60,4 +61,48 @@ def test_complete_analysis():
                             fault_start = '2018-01-02 06:00:00',
                             fault_end = '2018-01-02 09:00:00') 
 
-    model.plot_importances()                                                                             
+    model.plot_importances()
+
+def test_calculate_timestamps_with_preserve_periods():  # checks end_train when preserve_periods=True on hourly data
+    df = pd.DataFrame({
+        'var1': range(100),
+        'var2': range(100, 200)
+    }, index=pd.date_range('2021-01-01', periods=50, freq='h').append(  # 50 hourly rows: 2021-01-01 00:00 .. 2021-01-03 01:00
+        pd.date_range('2021-01-03', periods=50, freq='h')))  # starts at 2021-01-03 00:00, so it overlaps the end of the first range
+
+    start_train, end_train, end_validation, end_test = bibmon.calculate_timestamps(
+        df, train_pct=0.6, validation_pct=0.2, test_pct=0.2, preserve_periods=True, time_tolerance='1h'
+    )
+
+    assert end_train == df.index[99], "Train end does not preserve periods correctly"  # NOTE(review): no gap exceeds 1h, so all 100 rows form one period and end_train is the last row
+
+def test_calculate_timestamps_invalid_percentages():  # fractions that do not sum to 1 must be rejected
+    df = pd.DataFrame({
+        'var1': range(10),
+        'var2': range(10, 20)
+    }, index=pd.date_range('2021-01-01', periods=10, freq='h'))  # valid DatetimeIndex, so only the sum check can fail
+
+    with pytest.raises(ValueError, match="Train, validation, and test percentages must add up to 1"):
+        bibmon.calculate_timestamps(df, train_pct=0.5, validation_pct=0.3, test_pct=0.3)  # 0.5 + 0.3 + 0.3 = 1.1
+
+def test_calculate_timestamps_non_datetime_index():  # a dataframe without a DatetimeIndex must be rejected
+    df = pd.DataFrame({
+        'var1': range(10),
+        'var2': range(10, 20)
+    })  # no index passed, so the frame gets pandas' default integer index
+
+    with pytest.raises(ValueError, match="The dataframe index must be a DatetimeIndex"):
+        bibmon.calculate_timestamps(df, train_pct=0.6, validation_pct=0.2, test_pct=0.2)  # fractions sum to 1, so only the index check can fail
+
+def test_calculate_timestamps_small_time_tolerance():  # checks that a gap larger than a small tolerance splits the data into periods
+    df = pd.DataFrame({
+        'var1': range(100),
+        'var2': range(100, 200)
+    }, index=pd.date_range('2021-01-01', periods=50, freq='1min').append(  # 50 one-minute rows: 00:00 .. 00:49
+        pd.date_range('2021-01-01 01:00:00', periods=50, freq='1min')))  # resumes at 01:00, an 11-minute gap after 00:49
+
+    start_train, end_train, end_validation, end_test = bibmon.calculate_timestamps(
+        df, train_pct=0.6, validation_pct=0.2, test_pct=0.2, preserve_periods=True, time_tolerance='1min'
+    )
+
+    assert end_train == df.index[49], "Train end timestamp is incorrect"  # two periods of 50 rows; only the first fits in the 60 training rows