-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpurged_embargoed_kfold.py
61 lines (51 loc) · 2.6 KB
/
purged_embargoed_kfold.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import pandas as pd
class PurgedKFoldCVWithEmbargos:
    """
    Builds contiguous train/test splits separated by embargo gaps, and purges
    training samples that sit too close to a test set.
    """

    def __init__(self, df: pd.DataFrame):
        """
        Initializes the PurgedKFoldCVWithEmbargos class with a DataFrame. The DataFrame's
        index must be a pandas DatetimeIndex.

        Parameters:
        - df (pd.DataFrame): The dataset to perform cross-validation on, indexed by datetime.
          The index type is not validated here; split generation only uses len(df).
        """
        self.df = df

    def purged_k_fold_cv_with_embargos(self, n_splits, train_size, embargo_period_pct):
        """
        Generate training and test splits with embargo periods from the DataFrame.

        The dataset is cut into n_splits consecutive blocks, with an embargo gap
        between neighbouring blocks. Inside each block the layout is
        [train | embargo | test], so the test range starts embargo_size
        positions after the training range ends.

        Parameters:
        - n_splits (int): The number of splits for cross-validation. Must be >= 1.
        - train_size (float): The proportion of each block to include in the train split,
          between 0 and 1 inclusive.
        - embargo_period_pct (float): The embargo period as a fraction of the dataset
          length (e.g. 0.01 for 1%). Must be >= 0.

        Returns:
        - List of tuples, where each tuple contains two ranges of positional indices:
          training positions and test positions. A test range is empty when the block
          is too small to hold both the training part and the embargo.

        Raises:
        - ValueError: If a parameter is out of range, or if the embargo gaps between
          blocks need more samples than the dataset contains.
        """
        if n_splits < 1:
            raise ValueError(f"n_splits must be at least 1, got {n_splits}")
        if not 0 <= train_size <= 1:
            raise ValueError(f"train_size must be between 0 and 1, got {train_size}")
        if embargo_period_pct < 0:
            raise ValueError(
                f"embargo_period_pct must be non-negative, got {embargo_period_pct}"
            )

        n_samples = len(self.df)
        embargo_size = int(n_samples * embargo_period_pct)

        # Samples left for the blocks once the (n_splits - 1) gaps between
        # them have been set aside.
        split_size = (n_samples - embargo_size * (n_splits - 1)) // n_splits
        if split_size < 0:
            raise ValueError(
                "embargo gaps between splits exceed the dataset size; "
                "lower embargo_period_pct or n_splits"
            )

        # Every block has the same layout, so the sizes are computed once.
        training_size = int(split_size * train_size)
        # Clamp at zero so the test range is never inverted (start > stop).
        test_size = max(split_size - training_size - embargo_size, 0)
        stride = split_size + embargo_size

        splits = []
        for i in range(n_splits):
            train_start = i * stride
            train_end = train_start + training_size
            test_start = train_end + embargo_size
            splits.append(
                (
                    range(train_start, train_end),
                    range(test_start, test_start + test_size),
                )
            )
        return splits

    def purge_overlapping_samples(self, train_indices, test_indices, embargo_size):
        """
        Purges training samples that are too close to the test set, based on timestamps and an embargo period.

        Training timestamps at or after the earliest test timestamp are removed,
        and so are the embargo_size latest training timestamps that precede it.

        Parameters:
        - train_indices (pd.Index): Timestamp index for the training set.
        - test_indices (pd.Index): Timestamp index for the test set.
        - embargo_size (int): Number of samples to embargo before the test set begins.

        Returns:
        - List of timestamps: The training timestamps that survive the purge, in the
          order they appear in train_indices. If test_indices is empty there is
          nothing to purge against and every training timestamp is returned.

        Raises:
        - ValueError: If embargo_size is negative.
        """
        if embargo_size < 0:
            raise ValueError(f"embargo_size must be non-negative, got {embargo_size}")
        if len(test_indices) == 0:
            return train_indices.tolist()

        test_start = test_indices.min()
        earlier = train_indices[train_indices < test_start]

        n_keep = len(earlier) - embargo_size
        if n_keep <= 0:
            return []
        if embargo_size:
            # The embargo is counted in samples, not in time units: rank the
            # candidates chronologically, keep the n_keep earliest, and restore
            # the caller's original ordering.
            chronological = earlier.argsort()
            earlier = earlier.take(sorted(chronological[:n_keep]))
        return earlier.tolist()