diff --git a/openstef/data_classes/prediction_job.py b/openstef/data_classes/prediction_job.py index fa8d4e16..80d85eb6 100644 --- a/openstef/data_classes/prediction_job.py +++ b/openstef/data_classes/prediction_job.py @@ -26,7 +26,6 @@ class PredictionJobDataClass(BaseModel): - ``"lgb"`` - ``"linear"`` - ``"linear_quantile"`` - - ``"gblinear_quantile"`` - ``"xgb_multioutput_quantile"`` - ``"flatliner"`` @@ -83,6 +82,8 @@ class PredictionJobDataClass(BaseModel): data_balancing_ratio: Optional[float] = None """If data balancing is enabled, the data will be balanced with data from 1 year ago in the future.""" + use_rolling_aggregate_features: bool = False + """If True, rolling aggregate of load will be used as features in the model.""" depends_on: Optional[list[Union[int, str]]] """Link to another prediction job on which this prediction job might depend.""" sid: Optional[str] diff --git a/openstef/feature_engineering/apply_features.py b/openstef/feature_engineering/apply_features.py index 3e9de085..ca71988b 100644 --- a/openstef/feature_engineering/apply_features.py +++ b/openstef/feature_engineering/apply_features.py @@ -22,6 +22,7 @@ from openstef.feature_engineering.bidding_zone_to_country_mapping import ( BIDDING_ZONE_TO_COUNTRY_CODE_MAPPING, ) +from openstef.feature_engineering.rolling_features import add_rolling_aggregate_features from openstef.feature_engineering.weather_features import ( add_additional_solar_features, add_additional_wind_features, @@ -130,5 +131,8 @@ def apply_features( # Adds daylight terrestrial feature data = add_daylight_terrestrial_feature(data) + if pj.use_rolling_aggregate_features: + data = add_rolling_aggregate_features(data) + # Return dataframe including all requested features return data diff --git a/openstef/feature_engineering/rolling_features.py b/openstef/feature_engineering/rolling_features.py new file mode 100644 index 00000000..5710a4a9 --- /dev/null +++ b/openstef/feature_engineering/rolling_features.py @@ -0,0 +1,32 @@ 
+import pandas as pd + + +def add_rolling_aggregate_features( + data: pd.DataFrame, rolling_window: str = "24h" +) -> pd.DataFrame: + """ + Adds rolling aggregate features to the input dataframe. + + The rolling median, max and min of the ``load`` column are computed over ``rolling_window``. + They are written to ``rolling_{median,max,min}_load_{rolling_window}`` columns of ``data`` + itself, so the input dataframe is modified in place. + + Args: + data: Input dataframe; a ValueError is raised without a DatetimeIndex or a 'load' column. + rolling_window: Rolling window size in str format following + https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases + + Returns: + The same dataframe, with the three rolling feature columns added. + """ + # Ensure the index is a DatetimeIndex + if not isinstance(data.index, pd.DatetimeIndex): + raise ValueError("The DataFrame index must be a DatetimeIndex.") + + if "load" not in data.columns: + raise ValueError("The DataFrame must contain a 'load' column.") + rolling_window_load = data["load"].rolling(window=rolling_window) + data[f"rolling_median_load_{rolling_window}"] = rolling_window_load.median() + data[f"rolling_max_load_{rolling_window}"] = rolling_window_load.max() + data[f"rolling_min_load_{rolling_window}"] = rolling_window_load.min() + return data diff --git a/test/unit/feature_engineering/test_rolling_features.py b/test/unit/feature_engineering/test_rolling_features.py new file mode 100644 index 00000000..ad7f12d0 --- /dev/null +++ b/test/unit/feature_engineering/test_rolling_features.py @@ -0,0 +1,88 @@ +import numpy as np +import pandas as pd +import pytest + +from openstef.feature_engineering.rolling_features import add_rolling_aggregate_features + + +def test_add_rolling_aggregate_features(): + # Generate 2 days of data at 15-minute intervals + num_points = int(24 * 60 / 15 * 2) + data = pd.DataFrame( + index=pd.date_range( + start="2023-01-01 00:00:00", freq="15min", periods=num_points + ) + ) + data["load"] = list(range(num_points)) + + # Apply the function + output_data = 
add_rolling_aggregate_features(data) + + # Verify the columns are created + assert "rolling_median_load_24h" in output_data.columns + assert "rolling_max_load_24h" in output_data.columns + assert "rolling_min_load_24h" in output_data.columns + + # Validate the rolling features against the same aggregations computed directly with pandas + rolling_window = "24h" + rolling_window_load = data["load"].rolling(window=rolling_window) + rolling_median_expected = rolling_window_load.median() + rolling_max_expected = rolling_window_load.max() + rolling_min_expected = rolling_window_load.min() + + assert np.allclose( + output_data[f"rolling_median_load_{rolling_window}"], rolling_median_expected + ) + assert np.allclose( + output_data[f"rolling_max_load_{rolling_window}"], rolling_max_expected + ) + assert np.allclose( + output_data[f"rolling_min_load_{rolling_window}"], rolling_min_expected + ) + + +def test_add_rolling_aggregate_features_flatline(): + # Generate 2 days of data at 15-minute intervals + num_points = int(24 * 60 / 15 * 2) + data = pd.DataFrame( + index=pd.date_range( + start="2023-01-01 00:00:00", freq="15min", periods=num_points + ) + ) + all_ones = [1.0] * num_points + data["load"] = all_ones + + # Apply the function + output_data = add_rolling_aggregate_features(data) + + # Verify the columns are created + assert "rolling_median_load_24h" in output_data.columns + assert "rolling_max_load_24h" in output_data.columns + assert "rolling_min_load_24h" in output_data.columns + + # Validate the rolling features + rolling_window = "24h" + assert np.all(output_data[f"rolling_median_load_{rolling_window}"] == all_ones) + assert np.all(output_data[f"rolling_max_load_{rolling_window}"] == all_ones) + assert np.all(output_data[f"rolling_min_load_{rolling_window}"] == all_ones) + + +def test_add_rolling_aggregate_features_non_datetime_index(): + # Test for non-datetime index + data = pd.DataFrame(index=range(10)) + + with pytest.raises( + ValueError, match="The DataFrame index must be a DatetimeIndex." 
+ ): + add_rolling_aggregate_features(data) + + +def test_add_rolling_aggregate_features_no_load_column(): + # Test for dataframe without load column + data = pd.DataFrame( + index=pd.date_range(start="2023-01-01 00:00:00", freq="15min", periods=10), + columns=["not_load"], + ) + + with pytest.raises(ValueError, match="The DataFrame must contain a 'load' column."): + add_rolling_aggregate_features(data)