From 6a81d0fea104f87e0a8a02ba5a011466be31859b Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Sun, 7 Jun 2020 13:19:20 +0200 Subject: [PATCH 1/3] Implement center and scale extractors --- deepdow/data/__init__.py | 7 ++- deepdow/data/augment.py | 77 +++++++++++++++++++++++++++++++++ tests/test_data/test_augment.py | 53 +++++++++++++++++++++++ 3 files changed, 135 insertions(+), 2 deletions(-) create mode 100644 tests/test_data/test_augment.py diff --git a/deepdow/data/__init__.py b/deepdow/data/__init__.py index 20ba81a..df74f51 100644 --- a/deepdow/data/__init__.py +++ b/deepdow/data/__init__.py @@ -1,6 +1,7 @@ """Module dealing with data.""" -from .augment import (Compose, Dropout, Multiply, Noise) +from .augment import (Compose, Dropout, Multiply, Noise, prepare_robust_scaler, + prepare_standard_scaler) from .load import (FlexibleDataLoader, InRAMDataset, RigidDataLoader) __all__ = ['Compose', @@ -9,4 +10,6 @@ 'InRAMDataset', 'Multiply', 'Noise', - 'RigidDataLoader'] + 'RigidDataLoader', + 'prepare_robust_scaler', + 'prepare_standard_scaler'] diff --git a/deepdow/data/augment.py b/deepdow/data/augment.py index 1d4d4bd..2442729 100644 --- a/deepdow/data/augment.py +++ b/deepdow/data/augment.py @@ -1,8 +1,85 @@ """Collection of callable functions that augment deepdow tensors.""" +import numpy as np import torch +def prepare_standard_scaler(X, overlap=False, indices=None): + """Compute mean and standard deviation for each channel. + + Parameters + ---------- + X : np.ndarray + Full features array of shape `(n_samples, n_channels, lookback, n_assets)`. + + overlap : bool + If False, then only the most recent timestep of each sample is used. This guarantees that + the same observation is not counted multiple times. + + indices : list or None + List of indices to consider from the `X.shape[0]` dimension. If None, + then all the samples are considered. + + Returns + ------- + means : np.ndarray + Mean of each channel. Shape `(n_channels,)`. 
+ + stds : np.ndarray + Standard deviation of each channel. Shape `(n_channels,)`. + + """ + indices = indices if indices is not None else list(range(len(X))) + considered_values = X[indices, ...] if overlap else X[indices, :, -1:, :] + + means = considered_values.mean(axis=(0, 2, 3)) + stds = considered_values.std(axis=(0, 2, 3)) + + return means, stds + + +def prepare_robust_scaler(X, overlap=False, indices=None, percentile_range=(25, 75)): + """Compute median and percentile range for each channel. + + Parameters + ---------- + X : np.ndarray + Full features array of shape `(n_samples, n_channels, lookback, n_assets)`. + + overlap : bool + If False, then only the most recent timestep of each sample is used. This guarantees that + the same observation is not counted multiple times. + + indices : list or None + List of indices to consider from the `X.shape[0]` dimension. If None, + then all the samples are considered. + + percentile_range : tuple + The left and right percentile to consider. Needs to be in [0, 100]. + + Returns + ------- + medians : np.ndarray + Median of each channel. Shape `(n_channels,)`. + + ranges : np.ndarray + Difference between the right and left percentile of each channel. Shape `(n_channels,)`. + + """ + if not 0 <= percentile_range[0] < percentile_range[1] <= 100: + raise ValueError('The percentile range needs to be in [0, 100] and left < right') + + indices = indices if indices is not None else list(range(len(X))) + considered_values = X[indices, ...] if overlap else X[indices, :, -1:, :] + + medians = np.median(considered_values, axis=(0, 2, 3)) + percentiles = np.percentile(considered_values, percentile_range, axis=(0, 2, 3)) # (2, n_channels) + + ranges = percentiles[1] - percentiles[0] + + return medians, ranges + + class Compose: """Meta transform inspired by torchvision. 
diff --git a/tests/test_data/test_augment.py b/tests/test_data/test_augment.py new file mode 100644 index 0000000..d6f9029 --- /dev/null +++ b/tests/test_data/test_augment.py @@ -0,0 +1,53 @@ +"""Collection of tests focused on the `deepdow.data.augment`.""" + +import numpy as np +import pytest + +from deepdow.data import prepare_robust_scaler, prepare_standard_scaler + + +@pytest.mark.parametrize('overlap', [True, False]) +@pytest.mark.parametrize('indices', [None, [1, 4, 6]]) +def test_prepare_standard_scaler(overlap, indices): + n_samples, n_channels, lookback, n_assets = 10, 3, 5, 12 + + X = np.random.random((n_samples, n_channels, lookback, n_assets)) - 0.5 + + means, stds = prepare_standard_scaler(X, overlap=overlap, indices=indices) + + assert means.shape == (n_channels,) + assert stds.shape == (n_channels,) + assert np.all(stds > 0) + + +class TestPrepareRobustScaler: + + def test_error(self): + with pytest.raises(ValueError): + prepare_robust_scaler(np.ones((1, 2, 3, 4)), percentile_range=(20, 10)) + + with pytest.raises(ValueError): + prepare_robust_scaler(np.ones((1, 2, 3, 4)), percentile_range=(-2, 99)) + + @pytest.mark.parametrize('overlap', [True, False]) + @pytest.mark.parametrize('indices', [None, [1, 4, 6]]) + def test_basic(self, overlap, indices): + n_samples, n_channels, lookback, n_assets = 10, 3, 5, 12 + + X = np.random.random((n_samples, n_channels, lookback, n_assets)) - 0.5 + + medians, ranges = prepare_robust_scaler(X, overlap=overlap, indices=indices) + + assert medians.shape == (n_channels,) + assert ranges.shape == (n_channels,) + assert np.all(ranges > 0) + + def test_sanity(self): + n_samples, n_channels, lookback, n_assets = 10, 3, 5, 12 + + X = np.random.random((n_samples, n_channels, lookback, n_assets)) - 0.5 + + medians_1, ranges_1 = prepare_robust_scaler(X, percentile_range=(20, 80)) + medians_2, ranges_2 = prepare_robust_scaler(X, percentile_range=(10, 90)) + + assert np.all(ranges_2 > ranges_1) From 
c58a11152d055c6234553c1b819fdd1766e7ab58 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Sun, 7 Jun 2020 14:26:59 +0200 Subject: [PATCH 2/3] Create Scale transform --- deepdow/data/__init__.py | 3 +- deepdow/data/augment.py | 79 +++++++++++++++++++++++++++++++++ docs/source/data_loading.rst | 8 ++-- tests/test_data/test_augment.py | 35 ++++++++++++++- 4 files changed, 120 insertions(+), 5 deletions(-) diff --git a/deepdow/data/__init__.py b/deepdow/data/__init__.py index df74f51..bea0f5e 100644 --- a/deepdow/data/__init__.py +++ b/deepdow/data/__init__.py @@ -1,6 +1,6 @@ """Module dealing with data.""" -from .augment import (Compose, Dropout, Multiply, Noise, prepare_robust_scaler, +from .augment import (Compose, Dropout, Multiply, Noise, Scale, prepare_robust_scaler, prepare_standard_scaler) from .load import (FlexibleDataLoader, InRAMDataset, RigidDataLoader) @@ -11,5 +11,6 @@ 'Multiply', 'Noise', 'RigidDataLoader', + 'Scale', 'prepare_robust_scaler', 'prepare_standard_scaler'] diff --git a/deepdow/data/augment.py b/deepdow/data/augment.py index 2442729..2aaaee5 100644 --- a/deepdow/data/augment.py +++ b/deepdow/data/augment.py @@ -268,3 +268,82 @@ def __call__(self, X_sample, y_sample, timestamps_sample, asset_names): X_sample_new = self.frac * X_sample.std([1, 2], keepdim=True) * torch.randn_like(X_sample) + X_sample return X_sample_new, y_sample, timestamps_sample, asset_names + + +class Scale: + """Scale input features. + + The input features are per channel centered to zero and scaled to one. We use the same + terminology as scikit-learn. However, the equivalent in torchvision is `Normalize`. + + Parameters + ---------- + center : np.ndarray + 1D array of shape `(n_channels,)` representing the center of the features (mean or median). + Needs to be precomputed in advance. + + scale : np.ndarray + 1D array of shape `(n_channels,)` representing the scale of the features (standard deviation + or quantile range). Needs to be precomputed in advance. 
+ + See Also + -------- + prepare_robust_scaler + prepare_standard_scaler + """ + + def __init__(self, center, scale): + if len(center) != len(scale): + raise ValueError('The center and scale need to have the same size.') + + if np.any(scale <= 0): + raise ValueError('The scale parameters need to be positive.') + + self.center = center + self.scale = scale + self.n_channels = len(self.center) + + def __call__(self, X_sample, y_sample, timestamps_sample, asset_names): + """Perform transform. + + Parameters + ---------- + X_sample : torch.Tensor + Feature vector of shape `(n_channels, lookback, n_assets)`. + + y_sample : torch.Tensor + Target vector of shape `(n_channels, horizon, n_assets)`. + + timestamps_sample : datetime + Time stamp of the sample. + + asset_names + Asset names corresponding to the last channel of `X_sample` and `y_sample`. + + Returns + ------- + X_sample_new : torch.Tensor + Feature vector of shape `(n_channels, lookback, n_assets)` scaled appropriatelly. + + y_sample : torch.Tesnor + Same as input. + + timestamps_sample : datetime + Same as input. + + asset_names + Same as input. 
+ """ + n_channels = X_sample.shape[0] + if n_channels != self.n_channels: + raise ValueError('Expected {} channels in X, got {}'.format(self.n_channels, n_channels)) + + X_sample_new = X_sample.clone() + dtype, device = X_sample_new.dtype, X_sample_new.device + + center = torch.as_tensor(self.center, dtype=dtype, device=device)[:, None, None] + scale = torch.as_tensor(self.scale, dtype=dtype, device=device)[:, None, None] + + X_sample_new.sub_(center).div_(scale) + + return X_sample_new, y_sample, timestamps_sample, asset_names diff --git a/docs/source/data_loading.rst b/docs/source/data_loading.rst index ad2cd59..38b2ce7 100644 --- a/docs/source/data_loading.rst +++ b/docs/source/data_loading.rst @@ -261,10 +261,12 @@ Additionally, one can pass a transformation :code:`transform` that can serve as Currently implemented transforms under :code:`deepdow.data` are - :code:`Compose` - basically a copy of `Compose` from Torch Vision -- :code:`Dropout` - randomly setting elements to zero (not in place) -- :code:`Multiply` - multiplying all elements by a constant (not in place) -- :code:`Noise` - add Gaussian noise (not in place) +- :code:`Dropout` - randomly setting elements to zero +- :code:`Multiply` - multiplying all elements by a constant +- :code:`Noise` - add Gaussian noise +- :code:`Scale` - centering and scaling (similar to scikit-learn :code:`StandardScaler` and :code:`RobustScaler`) +None of the transforms operate in place. 
Dataloaders ----------- diff --git a/tests/test_data/test_augment.py b/tests/test_data/test_augment.py index d6f9029..82c0573 100644 --- a/tests/test_data/test_augment.py +++ b/tests/test_data/test_augment.py @@ -2,8 +2,9 @@ import numpy as np import pytest +import torch -from deepdow.data import prepare_robust_scaler, prepare_standard_scaler +from deepdow.data import Scale, prepare_robust_scaler, prepare_standard_scaler @pytest.mark.parametrize('overlap', [True, False]) @@ -51,3 +52,35 @@ def test_sanity(self): medians_2, ranges_2 = prepare_robust_scaler(X, percentile_range=(10, 90)) assert np.all(ranges_2 > ranges_1) + + +class TestScaler: + def test_errors(self): + with pytest.raises(ValueError): + Scale(np.ones(3), np.ones(4))  # center and scale differ in length + + with pytest.raises(ValueError): + Scale(np.array([1, -1]), np.array([9, -0.1]))  # scale has a non-positive entry + + tform = Scale(np.array([1, -1]), np.array([9, 10.])) + with pytest.raises(ValueError): + tform(torch.rand(3, 4, 5), None, None, None)  # wrong number of channels + + def test_overall(self): + n_channels, lookback, n_assets = 3, 5, 12 + + X = np.random.random((n_channels, lookback, n_assets)) + X_torch = torch.as_tensor(X) + dtype = X_torch.dtype + + center = X.mean(axis=(1, 2)) + scale = X.std(axis=(1, 2), ) + + tform = Scale(center, scale) + X_scaled = tform(X_torch, None, None, None)[0] + + assert torch.is_tensor(X_scaled) + assert X_torch.shape == X_scaled.shape + assert not torch.allclose(X_torch, X_scaled) + assert torch.allclose(X_scaled.mean(dim=(1, 2)), torch.zeros(n_channels, dtype=dtype)) + assert torch.allclose(X_scaled.std(dim=(1, 2), unbiased=False), torch.ones(n_channels, dtype=dtype)) From 6990cddaa6663dc8b767b87e00726065a72de7a1 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Sun, 7 Jun 2020 14:38:42 +0200 Subject: [PATCH 3/3] Fix typo --- deepdow/data/augment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepdow/data/augment.py b/deepdow/data/augment.py index 2aaaee5..d7df311 100644 --- a/deepdow/data/augment.py +++ 
b/deepdow/data/augment.py @@ -323,7 +323,7 @@ def __call__(self, X_sample, y_sample, timestamps_sample, asset_names): Returns ------- X_sample_new : torch.Tensor - Feature vector of shape `(n_channels, lookback, n_assets)` scaled appropriatelly. + Feature vector of shape `(n_channels, lookback, n_assets)` scaled appropriately. y_sample : torch.Tesnor Same as input.