Scale transform (#53)
* Implement center and scale extractors

* Create Scale transform
jankrepl authored Jun 7, 2020
1 parent d9fbfc4 commit ca6bd2c
Showing 4 changed files with 253 additions and 5 deletions.
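Before the file-by-file diff, here is a minimal sketch of the workflow this commit enables. The shapes follow the docstrings added below; the random data and the train-only `indices` split are purely illustrative:

import numpy as np
import torch

from deepdow.data import Scale, prepare_standard_scaler

# Illustrative data of shape (n_samples, n_channels, lookback, n_assets).
X = np.random.random((100, 2, 10, 5))

# Estimate per-channel statistics, here from the first 80 samples only
# (e.g. to keep validation samples out of the estimate).
means, stds = prepare_standard_scaler(X, indices=list(range(80)))

# The transform acts on a single sample of shape (n_channels, lookback, n_assets).
tform = Scale(means, stds)
X_scaled, _, _, _ = tform(torch.as_tensor(X[0]), None, None, None)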
8 changes: 6 additions & 2 deletions deepdow/data/__init__.py
@@ -1,6 +1,7 @@
"""Module dealing with data."""

from .augment import (Compose, Dropout, Multiply, Noise)
from .augment import (Compose, Dropout, Multiply, Noise, Scale, prepare_robust_scaler,
prepare_standard_scaler)
from .load import (FlexibleDataLoader, InRAMDataset, RigidDataLoader)

__all__ = ['Compose',
@@ -9,4 +10,7 @@
            'InRAMDataset',
            'Multiply',
            'Noise',
-           'RigidDataLoader']
+           'RigidDataLoader',
+           'Scale',
+           'prepare_robust_scaler',
+           'prepare_standard_scaler']
156 changes: 156 additions & 0 deletions deepdow/data/augment.py
@@ -1,8 +1,85 @@
"""Collection of callable functions that augment deepdow tensors."""

import numpy as np
import torch


def prepare_standard_scaler(X, overlap=False, indices=None):
    """Compute mean and standard deviation for each channel.

    Parameters
    ----------
    X : np.ndarray
        Full features array of shape `(n_samples, n_channels, lookback, n_assets)`.
    overlap : bool
        If False, then only the most recent timestep of each sample is used, which guarantees
        that the same values are not counted multiple times.
    indices : list or None
        List of indices to consider from the `X.shape[0]` dimension. If None,
        then all samples are considered.

    Returns
    -------
    means : np.ndarray
        Mean of each channel. Shape `(n_channels,)`.
    stds : np.ndarray
        Standard deviation of each channel. Shape `(n_channels,)`.
    """
    indices = indices if indices is not None else list(range(len(X)))
    considered_values = X[indices, ...] if overlap else X[indices, :, -1:, :]

    means = considered_values.mean(axis=(0, 2, 3))
    stds = considered_values.std(axis=(0, 2, 3))

    return means, stds


def prepare_robust_scaler(X, overlap=False, indices=None, percentile_range=(25, 75)):
    """Compute median and percentile range for each channel.

    Parameters
    ----------
    X : np.ndarray
        Full features array of shape `(n_samples, n_channels, lookback, n_assets)`.
    overlap : bool
        If False, then only the most recent timestep of each sample is used, which guarantees
        that the same values are not counted multiple times.
    indices : list or None
        List of indices to consider from the `X.shape[0]` dimension. If None,
        then all samples are considered.
    percentile_range : tuple
        The left and right percentile to consider. Needs to be in [0, 100].

    Returns
    -------
    medians : np.ndarray
        Median of each channel. Shape `(n_channels,)`.
    ranges : np.ndarray
        Interquantile range for each channel. Shape `(n_channels,)`.
    """
    if not 0 <= percentile_range[0] < percentile_range[1] <= 100:
        raise ValueError('The percentile range needs to be in [0, 100] and left < right')

    indices = indices if indices is not None else list(range(len(X)))
    considered_values = X[indices, ...] if overlap else X[indices, :, -1:, :]

    medians = np.median(considered_values, axis=(0, 2, 3))
    percentiles = np.percentile(considered_values, percentile_range, axis=(0, 2, 3))  # (2, n_channels)

    ranges = percentiles[1] - percentiles[0]

    return medians, ranges


class Compose:
    """Meta transform inspired by torchvision.
@@ -191,3 +268,82 @@ def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        X_sample_new = self.frac * X_sample.std([1, 2], keepdim=True) * torch.randn_like(X_sample) + X_sample

        return X_sample_new, y_sample, timestamps_sample, asset_names


class Scale:
    """Scale input features.

    The input features are per channel centered to zero and scaled to one. We use the same
    terminology as scikit-learn. However, the equivalent in torchvision is `Normalize`.

    Parameters
    ----------
    center : np.ndarray
        1D array of shape `(n_channels,)` representing the center of the features (mean or median).
        Needs to be precomputed in advance.
    scale : np.ndarray
        1D array of shape `(n_channels,)` representing the scale of the features (standard deviation
        or quantile range). Needs to be precomputed in advance.

    See Also
    --------
    prepare_robust_scaler
    prepare_standard_scaler
    """

    def __init__(self, center, scale):
        if len(center) != len(scale):
            raise ValueError('The center and scale need to have the same size.')

        if np.any(scale <= 0):
            raise ValueError('The scale parameters need to be positive.')

        self.center = center
        self.scale = scale
        self.n_channels = len(self.center)

    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Perform transform.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.
        y_sample : torch.Tensor
            Target vector of shape `(n_channels, horizon, n_assets)`.
        timestamps_sample : datetime
            Timestamp of the sample.
        asset_names
            Asset names corresponding to the last channel of `X_sample` and `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)` scaled appropriately.
        y_sample : torch.Tensor
            Same as input.
        timestamps_sample : datetime
            Same as input.
        asset_names
            Same as input.
        """
        n_channels = X_sample.shape[0]
        if n_channels != self.n_channels:
            raise ValueError('Expected {} channels in X, got {}'.format(self.n_channels, n_channels))

        X_sample_new = X_sample.clone()
        dtype, device = X_sample_new.dtype, X_sample_new.device

        center = torch.as_tensor(self.center, dtype=dtype, device=device)[:, None, None]
        scale = torch.as_tensor(self.scale, dtype=dtype, device=device)[:, None, None]

        X_sample_new.sub_(center).div_(scale)

        return X_sample_new, y_sample, timestamps_sample, asset_names
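To make the two estimators concrete: both return per-channel statistics of shape `(n_channels,)`, and the `overlap` flag only controls how many timesteps enter the computation. A small sketch under the same shape conventions:

import numpy as np

from deepdow.data import prepare_robust_scaler, prepare_standard_scaler

X = np.random.random((50, 3, 7, 4))  # (n_samples, n_channels, lookback, n_assets)

# Standard vs. outlier-robust statistics, each of shape (n_channels,).
means, stds = prepare_standard_scaler(X)
medians, ranges = prepare_robust_scaler(X, percentile_range=(25, 75))

# overlap=False (the default) uses only X[:, :, -1:, :], since consecutive
# lookback windows share timesteps; overlap=True uses every timestep.
means_all, stds_all = prepare_standard_scaler(X, overlap=True)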
8 changes: 5 additions & 3 deletions docs/source/data_loading.rst
@@ -261,10 +261,12 @@ Additionally, one can pass a transformation :code:`transform` that can serve as
 Currently implemented transforms under :code:`deepdow.data` are

 - :code:`Compose` - basically a copy of `Compose` from Torch Vision
-- :code:`Dropout` - randomly setting elements to zero (not in place)
-- :code:`Multiply` - multiplying all elements by a constant (not in place)
-- :code:`Noise` - add Gaussian noise (not in place)
+- :code:`Dropout` - randomly setting elements to zero
+- :code:`Multiply` - multiplying all elements by a constant
+- :code:`Noise` - adding Gaussian noise
+- :code:`Scale` - centering and scaling (similar to scikit-learn :code:`StandardScaler` and :code:`RobustScaler`)
+
+None of the transforms are performed in place.

 Dataloaders
 -----------
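Putting the pieces together, a hypothetical wiring sketch. The `InRAMDataset(X, y, transform=...)` call and the 4-tuple sample layout are assumptions based on the docs above, not part of this diff:

import numpy as np

from deepdow.data import Compose, InRAMDataset, Scale, prepare_robust_scaler

X = np.random.random((100, 2, 10, 5)).astype('float32')  # (n_samples, n_channels, lookback, n_assets)
y = np.random.random((100, 2, 3, 5)).astype('float32')   # (n_samples, n_channels, horizon, n_assets)

medians, ranges = prepare_robust_scaler(X)

# Assumed API: the dataset applies the transform to every sample it yields.
dataset = InRAMDataset(X, y, transform=Compose([Scale(medians, ranges)]))
X_sample, y_sample, timestamps_sample, asset_names = dataset[0]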
86 changes: 86 additions & 0 deletions tests/test_data/test_augment.py
@@ -0,0 +1,86 @@
"""Collection of tests focused on the `deepdow.data.augment`."""

import numpy as np
import pytest
import torch

from deepdow.data import Scale, prepare_robust_scaler, prepare_standard_scaler


@pytest.mark.parametrize('overlap', [True, False])
@pytest.mark.parametrize('indices', [None, [1, 4, 6]])
def test_prepare_standard_scaler(overlap, indices):
    n_samples, n_channels, lookback, n_assets = 10, 3, 5, 12

    X = np.random.random((n_samples, n_channels, lookback, n_assets)) - 0.5

    means, stds = prepare_standard_scaler(X, overlap=overlap, indices=indices)

    assert means.shape == (n_channels,)
    assert stds.shape == (n_channels,)
    assert np.all(stds > 0)


class TestPrepareRobustScaler:

    def test_error(self):
        with pytest.raises(ValueError):
            prepare_robust_scaler(np.ones((1, 2, 3, 4)), percentile_range=(20, 10))

        with pytest.raises(ValueError):
            prepare_robust_scaler(np.ones((1, 2, 3, 4)), percentile_range=(-2, 99))

    @pytest.mark.parametrize('overlap', [True, False])
    @pytest.mark.parametrize('indices', [None, [1, 4, 6]])
    def test_basic(self, overlap, indices):
        n_samples, n_channels, lookback, n_assets = 10, 3, 5, 12

        X = np.random.random((n_samples, n_channels, lookback, n_assets)) - 0.5

        medians, ranges = prepare_robust_scaler(X, overlap=overlap, indices=indices)

        assert medians.shape == (n_channels,)
        assert ranges.shape == (n_channels,)
        assert np.all(ranges > 0)

    def test_sanity(self):
        n_samples, n_channels, lookback, n_assets = 10, 3, 5, 12

        X = np.random.random((n_samples, n_channels, lookback, n_assets)) - 0.5

        medians_1, ranges_1 = prepare_robust_scaler(X, percentile_range=(20, 80))
        medians_2, ranges_2 = prepare_robust_scaler(X, percentile_range=(10, 90))

        assert np.all(ranges_2 > ranges_1)


class TestScale:
    def test_errors(self):
        with pytest.raises(ValueError):
            Scale(np.ones(3), np.ones(4))

        with pytest.raises(ValueError):
            Scale(np.array([1, -1]), np.array([9, -0.1]))

        tform = Scale(np.array([1, -1]), np.array([9, 10.]))
        with pytest.raises(ValueError):
            tform(torch.rand(3, 4, 5), None, None, None)

    def test_overall(self):
        n_channels, lookback, n_assets = 3, 5, 12

        X = np.random.random((n_channels, lookback, n_assets))
        X_torch = torch.as_tensor(X)
        dtype = X_torch.dtype

        center = X.mean(axis=(1, 2))
        scale = X.std(axis=(1, 2))

        tform = Scale(center, scale)
        X_scaled = tform(X_torch, None, None, None)[0]

        assert torch.is_tensor(X_scaled)
        assert X_torch.shape == X_scaled.shape
        assert not torch.allclose(X_torch, X_scaled)
        assert torch.allclose(X_scaled.mean(dim=(1, 2)), torch.zeros(n_channels, dtype=dtype))
        assert torch.allclose(X_scaled.std(dim=(1, 2), unbiased=False), torch.ones(n_channels, dtype=dtype))
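One detail worth flagging in `test_overall`: NumPy's `std` defaults to the population estimate (`ddof=0`), while `torch.std` applies Bessel's correction by default, so the final assertion must pass `unbiased=False` to compare like with like. A quick illustration:

import numpy as np
import torch

x = np.random.random(100)
t = torch.as_tensor(x)

assert np.isclose(np.std(x), t.std(unbiased=False).item())  # population std matches
assert np.std(x) < t.std().item()  # torch's default is Bessel-corrected, hence larger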
