danpovey
diff --git a/‎README.md
+39-6 b/‎README.md
+39-6
diff --git a/‎lilfilter/__init__.py
+1-1 b/‎lilfilter/__init__.py
+1-1
diff --git a/‎lilfilter/resampler.py
+202-105 b/‎lilfilter/resampler.py
+202-105
@@ -1,11 +1,44 @@
-# filtering
+## lilfilter
 
 Utilities for resampling and filtering audio data
 
-This repository will contain a Python package containing certain utilities for filtering and
-resampling audio data.
+This repository exports a Python package `lilfilter` containing certain
+utilities for filtering and resampling audio data.
+
+
+One quite-useful thing is class Resampler:
+```
+python3
+>>> import lilfilter
+>>> # ... let a be a Torch tensor of size (num_channels, num_samples)
+>>> # that we want to downsample from 42.1kHz to 16kHz.  Note,
+>>> # the sampling rates must be integers; only their ratio
+>>> # matters.
+>>> r = lilfilter.Resampler(42100, 16000, dtype=torch.float32)
+>>> b = r.resample(a)
+```
+
+Another thing that's useful is class Multistreamer, which can turn a
+signal into multiple parallel signals at a lower sampling rate, where
+pairs of those signals represent the (real, imaginary) parts of one
+complex frequency band of the input.
+```
+>>> import lilfilter
+>>> num_freq_bands = 8
+>>> m = lilfilter.Multistreamer(num_freq_bands)
+>>>
+>>> # ... let a be a Torch tensor of size (num_channels, num_samples)
+>>> # that we want to `demultiplex`.
+>>>
+>>> b = m.split(a)
+>>> # now b is of size (num_channels, 2, num_freq_bands, num_samples/num_freq_bands)
+>>> # (note: the dim of the last axis may be slightly different from that number).
+>>> # You can in principle manipulate b somehow, e.g. do some kind of machine
+>>> # learning with it, and then reconstruct to the original format:
+>>>
+>>> c = m.merge(b)
+>>> # now c is of size (num_channels, 8*(num_samples/8)) and will be extremely
+>>> # close to a.
+```
 
-The most thing exported, currently, is class Multistreamer which can be used to
-split an audio stream into multiple lower-frequency audio streams, each one
-representing one frequency band of the input.
 
@@ -1,3 +1,3 @@
 
 from . multistreamer import Multistreamer
-
+from . resampler import Resampler
@@ -1,12 +1,8 @@
 # To be run with python3
 
 """
-CAUTION: you probably want to use ./torch_resampler.py instead; it's more
-general.
-
-This module defines an object that can be used for upsampling and downsampling
-of signals.  Note: unlike ./filters.py, this object has a torch dependency.
-(It uses ./filters.py for initialization though.)
+This module defines an object that can be used for signal resampling.
+It has a torch dependency because it does the resampling via 1d convolution.
 """
 
 
@@ -15,114 +11,215 @@
 import math
 import torch
 
-class Resampler:
 
-    def __init__(self, N, num_zeros = 32,
-                 filter_cutoff_ratio = 0.95,
-                 full_padding = False,
-                 double_precision = False):
-        """
-        This creates an object which can be used for both upsampling and
-        downsampling of signals.  This involves creating a low-pass filter with
-        the appropriate cutoff.
+def gcd(a, b):
+    """ Return the greatest common divisor of a and b"""
+    assert isinstance(a, int) and isinstance(b, int)
+    if b == 0:
+        return a
+    else:
+        return gcd(b, a % b)
 
-        Args:
-             N (int):   The downsampling or upsampling  ratio.  For example,
-                     4 would mean we downsample or upsample by a factor of 4.
-                     Must be > 1.
-
-             num_zeros (int): The number of zeros in the filter function..
-                     a larger number will give a sharper cutoff, but will be
-                     slower.
-
-             filter_cutoff_ratio (float):  Determines where we place the
-                     cutoff of the filter used for upsampling and
-                     downsampling, relative to the Nyquist of the lower
-                     of the two frequencies.  Must be >0.5 and <1.0.
-
-             full_padding (bool):  If true, will pad on each side with
-                     (filter_width - 1) which ensures that a sufficiently-low-pass
-                     signal that's upsampled and then downsampled will
-                     undergo the round trip with minimal end effects.
-                     If false, we pad with filter_width when downsampling,
-                     which will give a signal length closer to
-                     input_signal_length / N and enables easier
-                     mapping of time offsets,(without worrying about time
-                     offsets).
-
-             double_precision:  If true, will use torch.float64 for the filter
-                     (and expect this for the input); else will use torch.float32.
+class Resampler:
+    """
+    This object should ideally be initialized once and used many times,
+    but the construction time shouldn't be excessive.
+    Please read the documentation carefully!
+    """
+
+    def __init__(self,
+                 input_sr, output_sr, dtype,
+                 num_zeros = 64, cutoff_ratio = 0.95):
         """
-        self.N = N
-        if not (isinstance(N, int) and isinstance(num_zeros, int) and isinstance(filter_cutoff_ratio, float)):
-            raise TypeError("One of the args has the wrong type")
-        if N <= 1 or num_zeros < 2:
-            raise ValueError("Require N > 1 and num_zeros > 1")
-        if not (filter_cutoff_ratio > 0.5 and filter_cutoff_ratio < 1.0):
-            raise ValueError("Invalid number for filter_cutoff_ratio: ",
-                             filter_cutoff_ratio)
-
-        self.dtype = (torch.float64 if double_precision else torch.float32)
-
-        # f is a numpy array.  i is its central index, not really needed.
-        (f, i) = filters.low_pass_filter(filter_cutoff_ratio / (N * 2),
-                                         num_zeros = num_zeros)
-
-
-        f_len = f.shape[0]
+        This creates an object that can apply a symmetric FIR filter
+        based on torch.nn.functional.conv1d.
 
-        # self.filter is a torch.Tensor whose dimension is interpreted
-        # as (out_channels, in_channels, width) where out_channels and
-        # in_channels are both 1.
-        self.forward_filter = torch.tensor(f, dtype=self.dtype).view(1, 1, f_len)
-
-        self.backward_filter = self.forward_filter * N
+        Args:
+          input_sr:  The input sampling rate, AS A SMALL INTEGER.
+              It does not have to be the real sampling rate but should
+              have the correct ratio with output_sr.
+          output_sr:  The output sampling rate, AS A SMALL INTEGER.
+              It is the ratio with the input sampling rate that is
+              important here.
+          dtype:  The torch dtype to use for computations
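+          num_zeros: The number of zeros per side in the (sinc*hanning-window)
+              filter function.  More is more accurate, but 64 is already
+              quite a lot.
+          cutoff_ratio: The filter cutoff, as a fraction of the Nyquist
+              frequency of the lower of the two sampling rates.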
+
+        You can think of this algorithm as dividing up the signals
+        (input,output) into blocks where there are `input_sr` input
+        samples and `output_sr` output samples.  Then we treat it
+        using convolutional code, imagining there are `input_sr`
+        input channels and `output_sr` output channels per time step.
 
-        if full_padding:
-            self.padding = f_len - 1
+        """
+        assert isinstance(input_sr, int) and isinstance(output_sr, int)
+        if input_sr == output_sr:
+            self.resample_type = 'trivial'
+            return
+        d = gcd(input_sr, output_sr)
+        input_sr, output_sr = input_sr // d, output_sr // d
+
+        assert dtype in [torch.float32, torch.float64]
+        assert num_zeros > 3  # a reasonable bare minimum
+        np_dtype = np.float32 if dtype == torch.float32 else np.float64
+
+        # Define one 'block' of samples as `input_sr` input samples
+        # and `output_sr` output samples.  We can divide up
+        # the samples into these blocks and have the blocks be
+        # in correspondence.
+
+        # The sinc function will have, on average, `zeros_per_block`
+        # zeros per block.
+        zeros_per_block = min(input_sr, output_sr) * cutoff_ratio
+
+        # The convolutional kernel size will be n = (blocks_per_side*2 + 1),
+        # i.e. we add that many blocks on each side of the central block.  The
+        # window radius (defined as distance from center to edge)
+        # is `blocks_per_side` blocks.  This ensures that each sample in the
+        # central block can "see" all the samples in its window.
+        #
+        # Assuming the following division is not exact, adding 1
+        # will have the same effect as rounding up.
+        blocks_per_side = 1 + int(num_zeros / zeros_per_block)
+
+        kernel_width = 2*blocks_per_side + 1
+
+        # We want the weights as used by torch's conv1d code; format is
+        #  (out_channels, in_channels, kernel_width)
+        # https://pytorch.org/docs/stable/nn.functional.html
+        weights = torch.tensor((output_sr, input_sr, kernel_width), dtype=dtype)
+
+        # Computations involving time will be in units of 1 block.  Actually this
+        # is the same as the `canonical` time axis since each block has input_sr
+        # input samples, so it would be one of whatever time unit we are using
+        window_radius_in_blocks = blocks_per_side
+
+
+        # The `times` below will end up being the args to the sinc function.
+        # For the shapes of the things below, look at the args to `view`.  The terms
+        # below will get expanded to shape (output_sr, input_sr, kernel_width) through
+        # broadcasting
+        # We want it so that, assuming input_sr == output_sr, along the diagonal of
+        # the central block we have t == 0.
+        # The signs of the output_sr and input_sr terms need to be opposite.  The
+        # sign that the kernel_width term needs to be will depend on whether it's
+        # convolution or correlation, and the logic is tricky.. I will just find
+        # which sign works.
+
+
+        times = (
+            np.arange(output_sr, dtype=np_dtype).reshape((output_sr, 1, 1)) / output_sr -
+            np.arange(input_sr, dtype=np_dtype).reshape((1, input_sr, 1)) / input_sr -
+            (np.arange(kernel_width, dtype=np_dtype).reshape((1, 1, kernel_width)) - blocks_per_side))
+
+
+        def window_func(a):
+            """
+            window_func returns the Hann window on [-1,1], which is zero
+            if a < -1 or a > 1, and otherwise 0.5 + 0.5 cos(a*pi).
+            This is applied elementwise to a, which should be a NumPy array.
+
+            The heaviside function returns (a > 0 ? 1 : 0).
+            """
+            return np.heaviside(1 - np.abs(a), 0.0) * (0.5 + 0.5 * np.cos(a * np.pi))
+
+
+        # The weights below are a sinc function times a Hann-window function.
+        #
+        # multiplication by zeros_per_block can be seen as correctly normalizing
+        # the sinc function (to compensate for scaling on the x-axis), so that
+        # its integral is 1.
+        #
+        # division by input_sr can be interpreted as normalizing the input
+        # function correctly...if we view it as a stream of dirac deltas that's
+        # passed through a low pass filter and want that to have the same
+        # magnitude as the original input function, we need to divide by the
+        # number of those deltas per unit time.
+        weights = (np.sinc(times * zeros_per_block)
+                   * window_func(times / window_radius_in_blocks)
+                   * zeros_per_block / input_sr)
+
+        self.input_sr = input_sr
+        self.output_sr = output_sr
+
+
+        # OK, at this point the dim of the weights is (output_sr, input_sr,
+        # kernel_width).  If output_sr == 1, we can fold the input_sr into the
+        # kernel_width (i.e. have just 1 input channel); this will make the
+        # convolution faster and avoid unnecessary reshaping.
+
+        assert weights.shape ==  (output_sr, input_sr, kernel_width)
+        if output_sr == 1:
+            self.resample_type = 'integer_downsample'
+            self.padding = input_sr * blocks_per_side
+            weights = torch.tensor(weights, dtype=dtype)
+            self.weights = weights.transpose(1, 2).contiguous().view(1, 1, input_sr * kernel_width)
+        elif input_sr == 1:
+            # In this case we'll be doing conv_transpose, so we want the same weights that
+            # we would have if we were *downsampling* by this factor-- i.e. as if input_sr,
+            # output_sr had been swapped.
+            self.resample_type = 'integer_upsample'
+            self.padding = output_sr * blocks_per_side
+            weights = torch.tensor(weights, dtype=dtype)
+            self.weights = weights.flip(2).transpose(0, 2).contiguous().view(1, 1, output_sr * kernel_width)
         else:
-            self.padding = (f_len - 1) // 2
+            self.resample_type = 'general'
+            self.reshaped = False
+            self.padding = blocks_per_side
+            self.weights = torch.tensor(weights, dtype=dtype)
 
 
 
-    def downsample(self, input):
+    def resample(self, in_data):
         """
-        This downsamples the signal `input` and returns the result.
-        Args:
-         input (torch.Tensor): A Tensor with shape (minibatch_size, signal_length),
-              and dtype torch.float64 if double_precision to constructor was true,
-              else torch.float32.
+        Resample the data
 
-        Return:
-            Returns a torch.Tensor with shape (minibatch_size, reduced_signal_length).
-        """
-        if not isinstance(input, torch.Tensor):
-            raise TypeError("Expected input to be torch.Tensor, got ",
-                            type(input))
-        if not (input.dtype == self.dtype):
-            raise TypeError("Expected input tensor to have dtype {}, got {}".format(
-                    self.dtype, input.dtype))
-
-        # The squeeze and unsqueeze are to insert a dim for num_channels == 1.
-        return torch.nn.functional.conv1d(input.unsqueeze(1),
-                                          self.forward_filter,
-                                          stride=self.N,
-                                          padding=self.padding).squeeze(1)
-
-    def upsample(self, input):
-        """
-        This upsamples the signal `input`.
+        Args:
+         in_data: a torch.Tensor with the same dtype as was passed to the
+           constructor.
+         There must be 2 axes, interpreted as (minibatch_size, sequence_length)...
+         the minibatch_size may in practice be the number of channels.
+
+        Return:  Returns a torch.Tensor with the same dtype as the input, and
+         dimension (minibatch_size, (sequence_length//input_sr)*output_sr),
+         where input_sr and output_sr are the corresponding constructor args,
+         modified to remove any common factors.
         """
-        if not isinstance(input, torch.Tensor):
-            raise TypeError("Expected input to be torch.Tensor, got ",
-                            type(input))
-        if not (input.dtype == self.dtype):
-            raise TypeError("Expected input tensor to have dtype {}, got {}".format(
-                    self.dtype, input.dtype))
-
-
-        # The squeeze and unsqueeze are to insert a dim for num_channels == 1.
-        return torch.nn.functional.conv_transpose1d(input.unsqueeze(1),
-                                                    self.backward_filter,
-                                                    stride=self.N,
-                                                    padding=self.padding).squeeze(1)
+        if self.resample_type == 'trivial':
+            return in_data
+        elif self.resample_type == 'integer_downsample':
+            (minibatch_size, seq_len) = in_data.shape
+            # will be shape (minibatch_size, in_channels, seq_len) with in_channels == 1
+            in_data = in_data.unsqueeze(1)
+            out = torch.nn.functional.conv1d(in_data,
+                                             self.weights,
+                                             stride=self.input_sr,
+                                             padding=self.padding)
+            # shape will be (minibatch_size, out_channels = 1, out_seq_len), where
+            # out_seq_len is the strided (downsampled) length; return as (minibatch_size, out_seq_len)
+            return out.squeeze(1)
+        elif self.resample_type == 'integer_upsample':
+            out = torch.nn.functional.conv_transpose1d(in_data.unsqueeze(1),
+                                                       self.weights,
+                                                       stride=self.output_sr,
+                                                       padding=self.padding)
+            return out.squeeze(1)
+        else:
+            assert self.resample_type == 'general'
+            (minibatch_size, seq_len) = in_data.shape
+            num_blocks = seq_len // self.input_sr
+            if num_blocks == 0:
+                # TODO: pad with zeros.
+                raise RuntimeError("Signal is too short to resample")
+            in_data = in_data[:, 0:(num_blocks*self.input_sr)]  # Truncate input
+            in_data = in_data.view(minibatch_size, num_blocks, self.input_sr)
+
+            # Torch's conv1d actually expects input data with shape (minibatch,
+            # in_channels, width) so we need to reshape (note: time is width).
+            in_data = in_data.transpose(1, 2)
+
+            out = torch.nn.functional.conv1d(in_data, self.weights,
+                                             padding=self.padding)
+            assert out.shape == (minibatch_size, self.output_sr, num_blocks)
+            return out.transpose(1, 2).contiguous().view(minibatch_size, num_blocks * self.output_sr)
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,3 @@`
`1`	`1`
`2`	`2`	`from . multistreamer import Multistreamer`
`3`		`-`
	`3`	`+from . resampler import Resampler`