Optimize down and up samplers by using FFT convolution

This improves down sampling performance by 40 percent in all cases on x64 with AVX and up sampling performance by 50 percent in the case of large input block sizes (256 samples or more) on x64 with AVX. Up sampling performance in the case of normal input block sizes (128 samples) is not affected. Performance improvements on x64 with SSE but without AVX are even bigger than those on x64 with AVX. On Arm, the performance improvement is 270 percent. When FFT resampling is used for the WaveShaper, we need to account for some small round-off errors in the result compared with the previous direct convolver. Slightly adjust how the thresholds work and the threshold value for the tests. Thanks to Raymond Toy (rtoy@) for these layout test updates. Bug: 851941 Change-Id: I2dc4bb30ccc6d1314a3e779a61ffac210cf702a1 Reviewed-on: https://chromium-review.googlesource.com/1090841 Commit-Queue: Eero Häkkinen <eero.hakkinen@intel.com> Reviewed-by: Raymond Toy <rtoy@chromium.org> Cr-Commit-Position: refs/heads/master@{#569217}

Optimize down and up samplers by using FFT convolution
This improves down sampling performance by 40 percent in all cases on x64 with AVX and up sampling performance by 50 percent in the case of large input block sizes (256 samples or more) on x64 with AVX. Up sampling performance in the case of normal input block sizes (128 samples) is not affected. Performance improvements on x64 with SSE but without AVX are even bigger than those on x64 with AVX. On Arm, the performance improvement is 270 percent. When FFT resampling is used for the WaveShaper, we need to account for some small round-off errors in the result compared with the previous direct convolver. Slightly adjust how the thresholds work and the threshold value for the tests. Thanks to Raymond Toy (rtoy@) for these layout test updates. Bug: 851941 Change-Id: I2dc4bb30ccc6d1314a3e779a61ffac210cf702a1 Reviewed-on: https://chromium-review.googlesource.com/1090841 Commit-Queue: Eero Häkkinen <eero.hakkinen@intel.com> Reviewed-by: Raymond Toy <rtoy@chromium.org> Cr-Commit-Position: refs/heads/master@{#569217}
deebd6c2 · Eero Häkkinen · Commit Bot · 36b93653 · deebd6c2 · deebd6c2
Commit deebd6c2 authored Jun 21, 2018 by Eero Häkkinen Committed by Commit Bot Jun 21, 2018
8 changed files
--- a/third_party/WebKit/LayoutTests/webaudio/AudioNode/tail-processing.html
+++ b/third_party/WebKit/LayoutTests/webaudio/AudioNode/tail-processing.html
@@ -265,7 +265,9 @@
              let prefix = 'WaveShaper';
              let output = renderedBuffer.getChannelData(0);
              let response = renderedBuffer.getChannelData(1);
-              let tailFrame = findTailFrame(response);
+              // FFT resampler introduces some very small round-off.  Use a
+              // threshold of zero to find the tail frame.
+              let tailFrame = findTailFrame(response, 0);

              should(tailFrame, `${prefix} tail frame (${tailFrame})`)
                  .beGreaterThan(0);
@@ -282,7 +284,7 @@
            .then(() => task.done());
      });

-      audit.run();
+      audit.run('waveshaper-tail');

      function runTest(nodeName, nodeOptions) {
        // Two-channel output.  Channel 0 is the test result; channel 1 is the
@@ -319,15 +321,15 @@
      // Starting from the end find the first frame that exceeds our threshold.
      // This assumes that everything less than the threshold is equivalent to 0
      // for our purposes.
-      function findTailFrame(response) {
+      function findTailFrame(response, zeroThreshold) {
        // Any value below this is considered to be zero for our purpose of
        // finding the first non-zero value.  Somewhat arbitrary, but the value
        // here is the size of the LSB of a 16-bit PCM sample.
-        let zeroThreshold = 1 / 32768.
+        let threshold = zeroThreshold === undefined ? 1 / 32768 : zeroThreshold;
        let tailFrame = response.length;

        for (let k = response.length - 1; k >= 0; --k) {
-          if (Math.abs(response[k]) > zeroThreshold) {
+          if (Math.abs(response[k]) > threshold) {
            tailFrame = k + 1;
            break;
          }

--- a/third_party/WebKit/LayoutTests/webaudio/WaveShaper/waveshaper-oversample-4x.html
+++ b/third_party/WebKit/LayoutTests/webaudio/WaveShaper/waveshaper-oversample-4x.html
@@ -20,7 +20,7 @@
        // Should generate harmonics at 18000, 36000, 54000, 72000
        // All except for 18000 should be filtered out with the 4x oversampling.
        'fundamentalFrequency': 18000,
-        'acceptableAliasingThresholdDecibels': -79.9,
+        'acceptableAliasingThresholdDecibels': -79.88,
        description: '4x WaveShaperNode oversampling'
      };
      runWaveShaperOversamplingTest(testParams);

--- a/third_party/blink/renderer/platform/BUILD.gn
+++ b/third_party/blink/renderer/platform/BUILD.gn
@@ -406,6 +406,8 @@ jumbo_component("platform") {
    "audio/reverb_convolver_stage.h",
    "audio/reverb_input_buffer.cc",
    "audio/reverb_input_buffer.h",
+    "audio/simple_fft_convolver.cc",
+    "audio/simple_fft_convolver.h",
    "audio/sinc_resampler.cc",
    "audio/sinc_resampler.h",
    "audio/stereo_panner.cc",

--- a/third_party/blink/renderer/platform/audio/down_sampler.h
+++ b/third_party/blink/renderer/platform/audio/down_sampler.h
@@ -32,7 +32,7 @@
 #define THIRD_PARTY_BLINK_RENDERER_PLATFORM_AUDIO_DOWN_SAMPLER_H_

 #include "third_party/blink/renderer/platform/audio/audio_array.h"
-#include "third_party/blink/renderer/platform/audio/direct_convolver.h"
+#include "third_party/blink/renderer/platform/audio/simple_fft_convolver.h"
 #include "third_party/blink/renderer/platform/wtf/allocator.h"
 #include "third_party/blink/renderer/platform/wtf/noncopyable.h"

@@ -62,8 +62,8 @@ class PLATFORM_EXPORT DownSampler {

  size_t input_block_size_;

-  // Half-band filter.
-  DirectConvolver convolver_;
+  // Half-band filter. SimpleFFTConvolver is always faster than DirectConvolver.
+  SimpleFFTConvolver convolver_;

  AudioFloatArray temp_buffer_;


--- a/third_party/blink/renderer/platform/audio/simple_fft_convolver.cc
+++ b/third_party/blink/renderer/platform/audio/simple_fft_convolver.cc
+// Copyright 2018 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "third_party/blink/renderer/platform/audio/simple_fft_convolver.h"
+#include "third_party/blink/renderer/platform/audio/vector_math.h"
+
+namespace blink {
+
+SimpleFFTConvolver::SimpleFFTConvolver(
+    size_t input_block_size,
+    const std::unique_ptr<AudioFloatArray>& convolution_kernel)
+    : convolution_kernel_size_(convolution_kernel->size()),
+      fft_kernel_(2 * input_block_size),
+      frame_(2 * input_block_size),
+      input_buffer_(2 *
+                    input_block_size),  // 2nd half of buffer is always zeroed
+      output_buffer_(2 * input_block_size),
+      last_overlap_buffer_(input_block_size) {
+  DCHECK_LE(convolution_kernel_size_, FftSize() / 2);
+  // Do padded FFT to get frequency-domain version of the convolution kernel.
+  // This FFT and caching is done once in here so that it does not have to be
+  // done repeatedly in |Process|.
+  fft_kernel_.DoPaddedFFT(convolution_kernel->Data(), convolution_kernel_size_);
+}
+
+void SimpleFFTConvolver::Process(const float* source_p,
+                                 float* dest_p,
+                                 size_t frames_to_process) {
+  size_t half_size = FftSize() / 2;
+
+  // frames_to_process must be exactly half_size.
+  DCHECK(source_p);
+  DCHECK(dest_p);
+  DCHECK_EQ(frames_to_process, half_size);
+  if (!(source_p && dest_p && frames_to_process == half_size))
+    return;
+
+  // Do padded FFT (get frequency-domain version) by copying samples to the 1st
+  // half of the input buffer (the second half is always zero), multiply in
+  // frequency-domain and do inverse FFT to get output samples.
+  input_buffer_.CopyToRange(source_p, 0, half_size);
+  frame_.DoFFT(input_buffer_.Data());
+  frame_.Multiply(fft_kernel_);
+  frame_.DoInverseFFT(output_buffer_.Data());
+
+  // Overlap-add 1st half with 2nd half from previous time and write
+  // to destination.
+  VectorMath::Vadd(output_buffer_.Data(), 1, last_overlap_buffer_.Data(), 1,
+                   dest_p, 1, half_size);
+
+  // Finally, save 2nd half for the next time.
+  last_overlap_buffer_.CopyToRange(output_buffer_.Data() + half_size, 0,
+                                   half_size);
+}
+
+void SimpleFFTConvolver::Reset() {
+  last_overlap_buffer_.Zero();
+}
+
+}  // namespace blink
--- a/third_party/blink/renderer/platform/audio/simple_fft_convolver.h
+++ b/third_party/blink/renderer/platform/audio/simple_fft_convolver.h
+// Copyright 2018 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef THIRD_PARTY_BLINK_RENDERER_PLATFORM_AUDIO_SIMPLE_FFT_CONVOLVER_H_
+#define THIRD_PARTY_BLINK_RENDERER_PLATFORM_AUDIO_SIMPLE_FFT_CONVOLVER_H_
+
+#include <memory>
+
+#include "third_party/blink/renderer/platform/audio/audio_array.h"
+#include "third_party/blink/renderer/platform/audio/fft_frame.h"
+#include "third_party/blink/renderer/platform/wtf/allocator.h"
+#include "third_party/blink/renderer/platform/wtf/noncopyable.h"
+
+namespace blink {
+
+// The SimpleFFTConvolver does an FFT convolution. It differs from
+// the FFTConvolver in that it restricts the maximum size of
+// |convolution_kernel| to |input_block_size|. This restriction allows it to do
+// an FFT on every Process call. Therefore, the processing delay of
+// the SimpleFFTConvolver is the same as that of the DirectConvolver and thus
+// smaller than that of the FFTConvolver.
+class PLATFORM_EXPORT SimpleFFTConvolver {
+  USING_FAST_MALLOC(SimpleFFTConvolver);
+  WTF_MAKE_NONCOPYABLE(SimpleFFTConvolver);
+
+ public:
+  SimpleFFTConvolver(
+      size_t input_block_size,
+      const std::unique_ptr<AudioFloatArray>& convolution_kernel);
+
+  void Process(const float* source_p, float* dest_p, size_t frames_to_process);
+
+  void Reset();
+
+  size_t ConvolutionKernelSize() const { return convolution_kernel_size_; }
+
+ private:
+  size_t FftSize() const { return frame_.FftSize(); }
+
+  size_t convolution_kernel_size_;
+  FFTFrame fft_kernel_;
+  FFTFrame frame_;
+
+  // Buffer input until we get fftSize / 2 samples then do an FFT
+  AudioFloatArray input_buffer_;
+
+  // Stores output which we read a little at a time
+  AudioFloatArray output_buffer_;
+
+  // Saves the 2nd half of the FFT buffer, so we can do an overlap-add with the
+  // 1st half of the next one
+  AudioFloatArray last_overlap_buffer_;
+};
+
+}  // namespace blink
+
+#endif  // THIRD_PARTY_BLINK_RENDERER_PLATFORM_AUDIO_SIMPLE_FFT_CONVOLVER_H_
--- a/third_party/blink/renderer/platform/audio/up_sampler.cc
+++ b/third_party/blink/renderer/platform/audio/up_sampler.cc
@@ -76,13 +76,30 @@ std::unique_ptr<AudioFloatArray> MakeKernel(size_t size) {

 UpSampler::UpSampler(size_t input_block_size)
    : input_block_size_(input_block_size),
-      convolver_(input_block_size, MakeKernel(kDefaultKernelSize)),
      temp_buffer_(input_block_size),
-      input_buffer_(input_block_size * 2) {}
+      input_buffer_(input_block_size * 2) {
+  std::unique_ptr<AudioFloatArray> convolution_kernel =
+      MakeKernel(kDefaultKernelSize);
+  if (input_block_size_ <= 128) {
+    // If the input block size is small enough, use direct convolution because
+    // it is faster than FFT convolution for such input block sizes.
+    direct_convolver_ = std::make_unique<DirectConvolver>(
+        input_block_size_, std::move(convolution_kernel));
+  } else {
+    // Otherwise, use FFT convolution because it is faster than direct
+    // convolution for large input block sizes.
+    simple_fft_convolver_ = std::make_unique<SimpleFFTConvolver>(
+        input_block_size_, std::move(convolution_kernel));
+  }
+}

 void UpSampler::Process(const float* source_p,
                        float* dest_p,
                        size_t source_frames_to_process) {
+  const size_t convolution_kernel_size =
+      direct_convolver_ ? direct_convolver_->ConvolutionKernelSize()
+                        : simple_fft_convolver_->ConvolutionKernelSize();
+
  bool is_input_block_size_good = source_frames_to_process == input_block_size_;
  DCHECK(is_input_block_size_good);
  if (!is_input_block_size_good)
@@ -93,13 +110,7 @@ void UpSampler::Process(const float* source_p,
  if (!is_temp_buffer_good)
    return;

-  bool is_kernel_good =
-      convolver_.ConvolutionKernelSize() == kDefaultKernelSize;
-  DCHECK(is_kernel_good);
-  if (!is_kernel_good)
-    return;
-
-  size_t half_size = convolver_.ConvolutionKernelSize() / 2;
+  size_t half_size = convolution_kernel_size / 2;

  // Copy source samples to 2nd half of input buffer.
  bool is_input_buffer_good =
@@ -119,7 +130,13 @@ void UpSampler::Process(const float* source_p,

  // Compute odd sample-frames 1,3,5,7...
  float* odd_samples_p = temp_buffer_.Data();
-  convolver_.Process(source_p, odd_samples_p, source_frames_to_process);
+  if (direct_convolver_) {
+    direct_convolver_->Process(source_p, odd_samples_p,
+                               source_frames_to_process);
+  } else {
+    simple_fft_convolver_->Process(source_p, odd_samples_p,
+                                   source_frames_to_process);
+  }

  for (unsigned i = 0; i < source_frames_to_process; ++i)
    dest_p[i * 2 + 1] = odd_samples_p[i];
@@ -130,14 +147,18 @@ void UpSampler::Process(const float* source_p,
 }

 void UpSampler::Reset() {
-  convolver_.Reset();
+  direct_convolver_.reset();
+  simple_fft_convolver_.reset();
  input_buffer_.Zero();
 }

 size_t UpSampler::LatencyFrames() const {
+  const size_t convolution_kernel_size =
+      direct_convolver_ ? direct_convolver_->ConvolutionKernelSize()
+                        : simple_fft_convolver_->ConvolutionKernelSize();
  // Divide by two since this is a linear phase kernel and the delay is at the
  // center of the kernel.
-  return convolver_.ConvolutionKernelSize() / 2;
+  return convolution_kernel_size / 2;
 }

 }  // namespace blink
--- a/third_party/blink/renderer/platform/audio/up_sampler.h
+++ b/third_party/blink/renderer/platform/audio/up_sampler.h
@@ -31,8 +31,11 @@
 #ifndef THIRD_PARTY_BLINK_RENDERER_PLATFORM_AUDIO_UP_SAMPLER_H_
 #define THIRD_PARTY_BLINK_RENDERER_PLATFORM_AUDIO_UP_SAMPLER_H_

+#include <memory>
+
 #include "third_party/blink/renderer/platform/audio/audio_array.h"
 #include "third_party/blink/renderer/platform/audio/direct_convolver.h"
+#include "third_party/blink/renderer/platform/audio/simple_fft_convolver.h"
 #include "third_party/blink/renderer/platform/wtf/allocator.h"
 #include "third_party/blink/renderer/platform/wtf/noncopyable.h"

@@ -63,7 +66,8 @@ class PLATFORM_EXPORT UpSampler {
  size_t input_block_size_;

  // Computes the odd sample-frames of the output.
-  DirectConvolver convolver_;
+  std::unique_ptr<DirectConvolver> direct_convolver_;
+  std::unique_ptr<SimpleFFTConvolver> simple_fft_convolver_;

  AudioFloatArray temp_buffer_;