Optimize blink::VectorMath::Conv for AVX

This CL doubles the performance on an Intel Broadwell CPU. Bug: 778262 Change-Id: Ie60d64a0e862148a8a74e1fb162be2ec147e6cc6 Reviewed-on: https://chromium-review.googlesource.com/924144 Commit-Queue: Eero Häkkinen <eero.hakkinen@intel.com> Reviewed-by: Raymond Toy <rtoy@chromium.org> Cr-Commit-Position: refs/heads/master@{#544319}

Optimize blink::VectorMath::Conv for AVX
This CL doubles the performance on an Intel Broadwell CPU. Bug: 778262 Change-Id: Ie60d64a0e862148a8a74e1fb162be2ec147e6cc6 Reviewed-on: https://chromium-review.googlesource.com/924144 Commit-Queue: Eero Häkkinen <eero.hakkinen@intel.com> Reviewed-by: Raymond Toy <rtoy@chromium.org> Cr-Commit-Position: refs/heads/master@{#544319}
b401d9b0 · Eero Häkkinen · Commit Bot · cdc7defb · b401d9b0 · b401d9b0
Commit b401d9b0 authored Mar 20, 2018 by Eero Häkkinen Committed by Commit Bot Mar 20, 2018
17 changed files
--- a/third_party/WebKit/Source/platform/audio/DirectConvolver.cpp
+++ b/third_party/WebKit/Source/platform/audio/DirectConvolver.cpp
@@ -28,6 +28,8 @@

 #include "platform/audio/DirectConvolver.h"

+#include <utility>
+
 #include "build/build_config.h"
 #include "platform/audio/VectorMath.h"

@@ -41,13 +43,23 @@

 namespace blink {

-using namespace VectorMath;
-
-DirectConvolver::DirectConvolver(size_t input_block_size)
-    : input_block_size_(input_block_size), buffer_(input_block_size * 2) {}
+namespace {
+using VectorMath::Conv;
+using VectorMath::PrepareFilterForConv;
+}  // namespace
+
+DirectConvolver::DirectConvolver(
+    size_t input_block_size,
+    std::unique_ptr<AudioFloatArray> convolution_kernel)  // Must be non-null.
+    : input_block_size_(input_block_size),
+      buffer_(input_block_size * 2),  // 1st half: previous block; 2nd: current.
+      convolution_kernel_(std::move(convolution_kernel)) {
+  size_t kernel_size = ConvolutionKernelSize();
+  PrepareFilterForConv(convolution_kernel_->Data() + kernel_size - 1, -1,
+                       kernel_size, &prepared_convolution_kernel_);
+}

-void DirectConvolver::Process(AudioFloatArray* convolution_kernel,
-                              const float* source_p,
+void DirectConvolver::Process(const float* source_p,
                              float* dest_p,
                              size_t frames_to_process) {
  DCHECK_EQ(frames_to_process, input_block_size_);
@@ -55,12 +67,12 @@ void DirectConvolver::Process(AudioFloatArray* convolution_kernel,
    return;

  // Only support kernelSize <= m_inputBlockSize
-  size_t kernel_size = convolution_kernel->size();
+  size_t kernel_size = ConvolutionKernelSize();
  DCHECK_LE(kernel_size, input_block_size_);
  if (kernel_size > input_block_size_)
    return;

-  float* kernel_p = convolution_kernel->Data();
+  float* kernel_p = convolution_kernel_->Data();

  // Sanity check
  bool is_copy_good = kernel_p && source_p && dest_p && buffer_.Data();
@@ -74,7 +86,7 @@ void DirectConvolver::Process(AudioFloatArray* convolution_kernel,
  memcpy(input_p, source_p, sizeof(float) * frames_to_process);

  Conv(input_p - kernel_size + 1, 1, kernel_p + kernel_size - 1, -1, dest_p, 1,
-       frames_to_process, kernel_size);
+       frames_to_process, kernel_size, &prepared_convolution_kernel_);

  // Copy 2nd half of input buffer to 1st half.
  memcpy(buffer_.Data(), input_p, sizeof(float) * frames_to_process);

--- a/third_party/WebKit/Source/platform/audio/DirectConvolver.h
+++ b/third_party/WebKit/Source/platform/audio/DirectConvolver.h
@@ -29,6 +29,8 @@
 #ifndef DirectConvolver_h
 #define DirectConvolver_h

+#include <memory>
+
 #include "platform/PlatformExport.h"
 #include "platform/audio/AudioArray.h"
 #include "platform/wtf/Allocator.h"
@@ -41,19 +43,21 @@ class PLATFORM_EXPORT DirectConvolver {
  WTF_MAKE_NONCOPYABLE(DirectConvolver);

 public:
-  DirectConvolver(size_t input_block_size);
+  DirectConvolver(size_t input_block_size,
+                  std::unique_ptr<AudioFloatArray> convolution_kernel);

-  void Process(AudioFloatArray* convolution_kernel,
-               const float* source_p,
-               float* dest_p,
-               size_t frames_to_process);
+  void Process(const float* source_p, float* dest_p, size_t frames_to_process);

  void Reset();

+  size_t ConvolutionKernelSize() const { return convolution_kernel_->size(); }
+
 private:
  size_t input_block_size_;

  AudioFloatArray buffer_;
+  std::unique_ptr<AudioFloatArray> convolution_kernel_;
+  AudioFloatArray prepared_convolution_kernel_;
 };

 }  // namespace blink

--- a/third_party/WebKit/Source/platform/audio/DownSampler.cpp
+++ b/third_party/WebKit/Source/platform/audio/DownSampler.cpp
@@ -29,27 +29,27 @@
 */

 #include "platform/audio/DownSampler.h"
+
+#include <memory>
+
 #include "platform/wtf/MathExtras.h"

 namespace blink {

-DownSampler::DownSampler(size_t input_block_size)
-    : input_block_size_(input_block_size),
-      reduced_kernel_(kDefaultKernelSize / 2),
-      convolver_(input_block_size / 2),  // runs at 1/2 source sample-rate
-      temp_buffer_(input_block_size / 2),
-      input_buffer_(input_block_size * 2) {
-  InitializeKernel();
-}
+namespace {
+
+// Computes ideal band-limited half-band filter coefficients.
+// In other words, filter out all frequencies higher than 0.25 * Nyquist.
+std::unique_ptr<AudioFloatArray> MakeReducedKernel(size_t size) {
+  auto reduced_kernel = std::make_unique<AudioFloatArray>(size / 2);

-void DownSampler::InitializeKernel() {
  // Blackman window parameters.
  double alpha = 0.16;
  double a0 = 0.5 * (1.0 - alpha);
  double a1 = 0.5;
  double a2 = 0.5 * alpha;

-  int n = kDefaultKernelSize;
+  int n = size;
  int half_size = n / 2;

  // Half-band filter.
@@ -73,10 +73,21 @@ void DownSampler::InitializeKernel() {
    // Then store only the odd terms in the kernel.
    // In a sense, this is shifting forward in time by one sample-frame at the
    // destination sample-rate.
-    reduced_kernel_[(i - 1) / 2] = sinc * window;
+    (*reduced_kernel)[(i - 1) / 2] = sinc * window;
  }
+
+  return reduced_kernel;
 }

+}  // namespace
+
+DownSampler::DownSampler(size_t input_block_size)
+    : input_block_size_(input_block_size),
+      convolver_(input_block_size / 2,  // runs at 1/2 source sample-rate
+                 MakeReducedKernel(kDefaultKernelSize)),
+      temp_buffer_(input_block_size / 2),
+      input_buffer_(input_block_size * 2) {}
+
 void DownSampler::Process(const float* source_p,
                          float* dest_p,
                          size_t source_frames_to_process) {
@@ -93,7 +104,7 @@ void DownSampler::Process(const float* source_p,
    return;

  bool is_reduced_kernel_good =
-      reduced_kernel_.size() == kDefaultKernelSize / 2;
+      convolver_.ConvolutionKernelSize() == kDefaultKernelSize / 2;
  DCHECK(is_reduced_kernel_good);
  if (!is_reduced_kernel_good)
    return;
@@ -121,8 +132,7 @@ void DownSampler::Process(const float* source_p,
  // Actually process oddSamplesP with m_reducedKernel for efficiency.
  // The theoretical kernel is double this size with 0 values for even terms
  // (except center).
-  convolver_.Process(&reduced_kernel_, odd_samples_p, dest_p,
-                     dest_frames_to_process);
+  convolver_.Process(odd_samples_p, dest_p, dest_frames_to_process);

  // Now, account for the 0.5 term right in the middle of the kernel.
  // This amounts to a delay-line of length halfSize (at the source
@@ -145,7 +155,7 @@ void DownSampler::Reset() {
 size_t DownSampler::LatencyFrames() const {
  // Divide by two since this is a linear phase kernel and the delay is at the
  // center of the kernel.
-  return reduced_kernel_.size() / 2;
+  return convolver_.ConvolutionKernelSize() / 2;
 }

 }  // namespace blink
--- a/third_party/WebKit/Source/platform/audio/DownSampler.h
+++ b/third_party/WebKit/Source/platform/audio/DownSampler.h
@@ -45,7 +45,7 @@ class PLATFORM_EXPORT DownSampler {
  WTF_MAKE_NONCOPYABLE(DownSampler);

 public:
-  DownSampler(size_t input_block_size);
+  explicit DownSampler(size_t input_block_size);

  // The destination buffer |destP| is of size sourceFramesToProcess / 2.
  void Process(const float* source_p,
@@ -62,11 +62,6 @@ class PLATFORM_EXPORT DownSampler {

  size_t input_block_size_;

-  // Computes ideal band-limited half-band filter coefficients.
-  // In other words, filter out all frequencies higher than 0.25 * Nyquist.
-  void InitializeKernel();
-  AudioFloatArray reduced_kernel_;
-
  // Half-band filter.
  DirectConvolver convolver_;


--- a/third_party/WebKit/Source/platform/audio/ReverbConvolverStage.cpp
+++ b/third_party/WebKit/Source/platform/audio/ReverbConvolverStage.cpp
@@ -30,6 +30,7 @@

 #include <algorithm>
 #include <memory>
+#include <utility>

 #include "platform/audio/ReverbAccumulationBuffer.h"
 #include "platform/audio/ReverbConvolver.h"
@@ -38,8 +39,6 @@

 namespace blink {

-using namespace VectorMath;
-
 ReverbConvolverStage::ReverbConvolverStage(
    const float* impulse_response,
    size_t,
@@ -66,9 +65,10 @@ ReverbConvolverStage::ReverbConvolverStage(
    DCHECK(!stage_offset);
    DCHECK_LE(stage_length, fft_size / 2);

-    direct_kernel_ = std::make_unique<AudioFloatArray>(fft_size / 2);
-    direct_kernel_->CopyToRange(impulse_response, 0, stage_length);
-    direct_convolver_ = std::make_unique<DirectConvolver>(render_slice_size);
+    auto direct_kernel = std::make_unique<AudioFloatArray>(fft_size / 2);
+    direct_kernel->CopyToRange(impulse_response, 0, stage_length);
+    direct_convolver_ = std::make_unique<DirectConvolver>(
+        render_slice_size, std::move(direct_kernel));
  }
  temporary_buffer_.Allocate(render_slice_size);

@@ -166,8 +166,8 @@ void ReverbConvolverStage::Process(const float* source,
      fft_convolver_->Process(fft_kernel_.get(), pre_delayed_source,
                              temporary_buffer, frames_to_process);
    else
-      direct_convolver_->Process(direct_kernel_.get(), pre_delayed_source,
-                                 temporary_buffer, frames_to_process);
+      direct_convolver_->Process(pre_delayed_source, temporary_buffer,
+                                 frames_to_process);

    // Now accumulate into reverb's accumulation buffer.
    accumulation_buffer_->Accumulate(temporary_buffer, frames_to_process,

--- a/third_party/WebKit/Source/platform/audio/ReverbConvolverStage.h
+++ b/third_party/WebKit/Source/platform/audio/ReverbConvolverStage.h
@@ -30,6 +30,7 @@
 #define ReverbConvolverStage_h

 #include <memory>
+
 #include "platform/audio/AudioArray.h"
 #include "platform/audio/FFTFrame.h"
 #include "platform/wtf/Allocator.h"
@@ -95,7 +96,6 @@ class PLATFORM_EXPORT ReverbConvolverStage {
  AudioFloatArray temporary_buffer_;

  bool direct_mode_;
-  std::unique_ptr<AudioFloatArray> direct_kernel_;
  std::unique_ptr<DirectConvolver> direct_convolver_;
 };


--- a/third_party/WebKit/Source/platform/audio/UpSampler.cpp
+++ b/third_party/WebKit/Source/platform/audio/UpSampler.cpp
@@ -29,27 +29,29 @@
 */

 #include "platform/audio/UpSampler.h"
+
+#include <memory>
+
 #include "platform/wtf/MathExtras.h"

 namespace blink {

-UpSampler::UpSampler(size_t input_block_size)
-    : input_block_size_(input_block_size),
-      kernel_(kDefaultKernelSize),
-      convolver_(input_block_size),
-      temp_buffer_(input_block_size),
-      input_buffer_(input_block_size * 2) {
-  InitializeKernel();
-}
+namespace {
+
+// Computes ideal band-limited filter coefficients to sample in between each
+// source sample-frame.  This filter will be used to compute the odd
+// sample-frames of the output.
+std::unique_ptr<AudioFloatArray> MakeKernel(size_t size) {
+  std::unique_ptr<AudioFloatArray> kernel =
+      std::make_unique<AudioFloatArray>(size);

-void UpSampler::InitializeKernel() {
  // Blackman window parameters.
  double alpha = 0.16;
  double a0 = 0.5 * (1.0 - alpha);
  double a1 = 0.5;
  double a2 = 0.5 * alpha;

-  int n = kernel_.size();
+  int n = kernel->size();
  int half_size = n / 2;
  double subsample_offset = -0.5;

@@ -64,10 +66,20 @@ void UpSampler::InitializeKernel() {
        a0 - a1 * cos(twoPiDouble * x) + a2 * cos(twoPiDouble * 2.0 * x);

    // Window the sinc() function.
-    kernel_[i] = sinc * window;
+    (*kernel)[i] = sinc * window;
  }
+
+  return kernel;
 }

+}  // namespace
+
+UpSampler::UpSampler(size_t input_block_size)
+    : input_block_size_(input_block_size),
+      convolver_(input_block_size, MakeKernel(kDefaultKernelSize)),
+      temp_buffer_(input_block_size),
+      input_buffer_(input_block_size * 2) {}
+
 void UpSampler::Process(const float* source_p,
                        float* dest_p,
                        size_t source_frames_to_process) {
@@ -81,12 +93,13 @@ void UpSampler::Process(const float* source_p,
  if (!is_temp_buffer_good)
    return;

-  bool is_kernel_good = kernel_.size() == kDefaultKernelSize;
+  bool is_kernel_good =
+      convolver_.ConvolutionKernelSize() == kDefaultKernelSize;
  DCHECK(is_kernel_good);
  if (!is_kernel_good)
    return;

-  size_t half_size = kernel_.size() / 2;
+  size_t half_size = convolver_.ConvolutionKernelSize() / 2;

  // Copy source samples to 2nd half of input buffer.
  bool is_input_buffer_good =
@@ -106,8 +119,7 @@ void UpSampler::Process(const float* source_p,

  // Compute odd sample-frames 1,3,5,7...
  float* odd_samples_p = temp_buffer_.Data();
-  convolver_.Process(&kernel_, source_p, odd_samples_p,
-                     source_frames_to_process);
+  convolver_.Process(source_p, odd_samples_p, source_frames_to_process);

  for (unsigned i = 0; i < source_frames_to_process; ++i)
    dest_p[i * 2 + 1] = odd_samples_p[i];
@@ -125,7 +137,7 @@ void UpSampler::Reset() {
 size_t UpSampler::LatencyFrames() const {
  // Divide by two since this is a linear phase kernel and the delay is at the
  // center of the kernel.
-  return kernel_.size() / 2;
+  return convolver_.ConvolutionKernelSize() / 2;
 }

 }  // namespace blink
--- a/third_party/WebKit/Source/platform/audio/UpSampler.h
+++ b/third_party/WebKit/Source/platform/audio/UpSampler.h
@@ -45,7 +45,7 @@ class PLATFORM_EXPORT UpSampler {
  WTF_MAKE_NONCOPYABLE(UpSampler);

 public:
-  UpSampler(size_t input_block_size);
+  explicit UpSampler(size_t input_block_size);

  // The destination buffer |destP| is of size sourceFramesToProcess * 2.
  void Process(const float* source_p,
@@ -62,12 +62,6 @@ class PLATFORM_EXPORT UpSampler {

  size_t input_block_size_;

-  // Computes ideal band-limited filter coefficients to sample in between each
-  // source sample-frame.  This filter will be used to compute the odd
-  // sample-frames of the output.
-  void InitializeKernel();
-  AudioFloatArray kernel_;
-
  // Computes the odd sample-frames of the output.
  DirectConvolver convolver_;


--- a/third_party/WebKit/Source/platform/audio/VectorMath.cpp
+++ b/third_party/WebKit/Source/platform/audio/VectorMath.cpp
@@ -61,6 +61,21 @@ namespace Impl = Scalar;
 #endif
 }  // namespace

+void PrepareFilterForConv(const float* filter_p,
+                          int filter_stride,
+                          size_t filter_size,
+                          AudioFloatArray* prepared_filter) {
+  // Only contiguous convolution is implemented by all implementations.
+  // Correlation (positive |filter_stride|) and support for non-contiguous
+  // vectors are not implemented by all implementations.
+  DCHECK_EQ(-1, filter_stride);
+  DCHECK(prepared_filter);  // Left untouched on non-x86 and macOS builds.
+#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_MACOSX)
+  X86::PrepareFilterForConv(filter_p, filter_stride, filter_size,
+                            prepared_filter);
+#endif  // defined(ARCH_CPU_X86_FAMILY) && !defined(OS_MACOSX)
+}
+
 void Conv(const float* source_p,
          int source_stride,
          const float* filter_p,
@@ -68,7 +83,8 @@ void Conv(const float* source_p,
          float* dest_p,
          int dest_stride,
          size_t frames_to_process,
-          size_t filter_size) {
+          size_t filter_size,
+          const AudioFloatArray* prepared_filter) {
  // Only contiguous convolution is implemented by all implementations.
  // Correlation (positive |filter_stride|) and support for non-contiguous
  // vectors are not implemented by all implementations.
@@ -76,7 +92,7 @@ void Conv(const float* source_p,
  DCHECK_EQ(-1, filter_stride);
  DCHECK_EQ(1, dest_stride);
  Impl::Conv(source_p, source_stride, filter_p, filter_stride, dest_p,
-             dest_stride, frames_to_process, filter_size);
+             dest_stride, frames_to_process, filter_size, prepared_filter);
 }

 void Vadd(const float* source1p,

--- a/third_party/WebKit/Source/platform/audio/VectorMath.h
+++ b/third_party/WebKit/Source/platform/audio/VectorMath.h
@@ -29,6 +29,7 @@
 #include <cstddef>

 #include "platform/PlatformExport.h"
+#include "platform/audio/AudioArray.h"

 // Defines the interface for several vector math functions whose implementation
 // will ideally be optimized.
@@ -47,7 +48,14 @@ PLATFORM_EXPORT void Conv(const float* source_p,
                          float* dest_p,
                          int dest_stride,
                          size_t frames_to_process,
-                          size_t filter_size);
+                          size_t filter_size,
+                          const AudioFloatArray* prepared_filter);
+
+// Prepare filter for Conv for faster processing.
+PLATFORM_EXPORT void PrepareFilterForConv(const float* filter_p,
+                                          int filter_stride,
+                                          size_t filter_size,
+                                          AudioFloatArray* prepared_filter);

 // Vector scalar multiply and then add.
 //

--- a/third_party/WebKit/Source/platform/audio/VectorMathScalar.h
+++ b/third_party/WebKit/Source/platform/audio/VectorMathScalar.h
@@ -8,6 +8,7 @@
 #include <algorithm>
 #include <cmath>

+#include "platform/audio/AudioArray.h"
 #include "platform/wtf/Assertions.h"
 #include "platform/wtf/MathExtras.h"

@@ -22,7 +23,8 @@ static ALWAYS_INLINE void Conv(const float* source_p,
                               float* dest_p,
                               int dest_stride,
                               size_t frames_to_process,
-                               size_t filter_size) {
+                               size_t filter_size,
+                               const AudioFloatArray* /*prepared_filter*/) {
  // Only contiguous convolution is implemented. Correlation (positive
  // |filter_stride|) and support for non-contiguous vectors are not
  // implemented.
@@ -30,26 +32,22 @@ static ALWAYS_INLINE void Conv(const float* source_p,
  DCHECK_EQ(-1, filter_stride);
  DCHECK_EQ(1, dest_stride);

-  size_t kernel_size = filter_size;
-  const float* input_p = source_p + kernel_size - 1;
-  const float* kernel_p = filter_p + 1 - kernel_size;
-
  size_t i = 0;

 // FIXME: The macro can be further optimized to avoid pipeline stalls. One
 // possibility is to maintain 4 separate sums and change the macro to
 // CONVOLVE_FOUR_SAMPLES.
-#define CONVOLVE_ONE_SAMPLE              \
-  do {                                   \
-    sum += input_p[i - j] * kernel_p[j]; \
-    j++;                                 \
+#define CONVOLVE_ONE_SAMPLE                   \
+  do {                                        \
+    sum += source_p[i + j] * *(filter_p - j); \
+    j++;                                      \
  } while (0)

  while (i < frames_to_process) {
    size_t j = 0;
    float sum = 0;

-    if (kernel_size == 32) {
+    if (filter_size == 32) {
      CONVOLVE_ONE_SAMPLE;  // 1
      CONVOLVE_ONE_SAMPLE;  // 2
      CONVOLVE_ONE_SAMPLE;  // 3
@@ -86,7 +84,7 @@ static ALWAYS_INLINE void Conv(const float* source_p,
      CONVOLVE_ONE_SAMPLE;  // 31
      CONVOLVE_ONE_SAMPLE;  // 32

-    } else if (kernel_size == 64) {
+    } else if (filter_size == 64) {
      CONVOLVE_ONE_SAMPLE;  // 1
      CONVOLVE_ONE_SAMPLE;  // 2
      CONVOLVE_ONE_SAMPLE;  // 3
@@ -158,7 +156,7 @@ static ALWAYS_INLINE void Conv(const float* source_p,
      CONVOLVE_ONE_SAMPLE;  // 63
      CONVOLVE_ONE_SAMPLE;  // 64

-    } else if (kernel_size == 128) {
+    } else if (filter_size == 128) {
      CONVOLVE_ONE_SAMPLE;  // 1
      CONVOLVE_ONE_SAMPLE;  // 2
      CONVOLVE_ONE_SAMPLE;  // 3
@@ -300,7 +298,7 @@ static ALWAYS_INLINE void Conv(const float* source_p,
      CONVOLVE_ONE_SAMPLE;  // 127
      CONVOLVE_ONE_SAMPLE;  // 128
    } else {
-      while (j < kernel_size) {
+      while (j < filter_size) {
        // Non-optimized using actual while loop.
        CONVOLVE_ONE_SAMPLE;
      }

--- a/third_party/WebKit/Source/platform/audio/VectorMathTest.cpp
+++ b/third_party/WebKit/Source/platform/audio/VectorMathTest.cpp
@@ -264,7 +264,7 @@ TEST_F(VectorMathTest, Conv) {
  for (const auto& source : GetPrimaryVectors(GetSource(kFullyFiniteSource))) {
    if (source.stride() != 1)
      continue;
-    for (size_t filter_size : {3u, 20u, 32u, 64u, 128u}) {
+    for (size_t filter_size : {3u, 32u, 64u, 128u}) {
      // The maximum number of frames which could be processed here is
      // |source.size() - filter_size + 1|. However, in order to test
      // optimization paths, |frames_to_process| should be optimal (divisible
@@ -289,8 +289,10 @@ TEST_F(VectorMathTest, Conv) {
      }
      for (auto& dest : GetSecondaryVectors(
               GetDestination(1u), source.memory_layout(), frames_to_process)) {
+        AudioFloatArray prepared_filter;
+        PrepareFilterForConv(filter_p, -1, filter_size, &prepared_filter);
        Conv(source.p(), 1, filter_p, -1, dest.p(), 1, frames_to_process,
-             filter_size);
+             filter_size, &prepared_filter);
        for (size_t i = 0u; i < frames_to_process; ++i) {
          EXPECT_NEAR(expected_dest[i], dest[i],
                      1e-3 * std::abs(expected_dest[i]));

--- a/third_party/WebKit/Source/platform/audio/cpu/x86/VectorMathAVX.h
+++ b/third_party/WebKit/Source/platform/audio/cpu/x86/VectorMathAVX.h
@@ -7,6 +7,8 @@

 #include <cstddef>

+#include "platform/audio/AudioArray.h"
+
 namespace blink {
 namespace VectorMath {
 namespace AVX {
@@ -17,6 +19,21 @@ constexpr size_t kFramesToProcessMask = ~(kPackedFloatsPerRegister - 1u);

 bool IsAligned(const float*);

+// Direct vector convolution:
+// dest[k] = sum(source[k+m]*filter[m*filter_stride]) for all m
+// provided that |prepared_filter_p| is |prepared_filter->Data()| and that
+// |prepared_filter| is prepared with |PrepareFilterForConv|.
+void Conv(const float* source_p,
+          const float* prepared_filter_p,
+          float* dest_p,
+          size_t frames_to_process,
+          size_t filter_size);
+
+void PrepareFilterForConv(const float* filter_p,
+                          int filter_stride,
+                          size_t filter_size,
+                          AudioFloatArray* prepared_filter);
+
 // dest[k] = source1[k] + source2[k]
 void Vadd(const float* source1p,
          const float* source2p,

--- a/third_party/WebKit/Source/platform/audio/cpu/x86/VectorMathImpl.h
+++ b/third_party/WebKit/Source/platform/audio/cpu/x86/VectorMathImpl.h
@@ -2,25 +2,98 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

+// This file intentionally does not have header guards; it's included from
+// VectorMathAVX.h and from VectorMathSSE.h with different macro definitions.
+// The following line silences a presubmit warning that would otherwise be
+// triggered by this: no-include-guard-because-multiply-included
+
 #include "build/build_config.h"

 #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_MACOSX)

-#include "platform/wtf/Assertions.h"
-
 #include <algorithm>
 #include <cmath>

+#include "platform/audio/AudioArray.h"
+#include "platform/wtf/Assertions.h"
+
 namespace blink {
 namespace VectorMath {
 namespace VECTOR_MATH_SIMD_NAMESPACE_NAME {

+// This stride is chosen so that the same prepared filter created by
+// AVX::PrepareFilterForConv can be used by both AVX::Conv and SSE::Conv.
+// A prepared filter created by SSE::PrepareFilterForConv can only be used
+// by SSE::Conv.
+constexpr size_t kReversedFilterStride = 8u / kPackedFloatsPerRegister;
+
 bool IsAligned(const float* p) {
  constexpr size_t kBytesPerRegister = kBitsPerRegister / 8u;
  constexpr size_t kAlignmentOffsetMask = kBytesPerRegister - 1u;
  return (reinterpret_cast<size_t>(p) & kAlignmentOffsetMask) == 0u;
 }

+void PrepareFilterForConv(const float* filter_p,
+                          int filter_stride,
+                          size_t filter_size,
+                          AudioFloatArray* prepared_filter) {
+  // Only contiguous convolution is implemented. Correlation (positive
+  // |filter_stride|) and support for non-contiguous vectors are not
+  // implemented.
+  DCHECK_EQ(-1, filter_stride);
+  DCHECK(prepared_filter);
+
+  // Reverse the filter and broadcast each tap across a whole SIMD register.
+  prepared_filter->Allocate(kReversedFilterStride * kPackedFloatsPerRegister *
+                            filter_size);  // kReversedFilterStride regs/tap.
+  MType* reversed_filter = reinterpret_cast<MType*>(prepared_filter->Data());
+  for (size_t i = 0; i < filter_size; ++i) {  // Tap i lives at |filter_p - i|.
+    reversed_filter[kReversedFilterStride * i] = MM_PS(set1)(*(filter_p - i));
+  }
+}
+
+// Direct vector convolution:
+// dest[k] = sum(source[k+m]*filter[m*filter_stride]) for all m
+// provided that |prepared_filter_p| is |prepared_filter->Data()| and that
+// |prepared_filter| is prepared with |PrepareFilterForConv|.
+void Conv(const float* source_p,
+          const float* prepared_filter_p,
+          float* dest_p,
+          size_t frames_to_process,
+          size_t filter_size) {
+  const float* const dest_end_p = dest_p + frames_to_process;
+
+  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);  // No tail loop.
+  DCHECK_EQ(0u, filter_size % kPackedFloatsPerRegister);
+
+  const MType* reversed_filter =
+      reinterpret_cast<const MType*>(prepared_filter_p);
+
+  // Produce kPackedFloatsPerRegister consecutive outputs per iteration.
+  while (dest_p < dest_end_p) {
+    MType m_convolution_sum = MM_PS(setzero)();
+
+    // |filter_size| is a multiple of kPackedFloatsPerRegister so we can unroll
+    // the loop by kPackedFloatsPerRegister, manually.
+    for (size_t i = 0; i < filter_size; i += kPackedFloatsPerRegister) {
+      for (size_t j = 0; j < kPackedFloatsPerRegister; ++j) {
+        size_t k = i + j;  // Tap index.
+        MType m_product;
+        MType m_source;
+
+        m_source = MM_PS(loadu)(source_p + k);
+        m_product =
+            MM_PS(mul)(reversed_filter[kReversedFilterStride * k], m_source);
+        m_convolution_sum = MM_PS(add)(m_convolution_sum, m_product);
+      }
+    }
+    MM_PS(storeu)(dest_p, m_convolution_sum);
+
+    source_p += kPackedFloatsPerRegister;
+    dest_p += kPackedFloatsPerRegister;
+  }
+}
+
 // dest[k] = source1[k] + source2[k]
 void Vadd(const float* source1p,
          const float* source2p,

--- a/third_party/WebKit/Source/platform/audio/cpu/x86/VectorMathSSE.h
+++ b/third_party/WebKit/Source/platform/audio/cpu/x86/VectorMathSSE.h
@@ -7,6 +7,8 @@

 #include <cstddef>

+#include "platform/audio/AudioArray.h"
+
 namespace blink {
 namespace VectorMath {
 namespace SSE {
@@ -17,6 +19,21 @@ constexpr size_t kFramesToProcessMask = ~(kPackedFloatsPerRegister - 1u);

 bool IsAligned(const float*);

+// Direct vector convolution:
+// dest[k] = sum(source[k+m]*filter[m*filter_stride]) for all m
+// provided that |prepared_filter_p| is |prepared_filter->Data()| and that
+// |prepared_filter| is prepared with |PrepareFilterForConv|.
+void Conv(const float* source_p,
+          const float* prepared_filter_p,
+          float* dest_p,
+          size_t frames_to_process,
+          size_t filter_size);
+
+void PrepareFilterForConv(const float* filter_p,
+                          int filter_stride,
+                          size_t filter_size,
+                          AudioFloatArray* prepared_filter);
+
 // dest[k] = source1[k] + source2[k]
 void Vadd(const float* source1p,
          const float* source2p,

--- a/third_party/WebKit/Source/platform/audio/cpu/x86/VectorMathX86.h
+++ b/third_party/WebKit/Source/platform/audio/cpu/x86/VectorMathX86.h
@@ -6,14 +6,11 @@
 #define VectorMathX86_h

 #include "base/cpu.h"
-#include "platform/audio/AudioArray.h"
 #include "platform/audio/VectorMathScalar.h"
 #include "platform/audio/cpu/x86/VectorMathAVX.h"
 #include "platform/audio/cpu/x86/VectorMathSSE.h"
 #include "platform/wtf/Assertions.h"

-#include <xmmintrin.h>
-
 namespace blink {
 namespace VectorMath {
 namespace X86 {
@@ -94,6 +91,20 @@ SplitFramesToProcess(const float* source_p, size_t frames_to_process) {
  return counts;
 }

+static ALWAYS_INLINE void PrepareFilterForConv(
+    const float* filter_p,
+    int filter_stride,
+    size_t filter_size,
+    AudioFloatArray* prepared_filter) {
+  if (CPUSupportsAVX()) {  // AVX layout; usable by AVX::Conv and SSE::Conv.
+    AVX::PrepareFilterForConv(filter_p, filter_stride, filter_size,
+                              prepared_filter);
+  } else {  // SSE layout; only SSE::Conv can consume it.
+    SSE::PrepareFilterForConv(filter_p, filter_stride, filter_size,
+                              prepared_filter);
+  }
+}
+
 static ALWAYS_INLINE void Conv(const float* source_p,
                               int source_stride,
                               const float* filter_p,
@@ -101,61 +112,30 @@ static ALWAYS_INLINE void Conv(const float* source_p,
                               float* dest_p,
                               int dest_stride,
                               size_t frames_to_process,
-                               size_t filter_size) {
-  // Only contiguous convolution is implemented. Correlation (positive
-  // |filter_stride|) and support for non-contiguous vectors are not
-  // implemented.
-  DCHECK_EQ(1, source_stride);
-  DCHECK_EQ(-1, filter_stride);
-  DCHECK_EQ(1, dest_stride);
-
-  size_t kernel_size = filter_size;
-  const float* input_p = source_p + kernel_size - 1;
-  const float* kernel_p = filter_p + 1 - kernel_size;
-
-  size_t i = 0;
-
-  // Convolution using SSE2. Currently only do this if both |kernel_size| and
-  // |frames_to_process| are multiples of 4. If not, use Scalar::Conv.
-
-  if ((kernel_size % 4 == 0) && (frames_to_process % 4 == 0)) {
-    // AudioFloatArray's are always aligned on at least a 32-byte boundary.
-    AudioFloatArray kernel_buffer(4 * kernel_size);
-    __m128* kernel_reversed = reinterpret_cast<__m128*>(kernel_buffer.Data());
-
-    // Reverse the kernel and repeat each value across a vector
-    for (i = 0; i < kernel_size; ++i) {
-      kernel_reversed[i] = _mm_set1_ps(kernel_p[kernel_size - i - 1]);
+                               size_t filter_size,
+                               const AudioFloatArray* prepared_filter) {
+  const float* prepared_filter_p =
+      prepared_filter ? prepared_filter->Data() : nullptr;
+  if (source_stride == 1 && dest_stride == 1 && prepared_filter_p) {
+    if (CPUSupportsAVX() && (filter_size & ~AVX::kFramesToProcessMask) == 0u) {
+      // |frames_to_process| is always a multiple of the render quantum and
+      // therefore the frames can always be processed using AVX.
+      CHECK_EQ(frames_to_process & ~AVX::kFramesToProcessMask, 0u);
+      AVX::Conv(source_p, prepared_filter_p, dest_p, frames_to_process,
+                filter_size);
+      return;
    }
-
-    const float* input_start_p = input_p - kernel_size + 1;
-
-    // Do convolution with 4 inputs at a time.
-    for (i = 0; i < frames_to_process; i += 4) {
-      __m128 convolution_sum;
-
-      convolution_sum = _mm_setzero_ps();
-
-      // |kernel_size| is a multiple of 4 so we can unroll the loop by 4,
-      // manually.
-      for (size_t k = 0; k < kernel_size; k += 4) {
-        size_t data_offset = i + k;
-
-        for (size_t m = 0; m < 4; ++m) {
-          __m128 source_block;
-          __m128 product;
-
-          source_block = _mm_loadu_ps(input_start_p + data_offset + m);
-          product = _mm_mul_ps(kernel_reversed[k + m], source_block);
-          convolution_sum = _mm_add_ps(convolution_sum, product);
-        }
-      }
-      _mm_storeu_ps(dest_p + i, convolution_sum);
+    if ((filter_size & ~SSE::kFramesToProcessMask) == 0u) {
+      // |frames_to_process| is always a multiple of the render quantum and
+      // therefore the frames can always be processed using SSE.
+      CHECK_EQ(frames_to_process & ~SSE::kFramesToProcessMask, 0u);
+      SSE::Conv(source_p, prepared_filter_p, dest_p, frames_to_process,
+                filter_size);
+      return;
    }
-  } else {
-    Scalar::Conv(source_p, source_stride, filter_p, filter_stride, dest_p,
-                 dest_stride, frames_to_process, filter_size);
  }
+  Scalar::Conv(source_p, source_stride, filter_p, filter_stride, dest_p,
+               dest_stride, frames_to_process, filter_size, nullptr);
 }

 static ALWAYS_INLINE void Vadd(const float* source1p,

--- a/third_party/WebKit/Source/platform/audio/mac/VectorMathMac.h
+++ b/third_party/WebKit/Source/platform/audio/mac/VectorMathMac.h
@@ -8,6 +8,7 @@
 #include <Accelerate/Accelerate.h>

 #include "build/build_config.h"
+#include "platform/audio/AudioArray.h"

 namespace blink {
 namespace VectorMath {
@@ -26,7 +27,8 @@ static ALWAYS_INLINE void Conv(const float* source_p,
                               float* dest_p,
                               int dest_stride,
                               size_t frames_to_process,
-                               size_t filter_size) {
+                               size_t filter_size,
+                               const AudioFloatArray* /*prepared_filter*/) {
 #if defined(ARCH_CPU_X86)
  ::conv(source_p, source_stride, filter_p, filter_stride, dest_p, dest_stride,
         frames_to_process, filter_size);