Commit b401d9b0 authored by Eero Häkkinen, committed by Commit Bot

Optimize blink::VectorMath::Conv for AVX

This CL doubles the performance on an Intel Broadwell CPU.

Bug: 778262
Change-Id: Ie60d64a0e862148a8a74e1fb162be2ec147e6cc6
Reviewed-on: https://chromium-review.googlesource.com/924144
Commit-Queue: Eero Häkkinen <eero.hakkinen@intel.com>
Reviewed-by: Raymond Toy <rtoy@chromium.org>
Cr-Commit-Position: refs/heads/master@{#544319}
parent cdc7defb
...@@ -28,6 +28,8 @@ ...@@ -28,6 +28,8 @@
#include "platform/audio/DirectConvolver.h" #include "platform/audio/DirectConvolver.h"
#include <utility>
#include "build/build_config.h" #include "build/build_config.h"
#include "platform/audio/VectorMath.h" #include "platform/audio/VectorMath.h"
...@@ -41,13 +43,23 @@ ...@@ -41,13 +43,23 @@
namespace blink { namespace blink {
using namespace VectorMath; namespace {
using VectorMath::Conv;
DirectConvolver::DirectConvolver(size_t input_block_size) using VectorMath::PrepareFilterForConv;
: input_block_size_(input_block_size), buffer_(input_block_size * 2) {} } // namespace
// Takes ownership of |convolution_kernel| and builds the prepared form of it
// that Process() later hands to Conv(). The internal buffer is sized to two
// input blocks.
DirectConvolver::DirectConvolver(
    size_t input_block_size,
    std::unique_ptr<AudioFloatArray> convolution_kernel)
    : input_block_size_(input_block_size),
      buffer_(input_block_size * 2),
      convolution_kernel_(std::move(convolution_kernel)) {
  const size_t tap_count = ConvolutionKernelSize();
  // The filter is traversed with a stride of -1, so pass the address of its
  // last coefficient; these are the same filter arguments Process() gives to
  // Conv().
  const float* const last_tap = convolution_kernel_->Data() + tap_count - 1;
  PrepareFilterForConv(last_tap, -1, tap_count, &prepared_convolution_kernel_);
}
void DirectConvolver::Process(AudioFloatArray* convolution_kernel, void DirectConvolver::Process(const float* source_p,
const float* source_p,
float* dest_p, float* dest_p,
size_t frames_to_process) { size_t frames_to_process) {
DCHECK_EQ(frames_to_process, input_block_size_); DCHECK_EQ(frames_to_process, input_block_size_);
...@@ -55,12 +67,12 @@ void DirectConvolver::Process(AudioFloatArray* convolution_kernel, ...@@ -55,12 +67,12 @@ void DirectConvolver::Process(AudioFloatArray* convolution_kernel,
return; return;
// Only support kernelSize <= m_inputBlockSize // Only support kernelSize <= m_inputBlockSize
size_t kernel_size = convolution_kernel->size(); size_t kernel_size = ConvolutionKernelSize();
DCHECK_LE(kernel_size, input_block_size_); DCHECK_LE(kernel_size, input_block_size_);
if (kernel_size > input_block_size_) if (kernel_size > input_block_size_)
return; return;
float* kernel_p = convolution_kernel->Data(); float* kernel_p = convolution_kernel_->Data();
// Sanity check // Sanity check
bool is_copy_good = kernel_p && source_p && dest_p && buffer_.Data(); bool is_copy_good = kernel_p && source_p && dest_p && buffer_.Data();
...@@ -74,7 +86,7 @@ void DirectConvolver::Process(AudioFloatArray* convolution_kernel, ...@@ -74,7 +86,7 @@ void DirectConvolver::Process(AudioFloatArray* convolution_kernel,
memcpy(input_p, source_p, sizeof(float) * frames_to_process); memcpy(input_p, source_p, sizeof(float) * frames_to_process);
Conv(input_p - kernel_size + 1, 1, kernel_p + kernel_size - 1, -1, dest_p, 1, Conv(input_p - kernel_size + 1, 1, kernel_p + kernel_size - 1, -1, dest_p, 1,
frames_to_process, kernel_size); frames_to_process, kernel_size, &prepared_convolution_kernel_);
// Copy 2nd half of input buffer to 1st half. // Copy 2nd half of input buffer to 1st half.
memcpy(buffer_.Data(), input_p, sizeof(float) * frames_to_process); memcpy(buffer_.Data(), input_p, sizeof(float) * frames_to_process);
......
...@@ -29,6 +29,8 @@ ...@@ -29,6 +29,8 @@
#ifndef DirectConvolver_h #ifndef DirectConvolver_h
#define DirectConvolver_h #define DirectConvolver_h
#include <memory>
#include "platform/PlatformExport.h" #include "platform/PlatformExport.h"
#include "platform/audio/AudioArray.h" #include "platform/audio/AudioArray.h"
#include "platform/wtf/Allocator.h" #include "platform/wtf/Allocator.h"
...@@ -41,19 +43,21 @@ class PLATFORM_EXPORT DirectConvolver { ...@@ -41,19 +43,21 @@ class PLATFORM_EXPORT DirectConvolver {
WTF_MAKE_NONCOPYABLE(DirectConvolver); WTF_MAKE_NONCOPYABLE(DirectConvolver);
public: public:
DirectConvolver(size_t input_block_size); DirectConvolver(size_t input_block_size,
std::unique_ptr<AudioFloatArray> convolution_kernel);
void Process(AudioFloatArray* convolution_kernel, void Process(const float* source_p, float* dest_p, size_t frames_to_process);
const float* source_p,
float* dest_p,
size_t frames_to_process);
void Reset(); void Reset();
size_t ConvolutionKernelSize() const { return convolution_kernel_->size(); }
private: private:
size_t input_block_size_; size_t input_block_size_;
AudioFloatArray buffer_; AudioFloatArray buffer_;
std::unique_ptr<AudioFloatArray> convolution_kernel_;
AudioFloatArray prepared_convolution_kernel_;
}; };
} // namespace blink } // namespace blink
......
...@@ -29,27 +29,27 @@ ...@@ -29,27 +29,27 @@
*/ */
#include "platform/audio/DownSampler.h" #include "platform/audio/DownSampler.h"
#include <memory>
#include "platform/wtf/MathExtras.h" #include "platform/wtf/MathExtras.h"
namespace blink { namespace blink {
DownSampler::DownSampler(size_t input_block_size) namespace {
: input_block_size_(input_block_size),
reduced_kernel_(kDefaultKernelSize / 2), // Computes ideal band-limited half-band filter coefficients.
convolver_(input_block_size / 2), // runs at 1/2 source sample-rate // In other words, filter out all frequencies higher than 0.25 * Nyquist.
temp_buffer_(input_block_size / 2), std::unique_ptr<AudioFloatArray> MakeReducedKernel(size_t size) {
input_buffer_(input_block_size * 2) { auto reduced_kernel = std::make_unique<AudioFloatArray>(size / 2);
InitializeKernel();
}
void DownSampler::InitializeKernel() {
// Blackman window parameters. // Blackman window parameters.
double alpha = 0.16; double alpha = 0.16;
double a0 = 0.5 * (1.0 - alpha); double a0 = 0.5 * (1.0 - alpha);
double a1 = 0.5; double a1 = 0.5;
double a2 = 0.5 * alpha; double a2 = 0.5 * alpha;
int n = kDefaultKernelSize; int n = size;
int half_size = n / 2; int half_size = n / 2;
// Half-band filter. // Half-band filter.
...@@ -73,10 +73,21 @@ void DownSampler::InitializeKernel() { ...@@ -73,10 +73,21 @@ void DownSampler::InitializeKernel() {
// Then store only the odd terms in the kernel. // Then store only the odd terms in the kernel.
// In a sense, this is shifting forward in time by one sample-frame at the // In a sense, this is shifting forward in time by one sample-frame at the
// destination sample-rate. // destination sample-rate.
reduced_kernel_[(i - 1) / 2] = sinc * window; (*reduced_kernel)[(i - 1) / 2] = sinc * window;
} }
return reduced_kernel;
} }
} // namespace
DownSampler::DownSampler(size_t input_block_size)
: input_block_size_(input_block_size),
convolver_(input_block_size / 2, // runs at 1/2 source sample-rate
MakeReducedKernel(kDefaultKernelSize)),
temp_buffer_(input_block_size / 2),
input_buffer_(input_block_size * 2) {}
void DownSampler::Process(const float* source_p, void DownSampler::Process(const float* source_p,
float* dest_p, float* dest_p,
size_t source_frames_to_process) { size_t source_frames_to_process) {
...@@ -93,7 +104,7 @@ void DownSampler::Process(const float* source_p, ...@@ -93,7 +104,7 @@ void DownSampler::Process(const float* source_p,
return; return;
bool is_reduced_kernel_good = bool is_reduced_kernel_good =
reduced_kernel_.size() == kDefaultKernelSize / 2; convolver_.ConvolutionKernelSize() == kDefaultKernelSize / 2;
DCHECK(is_reduced_kernel_good); DCHECK(is_reduced_kernel_good);
if (!is_reduced_kernel_good) if (!is_reduced_kernel_good)
return; return;
...@@ -121,8 +132,7 @@ void DownSampler::Process(const float* source_p, ...@@ -121,8 +132,7 @@ void DownSampler::Process(const float* source_p,
// Actually process oddSamplesP with m_reducedKernel for efficiency. // Actually process oddSamplesP with m_reducedKernel for efficiency.
// The theoretical kernel is double this size with 0 values for even terms // The theoretical kernel is double this size with 0 values for even terms
// (except center). // (except center).
convolver_.Process(&reduced_kernel_, odd_samples_p, dest_p, convolver_.Process(odd_samples_p, dest_p, dest_frames_to_process);
dest_frames_to_process);
// Now, account for the 0.5 term right in the middle of the kernel. // Now, account for the 0.5 term right in the middle of the kernel.
// This amounts to a delay-line of length halfSize (at the source // This amounts to a delay-line of length halfSize (at the source
...@@ -145,7 +155,7 @@ void DownSampler::Reset() { ...@@ -145,7 +155,7 @@ void DownSampler::Reset() {
size_t DownSampler::LatencyFrames() const { size_t DownSampler::LatencyFrames() const {
// Divide by two since this is a linear phase kernel and the delay is at the // Divide by two since this is a linear phase kernel and the delay is at the
// center of the kernel. // center of the kernel.
return reduced_kernel_.size() / 2; return convolver_.ConvolutionKernelSize() / 2;
} }
} // namespace blink } // namespace blink
...@@ -45,7 +45,7 @@ class PLATFORM_EXPORT DownSampler { ...@@ -45,7 +45,7 @@ class PLATFORM_EXPORT DownSampler {
WTF_MAKE_NONCOPYABLE(DownSampler); WTF_MAKE_NONCOPYABLE(DownSampler);
public: public:
DownSampler(size_t input_block_size); explicit DownSampler(size_t input_block_size);
// The destination buffer |destP| is of size sourceFramesToProcess / 2. // The destination buffer |destP| is of size sourceFramesToProcess / 2.
void Process(const float* source_p, void Process(const float* source_p,
...@@ -62,11 +62,6 @@ class PLATFORM_EXPORT DownSampler { ...@@ -62,11 +62,6 @@ class PLATFORM_EXPORT DownSampler {
size_t input_block_size_; size_t input_block_size_;
// Computes ideal band-limited half-band filter coefficients.
// In other words, filter out all frequencies higher than 0.25 * Nyquist.
void InitializeKernel();
AudioFloatArray reduced_kernel_;
// Half-band filter. // Half-band filter.
DirectConvolver convolver_; DirectConvolver convolver_;
......
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <utility>
#include "platform/audio/ReverbAccumulationBuffer.h" #include "platform/audio/ReverbAccumulationBuffer.h"
#include "platform/audio/ReverbConvolver.h" #include "platform/audio/ReverbConvolver.h"
...@@ -38,8 +39,6 @@ ...@@ -38,8 +39,6 @@
namespace blink { namespace blink {
using namespace VectorMath;
ReverbConvolverStage::ReverbConvolverStage( ReverbConvolverStage::ReverbConvolverStage(
const float* impulse_response, const float* impulse_response,
size_t, size_t,
...@@ -66,9 +65,10 @@ ReverbConvolverStage::ReverbConvolverStage( ...@@ -66,9 +65,10 @@ ReverbConvolverStage::ReverbConvolverStage(
DCHECK(!stage_offset); DCHECK(!stage_offset);
DCHECK_LE(stage_length, fft_size / 2); DCHECK_LE(stage_length, fft_size / 2);
direct_kernel_ = std::make_unique<AudioFloatArray>(fft_size / 2); auto direct_kernel = std::make_unique<AudioFloatArray>(fft_size / 2);
direct_kernel_->CopyToRange(impulse_response, 0, stage_length); direct_kernel->CopyToRange(impulse_response, 0, stage_length);
direct_convolver_ = std::make_unique<DirectConvolver>(render_slice_size); direct_convolver_ = std::make_unique<DirectConvolver>(
render_slice_size, std::move(direct_kernel));
} }
temporary_buffer_.Allocate(render_slice_size); temporary_buffer_.Allocate(render_slice_size);
...@@ -166,8 +166,8 @@ void ReverbConvolverStage::Process(const float* source, ...@@ -166,8 +166,8 @@ void ReverbConvolverStage::Process(const float* source,
fft_convolver_->Process(fft_kernel_.get(), pre_delayed_source, fft_convolver_->Process(fft_kernel_.get(), pre_delayed_source,
temporary_buffer, frames_to_process); temporary_buffer, frames_to_process);
else else
direct_convolver_->Process(direct_kernel_.get(), pre_delayed_source, direct_convolver_->Process(pre_delayed_source, temporary_buffer,
temporary_buffer, frames_to_process); frames_to_process);
// Now accumulate into reverb's accumulation buffer. // Now accumulate into reverb's accumulation buffer.
accumulation_buffer_->Accumulate(temporary_buffer, frames_to_process, accumulation_buffer_->Accumulate(temporary_buffer, frames_to_process,
......
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#define ReverbConvolverStage_h #define ReverbConvolverStage_h
#include <memory> #include <memory>
#include "platform/audio/AudioArray.h" #include "platform/audio/AudioArray.h"
#include "platform/audio/FFTFrame.h" #include "platform/audio/FFTFrame.h"
#include "platform/wtf/Allocator.h" #include "platform/wtf/Allocator.h"
...@@ -95,7 +96,6 @@ class PLATFORM_EXPORT ReverbConvolverStage { ...@@ -95,7 +96,6 @@ class PLATFORM_EXPORT ReverbConvolverStage {
AudioFloatArray temporary_buffer_; AudioFloatArray temporary_buffer_;
bool direct_mode_; bool direct_mode_;
std::unique_ptr<AudioFloatArray> direct_kernel_;
std::unique_ptr<DirectConvolver> direct_convolver_; std::unique_ptr<DirectConvolver> direct_convolver_;
}; };
......
...@@ -29,27 +29,29 @@ ...@@ -29,27 +29,29 @@
*/ */
#include "platform/audio/UpSampler.h" #include "platform/audio/UpSampler.h"
#include <memory>
#include "platform/wtf/MathExtras.h" #include "platform/wtf/MathExtras.h"
namespace blink { namespace blink {
UpSampler::UpSampler(size_t input_block_size) namespace {
: input_block_size_(input_block_size),
kernel_(kDefaultKernelSize), // Computes ideal band-limited filter coefficients to sample in between each
convolver_(input_block_size), // source sample-frame. This filter will be used to compute the odd
temp_buffer_(input_block_size), // sample-frames of the output.
input_buffer_(input_block_size * 2) { std::unique_ptr<AudioFloatArray> MakeKernel(size_t size) {
InitializeKernel(); std::unique_ptr<AudioFloatArray> kernel =
} std::make_unique<AudioFloatArray>(size);
void UpSampler::InitializeKernel() {
// Blackman window parameters. // Blackman window parameters.
double alpha = 0.16; double alpha = 0.16;
double a0 = 0.5 * (1.0 - alpha); double a0 = 0.5 * (1.0 - alpha);
double a1 = 0.5; double a1 = 0.5;
double a2 = 0.5 * alpha; double a2 = 0.5 * alpha;
int n = kernel_.size(); int n = kernel->size();
int half_size = n / 2; int half_size = n / 2;
double subsample_offset = -0.5; double subsample_offset = -0.5;
...@@ -64,10 +66,20 @@ void UpSampler::InitializeKernel() { ...@@ -64,10 +66,20 @@ void UpSampler::InitializeKernel() {
a0 - a1 * cos(twoPiDouble * x) + a2 * cos(twoPiDouble * 2.0 * x); a0 - a1 * cos(twoPiDouble * x) + a2 * cos(twoPiDouble * 2.0 * x);
// Window the sinc() function. // Window the sinc() function.
kernel_[i] = sinc * window; (*kernel)[i] = sinc * window;
} }
return kernel;
} }
} // namespace
UpSampler::UpSampler(size_t input_block_size)
: input_block_size_(input_block_size),
convolver_(input_block_size, MakeKernel(kDefaultKernelSize)),
temp_buffer_(input_block_size),
input_buffer_(input_block_size * 2) {}
void UpSampler::Process(const float* source_p, void UpSampler::Process(const float* source_p,
float* dest_p, float* dest_p,
size_t source_frames_to_process) { size_t source_frames_to_process) {
...@@ -81,12 +93,13 @@ void UpSampler::Process(const float* source_p, ...@@ -81,12 +93,13 @@ void UpSampler::Process(const float* source_p,
if (!is_temp_buffer_good) if (!is_temp_buffer_good)
return; return;
bool is_kernel_good = kernel_.size() == kDefaultKernelSize; bool is_kernel_good =
convolver_.ConvolutionKernelSize() == kDefaultKernelSize;
DCHECK(is_kernel_good); DCHECK(is_kernel_good);
if (!is_kernel_good) if (!is_kernel_good)
return; return;
size_t half_size = kernel_.size() / 2; size_t half_size = convolver_.ConvolutionKernelSize() / 2;
// Copy source samples to 2nd half of input buffer. // Copy source samples to 2nd half of input buffer.
bool is_input_buffer_good = bool is_input_buffer_good =
...@@ -106,8 +119,7 @@ void UpSampler::Process(const float* source_p, ...@@ -106,8 +119,7 @@ void UpSampler::Process(const float* source_p,
// Compute odd sample-frames 1,3,5,7... // Compute odd sample-frames 1,3,5,7...
float* odd_samples_p = temp_buffer_.Data(); float* odd_samples_p = temp_buffer_.Data();
convolver_.Process(&kernel_, source_p, odd_samples_p, convolver_.Process(source_p, odd_samples_p, source_frames_to_process);
source_frames_to_process);
for (unsigned i = 0; i < source_frames_to_process; ++i) for (unsigned i = 0; i < source_frames_to_process; ++i)
dest_p[i * 2 + 1] = odd_samples_p[i]; dest_p[i * 2 + 1] = odd_samples_p[i];
...@@ -125,7 +137,7 @@ void UpSampler::Reset() { ...@@ -125,7 +137,7 @@ void UpSampler::Reset() {
size_t UpSampler::LatencyFrames() const { size_t UpSampler::LatencyFrames() const {
// Divide by two since this is a linear phase kernel and the delay is at the // Divide by two since this is a linear phase kernel and the delay is at the
// center of the kernel. // center of the kernel.
return kernel_.size() / 2; return convolver_.ConvolutionKernelSize() / 2;
} }
} // namespace blink } // namespace blink
...@@ -45,7 +45,7 @@ class PLATFORM_EXPORT UpSampler { ...@@ -45,7 +45,7 @@ class PLATFORM_EXPORT UpSampler {
WTF_MAKE_NONCOPYABLE(UpSampler); WTF_MAKE_NONCOPYABLE(UpSampler);
public: public:
UpSampler(size_t input_block_size); explicit UpSampler(size_t input_block_size);
// The destination buffer |destP| is of size sourceFramesToProcess * 2. // The destination buffer |destP| is of size sourceFramesToProcess * 2.
void Process(const float* source_p, void Process(const float* source_p,
...@@ -62,12 +62,6 @@ class PLATFORM_EXPORT UpSampler { ...@@ -62,12 +62,6 @@ class PLATFORM_EXPORT UpSampler {
size_t input_block_size_; size_t input_block_size_;
// Computes ideal band-limited filter coefficients to sample in between each
// source sample-frame. This filter will be used to compute the odd
// sample-frames of the output.
void InitializeKernel();
AudioFloatArray kernel_;
// Computes the odd sample-frames of the output. // Computes the odd sample-frames of the output.
DirectConvolver convolver_; DirectConvolver convolver_;
......
...@@ -61,6 +61,21 @@ namespace Impl = Scalar; ...@@ -61,6 +61,21 @@ namespace Impl = Scalar;
#endif #endif
} // namespace } // namespace
// Builds |prepared_filter| from the |filter_size| coefficients addressed via
// |filter_p| so that the result can be passed to Conv() as its
// |prepared_filter| argument.
//
// On x86 builds where OS_MACOSX is not defined this forwards to
// X86::PrepareFilterForConv. In every other configuration this function
// writes nothing to |prepared_filter|.
void PrepareFilterForConv(const float* filter_p,
int filter_stride,
size_t filter_size,
AudioFloatArray* prepared_filter) {
// Only contiguous convolution is implemented by all implementations.
// Correlation (positive |filter_stride|) and support for non-contiguous
// vectors are not implemented by all implementations.
DCHECK_EQ(-1, filter_stride);
DCHECK(prepared_filter);
#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_MACOSX)
X86::PrepareFilterForConv(filter_p, filter_stride, filter_size,
prepared_filter);
#endif
}
void Conv(const float* source_p, void Conv(const float* source_p,
int source_stride, int source_stride,
const float* filter_p, const float* filter_p,
...@@ -68,7 +83,8 @@ void Conv(const float* source_p, ...@@ -68,7 +83,8 @@ void Conv(const float* source_p,
float* dest_p, float* dest_p,
int dest_stride, int dest_stride,
size_t frames_to_process, size_t frames_to_process,
size_t filter_size) { size_t filter_size,
const AudioFloatArray* prepared_filter) {
// Only contiguous convolution is implemented by all implementations. // Only contiguous convolution is implemented by all implementations.
// Correlation (positive |filter_stride|) and support for non-contiguous // Correlation (positive |filter_stride|) and support for non-contiguous
// vectors are not implemented by all implementations. // vectors are not implemented by all implementations.
...@@ -76,7 +92,7 @@ void Conv(const float* source_p, ...@@ -76,7 +92,7 @@ void Conv(const float* source_p,
DCHECK_EQ(-1, filter_stride); DCHECK_EQ(-1, filter_stride);
DCHECK_EQ(1, dest_stride); DCHECK_EQ(1, dest_stride);
Impl::Conv(source_p, source_stride, filter_p, filter_stride, dest_p, Impl::Conv(source_p, source_stride, filter_p, filter_stride, dest_p,
dest_stride, frames_to_process, filter_size); dest_stride, frames_to_process, filter_size, prepared_filter);
} }
void Vadd(const float* source1p, void Vadd(const float* source1p,
......
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
#include <cstddef> #include <cstddef>
#include "platform/PlatformExport.h" #include "platform/PlatformExport.h"
#include "platform/audio/AudioArray.h"
// Defines the interface for several vector math functions whose implementation // Defines the interface for several vector math functions whose implementation
// will ideally be optimized. // will ideally be optimized.
...@@ -47,7 +48,14 @@ PLATFORM_EXPORT void Conv(const float* source_p, ...@@ -47,7 +48,14 @@ PLATFORM_EXPORT void Conv(const float* source_p,
float* dest_p, float* dest_p,
int dest_stride, int dest_stride,
size_t frames_to_process, size_t frames_to_process,
size_t filter_size); size_t filter_size,
const AudioFloatArray* prepared_filter);
// Prepare filter for Conv for faster processing.
PLATFORM_EXPORT void PrepareFilterForConv(const float* filter_p,
int filter_stride,
size_t filter_size,
AudioFloatArray* prepared_filter);
// Vector scalar multiply and then add. // Vector scalar multiply and then add.
// //
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include "platform/audio/AudioArray.h"
#include "platform/wtf/Assertions.h" #include "platform/wtf/Assertions.h"
#include "platform/wtf/MathExtras.h" #include "platform/wtf/MathExtras.h"
...@@ -22,7 +23,8 @@ static ALWAYS_INLINE void Conv(const float* source_p, ...@@ -22,7 +23,8 @@ static ALWAYS_INLINE void Conv(const float* source_p,
float* dest_p, float* dest_p,
int dest_stride, int dest_stride,
size_t frames_to_process, size_t frames_to_process,
size_t filter_size) { size_t filter_size,
const AudioFloatArray* /*prepared_filter*/) {
// Only contiguous convolution is implemented. Correlation (positive // Only contiguous convolution is implemented. Correlation (positive
// |filter_stride|) and support for non-contiguous vectors are not // |filter_stride|) and support for non-contiguous vectors are not
// implemented. // implemented.
...@@ -30,10 +32,6 @@ static ALWAYS_INLINE void Conv(const float* source_p, ...@@ -30,10 +32,6 @@ static ALWAYS_INLINE void Conv(const float* source_p,
DCHECK_EQ(-1, filter_stride); DCHECK_EQ(-1, filter_stride);
DCHECK_EQ(1, dest_stride); DCHECK_EQ(1, dest_stride);
size_t kernel_size = filter_size;
const float* input_p = source_p + kernel_size - 1;
const float* kernel_p = filter_p + 1 - kernel_size;
size_t i = 0; size_t i = 0;
// FIXME: The macro can be further optimized to avoid pipeline stalls. One // FIXME: The macro can be further optimized to avoid pipeline stalls. One
...@@ -41,7 +39,7 @@ static ALWAYS_INLINE void Conv(const float* source_p, ...@@ -41,7 +39,7 @@ static ALWAYS_INLINE void Conv(const float* source_p,
// CONVOLVE_FOUR_SAMPLES. // CONVOLVE_FOUR_SAMPLES.
#define CONVOLVE_ONE_SAMPLE \ #define CONVOLVE_ONE_SAMPLE \
do { \ do { \
sum += input_p[i - j] * kernel_p[j]; \ sum += source_p[i + j] * *(filter_p - j); \
j++; \ j++; \
} while (0) } while (0)
...@@ -49,7 +47,7 @@ static ALWAYS_INLINE void Conv(const float* source_p, ...@@ -49,7 +47,7 @@ static ALWAYS_INLINE void Conv(const float* source_p,
size_t j = 0; size_t j = 0;
float sum = 0; float sum = 0;
if (kernel_size == 32) { if (filter_size == 32) {
CONVOLVE_ONE_SAMPLE; // 1 CONVOLVE_ONE_SAMPLE; // 1
CONVOLVE_ONE_SAMPLE; // 2 CONVOLVE_ONE_SAMPLE; // 2
CONVOLVE_ONE_SAMPLE; // 3 CONVOLVE_ONE_SAMPLE; // 3
...@@ -86,7 +84,7 @@ static ALWAYS_INLINE void Conv(const float* source_p, ...@@ -86,7 +84,7 @@ static ALWAYS_INLINE void Conv(const float* source_p,
CONVOLVE_ONE_SAMPLE; // 31 CONVOLVE_ONE_SAMPLE; // 31
CONVOLVE_ONE_SAMPLE; // 32 CONVOLVE_ONE_SAMPLE; // 32
} else if (kernel_size == 64) { } else if (filter_size == 64) {
CONVOLVE_ONE_SAMPLE; // 1 CONVOLVE_ONE_SAMPLE; // 1
CONVOLVE_ONE_SAMPLE; // 2 CONVOLVE_ONE_SAMPLE; // 2
CONVOLVE_ONE_SAMPLE; // 3 CONVOLVE_ONE_SAMPLE; // 3
...@@ -158,7 +156,7 @@ static ALWAYS_INLINE void Conv(const float* source_p, ...@@ -158,7 +156,7 @@ static ALWAYS_INLINE void Conv(const float* source_p,
CONVOLVE_ONE_SAMPLE; // 63 CONVOLVE_ONE_SAMPLE; // 63
CONVOLVE_ONE_SAMPLE; // 64 CONVOLVE_ONE_SAMPLE; // 64
} else if (kernel_size == 128) { } else if (filter_size == 128) {
CONVOLVE_ONE_SAMPLE; // 1 CONVOLVE_ONE_SAMPLE; // 1
CONVOLVE_ONE_SAMPLE; // 2 CONVOLVE_ONE_SAMPLE; // 2
CONVOLVE_ONE_SAMPLE; // 3 CONVOLVE_ONE_SAMPLE; // 3
...@@ -300,7 +298,7 @@ static ALWAYS_INLINE void Conv(const float* source_p, ...@@ -300,7 +298,7 @@ static ALWAYS_INLINE void Conv(const float* source_p,
CONVOLVE_ONE_SAMPLE; // 127 CONVOLVE_ONE_SAMPLE; // 127
CONVOLVE_ONE_SAMPLE; // 128 CONVOLVE_ONE_SAMPLE; // 128
} else { } else {
while (j < kernel_size) { while (j < filter_size) {
// Non-optimized using actual while loop. // Non-optimized using actual while loop.
CONVOLVE_ONE_SAMPLE; CONVOLVE_ONE_SAMPLE;
} }
......
...@@ -264,7 +264,7 @@ TEST_F(VectorMathTest, Conv) { ...@@ -264,7 +264,7 @@ TEST_F(VectorMathTest, Conv) {
for (const auto& source : GetPrimaryVectors(GetSource(kFullyFiniteSource))) { for (const auto& source : GetPrimaryVectors(GetSource(kFullyFiniteSource))) {
if (source.stride() != 1) if (source.stride() != 1)
continue; continue;
for (size_t filter_size : {3u, 20u, 32u, 64u, 128u}) { for (size_t filter_size : {3u, 32u, 64u, 128u}) {
// The maximum number of frames which could be processed here is // The maximum number of frames which could be processed here is
// |source.size() - filter_size + 1|. However, in order to test // |source.size() - filter_size + 1|. However, in order to test
// optimization paths, |frames_to_process| should be optimal (divisible // optimization paths, |frames_to_process| should be optimal (divisible
...@@ -289,8 +289,10 @@ TEST_F(VectorMathTest, Conv) { ...@@ -289,8 +289,10 @@ TEST_F(VectorMathTest, Conv) {
} }
for (auto& dest : GetSecondaryVectors( for (auto& dest : GetSecondaryVectors(
GetDestination(1u), source.memory_layout(), frames_to_process)) { GetDestination(1u), source.memory_layout(), frames_to_process)) {
AudioFloatArray prepared_filter;
PrepareFilterForConv(filter_p, -1, filter_size, &prepared_filter);
Conv(source.p(), 1, filter_p, -1, dest.p(), 1, frames_to_process, Conv(source.p(), 1, filter_p, -1, dest.p(), 1, frames_to_process,
filter_size); filter_size, &prepared_filter);
for (size_t i = 0u; i < frames_to_process; ++i) { for (size_t i = 0u; i < frames_to_process; ++i) {
EXPECT_NEAR(expected_dest[i], dest[i], EXPECT_NEAR(expected_dest[i], dest[i],
1e-3 * std::abs(expected_dest[i])); 1e-3 * std::abs(expected_dest[i]));
......
...@@ -7,6 +7,8 @@ ...@@ -7,6 +7,8 @@
#include <cstddef> #include <cstddef>
#include "platform/audio/AudioArray.h"
namespace blink { namespace blink {
namespace VectorMath { namespace VectorMath {
namespace AVX { namespace AVX {
...@@ -17,6 +19,21 @@ constexpr size_t kFramesToProcessMask = ~(kPackedFloatsPerRegister - 1u); ...@@ -17,6 +19,21 @@ constexpr size_t kFramesToProcessMask = ~(kPackedFloatsPerRegister - 1u);
bool IsAligned(const float*); bool IsAligned(const float*);
// Direct vector convolution:
// dest[k] = sum(source[k+m]*filter[m*filter_stride]) for all m
// provided that |prepared_filter_p| is |prepared_filter->Data()| and that
// |prepared_filter| is prepared with |PrepareFilterForConv|.
// NOTE(review): the SIMD implementation visible in this change DCHECKs that
// both |frames_to_process| and |filter_size| are multiples of
// kPackedFloatsPerRegister — confirm that callers guarantee this.
void Conv(const float* source_p,
const float* prepared_filter_p,
float* dest_p,
size_t frames_to_process,
size_t filter_size);
// Allocates |prepared_filter| and fills it with the representation of the
// filter that Conv expects: the coefficients are read backwards starting at
// |filter_p| and each one is repeated across a whole vector register.
// Only |filter_stride| == -1 is supported; the implementation DCHECKs this
// and that |prepared_filter| is non-null.
void PrepareFilterForConv(const float* filter_p,
int filter_stride,
size_t filter_size,
AudioFloatArray* prepared_filter);
// dest[k] = source1[k] + source2[k] // dest[k] = source1[k] + source2[k]
void Vadd(const float* source1p, void Vadd(const float* source1p,
const float* source2p, const float* source2p,
......
...@@ -2,25 +2,98 @@ ...@@ -2,25 +2,98 @@
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. // found in the LICENSE file.
// This file intentionally does not have header guards, it's included from
// VectorMathAVX.h and from VectorMathSSE.h with different macro definitions.
// The following line silences a presubmit warning that would otherwise be
// triggered by this: no-include-guard-because-multiply-included
#include "build/build_config.h" #include "build/build_config.h"
#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_MACOSX) #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_MACOSX)
#include "platform/wtf/Assertions.h"
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include "platform/audio/AudioArray.h"
#include "platform/wtf/Assertions.h"
namespace blink { namespace blink {
namespace VectorMath { namespace VectorMath {
namespace VECTOR_MATH_SIMD_NAMESPACE_NAME { namespace VECTOR_MATH_SIMD_NAMESPACE_NAME {
// This stride is chosen so that the same prepared filter created by
// AVX::PrepareFilterForConv can be used by both AVX::Conv and SSE::Conv.
// A prepared filter created by SSE::PrepareFilterForConv can only be used
// by SSE::Conv.
constexpr size_t kReversedFilterStride = 8u / kPackedFloatsPerRegister;
bool IsAligned(const float* p) { bool IsAligned(const float* p) {
constexpr size_t kBytesPerRegister = kBitsPerRegister / 8u; constexpr size_t kBytesPerRegister = kBitsPerRegister / 8u;
constexpr size_t kAlignmentOffsetMask = kBytesPerRegister - 1u; constexpr size_t kAlignmentOffsetMask = kBytesPerRegister - 1u;
return (reinterpret_cast<size_t>(p) & kAlignmentOffsetMask) == 0u; return (reinterpret_cast<size_t>(p) & kAlignmentOffsetMask) == 0u;
} }
void PrepareFilterForConv(const float* filter_p,
int filter_stride,
size_t filter_size,
AudioFloatArray* prepared_filter) {
// Only contiguous convolution is implemented. Correlation (positive
// |filter_stride|) and support for non-contiguous vectors are not
// implemented.
DCHECK_EQ(-1, filter_stride);
DCHECK(prepared_filter);
// Reverse the filter and repeat each value across a vector
prepared_filter->Allocate(kReversedFilterStride * kPackedFloatsPerRegister *
filter_size);
MType* reversed_filter = reinterpret_cast<MType*>(prepared_filter->Data());
for (size_t i = 0; i < filter_size; ++i) {
reversed_filter[kReversedFilterStride * i] = MM_PS(set1)(*(filter_p - i));
}
}
// Direct vector convolution:
// dest[k] = sum(source[k+m]*filter[m*filter_stride]) for all m
// provided that |prepared_filter_p| is |prepared_filter->Data()| and that
// |prepared_filter| is prepared with |PrepareFilterForConv|.
void Conv(const float* source_p,
const float* prepared_filter_p,
float* dest_p,
size_t frames_to_process,
size_t filter_size) {
const float* const dest_end_p = dest_p + frames_to_process;
DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
DCHECK_EQ(0u, filter_size % kPackedFloatsPerRegister);
const MType* reversed_filter =
reinterpret_cast<const MType*>(prepared_filter_p);
// Do convolution with kPackedFloatsPerRegister inputs at a time.
while (dest_p < dest_end_p) {
MType m_convolution_sum = MM_PS(setzero)();
// |filter_size| is a multiple of kPackedFloatsPerRegister so we can unroll
// the loop by kPackedFloatsPerRegister, manually.
for (size_t i = 0; i < filter_size; i += kPackedFloatsPerRegister) {
for (size_t j = 0; j < kPackedFloatsPerRegister; ++j) {
size_t k = i + j;
MType m_product;
MType m_source;
m_source = MM_PS(loadu)(source_p + k);
m_product =
MM_PS(mul)(reversed_filter[kReversedFilterStride * k], m_source);
m_convolution_sum = MM_PS(add)(m_convolution_sum, m_product);
}
}
MM_PS(storeu)(dest_p, m_convolution_sum);
source_p += kPackedFloatsPerRegister;
dest_p += kPackedFloatsPerRegister;
}
}
// dest[k] = source1[k] + source2[k] // dest[k] = source1[k] + source2[k]
void Vadd(const float* source1p, void Vadd(const float* source1p,
const float* source2p, const float* source2p,
......
...@@ -7,6 +7,8 @@ ...@@ -7,6 +7,8 @@
#include <cstddef> #include <cstddef>
#include "platform/audio/AudioArray.h"
namespace blink { namespace blink {
namespace VectorMath { namespace VectorMath {
namespace SSE { namespace SSE {
...@@ -17,6 +19,21 @@ constexpr size_t kFramesToProcessMask = ~(kPackedFloatsPerRegister - 1u); ...@@ -17,6 +19,21 @@ constexpr size_t kFramesToProcessMask = ~(kPackedFloatsPerRegister - 1u);
bool IsAligned(const float*); bool IsAligned(const float*);
// Direct vector convolution:
// dest[k] = sum(source[k+m]*filter[m*filter_stride]) for all m
// provided that |prepared_filter_p| is |prepared_filter->Data()| and that
// |prepared_filter| is prepared with |PrepareFilterForConv|.
//
// |filter| and |filter_stride| in the formula are not parameters of this
// function; presumably they are the values that were handed to
// |PrepareFilterForConv| when the filter was prepared.
//
// |source_p|: input samples. The implementation reads up to
//     |frames_to_process + filter_size - 1| floats from it.
// |prepared_filter_p|: data of the prepared filter (see above).
// |dest_p|: receives |frames_to_process| output samples.
// |frames_to_process|, |filter_size|: the implementation DCHECKs that both are
//     multiples of kPackedFloatsPerRegister.
void Conv(const float* source_p,
const float* prepared_filter_p,
float* dest_p,
size_t frames_to_process,
size_t filter_size);
// Fills |prepared_filter| with the representation of a |filter_size|-tap
// filter that |Conv| requires as its |prepared_filter_p| argument.
// NOTE(review): the definition is not fully visible from this chunk. What
// appears to be its tail stores one register per tap, each one filled with
// *(filter_p - i), i.e. the taps are read backwards starting at |filter_p|.
// Confirm which |filter_stride| values are supported.
void PrepareFilterForConv(const float* filter_p,
int filter_stride,
size_t filter_size,
AudioFloatArray* prepared_filter);
// dest[k] = source1[k] + source2[k] // dest[k] = source1[k] + source2[k]
void Vadd(const float* source1p, void Vadd(const float* source1p,
const float* source2p, const float* source2p,
......
...@@ -6,14 +6,11 @@ ...@@ -6,14 +6,11 @@
#define VectorMathX86_h #define VectorMathX86_h
#include "base/cpu.h" #include "base/cpu.h"
#include "platform/audio/AudioArray.h"
#include "platform/audio/VectorMathScalar.h" #include "platform/audio/VectorMathScalar.h"
#include "platform/audio/cpu/x86/VectorMathAVX.h" #include "platform/audio/cpu/x86/VectorMathAVX.h"
#include "platform/audio/cpu/x86/VectorMathSSE.h" #include "platform/audio/cpu/x86/VectorMathSSE.h"
#include "platform/wtf/Assertions.h" #include "platform/wtf/Assertions.h"
#include <xmmintrin.h>
namespace blink { namespace blink {
namespace VectorMath { namespace VectorMath {
namespace X86 { namespace X86 {
...@@ -94,6 +91,20 @@ SplitFramesToProcess(const float* source_p, size_t frames_to_process) { ...@@ -94,6 +91,20 @@ SplitFramesToProcess(const float* source_p, size_t frames_to_process) {
return counts; return counts;
} }
// Prepares |prepared_filter| for a later |Conv| call by forwarding all
// arguments unchanged to the AVX implementation when CPUSupportsAVX() reports
// AVX support, and to the SSE implementation otherwise.
static ALWAYS_INLINE void PrepareFilterForConv(
    const float* filter_p,
    int filter_stride,
    size_t filter_size,
    AudioFloatArray* prepared_filter) {
  // No AVX available: use the SSE preparation and stop here.
  if (!CPUSupportsAVX()) {
    SSE::PrepareFilterForConv(filter_p, filter_stride, filter_size,
                              prepared_filter);
    return;
  }
  AVX::PrepareFilterForConv(filter_p, filter_stride, filter_size,
                            prepared_filter);
}
static ALWAYS_INLINE void Conv(const float* source_p, static ALWAYS_INLINE void Conv(const float* source_p,
int source_stride, int source_stride,
const float* filter_p, const float* filter_p,
...@@ -101,61 +112,30 @@ static ALWAYS_INLINE void Conv(const float* source_p, ...@@ -101,61 +112,30 @@ static ALWAYS_INLINE void Conv(const float* source_p,
float* dest_p, float* dest_p,
int dest_stride, int dest_stride,
size_t frames_to_process, size_t frames_to_process,
size_t filter_size) { size_t filter_size,
// Only contiguous convolution is implemented. Correlation (positive const AudioFloatArray* prepared_filter) {
// |filter_stride|) and support for non-contiguous vectors are not const float* prepared_filter_p =
// implemented. prepared_filter ? prepared_filter->Data() : nullptr;
DCHECK_EQ(1, source_stride); if (source_stride == 1 && dest_stride == 1 && prepared_filter_p) {
DCHECK_EQ(-1, filter_stride); if (CPUSupportsAVX() && (filter_size & ~AVX::kFramesToProcessMask) == 0u) {
DCHECK_EQ(1, dest_stride); // |frames_to_process| is always a multiply of render quantum and
// therefore the frames can always be processed using AVX.
size_t kernel_size = filter_size; CHECK_EQ(frames_to_process & ~AVX::kFramesToProcessMask, 0u);
const float* input_p = source_p + kernel_size - 1; AVX::Conv(source_p, prepared_filter_p, dest_p, frames_to_process,
const float* kernel_p = filter_p + 1 - kernel_size; filter_size);
return;
size_t i = 0; }
if ((filter_size & ~SSE::kFramesToProcessMask) == 0u) {
// Convolution using SSE2. Currently only do this if both |kernel_size| and // |frames_to_process| is always a multiply of render quantum and
// |frames_to_process| are multiples of 4. If not, use Scalar::Conv. // therefore the frames can always be processed using SSE.
CHECK_EQ(frames_to_process & ~SSE::kFramesToProcessMask, 0u);
if ((kernel_size % 4 == 0) && (frames_to_process % 4 == 0)) { SSE::Conv(source_p, prepared_filter_p, dest_p, frames_to_process,
// AudioFloatArray's are always aligned on at least a 32-byte boundary. filter_size);
AudioFloatArray kernel_buffer(4 * kernel_size); return;
__m128* kernel_reversed = reinterpret_cast<__m128*>(kernel_buffer.Data());
// Reverse the kernel and repeat each value across a vector
for (i = 0; i < kernel_size; ++i) {
kernel_reversed[i] = _mm_set1_ps(kernel_p[kernel_size - i - 1]);
} }
const float* input_start_p = input_p - kernel_size + 1;
// Do convolution with 4 inputs at a time.
for (i = 0; i < frames_to_process; i += 4) {
__m128 convolution_sum;
convolution_sum = _mm_setzero_ps();
// |kernel_size| is a multiple of 4 so we can unroll the loop by 4,
// manually.
for (size_t k = 0; k < kernel_size; k += 4) {
size_t data_offset = i + k;
for (size_t m = 0; m < 4; ++m) {
__m128 source_block;
__m128 product;
source_block = _mm_loadu_ps(input_start_p + data_offset + m);
product = _mm_mul_ps(kernel_reversed[k + m], source_block);
convolution_sum = _mm_add_ps(convolution_sum, product);
}
}
_mm_storeu_ps(dest_p + i, convolution_sum);
} }
} else {
Scalar::Conv(source_p, source_stride, filter_p, filter_stride, dest_p, Scalar::Conv(source_p, source_stride, filter_p, filter_stride, dest_p,
dest_stride, frames_to_process, filter_size); dest_stride, frames_to_process, filter_size, nullptr);
}
} }
static ALWAYS_INLINE void Vadd(const float* source1p, static ALWAYS_INLINE void Vadd(const float* source1p,
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include <Accelerate/Accelerate.h> #include <Accelerate/Accelerate.h>
#include "build/build_config.h" #include "build/build_config.h"
#include "platform/audio/AudioArray.h"
namespace blink { namespace blink {
namespace VectorMath { namespace VectorMath {
...@@ -26,7 +27,8 @@ static ALWAYS_INLINE void Conv(const float* source_p, ...@@ -26,7 +27,8 @@ static ALWAYS_INLINE void Conv(const float* source_p,
float* dest_p, float* dest_p,
int dest_stride, int dest_stride,
size_t frames_to_process, size_t frames_to_process,
size_t filter_size) { size_t filter_size,
const AudioFloatArray* /*prepared_filter*/) {
#if defined(ARCH_CPU_X86) #if defined(ARCH_CPU_X86)
::conv(source_p, source_stride, filter_p, filter_stride, dest_p, dest_stride, ::conv(source_p, source_stride, filter_p, filter_stride, dest_p, dest_stride,
frames_to_process, filter_size); frames_to_process, filter_size);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment