Commit b401d9b0 authored by Eero Häkkinen, committed by Commit Bot

Optimize blink::VectorMath::Conv for AVX

This CL doubles the performance on an Intel Broadwell CPU.

Bug: 778262
Change-Id: Ie60d64a0e862148a8a74e1fb162be2ec147e6cc6
Reviewed-on: https://chromium-review.googlesource.com/924144
Commit-Queue: Eero Häkkinen <eero.hakkinen@intel.com>
Reviewed-by: Raymond Toy <rtoy@chromium.org>
Cr-Commit-Position: refs/heads/master@{#544319}
parent cdc7defb
......@@ -28,6 +28,8 @@
#include "platform/audio/DirectConvolver.h"
#include <utility>
#include "build/build_config.h"
#include "platform/audio/VectorMath.h"
......@@ -41,13 +43,23 @@
namespace blink {
using namespace VectorMath;
DirectConvolver::DirectConvolver(size_t input_block_size)
: input_block_size_(input_block_size), buffer_(input_block_size * 2) {}
namespace {
using VectorMath::Conv;
using VectorMath::PrepareFilterForConv;
} // namespace
// Takes ownership of |convolution_kernel| and immediately derives the
// prepared form of it that VectorMath::Conv consumes. The working buffer is
// sized to hold two input blocks.
DirectConvolver::DirectConvolver(
    size_t input_block_size,
    std::unique_ptr<AudioFloatArray> convolution_kernel)
    : input_block_size_(input_block_size),
      buffer_(input_block_size * 2),
      convolution_kernel_(std::move(convolution_kernel)) {
  const size_t tap_count = ConvolutionKernelSize();
  // The filter is handed over with stride -1, so the pointer passed in is the
  // address of the kernel's last element.
  const float* const last_tap = convolution_kernel_->Data() + tap_count - 1;
  PrepareFilterForConv(last_tap, -1, tap_count, &prepared_convolution_kernel_);
}
void DirectConvolver::Process(AudioFloatArray* convolution_kernel,
const float* source_p,
void DirectConvolver::Process(const float* source_p,
float* dest_p,
size_t frames_to_process) {
DCHECK_EQ(frames_to_process, input_block_size_);
......@@ -55,12 +67,12 @@ void DirectConvolver::Process(AudioFloatArray* convolution_kernel,
return;
// Only support kernel_size <= input_block_size_.
size_t kernel_size = convolution_kernel->size();
size_t kernel_size = ConvolutionKernelSize();
DCHECK_LE(kernel_size, input_block_size_);
if (kernel_size > input_block_size_)
return;
float* kernel_p = convolution_kernel->Data();
float* kernel_p = convolution_kernel_->Data();
// Sanity check
bool is_copy_good = kernel_p && source_p && dest_p && buffer_.Data();
......@@ -74,7 +86,7 @@ void DirectConvolver::Process(AudioFloatArray* convolution_kernel,
memcpy(input_p, source_p, sizeof(float) * frames_to_process);
Conv(input_p - kernel_size + 1, 1, kernel_p + kernel_size - 1, -1, dest_p, 1,
frames_to_process, kernel_size);
frames_to_process, kernel_size, &prepared_convolution_kernel_);
// Copy 2nd half of input buffer to 1st half.
memcpy(buffer_.Data(), input_p, sizeof(float) * frames_to_process);
......
......@@ -29,6 +29,8 @@
#ifndef DirectConvolver_h
#define DirectConvolver_h
#include <memory>
#include "platform/PlatformExport.h"
#include "platform/audio/AudioArray.h"
#include "platform/wtf/Allocator.h"
......@@ -41,19 +43,21 @@ class PLATFORM_EXPORT DirectConvolver {
WTF_MAKE_NONCOPYABLE(DirectConvolver);
public:
DirectConvolver(size_t input_block_size);
DirectConvolver(size_t input_block_size,
std::unique_ptr<AudioFloatArray> convolution_kernel);
void Process(AudioFloatArray* convolution_kernel,
const float* source_p,
float* dest_p,
size_t frames_to_process);
void Process(const float* source_p, float* dest_p, size_t frames_to_process);
void Reset();
size_t ConvolutionKernelSize() const { return convolution_kernel_->size(); }
private:
size_t input_block_size_;
AudioFloatArray buffer_;
std::unique_ptr<AudioFloatArray> convolution_kernel_;
AudioFloatArray prepared_convolution_kernel_;
};
} // namespace blink
......
......@@ -29,27 +29,27 @@
*/
#include "platform/audio/DownSampler.h"
#include <memory>
#include "platform/wtf/MathExtras.h"
namespace blink {
DownSampler::DownSampler(size_t input_block_size)
: input_block_size_(input_block_size),
reduced_kernel_(kDefaultKernelSize / 2),
convolver_(input_block_size / 2), // runs at 1/2 source sample-rate
temp_buffer_(input_block_size / 2),
input_buffer_(input_block_size * 2) {
InitializeKernel();
}
namespace {
// Computes ideal band-limited half-band filter coefficients.
// In other words, filter out all frequencies higher than 0.25 * Nyquist.
std::unique_ptr<AudioFloatArray> MakeReducedKernel(size_t size) {
auto reduced_kernel = std::make_unique<AudioFloatArray>(size / 2);
void DownSampler::InitializeKernel() {
// Blackman window parameters.
double alpha = 0.16;
double a0 = 0.5 * (1.0 - alpha);
double a1 = 0.5;
double a2 = 0.5 * alpha;
int n = kDefaultKernelSize;
int n = size;
int half_size = n / 2;
// Half-band filter.
......@@ -73,10 +73,21 @@ void DownSampler::InitializeKernel() {
// Then store only the odd terms in the kernel.
// In a sense, this is shifting forward in time by one sample-frame at the
// destination sample-rate.
reduced_kernel_[(i - 1) / 2] = sinc * window;
(*reduced_kernel)[(i - 1) / 2] = sinc * window;
}
return reduced_kernel;
}
} // namespace
// Sets up a down-sampler that works on blocks of |input_block_size| frames.
// The kernel returned by MakeReducedKernel(kDefaultKernelSize) is passed
// straight into the convolver, which is configured for half the input block
// size. The temporary buffer is half the input block size and the input
// buffer is twice the input block size.
DownSampler::DownSampler(size_t input_block_size)
: input_block_size_(input_block_size),
convolver_(input_block_size / 2, // runs at 1/2 source sample-rate
MakeReducedKernel(kDefaultKernelSize)),
temp_buffer_(input_block_size / 2),
input_buffer_(input_block_size * 2) {}
void DownSampler::Process(const float* source_p,
float* dest_p,
size_t source_frames_to_process) {
......@@ -93,7 +104,7 @@ void DownSampler::Process(const float* source_p,
return;
bool is_reduced_kernel_good =
reduced_kernel_.size() == kDefaultKernelSize / 2;
convolver_.ConvolutionKernelSize() == kDefaultKernelSize / 2;
DCHECK(is_reduced_kernel_good);
if (!is_reduced_kernel_good)
return;
......@@ -121,8 +132,7 @@ void DownSampler::Process(const float* source_p,
// Actually process odd_samples_p with the reduced kernel for efficiency.
// The theoretical kernel is double this size with 0 values for even terms
// (except center).
convolver_.Process(&reduced_kernel_, odd_samples_p, dest_p,
dest_frames_to_process);
convolver_.Process(odd_samples_p, dest_p, dest_frames_to_process);
// Now, account for the 0.5 term right in the middle of the kernel.
// This amounts to a delay-line of length halfSize (at the source
......@@ -145,7 +155,7 @@ void DownSampler::Reset() {
size_t DownSampler::LatencyFrames() const {
// Divide by two since this is a linear phase kernel and the delay is at the
// center of the kernel.
return reduced_kernel_.size() / 2;
return convolver_.ConvolutionKernelSize() / 2;
}
} // namespace blink
......@@ -45,7 +45,7 @@ class PLATFORM_EXPORT DownSampler {
WTF_MAKE_NONCOPYABLE(DownSampler);
public:
DownSampler(size_t input_block_size);
explicit DownSampler(size_t input_block_size);
// The destination buffer |dest_p| is of size source_frames_to_process / 2.
void Process(const float* source_p,
......@@ -62,11 +62,6 @@ class PLATFORM_EXPORT DownSampler {
size_t input_block_size_;
// Computes ideal band-limited half-band filter coefficients.
// In other words, filter out all frequencies higher than 0.25 * Nyquist.
void InitializeKernel();
AudioFloatArray reduced_kernel_;
// Half-band filter.
DirectConvolver convolver_;
......
......@@ -30,6 +30,7 @@
#include <algorithm>
#include <memory>
#include <utility>
#include "platform/audio/ReverbAccumulationBuffer.h"
#include "platform/audio/ReverbConvolver.h"
......@@ -38,8 +39,6 @@
namespace blink {
using namespace VectorMath;
ReverbConvolverStage::ReverbConvolverStage(
const float* impulse_response,
size_t,
......@@ -66,9 +65,10 @@ ReverbConvolverStage::ReverbConvolverStage(
DCHECK(!stage_offset);
DCHECK_LE(stage_length, fft_size / 2);
direct_kernel_ = std::make_unique<AudioFloatArray>(fft_size / 2);
direct_kernel_->CopyToRange(impulse_response, 0, stage_length);
direct_convolver_ = std::make_unique<DirectConvolver>(render_slice_size);
auto direct_kernel = std::make_unique<AudioFloatArray>(fft_size / 2);
direct_kernel->CopyToRange(impulse_response, 0, stage_length);
direct_convolver_ = std::make_unique<DirectConvolver>(
render_slice_size, std::move(direct_kernel));
}
temporary_buffer_.Allocate(render_slice_size);
......@@ -166,8 +166,8 @@ void ReverbConvolverStage::Process(const float* source,
fft_convolver_->Process(fft_kernel_.get(), pre_delayed_source,
temporary_buffer, frames_to_process);
else
direct_convolver_->Process(direct_kernel_.get(), pre_delayed_source,
temporary_buffer, frames_to_process);
direct_convolver_->Process(pre_delayed_source, temporary_buffer,
frames_to_process);
// Now accumulate into reverb's accumulation buffer.
accumulation_buffer_->Accumulate(temporary_buffer, frames_to_process,
......
......@@ -30,6 +30,7 @@
#define ReverbConvolverStage_h
#include <memory>
#include "platform/audio/AudioArray.h"
#include "platform/audio/FFTFrame.h"
#include "platform/wtf/Allocator.h"
......@@ -95,7 +96,6 @@ class PLATFORM_EXPORT ReverbConvolverStage {
AudioFloatArray temporary_buffer_;
bool direct_mode_;
std::unique_ptr<AudioFloatArray> direct_kernel_;
std::unique_ptr<DirectConvolver> direct_convolver_;
};
......
......@@ -29,27 +29,29 @@
*/
#include "platform/audio/UpSampler.h"
#include <memory>
#include "platform/wtf/MathExtras.h"
namespace blink {
UpSampler::UpSampler(size_t input_block_size)
: input_block_size_(input_block_size),
kernel_(kDefaultKernelSize),
convolver_(input_block_size),
temp_buffer_(input_block_size),
input_buffer_(input_block_size * 2) {
InitializeKernel();
}
namespace {
// Computes ideal band-limited filter coefficients to sample in between each
// source sample-frame. This filter will be used to compute the odd
// sample-frames of the output.
std::unique_ptr<AudioFloatArray> MakeKernel(size_t size) {
std::unique_ptr<AudioFloatArray> kernel =
std::make_unique<AudioFloatArray>(size);
void UpSampler::InitializeKernel() {
// Blackman window parameters.
double alpha = 0.16;
double a0 = 0.5 * (1.0 - alpha);
double a1 = 0.5;
double a2 = 0.5 * alpha;
int n = kernel_.size();
int n = kernel->size();
int half_size = n / 2;
double subsample_offset = -0.5;
......@@ -64,10 +66,20 @@ void UpSampler::InitializeKernel() {
a0 - a1 * cos(twoPiDouble * x) + a2 * cos(twoPiDouble * 2.0 * x);
// Window the sinc() function.
kernel_[i] = sinc * window;
(*kernel)[i] = sinc * window;
}
return kernel;
}
} // namespace
// Sets up an up-sampler that works on blocks of |input_block_size| frames.
// The kernel returned by MakeKernel(kDefaultKernelSize) is passed straight
// into the convolver, which is configured for the full input block size. The
// temporary buffer is one input block long and the input buffer is two input
// blocks long.
UpSampler::UpSampler(size_t input_block_size)
: input_block_size_(input_block_size),
convolver_(input_block_size, MakeKernel(kDefaultKernelSize)),
temp_buffer_(input_block_size),
input_buffer_(input_block_size * 2) {}
void UpSampler::Process(const float* source_p,
float* dest_p,
size_t source_frames_to_process) {
......@@ -81,12 +93,13 @@ void UpSampler::Process(const float* source_p,
if (!is_temp_buffer_good)
return;
bool is_kernel_good = kernel_.size() == kDefaultKernelSize;
bool is_kernel_good =
convolver_.ConvolutionKernelSize() == kDefaultKernelSize;
DCHECK(is_kernel_good);
if (!is_kernel_good)
return;
size_t half_size = kernel_.size() / 2;
size_t half_size = convolver_.ConvolutionKernelSize() / 2;
// Copy source samples to 2nd half of input buffer.
bool is_input_buffer_good =
......@@ -106,8 +119,7 @@ void UpSampler::Process(const float* source_p,
// Compute odd sample-frames 1,3,5,7...
float* odd_samples_p = temp_buffer_.Data();
convolver_.Process(&kernel_, source_p, odd_samples_p,
source_frames_to_process);
convolver_.Process(source_p, odd_samples_p, source_frames_to_process);
for (unsigned i = 0; i < source_frames_to_process; ++i)
dest_p[i * 2 + 1] = odd_samples_p[i];
......@@ -125,7 +137,7 @@ void UpSampler::Reset() {
size_t UpSampler::LatencyFrames() const {
// Divide by two since this is a linear phase kernel and the delay is at the
// center of the kernel.
return kernel_.size() / 2;
return convolver_.ConvolutionKernelSize() / 2;
}
} // namespace blink
......@@ -45,7 +45,7 @@ class PLATFORM_EXPORT UpSampler {
WTF_MAKE_NONCOPYABLE(UpSampler);
public:
UpSampler(size_t input_block_size);
explicit UpSampler(size_t input_block_size);
// The destination buffer |dest_p| is of size source_frames_to_process * 2.
void Process(const float* source_p,
......@@ -62,12 +62,6 @@ class PLATFORM_EXPORT UpSampler {
size_t input_block_size_;
// Computes ideal band-limited filter coefficients to sample in between each
// source sample-frame. This filter will be used to compute the odd
// sample-frames of the output.
void InitializeKernel();
AudioFloatArray kernel_;
// Computes the odd sample-frames of the output.
DirectConvolver convolver_;
......
......@@ -61,6 +61,21 @@ namespace Impl = Scalar;
#endif
} // namespace
// Fills |prepared_filter| with a platform-specific representation of the
// filter for later use by Conv().
//
// |filter_p| and |filter_stride| describe the filter in the same way as the
// corresponding Conv() arguments; |filter_size| is the number of filter taps.
//
// On x86-family builds other than macOS this forwards to
// X86::PrepareFilterForConv. On every other configuration nothing beyond the
// DCHECKs is compiled in, so |prepared_filter| is left exactly as the caller
// passed it.
void PrepareFilterForConv(const float* filter_p,
int filter_stride,
size_t filter_size,
AudioFloatArray* prepared_filter) {
// Only contiguous convolution is implemented by all implementations.
// Correlation (positive |filter_stride|) and support for non-contiguous
// vectors are not implemented by all implementations.
DCHECK_EQ(-1, filter_stride);
DCHECK(prepared_filter);
#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_MACOSX)
X86::PrepareFilterForConv(filter_p, filter_stride, filter_size,
prepared_filter);
#endif
}
void Conv(const float* source_p,
int source_stride,
const float* filter_p,
......@@ -68,7 +83,8 @@ void Conv(const float* source_p,
float* dest_p,
int dest_stride,
size_t frames_to_process,
size_t filter_size) {
size_t filter_size,
const AudioFloatArray* prepared_filter) {
// Only contiguous convolution is implemented by all implementations.
// Correlation (positive |filter_stride|) and support for non-contiguous
// vectors are not implemented by all implementations.
......@@ -76,7 +92,7 @@ void Conv(const float* source_p,
DCHECK_EQ(-1, filter_stride);
DCHECK_EQ(1, dest_stride);
Impl::Conv(source_p, source_stride, filter_p, filter_stride, dest_p,
dest_stride, frames_to_process, filter_size);
dest_stride, frames_to_process, filter_size, prepared_filter);
}
void Vadd(const float* source1p,
......
......@@ -29,6 +29,7 @@
#include <cstddef>
#include "platform/PlatformExport.h"
#include "platform/audio/AudioArray.h"
// Defines the interface for several vector math functions whose implementation
// will ideally be optimized.
......@@ -47,7 +48,14 @@ PLATFORM_EXPORT void Conv(const float* source_p,
float* dest_p,
int dest_stride,
size_t frames_to_process,
size_t filter_size);
size_t filter_size,
const AudioFloatArray* prepared_filter);
// Prepare filter for Conv for faster processing.
PLATFORM_EXPORT void PrepareFilterForConv(const float* filter_p,
int filter_stride,
size_t filter_size,
AudioFloatArray* prepared_filter);
// Vector scalar multiply and then add.
//
......
......@@ -8,6 +8,7 @@
#include <algorithm>
#include <cmath>
#include "platform/audio/AudioArray.h"
#include "platform/wtf/Assertions.h"
#include "platform/wtf/MathExtras.h"
......@@ -22,7 +23,8 @@ static ALWAYS_INLINE void Conv(const float* source_p,
float* dest_p,
int dest_stride,
size_t frames_to_process,
size_t filter_size) {
size_t filter_size,
const AudioFloatArray* /*prepared_filter*/) {
// Only contiguous convolution is implemented. Correlation (positive
// |filter_stride|) and support for non-contiguous vectors are not
// implemented.
......@@ -30,26 +32,22 @@ static ALWAYS_INLINE void Conv(const float* source_p,
DCHECK_EQ(-1, filter_stride);
DCHECK_EQ(1, dest_stride);
size_t kernel_size = filter_size;
const float* input_p = source_p + kernel_size - 1;
const float* kernel_p = filter_p + 1 - kernel_size;
size_t i = 0;
// FIXME: The macro can be further optimized to avoid pipeline stalls. One
// possibility is to maintain 4 separate sums and change the macro to
// CONVOLVE_FOUR_SAMPLES.
#define CONVOLVE_ONE_SAMPLE \
do { \
sum += input_p[i - j] * kernel_p[j]; \
j++; \
#define CONVOLVE_ONE_SAMPLE \
do { \
sum += source_p[i + j] * *(filter_p - j); \
j++; \
} while (0)
while (i < frames_to_process) {
size_t j = 0;
float sum = 0;
if (kernel_size == 32) {
if (filter_size == 32) {
CONVOLVE_ONE_SAMPLE; // 1
CONVOLVE_ONE_SAMPLE; // 2
CONVOLVE_ONE_SAMPLE; // 3
......@@ -86,7 +84,7 @@ static ALWAYS_INLINE void Conv(const float* source_p,
CONVOLVE_ONE_SAMPLE; // 31
CONVOLVE_ONE_SAMPLE; // 32
} else if (kernel_size == 64) {
} else if (filter_size == 64) {
CONVOLVE_ONE_SAMPLE; // 1
CONVOLVE_ONE_SAMPLE; // 2
CONVOLVE_ONE_SAMPLE; // 3
......@@ -158,7 +156,7 @@ static ALWAYS_INLINE void Conv(const float* source_p,
CONVOLVE_ONE_SAMPLE; // 63
CONVOLVE_ONE_SAMPLE; // 64
} else if (kernel_size == 128) {
} else if (filter_size == 128) {
CONVOLVE_ONE_SAMPLE; // 1
CONVOLVE_ONE_SAMPLE; // 2
CONVOLVE_ONE_SAMPLE; // 3
......@@ -300,7 +298,7 @@ static ALWAYS_INLINE void Conv(const float* source_p,
CONVOLVE_ONE_SAMPLE; // 127
CONVOLVE_ONE_SAMPLE; // 128
} else {
while (j < kernel_size) {
while (j < filter_size) {
// Non-optimized using actual while loop.
CONVOLVE_ONE_SAMPLE;
}
......
......@@ -264,7 +264,7 @@ TEST_F(VectorMathTest, Conv) {
for (const auto& source : GetPrimaryVectors(GetSource(kFullyFiniteSource))) {
if (source.stride() != 1)
continue;
for (size_t filter_size : {3u, 20u, 32u, 64u, 128u}) {
for (size_t filter_size : {3u, 32u, 64u, 128u}) {
// The maximum number of frames which could be processed here is
// |source.size() - filter_size + 1|. However, in order to test
// optimization paths, |frames_to_process| should be optimal (divisible
......@@ -289,8 +289,10 @@ TEST_F(VectorMathTest, Conv) {
}
for (auto& dest : GetSecondaryVectors(
GetDestination(1u), source.memory_layout(), frames_to_process)) {
AudioFloatArray prepared_filter;
PrepareFilterForConv(filter_p, -1, filter_size, &prepared_filter);
Conv(source.p(), 1, filter_p, -1, dest.p(), 1, frames_to_process,
filter_size);
filter_size, &prepared_filter);
for (size_t i = 0u; i < frames_to_process; ++i) {
EXPECT_NEAR(expected_dest[i], dest[i],
1e-3 * std::abs(expected_dest[i]));
......
......@@ -7,6 +7,8 @@
#include <cstddef>
#include "platform/audio/AudioArray.h"
namespace blink {
namespace VectorMath {
namespace AVX {
......@@ -17,6 +19,21 @@ constexpr size_t kFramesToProcessMask = ~(kPackedFloatsPerRegister - 1u);
bool IsAligned(const float*);
// Direct vector convolution:
// dest[k] = sum(source[k+m]*filter[m*filter_stride]) for all m
// provided that |prepared_filter_p| is |prepared_filter->Data()| and that
// |prepared_filter| is prepared with |PrepareFilterForConv|.
void Conv(const float* source_p,
const float* prepared_filter_p,
float* dest_p,
size_t frames_to_process,
size_t filter_size);
void PrepareFilterForConv(const float* filter_p,
int filter_stride,
size_t filter_size,
AudioFloatArray* prepared_filter);
// dest[k] = source1[k] + source2[k]
void Vadd(const float* source1p,
const float* source2p,
......
......@@ -2,25 +2,98 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This file intentionally does not have header guards, it's included from
// VectorMathAVX.h and from VectorMathSSE.h with different macro definitions.
// The following line silences a presubmit warning that would otherwise be
// triggered by this: no-include-guard-because-multiply-included
#include "build/build_config.h"
#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_MACOSX)
#include "platform/wtf/Assertions.h"
#include <algorithm>
#include <cmath>
#include "platform/audio/AudioArray.h"
#include "platform/wtf/Assertions.h"
namespace blink {
namespace VectorMath {
namespace VECTOR_MATH_SIMD_NAMESPACE_NAME {
// This stride is chosen so that the same prepared filter created by
// AVX::PrepareFilterForConv can be used by both AVX::Conv and SSE::Conv.
// A prepared filter created by SSE::PrepareFilterForConv can only be used
// by SSE::Conv.
constexpr size_t kReversedFilterStride = 8u / kPackedFloatsPerRegister;
bool IsAligned(const float* p) {
constexpr size_t kBytesPerRegister = kBitsPerRegister / 8u;
constexpr size_t kAlignmentOffsetMask = kBytesPerRegister - 1u;
return (reinterpret_cast<size_t>(p) & kAlignmentOffsetMask) == 0u;
}
void PrepareFilterForConv(const float* filter_p,
int filter_stride,
size_t filter_size,
AudioFloatArray* prepared_filter) {
// Only contiguous convolution is implemented. Correlation (positive
// |filter_stride|) and support for non-contiguous vectors are not
// implemented.
DCHECK_EQ(-1, filter_stride);
DCHECK(prepared_filter);
// Reverse the filter and repeat each value across a vector
prepared_filter->Allocate(kReversedFilterStride * kPackedFloatsPerRegister *
filter_size);
MType* reversed_filter = reinterpret_cast<MType*>(prepared_filter->Data());
for (size_t i = 0; i < filter_size; ++i) {
reversed_filter[kReversedFilterStride * i] = MM_PS(set1)(*(filter_p - i));
}
}
// Direct vector convolution:
// dest[k] = sum(source[k+m]*filter[m*filter_stride]) for all m
// provided that |prepared_filter_p| is |prepared_filter->Data()| and that
// |prepared_filter| is prepared with |PrepareFilterForConv|.
void Conv(const float* source_p,
const float* prepared_filter_p,
float* dest_p,
size_t frames_to_process,
size_t filter_size) {
const float* const dest_end_p = dest_p + frames_to_process;
DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
DCHECK_EQ(0u, filter_size % kPackedFloatsPerRegister);
const MType* reversed_filter =
reinterpret_cast<const MType*>(prepared_filter_p);
// Do convolution with kPackedFloatsPerRegister inputs at a time.
while (dest_p < dest_end_p) {
MType m_convolution_sum = MM_PS(setzero)();
// |filter_size| is a multiple of kPackedFloatsPerRegister so we can unroll
// the loop by kPackedFloatsPerRegister, manually.
for (size_t i = 0; i < filter_size; i += kPackedFloatsPerRegister) {
for (size_t j = 0; j < kPackedFloatsPerRegister; ++j) {
size_t k = i + j;
MType m_product;
MType m_source;
m_source = MM_PS(loadu)(source_p + k);
m_product =
MM_PS(mul)(reversed_filter[kReversedFilterStride * k], m_source);
m_convolution_sum = MM_PS(add)(m_convolution_sum, m_product);
}
}
MM_PS(storeu)(dest_p, m_convolution_sum);
source_p += kPackedFloatsPerRegister;
dest_p += kPackedFloatsPerRegister;
}
}
// dest[k] = source1[k] + source2[k]
void Vadd(const float* source1p,
const float* source2p,
......
......@@ -7,6 +7,8 @@
#include <cstddef>
#include "platform/audio/AudioArray.h"
namespace blink {
namespace VectorMath {
namespace SSE {
......@@ -17,6 +19,21 @@ constexpr size_t kFramesToProcessMask = ~(kPackedFloatsPerRegister - 1u);
bool IsAligned(const float*);
// Direct vector convolution:
// dest[k] = sum(source[k+m]*filter[m*filter_stride]) for all m
// provided that |prepared_filter_p| is |prepared_filter->Data()| and that
// |prepared_filter| is prepared with |PrepareFilterForConv|.
void Conv(const float* source_p,
const float* prepared_filter_p,
float* dest_p,
size_t frames_to_process,
size_t filter_size);
void PrepareFilterForConv(const float* filter_p,
int filter_stride,
size_t filter_size,
AudioFloatArray* prepared_filter);
// dest[k] = source1[k] + source2[k]
void Vadd(const float* source1p,
const float* source2p,
......
......@@ -6,14 +6,11 @@
#define VectorMathX86_h
#include "base/cpu.h"
#include "platform/audio/AudioArray.h"
#include "platform/audio/VectorMathScalar.h"
#include "platform/audio/cpu/x86/VectorMathAVX.h"
#include "platform/audio/cpu/x86/VectorMathSSE.h"
#include "platform/wtf/Assertions.h"
#include <xmmintrin.h>
namespace blink {
namespace VectorMath {
namespace X86 {
......@@ -94,6 +91,20 @@ SplitFramesToProcess(const float* source_p, size_t frames_to_process) {
return counts;
}
// Chooses the SIMD flavour of PrepareFilterForConv at run time: the AVX
// version when CPUSupportsAVX() reports true, the SSE version otherwise. All
// arguments are forwarded unchanged.
static ALWAYS_INLINE void PrepareFilterForConv(
    const float* filter_p,
    int filter_stride,
    size_t filter_size,
    AudioFloatArray* prepared_filter) {
  if (!CPUSupportsAVX()) {
    SSE::PrepareFilterForConv(filter_p, filter_stride, filter_size,
                              prepared_filter);
    return;
  }
  AVX::PrepareFilterForConv(filter_p, filter_stride, filter_size,
                            prepared_filter);
}
static ALWAYS_INLINE void Conv(const float* source_p,
int source_stride,
const float* filter_p,
......@@ -101,61 +112,30 @@ static ALWAYS_INLINE void Conv(const float* source_p,
float* dest_p,
int dest_stride,
size_t frames_to_process,
size_t filter_size) {
// Only contiguous convolution is implemented. Correlation (positive
// |filter_stride|) and support for non-contiguous vectors are not
// implemented.
DCHECK_EQ(1, source_stride);
DCHECK_EQ(-1, filter_stride);
DCHECK_EQ(1, dest_stride);
size_t kernel_size = filter_size;
const float* input_p = source_p + kernel_size - 1;
const float* kernel_p = filter_p + 1 - kernel_size;
size_t i = 0;
// Convolution using SSE2. Currently only do this if both |kernel_size| and
// |frames_to_process| are multiples of 4. If not, use Scalar::Conv.
if ((kernel_size % 4 == 0) && (frames_to_process % 4 == 0)) {
// AudioFloatArray's are always aligned on at least a 32-byte boundary.
AudioFloatArray kernel_buffer(4 * kernel_size);
__m128* kernel_reversed = reinterpret_cast<__m128*>(kernel_buffer.Data());
// Reverse the kernel and repeat each value across a vector
for (i = 0; i < kernel_size; ++i) {
kernel_reversed[i] = _mm_set1_ps(kernel_p[kernel_size - i - 1]);
size_t filter_size,
const AudioFloatArray* prepared_filter) {
const float* prepared_filter_p =
prepared_filter ? prepared_filter->Data() : nullptr;
if (source_stride == 1 && dest_stride == 1 && prepared_filter_p) {
if (CPUSupportsAVX() && (filter_size & ~AVX::kFramesToProcessMask) == 0u) {
// |frames_to_process| is always a multiple of the render quantum and
// therefore the frames can always be processed using AVX.
CHECK_EQ(frames_to_process & ~AVX::kFramesToProcessMask, 0u);
AVX::Conv(source_p, prepared_filter_p, dest_p, frames_to_process,
filter_size);
return;
}
const float* input_start_p = input_p - kernel_size + 1;
// Do convolution with 4 inputs at a time.
for (i = 0; i < frames_to_process; i += 4) {
__m128 convolution_sum;
convolution_sum = _mm_setzero_ps();
// |kernel_size| is a multiple of 4 so we can unroll the loop by 4,
// manually.
for (size_t k = 0; k < kernel_size; k += 4) {
size_t data_offset = i + k;
for (size_t m = 0; m < 4; ++m) {
__m128 source_block;
__m128 product;
source_block = _mm_loadu_ps(input_start_p + data_offset + m);
product = _mm_mul_ps(kernel_reversed[k + m], source_block);
convolution_sum = _mm_add_ps(convolution_sum, product);
}
}
_mm_storeu_ps(dest_p + i, convolution_sum);
if ((filter_size & ~SSE::kFramesToProcessMask) == 0u) {
// |frames_to_process| is always a multiple of the render quantum and
// therefore the frames can always be processed using SSE.
CHECK_EQ(frames_to_process & ~SSE::kFramesToProcessMask, 0u);
SSE::Conv(source_p, prepared_filter_p, dest_p, frames_to_process,
filter_size);
return;
}
} else {
Scalar::Conv(source_p, source_stride, filter_p, filter_stride, dest_p,
dest_stride, frames_to_process, filter_size);
}
Scalar::Conv(source_p, source_stride, filter_p, filter_stride, dest_p,
dest_stride, frames_to_process, filter_size, nullptr);
}
static ALWAYS_INLINE void Vadd(const float* source1p,
......
......@@ -8,6 +8,7 @@
#include <Accelerate/Accelerate.h>
#include "build/build_config.h"
#include "platform/audio/AudioArray.h"
namespace blink {
namespace VectorMath {
......@@ -26,7 +27,8 @@ static ALWAYS_INLINE void Conv(const float* source_p,
float* dest_p,
int dest_stride,
size_t frames_to_process,
size_t filter_size) {
size_t filter_size,
const AudioFloatArray* /*prepared_filter*/) {
#if defined(ARCH_CPU_X86)
::conv(source_p, source_stride, filter_p, filter_stride, dest_p, dest_stride,
frames_to_process, filter_size);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment