Commit 13a6b23c authored by Eero Häkkinen, committed by Commit Bot

Move vector convolution from DirectConvolver to blink::VectorMath

This CL adds a new function, blink::VectorMath::Conv, based on code in
blink::DirectConvolver, modifies DirectConvolver to use the new
function, and adds a unit test for it.

This is preparation for AVX-optimized vector convolution.
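
For illustration only (not part of this CL's diff): with contiguous data and
the stride combination (1, -1, 1) that the implementations support, a caller
would use the new function roughly as below. The array and constant names are
hypothetical.

  const size_t kKernelSize = 32u;
  const size_t kFramesToProcess = 128u;
  // The source must provide kFramesToProcess + kKernelSize - 1 samples.
  float signal[kFramesToProcess + kKernelSize] = {};
  float reversed_kernel[kKernelSize] = {};  // filter taps stored in reverse order
  float output[kFramesToProcess] = {};
  // The filter pointer is the last element of the reversed kernel and is
  // walked with stride -1, so the taps are read backwards in memory.
  blink::VectorMath::Conv(signal, 1, reversed_kernel + kKernelSize - 1, -1,
                          output, 1, kFramesToProcess, kKernelSize);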

Bug: 778262
Change-Id: Iae970e9c1ecb359c2375305b6af8c54e853dcdc1
Reviewed-on: https://chromium-review.googlesource.com/924143
Commit-Queue: Eero Häkkinen <eero.hakkinen@intel.com>
Reviewed-by: Raymond Toy <rtoy@chromium.org>
Cr-Commit-Position: refs/heads/master@{#544164}
parent 27844bb9
...@@ -61,6 +61,24 @@ namespace Impl = Scalar;
#endif
} // namespace
void Conv(const float* source_p,
int source_stride,
const float* filter_p,
int filter_stride,
float* dest_p,
int dest_stride,
size_t frames_to_process,
size_t filter_size) {
// Only contiguous convolution is implemented by all implementations.
// Correlation (positive |filter_stride|) and support for non-contiguous
// vectors are not implemented by all implementations.
DCHECK_EQ(1, source_stride);
DCHECK_EQ(-1, filter_stride);
DCHECK_EQ(1, dest_stride);
Impl::Conv(source_p, source_stride, filter_p, filter_stride, dest_p,
dest_stride, frames_to_process, filter_size);
}
void Vadd(const float* source1p,
int source_stride1,
const float* source2p,
...
...@@ -27,6 +27,7 @@
#define VectorMath_h
#include <cstddef>
#include "platform/PlatformExport.h"
// Defines the interface for several vector math functions whose implementation
...@@ -35,6 +36,19 @@
namespace blink {
namespace VectorMath {
// Direct vector convolution:
//
// dest[k*dest_stride] =
// sum(source[(k+m)*source_stride]*filter[m*filter_stride]) for all m
PLATFORM_EXPORT void Conv(const float* source_p,
int source_stride,
const float* filter_p,
int filter_stride,
float* dest_p,
int dest_stride,
size_t frames_to_process,
size_t filter_size);
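// Illustration (not part of the original header): with the contiguous strides
// the implementations support (source_stride == 1, filter_stride == -1,
// dest_stride == 1), the declaration above computes the equivalent of
//
//   for (size_t k = 0u; k < frames_to_process; ++k) {
//     float sum = 0;
//     for (size_t m = 0u; m < filter_size; ++m)
//       sum += source_p[k + m] * filter_p[-static_cast<ptrdiff_t>(m)];
//     dest_p[k] = sum;
//   }
//
// so |filter_p| points at the last tap of a reversed kernel and the taps are
// read backwards in memory.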
// Vector scalar multiply and then add.
//
// dest[k*dest_stride] += scale * source[k*source_stride]
...
...@@ -8,12 +8,308 @@
#include <algorithm>
#include <cmath>
#include "platform/wtf/Assertions.h"
#include "platform/wtf/MathExtras.h"
namespace blink {
namespace VectorMath {
namespace Scalar {
static ALWAYS_INLINE void Conv(const float* source_p,
int source_stride,
const float* filter_p,
int filter_stride,
float* dest_p,
int dest_stride,
size_t frames_to_process,
size_t filter_size) {
// Only contiguous convolution is implemented. Correlation (positive
// |filter_stride|) and support for non-contiguous vectors are not
// implemented.
DCHECK_EQ(1, source_stride);
DCHECK_EQ(-1, filter_stride);
DCHECK_EQ(1, dest_stride);
size_t kernel_size = filter_size;
const float* input_p = source_p + kernel_size - 1;
const float* kernel_p = filter_p + 1 - kernel_size;
size_t i = 0;
// FIXME: The macro can be further optimized to avoid pipeline stalls. One
// possibility is to maintain 4 separate sums and change the macro to
// CONVOLVE_FOUR_SAMPLES.
#define CONVOLVE_ONE_SAMPLE \
do { \
sum += input_p[i - j] * kernel_p[j]; \
j++; \
} while (0)
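// Illustration (not part of this CL): one possible CONVOLVE_FOUR_SAMPLES from
// the FIXME above. Four independent partial sums (sum0..sum3) shorten the
// dependency chain on a single |sum|; they would be added together before
// storing dest_p[i].
//
//   #define CONVOLVE_FOUR_SAMPLES                        \
//     do {                                               \
//       sum0 += input_p[i - j] * kernel_p[j];            \
//       sum1 += input_p[i - (j + 1)] * kernel_p[j + 1];  \
//       sum2 += input_p[i - (j + 2)] * kernel_p[j + 2];  \
//       sum3 += input_p[i - (j + 3)] * kernel_p[j + 3];  \
//       j += 4;                                          \
//     } while (0)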
while (i < frames_to_process) {
size_t j = 0;
float sum = 0;
if (kernel_size == 32) {
CONVOLVE_ONE_SAMPLE; // 1
CONVOLVE_ONE_SAMPLE; // 2
CONVOLVE_ONE_SAMPLE; // 3
CONVOLVE_ONE_SAMPLE; // 4
CONVOLVE_ONE_SAMPLE; // 5
CONVOLVE_ONE_SAMPLE; // 6
CONVOLVE_ONE_SAMPLE; // 7
CONVOLVE_ONE_SAMPLE; // 8
CONVOLVE_ONE_SAMPLE; // 9
CONVOLVE_ONE_SAMPLE; // 10
CONVOLVE_ONE_SAMPLE; // 11
CONVOLVE_ONE_SAMPLE; // 12
CONVOLVE_ONE_SAMPLE; // 13
CONVOLVE_ONE_SAMPLE; // 14
CONVOLVE_ONE_SAMPLE; // 15
CONVOLVE_ONE_SAMPLE; // 16
CONVOLVE_ONE_SAMPLE; // 17
CONVOLVE_ONE_SAMPLE; // 18
CONVOLVE_ONE_SAMPLE; // 19
CONVOLVE_ONE_SAMPLE; // 20
CONVOLVE_ONE_SAMPLE; // 21
CONVOLVE_ONE_SAMPLE; // 22
CONVOLVE_ONE_SAMPLE; // 23
CONVOLVE_ONE_SAMPLE; // 24
CONVOLVE_ONE_SAMPLE; // 25
CONVOLVE_ONE_SAMPLE; // 26
CONVOLVE_ONE_SAMPLE; // 27
CONVOLVE_ONE_SAMPLE; // 28
CONVOLVE_ONE_SAMPLE; // 29
CONVOLVE_ONE_SAMPLE; // 30
CONVOLVE_ONE_SAMPLE; // 31
CONVOLVE_ONE_SAMPLE; // 32
} else if (kernel_size == 64) {
CONVOLVE_ONE_SAMPLE; // 1
CONVOLVE_ONE_SAMPLE; // 2
CONVOLVE_ONE_SAMPLE; // 3
CONVOLVE_ONE_SAMPLE; // 4
CONVOLVE_ONE_SAMPLE; // 5
CONVOLVE_ONE_SAMPLE; // 6
CONVOLVE_ONE_SAMPLE; // 7
CONVOLVE_ONE_SAMPLE; // 8
CONVOLVE_ONE_SAMPLE; // 9
CONVOLVE_ONE_SAMPLE; // 10
CONVOLVE_ONE_SAMPLE; // 11
CONVOLVE_ONE_SAMPLE; // 12
CONVOLVE_ONE_SAMPLE; // 13
CONVOLVE_ONE_SAMPLE; // 14
CONVOLVE_ONE_SAMPLE; // 15
CONVOLVE_ONE_SAMPLE; // 16
CONVOLVE_ONE_SAMPLE; // 17
CONVOLVE_ONE_SAMPLE; // 18
CONVOLVE_ONE_SAMPLE; // 19
CONVOLVE_ONE_SAMPLE; // 20
CONVOLVE_ONE_SAMPLE; // 21
CONVOLVE_ONE_SAMPLE; // 22
CONVOLVE_ONE_SAMPLE; // 23
CONVOLVE_ONE_SAMPLE; // 24
CONVOLVE_ONE_SAMPLE; // 25
CONVOLVE_ONE_SAMPLE; // 26
CONVOLVE_ONE_SAMPLE; // 27
CONVOLVE_ONE_SAMPLE; // 28
CONVOLVE_ONE_SAMPLE; // 29
CONVOLVE_ONE_SAMPLE; // 30
CONVOLVE_ONE_SAMPLE; // 31
CONVOLVE_ONE_SAMPLE; // 32
CONVOLVE_ONE_SAMPLE; // 33
CONVOLVE_ONE_SAMPLE; // 34
CONVOLVE_ONE_SAMPLE; // 35
CONVOLVE_ONE_SAMPLE; // 36
CONVOLVE_ONE_SAMPLE; // 37
CONVOLVE_ONE_SAMPLE; // 38
CONVOLVE_ONE_SAMPLE; // 39
CONVOLVE_ONE_SAMPLE; // 40
CONVOLVE_ONE_SAMPLE; // 41
CONVOLVE_ONE_SAMPLE; // 42
CONVOLVE_ONE_SAMPLE; // 43
CONVOLVE_ONE_SAMPLE; // 44
CONVOLVE_ONE_SAMPLE; // 45
CONVOLVE_ONE_SAMPLE; // 46
CONVOLVE_ONE_SAMPLE; // 47
CONVOLVE_ONE_SAMPLE; // 48
CONVOLVE_ONE_SAMPLE; // 49
CONVOLVE_ONE_SAMPLE; // 50
CONVOLVE_ONE_SAMPLE; // 51
CONVOLVE_ONE_SAMPLE; // 52
CONVOLVE_ONE_SAMPLE; // 53
CONVOLVE_ONE_SAMPLE; // 54
CONVOLVE_ONE_SAMPLE; // 55
CONVOLVE_ONE_SAMPLE; // 56
CONVOLVE_ONE_SAMPLE; // 57
CONVOLVE_ONE_SAMPLE; // 58
CONVOLVE_ONE_SAMPLE; // 59
CONVOLVE_ONE_SAMPLE; // 60
CONVOLVE_ONE_SAMPLE; // 61
CONVOLVE_ONE_SAMPLE; // 62
CONVOLVE_ONE_SAMPLE; // 63
CONVOLVE_ONE_SAMPLE; // 64
} else if (kernel_size == 128) {
CONVOLVE_ONE_SAMPLE; // 1
CONVOLVE_ONE_SAMPLE; // 2
CONVOLVE_ONE_SAMPLE; // 3
CONVOLVE_ONE_SAMPLE; // 4
CONVOLVE_ONE_SAMPLE; // 5
CONVOLVE_ONE_SAMPLE; // 6
CONVOLVE_ONE_SAMPLE; // 7
CONVOLVE_ONE_SAMPLE; // 8
CONVOLVE_ONE_SAMPLE; // 9
CONVOLVE_ONE_SAMPLE; // 10
CONVOLVE_ONE_SAMPLE; // 11
CONVOLVE_ONE_SAMPLE; // 12
CONVOLVE_ONE_SAMPLE; // 13
CONVOLVE_ONE_SAMPLE; // 14
CONVOLVE_ONE_SAMPLE; // 15
CONVOLVE_ONE_SAMPLE; // 16
CONVOLVE_ONE_SAMPLE; // 17
CONVOLVE_ONE_SAMPLE; // 18
CONVOLVE_ONE_SAMPLE; // 19
CONVOLVE_ONE_SAMPLE; // 20
CONVOLVE_ONE_SAMPLE; // 21
CONVOLVE_ONE_SAMPLE; // 22
CONVOLVE_ONE_SAMPLE; // 23
CONVOLVE_ONE_SAMPLE; // 24
CONVOLVE_ONE_SAMPLE; // 25
CONVOLVE_ONE_SAMPLE; // 26
CONVOLVE_ONE_SAMPLE; // 27
CONVOLVE_ONE_SAMPLE; // 28
CONVOLVE_ONE_SAMPLE; // 29
CONVOLVE_ONE_SAMPLE; // 30
CONVOLVE_ONE_SAMPLE; // 31
CONVOLVE_ONE_SAMPLE; // 32
CONVOLVE_ONE_SAMPLE; // 33
CONVOLVE_ONE_SAMPLE; // 34
CONVOLVE_ONE_SAMPLE; // 35
CONVOLVE_ONE_SAMPLE; // 36
CONVOLVE_ONE_SAMPLE; // 37
CONVOLVE_ONE_SAMPLE; // 38
CONVOLVE_ONE_SAMPLE; // 39
CONVOLVE_ONE_SAMPLE; // 40
CONVOLVE_ONE_SAMPLE; // 41
CONVOLVE_ONE_SAMPLE; // 42
CONVOLVE_ONE_SAMPLE; // 43
CONVOLVE_ONE_SAMPLE; // 44
CONVOLVE_ONE_SAMPLE; // 45
CONVOLVE_ONE_SAMPLE; // 46
CONVOLVE_ONE_SAMPLE; // 47
CONVOLVE_ONE_SAMPLE; // 48
CONVOLVE_ONE_SAMPLE; // 49
CONVOLVE_ONE_SAMPLE; // 50
CONVOLVE_ONE_SAMPLE; // 51
CONVOLVE_ONE_SAMPLE; // 52
CONVOLVE_ONE_SAMPLE; // 53
CONVOLVE_ONE_SAMPLE; // 54
CONVOLVE_ONE_SAMPLE; // 55
CONVOLVE_ONE_SAMPLE; // 56
CONVOLVE_ONE_SAMPLE; // 57
CONVOLVE_ONE_SAMPLE; // 58
CONVOLVE_ONE_SAMPLE; // 59
CONVOLVE_ONE_SAMPLE; // 60
CONVOLVE_ONE_SAMPLE; // 61
CONVOLVE_ONE_SAMPLE; // 62
CONVOLVE_ONE_SAMPLE; // 63
CONVOLVE_ONE_SAMPLE; // 64
CONVOLVE_ONE_SAMPLE; // 65
CONVOLVE_ONE_SAMPLE; // 66
CONVOLVE_ONE_SAMPLE; // 67
CONVOLVE_ONE_SAMPLE; // 68
CONVOLVE_ONE_SAMPLE; // 69
CONVOLVE_ONE_SAMPLE; // 70
CONVOLVE_ONE_SAMPLE; // 71
CONVOLVE_ONE_SAMPLE; // 72
CONVOLVE_ONE_SAMPLE; // 73
CONVOLVE_ONE_SAMPLE; // 74
CONVOLVE_ONE_SAMPLE; // 75
CONVOLVE_ONE_SAMPLE; // 76
CONVOLVE_ONE_SAMPLE; // 77
CONVOLVE_ONE_SAMPLE; // 78
CONVOLVE_ONE_SAMPLE; // 79
CONVOLVE_ONE_SAMPLE; // 80
CONVOLVE_ONE_SAMPLE; // 81
CONVOLVE_ONE_SAMPLE; // 82
CONVOLVE_ONE_SAMPLE; // 83
CONVOLVE_ONE_SAMPLE; // 84
CONVOLVE_ONE_SAMPLE; // 85
CONVOLVE_ONE_SAMPLE; // 86
CONVOLVE_ONE_SAMPLE; // 87
CONVOLVE_ONE_SAMPLE; // 88
CONVOLVE_ONE_SAMPLE; // 89
CONVOLVE_ONE_SAMPLE; // 90
CONVOLVE_ONE_SAMPLE; // 91
CONVOLVE_ONE_SAMPLE; // 92
CONVOLVE_ONE_SAMPLE; // 93
CONVOLVE_ONE_SAMPLE; // 94
CONVOLVE_ONE_SAMPLE; // 95
CONVOLVE_ONE_SAMPLE; // 96
CONVOLVE_ONE_SAMPLE; // 97
CONVOLVE_ONE_SAMPLE; // 98
CONVOLVE_ONE_SAMPLE; // 99
CONVOLVE_ONE_SAMPLE; // 100
CONVOLVE_ONE_SAMPLE; // 101
CONVOLVE_ONE_SAMPLE; // 102
CONVOLVE_ONE_SAMPLE; // 103
CONVOLVE_ONE_SAMPLE; // 104
CONVOLVE_ONE_SAMPLE; // 105
CONVOLVE_ONE_SAMPLE; // 106
CONVOLVE_ONE_SAMPLE; // 107
CONVOLVE_ONE_SAMPLE; // 108
CONVOLVE_ONE_SAMPLE; // 109
CONVOLVE_ONE_SAMPLE; // 110
CONVOLVE_ONE_SAMPLE; // 111
CONVOLVE_ONE_SAMPLE; // 112
CONVOLVE_ONE_SAMPLE; // 113
CONVOLVE_ONE_SAMPLE; // 114
CONVOLVE_ONE_SAMPLE; // 115
CONVOLVE_ONE_SAMPLE; // 116
CONVOLVE_ONE_SAMPLE; // 117
CONVOLVE_ONE_SAMPLE; // 118
CONVOLVE_ONE_SAMPLE; // 119
CONVOLVE_ONE_SAMPLE; // 120
CONVOLVE_ONE_SAMPLE; // 121
CONVOLVE_ONE_SAMPLE; // 122
CONVOLVE_ONE_SAMPLE; // 123
CONVOLVE_ONE_SAMPLE; // 124
CONVOLVE_ONE_SAMPLE; // 125
CONVOLVE_ONE_SAMPLE; // 126
CONVOLVE_ONE_SAMPLE; // 127
CONVOLVE_ONE_SAMPLE; // 128
} else {
while (j < kernel_size) {
// Non-optimized using actual while loop.
CONVOLVE_ONE_SAMPLE;
}
}
dest_p[i++] = sum;
}
#undef CONVOLVE_ONE_SAMPLE
}
static ALWAYS_INLINE void Vadd(const float* source1p,
int source_stride1,
const float* source2p,
...
...@@ -32,7 +32,7 @@ constexpr size_t kMaxByteAlignment = kMaxBitAlignment / 8u;
constexpr size_t kMaxStride = 2u;
constexpr MemoryLayout kMemoryLayouts[] = {
{kMaxByteAlignment / 2u - kMaxByteAlignment / 4u, 1u},
{kMaxByteAlignment / 4u, 1u},
{kMaxByteAlignment / 2u, 1u},
{kMaxByteAlignment / 2u + kMaxByteAlignment / 4u, 1u},
{kMaxByteAlignment, 1u},
...@@ -75,7 +75,7 @@ class TestVector {
// These types are used by std::iterator_traits used by std::equal used by
// TestVector::operator==.
using difference_type = ptrdiff_t;
using iterator_category = std::input_iterator_tag;
using iterator_category = std::bidirectional_iterator_tag;
using pointer = T*;
using reference = T&;
using value_type = T;
...@@ -91,6 +91,15 @@ class TestVector {
++(*this);
return iter;
}
Iterator& operator--() {
p_ -= stride_;
return *this;
}
Iterator operator--(int) {
Iterator iter = *this;
--(*this);
return iter;
}
bool operator==(const Iterator& other) const { return p_ == other.p_; }
bool operator!=(const Iterator& other) const { return !(*this == other); }
T& operator*() const { return *p_; }
...@@ -101,6 +110,8 @@
};
public:
using ReverseIterator = std::reverse_iterator<Iterator>;
// These types are used internally by Google Test.
using const_iterator = Iterator;
using iterator = Iterator;
...@@ -117,6 +128,8 @@ class TestVector {
Iterator begin() const { return Iterator(p_, stride()); }
Iterator end() const { return Iterator(p_ + size() * stride(), stride()); }
ReverseIterator rbegin() const { return ReverseIterator(end()); }
ReverseIterator rend() const { return ReverseIterator(begin()); }
const MemoryLayout* memory_layout() const { return memory_layout_; }
T* p() const { return p_; }
size_t size() const { return size_; }
...@@ -175,17 +188,27 @@ GetPrimaryVectors(const T* base) {
template <typename T>
std::array<TestVector<T>, 2u> GetSecondaryVectors(
T* base,
const TestVector<const float>& primary_vector) {
const MemoryLayout* primary_memory_layout,
size_t size) {
std::array<TestVector<T>, 2u> vectors;
const MemoryLayout* primary_memory_layout = primary_vector.memory_layout();
const MemoryLayout* other_memory_layout =
&kMemoryLayouts[primary_memory_layout == &kMemoryLayouts[0]];
CHECK_NE(primary_memory_layout, other_memory_layout);
vectors[0] = TestVector<T>(base, primary_vector);
vectors[1] = TestVector<T>(base, other_memory_layout, primary_vector.size());
CHECK_NE(primary_memory_layout->byte_alignment,
other_memory_layout->byte_alignment);
vectors[0] = TestVector<T>(base, primary_memory_layout, size);
vectors[1] = TestVector<T>(base, other_memory_layout, size);
return vectors;
}
template <typename T>
std::array<TestVector<T>, 2u> GetSecondaryVectors(
T* base,
const TestVector<const float>& primary_vector) {
return GetSecondaryVectors(base, primary_vector.memory_layout(),
primary_vector.size());
}
class VectorMathTest : public ::testing::Test {
protected:
enum {
...@@ -194,8 +217,9 @@ class VectorMathTest : public ::testing::Test {
(kMaxStride * kMaxVectorSizeInBytes + kMaxByteAlignment - 1u) /
sizeof(float),
kFullyFiniteSource = 4u,
kFullyNonNanSource = 5u,
kSourceCount = 6u
kFullyFiniteSource2 = 5u,
kFullyNonNanSource = 6u,
kSourceCount = 7u
};
// Get a destination buffer containing initially uninitialized floats.
...@@ -219,7 +243,7 @@ class VectorMathTest : public ::testing::Test {
std::uniform_int_distribution<size_t> index_distribution(
0u, kFloatArraySize / 2u - 1u);
for (size_t i = 0u; i < kSourceCount; ++i) {
if (i == kFullyFiniteSource)
if (i == kFullyFiniteSource || i == kFullyFiniteSource2)
continue;
sources_[i][index_distribution(generator)] = INFINITY;
sources_[i][index_distribution(generator)] = -INFINITY;
sources_[i][index_distribution(generator)] = -INFINITY; sources_[i][index_distribution(generator)] = -INFINITY;
...@@ -236,6 +260,46 @@ class VectorMathTest : public ::testing::Test { ...@@ -236,6 +260,46 @@ class VectorMathTest : public ::testing::Test {
float VectorMathTest::destinations_[kDestinationCount][kFloatArraySize]; float VectorMathTest::destinations_[kDestinationCount][kFloatArraySize];
float VectorMathTest::sources_[kSourceCount][kFloatArraySize]; float VectorMathTest::sources_[kSourceCount][kFloatArraySize];
TEST_F(VectorMathTest, Conv) {
for (const auto& source : GetPrimaryVectors(GetSource(kFullyFiniteSource))) {
if (source.stride() != 1)
continue;
for (size_t filter_size : {3u, 20u, 32u, 64u, 128u}) {
// The maximum number of frames which could be processed here is
// |source.size() - filter_size + 1|. However, in order to test
// optimization paths, |frames_to_process| should be optimal (divisible
// by a power of 2) whenever |filter_size| is optimal. Therefore, let's
// process only |source.size() - filter_size| frames here.
if (filter_size >= source.size())
break;
size_t frames_to_process = source.size() - filter_size;
// The stride of a convolution filter must be -1. Let's first create
// a reversed filter whose stride is 1.
TestVector<const float> reversed_filter(
GetSource(kFullyFiniteSource2), source.memory_layout(), filter_size);
// The filter begins from the reverse beginning of the reversed filter
// and grows downwards.
const float* filter_p = &*reversed_filter.rbegin();
TestVector<float> expected_dest(
GetDestination(0u), source.memory_layout(), frames_to_process);
for (size_t i = 0u; i < frames_to_process; ++i) {
expected_dest[i] = 0u;
for (size_t j = 0u; j < filter_size; ++j)
expected_dest[i] += source[i + j] * *(filter_p - j);
}
for (auto& dest : GetSecondaryVectors(
GetDestination(1u), source.memory_layout(), frames_to_process)) {
Conv(source.p(), 1, filter_p, -1, dest.p(), 1, frames_to_process,
filter_size);
for (size_t i = 0u; i < frames_to_process; ++i) {
EXPECT_NEAR(expected_dest[i], dest[i],
1e-3 * std::abs(expected_dest[i]));
}
}
}
}
}
TEST_F(VectorMathTest, Vadd) {
for (const auto& source1 : GetPrimaryVectors(GetSource(0u))) {
for (const auto& source2 : GetSecondaryVectors(GetSource(1u), source1)) {
...
...@@ -15,6 +15,9 @@ namespace blink {
namespace VectorMath {
namespace NEON {
// TODO: Consider optimizing this.
using Scalar::Conv;
static ALWAYS_INLINE void Vadd(const float* source1p,
int source_stride1,
const float* source2p,
...
...@@ -14,6 +14,11 @@ namespace blink {
namespace VectorMath {
namespace MSA {
// TODO: Consider optimizing these.
using Scalar::Conv;
using Scalar::Vsvesq;
using Scalar::Zvmul;
static ALWAYS_INLINE void Vadd(const float* source1p,
int source_stride1,
const float* source2p,
...@@ -207,24 +212,6 @@ static ALWAYS_INLINE void Vsmul(const float* source_p,
Scalar::Vsmul(source_p, source_stride, scale, dest_p, dest_stride, n);
}
static ALWAYS_INLINE void Vsvesq(const float* source_p,
int source_stride,
float* sum_p,
size_t frames_to_process) {
Scalar::Vsvesq(source_p, source_stride, sum_p, frames_to_process);
}
static ALWAYS_INLINE void Zvmul(const float* real1p,
const float* imag1p,
const float* real2p,
const float* imag2p,
float* real_dest_p,
float* imag_dest_p,
size_t frames_to_process) {
Scalar::Zvmul(real1p, imag1p, real2p, imag2p, real_dest_p, imag_dest_p,
frames_to_process);
}
} // namespace MSA
} // namespace VectorMath
} // namespace blink
...
...@@ -6,11 +6,14 @@
#define VectorMathX86_h
#include "base/cpu.h"
#include "platform/audio/AudioArray.h"
#include "platform/audio/VectorMathScalar.h" #include "platform/audio/VectorMathScalar.h"
#include "platform/audio/cpu/x86/VectorMathAVX.h" #include "platform/audio/cpu/x86/VectorMathAVX.h"
#include "platform/audio/cpu/x86/VectorMathSSE.h" #include "platform/audio/cpu/x86/VectorMathSSE.h"
#include "platform/wtf/Assertions.h" #include "platform/wtf/Assertions.h"
#include <xmmintrin.h>
namespace blink {
namespace VectorMath {
namespace X86 {
...@@ -91,6 +94,70 @@ SplitFramesToProcess(const float* source_p, size_t frames_to_process) {
return counts;
}
static ALWAYS_INLINE void Conv(const float* source_p,
int source_stride,
const float* filter_p,
int filter_stride,
float* dest_p,
int dest_stride,
size_t frames_to_process,
size_t filter_size) {
// Only contiguous convolution is implemented. Correlation (positive
// |filter_stride|) and support for non-contiguous vectors are not
// implemented.
DCHECK_EQ(1, source_stride);
DCHECK_EQ(-1, filter_stride);
DCHECK_EQ(1, dest_stride);
size_t kernel_size = filter_size;
const float* input_p = source_p + kernel_size - 1;
const float* kernel_p = filter_p + 1 - kernel_size;
size_t i = 0;
// Convolution using SSE2. Currently only do this if both |kernel_size| and
// |frames_to_process| are multiples of 4. If not, use Scalar::Conv.
if ((kernel_size % 4 == 0) && (frames_to_process % 4 == 0)) {
// AudioFloatArray's are always aligned on at least a 32-byte boundary.
AudioFloatArray kernel_buffer(4 * kernel_size);
__m128* kernel_reversed = reinterpret_cast<__m128*>(kernel_buffer.Data());
// Reverse the kernel and repeat each value across a vector
for (i = 0; i < kernel_size; ++i) {
kernel_reversed[i] = _mm_set1_ps(kernel_p[kernel_size - i - 1]);
}
const float* input_start_p = input_p - kernel_size + 1;
// Do convolution with 4 inputs at a time.
for (i = 0; i < frames_to_process; i += 4) {
__m128 convolution_sum;
convolution_sum = _mm_setzero_ps();
// |kernel_size| is a multiple of 4 so we can unroll the loop by 4,
// manually.
for (size_t k = 0; k < kernel_size; k += 4) {
size_t data_offset = i + k;
for (size_t m = 0; m < 4; ++m) {
__m128 source_block;
__m128 product;
source_block = _mm_loadu_ps(input_start_p + data_offset + m);
product = _mm_mul_ps(kernel_reversed[k + m], source_block);
convolution_sum = _mm_add_ps(convolution_sum, product);
}
}
_mm_storeu_ps(dest_p + i, convolution_sum);
}
} else {
Scalar::Conv(source_p, source_stride, filter_p, filter_stride, dest_p,
dest_stride, frames_to_process, filter_size);
}
}
static ALWAYS_INLINE void Vadd(const float* source1p,
int source_stride1,
const float* source2p,
...
...@@ -19,6 +19,23 @@ namespace Mac {
// our namespaced function names, so we must handle this case differently. Other
// architectures (64bit, ARM, etc.) do not include this header file.
static ALWAYS_INLINE void Conv(const float* source_p,
int source_stride,
const float* filter_p,
int filter_stride,
float* dest_p,
int dest_stride,
size_t frames_to_process,
size_t filter_size) {
#if defined(ARCH_CPU_X86)
::conv(source_p, source_stride, filter_p, filter_stride, dest_p, dest_stride,
frames_to_process, filter_size);
#else
vDSP_conv(source_p, source_stride, filter_p, filter_stride, dest_p,
dest_stride, frames_to_process, filter_size);
#endif
}
static ALWAYS_INLINE void Vadd(const float* source1p,
int source_stride1,
const float* source2p,
...