Commit 7fc937e7 authored by dalecurtis@google.com's avatar dalecurtis@google.com

Break out SSE functions into new media_sse target.

It turns out Chrome doesn't have a minimum requirement of SSE yet,
so we can't rely on __SSE__ being set at compile time.  To use SSE
code we need to put it in a separate GYP target compiled with -msse.

This patch set does exactly that for vector_fmac::FMAC_SSE() and
SincResampler::Convolve_SSE().  Doing so required some slight
rearrangements of constants for SincResampler.

Given all of our bots should have SSE I've made it a requirement
for passing the tests when run on X86.

BUG=none
TEST=media_unittests

Review URL: https://codereview.chromium.org/12478002

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@186285 0039d316-1c4b-4281-b951-d872f2087c98
parent 0570dd37
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "media/base/sinc_resampler.h"
#include <xmmintrin.h>
namespace media {
float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,
const float* k2,
double kernel_interpolation_factor) {
__m128 m_input;
__m128 m_sums1 = _mm_setzero_ps();
__m128 m_sums2 = _mm_setzero_ps();
// Based on |input_ptr| alignment, we need to use loadu or load. Unrolling
// these loops hurt performance in local testing.
if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {
for (int i = 0; i < kKernelSize; i += 4) {
m_input = _mm_loadu_ps(input_ptr + i);
m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
}
} else {
for (int i = 0; i < kKernelSize; i += 4) {
m_input = _mm_load_ps(input_ptr + i);
m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
}
}
// Linearly interpolate the two "convolutions".
m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor));
m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor));
m_sums1 = _mm_add_ps(m_sums1, m_sums2);
// Sum components together.
float result;
m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
_mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(
m_sums2, m_sums2, 1)));
return result;
}
} // namespace media
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "media/base/vector_math_testing.h"
#include <xmmintrin.h> // NOLINT
namespace media {
namespace vector_math {
void FMAC_SSE(const float src[], float scale, int len, float dest[]) {
const int rem = len % 4;
const int last_index = len - rem;
__m128 m_scale = _mm_set_ps1(scale);
for (int i = 0; i < last_index; i += 4) {
_mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i),
_mm_mul_ps(_mm_load_ps(src + i), m_scale)));
}
// Handle any remaining values that wouldn't fit in an SSE pass.
for (int i = last_index; i < len; ++i)
dest[i] += src[i] * scale;
}
} // namespace vector_math
} // namespace media
......@@ -40,11 +40,6 @@
#include "base/cpu.h"
#include "base/logging.h"
#include "build/build_config.h"
#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
#include <xmmintrin.h>
#endif
#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
#include <arm_neon.h>
......@@ -52,33 +47,6 @@
namespace media {
namespace {
enum {
// The kernel size can be adjusted for quality (higher is better) at the
// expense of performance. Must be a multiple of 32.
// TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
kKernelSize = 32,
// The number of destination frames generated per processing pass. Affects
// how often and for how much SincResampler calls back for input. Must be
// greater than kKernelSize.
kBlockSize = 512,
// The kernel offset count is used for interpolation and is the number of
// sub-sample kernel shifts. Can be adjusted for quality (higher is better)
// at the expense of allocating more memory.
kKernelOffsetCount = 32,
kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),
// The size (in samples) of the internal buffer used by the resampler.
kBufferSize = kBlockSize + kKernelSize
};
} // namespace
const int SincResampler::kMaximumLookAheadSize = kBufferSize;
SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb)
: io_sample_rate_ratio_(io_sample_rate_ratio),
virtual_source_idx_(0),
......@@ -222,7 +190,7 @@ void SincResampler::Resample(float* destination, int frames) {
}
}
int SincResampler::ChunkSize() {
int SincResampler::ChunkSize() const {
return kBlockSize / io_sample_rate_ratio_;
}
......@@ -235,14 +203,23 @@ void SincResampler::Flush() {
float SincResampler::Convolve(const float* input_ptr, const float* k1,
const float* k2,
double kernel_interpolation_factor) {
// Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true
// so long as kKernelSize is a multiple of 16.
DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);
DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);
// Rely on function level static initialization to keep ConvolveProc selection
// thread safe.
typedef float (*ConvolveProc)(const float* src, const float* k1,
const float* k2,
double kernel_interpolation_factor);
#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
#if defined(ARCH_CPU_X86_FAMILY)
#if defined(__SSE__)
static const ConvolveProc kConvolveProc = Convolve_SSE;
#else
static const ConvolveProc kConvolveProc =
base::CPU().has_sse() ? Convolve_SSE : Convolve_C;
#endif
#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
static const ConvolveProc kConvolveProc = Convolve_NEON;
#else
......@@ -271,50 +248,6 @@ float SincResampler::Convolve_C(const float* input_ptr, const float* k1,
+ kernel_interpolation_factor * sum2;
}
#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,
const float* k2,
double kernel_interpolation_factor) {
// Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true
// so long as kKernelSize is a multiple of 16.
DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);
DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);
__m128 m_input;
__m128 m_sums1 = _mm_setzero_ps();
__m128 m_sums2 = _mm_setzero_ps();
// Based on |input_ptr| alignment, we need to use loadu or load. Unrolling
// these loops hurt performance in local testing.
if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {
for (int i = 0; i < kKernelSize; i += 4) {
m_input = _mm_loadu_ps(input_ptr + i);
m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
}
} else {
for (int i = 0; i < kKernelSize; i += 4) {
m_input = _mm_load_ps(input_ptr + i);
m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
}
}
// Linearly interpolate the two "convolutions".
m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor));
m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor));
m_sums1 = _mm_add_ps(m_sums1, m_sums2);
// Sum components together.
float result;
m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
_mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(
m_sums2, m_sums2, 1)));
return result;
}
#endif
#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,
const float* k2,
......
......@@ -9,6 +9,7 @@
#include "base/gtest_prod_util.h"
#include "base/memory/aligned_memory.h"
#include "base/memory/scoped_ptr.h"
#include "build/build_config.h"
#include "media/base/media_export.h"
namespace media {
......@@ -16,9 +17,30 @@ namespace media {
// SincResampler is a high-quality single-channel sample-rate converter.
class MEDIA_EXPORT SincResampler {
public:
// The maximum number of samples that may be requested from the callback ahead
// of the current position in the stream.
static const int kMaximumLookAheadSize;
enum {
// The kernel size can be adjusted for quality (higher is better) at the
// expense of performance. Must be a multiple of 32.
// TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
kKernelSize = 32,
// The number of destination frames generated per processing pass. Affects
// how often and for how much SincResampler calls back for input. Must be
// greater than kKernelSize.
kBlockSize = 512,
// The kernel offset count is used for interpolation and is the number of
// sub-sample kernel shifts. Can be adjusted for quality (higher is better)
// at the expense of allocating more memory.
kKernelOffsetCount = 32,
kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),
// The size (in samples) of the internal buffer used by the resampler.
kBufferSize = kBlockSize + kKernelSize,
// The maximum number of samples that may be requested from the callback
// ahead of the current position in the stream.
kMaximumLookAheadSize = kBufferSize
};
// Callback type for providing more data into the resampler. Expects |frames|
// of data to be rendered into |destination|; zero padded if not enough frames
......@@ -36,7 +58,7 @@ class MEDIA_EXPORT SincResampler {
// The maximum size in frames that guarantees Resample() will only make a
// single call to |read_cb_| for more data.
int ChunkSize();
int ChunkSize() const;
// Flush all buffered data and reset internal indices.
void Flush();
......@@ -55,15 +77,18 @@ class MEDIA_EXPORT SincResampler {
const float* k2, double kernel_interpolation_factor);
static float Convolve_C(const float* input_ptr, const float* k1,
const float* k2, double kernel_interpolation_factor);
#if defined(ARCH_CPU_X86_FAMILY)
static float Convolve_SSE(const float* input_ptr, const float* k1,
const float* k2,
double kernel_interpolation_factor);
#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
static float Convolve_NEON(const float* input_ptr, const float* k1,
const float* k2,
double kernel_interpolation_factor);
#endif
// The ratio of input / output sample rates.
double io_sample_rate_ratio_;
const double io_sample_rate_ratio_;
// An index on the source input buffer with sub-sample precision. It must be
// double precision to avoid drift.
......
......@@ -10,6 +10,7 @@
#include "base/bind.h"
#include "base/bind_helpers.h"
#include "base/command_line.h"
#include "base/cpu.h"
#include "base/logging.h"
#include "base/string_number_conversions.h"
#include "base/strings/stringize_macros.h"
......@@ -98,7 +99,7 @@ TEST(SincResamplerTest, Flush) {
}
// Define platform independent function name for Convolve* tests.
#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
#if defined(ARCH_CPU_X86_FAMILY)
#define CONVOLVE_FUNC Convolve_SSE
#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
#define CONVOLVE_FUNC Convolve_NEON
......@@ -109,6 +110,10 @@ TEST(SincResamplerTest, Flush) {
// will be tested by the parameterized SincResampler tests below.
#if defined(CONVOLVE_FUNC)
TEST(SincResamplerTest, Convolve) {
#if defined(ARCH_CPU_X86_FAMILY)
ASSERT_TRUE(base::CPU().has_sse());
#endif
// Initialize a dummy resampler.
MockSource mock_source;
SincResampler resampler(
......@@ -171,6 +176,10 @@ TEST(SincResamplerTest, ConvolveBenchmark) {
printf("Convolve_C took %.2fms.\n", total_time_c_ms);
#if defined(CONVOLVE_FUNC)
#if defined(ARCH_CPU_X86_FAMILY)
ASSERT_TRUE(base::CPU().has_sse());
#endif
// Benchmark with unaligned input pointer.
start = base::TimeTicks::HighResNow();
for (int j = 0; j < convolve_iterations; ++j) {
......
......@@ -7,11 +7,6 @@
#include "base/cpu.h"
#include "base/logging.h"
#include "build/build_config.h"
#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
#include <xmmintrin.h>
#endif
namespace media {
namespace vector_math {
......@@ -25,9 +20,13 @@ void FMAC(const float src[], float scale, int len, float dest[]) {
// selection thread safe.
typedef void (*VectorFMACProc)(const float src[], float scale, int len,
float dest[]);
#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
#if defined(ARCH_CPU_X86_FAMILY)
#if defined(__SSE__)
static const VectorFMACProc kVectorFMACProc = FMAC_SSE;
#else
static const VectorFMACProc kVectorFMACProc =
base::CPU().has_sse() ? FMAC_SSE : FMAC_C;
#endif
#else
static const VectorFMACProc kVectorFMACProc = FMAC_C;
#endif
......@@ -40,20 +39,5 @@ void FMAC_C(const float src[], float scale, int len, float dest[]) {
dest[i] += src[i] * scale;
}
#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
void FMAC_SSE(const float src[], float scale, int len, float dest[]) {
__m128 m_scale = _mm_set_ps1(scale);
int rem = len % 4;
for (int i = 0; i < len - rem; i += 4) {
_mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i),
_mm_mul_ps(_mm_load_ps(src + i), m_scale)));
}
// Handle any remaining values that wouldn't fit in an SSE pass.
if (rem)
FMAC_C(src + len - rem, scale, rem, dest + len - rem);
}
#endif
} // namespace vector_math
} // namespace media
......@@ -5,6 +5,7 @@
#ifndef MEDIA_BASE_VECTOR_MATH_TESTING_H_
#define MEDIA_BASE_VECTOR_MATH_TESTING_H_
#include "build/build_config.h"
#include "media/base/media_export.h"
namespace media {
......@@ -13,8 +14,11 @@ namespace vector_math {
// Optimized versions of FMAC() function exposed for testing. See vector_math.h
// for details.
MEDIA_EXPORT void FMAC_C(const float src[], float scale, int len, float dest[]);
#if defined(ARCH_CPU_X86_FAMILY)
MEDIA_EXPORT void FMAC_SSE(const float src[], float scale, int len,
float dest[]);
#endif
} // namespace vector_math
} // namespace media
......
......@@ -7,6 +7,7 @@
#include <cmath>
#include "base/command_line.h"
#include "base/cpu.h"
#include "base/memory/aligned_memory.h"
#include "base/memory/scoped_ptr.h"
#include "base/string_number_conversions.h"
......@@ -90,8 +91,9 @@ TEST_F(VectorMathTest, FMAC) {
VerifyOutput(kResult);
}
#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
#if defined(ARCH_CPU_X86_FAMILY)
{
ASSERT_TRUE(base::CPU().has_sse());
SCOPED_TRACE("FMAC_SSE");
FillTestVectors(kInputFillValue, kOutputFillValue);
vector_math::FMAC_SSE(
......@@ -118,7 +120,9 @@ TEST_F(VectorMathTest, FMACBenchmark) {
double total_time_c_ms = (TimeTicks::HighResNow() - start).InMillisecondsF();
printf("FMAC_C took %.2fms.\n", total_time_c_ms);
#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
#if defined(ARCH_CPU_X86_FAMILY)
ASSERT_TRUE(base::CPU().has_sse());
// Benchmark FMAC_SSE() with unaligned size.
ASSERT_NE((kVectorSize - 1) % (vector_math::kRequiredAlignment /
sizeof(float)), 0U);
......
......@@ -683,7 +683,7 @@
'message': 'Generating Pulse stubs for dynamic loading.',
},
],
'conditions': [
'conditions': [
# Linux/Solaris need libdl for dlopen() and friends.
['OS == "linux" or OS == "solaris"', {
'link_settings': {
......@@ -811,6 +811,12 @@
'../build/linux/system.gyp:gtk',
],
}],
# ios check is necessary due to http://crbug.com/172682.
['OS != "ios" and (target_arch == "ia32" or target_arch == "x64")', {
'dependencies': [
'media_sse',
],
}],
],
'target_conditions': [
['OS == "ios"', {
......@@ -1018,12 +1024,15 @@
'audio/audio_low_latency_input_output_unittest.cc',
],
}],
[ 'target_arch=="ia32" or target_arch=="x64"', {
['OS != "ios" and (target_arch=="ia32" or target_arch=="x64")', {
'sources': [
'base/simd/convert_rgb_to_yuv_unittest.cc',
],
'dependencies': [
'media_sse',
],
}],
[ 'screen_capture_supported == 0', {
['screen_capture_supported == 0', {
'sources/': [
['exclude', '^video/capture/screen/'],
],
......@@ -1610,5 +1619,27 @@
}, # end of target differ_block_sse2
],
}],
# ios check is necessary due to http://crbug.com/172682.
['OS != "ios" and (target_arch=="ia32" or target_arch=="x64")', {
'targets': [
{
'target_name': 'media_sse',
'type': 'static_library',
'cflags': [
'-msse',
],
'include_dirs': [
'..',
],
'defines': [
'MEDIA_IMPLEMENTATION',
],
'sources': [
'base/simd/sinc_resampler_sse.cc',
'base/simd/vector_math_sse.cc',
],
}, # end of target media_sse
],
}],
],
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment