Break out SSE functions into new media_sse target.

It turns out Chrome doesn't have a minimum requirement of SSE yet, so we can't rely on __SSE__ being set at compile time. To use SSE code we need to put it in a separate GYP target compiled with -msse. This patch set does exactly that for vector_fmac::FMAC_SSE() and SincResampler::Convolve_SSE(). Doing so required some slight rearrangements of constants for SincResampler. Given all of our bots should have SSE I've made it a requirement for passing the tests when run on X86. BUG=none TEST=media_unittests Review URL: https://codereview.chromium.org/12478002 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@186285 0039d316-1c4b-4281-b951-d872f2087c98

Break out SSE functions into new media_sse target.
It turns out Chrome doesn't have a minimum requirement of SSE yet, so we can't rely on __SSE__ being set at compile time. To use SSE code we need to put it in a separate GYP target compiled with -msse. This patch set does exactly that for vector_fmac::FMAC_SSE() and SincResampler::Convolve_SSE(). Doing so required some slight rearrangements of constants for SincResampler. Given all of our bots should have SSE I've made it a requirement for passing the tests when run on X86. BUG=none TEST=media_unittests Review URL: https://codereview.chromium.org/12478002 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@186285 0039d316-1c4b-4281-b951-d872f2087c98
7fc937e7 · dalecurtis@google.com · 0570dd37 · 7fc937e7 · 7fc937e7 · 7fc937e7
Commit 7fc937e7 authored Mar 06, 2013 by dalecurtis@google.com
9 changed files
--- a/media/base/simd/sinc_resampler_sse.cc
+++ b/media/base/simd/sinc_resampler_sse.cc
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "media/base/sinc_resampler.h"
+
+#include <xmmintrin.h>
+
+namespace media {
+
+float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,
+                                  const float* k2,
+                                  double kernel_interpolation_factor) {
+  __m128 m_input;
+  __m128 m_sums1 = _mm_setzero_ps();
+  __m128 m_sums2 = _mm_setzero_ps();
+
+  // Based on |input_ptr| alignment, we need to use loadu or load.  Unrolling
+  // these loops hurt performance in local testing.
+  if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {
+    for (int i = 0; i < kKernelSize; i += 4) {
+      m_input = _mm_loadu_ps(input_ptr + i);
+      m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
+      m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
+    }
+  } else {
+    for (int i = 0; i < kKernelSize; i += 4) {
+      m_input = _mm_load_ps(input_ptr + i);
+      m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
+      m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
+    }
+  }
+
+  // Linearly interpolate the two "convolutions".
+  m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor));
+  m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor));
+  m_sums1 = _mm_add_ps(m_sums1, m_sums2);
+
+  // Sum components together.
+  float result;
+  m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
+  _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(
+      m_sums2, m_sums2, 1)));
+
+  return result;
+}
+
+}  // namespace media
--- a/media/base/simd/vector_math_sse.cc
+++ b/media/base/simd/vector_math_sse.cc
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "media/base/vector_math_testing.h"
+
+#include <xmmintrin.h>  // NOLINT
+
+namespace media {
+namespace vector_math {
+
+void FMAC_SSE(const float src[], float scale, int len, float dest[]) {
+  const int rem = len % 4;
+  const int last_index = len - rem;
+  __m128 m_scale = _mm_set_ps1(scale);
+  for (int i = 0; i < last_index; i += 4) {
+    _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i),
+                 _mm_mul_ps(_mm_load_ps(src + i), m_scale)));
+  }
+
+  // Handle any remaining values that wouldn't fit in an SSE pass.
+  for (int i = last_index; i < len; ++i)
+    dest[i] += src[i] * scale;
+}
+
+}  // namespace vector_math
+}  // namespace media
--- a/media/base/sinc_resampler.cc
+++ b/media/base/sinc_resampler.cc
@@ -40,11 +40,6 @@

 #include "base/cpu.h"
 #include "base/logging.h"
-#include "build/build_config.h"
-
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
-#include <xmmintrin.h>
-#endif

 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
 #include <arm_neon.h>
@@ -52,33 +47,6 @@

 namespace media {

-namespace {
-
-enum {
-  // The kernel size can be adjusted for quality (higher is better) at the
-  // expense of performance.  Must be a multiple of 32.
-  // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
-  kKernelSize = 32,
-
-  // The number of destination frames generated per processing pass.  Affects
-  // how often and for how much SincResampler calls back for input.  Must be
-  // greater than kKernelSize.
-  kBlockSize = 512,
-
-  // The kernel offset count is used for interpolation and is the number of
-  // sub-sample kernel shifts.  Can be adjusted for quality (higher is better)
-  // at the expense of allocating more memory.
-  kKernelOffsetCount = 32,
-  kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),
-
-  // The size (in samples) of the internal buffer used by the resampler.
-  kBufferSize = kBlockSize + kKernelSize
-};
-
-}  // namespace
-
-const int SincResampler::kMaximumLookAheadSize = kBufferSize;
-
 SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb)
    : io_sample_rate_ratio_(io_sample_rate_ratio),
      virtual_source_idx_(0),
@@ -222,7 +190,7 @@ void SincResampler::Resample(float* destination, int frames) {
  }
 }

-int SincResampler::ChunkSize() {
+int SincResampler::ChunkSize() const {
  return kBlockSize / io_sample_rate_ratio_;
 }

@@ -235,14 +203,23 @@ void SincResampler::Flush() {
 float SincResampler::Convolve(const float* input_ptr, const float* k1,
                              const float* k2,
                              double kernel_interpolation_factor) {
+  // Ensure |k1|, |k2| are 16-byte aligned for SSE usage.  Should always be true
+  // so long as kKernelSize is a multiple of 16.
+  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);
+  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);
+
  // Rely on function level static initialization to keep ConvolveProc selection
  // thread safe.
  typedef float (*ConvolveProc)(const float* src, const float* k1,
                                const float* k2,
                                double kernel_interpolation_factor);
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#if defined(ARCH_CPU_X86_FAMILY)
+#if defined(__SSE__)
+  static const ConvolveProc kConvolveProc = Convolve_SSE;
+#else
  static const ConvolveProc kConvolveProc =
      base::CPU().has_sse() ? Convolve_SSE : Convolve_C;
+#endif
 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
  static const ConvolveProc kConvolveProc = Convolve_NEON;
 #else
@@ -271,50 +248,6 @@ float SincResampler::Convolve_C(const float* input_ptr, const float* k1,
      + kernel_interpolation_factor * sum2;
 }

-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
-float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,
-                                  const float* k2,
-                                  double kernel_interpolation_factor) {
-  // Ensure |k1|, |k2| are 16-byte aligned for SSE usage.  Should always be true
-  // so long as kKernelSize is a multiple of 16.
-  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);
-  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);
-
-  __m128 m_input;
-  __m128 m_sums1 = _mm_setzero_ps();
-  __m128 m_sums2 = _mm_setzero_ps();
-
-  // Based on |input_ptr| alignment, we need to use loadu or load.  Unrolling
-  // these loops hurt performance in local testing.
-  if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {
-    for (int i = 0; i < kKernelSize; i += 4) {
-      m_input = _mm_loadu_ps(input_ptr + i);
-      m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
-      m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
-    }
-  } else {
-    for (int i = 0; i < kKernelSize; i += 4) {
-      m_input = _mm_load_ps(input_ptr + i);
-      m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
-      m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
-    }
-  }
-
-  // Linearly interpolate the two "convolutions".
-  m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor));
-  m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor));
-  m_sums1 = _mm_add_ps(m_sums1, m_sums2);
-
-  // Sum components together.
-  float result;
-  m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
-  _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(
-      m_sums2, m_sums2, 1)));
-
-  return result;
-}
-#endif
-
 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,
                                   const float* k2,

--- a/media/base/sinc_resampler.h
+++ b/media/base/sinc_resampler.h
@@ -9,6 +9,7 @@
 #include "base/gtest_prod_util.h"
 #include "base/memory/aligned_memory.h"
 #include "base/memory/scoped_ptr.h"
+#include "build/build_config.h"
 #include "media/base/media_export.h"

 namespace media {
@@ -16,9 +17,30 @@ namespace media {
 // SincResampler is a high-quality single-channel sample-rate converter.
 class MEDIA_EXPORT SincResampler {
 public:
-  // The maximum number of samples that may be requested from the callback ahead
-  // of the current position in the stream.
-  static const int kMaximumLookAheadSize;
+  enum {
+    // The kernel size can be adjusted for quality (higher is better) at the
+    // expense of performance.  Must be a multiple of 32.
+    // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
+    kKernelSize = 32,
+
+    // The number of destination frames generated per processing pass.  Affects
+    // how often and for how much SincResampler calls back for input.  Must be
+    // greater than kKernelSize.
+    kBlockSize = 512,
+
+    // The kernel offset count is used for interpolation and is the number of
+    // sub-sample kernel shifts.  Can be adjusted for quality (higher is better)
+    // at the expense of allocating more memory.
+    kKernelOffsetCount = 32,
+    kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),
+
+    // The size (in samples) of the internal buffer used by the resampler.
+    kBufferSize = kBlockSize + kKernelSize,
+
+    // The maximum number of samples that may be requested from the callback
+    // ahead of the current position in the stream.
+    kMaximumLookAheadSize = kBufferSize
+  };

  // Callback type for providing more data into the resampler.  Expects |frames|
  // of data to be rendered into |destination|; zero padded if not enough frames
@@ -36,7 +58,7 @@ class MEDIA_EXPORT SincResampler {

  // The maximum size in frames that guarantees Resample() will only make a
  // single call to |read_cb_| for more data.
-  int ChunkSize();
+  int ChunkSize() const;

  // Flush all buffered data and reset internal indices.
  void Flush();
@@ -55,15 +77,18 @@ class MEDIA_EXPORT SincResampler {
                        const float* k2, double kernel_interpolation_factor);
  static float Convolve_C(const float* input_ptr, const float* k1,
                          const float* k2, double kernel_interpolation_factor);
+#if defined(ARCH_CPU_X86_FAMILY)
  static float Convolve_SSE(const float* input_ptr, const float* k1,
                            const float* k2,
                            double kernel_interpolation_factor);
+#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
  static float Convolve_NEON(const float* input_ptr, const float* k1,
                             const float* k2,
                             double kernel_interpolation_factor);
+#endif

  // The ratio of input / output sample rates.
-  double io_sample_rate_ratio_;
+  const double io_sample_rate_ratio_;

  // An index on the source input buffer with sub-sample precision.  It must be
  // double precision to avoid drift.

--- a/media/base/sinc_resampler_unittest.cc
+++ b/media/base/sinc_resampler_unittest.cc
@@ -10,6 +10,7 @@
 #include "base/bind.h"
 #include "base/bind_helpers.h"
 #include "base/command_line.h"
+#include "base/cpu.h"
 #include "base/logging.h"
 #include "base/string_number_conversions.h"
 #include "base/strings/stringize_macros.h"
@@ -98,7 +99,7 @@ TEST(SincResamplerTest, Flush) {
 }

 // Define platform independent function name for Convolve* tests.
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#if defined(ARCH_CPU_X86_FAMILY)
 #define CONVOLVE_FUNC Convolve_SSE
 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
 #define CONVOLVE_FUNC Convolve_NEON
@@ -109,6 +110,10 @@ TEST(SincResamplerTest, Flush) {
 // will be tested by the parameterized SincResampler tests below.
 #if defined(CONVOLVE_FUNC)
 TEST(SincResamplerTest, Convolve) {
+#if defined(ARCH_CPU_X86_FAMILY)
+  ASSERT_TRUE(base::CPU().has_sse());
+#endif
+
  // Initialize a dummy resampler.
  MockSource mock_source;
  SincResampler resampler(
@@ -171,6 +176,10 @@ TEST(SincResamplerTest, ConvolveBenchmark) {
  printf("Convolve_C took %.2fms.\n", total_time_c_ms);

 #if defined(CONVOLVE_FUNC)
+#if defined(ARCH_CPU_X86_FAMILY)
+  ASSERT_TRUE(base::CPU().has_sse());
+#endif
+
  // Benchmark with unaligned input pointer.
  start = base::TimeTicks::HighResNow();
  for (int j = 0; j < convolve_iterations; ++j) {

--- a/media/base/vector_math.cc
+++ b/media/base/vector_math.cc
@@ -7,11 +7,6 @@

 #include "base/cpu.h"
 #include "base/logging.h"
-#include "build/build_config.h"
-
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
-#include <xmmintrin.h>
-#endif

 namespace media {
 namespace vector_math {
@@ -25,9 +20,13 @@ void FMAC(const float src[], float scale, int len, float dest[]) {
  // selection thread safe.
  typedef void (*VectorFMACProc)(const float src[], float scale, int len,
                                 float dest[]);
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#if defined(ARCH_CPU_X86_FAMILY)
+#if defined(__SSE__)
+  static const VectorFMACProc kVectorFMACProc = FMAC_SSE;
+#else
  static const VectorFMACProc kVectorFMACProc =
      base::CPU().has_sse() ? FMAC_SSE : FMAC_C;
+#endif
 #else
  static const VectorFMACProc kVectorFMACProc = FMAC_C;
 #endif
@@ -40,20 +39,5 @@ void FMAC_C(const float src[], float scale, int len, float dest[]) {
    dest[i] += src[i] * scale;
 }

-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
-void FMAC_SSE(const float src[], float scale, int len, float dest[]) {
-  __m128 m_scale = _mm_set_ps1(scale);
-  int rem = len % 4;
-  for (int i = 0; i < len - rem; i += 4) {
-    _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i),
-                 _mm_mul_ps(_mm_load_ps(src + i), m_scale)));
-  }
-
-  // Handle any remaining values that wouldn't fit in an SSE pass.
-  if (rem)
-    FMAC_C(src + len - rem, scale, rem, dest + len - rem);
-}
-#endif
-
 }  // namespace vector_math
 }  // namespace media
--- a/media/base/vector_math_testing.h
+++ b/media/base/vector_math_testing.h
@@ -5,6 +5,7 @@
 #ifndef MEDIA_BASE_VECTOR_MATH_TESTING_H_
 #define MEDIA_BASE_VECTOR_MATH_TESTING_H_

+#include "build/build_config.h"
 #include "media/base/media_export.h"

 namespace media {
@@ -13,8 +14,11 @@ namespace vector_math {
 // Optimized versions of FMAC() function exposed for testing.  See vector_math.h
 // for details.
 MEDIA_EXPORT void FMAC_C(const float src[], float scale, int len, float dest[]);
+
+#if defined(ARCH_CPU_X86_FAMILY)
 MEDIA_EXPORT void FMAC_SSE(const float src[], float scale, int len,
                           float dest[]);
+#endif

 }  // namespace vector_math
 }  // namespace media

--- a/media/base/vector_math_unittest.cc
+++ b/media/base/vector_math_unittest.cc
@@ -7,6 +7,7 @@
 #include <cmath>

 #include "base/command_line.h"
+#include "base/cpu.h"
 #include "base/memory/aligned_memory.h"
 #include "base/memory/scoped_ptr.h"
 #include "base/string_number_conversions.h"
@@ -90,8 +91,9 @@ TEST_F(VectorMathTest, FMAC) {
    VerifyOutput(kResult);
  }

-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#if defined(ARCH_CPU_X86_FAMILY)
  {
+    ASSERT_TRUE(base::CPU().has_sse());
    SCOPED_TRACE("FMAC_SSE");
    FillTestVectors(kInputFillValue, kOutputFillValue);
    vector_math::FMAC_SSE(
@@ -118,7 +120,9 @@ TEST_F(VectorMathTest, FMACBenchmark) {
  double total_time_c_ms = (TimeTicks::HighResNow() - start).InMillisecondsF();
  printf("FMAC_C took %.2fms.\n", total_time_c_ms);

-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#if defined(ARCH_CPU_X86_FAMILY)
+  ASSERT_TRUE(base::CPU().has_sse());
+
  // Benchmark FMAC_SSE() with unaligned size.
  ASSERT_NE((kVectorSize - 1) % (vector_math::kRequiredAlignment /
            sizeof(float)), 0U);

--- a/media/media.gyp
+++ b/media/media.gyp
@@ -683,7 +683,7 @@
                  'message': 'Generating Pulse stubs for dynamic loading.',
                },
              ],
-              'conditions': [ 
+              'conditions': [
                # Linux/Solaris need libdl for dlopen() and friends.
                ['OS == "linux" or OS == "solaris"', {
                  'link_settings': {
@@ -811,6 +811,12 @@
            '../build/linux/system.gyp:gtk',
          ],
        }],
+        # ios check is necessary due to http://crbug.com/172682.
+        ['OS != "ios" and (target_arch == "ia32" or target_arch == "x64")', {
+          'dependencies': [
+            'media_sse',
+          ],
+        }],
      ],
      'target_conditions': [
        ['OS == "ios"', {
@@ -1018,12 +1024,15 @@
            'audio/audio_low_latency_input_output_unittest.cc',
          ],
        }],
-        [ 'target_arch=="ia32" or target_arch=="x64"', {
+        ['OS != "ios" and (target_arch=="ia32" or target_arch=="x64")', {
          'sources': [
            'base/simd/convert_rgb_to_yuv_unittest.cc',
          ],
+          'dependencies': [
+            'media_sse',
+          ],
        }],
-        [ 'screen_capture_supported == 0', {
+        ['screen_capture_supported == 0', {
          'sources/': [
            ['exclude', '^video/capture/screen/'],
          ],
@@ -1610,5 +1619,27 @@
        }, # end of target differ_block_sse2
      ],
    }],
+    # ios check is necessary due to http://crbug.com/172682.
+    ['OS != "ios" and (target_arch=="ia32" or target_arch=="x64")', {
+      'targets': [
+        {
+          'target_name': 'media_sse',
+          'type': 'static_library',
+          'cflags': [
+            '-msse',
+          ],
+          'include_dirs': [
+            '..',
+          ],
+          'defines': [
+            'MEDIA_IMPLEMENTATION',
+          ],
+          'sources': [
+            'base/simd/sinc_resampler_sse.cc',
+            'base/simd/vector_math_sse.cc',
+          ],
+        }, # end of target media_sse
+      ],
+    }],
  ],
 }