Optimize blink::VectorMath::Vclip for SSE

The performance gain on an Intel Broadwell CPU is about 8-fold, and on Intel Skylake and newer CPUs the SSE throughput should be further doubled.

Bug: 778262
Change-Id: If4682a4d95a6729626049a38a4b1d5b1ebf830bd
Reviewed-on: https://chromium-review.googlesource.com/738243
Commit-Queue: Eero Häkkinen <eero.hakkinen@intel.com>
Reviewed-by: Raymond Toy <rtoy@chromium.org>
Cr-Commit-Position: refs/heads/master@{#520973}

Optimize blink::VectorMath::Vclip for SSE
The performance gain on an Intel Broadwell CPU is about 8-fold, and on Intel Skylake and newer CPUs the SSE throughput should be further doubled.

Bug: 778262
Change-Id: If4682a4d95a6729626049a38a4b1d5b1ebf830bd
Reviewed-on: https://chromium-review.googlesource.com/738243
Commit-Queue: Eero Häkkinen <eero.hakkinen@intel.com>
Reviewed-by: Raymond Toy <rtoy@chromium.org>
Cr-Commit-Position: refs/heads/master@{#520973}
a4b0dbe6 · Eero Häkkinen · Commit Bot · 2fd83667 · a4b0dbe6 · a4b0dbe6
Commit a4b0dbe6 authored Dec 01, 2017 by Eero Häkkinen Committed by Commit Bot Dec 01, 2017
2 changed files
--- a/third_party/WebKit/Source/platform/audio/VectorMath.cpp
+++ b/third_party/WebKit/Source/platform/audio/VectorMath.cpp
@@ -683,8 +683,38 @@ void Vclip(const float* source_p,
  float low_threshold = *low_threshold_p;
  float high_threshold = *high_threshold_p;

-// FIXME: Optimize for SSE2.
-#if WTF_CPU_ARM_NEON
+#if DCHECK_IS_ON()
+  // Do the same DCHECKs that |clampTo| would do so that optimization paths do
+  // not have to do them.
+  for (size_t i = 0u; i < frames_to_process; ++i)
+    DCHECK(!std::isnan(source_p[i]));
+  // This also ensures that thresholds are not NaNs.
+  DCHECK_LE(low_threshold, high_threshold);
+#endif
+
+#if defined(ARCH_CPU_X86_FAMILY)
+  if (source_stride == 1 && dest_stride == 1) {
+    size_t i = 0u;
+
+    // If the source_p address is not 16-byte aligned, the first several
+    // frames (at most three) should be processed separately.
+    for (; !SSE::IsAligned(source_p + i) && i < frames_to_process; ++i)
+      dest_p[i] = clampTo(source_p[i], low_threshold, high_threshold);
+
+    // Now the source_p+i address is 16-byte aligned. Start to apply SSE.
+    size_t sse_frames_to_process =
+        (frames_to_process - i) & SSE::kFramesToProcessMask;
+    if (sse_frames_to_process > 0u) {
+      SSE::Vclip(source_p + i, &low_threshold, &high_threshold, dest_p + i,
+                 sse_frames_to_process);
+      i += sse_frames_to_process;
+    }
+
+    source_p += i;
+    dest_p += i;
+    n -= i;
+  }
+#elif WTF_CPU_ARM_NEON
  if ((source_stride == 1) && (dest_stride == 1)) {
    int tail_frames = n % 4;
    const float* end_p = dest_p + n - tail_frames;

--- a/third_party/WebKit/Source/platform/audio/cpu/x86/VectorMathImpl.cpp
+++ b/third_party/WebKit/Source/platform/audio/cpu/x86/VectorMathImpl.cpp
@@ -9,6 +9,7 @@
 #include "platform/wtf/Assertions.h"

 #include <algorithm>
+#include <cmath>

 #include <xmmintrin.h>

@@ -72,6 +73,40 @@ void Vadd(const float* source1p,
 #undef ADD_ALL
 }

+// dest[k] = clip(source[k], low_threshold, high_threshold)
+//         = max(low_threshold, min(high_threshold, source[k]))
+void Vclip(const float* source_p,
+           const float* low_threshold_p,
+           const float* high_threshold_p,
+           float* dest_p,
+           size_t frames_to_process) {
+  const float* const source_end_p = source_p + frames_to_process;
+  // Preconditions: aligned source and a whole number of packed registers.
+  DCHECK(IsAligned(source_p));
+  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
+  // Each threshold is dereferenced once here, outside the loop.
+  MType m_low_threshold = MM_PS(set1)(*low_threshold_p);
+  MType m_high_threshold = MM_PS(set1)(*high_threshold_p);
+  // CLIP_ALL clips one register per pass and stores it with |storeDest|.
+#define CLIP_ALL(storeDest)                                                  \
+  while (source_p < source_end_p) {                                          \
+    MType m_source = MM_PS(load)(source_p);                                  \
+    MType m_dest =                                                           \
+        MM_PS(max)(m_low_threshold, MM_PS(min)(m_high_threshold, m_source)); \
+    MM_PS(storeDest)(dest_p, m_dest);                                        \
+    source_p += kPackedFloatsPerRegister;                                    \
+    dest_p += kPackedFloatsPerRegister;                                      \
+  }
+  // Only |dest_p| is checked at run time, to pick between store and storeu.
+  if (IsAligned(dest_p)) {
+    CLIP_ALL(store);
+  } else {
+    CLIP_ALL(storeu);
+  }
+
+#undef CLIP_ALL
+}
+
 // max = max(abs(source[k])) for all k
 void Vmaxmgv(const float* source_p, float* max_p, size_t frames_to_process) {
  constexpr uint32_t kMask = 0x7FFFFFFFu;