Move ARM NEON optimized VectorMath code to its own file

This CL is part of VectorMath code clean up series: [1/4] https://chromium-review.googlesource.com/c/824046 [2/4] https://chromium-review.googlesource.com/c/824047 [3/4] this CL [4/4] https://chromium-review.googlesource.com/c/824049 Bug: 778262 Change-Id: I26f4e6cf672e2739e93cbf2a52dc44d776deec70 Reviewed-on: https://chromium-review.googlesource.com/824048Reviewed-by: Kenneth Russell <kbr@chromium.org> Reviewed-by: Raymond Toy <rtoy@chromium.org> Commit-Queue: Eero Häkkinen <eero.hakkinen@intel.com> Cr-Commit-Position: refs/heads/master@{#524609}

Move ARM NEON optimized VectorMath code to its own file
This CL is part of VectorMath code clean up series: [1/4] https://chromium-review.googlesource.com/c/824046 [2/4] https://chromium-review.googlesource.com/c/824047 [3/4] this CL [4/4] https://chromium-review.googlesource.com/c/824049 Bug: 778262 Change-Id: I26f4e6cf672e2739e93cbf2a52dc44d776deec70 Reviewed-on: https://chromium-review.googlesource.com/824048Reviewed-by: Kenneth Russell <kbr@chromium.org> Reviewed-by: Raymond Toy <rtoy@chromium.org> Commit-Queue: Eero Häkkinen <eero.hakkinen@intel.com> Cr-Commit-Position: refs/heads/master@{#524609}
85162c43 · Eero Häkkinen · Commit Bot · c3df0e35 · 85162c43 · 85162c43
Commit 85162c43 authored Dec 16, 2017 by Eero Häkkinen Committed by Commit Bot Dec 16, 2017
3 changed files
--- a/third_party/WebKit/Source/platform/BUILD.gn
+++ b/third_party/WebKit/Source/platform/BUILD.gn
@@ -16,7 +16,10 @@ import("//third_party/WebKit/Source/platform/platform_generated.gni")
 # Most targets in this file are private actions so use that as the default.
 visibility = [ ":*" ]

-blink_platform_neon_files = [ "graphics/cpu/arm/WebGLImageConversionNEON.h" ]
+blink_platform_neon_files = [
+  "audio/cpu/arm/VectorMathNEON.h",
+  "graphics/cpu/arm/WebGLImageConversionNEON.h",
+]

 blink_platform_msa_files = [ "graphics/cpu/mips/WebGLImageConversionMSA.h" ]

@@ -470,6 +473,7 @@ jumbo_component("platform") {
    "audio/VectorMath.h",
    "audio/VectorMathScalar.h",
    "audio/android/FFTFrameOpenMAXDLAndroid.cpp",
+    "audio/cpu/arm/VectorMathNEON.h",
    "audio/cpu/x86/VectorMathSSE.h",
    "audio/cpu/x86/VectorMathX86.h",
    "audio/ffmpeg/FFTFrameFFMPEG.cpp",

--- a/third_party/WebKit/Source/platform/audio/VectorMath.cpp
+++ b/third_party/WebKit/Source/platform/audio/VectorMath.cpp
@@ -33,16 +33,14 @@

 #if defined(OS_MACOSX)
 #include "platform/audio/mac/VectorMathMac.h"
+#elif WTF_CPU_ARM_NEON
+#include "platform/audio/cpu/arm/VectorMathNEON.h"
 #elif defined(ARCH_CPU_X86_FAMILY)
 #include "platform/audio/cpu/x86/VectorMathX86.h"
 #else
 #include "platform/audio/VectorMathScalar.h"
 #endif

-#if WTF_CPU_ARM_NEON
-#include <arm_neon.h>
-#endif
-
 #if HAVE_MIPS_MSA_INTRINSICS
 #include "platform/cpu/mips/CommonMacrosMSA.h"
 #endif
@@ -56,6 +54,8 @@ namespace VectorMath {
 namespace {
 #if defined(OS_MACOSX)
 namespace Impl = Mac;
+#elif WTF_CPU_ARM_NEON
+namespace Impl = NEON;
 #elif defined(ARCH_CPU_X86_FAMILY)
 namespace Impl = X86;
 #else
@@ -69,28 +69,9 @@ void Vsma(const float* source_p,
          float* dest_p,
          int dest_stride,
          size_t frames_to_process) {
-#if HAVE_MIPS_MSA_INTRINSICS || WTF_CPU_ARM_NEON
+#if HAVE_MIPS_MSA_INTRINSICS
  int n = frames_to_process;

-#if WTF_CPU_ARM_NEON
-  if ((source_stride == 1) && (dest_stride == 1)) {
-    int tail_frames = n % 4;
-    const float* end_p = dest_p + n - tail_frames;
-
-    float32x4_t k = vdupq_n_f32(*scale);
-    while (dest_p < end_p) {
-      float32x4_t source = vld1q_f32(source_p);
-      float32x4_t dest = vld1q_f32(dest_p);
-
-      dest = vmlaq_f32(dest, source, k);
-      vst1q_f32(dest_p, dest);
-
-      source_p += 4;
-      dest_p += 4;
-    }
-    n = tail_frames;
-  }
-#elif HAVE_MIPS_MSA_INTRINSICS
  if ((source_stride == 1) && (dest_stride == 1)) {
    float* destPCopy = dest_p;
    v4f32 vScale;
@@ -111,7 +92,6 @@ void Vsma(const float* source_p,
      ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
    }
  }
-#endif

  frames_to_process = n;
 #endif
@@ -126,25 +106,9 @@ void Vsmul(const float* source_p,
           float* dest_p,
           int dest_stride,
           size_t frames_to_process) {
-#if HAVE_MIPS_MSA_INTRINSICS || WTF_CPU_ARM_NEON
+#if HAVE_MIPS_MSA_INTRINSICS
  int n = frames_to_process;

-#if WTF_CPU_ARM_NEON
-  if ((source_stride == 1) && (dest_stride == 1)) {
-    float k = *scale;
-    int tail_frames = n % 4;
-    const float* end_p = dest_p + n - tail_frames;
-
-    while (dest_p < end_p) {
-      float32x4_t source = vld1q_f32(source_p);
-      vst1q_f32(dest_p, vmulq_n_f32(source, k));
-
-      source_p += 4;
-      dest_p += 4;
-    }
-    n = tail_frames;
-  }
-#elif HAVE_MIPS_MSA_INTRINSICS
  if ((source_stride == 1) && (dest_stride == 1)) {
    v4f32 vScale;
    v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
@@ -162,7 +126,6 @@ void Vsmul(const float* source_p,
      ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
    }
  }
-#endif

  frames_to_process = n;
 #endif
@@ -178,26 +141,9 @@ void Vadd(const float* source1p,
          float* dest_p,
          int dest_stride,
          size_t frames_to_process) {
-#if HAVE_MIPS_MSA_INTRINSICS || WTF_CPU_ARM_NEON
+#if HAVE_MIPS_MSA_INTRINSICS
  int n = frames_to_process;

-#if WTF_CPU_ARM_NEON
-  if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) {
-    int tail_frames = n % 4;
-    const float* end_p = dest_p + n - tail_frames;
-
-    while (dest_p < end_p) {
-      float32x4_t source1 = vld1q_f32(source1p);
-      float32x4_t source2 = vld1q_f32(source2p);
-      vst1q_f32(dest_p, vaddq_f32(source1, source2));
-
-      source1p += 4;
-      source2p += 4;
-      dest_p += 4;
-    }
-    n = tail_frames;
-  }
-#elif HAVE_MIPS_MSA_INTRINSICS
  if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) {
    v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6,
        vSrc1P7;
@@ -217,7 +163,6 @@ void Vadd(const float* source1p,
      ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
    }
  }
-#endif

  frames_to_process = n;
 #endif
@@ -233,26 +178,9 @@ void Vmul(const float* source1p,
          float* dest_p,
          int dest_stride,
          size_t frames_to_process) {
-#if HAVE_MIPS_MSA_INTRINSICS || WTF_CPU_ARM_NEON
+#if HAVE_MIPS_MSA_INTRINSICS
  int n = frames_to_process;

-#if WTF_CPU_ARM_NEON
-  if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) {
-    int tail_frames = n % 4;
-    const float* end_p = dest_p + n - tail_frames;
-
-    while (dest_p < end_p) {
-      float32x4_t source1 = vld1q_f32(source1p);
-      float32x4_t source2 = vld1q_f32(source2p);
-      vst1q_f32(dest_p, vmulq_f32(source1, source2));
-
-      source1p += 4;
-      source2p += 4;
-      dest_p += 4;
-    }
-    n = tail_frames;
-  }
-#elif HAVE_MIPS_MSA_INTRINSICS
  if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) {
    v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6,
        vSrc1P7;
@@ -272,7 +200,6 @@ void Vmul(const float* source1p,
      ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
    }
  }
-#endif

  frames_to_process = n;
 #endif
@@ -288,28 +215,8 @@ void Zvmul(const float* real1p,
           float* real_dest_p,
           float* imag_dest_p,
           size_t frames_to_process) {
-  unsigned i = 0;
-
-#if WTF_CPU_ARM_NEON
-  unsigned end_size = frames_to_process - frames_to_process % 4;
-  while (i < end_size) {
-    float32x4_t real1 = vld1q_f32(real1p + i);
-    float32x4_t real2 = vld1q_f32(real2p + i);
-    float32x4_t imag1 = vld1q_f32(imag1p + i);
-    float32x4_t imag2 = vld1q_f32(imag2p + i);
-
-    float32x4_t real_result = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
-    float32x4_t imag_result = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);
-
-    vst1q_f32(real_dest_p + i, real_result);
-    vst1q_f32(imag_dest_p + i, imag_result);
-
-    i += 4;
-  }
-#endif
-
-  Impl::Zvmul(real1p + i, imag1p + i, real2p + i, imag2p + i, real_dest_p + i,
-              imag_dest_p + i, frames_to_process - i);
+  Impl::Zvmul(real1p, imag1p, real2p, imag2p, real_dest_p, imag_dest_p,
+              frames_to_process);
 }

 void Vsvesq(const float* source_p,
@@ -318,32 +225,6 @@ void Vsvesq(const float* source_p,
            size_t frames_to_process) {
  float sum = 0;

-#if WTF_CPU_ARM_NEON
-  int n = frames_to_process;
-
-  if (source_stride == 1) {
-    int tail_frames = n % 4;
-    const float* end_p = source_p + n - tail_frames;
-
-    float32x4_t four_sum = vdupq_n_f32(0);
-    while (source_p < end_p) {
-      float32x4_t source = vld1q_f32(source_p);
-      four_sum = vmlaq_f32(four_sum, source, source);
-      source_p += 4;
-    }
-    float32x2_t two_sum =
-        vadd_f32(vget_low_f32(four_sum), vget_high_f32(four_sum));
-
-    float group_sum[2];
-    vst1_f32(group_sum, two_sum);
-    sum += group_sum[0] + group_sum[1];
-
-    n = tail_frames;
-  }
-
-  frames_to_process = n;
-#endif
-
  Impl::Vsvesq(source_p, source_stride, &sum, frames_to_process);

  DCHECK(sum_p);
@@ -356,30 +237,9 @@ void Vmaxmgv(const float* source_p,
             size_t frames_to_process) {
  float max = 0;

-#if HAVE_MIPS_MSA_INTRINSICS || WTF_CPU_ARM_NEON
+#if HAVE_MIPS_MSA_INTRINSICS
  int n = frames_to_process;

-#if WTF_CPU_ARM_NEON
-  if (source_stride == 1) {
-    int tail_frames = n % 4;
-    const float* end_p = source_p + n - tail_frames;
-
-    float32x4_t four_max = vdupq_n_f32(0);
-    while (source_p < end_p) {
-      float32x4_t source = vld1q_f32(source_p);
-      four_max = vmaxq_f32(four_max, vabsq_f32(source));
-      source_p += 4;
-    }
-    float32x2_t two_max =
-        vmax_f32(vget_low_f32(four_max), vget_high_f32(four_max));
-
-    float group_max[2];
-    vst1_f32(group_max, two_max);
-    max = std::max(group_max[0], group_max[1]);
-
-    n = tail_frames;
-  }
-#elif HAVE_MIPS_MSA_INTRINSICS
  if (source_stride == 1) {
    v4f32 vMax = {
        0,
@@ -401,7 +261,6 @@ void Vmaxmgv(const float* source_p,
    max = std::max(max, vMax[2]);
    max = std::max(max, vMax[3]);
  }
-#endif

  frames_to_process = n;
 #endif
@@ -431,25 +290,9 @@ void Vclip(const float* source_p,
  DCHECK_LE(low_threshold, high_threshold);
 #endif

-#if HAVE_MIPS_MSA_INTRINSICS || WTF_CPU_ARM_NEON
+#if HAVE_MIPS_MSA_INTRINSICS
  int n = frames_to_process;

-#if WTF_CPU_ARM_NEON
-  if ((source_stride == 1) && (dest_stride == 1)) {
-    int tail_frames = n % 4;
-    const float* end_p = dest_p + n - tail_frames;
-
-    float32x4_t low = vdupq_n_f32(low_threshold);
-    float32x4_t high = vdupq_n_f32(high_threshold);
-    while (dest_p < end_p) {
-      float32x4_t source = vld1q_f32(source_p);
-      vst1q_f32(dest_p, vmaxq_f32(vminq_f32(source, high), low));
-      source_p += 4;
-      dest_p += 4;
-    }
-    n = tail_frames;
-  }
-#elif HAVE_MIPS_MSA_INTRINSICS
  if ((source_stride == 1) && (dest_stride == 1)) {
    v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
    v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
@@ -471,7 +314,6 @@ void Vclip(const float* source_p,
      ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
    }
  }
-#endif

  frames_to_process = n;
 #endif

--- a/third_party/WebKit/Source/platform/audio/cpu/arm/VectorMathNEON.h
+++ b/third_party/WebKit/Source/platform/audio/cpu/arm/VectorMathNEON.h
+// Copyright 2017 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef VectorMathNEON_h
+#define VectorMathNEON_h
+
+#include <arm_neon.h>
+
+#include <algorithm>
+
+#include "platform/audio/VectorMathScalar.h"
+
+namespace blink {
+namespace VectorMath {
+namespace NEON {
+
+static ALWAYS_INLINE void Vadd(const float* source1p,
+                               int source_stride1,
+                               const float* source2p,
+                               int source_stride2,
+                               float* dest_p,
+                               int dest_stride,
+                               size_t frames_to_process) {
+  int n = frames_to_process;
+
+  if (source_stride1 == 1 && source_stride2 == 1 && dest_stride == 1) {
+    int tail_frames = n % 4;
+    const float* end_p = dest_p + n - tail_frames;
+
+    while (dest_p < end_p) {
+      float32x4_t source1 = vld1q_f32(source1p);
+      float32x4_t source2 = vld1q_f32(source2p);
+      vst1q_f32(dest_p, vaddq_f32(source1, source2));
+
+      source1p += 4;
+      source2p += 4;
+      dest_p += 4;
+    }
+    n = tail_frames;
+  }
+
+  Scalar::Vadd(source1p, source_stride1, source2p, source_stride2, dest_p,
+               dest_stride, n);
+}
+
+static ALWAYS_INLINE void Vclip(const float* source_p,
+                                int source_stride,
+                                const float* low_threshold_p,
+                                const float* high_threshold_p,
+                                float* dest_p,
+                                int dest_stride,
+                                size_t frames_to_process) {
+  int n = frames_to_process;
+
+  if (source_stride == 1 && dest_stride == 1) {
+    int tail_frames = n % 4;
+    const float* end_p = dest_p + n - tail_frames;
+
+    float32x4_t low = vdupq_n_f32(*low_threshold_p);
+    float32x4_t high = vdupq_n_f32(*high_threshold_p);
+    while (dest_p < end_p) {
+      float32x4_t source = vld1q_f32(source_p);
+      vst1q_f32(dest_p, vmaxq_f32(vminq_f32(source, high), low));
+      source_p += 4;
+      dest_p += 4;
+    }
+    n = tail_frames;
+  }
+
+  Scalar::Vclip(source_p, source_stride, low_threshold_p, high_threshold_p,
+                dest_p, dest_stride, n);
+}
+
+static ALWAYS_INLINE void Vmaxmgv(const float* source_p,
+                                  int source_stride,
+                                  float* max_p,
+                                  size_t frames_to_process) {
+  int n = frames_to_process;
+
+  if (source_stride == 1) {
+    int tail_frames = n % 4;
+    const float* end_p = source_p + n - tail_frames;
+
+    float32x4_t four_max = vdupq_n_f32(*max_p);
+    while (source_p < end_p) {
+      float32x4_t source = vld1q_f32(source_p);
+      four_max = vmaxq_f32(four_max, vabsq_f32(source));
+      source_p += 4;
+    }
+    float32x2_t two_max =
+        vmax_f32(vget_low_f32(four_max), vget_high_f32(four_max));
+
+    float group_max[2];
+    vst1_f32(group_max, two_max);
+    *max_p = std::max(group_max[0], group_max[1]);
+
+    n = tail_frames;
+  }
+
+  Scalar::Vmaxmgv(source_p, source_stride, max_p, n);
+}
+
+static ALWAYS_INLINE void Vmul(const float* source1p,
+                               int source_stride1,
+                               const float* source2p,
+                               int source_stride2,
+                               float* dest_p,
+                               int dest_stride,
+                               size_t frames_to_process) {
+  int n = frames_to_process;
+
+  if (source_stride1 == 1 && source_stride2 == 1 && dest_stride == 1) {
+    int tail_frames = n % 4;
+    const float* end_p = dest_p + n - tail_frames;
+
+    while (dest_p < end_p) {
+      float32x4_t source1 = vld1q_f32(source1p);
+      float32x4_t source2 = vld1q_f32(source2p);
+      vst1q_f32(dest_p, vmulq_f32(source1, source2));
+
+      source1p += 4;
+      source2p += 4;
+      dest_p += 4;
+    }
+    n = tail_frames;
+  }
+
+  Scalar::Vmul(source1p, source_stride1, source2p, source_stride2, dest_p,
+               dest_stride, n);
+}
+
+static ALWAYS_INLINE void Vsma(const float* source_p,
+                               int source_stride,
+                               const float* scale,
+                               float* dest_p,
+                               int dest_stride,
+                               size_t frames_to_process) {
+  int n = frames_to_process;
+
+  if (source_stride == 1 && dest_stride == 1) {
+    int tail_frames = n % 4;
+    const float* end_p = dest_p + n - tail_frames;
+
+    float32x4_t k = vdupq_n_f32(*scale);
+    while (dest_p < end_p) {
+      float32x4_t source = vld1q_f32(source_p);
+      float32x4_t dest = vld1q_f32(dest_p);
+
+      dest = vmlaq_f32(dest, source, k);
+      vst1q_f32(dest_p, dest);
+
+      source_p += 4;
+      dest_p += 4;
+    }
+    n = tail_frames;
+  }
+
+  Scalar::Vsma(source_p, source_stride, scale, dest_p, dest_stride, n);
+}
+
+static ALWAYS_INLINE void Vsmul(const float* source_p,
+                                int source_stride,
+                                const float* scale,
+                                float* dest_p,
+                                int dest_stride,
+                                size_t frames_to_process) {
+  int n = frames_to_process;
+
+  if (source_stride == 1 && dest_stride == 1) {
+    float k = *scale;
+    int tail_frames = n % 4;
+    const float* end_p = dest_p + n - tail_frames;
+
+    while (dest_p < end_p) {
+      float32x4_t source = vld1q_f32(source_p);
+      vst1q_f32(dest_p, vmulq_n_f32(source, k));
+
+      source_p += 4;
+      dest_p += 4;
+    }
+    n = tail_frames;
+  }
+
+  Scalar::Vsmul(source_p, source_stride, scale, dest_p, dest_stride, n);
+}
+
+static ALWAYS_INLINE void Vsvesq(const float* source_p,
+                                 int source_stride,
+                                 float* sum_p,
+                                 size_t frames_to_process) {
+  int n = frames_to_process;
+
+  if (source_stride == 1) {
+    int tail_frames = n % 4;
+    const float* end_p = source_p + n - tail_frames;
+
+    float32x4_t four_sum = vdupq_n_f32(0);
+    while (source_p < end_p) {
+      float32x4_t source = vld1q_f32(source_p);
+      four_sum = vmlaq_f32(four_sum, source, source);
+      source_p += 4;
+    }
+    float32x2_t two_sum =
+        vadd_f32(vget_low_f32(four_sum), vget_high_f32(four_sum));
+
+    float group_sum[2];
+    vst1_f32(group_sum, two_sum);
+    *sum_p += group_sum[0] + group_sum[1];
+
+    n = tail_frames;
+  }
+
+  Scalar::Vsvesq(source_p, source_stride, sum_p, n);
+}
+
+static ALWAYS_INLINE void Zvmul(const float* real1p,
+                                const float* imag1p,
+                                const float* real2p,
+                                const float* imag2p,
+                                float* real_dest_p,
+                                float* imag_dest_p,
+                                size_t frames_to_process) {
+  unsigned i = 0;
+
+  unsigned end_size = frames_to_process - frames_to_process % 4;
+  while (i < end_size) {
+    float32x4_t real1 = vld1q_f32(real1p + i);
+    float32x4_t real2 = vld1q_f32(real2p + i);
+    float32x4_t imag1 = vld1q_f32(imag1p + i);
+    float32x4_t imag2 = vld1q_f32(imag2p + i);
+
+    float32x4_t real_result = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
+    float32x4_t imag_result = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);
+
+    vst1q_f32(real_dest_p + i, real_result);
+    vst1q_f32(imag_dest_p + i, imag_result);
+
+    i += 4;
+  }
+
+  Scalar::Zvmul(real1p + i, imag1p + i, real2p + i, imag2p + i, real_dest_p + i,
+                imag_dest_p + i, frames_to_process - i);
+}
+
+}  // namespace NEON
+}  // namespace VectorMath
+}  // namespace blink
+
+#endif  // VectorMathNEON_h