Commit 85162c43 authored by Eero Häkkinen's avatar Eero Häkkinen Committed by Commit Bot

Move ARM NEON optimized VectorMath code to its own file

This CL is part of VectorMath code clean up series:
[1/4] https://chromium-review.googlesource.com/c/824046
[2/4] https://chromium-review.googlesource.com/c/824047
[3/4] this CL
[4/4] https://chromium-review.googlesource.com/c/824049

Bug: 778262
Change-Id: I26f4e6cf672e2739e93cbf2a52dc44d776deec70
Reviewed-on: https://chromium-review.googlesource.com/824048Reviewed-by: default avatarKenneth Russell <kbr@chromium.org>
Reviewed-by: default avatarRaymond Toy <rtoy@chromium.org>
Commit-Queue: Eero Häkkinen <eero.hakkinen@intel.com>
Cr-Commit-Position: refs/heads/master@{#524609}
parent c3df0e35
......@@ -16,7 +16,10 @@ import("//third_party/WebKit/Source/platform/platform_generated.gni")
# Most targets in this file are private actions so use that as the default.
visibility = [ ":*" ]
blink_platform_neon_files = [ "graphics/cpu/arm/WebGLImageConversionNEON.h" ]
blink_platform_neon_files = [
"audio/cpu/arm/VectorMathNEON.h",
"graphics/cpu/arm/WebGLImageConversionNEON.h",
]
blink_platform_msa_files = [ "graphics/cpu/mips/WebGLImageConversionMSA.h" ]
......@@ -470,6 +473,7 @@ jumbo_component("platform") {
"audio/VectorMath.h",
"audio/VectorMathScalar.h",
"audio/android/FFTFrameOpenMAXDLAndroid.cpp",
"audio/cpu/arm/VectorMathNEON.h",
"audio/cpu/x86/VectorMathSSE.h",
"audio/cpu/x86/VectorMathX86.h",
"audio/ffmpeg/FFTFrameFFMPEG.cpp",
......
......@@ -33,16 +33,14 @@
#if defined(OS_MACOSX)
#include "platform/audio/mac/VectorMathMac.h"
#elif WTF_CPU_ARM_NEON
#include "platform/audio/cpu/arm/VectorMathNEON.h"
#elif defined(ARCH_CPU_X86_FAMILY)
#include "platform/audio/cpu/x86/VectorMathX86.h"
#else
#include "platform/audio/VectorMathScalar.h"
#endif
#if WTF_CPU_ARM_NEON
#include <arm_neon.h>
#endif
#if HAVE_MIPS_MSA_INTRINSICS
#include "platform/cpu/mips/CommonMacrosMSA.h"
#endif
......@@ -56,6 +54,8 @@ namespace VectorMath {
namespace {
#if defined(OS_MACOSX)
namespace Impl = Mac;
#elif WTF_CPU_ARM_NEON
namespace Impl = NEON;
#elif defined(ARCH_CPU_X86_FAMILY)
namespace Impl = X86;
#else
......@@ -69,28 +69,9 @@ void Vsma(const float* source_p,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
#if HAVE_MIPS_MSA_INTRINSICS || WTF_CPU_ARM_NEON
#if HAVE_MIPS_MSA_INTRINSICS
int n = frames_to_process;
#if WTF_CPU_ARM_NEON
if ((source_stride == 1) && (dest_stride == 1)) {
int tail_frames = n % 4;
const float* end_p = dest_p + n - tail_frames;
float32x4_t k = vdupq_n_f32(*scale);
while (dest_p < end_p) {
float32x4_t source = vld1q_f32(source_p);
float32x4_t dest = vld1q_f32(dest_p);
dest = vmlaq_f32(dest, source, k);
vst1q_f32(dest_p, dest);
source_p += 4;
dest_p += 4;
}
n = tail_frames;
}
#elif HAVE_MIPS_MSA_INTRINSICS
if ((source_stride == 1) && (dest_stride == 1)) {
float* destPCopy = dest_p;
v4f32 vScale;
......@@ -111,7 +92,6 @@ void Vsma(const float* source_p,
ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
}
}
#endif
frames_to_process = n;
#endif
......@@ -126,25 +106,9 @@ void Vsmul(const float* source_p,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
#if HAVE_MIPS_MSA_INTRINSICS || WTF_CPU_ARM_NEON
#if HAVE_MIPS_MSA_INTRINSICS
int n = frames_to_process;
#if WTF_CPU_ARM_NEON
if ((source_stride == 1) && (dest_stride == 1)) {
float k = *scale;
int tail_frames = n % 4;
const float* end_p = dest_p + n - tail_frames;
while (dest_p < end_p) {
float32x4_t source = vld1q_f32(source_p);
vst1q_f32(dest_p, vmulq_n_f32(source, k));
source_p += 4;
dest_p += 4;
}
n = tail_frames;
}
#elif HAVE_MIPS_MSA_INTRINSICS
if ((source_stride == 1) && (dest_stride == 1)) {
v4f32 vScale;
v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
......@@ -162,7 +126,6 @@ void Vsmul(const float* source_p,
ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
}
}
#endif
frames_to_process = n;
#endif
......@@ -178,26 +141,9 @@ void Vadd(const float* source1p,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
#if HAVE_MIPS_MSA_INTRINSICS || WTF_CPU_ARM_NEON
#if HAVE_MIPS_MSA_INTRINSICS
int n = frames_to_process;
#if WTF_CPU_ARM_NEON
if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) {
int tail_frames = n % 4;
const float* end_p = dest_p + n - tail_frames;
while (dest_p < end_p) {
float32x4_t source1 = vld1q_f32(source1p);
float32x4_t source2 = vld1q_f32(source2p);
vst1q_f32(dest_p, vaddq_f32(source1, source2));
source1p += 4;
source2p += 4;
dest_p += 4;
}
n = tail_frames;
}
#elif HAVE_MIPS_MSA_INTRINSICS
if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) {
v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6,
vSrc1P7;
......@@ -217,7 +163,6 @@ void Vadd(const float* source1p,
ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
}
}
#endif
frames_to_process = n;
#endif
......@@ -233,26 +178,9 @@ void Vmul(const float* source1p,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
#if HAVE_MIPS_MSA_INTRINSICS || WTF_CPU_ARM_NEON
#if HAVE_MIPS_MSA_INTRINSICS
int n = frames_to_process;
#if WTF_CPU_ARM_NEON
if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) {
int tail_frames = n % 4;
const float* end_p = dest_p + n - tail_frames;
while (dest_p < end_p) {
float32x4_t source1 = vld1q_f32(source1p);
float32x4_t source2 = vld1q_f32(source2p);
vst1q_f32(dest_p, vmulq_f32(source1, source2));
source1p += 4;
source2p += 4;
dest_p += 4;
}
n = tail_frames;
}
#elif HAVE_MIPS_MSA_INTRINSICS
if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) {
v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6,
vSrc1P7;
......@@ -272,7 +200,6 @@ void Vmul(const float* source1p,
ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
}
}
#endif
frames_to_process = n;
#endif
......@@ -288,28 +215,8 @@ void Zvmul(const float* real1p,
float* real_dest_p,
float* imag_dest_p,
size_t frames_to_process) {
unsigned i = 0;
#if WTF_CPU_ARM_NEON
unsigned end_size = frames_to_process - frames_to_process % 4;
while (i < end_size) {
float32x4_t real1 = vld1q_f32(real1p + i);
float32x4_t real2 = vld1q_f32(real2p + i);
float32x4_t imag1 = vld1q_f32(imag1p + i);
float32x4_t imag2 = vld1q_f32(imag2p + i);
float32x4_t real_result = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
float32x4_t imag_result = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);
vst1q_f32(real_dest_p + i, real_result);
vst1q_f32(imag_dest_p + i, imag_result);
i += 4;
}
#endif
Impl::Zvmul(real1p + i, imag1p + i, real2p + i, imag2p + i, real_dest_p + i,
imag_dest_p + i, frames_to_process - i);
Impl::Zvmul(real1p, imag1p, real2p, imag2p, real_dest_p, imag_dest_p,
frames_to_process);
}
void Vsvesq(const float* source_p,
......@@ -318,32 +225,6 @@ void Vsvesq(const float* source_p,
size_t frames_to_process) {
float sum = 0;
#if WTF_CPU_ARM_NEON
int n = frames_to_process;
if (source_stride == 1) {
int tail_frames = n % 4;
const float* end_p = source_p + n - tail_frames;
float32x4_t four_sum = vdupq_n_f32(0);
while (source_p < end_p) {
float32x4_t source = vld1q_f32(source_p);
four_sum = vmlaq_f32(four_sum, source, source);
source_p += 4;
}
float32x2_t two_sum =
vadd_f32(vget_low_f32(four_sum), vget_high_f32(four_sum));
float group_sum[2];
vst1_f32(group_sum, two_sum);
sum += group_sum[0] + group_sum[1];
n = tail_frames;
}
frames_to_process = n;
#endif
Impl::Vsvesq(source_p, source_stride, &sum, frames_to_process);
DCHECK(sum_p);
......@@ -356,30 +237,9 @@ void Vmaxmgv(const float* source_p,
size_t frames_to_process) {
float max = 0;
#if HAVE_MIPS_MSA_INTRINSICS || WTF_CPU_ARM_NEON
#if HAVE_MIPS_MSA_INTRINSICS
int n = frames_to_process;
#if WTF_CPU_ARM_NEON
if (source_stride == 1) {
int tail_frames = n % 4;
const float* end_p = source_p + n - tail_frames;
float32x4_t four_max = vdupq_n_f32(0);
while (source_p < end_p) {
float32x4_t source = vld1q_f32(source_p);
four_max = vmaxq_f32(four_max, vabsq_f32(source));
source_p += 4;
}
float32x2_t two_max =
vmax_f32(vget_low_f32(four_max), vget_high_f32(four_max));
float group_max[2];
vst1_f32(group_max, two_max);
max = std::max(group_max[0], group_max[1]);
n = tail_frames;
}
#elif HAVE_MIPS_MSA_INTRINSICS
if (source_stride == 1) {
v4f32 vMax = {
0,
......@@ -401,7 +261,6 @@ void Vmaxmgv(const float* source_p,
max = std::max(max, vMax[2]);
max = std::max(max, vMax[3]);
}
#endif
frames_to_process = n;
#endif
......@@ -431,25 +290,9 @@ void Vclip(const float* source_p,
DCHECK_LE(low_threshold, high_threshold);
#endif
#if HAVE_MIPS_MSA_INTRINSICS || WTF_CPU_ARM_NEON
#if HAVE_MIPS_MSA_INTRINSICS
int n = frames_to_process;
#if WTF_CPU_ARM_NEON
if ((source_stride == 1) && (dest_stride == 1)) {
int tail_frames = n % 4;
const float* end_p = dest_p + n - tail_frames;
float32x4_t low = vdupq_n_f32(low_threshold);
float32x4_t high = vdupq_n_f32(high_threshold);
while (dest_p < end_p) {
float32x4_t source = vld1q_f32(source_p);
vst1q_f32(dest_p, vmaxq_f32(vminq_f32(source, high), low));
source_p += 4;
dest_p += 4;
}
n = tail_frames;
}
#elif HAVE_MIPS_MSA_INTRINSICS
if ((source_stride == 1) && (dest_stride == 1)) {
v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
......@@ -471,7 +314,6 @@ void Vclip(const float* source_p,
ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
}
}
#endif
frames_to_process = n;
#endif
......
// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef VectorMathNEON_h
#define VectorMathNEON_h
#include <arm_neon.h>
#include <algorithm>
#include "platform/audio/VectorMathScalar.h"
namespace blink {
namespace VectorMath {
namespace NEON {
static ALWAYS_INLINE void Vadd(const float* source1p,
int source_stride1,
const float* source2p,
int source_stride2,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
int n = frames_to_process;
if (source_stride1 == 1 && source_stride2 == 1 && dest_stride == 1) {
int tail_frames = n % 4;
const float* end_p = dest_p + n - tail_frames;
while (dest_p < end_p) {
float32x4_t source1 = vld1q_f32(source1p);
float32x4_t source2 = vld1q_f32(source2p);
vst1q_f32(dest_p, vaddq_f32(source1, source2));
source1p += 4;
source2p += 4;
dest_p += 4;
}
n = tail_frames;
}
Scalar::Vadd(source1p, source_stride1, source2p, source_stride2, dest_p,
dest_stride, n);
}
static ALWAYS_INLINE void Vclip(const float* source_p,
int source_stride,
const float* low_threshold_p,
const float* high_threshold_p,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
int n = frames_to_process;
if (source_stride == 1 && dest_stride == 1) {
int tail_frames = n % 4;
const float* end_p = dest_p + n - tail_frames;
float32x4_t low = vdupq_n_f32(*low_threshold_p);
float32x4_t high = vdupq_n_f32(*high_threshold_p);
while (dest_p < end_p) {
float32x4_t source = vld1q_f32(source_p);
vst1q_f32(dest_p, vmaxq_f32(vminq_f32(source, high), low));
source_p += 4;
dest_p += 4;
}
n = tail_frames;
}
Scalar::Vclip(source_p, source_stride, low_threshold_p, high_threshold_p,
dest_p, dest_stride, n);
}
static ALWAYS_INLINE void Vmaxmgv(const float* source_p,
int source_stride,
float* max_p,
size_t frames_to_process) {
int n = frames_to_process;
if (source_stride == 1) {
int tail_frames = n % 4;
const float* end_p = source_p + n - tail_frames;
float32x4_t four_max = vdupq_n_f32(*max_p);
while (source_p < end_p) {
float32x4_t source = vld1q_f32(source_p);
four_max = vmaxq_f32(four_max, vabsq_f32(source));
source_p += 4;
}
float32x2_t two_max =
vmax_f32(vget_low_f32(four_max), vget_high_f32(four_max));
float group_max[2];
vst1_f32(group_max, two_max);
*max_p = std::max(group_max[0], group_max[1]);
n = tail_frames;
}
Scalar::Vmaxmgv(source_p, source_stride, max_p, n);
}
static ALWAYS_INLINE void Vmul(const float* source1p,
int source_stride1,
const float* source2p,
int source_stride2,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
int n = frames_to_process;
if (source_stride1 == 1 && source_stride2 == 1 && dest_stride == 1) {
int tail_frames = n % 4;
const float* end_p = dest_p + n - tail_frames;
while (dest_p < end_p) {
float32x4_t source1 = vld1q_f32(source1p);
float32x4_t source2 = vld1q_f32(source2p);
vst1q_f32(dest_p, vmulq_f32(source1, source2));
source1p += 4;
source2p += 4;
dest_p += 4;
}
n = tail_frames;
}
Scalar::Vmul(source1p, source_stride1, source2p, source_stride2, dest_p,
dest_stride, n);
}
static ALWAYS_INLINE void Vsma(const float* source_p,
int source_stride,
const float* scale,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
int n = frames_to_process;
if (source_stride == 1 && dest_stride == 1) {
int tail_frames = n % 4;
const float* end_p = dest_p + n - tail_frames;
float32x4_t k = vdupq_n_f32(*scale);
while (dest_p < end_p) {
float32x4_t source = vld1q_f32(source_p);
float32x4_t dest = vld1q_f32(dest_p);
dest = vmlaq_f32(dest, source, k);
vst1q_f32(dest_p, dest);
source_p += 4;
dest_p += 4;
}
n = tail_frames;
}
Scalar::Vsma(source_p, source_stride, scale, dest_p, dest_stride, n);
}
static ALWAYS_INLINE void Vsmul(const float* source_p,
int source_stride,
const float* scale,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
int n = frames_to_process;
if (source_stride == 1 && dest_stride == 1) {
float k = *scale;
int tail_frames = n % 4;
const float* end_p = dest_p + n - tail_frames;
while (dest_p < end_p) {
float32x4_t source = vld1q_f32(source_p);
vst1q_f32(dest_p, vmulq_n_f32(source, k));
source_p += 4;
dest_p += 4;
}
n = tail_frames;
}
Scalar::Vsmul(source_p, source_stride, scale, dest_p, dest_stride, n);
}
static ALWAYS_INLINE void Vsvesq(const float* source_p,
int source_stride,
float* sum_p,
size_t frames_to_process) {
int n = frames_to_process;
if (source_stride == 1) {
int tail_frames = n % 4;
const float* end_p = source_p + n - tail_frames;
float32x4_t four_sum = vdupq_n_f32(0);
while (source_p < end_p) {
float32x4_t source = vld1q_f32(source_p);
four_sum = vmlaq_f32(four_sum, source, source);
source_p += 4;
}
float32x2_t two_sum =
vadd_f32(vget_low_f32(four_sum), vget_high_f32(four_sum));
float group_sum[2];
vst1_f32(group_sum, two_sum);
*sum_p += group_sum[0] + group_sum[1];
n = tail_frames;
}
Scalar::Vsvesq(source_p, source_stride, sum_p, n);
}
static ALWAYS_INLINE void Zvmul(const float* real1p,
const float* imag1p,
const float* real2p,
const float* imag2p,
float* real_dest_p,
float* imag_dest_p,
size_t frames_to_process) {
unsigned i = 0;
unsigned end_size = frames_to_process - frames_to_process % 4;
while (i < end_size) {
float32x4_t real1 = vld1q_f32(real1p + i);
float32x4_t real2 = vld1q_f32(real2p + i);
float32x4_t imag1 = vld1q_f32(imag1p + i);
float32x4_t imag2 = vld1q_f32(imag2p + i);
float32x4_t real_result = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
float32x4_t imag_result = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);
vst1q_f32(real_dest_p + i, real_result);
vst1q_f32(imag_dest_p + i, imag_result);
i += 4;
}
Scalar::Zvmul(real1p + i, imag1p + i, real2p + i, imag2p + i, real_dest_p + i,
imag_dest_p + i, frames_to_process - i);
}
} // namespace NEON
} // namespace VectorMath
} // namespace blink
#endif // VectorMathNEON_h
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment