Commit a4b0dbe6, authored by Eero Häkkinen, committed by Commit Bot

Optimize blink::VectorMath::Vclip for SSE

The performance gain on an Intel Broadwell CPU is about 8-fold, and on
Intel Skylake and newer CPUs the SSE throughput should be further
doubled.

Bug: 778262
Change-Id: If4682a4d95a6729626049a38a4b1d5b1ebf830bd
Reviewed-on: https://chromium-review.googlesource.com/738243
Commit-Queue: Eero Häkkinen <eero.hakkinen@intel.com>
Reviewed-by: Raymond Toy <rtoy@chromium.org>
Cr-Commit-Position: refs/heads/master@{#520973}
parent 2fd83667
......@@ -683,8 +683,38 @@ void Vclip(const float* source_p,
float low_threshold = *low_threshold_p;
float high_threshold = *high_threshold_p;
// FIXME: Optimize for SSE2.
#if WTF_CPU_ARM_NEON
#if DCHECK_IS_ON()
// Do the same DCHECKs that |clampTo| would do so that optimization paths do
// not have to do them.
for (size_t i = 0u; i < frames_to_process; ++i)
DCHECK(!std::isnan(source_p[i]));
// This also ensures that thresholds are not NaNs.
DCHECK_LE(low_threshold, high_threshold);
#endif
#if defined(ARCH_CPU_X86_FAMILY)
if (source_stride == 1 && dest_stride == 1) {
size_t i = 0u;
// If the source_p address is not 16-byte aligned, the first several
// frames (at most three) should be processed separately.
for (; !SSE::IsAligned(source_p + i) && i < frames_to_process; ++i)
dest_p[i] = clampTo(source_p[i], low_threshold, high_threshold);
// Now the source_p+i address is 16-byte aligned. Start to apply SSE.
size_t sse_frames_to_process =
(frames_to_process - i) & SSE::kFramesToProcessMask;
if (sse_frames_to_process > 0u) {
SSE::Vclip(source_p + i, &low_threshold, &high_threshold, dest_p + i,
sse_frames_to_process);
i += sse_frames_to_process;
}
source_p += i;
dest_p += i;
n -= i;
}
#elif WTF_CPU_ARM_NEON
if ((source_stride == 1) && (dest_stride == 1)) {
int tail_frames = n % 4;
const float* end_p = dest_p + n - tail_frames;
......
......@@ -9,6 +9,7 @@
#include "platform/wtf/Assertions.h"
#include <algorithm>
#include <cmath>
#include <xmmintrin.h>
......@@ -72,6 +73,40 @@ void Vadd(const float* source1p,
#undef ADD_ALL
}
// dest[k] = clip(source[k], low_threshold, high_threshold)
// = max(low_threshold, min(high_threshold, source[k]))
void Vclip(const float* source_p,
const float* low_threshold_p,
const float* high_threshold_p,
float* dest_p,
size_t frames_to_process) {
const float* const source_end_p = source_p + frames_to_process;
DCHECK(IsAligned(source_p));
DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
MType m_low_threshold = MM_PS(set1)(*low_threshold_p);
MType m_high_threshold = MM_PS(set1)(*high_threshold_p);
#define CLIP_ALL(storeDest) \
while (source_p < source_end_p) { \
MType m_source = MM_PS(load)(source_p); \
MType m_dest = \
MM_PS(max)(m_low_threshold, MM_PS(min)(m_high_threshold, m_source)); \
MM_PS(storeDest)(dest_p, m_dest); \
source_p += kPackedFloatsPerRegister; \
dest_p += kPackedFloatsPerRegister; \
}
if (IsAligned(dest_p)) {
CLIP_ALL(store);
} else {
CLIP_ALL(storeu);
}
#undef CLIP_ALL
}
// max = max(abs(source[k])) for all k
void Vmaxmgv(const float* source_p, float* max_p, size_t frames_to_process) {
constexpr uint32_t kMask = 0x7FFFFFFFu;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment