Commit 94423071 authored by Raymond Toy's avatar Raymond Toy Committed by Commit Bot

NEON optimization for a-rate Oscillator

NEON version of the SSE2 optimizations for the a-rate OscillatorNode.
We see about a 23% increase in performance with these changes according
to Spotify's Web Audio Bench on a Pixel 2.

Without CL
TEST	μs	MIN	Q1	MEDIAN	Q3	MAX	MEAN	STDDEV
Oscillator.frequency-linear-a-rate	5640	5640	5923	6170	6780	7800	6354.266667	511.6527827

With CL
TEST	μs	MIN	Q1	MEDIAN	Q3	MAX	MEAN	STDDEV
Oscillator.frequency-linear-a-rate	4355	4355	4588	4732	5258	6408	4929.9	474.3852906

Manually ran the oscillator tests on a Pixel 2 and update the thresholds.

Bug: 1013118
Change-Id: I37fbb995ec6bef55ae5195f579356b97adb94347
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2293424
Commit-Queue: Raymond Toy <rtoy@chromium.org>
Reviewed-by: default avatarDale Curtis <dalecurtis@chromium.org>
Reviewed-by: default avatarHongchan Choi <hongchan@chromium.org>
Reviewed-by: default avatarRaymond Toy <rtoy@chromium.org>
Cr-Commit-Position: refs/heads/master@{#790016}
parent cb657a7b
......@@ -6,14 +6,16 @@
#include "third_party/blink/renderer/modules/webaudio/periodic_wave.h"
#if defined(CPU_ARM_NEON)
#include <arm_neon.h>
#endif
namespace blink {
#if defined(CPU_ARM_NEON)
static float32x4_t v_wrap_virtual_index(float32x4_t x,
float32x4_t wave_size,
float32x4_t inv_wave_size) {
static float32x4_t WrapVirtualIndexVector(float32x4_t x,
float32x4_t wave_size,
float32x4_t inv_wave_size) {
// r = x/wave_size, f = truncate(r), truncating towards 0
const float32x4_t r = vmulq_f32(x, inv_wave_size);
int32x4_t f = vcvtq_s32_f32(r);
......@@ -74,7 +76,7 @@ std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
// It's possible that adding the incr above exceeded the bounds, so wrap them
// if needed.
v_virt_index =
v_wrap_virtual_index(v_virt_index, v_wave_size, v_inv_wave_size);
WrapVirtualIndexVector(v_virt_index, v_wave_size, v_inv_wave_size);
int k = 0;
int n_loops = n / 4;
......@@ -119,17 +121,125 @@ std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
// 0 -> periodicWaveSize.
v_virt_index = vaddq_f32(v_virt_index, v_incr);
v_virt_index =
v_wrap_virtual_index(v_virt_index, v_wave_size, v_inv_wave_size);
WrapVirtualIndexVector(v_virt_index, v_wave_size, v_inv_wave_size);
}
// There's a bit of round-off above, so update the index more accurately so at
// least the next render starts over with a more accurate value.
virtual_read_index += k * incr;
virtual_read_index -=
floor(virtual_read_index * inv_periodic_wave_size) * periodic_wave_size;
std::floor(virtual_read_index * inv_periodic_wave_size) *
periodic_wave_size;
return std::make_tuple(k, virtual_read_index);
}
static ALWAYS_INLINE double WrapVirtualIndex(double virtual_index,
unsigned periodic_wave_size,
double inv_periodic_wave_size) {
return virtual_index -
floor(virtual_index * inv_periodic_wave_size) * periodic_wave_size;
}
double OscillatorHandler::ProcessARateVectorKernel(
float* destination,
double virtual_read_index,
const float* phase_increments,
unsigned periodic_wave_size,
const float* const lower_wave_data[4],
const float* const higher_wave_data[4],
const float table_interpolation_factor[4]) const {
// See the scalar version in oscillator_node.cc for the basic algorithm.
double inv_periodic_wave_size = 1.0 / periodic_wave_size;
unsigned read_index_mask = periodic_wave_size - 1;
// Accumulate the phase increments so we can set up the virtual read index
// vector appropriately. This must be a double to preserve accuracy and
// to match the scalar version.
double incr_sum[4];
incr_sum[0] = phase_increments[0];
for (int m = 1; m < 4; ++m) {
incr_sum[m] = incr_sum[m - 1] + phase_increments[m];
}
// It's really important for accuracy that we use doubles instead of
// floats for the virtual_read_index. Without this, we can only get some
// 30-50 dB in the sweep tests instead of 100+ dB.
//
// Arm NEON doesn't have float64x2_t so we have to do this. (Aarch64 has
// float64x2_t.)
double virt_index[4];
virt_index[0] = virtual_read_index;
virt_index[1] = WrapVirtualIndex(virtual_read_index + incr_sum[0],
periodic_wave_size, inv_periodic_wave_size);
virt_index[2] = WrapVirtualIndex(virtual_read_index + incr_sum[1],
periodic_wave_size, inv_periodic_wave_size);
virt_index[3] = WrapVirtualIndex(virtual_read_index + incr_sum[2],
periodic_wave_size, inv_periodic_wave_size);
// The virtual indices we're working with now.
const float32x4_t v_virt_index = {
static_cast<float>(virt_index[0]), static_cast<float>(virt_index[1]),
static_cast<float>(virt_index[2]), static_cast<float>(virt_index[3])};
// Convert virtual index to actual index into wave data.
const uint32x4_t v_read0 = vcvtq_u32_f32(v_virt_index);
// v_read1 = v_read0 + 1, but wrap the index around, if needed.
const uint32x4_t v_read1 = vandq_s32(vaddq_s32(v_read0, vdupq_n_u32(1)),
vdupq_n_u32(read_index_mask));
float sample1_lower[4] __attribute__((aligned(16)));
float sample2_lower[4] __attribute__((aligned(16)));
float sample1_higher[4] __attribute__((aligned(16)));
float sample2_higher[4] __attribute__((aligned(16)));
uint32_t read0[4] __attribute__((aligned(16)));
uint32_t read1[4] __attribute__((aligned(16)));
vst1q_u32(read0, v_read0);
vst1q_u32(read1, v_read1);
// Read the samples from the wave tables
for (int m = 0; m < 4; ++m) {
DCHECK_LT(read0[m], periodic_wave_size);
DCHECK_LT(read1[m], periodic_wave_size);
sample1_lower[m] = lower_wave_data[m][read0[m]];
sample2_lower[m] = lower_wave_data[m][read1[m]];
sample1_higher[m] = higher_wave_data[m][read0[m]];
sample2_higher[m] = higher_wave_data[m][read1[m]];
}
// Compute factor for linear interpolation within a wave table.
const float32x4_t v_factor = vsubq_f32(v_virt_index, vcvtq_f32_u32(v_read0));
// Linearly interpolate between samples from the higher wave table.
const float32x4_t sample_higher = vmlaq_f32(
vld1q_f32(sample1_higher), v_factor,
vsubq_f32(vld1q_f32(sample2_higher), vld1q_f32(sample1_higher)));
// Linearly interpolate between samples from the lower wave table.
const float32x4_t sample_lower =
vmlaq_f32(vld1q_f32(sample1_lower), v_factor,
vsubq_f32(vld1q_f32(sample2_lower), vld1q_f32(sample1_lower)));
// Linearly interpolate between wave tables to get the desired
// output samples.
const float32x4_t sample =
vmlaq_f32(sample_higher, vld1q_f32(table_interpolation_factor),
vsubq_f32(sample_lower, sample_higher));
vst1q_f32(destination, sample);
// Update the virtual_read_index appropriately and return it for the
// next call.
virtual_read_index =
WrapVirtualIndex(virtual_read_index + incr_sum[3], periodic_wave_size,
inv_periodic_wave_size);
return virtual_read_index;
}
#endif
} // namespace blink
......@@ -371,7 +371,7 @@ std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
}
#endif
#if !defined(ARCH_CPU_X86_FAMILY)
#if !(defined(ARCH_CPU_X86_FAMILY) || defined(CPU_ARM_NEON))
double OscillatorHandler::ProcessARateVectorKernel(
float* dest_p,
double virtual_read_index,
......
......@@ -38,6 +38,10 @@
#include "third_party/blink/renderer/platform/audio/vector_math.h"
#include "third_party/blink/renderer/platform/bindings/exception_state.h"
#if defined(CPU_ARM_NEON)
#include <arm_neon.h>
#endif
namespace blink {
// The number of bands per octave. Each octave will have this many entries in
......@@ -270,6 +274,64 @@ void PeriodicWave::WaveDataForFundamentalFrequency(
higher_wave_data[k] = band_limited_tables_[range_index1[k]]->Data();
}
}
#elif defined(CPU_ARM_NEON)
void PeriodicWave::WaveDataForFundamentalFrequency(
const float fundamental_frequency[4],
float* lower_wave_data[4],
float* higher_wave_data[4],
float table_interpolation_factor[4]) {
// Negative frequencies are allowed, in which case we alias to the positive
// frequency.
float32x4_t frequency = vabsq_f32(vld1q_f32(fundamental_frequency));
// pos = 0xffffffff if frequency > 0; otherwise 0.
uint32x4_t pos = vcgtq_f32(frequency, vdupq_n_f32(0));
// v_ratio = frequency / lowest_fundamental_frequency_. But NEON
// doesn't have a division instruction, so multiply by reciprocal.
// (Aarch64 does, though).
float32x4_t v_ratio =
vmulq_f32(frequency, vdupq_n_f32(1 / lowest_fundamental_frequency_));
// Select v_ratio or 0.5 depending on whether pos is all ones or all
// zeroes.
v_ratio = vbslq_f32(pos, v_ratio, vdupq_n_f32(0.5));
float ratio[4] __attribute__((aligned(16)));
vst1q_f32(ratio, v_ratio);
float cents_above_lowest_frequency[4] __attribute__((aligned(16)));
for (int k = 0; k < 4; ++k) {
cents_above_lowest_frequency[k] = log2f(ratio[k]) * 1200;
}
float32x4_t v_pitch_range = vaddq_f32(
vdupq_n_f32(1.0), vmulq_f32(vld1q_f32(cents_above_lowest_frequency),
vdupq_n_f32(1 / cents_per_range_)));
v_pitch_range = vmaxq_f32(v_pitch_range, vdupq_n_f32(0));
v_pitch_range = vminq_f32(v_pitch_range, vdupq_n_f32(NumberOfRanges() - 1));
const uint32x4_t v_index1 = vcvtq_u32_f32(v_pitch_range);
uint32x4_t v_index2 = vaddq_u32(v_index1, vdupq_n_u32(1));
v_index2 = vminq_u32(v_index2, vdupq_n_f32(NumberOfRanges() - 1));
uint32_t range_index1[4] __attribute__((aligned(16)));
uint32_t range_index2[4] __attribute__((aligned(16)));
vst1q_u32(range_index1, v_index1);
vst1q_u32(range_index2, v_index2);
const float32x4_t table_factor =
vsubq_f32(v_pitch_range, vcvtq_f32_u32(v_index1));
vst1q_f32(table_interpolation_factor, table_factor);
for (int k = 0; k < 4; ++k) {
lower_wave_data[k] = band_limited_tables_[range_index2[k]]->Data();
higher_wave_data[k] = band_limited_tables_[range_index1[k]]->Data();
}
}
#else
void PeriodicWave::WaveDataForFundamentalFrequency(
const float fundamental_frequency[4],
......
......@@ -22,7 +22,7 @@
1, tester.sampleRate * tester.lengthInSeconds, tester.sampleRate);
// The thresholds are experimentally determined.
tester.setThresholds({snr: 139.98, maxDiff: 2.2650e-6});
tester.setThresholds({snr: 130.47, maxDiff: 2.2576e-6});
tester.runTest(
context, 'custom', 'Custom Oscillator with Exponential Sweep', task,
should);
......
......@@ -22,7 +22,7 @@
1, tester.sampleRate * tester.lengthInSeconds, tester.sampleRate);
// The thresholds are experimentally determined.
tester.setThresholds({snr: 134.34, maxDiff: 1.8925e-6});
tester.setThresholds({snr: 129.40, maxDiff: 1.9894e-6});
tester.runTest(
context, 'sawtooth', 'Sawtooth Oscillator with Exponential Sweep',
task, should);
......
......@@ -22,7 +22,7 @@
1, tester.sampleRate * tester.lengthInSeconds, tester.sampleRate);
// The thresholds are experimentally determined.
tester.setThresholds({snr: 140.44, maxDiff: 3.4869e-6});
tester.setThresholds({snr: 129.72, maxDiff: 3.7551e-6});
tester.runTest(
context, 'sine', 'Sine Oscillator with Exponential Sweep', task,
should);
......
......@@ -22,7 +22,7 @@
1, tester.sampleRate * tester.lengthInSeconds, tester.sampleRate);
// The thresholds are experimentally determined.
tester.setThresholds({snr: 137.20, maxDiff: 3.8147e-6});
tester.setThresholds({snr: 129.71, maxDiff: 3.9638e-6});
tester.runTest(
context, 'square', 'Square Oscillator with Exponential Sweep', task,
should);
......
......@@ -22,7 +22,7 @@
1, tester.sampleRate * tester.lengthInSeconds, tester.sampleRate);
// The thresholds are experimentally determined.
tester.setThresholds({snr: 139.88, maxDiff: 2.8313e-6});
tester.setThresholds({snr: 129.66, maxDiff: 3.0995e-6});
tester.runTest(
context, 'triangle', 'Triangle Oscillator with Exponential Sweep ',
task, should);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment