Commit f8dec579 authored by Raymond Toy's avatar Raymond Toy Committed by Commit Bot

NEON implementation for main k-rate loop of OscillatorNode

This is the Arm/Neon equivalent of the x86/simd version of the main
k-rate processing loop.

The Web Audio Bench (100 runs of 10 sec) shows that we have about a 33%
improvement in speed (on a Pixel 2).  The difference is statistically
significant.

Without CL:
TEST	μs	MIN	Q1	MEDIAN	Q3	MAX	MEAN	STDDEV
Baseline	1778	1778	1888	2013	2498	3363	2225.6	437.41
Oscillator	1388	1388	1597	2104	2427	5244	2079.37	581.86

With CL:
TEST	μs	MIN	Q1	MEDIAN	Q3	MAX	MEAN	STDDEV
Baseline	1757	1757	1863	1955	2378	3280	2164.5	424.23
Oscillator	897	897	1001	1224	1764	4068	1399.1	515.9

Manually ran the tests in webaudio/Oscillator and all tests pass.

Bug: 1013118
Change-Id: I738dfc9e9f108ac6a421099c69f11aab7750605f
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2233003
Commit-Queue: Raymond Toy <rtoy@chromium.org>
Reviewed-by: default avatarDale Curtis <dalecurtis@chromium.org>
Reviewed-by: default avatarHongchan Choi <hongchan@chromium.org>
Cr-Commit-Position: refs/heads/master@{#779816}
parent 7a231ac0
......@@ -432,8 +432,8 @@ std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
// Temporary arrays where we can gather up the wave data we need for
// interpolation. Align these for best efficiency on older CPUs where aligned
// access is much faster than unaliged. TODO(rtoy): Is there a faster way to
// do this?
// access is much faster than unaliged.
// TODO(1013118): Is there a faster way to do this?
float sample1_lower[4] __attribute__((aligned(16)));
float sample2_lower[4] __attribute__((aligned(16)));
float sample1_higher[4] __attribute__((aligned(16)));
......@@ -500,6 +500,126 @@ std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
return std::make_tuple(k, virtual_read_index);
}
#elif defined(CPU_ARM_NEON)
static float32x4_t v_wrap_virtual_index(float32x4_t x,
float32x4_t wave_size,
float32x4_t inv_wave_size) {
// r = x/wave_size, f = truncate(r), truncating towards 0
const float32x4_t r = vmulq_f32(x, inv_wave_size);
int32x4_t f = vcvtq_s32_f32(r);
// vcltq_f32 returns returns all 0xfffffff (-1) if a < b and if if not.
const uint32x4_t cmp = vcltq_f32(r, vcvtq_f32_s32(f));
f = vaddq_s32(f, static_cast<int32x4_t>(cmp));
return vsubq_f32(x, vmulq_f32(vcvtq_f32_s32(f), wave_size));
}
std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
int n,
float* dest_p,
double virtual_read_index,
float frequency,
float rate_scale) const {
const unsigned periodic_wave_size = periodic_wave_->PeriodicWaveSize();
const double inv_periodic_wave_size = 1.0 / periodic_wave_size;
float* higher_wave_data = nullptr;
float* lower_wave_data = nullptr;
float table_interpolation_factor = 0;
const float incr = frequency * rate_scale;
DCHECK_GE(incr, kInterpolate2Point);
periodic_wave_->WaveDataForFundamentalFrequency(
frequency, lower_wave_data, higher_wave_data, table_interpolation_factor);
const float32x4_t v_wave_size = vdupq_n_f32(periodic_wave_size);
const float32x4_t v_inv_wave_size = vdupq_n_f32(1.0f / periodic_wave_size);
const uint32x4_t v_read_mask = vdupq_n_s32(periodic_wave_size - 1);
const uint32x4_t v_one = vdupq_n_s32(1);
const float32x4_t v_table_factor = vdupq_n_f32(table_interpolation_factor);
const float32x4_t v_incr = vdupq_n_f32(4 * incr);
float32x4_t v_virt_index = {
virtual_read_index + 0 * incr, virtual_read_index + 1 * incr,
virtual_read_index + 2 * incr, virtual_read_index + 3 * incr};
// Temporary arrsys to hold the read indices so we can access them
// individually to get the samples needed for interpolation.
uint32_t r0[4] __attribute__((aligned(16)));
uint32_t r1[4] __attribute__((aligned(16)));
// Temporary arrays where we can gather up the wave data we need for
// interpolation. Align these for best efficiency on older CPUs where aligned
// access is much faster than unaliged. TODO(rtoy): Is there a faster way to
// do this?
float sample1_lower[4] __attribute__((aligned(16)));
float sample2_lower[4] __attribute__((aligned(16)));
float sample1_higher[4] __attribute__((aligned(16)));
float sample2_higher[4] __attribute__((aligned(16)));
// It's possible that adding the incr above exceeded the bounds, so wrap them
// if needed.
v_virt_index =
v_wrap_virtual_index(v_virt_index, v_wave_size, v_inv_wave_size);
int k = 0;
int n_loops = n / 4;
for (int loop = 0; loop < n_loops; ++loop, k += 4) {
// Compute indices for the samples and contain within the valid range.
const uint32x4_t read_index_0 =
vandq_u32(vcvtq_u32_f32(v_virt_index), v_read_mask);
const uint32x4_t read_index_1 =
vandq_u32(vaddq_u32(read_index_0, v_one), v_read_mask);
// Extract the components of the indices so we can get the samples
// associated with the lower and higher wave data.
vst1q_u32(r0, read_index_0);
vst1q_u32(r1, read_index_1);
for (int m = 0; m < 4; ++m) {
sample1_lower[m] = lower_wave_data[r0[m]];
sample2_lower[m] = lower_wave_data[r1[m]];
sample1_higher[m] = higher_wave_data[r0[m]];
sample2_higher[m] = higher_wave_data[r1[m]];
}
const float32x4_t s1_low = vld1q_f32(sample1_lower);
const float32x4_t s2_low = vld1q_f32(sample2_lower);
const float32x4_t s1_high = vld1q_f32(sample1_higher);
const float32x4_t s2_high = vld1q_f32(sample2_higher);
const float32x4_t interpolation_factor =
vsubq_f32(v_virt_index, vcvtq_f32_u32(read_index_0));
const float32x4_t sample_higher = vaddq_f32(
s1_high, vmulq_f32(interpolation_factor, vsubq_f32(s2_high, s1_high)));
const float32x4_t sample_lower = vaddq_f32(
s1_low, vmulq_f32(interpolation_factor, vsubq_f32(s2_low, s1_low)));
const float32x4_t sample = vaddq_f32(
sample_higher,
vmulq_f32(v_table_factor, vsubq_f32(sample_lower, sample_higher)));
vst1q_f32(dest_p + k, sample);
// Increment virtual read index and wrap virtualReadIndex into the range
// 0 -> periodicWaveSize.
v_virt_index = vaddq_f32(v_virt_index, v_incr);
v_virt_index =
v_wrap_virtual_index(v_virt_index, v_wave_size, v_inv_wave_size);
}
// There's a bit of round-off above, so update the index more accurately so at
// least the next render starts over with a more accurate value.
virtual_read_index += k * incr;
virtual_read_index -=
floor(virtual_read_index * inv_periodic_wave_size) * periodic_wave_size;
return std::make_tuple(k, virtual_read_index);
}
#else
// Vector operations not supported, so there's nothing to do except return 0 and
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment