Commit 318d65a3 authored by Raymond Toy's avatar Raymond Toy Committed by Commit Bot

Reland: SIMD the main loop for k-rate Oscillator

The main processing loop for the k-rate Oscillator is updated to use
SSE2 instructions to speed up processing.

Four tests fail because we have slightly reduced accuracy.  Testing
shows that the loss of accuracy is due to using a float for
virtual_read_index in the loop instead of a double as previously.  A
implementation using a double passes all the original tests but does
show that it does lose performance.  I think the change in accuracy is
acceptable.

WebAudio Bench results from a linux machine.  In summary, the SIMD
version is about 60% as much time (about 1.75 times faster).

Without CL:
TEST	μs	MIN	Q1	MEDIAN	Q3	MAX	MEAN	STDDEV
Baseline	713	713	748	757	766	877	760.25	18.35
Oscillator	949	949	972	990	1005	1132	990.78	24.35

With CL:
TEST	μs	MIN	Q1	MEDIAN	Q3	MAX	MEAN	STDDEV
Baseline	728	728	747	756	763	908	757.46	16.81
Oscillator	521	521	549	564	581	874	566.87	27.77

Bug: 1013118
Change-Id: Ic5445cc152e960b41817f245ac306691f7c57f90
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2246831Reviewed-by: default avatarHongchan Choi <hongchan@chromium.org>
Commit-Queue: Raymond Toy <rtoy@chromium.org>
Cr-Commit-Position: refs/heads/master@{#778579}
parent e4d6ced5
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include <algorithm> #include <algorithm>
#include <limits> #include <limits>
#include "build/build_config.h"
#include "third_party/blink/renderer/modules/webaudio/audio_node_output.h" #include "third_party/blink/renderer/modules/webaudio/audio_node_output.h"
#include "third_party/blink/renderer/modules/webaudio/oscillator_node.h" #include "third_party/blink/renderer/modules/webaudio/oscillator_node.h"
#include "third_party/blink/renderer/modules/webaudio/periodic_wave.h" #include "third_party/blink/renderer/modules/webaudio/periodic_wave.h"
...@@ -37,6 +38,11 @@ ...@@ -37,6 +38,11 @@
namespace blink { namespace blink {
// Breakpoints where we deicde to do linear interoplation, 3-point
// interpolation or 5-point interpolation. See DoInterpolation().
constexpr float kInterpolate2Point = 0.3;
constexpr float kInterpolate3Point = 0.16;
OscillatorHandler::OscillatorHandler(AudioNode& node, OscillatorHandler::OscillatorHandler(AudioNode& node,
float sample_rate, float sample_rate,
const String& oscillator_type, const String& oscillator_type,
...@@ -272,7 +278,7 @@ static float DoInterpolation(double virtual_read_index, ...@@ -272,7 +278,7 @@ static float DoInterpolation(double virtual_read_index,
// We use Lagrange interpolation because it's relatively simple to // We use Lagrange interpolation because it's relatively simple to
// implement and fairly inexpensive, and the interpolator always // implement and fairly inexpensive, and the interpolator always
// passes through known points. // passes through known points.
if (incr >= 0.3) { if (incr >= kInterpolate2Point) {
// Increment is fairly large, so we're doing no more than about 3 // Increment is fairly large, so we're doing no more than about 3
// points between each wave table entry. Assume linear // points between each wave table entry. Assume linear
// interpolation between points is good enough. // interpolation between points is good enough.
...@@ -295,7 +301,7 @@ static float DoInterpolation(double virtual_read_index, ...@@ -295,7 +301,7 @@ static float DoInterpolation(double virtual_read_index,
sample_lower = (1 - interpolation_factor) * sample1_lower + sample_lower = (1 - interpolation_factor) * sample1_lower +
interpolation_factor * sample2_lower; interpolation_factor * sample2_lower;
} else if (incr >= .16) { } else if (incr >= kInterpolate3Point) {
// We're doing about 6 interpolation values between each wave // We're doing about 6 interpolation values between each wave
// table sample. Just use a 3-point Lagrange interpolator to get a // table sample. Just use a 3-point Lagrange interpolator to get a
// better estimate than just linear. // better estimate than just linear.
...@@ -351,41 +357,275 @@ static float DoInterpolation(double virtual_read_index, ...@@ -351,41 +357,275 @@ static float DoInterpolation(double virtual_read_index,
return sample; return sample;
} }
#if defined(ARCH_CPU_X86_FAMILY)
static __m128 v_wrap_virtual_index(__m128 x,
__m128 wave_size,
__m128 inv_wave_size) {
// Wrap the virtual index |x| to the range 0 to wave_size - 1. This is done
// by computing x - floor(x/wave_size)*wave_size.
//
// But there's no SSE2 SIMD instruction for this, so we do it the following
// way.
// f = truncate(x/wave_size), truncating towards 0.
const __m128 r = _mm_mul_ps(x, inv_wave_size);
__m128i f = _mm_cvttps_epi32(r);
// Note that if r >= 0, then f <= r. But if r < 0, then r <= f, with equality
// only if r is already an integer. Hence if r < f, we want to subtract 1
// from f to get floor(r).
// cmplt(a,b) returns 0xffffffff (-1) if a < b and 0 if not. So cmp is -1 or
// 0 depending on whether r < f, which is what we need to compute floor(r).
const __m128i cmp = _mm_cmplt_ps(r, _mm_cvtepi32_ps(f));
// This subtracts 1 if needed to get floor(r).
f = _mm_add_epi32(f, cmp);
// Convert back to float, and scale by wave_size. And finally subtract that
// from x.
return _mm_sub_ps(x, _mm_mul_ps(_mm_cvtepi32_ps(f), wave_size));
}
std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
int n,
float* dest_p,
double virtual_read_index,
float frequency,
float rate_scale) const {
const unsigned periodic_wave_size = periodic_wave_->PeriodicWaveSize();
const double inv_periodic_wave_size = 1.0 / periodic_wave_size;
float* higher_wave_data = nullptr;
float* lower_wave_data = nullptr;
float table_interpolation_factor = 0;
float incr = frequency * rate_scale;
DCHECK_GE(incr, kInterpolate2Point);
periodic_wave_->WaveDataForFundamentalFrequency(
frequency, lower_wave_data, higher_wave_data, table_interpolation_factor);
const __m128 v_wave_size = _mm_set1_ps(periodic_wave_size);
const __m128 v_inv_wave_size = _mm_set1_ps(1.0f / periodic_wave_size);
// Mask to use to wrap the read indices to the proper range.
const __m128i v_read_mask = _mm_set1_epi32(periodic_wave_size - 1);
const __m128i one = _mm_set1_epi32(1);
const __m128 v_table_factor = _mm_set1_ps(table_interpolation_factor);
// The loop processes 4 items at a time, so we need to increment the
// virtual index by 4*incr each time.
const __m128 v_incr = _mm_set1_ps(4 * incr);
// The virtual index vector. Ideally, to preserve accuracy, we should use
// (two) packed double vectors for this, but that degrades performance quite a
// bit.
__m128 v_virt_index =
_mm_set_ps(virtual_read_index + 3 * incr, virtual_read_index + 2 * incr,
virtual_read_index + incr, virtual_read_index);
// It's possible that adding the incr above exceeded the bounds, so wrap them
// if needed.
v_virt_index =
v_wrap_virtual_index(v_virt_index, v_wave_size, v_inv_wave_size);
// Temporary arrays where we can gather up the wave data we need for
// interpolation. Align these for best efficiency on older CPUs where aligned
// access is much faster than unaliged. TODO(rtoy): Is there a faster way to
// do this?
float sample1_lower[4] __attribute__((aligned(16)));
float sample2_lower[4] __attribute__((aligned(16)));
float sample1_higher[4] __attribute__((aligned(16)));
float sample2_higher[4] __attribute__((aligned(16)));
int k = 0;
int n_loops = n / 4;
for (int loop = 0; loop < n_loops; ++loop, k += 4) {
// Compute indices for the samples and contain within the valid range.
const __m128i read_index_0 =
_mm_and_si128(_mm_cvttps_epi32(v_virt_index), v_read_mask);
const __m128i read_index_1 =
_mm_and_si128(_mm_add_epi32(read_index_0, one), v_read_mask);
// Extract the components of the indices so we can get the samples
// associated with the lower and higher wave data.
const uint32_t* r0 = reinterpret_cast<const uint32_t*>(&read_index_0);
const uint32_t* r1 = reinterpret_cast<const uint32_t*>(&read_index_1);
// Get the samples from the wave tables and save them in work arrays so we
// can load them into simd registers.
for (int m = 0; m < 4; ++m) {
sample1_lower[m] = lower_wave_data[r0[m]];
sample2_lower[m] = lower_wave_data[r1[m]];
sample1_higher[m] = higher_wave_data[r0[m]];
sample2_higher[m] = higher_wave_data[r1[m]];
}
const __m128 s1_low = _mm_load_ps(sample1_lower);
const __m128 s2_low = _mm_load_ps(sample2_lower);
const __m128 s1_high = _mm_load_ps(sample1_higher);
const __m128 s2_high = _mm_load_ps(sample2_higher);
// Linearly interpolate within each table (lower and higher).
const __m128 interpolation_factor =
_mm_sub_ps(v_virt_index, _mm_cvtepi32_ps(read_index_0));
const __m128 sample_higher = _mm_add_ps(
s1_high,
_mm_mul_ps(interpolation_factor, _mm_sub_ps(s2_high, s1_high)));
const __m128 sample_lower = _mm_add_ps(
s1_low, _mm_mul_ps(interpolation_factor, _mm_sub_ps(s2_low, s1_low)));
// Then interpolate between the two tables.
const __m128 sample = _mm_add_ps(
sample_higher,
_mm_mul_ps(v_table_factor, _mm_sub_ps(sample_lower, sample_higher)));
// WARNING: dest_p may not be aligned!
_mm_storeu_ps(dest_p + k, sample);
// Increment virtual read index and wrap virtualReadIndex into the range
// 0 -> periodicWaveSize.
v_virt_index = _mm_add_ps(v_virt_index, v_incr);
v_virt_index =
v_wrap_virtual_index(v_virt_index, v_wave_size, v_inv_wave_size);
}
// There's a bit of round-off above, so update the index more accurately so at
// least the next render starts over with a more accurate value.
virtual_read_index += k * incr;
virtual_read_index -=
floor(virtual_read_index * inv_periodic_wave_size) * periodic_wave_size;
return std::make_tuple(k, virtual_read_index);
}
#else
// Vector operations not supported, so there's nothing to do except return 0 and
// virtual_read_index. The scalar version will do the necessary processing.
std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
int n,
float* dest_p,
double virtual_read_index,
float frequency,
float rate_scale) const {
DCHECK_GE(frequency * rate_scale, kInterpolate2Point);
return std::make_tuple(0, virtual_read_index);
}
#endif
double OscillatorHandler::ProcessKRateScalar(int start,
int n,
float* dest_p,
double virtual_read_index,
float frequency,
float rate_scale) const {
const unsigned periodic_wave_size = periodic_wave_->PeriodicWaveSize();
const double inv_periodic_wave_size = 1.0 / periodic_wave_size;
const unsigned read_index_mask = periodic_wave_size - 1;
float* higher_wave_data = nullptr;
float* lower_wave_data = nullptr;
float table_interpolation_factor = 0;
periodic_wave_->WaveDataForFundamentalFrequency(
frequency, lower_wave_data, higher_wave_data, table_interpolation_factor);
const float incr = frequency * rate_scale;
DCHECK_GE(incr, kInterpolate2Point);
for (int k = start; k < n; ++k) {
// Get indices for the current and next sample, and contain them within the
// valid range
const unsigned read_index_0 =
static_cast<unsigned>(virtual_read_index) & read_index_mask;
const unsigned read_index_1 = (read_index_0 + 1) & read_index_mask;
const float sample1_lower = lower_wave_data[read_index_0];
const float sample2_lower = lower_wave_data[read_index_1];
const float sample1_higher = higher_wave_data[read_index_0];
const float sample2_higher = higher_wave_data[read_index_1];
// Linearly interpolate within each table (lower and higher).
const float interpolation_factor =
static_cast<float>(virtual_read_index) - read_index_0;
const float sample_higher =
sample1_higher +
interpolation_factor * (sample2_higher - sample1_higher);
const float sample_lower =
sample1_lower + interpolation_factor * (sample2_lower - sample1_lower);
// Then interpolate between the two tables.
const float sample = sample_higher + table_interpolation_factor *
(sample_lower - sample_higher);
dest_p[k] = sample;
// Increment virtual read index and wrap virtualReadIndex into the range
// 0 -> periodicWaveSize.
virtual_read_index += incr;
virtual_read_index -=
floor(virtual_read_index * inv_periodic_wave_size) * periodic_wave_size;
}
return virtual_read_index;
}
double OscillatorHandler::ProcessKRate(int n, double OscillatorHandler::ProcessKRate(int n,
float* dest_p, float* dest_p,
double virtual_read_index) const { double virtual_read_index) const {
unsigned periodic_wave_size = periodic_wave_->PeriodicWaveSize(); const unsigned periodic_wave_size = periodic_wave_->PeriodicWaveSize();
double inv_periodic_wave_size = 1.0 / periodic_wave_size; const double inv_periodic_wave_size = 1.0 / periodic_wave_size;
unsigned read_index_mask = periodic_wave_size - 1; const unsigned read_index_mask = periodic_wave_size - 1;
float frequency = 0;
float* higher_wave_data = nullptr; float* higher_wave_data = nullptr;
float* lower_wave_data = nullptr; float* lower_wave_data = nullptr;
float table_interpolation_factor = 0; float table_interpolation_factor = 0;
frequency = frequency_->FinalValue(); float frequency = frequency_->FinalValue();
float detune = detune_->FinalValue(); const float detune_scale = DetuneToFrequencyMultiplier(detune_->FinalValue());
float detune_scale = DetuneToFrequencyMultiplier(detune);
frequency *= detune_scale; frequency *= detune_scale;
ClampFrequency(&frequency, 1, Context()->sampleRate() / 2); ClampFrequency(&frequency, 1, Context()->sampleRate() / 2);
periodic_wave_->WaveDataForFundamentalFrequency( periodic_wave_->WaveDataForFundamentalFrequency(
frequency, lower_wave_data, higher_wave_data, table_interpolation_factor); frequency, lower_wave_data, higher_wave_data, table_interpolation_factor);
float rate_scale = periodic_wave_->RateScale(); const float rate_scale = periodic_wave_->RateScale();
float incr = frequency * rate_scale; const float incr = frequency * rate_scale;
for (int k = 0; k < n; ++k) { if (incr >= kInterpolate2Point) {
float sample = DoInterpolation(virtual_read_index, fabs(incr), int k;
read_index_mask, table_interpolation_factor, double v_index = virtual_read_index;
lower_wave_data, higher_wave_data);
*dest_p++ = sample; std::tie(k, v_index) =
ProcessKRateVector(n, dest_p, v_index, frequency, rate_scale);
// Increment virtual read index and wrap virtualReadIndex into the range if (k < n) {
// 0 -> periodicWaveSize. // In typical cases, this won't be run because the number of frames is 128
virtual_read_index += incr; // so the vector version will process all the samples.
v_index =
ProcessKRateScalar(k, n, dest_p, v_index, frequency, rate_scale);
}
// Recompute to reduce round-off introduced when processing the samples
// above.
virtual_read_index += n * incr;
virtual_read_index -= virtual_read_index -=
floor(virtual_read_index * inv_periodic_wave_size) * periodic_wave_size; floor(virtual_read_index * inv_periodic_wave_size) * periodic_wave_size;
} else {
for (int k = 0; k < n; ++k) {
float sample = DoInterpolation(
virtual_read_index, fabs(incr), read_index_mask,
table_interpolation_factor, lower_wave_data, higher_wave_data);
*dest_p++ = sample;
// Increment virtual read index and wrap virtualReadIndex into the range
// 0 -> periodicWaveSize.
virtual_read_index += incr;
virtual_read_index -= floor(virtual_read_index * inv_periodic_wave_size) *
periodic_wave_size;
}
} }
return virtual_read_index; return virtual_read_index;
......
...@@ -89,6 +89,24 @@ class OscillatorHandler final : public AudioScheduledSourceHandler { ...@@ -89,6 +89,24 @@ class OscillatorHandler final : public AudioScheduledSourceHandler {
// Compute the output for k-rate AudioParams // Compute the output for k-rate AudioParams
double ProcessKRate(int n, float* dest_p, double virtual_read_index) const; double ProcessKRate(int n, float* dest_p, double virtual_read_index) const;
// Scalar version for the main loop in ProcessKRate(). Returns the updated
// virtual_read_index.
double ProcessKRateScalar(int start_index,
int n,
float* dest_p,
double virtual_read_index,
float frequency,
float rate_scale) const;
// Vectorized version (if available) for the main loop in ProcessKRate().
// Returns the number of elements processed and the updated
// virtual_read_index.
std::tuple<int, double> ProcessKRateVector(int n,
float* dest_p,
double virtual_read_index,
float frequency,
float rate_scale) const;
// Compute the output for a-rate AudioParams // Compute the output for a-rate AudioParams
double ProcessARate(int n, double ProcessARate(int n,
float* dest_p, float* dest_p,
......
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
testFFTSize(should, { testFFTSize(should, {
initialFFTSize: 128, initialFFTSize: 128,
finalFFTSize: 1024, finalFFTSize: 1024,
errorThreshold: {relativeThreshold: 1.9238e-6} errorThreshold: {relativeThreshold: 1.9455e-6}
}).then(() => task.done()); }).then(() => task.done());
}); });
...@@ -31,7 +31,7 @@ ...@@ -31,7 +31,7 @@
testFFTSize(should, { testFFTSize(should, {
initialFFTSize: 512, initialFFTSize: 512,
finalFFTSize: 256, finalFFTSize: 256,
errorThreshold: {relativeThreshold: 1.8166e-6} errorThreshold: {relativeThreshold: 1.8592e-6}
}).then(() => task.done()); }).then(() => task.done());
}); });
......
...@@ -57,7 +57,7 @@ ...@@ -57,7 +57,7 @@
frequency1: frequency1, frequency1: frequency1,
paramName: 'detune', paramName: 'detune',
newValue: detune, newValue: detune,
thresholds: [1.1921e-7, 1.3114e-6], thresholds: [1.1921e-7, 3.6657e-6],
}).then(() => task.done()); }).then(() => task.done());
}); });
......
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
// Minimum allowed SNR between the actual oscillator and the expected // Minimum allowed SNR between the actual oscillator and the expected
// result. Experimentally determined. // result. Experimentally determined.
let snrThreshold = 59.287; let snrThreshold = 59.280;
// Max absolute difference between actual and expected oscillator // Max absolute difference between actual and expected oscillator
// outputs. Experimentally determined. // outputs. Experimentally determined.
let maxDiffThreshold = 1.5632e-3; let maxDiffThreshold = 1.5640e-3;
let audit = Audit.createTaskRunner(); let audit = Audit.createTaskRunner();
......
...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
runTest(should, { runTest(should, {
message: 'Sum of positive and negative frequency sine oscillators', message: 'Sum of positive and negative frequency sine oscillators',
type: 'sine', type: 'sine',
threshold: 4.1724e-7 threshold: 1.5349e-6
}).then(() => task.done()); }).then(() => task.done());
}); });
...@@ -35,7 +35,7 @@ ...@@ -35,7 +35,7 @@
runTest(should, { runTest(should, {
message: 'Sum of positive and negative frequency square oscillators', message: 'Sum of positive and negative frequency square oscillators',
type: 'square', type: 'square',
threshold: 1.4753e-6 threshold: 1.1496e-5
}).then(() => task.done()); }).then(() => task.done());
}); });
...@@ -44,7 +44,7 @@ ...@@ -44,7 +44,7 @@
message: message:
'Sum of positive and negative frequency sawtooth oscillators', 'Sum of positive and negative frequency sawtooth oscillators',
type: 'sawtooth', type: 'sawtooth',
threshold: 1.4753e-6 threshold: 1.1506e-5
}).then(() => task.done()); }).then(() => task.done());
}); });
...@@ -53,7 +53,7 @@ ...@@ -53,7 +53,7 @@
message: message:
'Sum of positive and negative frequency triangle oscillators', 'Sum of positive and negative frequency triangle oscillators',
type: 'triangle', type: 'triangle',
threshold: 2.9803e-7 threshold: 1.0133e-5
}).then(() => task.done()); }).then(() => task.done());
}); });
...@@ -145,7 +145,7 @@ ...@@ -145,7 +145,7 @@
should( should(
actual, actual,
'Sum of positive and negative frequency custom oscillators') 'Sum of positive and negative frequency custom oscillators')
.beCloseToArray(expected, {absoluteThreshold: 4.1724e-7}); .beCloseToArray(expected, {absoluteThreshold: 2.2352e-6});
}) })
.then(() => task.done()); .then(() => task.done());
}); });
......
...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
}, },
function(task, should) { function(task, should) {
testStartSampling(should, 1.25, { testStartSampling(should, 1.25, {
error: 1.0843e-4, error: 1.0880e-4,
snrThreshold: 84.054 snrThreshold: 84.054
}).then(task.done.bind(task)); }).then(task.done.bind(task));
}); });
...@@ -39,7 +39,7 @@ ...@@ -39,7 +39,7 @@
}, },
function(task, should) { function(task, should) {
testStartSampling(should, 1.75, { testStartSampling(should, 1.75, {
error: 1.0839e-4, error: 1.0844e-4,
snrThreshold: 84.056 snrThreshold: 84.056
}).then(task.done.bind(task)); }).then(task.done.bind(task));
}); });
...@@ -57,8 +57,8 @@ ...@@ -57,8 +57,8 @@
// 1/2 frame from the preceding sampling frame. This tests one path // 1/2 frame from the preceding sampling frame. This tests one path
// of the internal implementation. // of the internal implementation.
testStartWithGain(should, defaultSampleRate, { testStartWithGain(should, defaultSampleRate, {
error: 4.1724e-7, error: 1.9521e-6,
snrThreshold: 137.536 snrThreshold: 128.12
}).then(task.done.bind(task)); }).then(task.done.bind(task));
}); });
...@@ -75,8 +75,8 @@ ...@@ -75,8 +75,8 @@
// frame from the preceding sampling frame. This tests one path of // frame from the preceding sampling frame. This tests one path of
// the internal implementation. // the internal implementation.
testStartWithGain(should, 48000, { testStartWithGain(should, 48000, {
error: 4.1724e-7, error: 1.9521e-6,
snrThreshold: 137.536 snrThreshold: 122.92
}).then(task.done.bind(task)); }).then(task.done.bind(task));
}); });
......
...@@ -134,14 +134,14 @@ ...@@ -134,14 +134,14 @@
verifyPeriodicWaveOutput( verifyPeriodicWaveOutput(
should, {real: [0, 1], imag: [0, 1], disableNormalization: false}, should, {real: [0, 1], imag: [0, 1], disableNormalization: false},
generateReference(x => Math.SQRT1_2 * (Math.sin(x) + Math.cos(x))), generateReference(x => Math.SQRT1_2 * (Math.sin(x) + Math.cos(x))),
2.7165e-5) 2.7225e-5)
.then(() => task.done()); .then(() => task.done());
}); });
audit.define('3: real/imag periodicwave test', (task, should) => { audit.define('3: real/imag periodicwave test', (task, should) => {
verifyPeriodicWaveOutput( verifyPeriodicWaveOutput(
should, {real: [0, 1], imag: [0, 1], disableNormalization: true}, should, {real: [0, 1], imag: [0, 1], disableNormalization: true},
generateReference(x => Math.sin(x) + Math.cos(x)), 3.8416e-5) generateReference(x => Math.sin(x) + Math.cos(x)), 3.8501e-5)
.then(() => task.done()); .then(() => task.done());
}); });
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment