Commit 7d8ffaab authored by Raymond Toy's avatar Raymond Toy Committed by Commit Bot

SSE2 optimization for a-rate Oscillator

Add SSE2 optimization for the main processing loop for the a-rate
Oscillator and for the computation in WaveDataForFundamentalFrequency.
Profiling showed WaveDataForFundamentalFrequency was taking 27% of the
total time.  The optimized version takes about half as long by computing
4 results at a time.

The failed test is caused by changing the linear interpolation formula
from (1-f)*x0 + f*x1 to the mathematically equivalent x0+f*(x1-x0).
However, this isn't exactly the same in floating-point.  The updated
formula has one less operation so it's a useful speed up.

Using a float virtual_read_index in the SIMD loop causes significant
loss of precision (50 dB or less in the osc sweep tests instead of
100+ dB), so we need to do a slightly more complex SIMD version with
double virtual_read_index.

WebAudio bench results without this CL:

TEST	μs	MIN	Q1	MEDIAN	Q3	MAX	MEAN	STDDEV
Oscillator.frequency-linear-a-rate	1884	1884	1948	1969	1997	2598	1987.72	94.13

With this CL:
TEST	μs	MIN	Q1	MEDIAN	Q3	MAX	MEAN	STDDEV
Oscillator.frequency-linear-a-rate	1348	1348	1397	1418	1453	2028	1474.74	151.59

We see about a 25% improvement in speed (based on the mean).

Bug: 1013118
Change-Id: I818834c64c3529b26467dbaa7623354f92b47b2a
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2264204Reviewed-by: default avatarHongchan Choi <hongchan@chromium.org>
Reviewed-by: default avatarDale Curtis <dalecurtis@chromium.org>
Commit-Queue: Raymond Toy <rtoy@chromium.org>
Cr-Commit-Position: refs/heads/master@{#789097}
parent dcdc2951
......@@ -154,4 +154,127 @@ std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
return std::make_tuple(k, virtual_read_index);
}
static __m128 WrapVirtualIndexVectorPd(__m128d x,
__m128d wave_size,
__m128d inv_wave_size) {
// Wrap the virtual index |x| to the range 0 to wave_size - 1. This is done
// by computing x - floor(x/wave_size)*wave_size.
//
// But there's no SSE2 SIMD instruction for this, so we do it the following
// way.
// f = truncate(x/wave_size), truncating towards 0.
const __m128d r = _mm_mul_pd(x, inv_wave_size);
__m128i f = _mm_cvttpd_epi32(r);
// Note that if r >= 0, then f <= r. But if r < 0, then r <= f, with equality
// only if r is already an integer. Hence if r < f, we want to subtract 1
// from f to get floor(r).
// cmplt(a,b) returns 0xffffffffffffffff (-1) if a < b and 0 if not. So cmp
// is -1 or 0 depending on whether r < f, which is what we need to compute
// floor(r).
__m128d cmp = _mm_cmplt_pd(r, _mm_cvtepi32_pd(f));
// Take the low 32 bits of each 64-bit result and move them into the two
// lowest 32-bit fields.
cmp = _mm_shuffle_epi32(cmp, (2 << 2) | 0);
// This subtracts 1 if needed to get floor(r).
f = _mm_add_epi32(f, cmp);
// Convert back to float, and scale by wave_size. And finally subtract that
// from x.
return _mm_sub_pd(x, _mm_mul_pd(_mm_cvtepi32_pd(f), wave_size));
}
double OscillatorHandler::ProcessARateVectorKernel(
float* dest_p,
double virtual_read_index,
const float* phase_increments,
unsigned periodic_wave_size,
const float* const lower_wave_data[4],
const float* const higher_wave_data[4],
const float table_interpolation_factor[4]) const {
// See the scalar version in oscillator_node.cc for the basic algorithm.
double inv_periodic_wave_size = 1.0 / periodic_wave_size;
unsigned read_index_mask = periodic_wave_size - 1;
// Accumulate the phase increments so we can set up the virtual read index
// vector appropriately. This must be a double to preserve accuracy and
// to match the scalar version.
double incr_sum[4];
incr_sum[0] = phase_increments[0];
for (int m = 1; m < 4; ++m) {
incr_sum[m] = incr_sum[m - 1] + phase_increments[m];
}
// It's really important for accuracy that we use doubles instead of
// floats for the virtual_read_index. Without this, we can only get some
// 30-50 dB in the sweep tests instead of 100+ dB.
__m128d v_read_index_hi = _mm_set_pd(virtual_read_index + incr_sum[2],
virtual_read_index + incr_sum[1]);
__m128d v_read_index_lo =
_mm_set_pd(virtual_read_index + incr_sum[0], virtual_read_index);
v_read_index_hi =
WrapVirtualIndexVectorPd(v_read_index_hi, _mm_set1_pd(periodic_wave_size),
_mm_set1_pd(inv_periodic_wave_size));
v_read_index_lo =
WrapVirtualIndexVectorPd(v_read_index_lo, _mm_set1_pd(periodic_wave_size),
_mm_set1_pd(inv_periodic_wave_size));
// Convert the virtual read index (parts) to an integer, and carefully
// merge them into one vector.
const __m128i v_read0 = _mm_movelh_ps(_mm_cvttpd_epi32(v_read_index_lo),
_mm_cvttpd_epi32(v_read_index_hi));
// Get index to next element being sure to wrap the index around if needed.
const __m128i v_read1 =
_mm_and_si128(_mm_add_epi32(v_read0, _mm_set1_epi32(1)),
_mm_set1_epi32(read_index_mask));
float sample1_lower[4] __attribute__((aligned(16)));
float sample2_lower[4] __attribute__((aligned(16)));
float sample1_higher[4] __attribute__((aligned(16)));
float sample2_higher[4] __attribute__((aligned(16)));
const unsigned* read0 = reinterpret_cast<const unsigned*>(&v_read0);
const unsigned* read1 = reinterpret_cast<const unsigned*>(&v_read1);
for (int m = 0; m < 4; ++m) {
DCHECK_LT(read0[m], periodic_wave_size);
DCHECK_LT(read1[m], periodic_wave_size);
sample1_lower[m] = lower_wave_data[m][read0[m]];
sample2_lower[m] = lower_wave_data[m][read1[m]];
sample1_higher[m] = higher_wave_data[m][read0[m]];
sample2_higher[m] = higher_wave_data[m][read1[m]];
}
const __m128 v_factor =
_mm_sub_ps(_mm_movelh_ps(_mm_cvtpd_ps(v_read_index_lo),
_mm_cvtpd_ps(v_read_index_hi)),
_mm_cvtepi32_ps(v_read0));
const __m128 sample_higher =
_mm_add_ps(_mm_load_ps(sample1_higher),
_mm_mul_ps(v_factor, _mm_sub_ps(_mm_load_ps(sample2_higher),
_mm_load_ps(sample1_higher))));
const __m128 sample_lower =
_mm_add_ps(_mm_load_ps(sample1_lower),
_mm_mul_ps(v_factor, _mm_sub_ps(_mm_load_ps(sample2_lower),
_mm_load_ps(sample1_lower))));
const __m128 sample = _mm_add_ps(
sample_higher, _mm_mul_ps(_mm_load_ps(table_interpolation_factor),
_mm_sub_ps(sample_lower, sample_higher)));
_mm_storeu_ps(dest_p, sample);
virtual_read_index += incr_sum[3];
virtual_read_index -=
floor(virtual_read_index * inv_periodic_wave_size) * periodic_wave_size;
return virtual_read_index;
}
} // namespace blink
......@@ -358,7 +358,6 @@ static float DoInterpolation(double virtual_read_index,
}
#if !(defined(ARCH_CPU_X86_FAMILY) || defined(CPU_ARM_NEON))
// Vector operations not supported, so there's nothing to do except return 0 and
// virtual_read_index. The scalar version will do the necessary processing.
std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
......@@ -372,6 +371,65 @@ std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
}
#endif
#if !defined(ARCH_CPU_X86_FAMILY)
double OscillatorHandler::ProcessARateVectorKernel(
float* dest_p,
double virtual_read_index,
const float* phase_increments,
unsigned periodic_wave_size,
const float* const lower_wave_data[4],
const float* const higher_wave_data[4],
const float table_interpolation_factor[4]) const {
double inv_periodic_wave_size = 1.0 / periodic_wave_size;
unsigned read_index_mask = periodic_wave_size - 1;
for (int m = 0; m < 4; ++m) {
unsigned read_index_0 = static_cast<unsigned>(virtual_read_index);
// Increment is fairly large, so we're doing no more than about 3
// points between each wave table entry. Assume linear
// interpolation between points is good enough.
unsigned read_index2 = read_index_0 + 1;
// Contain within valid range.
read_index_0 = read_index_0 & read_index_mask;
read_index2 = read_index2 & read_index_mask;
float sample1_lower = lower_wave_data[m][read_index_0];
float sample2_lower = lower_wave_data[m][read_index2];
float sample1_higher = higher_wave_data[m][read_index_0];
float sample2_higher = higher_wave_data[m][read_index2];
// Linearly interpolate within each table (lower and higher).
double interpolation_factor =
static_cast<float>(virtual_read_index) - read_index_0;
// Doing linear interpolation via x0 + f*(x1-x0) gives slightly
// different results from (1-f)*x0 + f*x1, but requires fewer
// operations. This causes a very slight decrease in SNR (< 0.05 dB) in
// oscillator sweep tests.
float sample_higher =
sample1_higher +
interpolation_factor * (sample2_higher - sample1_higher);
float sample_lower =
sample1_lower + interpolation_factor * (sample2_lower - sample1_lower);
// Then interpolate between the two tables.
float sample = sample_higher + table_interpolation_factor[m] *
(sample_lower - sample_higher);
dest_p[m] = sample;
// Increment virtual read index and wrap virtualReadIndex into the range
// 0 -> periodicWaveSize.
virtual_read_index += phase_increments[m];
virtual_read_index -=
floor(virtual_read_index * inv_periodic_wave_size) * periodic_wave_size;
}
return virtual_read_index;
}
#endif
double OscillatorHandler::ProcessKRateScalar(int start,
int n,
float* dest_p,
......@@ -488,10 +546,75 @@ double OscillatorHandler::ProcessKRate(int n,
return virtual_read_index;
}
double OscillatorHandler::ProcessARate(int n,
float* dest_p,
std::tuple<int, double> OscillatorHandler::ProcessARateVector(
int n,
float* destination,
double virtual_read_index,
float* phase_increments) const {
const float* phase_increments) const {
float rate_scale = periodic_wave_->RateScale();
float inv_rate_scale = 1 / rate_scale;
unsigned periodic_wave_size = periodic_wave_->PeriodicWaveSize();
double inv_periodic_wave_size = 1.0 / periodic_wave_size;
unsigned read_index_mask = periodic_wave_size - 1;
float* higher_wave_data[4];
float* lower_wave_data[4];
float table_interpolation_factor[4] __attribute__((aligned(16)));
int k = 0;
int n_loops = n / 4;
for (int loop = 0; loop < n_loops; ++loop, k += 4) {
bool is_big_increment = true;
float frequency[4];
for (int m = 0; m < 4; ++m) {
float phase_incr = phase_increments[k + m];
is_big_increment =
is_big_increment && (fabs(phase_incr) >= kInterpolate2Point);
frequency[m] = inv_rate_scale * phase_incr;
}
periodic_wave_->WaveDataForFundamentalFrequency(frequency, lower_wave_data,
higher_wave_data,
table_interpolation_factor);
// If all the phase increments are large enough, we can use linear
// interpolation with a possibly vectorized implementation. If not, we need
// to call DoInterpolation to handle it correctly.
if (is_big_increment) {
virtual_read_index = ProcessARateVectorKernel(
destination + k, virtual_read_index, phase_increments + k,
periodic_wave_size, lower_wave_data, higher_wave_data,
table_interpolation_factor);
} else {
for (int m = 0; m < 4; ++m) {
float sample =
DoInterpolation(virtual_read_index, fabs(phase_increments[k + m]),
read_index_mask, table_interpolation_factor[m],
lower_wave_data[m], higher_wave_data[m]);
destination[k + m] = sample;
// Increment virtual read index and wrap virtualReadIndex into the range
// 0 -> periodicWaveSize.
virtual_read_index += phase_increments[k + m];
virtual_read_index -=
floor(virtual_read_index * inv_periodic_wave_size) *
periodic_wave_size;
}
}
}
return std::make_tuple(k, virtual_read_index);
}
double OscillatorHandler::ProcessARateScalar(
int k,
int n,
float* destination,
double virtual_read_index,
const float* phase_increments) const {
float rate_scale = periodic_wave_->RateScale();
float inv_rate_scale = 1 / rate_scale;
unsigned periodic_wave_size = periodic_wave_->PeriodicWaveSize();
......@@ -502,8 +625,8 @@ double OscillatorHandler::ProcessARate(int n,
float* lower_wave_data = nullptr;
float table_interpolation_factor = 0;
for (int k = 0; k < n; ++k) {
float incr = *phase_increments++;
for (int m = k; m < n; ++m) {
float incr = phase_increments[m];
float frequency = inv_rate_scale * incr;
periodic_wave_->WaveDataForFundamentalFrequency(frequency, lower_wave_data,
......@@ -514,7 +637,7 @@ double OscillatorHandler::ProcessARate(int n,
read_index_mask, table_interpolation_factor,
lower_wave_data, higher_wave_data);
*dest_p++ = sample;
destination[m] = sample;
// Increment virtual read index and wrap virtualReadIndex into the range
// 0 -> periodicWaveSize.
......@@ -526,6 +649,21 @@ double OscillatorHandler::ProcessARate(int n,
return virtual_read_index;
}
double OscillatorHandler::ProcessARate(int n,
float* destination,
double virtual_read_index,
float* phase_increments) const {
int frames_processed = 0;
std::tie(frames_processed, virtual_read_index) =
ProcessARateVector(n, destination, virtual_read_index, phase_increments);
virtual_read_index = ProcessARateScalar(frames_processed, n, destination,
virtual_read_index, phase_increments);
return virtual_read_index;
}
void OscillatorHandler::Process(uint32_t frames_to_process) {
AudioBus* output_bus = Output(0).Bus();
......
......@@ -118,6 +118,63 @@ class OscillatorHandler final : public AudioScheduledSourceHandler {
double virtual_read_index,
float* phase_increments) const;
// Scalar version of ProcessARate(). Also handles any values not handled by
// the vector version.
//
// k
// start index for where to write the result (and read phase_increments)
// n
// total number of frames to process
// destination
// Array where the samples values are written
// virtual_read_index
// index into the wave data tables containing the waveform
// phase_increments
// phase change to use for each frame of output
//
// Returns the updated virtual_read_index.
double ProcessARateScalar(int k,
int n,
float* destination,
double virtual_read_index,
const float* phase_increments) const;
// Vector version of ProcessARate(). Returns the number of frames processed
// and the update virtual_read_index.
std::tuple<int, double> ProcessARateVector(
int n,
float* destination,
double virtual_read_index,
const float* phase_increments) const;
// Handles the linear interpolation in ProcessARateVector().
//
// destination
// Where the interpolated values are written.
// virtual_read_index
// index into the wave table data
// phase_increments
// phase increments array
// periodic_wave_size
// Length of the periodic wave stored in the wave tables
// lower_wave_data
// Array of the 4 lower wave table arrays
// higher_wave_data
// Array of the 4 higher wave table arrays
// table_interpolation_factor
// Array of linear interpolation factors to use between the lower and
// higher wave tables.
//
// Returns the updated virtual_read_index
double ProcessARateVectorKernel(
float* destination,
double virtual_read_index,
const float* phase_increments,
unsigned periodic_wave_size,
const float* const lower_wave_data[4],
const float* const higher_wave_data[4],
const float table_interpolation_factor[4]) const;
// One of the waveform types defined in the enum.
uint8_t type_;
......
......@@ -29,6 +29,7 @@
#include <algorithm>
#include <memory>
#include "build/build_config.h"
#include "third_party/blink/renderer/bindings/modules/v8/v8_periodic_wave_options.h"
#include "third_party/blink/renderer/modules/webaudio/base_audio_context.h"
#include "third_party/blink/renderer/modules/webaudio/oscillator_node.h"
......@@ -198,6 +199,90 @@ void PeriodicWave::WaveDataForFundamentalFrequency(
table_interpolation_factor = pitch_range - range_index1;
}
#if defined(ARCH_CPU_X86_FAMILY)
void PeriodicWave::WaveDataForFundamentalFrequency(
const float fundamental_frequency[4],
float* lower_wave_data[4],
float* higher_wave_data[4],
float table_interpolation_factor[4]) {
// Negative frequencies are allowed, in which case we alias to the positive
// frequency. SSE2 doesn't have an fabs instruction, so just remove the sign
// bit of the float numbers, effecitvely taking the absolute value.
const __m128 frequency = _mm_and_ps(_mm_loadu_ps(fundamental_frequency),
_mm_set1_epi32(0x7fffffff));
// pos = 0xffffffff if freq > 0; otherwise 0
const __m128i pos = _mm_cmpgt_ps(frequency, _mm_set1_ps(0));
// Calculate the pitch range.
__m128 v_ratio =
_mm_div_ps(frequency, _mm_set1_ps(lowest_fundamental_frequency_));
// Set v_ratio to 0 if freq <= 0; otherwise keep the ratio.
v_ratio = _mm_and_ps(v_ratio, pos);
// If pos = 0, set value to 0.5 and 0 otherwise. Or this into v_ratio so that
// v_ratio is 0.5 if freq <= 0. Otherwise preserve v_ratio.
v_ratio = _mm_or_ps(v_ratio, _mm_andnot_ps(pos, _mm_set1_ps(0.5)));
const float* ratio = reinterpret_cast<float*>(&v_ratio);
float cents_above_lowest_frequency[4] __attribute__((aligned(16)));
for (int k = 0; k < 4; ++k) {
cents_above_lowest_frequency[k] = log2f(ratio[k]) * 1200;
}
__m128 v_pitch_range = _mm_add_ps(
_mm_set1_ps(1.0), _mm_div_ps(_mm_load_ps(cents_above_lowest_frequency),
_mm_set1_ps((cents_per_range_))));
v_pitch_range = _mm_max_ps(v_pitch_range, _mm_set1_ps(0.0));
v_pitch_range = _mm_min_ps(v_pitch_range, _mm_set1_ps(NumberOfRanges() - 1));
const __m128i v_index1 = _mm_cvttps_epi32(v_pitch_range);
__m128i v_index2 = _mm_add_epi32(v_index1, _mm_set1_epi32(1));
// SSE2 deosn't have _mm_min_epi32 (but SSE4.2 does).
//
// The following ought to work because the small integers for the index and
// number of ranges should look like tiny denormals that should compare in the
// same order as integers. This doesn't work because we have flush-to-zero
// enabled.
//
// __m128i v_range = _mm_set1_epi32(NumberOfRanges() - 1);
// v_index2 = _mm_min_ps(v_index2, v_range);
//
// Instead we convert to float, take the min and convert back. No round off
// because the integers are small.
v_index2 = _mm_cvttps_epi32(
_mm_min_ps(_mm_cvtepi32_ps(v_index2), _mm_set1_ps(NumberOfRanges() - 1)));
const __m128 table_factor =
_mm_sub_ps(v_pitch_range, _mm_cvtepi32_ps(v_index1));
_mm_storeu_ps(table_interpolation_factor, table_factor);
const unsigned* range_index1 = reinterpret_cast<const unsigned*>(&v_index1);
const unsigned* range_index2 = reinterpret_cast<const unsigned*>(&v_index2);
for (int k = 0; k < 4; ++k) {
lower_wave_data[k] = band_limited_tables_[range_index2[k]]->Data();
higher_wave_data[k] = band_limited_tables_[range_index1[k]]->Data();
}
}
#else
void PeriodicWave::WaveDataForFundamentalFrequency(
const float fundamental_frequency[4],
float* lower_wave_data[4],
float* higher_wave_data[4],
float table_interpolation_factor[4]) {
for (int k = 0; k < 4; ++k) {
WaveDataForFundamentalFrequency(fundamental_frequency[k],
lower_wave_data[k], higher_wave_data[k],
table_interpolation_factor[k]);
}
}
#endif
unsigned PeriodicWave::NumberOfPartialsForRange(unsigned range_index) const {
// Number of cents below nyquist where we cull partials.
float cents_to_cull = range_index * cents_per_range_;
......
......@@ -80,6 +80,15 @@ class PeriodicWave final : public ScriptWrappable {
float*& higher_wave_data,
float& table_interpolation_factor);
// Like the above, except we compute accept 4 frequencies at a time and return
// 4 lower/higher wave data tables and the 4 corresponding table interpolation
// factors. Intended for use with the OscillatorNode for faster a-rate
// processing.
void WaveDataForFundamentalFrequency(const float fundamental_frequency[4],
float* lower_wave_data[4],
float* higher_wave_data[4],
float table_interpolation_factor[4]);
// Returns the scalar multiplier to the oscillator frequency to calculate wave
// buffer phase increment.
float RateScale() const { return rate_scale_; }
......
......@@ -22,7 +22,7 @@
1, tester.sampleRate * tester.lengthInSeconds, tester.sampleRate);
// The thresholds are experimentally determined.
tester.setThresholds({snr: 139.991, maxDiff: 2.2650e-6});
tester.setThresholds({snr: 139.98, maxDiff: 2.2650e-6});
tester.runTest(
context, 'custom', 'Custom Oscillator with Exponential Sweep', task,
should);
......
......@@ -22,7 +22,7 @@
1, tester.sampleRate * tester.lengthInSeconds, tester.sampleRate);
// The thresholds are experimentally determined.
tester.setThresholds({snr: 134.39, maxDiff: 1.8925e-6});
tester.setThresholds({snr: 134.34, maxDiff: 1.8925e-6});
tester.runTest(
context, 'sawtooth', 'Sawtooth Oscillator with Exponential Sweep',
task, should);
......
......@@ -22,7 +22,7 @@
1, tester.sampleRate * tester.lengthInSeconds, tester.sampleRate);
// The thresholds are experimentally determined.
tester.setThresholds({snr: 139.98, maxDiff: 2.8313e-6});
tester.setThresholds({snr: 139.88, maxDiff: 2.8313e-6});
tester.runTest(
context, 'triangle', 'Triangle Oscillator with Exponential Sweep ',
task, should);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment