Reland: SIMD the main loop for k-rate Oscillator

The main processing loop for the k-rate Oscillator is updated to use SSE2 instructions to speed up processing. Four tests fail because we have slightly reduced accuracy. Testing shows that the loss of accuracy is due to using a float for virtual_read_index in the loop instead of a double as previously. A implementation using a double passes all the original tests but does show that it does lose performance. I think the change in accuracy is acceptable. WebAudio Bench results from a linux machine. In summary, the SIMD version is about 60% as much time (about 1.75 times faster). Without CL: TEST μs MIN Q1 MEDIAN Q3 MAX MEAN STDDEV Baseline 713 713 748 757 766 877 760.25 18.35 Oscillator 949 949 972 990 1005 1132 990.78 24.35 With CL: TEST μs MIN Q1 MEDIAN Q3 MAX MEAN STDDEV Baseline 728 728 747 756 763 908 757.46 16.81 Oscillator 521 521 549 564 581 874 566.87 27.77 Bug: 1013118 Change-Id: Ic5445cc152e960b41817f245ac306691f7c57f90 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2246831Reviewed-by: Hongchan Choi <hongchan@chromium.org> Commit-Queue: Raymond Toy <rtoy@chromium.org> Cr-Commit-Position: refs/heads/master@{#778579}

Reland: SIMD the main loop for k-rate Oscillator
The main processing loop for the k-rate Oscillator is updated to use SSE2 instructions to speed up processing. Four tests fail because we have slightly reduced accuracy. Testing shows that the loss of accuracy is due to using a float for virtual_read_index in the loop instead of a double as previously. A implementation using a double passes all the original tests but does show that it does lose performance. I think the change in accuracy is acceptable. WebAudio Bench results from a linux machine. In summary, the SIMD version is about 60% as much time (about 1.75 times faster). Without CL: TEST μs MIN Q1 MEDIAN Q3 MAX MEAN STDDEV Baseline 713 713 748 757 766 877 760.25 18.35 Oscillator 949 949 972 990 1005 1132 990.78 24.35 With CL: TEST μs MIN Q1 MEDIAN Q3 MAX MEAN STDDEV Baseline 728 728 747 756 763 908 757.46 16.81 Oscillator 521 521 549 564 581 874 566.87 27.77 Bug: 1013118 Change-Id: Ic5445cc152e960b41817f245ac306691f7c57f90 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2246831Reviewed-by: Hongchan Choi <hongchan@chromium.org> Commit-Queue: Raymond Toy <rtoy@chromium.org> Cr-Commit-Position: refs/heads/master@{#778579}
318d65a3 · Raymond Toy · Commit Bot · e4d6ced5 · 318d65a3 · 318d65a3
Commit 318d65a3 authored Jun 16, 2020 by Raymond Toy Committed by Commit Bot Jun 16, 2020
8 changed files
--- a/third_party/blink/renderer/modules/webaudio/oscillator_node.cc
+++ b/third_party/blink/renderer/modules/webaudio/oscillator_node.cc
@@ -26,6 +26,7 @@
 #include <algorithm>
 #include <limits>
+#include "build/build_config.h"
 #include "third_party/blink/renderer/modules/webaudio/audio_node_output.h"
 #include "third_party/blink/renderer/modules/webaudio/oscillator_node.h"
 #include "third_party/blink/renderer/modules/webaudio/periodic_wave.h"
@@ -37,6 +38,11 @@
 namespace blink {
+// Breakpoints where we deicde to do linear interoplation, 3-point
+// interpolation or 5-point interpolation.  See DoInterpolation().
+constexpr float kInterpolate2Point = 0.3;
+constexpr float kInterpolate3Point = 0.16;
 OscillatorHandler::OscillatorHandler(AudioNode& node,
                                     float sample_rate,
                                     const String& oscillator_type,
@@ -272,7 +278,7 @@ static float DoInterpolation(double virtual_read_index,
  // We use Lagrange interpolation because it's relatively simple to
  // implement and fairly inexpensive, and the interpolator always
  // passes through known points.
-  if (incr >= 0.3) {
+  if (incr >= kInterpolate2Point) {
    // Increment is fairly large, so we're doing no more than about 3
    // points between each wave table entry. Assume linear
    // interpolation between points is good enough.
@@ -295,7 +301,7 @@ static float DoInterpolation(double virtual_read_index,
    sample_lower = (1 - interpolation_factor) * sample1_lower +
                   interpolation_factor * sample2_lower;
-  } else if (incr >= .16) {
+  } else if (incr >= kInterpolate3Point) {
    // We're doing about 6 interpolation values between each wave
    // table sample. Just use a 3-point Lagrange interpolator to get a
    // better estimate than just linear.
@@ -351,41 +357,275 @@ static float DoInterpolation(double virtual_read_index,
  return sample;
 }
+#if defined(ARCH_CPU_X86_FAMILY)
+static __m128 v_wrap_virtual_index(__m128 x,
+                                   __m128 wave_size,
+                                   __m128 inv_wave_size) {
+  // Wrap the virtual index |x| to the range 0 to wave_size - 1.  This is done
+  // by computing x - floor(x/wave_size)*wave_size.
+  //
+  // But there's no SSE2 SIMD instruction for this, so we do it the following
+  // way.
+  // f = truncate(x/wave_size), truncating towards 0.
+  const __m128 r = _mm_mul_ps(x, inv_wave_size);
+  __m128i f = _mm_cvttps_epi32(r);
+  // Note that if r >= 0, then f <= r. But if r < 0, then r <= f, with equality
+  // only if r is already an integer.  Hence if r < f, we want to subtract 1
+  // from f to get floor(r).
+  // cmplt(a,b) returns 0xffffffff (-1) if a < b and 0 if not.  So cmp is -1 or
+  // 0 depending on whether r < f, which is what we need to compute floor(r).
+  const __m128i cmp = _mm_cmplt_ps(r, _mm_cvtepi32_ps(f));
+  // This subtracts 1 if needed to get floor(r).
+  f = _mm_add_epi32(f, cmp);
+  // Convert back to float, and scale by wave_size.  And finally subtract that
+  // from x.
+  return _mm_sub_ps(x, _mm_mul_ps(_mm_cvtepi32_ps(f), wave_size));
+}
+std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
+    int n,
+    float* dest_p,
+    double virtual_read_index,
+    float frequency,
+    float rate_scale) const {
+  const unsigned periodic_wave_size = periodic_wave_->PeriodicWaveSize();
+  const double inv_periodic_wave_size = 1.0 / periodic_wave_size;
+  float* higher_wave_data = nullptr;
+  float* lower_wave_data = nullptr;
+  float table_interpolation_factor = 0;
+  float incr = frequency * rate_scale;
+  DCHECK_GE(incr, kInterpolate2Point);
+  periodic_wave_->WaveDataForFundamentalFrequency(
+      frequency, lower_wave_data, higher_wave_data, table_interpolation_factor);
+  const __m128 v_wave_size = _mm_set1_ps(periodic_wave_size);
+  const __m128 v_inv_wave_size = _mm_set1_ps(1.0f / periodic_wave_size);
+  // Mask to use to wrap the read indices to the proper range.
+  const __m128i v_read_mask = _mm_set1_epi32(periodic_wave_size - 1);
+  const __m128i one = _mm_set1_epi32(1);
+  const __m128 v_table_factor = _mm_set1_ps(table_interpolation_factor);
+  // The loop processes 4 items at a time, so we need to increment the
+  // virtual index by 4*incr each time.
+  const __m128 v_incr = _mm_set1_ps(4 * incr);
+  // The virtual index vector.  Ideally, to preserve accuracy, we should use
+  // (two) packed double vectors for this, but that degrades performance quite a
+  // bit.
+  __m128 v_virt_index =
+      _mm_set_ps(virtual_read_index + 3 * incr, virtual_read_index + 2 * incr,
+                 virtual_read_index + incr, virtual_read_index);
+  // It's possible that adding the incr above exceeded the bounds, so wrap them
+  // if needed.
+  v_virt_index =
+      v_wrap_virtual_index(v_virt_index, v_wave_size, v_inv_wave_size);
+  // Temporary arrays where we can gather up the wave data we need for
+  // interpolation.  Align these for best efficiency on older CPUs where aligned
+  // access is much faster than unaliged.  TODO(rtoy): Is there a faster way to
+  // do this?
+  float sample1_lower[4] __attribute__((aligned(16)));
+  float sample2_lower[4] __attribute__((aligned(16)));
+  float sample1_higher[4] __attribute__((aligned(16)));
+  float sample2_higher[4] __attribute__((aligned(16)));
+  int k = 0;
+  int n_loops = n / 4;
+  for (int loop = 0; loop < n_loops; ++loop, k += 4) {
+    // Compute indices for the samples and contain within the valid range.
+    const __m128i read_index_0 =
+        _mm_and_si128(_mm_cvttps_epi32(v_virt_index), v_read_mask);
+    const __m128i read_index_1 =
+        _mm_and_si128(_mm_add_epi32(read_index_0, one), v_read_mask);
+    // Extract the components of the indices so we can get the samples
+    // associated with the lower and higher wave data.
+    const uint32_t* r0 = reinterpret_cast<const uint32_t*>(&read_index_0);
+    const uint32_t* r1 = reinterpret_cast<const uint32_t*>(&read_index_1);
+    // Get the samples from the wave tables and save them in work arrays so we
+    // can load them into simd registers.
+    for (int m = 0; m < 4; ++m) {
+      sample1_lower[m] = lower_wave_data[r0[m]];
+      sample2_lower[m] = lower_wave_data[r1[m]];
+      sample1_higher[m] = higher_wave_data[r0[m]];
+      sample2_higher[m] = higher_wave_data[r1[m]];
+    }
+    const __m128 s1_low = _mm_load_ps(sample1_lower);
+    const __m128 s2_low = _mm_load_ps(sample2_lower);
+    const __m128 s1_high = _mm_load_ps(sample1_higher);
+    const __m128 s2_high = _mm_load_ps(sample2_higher);
+    // Linearly interpolate within each table (lower and higher).
+    const __m128 interpolation_factor =
+        _mm_sub_ps(v_virt_index, _mm_cvtepi32_ps(read_index_0));
+    const __m128 sample_higher = _mm_add_ps(
+        s1_high,
+        _mm_mul_ps(interpolation_factor, _mm_sub_ps(s2_high, s1_high)));
+    const __m128 sample_lower = _mm_add_ps(
+        s1_low, _mm_mul_ps(interpolation_factor, _mm_sub_ps(s2_low, s1_low)));
+    // Then interpolate between the two tables.
+    const __m128 sample = _mm_add_ps(
+        sample_higher,
+        _mm_mul_ps(v_table_factor, _mm_sub_ps(sample_lower, sample_higher)));
+    // WARNING: dest_p may not be aligned!
+    _mm_storeu_ps(dest_p + k, sample);
+    // Increment virtual read index and wrap virtualReadIndex into the range
+    // 0 -> periodicWaveSize.
+    v_virt_index = _mm_add_ps(v_virt_index, v_incr);
+    v_virt_index =
+        v_wrap_virtual_index(v_virt_index, v_wave_size, v_inv_wave_size);
+  }
+  // There's a bit of round-off above, so update the index more accurately so at
+  // least the next render starts over with a more accurate value.
+  virtual_read_index += k * incr;
+  virtual_read_index -=
+      floor(virtual_read_index * inv_periodic_wave_size) * periodic_wave_size;
+  return std::make_tuple(k, virtual_read_index);
+}
+#else
+// Vector operations not supported, so there's nothing to do except return 0 and
+// virtual_read_index.  The scalar version will do the necessary processing.
+std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
+    int n,
+    float* dest_p,
+    double virtual_read_index,
+    float frequency,
+    float rate_scale) const {
+  DCHECK_GE(frequency * rate_scale, kInterpolate2Point);
+  return std::make_tuple(0, virtual_read_index);
+}
+#endif
+double OscillatorHandler::ProcessKRateScalar(int start,
+                                             int n,
+                                             float* dest_p,
+                                             double virtual_read_index,
+                                             float frequency,
+                                             float rate_scale) const {
+  const unsigned periodic_wave_size = periodic_wave_->PeriodicWaveSize();
+  const double inv_periodic_wave_size = 1.0 / periodic_wave_size;
+  const unsigned read_index_mask = periodic_wave_size - 1;
+  float* higher_wave_data = nullptr;
+  float* lower_wave_data = nullptr;
+  float table_interpolation_factor = 0;
+  periodic_wave_->WaveDataForFundamentalFrequency(
+      frequency, lower_wave_data, higher_wave_data, table_interpolation_factor);
+  const float incr = frequency * rate_scale;
+  DCHECK_GE(incr, kInterpolate2Point);
+  for (int k = start; k < n; ++k) {
+    // Get indices for the current and next sample, and contain them within the
+    // valid range
+    const unsigned read_index_0 =
+        static_cast<unsigned>(virtual_read_index) & read_index_mask;
+    const unsigned read_index_1 = (read_index_0 + 1) & read_index_mask;
+    const float sample1_lower = lower_wave_data[read_index_0];
+    const float sample2_lower = lower_wave_data[read_index_1];
+    const float sample1_higher = higher_wave_data[read_index_0];
+    const float sample2_higher = higher_wave_data[read_index_1];
+    // Linearly interpolate within each table (lower and higher).
+    const float interpolation_factor =
+        static_cast<float>(virtual_read_index) - read_index_0;
+    const float sample_higher =
+        sample1_higher +
+        interpolation_factor * (sample2_higher - sample1_higher);
+    const float sample_lower =
+        sample1_lower + interpolation_factor * (sample2_lower - sample1_lower);
+    // Then interpolate between the two tables.
+    const float sample = sample_higher + table_interpolation_factor *
+                                             (sample_lower - sample_higher);
+    dest_p[k] = sample;
+    // Increment virtual read index and wrap virtualReadIndex into the range
+    // 0 -> periodicWaveSize.
+    virtual_read_index += incr;
+    virtual_read_index -=
+        floor(virtual_read_index * inv_periodic_wave_size) * periodic_wave_size;
+  }
+  return virtual_read_index;
+}
 double OscillatorHandler::ProcessKRate(int n,
                                       float* dest_p,
                                       double virtual_read_index) const {
-  unsigned periodic_wave_size = periodic_wave_->PeriodicWaveSize();
+  const unsigned periodic_wave_size = periodic_wave_->PeriodicWaveSize();
-  double inv_periodic_wave_size = 1.0 / periodic_wave_size;
+  const double inv_periodic_wave_size = 1.0 / periodic_wave_size;
-  unsigned read_index_mask = periodic_wave_size - 1;
+  const unsigned read_index_mask = periodic_wave_size - 1;
-  float frequency = 0;
  float* higher_wave_data = nullptr;
  float* lower_wave_data = nullptr;
  float table_interpolation_factor = 0;
-  frequency = frequency_->FinalValue();
+  float frequency = frequency_->FinalValue();
-  float detune = detune_->FinalValue();
+  const float detune_scale = DetuneToFrequencyMultiplier(detune_->FinalValue());
-  float detune_scale = DetuneToFrequencyMultiplier(detune);
  frequency *= detune_scale;
  ClampFrequency(&frequency, 1, Context()->sampleRate() / 2);
  periodic_wave_->WaveDataForFundamentalFrequency(
      frequency, lower_wave_data, higher_wave_data, table_interpolation_factor);
-  float rate_scale = periodic_wave_->RateScale();
+  const float rate_scale = periodic_wave_->RateScale();
-  float incr = frequency * rate_scale;
+  const float incr = frequency * rate_scale;
-  for (int k = 0; k < n; ++k) {
+  if (incr >= kInterpolate2Point) {
-    float sample = DoInterpolation(virtual_read_index, fabs(incr),
+    int k;
-                                   read_index_mask, table_interpolation_factor,
+    double v_index = virtual_read_index;
-                                   lower_wave_data, higher_wave_data);
-    *dest_p++ = sample;
+    std::tie(k, v_index) =
+        ProcessKRateVector(n, dest_p, v_index, frequency, rate_scale);
-    // Increment virtual read index and wrap virtualReadIndex into the range
+    if (k < n) {
-    // 0 -> periodicWaveSize.
+      // In typical cases, this won't be run because the number of frames is 128
-    virtual_read_index += incr;
+      // so the vector version will process all the samples.
+      v_index =
+          ProcessKRateScalar(k, n, dest_p, v_index, frequency, rate_scale);
+    }
+    // Recompute to reduce round-off introduced when processing the samples
+    // above.
+    virtual_read_index += n * incr;
    virtual_read_index -=
        floor(virtual_read_index * inv_periodic_wave_size) * periodic_wave_size;
+  } else {
+    for (int k = 0; k < n; ++k) {
+      float sample = DoInterpolation(
+          virtual_read_index, fabs(incr), read_index_mask,
+          table_interpolation_factor, lower_wave_data, higher_wave_data);
+      *dest_p++ = sample;
+      // Increment virtual read index and wrap virtualReadIndex into the range
+      // 0 -> periodicWaveSize.
+      virtual_read_index += incr;
+      virtual_read_index -= floor(virtual_read_index * inv_periodic_wave_size) *
+                            periodic_wave_size;
+    }
  }
  return virtual_read_index;

--- a/third_party/blink/renderer/modules/webaudio/oscillator_node.h
+++ b/third_party/blink/renderer/modules/webaudio/oscillator_node.h
@@ -89,6 +89,24 @@ class OscillatorHandler final : public AudioScheduledSourceHandler {
  // Compute the output for k-rate AudioParams
  double ProcessKRate(int n, float* dest_p, double virtual_read_index) const;
+  // Scalar version for the main loop in ProcessKRate().  Returns the updated
+  // virtual_read_index.
+  double ProcessKRateScalar(int start_index,
+                            int n,
+                            float* dest_p,
+                            double virtual_read_index,
+                            float frequency,
+                            float rate_scale) const;
+  // Vectorized version (if available) for the main loop in ProcessKRate().
+  // Returns the number of elements processed and the updated
+  // virtual_read_index.
+  std::tuple<int, double> ProcessKRateVector(int n,
+                                             float* dest_p,
+                                             double virtual_read_index,
+                                             float frequency,
+                                             float rate_scale) const;
  // Compute the output for a-rate AudioParams
  double ProcessARate(int n,
                      float* dest_p,

--- a/third_party/blink/web_tests/webaudio/Analyser/realtimeanalyser-fftsize-reset.html
+++ b/third_party/blink/web_tests/webaudio/Analyser/realtimeanalyser-fftsize-reset.html
@@ -23,7 +23,7 @@
        testFFTSize(should, {
          initialFFTSize: 128,
          finalFFTSize: 1024,
-          errorThreshold: {relativeThreshold: 1.9238e-6}
+          errorThreshold: {relativeThreshold: 1.9455e-6}
        }).then(() => task.done());
      });
@@ -31,7 +31,7 @@
        testFFTSize(should, {
          initialFFTSize: 512,
          finalFFTSize: 256,
-          errorThreshold: {relativeThreshold: 1.8166e-6}
+          errorThreshold: {relativeThreshold: 1.8592e-6}
        }).then(() => task.done());
      });

--- a/third_party/blink/web_tests/webaudio/Oscillator/no-dezippering.html
+++ b/third_party/blink/web_tests/webaudio/Oscillator/no-dezippering.html
@@ -57,7 +57,7 @@
              frequency1: frequency1,
              paramName: 'detune',
              newValue: detune,
-              thresholds: [1.1921e-7, 1.3114e-6],
+              thresholds: [1.1921e-7, 3.6657e-6],
            }).then(() => task.done());
          });

--- a/third_party/blink/web_tests/webaudio/Oscillator/osc-440hz.html
+++ b/third_party/blink/web_tests/webaudio/Oscillator/osc-440hz.html
@@ -14,10 +14,10 @@
      // Minimum allowed SNR between the actual oscillator and the expected
      // result.  Experimentally determined.
-      let snrThreshold = 59.287;
+      let snrThreshold = 59.280;
      // Max absolute difference between actual and expected oscillator
      // outputs.  Experimentally determined.
-      let maxDiffThreshold = 1.5632e-3;
+      let maxDiffThreshold = 1.5640e-3;
      let audit = Audit.createTaskRunner();

--- a/third_party/blink/web_tests/webaudio/Oscillator/osc-negative-freq.html
+++ b/third_party/blink/web_tests/webaudio/Oscillator/osc-negative-freq.html
@@ -27,7 +27,7 @@
        runTest(should, {
          message: 'Sum of positive and negative frequency sine oscillators',
          type: 'sine',
-          threshold: 4.1724e-7
+          threshold: 1.5349e-6
        }).then(() => task.done());
      });
@@ -35,7 +35,7 @@
        runTest(should, {
          message: 'Sum of positive and negative frequency square oscillators',
          type: 'square',
-          threshold: 1.4753e-6
+          threshold: 1.1496e-5
        }).then(() => task.done());
      });
@@ -44,7 +44,7 @@
          message:
              'Sum of positive and negative frequency sawtooth oscillators',
          type: 'sawtooth',
-          threshold: 1.4753e-6
+          threshold: 1.1506e-5
        }).then(() => task.done());
      });
@@ -53,7 +53,7 @@
          message:
              'Sum of positive and negative frequency triangle oscillators',
          type: 'triangle',
-          threshold: 2.9803e-7
+          threshold: 1.0133e-5
        }).then(() => task.done());
      });
@@ -145,7 +145,7 @@
              should(
                  actual,
                  'Sum of positive and negative frequency custom oscillators')
-                  .beCloseToArray(expected, {absoluteThreshold: 4.1724e-7});
+                  .beCloseToArray(expected, {absoluteThreshold: 2.2352e-6});
            })
            .then(() => task.done());
      });

--- a/third_party/blink/web_tests/webaudio/Oscillator/start-sampling.html
+++ b/third_party/blink/web_tests/webaudio/Oscillator/start-sampling.html
@@ -27,7 +27,7 @@
          },
          function(task, should) {
            testStartSampling(should, 1.25, {
-              error: 1.0843e-4,
+              error: 1.0880e-4,
              snrThreshold: 84.054
            }).then(task.done.bind(task));
          });
@@ -39,7 +39,7 @@
          },
          function(task, should) {
            testStartSampling(should, 1.75, {
-              error: 1.0839e-4,
+              error: 1.0844e-4,
              snrThreshold: 84.056
            }).then(task.done.bind(task));
          });
@@ -57,8 +57,8 @@
            // 1/2 frame from the preceding sampling frame.  This tests one path
            // of the internal implementation.
            testStartWithGain(should, defaultSampleRate, {
-              error: 4.1724e-7,
+              error: 1.9521e-6,
-              snrThreshold: 137.536
+              snrThreshold: 128.12
            }).then(task.done.bind(task));
          });
@@ -75,8 +75,8 @@
            // frame from the preceding sampling frame.  This tests one path of
            // the internal implementation.
            testStartWithGain(should, 48000, {
-              error: 4.1724e-7,
+              error: 1.9521e-6,
-              snrThreshold: 137.536
+              snrThreshold: 122.92
            }).then(task.done.bind(task));
          });

--- a/third_party/blink/web_tests/webaudio/PeriodicWave/ctor-periodicwave.html
+++ b/third_party/blink/web_tests/webaudio/PeriodicWave/ctor-periodicwave.html
@@ -134,14 +134,14 @@
        verifyPeriodicWaveOutput(
            should, {real: [0, 1], imag: [0, 1], disableNormalization: false},
            generateReference(x => Math.SQRT1_2 * (Math.sin(x) + Math.cos(x))),
-            2.7165e-5)
+            2.7225e-5)
            .then(() => task.done());
      });
      audit.define('3: real/imag periodicwave test', (task, should) => {
        verifyPeriodicWaveOutput(
            should, {real: [0, 1], imag: [0, 1], disableNormalization: true},
-            generateReference(x => Math.sin(x) + Math.cos(x)), 3.8416e-5)
+            generateReference(x => Math.sin(x) + Math.cos(x)), 3.8501e-5)
            .then(() => task.done());
      });