NEON optimization for a-rate Oscillator

NEON version of the SSE2 optimizations for the a-rate OscillatorNode. We see about a 23% increase in performance with these changes according to Spotify's Web Audio Bench on a Pixel 2. Without CL TEST μs MIN Q1 MEDIAN Q3 MAX MEAN STDDEV Oscillator.frequency-linear-a-rate 5640 5640 5923 6170 6780 7800 6354.266667 511.6527827 With CL TEST μs MIN Q1 MEDIAN Q3 MAX MEAN STDDEV Oscillator.frequency-linear-a-rate 4355 4355 4588 4732 5258 6408 4929.9 474.3852906 Manually ran the oscillator tests on a Pixel 2 and update the thresholds. Bug: 1013118 Change-Id: I37fbb995ec6bef55ae5195f579356b97adb94347 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2293424 Commit-Queue: Raymond Toy <rtoy@chromium.org> Reviewed-by: Dale Curtis <dalecurtis@chromium.org> Reviewed-by: Hongchan Choi <hongchan@chromium.org> Reviewed-by: Raymond Toy <rtoy@chromium.org> Cr-Commit-Position: refs/heads/master@{#790016}

NEON optimization for a-rate Oscillator
NEON version of the SSE2 optimizations for the a-rate OscillatorNode. We see about a 23% increase in performance with these changes according to Spotify's Web Audio Bench on a Pixel 2. Without CL TEST μs MIN Q1 MEDIAN Q3 MAX MEAN STDDEV Oscillator.frequency-linear-a-rate 5640 5640 5923 6170 6780 7800 6354.266667 511.6527827 With CL TEST μs MIN Q1 MEDIAN Q3 MAX MEAN STDDEV Oscillator.frequency-linear-a-rate 4355 4355 4588 4732 5258 6408 4929.9 474.3852906 Manually ran the oscillator tests on a Pixel 2 and update the thresholds. Bug: 1013118 Change-Id: I37fbb995ec6bef55ae5195f579356b97adb94347 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2293424 Commit-Queue: Raymond Toy <rtoy@chromium.org> Reviewed-by: Dale Curtis <dalecurtis@chromium.org> Reviewed-by: Hongchan Choi <hongchan@chromium.org> Reviewed-by: Raymond Toy <rtoy@chromium.org> Cr-Commit-Position: refs/heads/master@{#790016}
94423071 · Raymond Toy · Commit Bot · cb657a7b · 94423071 · 94423071
Commit 94423071 authored Jul 20, 2020 by Raymond Toy Committed by Commit Bot Jul 20, 2020
8 changed files
--- a/third_party/blink/renderer/modules/webaudio/cpu/arm/oscillator_kernel_neon.cc
+++ b/third_party/blink/renderer/modules/webaudio/cpu/arm/oscillator_kernel_neon.cc
@@ -6,14 +6,16 @@

 #include "third_party/blink/renderer/modules/webaudio/periodic_wave.h"

+#if defined(CPU_ARM_NEON)
 #include <arm_neon.h>
+#endif

 namespace blink {

 #if defined(CPU_ARM_NEON)
-static float32x4_t v_wrap_virtual_index(float32x4_t x,
-                                        float32x4_t wave_size,
-                                        float32x4_t inv_wave_size) {
+static float32x4_t WrapVirtualIndexVector(float32x4_t x,
+                                          float32x4_t wave_size,
+                                          float32x4_t inv_wave_size) {
  // r = x/wave_size, f = truncate(r), truncating towards 0
  const float32x4_t r = vmulq_f32(x, inv_wave_size);
  int32x4_t f = vcvtq_s32_f32(r);
@@ -74,7 +76,7 @@ std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
  // It's possible that adding the incr above exceeded the bounds, so wrap them
  // if needed.
  v_virt_index =
-      v_wrap_virtual_index(v_virt_index, v_wave_size, v_inv_wave_size);
+      WrapVirtualIndexVector(v_virt_index, v_wave_size, v_inv_wave_size);

  int k = 0;
  int n_loops = n / 4;
@@ -119,17 +121,125 @@ std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
    // 0 -> periodicWaveSize.
    v_virt_index = vaddq_f32(v_virt_index, v_incr);
    v_virt_index =
-        v_wrap_virtual_index(v_virt_index, v_wave_size, v_inv_wave_size);
+        WrapVirtualIndexVector(v_virt_index, v_wave_size, v_inv_wave_size);
  }

  // There's a bit of round-off above, so update the index more accurately so at
  // least the next render starts over with a more accurate value.
  virtual_read_index += k * incr;
  virtual_read_index -=
-      floor(virtual_read_index * inv_periodic_wave_size) * periodic_wave_size;
+      std::floor(virtual_read_index * inv_periodic_wave_size) *
+      periodic_wave_size;

  return std::make_tuple(k, virtual_read_index);
 }
+
+static ALWAYS_INLINE double WrapVirtualIndex(double virtual_index,
+                                             unsigned periodic_wave_size,
+                                             double inv_periodic_wave_size) {
+  return virtual_index -
+         floor(virtual_index * inv_periodic_wave_size) * periodic_wave_size;
+}
+
+double OscillatorHandler::ProcessARateVectorKernel(
+    float* destination,
+    double virtual_read_index,
+    const float* phase_increments,
+    unsigned periodic_wave_size,
+    const float* const lower_wave_data[4],
+    const float* const higher_wave_data[4],
+    const float table_interpolation_factor[4]) const {
+  // See the scalar version in oscillator_node.cc for the basic algorithm.
+  double inv_periodic_wave_size = 1.0 / periodic_wave_size;
+  unsigned read_index_mask = periodic_wave_size - 1;
+
+  // Accumulate the phase increments so we can set up the virtual read index
+  // vector appropriately.  This must be a double to preserve accuracy and
+  // to match the scalar version.
+  double incr_sum[4];
+  incr_sum[0] = phase_increments[0];
+  for (int m = 1; m < 4; ++m) {
+    incr_sum[m] = incr_sum[m - 1] + phase_increments[m];
+  }
+
+  // It's really important for accuracy that we use doubles instead of
+  // floats for the virtual_read_index.  Without this, we can only get some
+  // 30-50 dB in the sweep tests instead of 100+ dB.
+  //
+  // Arm NEON doesn't have float64x2_t so we have to do this.  (Aarch64 has
+  // float64x2_t.)
+  double virt_index[4];
+  virt_index[0] = virtual_read_index;
+  virt_index[1] = WrapVirtualIndex(virtual_read_index + incr_sum[0],
+                                   periodic_wave_size, inv_periodic_wave_size);
+  virt_index[2] = WrapVirtualIndex(virtual_read_index + incr_sum[1],
+                                   periodic_wave_size, inv_periodic_wave_size);
+  virt_index[3] = WrapVirtualIndex(virtual_read_index + incr_sum[2],
+                                   periodic_wave_size, inv_periodic_wave_size);
+
+  // The virtual indices we're working with now.
+  const float32x4_t v_virt_index = {
+      static_cast<float>(virt_index[0]), static_cast<float>(virt_index[1]),
+      static_cast<float>(virt_index[2]), static_cast<float>(virt_index[3])};
+
+  // Convert virtual index to actual index into wave data.
+  const uint32x4_t v_read0 = vcvtq_u32_f32(v_virt_index);
+
+  // v_read1 = v_read0 + 1, but wrap the index around, if needed.
+  const uint32x4_t v_read1 = vandq_s32(vaddq_s32(v_read0, vdupq_n_u32(1)),
+                                       vdupq_n_u32(read_index_mask));
+
+  float sample1_lower[4] __attribute__((aligned(16)));
+  float sample2_lower[4] __attribute__((aligned(16)));
+  float sample1_higher[4] __attribute__((aligned(16)));
+  float sample2_higher[4] __attribute__((aligned(16)));
+
+  uint32_t read0[4] __attribute__((aligned(16)));
+  uint32_t read1[4] __attribute__((aligned(16)));
+
+  vst1q_u32(read0, v_read0);
+  vst1q_u32(read1, v_read1);
+
+  // Read the samples from the wave tables
+  for (int m = 0; m < 4; ++m) {
+    DCHECK_LT(read0[m], periodic_wave_size);
+    DCHECK_LT(read1[m], periodic_wave_size);
+
+    sample1_lower[m] = lower_wave_data[m][read0[m]];
+    sample2_lower[m] = lower_wave_data[m][read1[m]];
+    sample1_higher[m] = higher_wave_data[m][read0[m]];
+    sample2_higher[m] = higher_wave_data[m][read1[m]];
+  }
+
+  // Compute factor for linear interpolation within a wave table.
+  const float32x4_t v_factor = vsubq_f32(v_virt_index, vcvtq_f32_u32(v_read0));
+
+  // Linearly interpolate between samples from the higher wave table.
+  const float32x4_t sample_higher = vmlaq_f32(
+      vld1q_f32(sample1_higher), v_factor,
+      vsubq_f32(vld1q_f32(sample2_higher), vld1q_f32(sample1_higher)));
+
+  // Linearly interpolate between samples from the lower wave table.
+  const float32x4_t sample_lower =
+      vmlaq_f32(vld1q_f32(sample1_lower), v_factor,
+                vsubq_f32(vld1q_f32(sample2_lower), vld1q_f32(sample1_lower)));
+
+  // Linearly interpolate between wave tables to get the desired
+  // output samples.
+  const float32x4_t sample =
+      vmlaq_f32(sample_higher, vld1q_f32(table_interpolation_factor),
+                vsubq_f32(sample_lower, sample_higher));
+
+  vst1q_f32(destination, sample);
+
+  // Update the virtual_read_index appropriately and return it for the
+  // next call.
+  virtual_read_index =
+      WrapVirtualIndex(virtual_read_index + incr_sum[3], periodic_wave_size,
+                       inv_periodic_wave_size);
+
+  return virtual_read_index;
+}
 #endif

 }  // namespace blink
--- a/third_party/blink/renderer/modules/webaudio/oscillator_node.cc
+++ b/third_party/blink/renderer/modules/webaudio/oscillator_node.cc
@@ -371,7 +371,7 @@ std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
 }
 #endif

-#if !defined(ARCH_CPU_X86_FAMILY)
+#if !(defined(ARCH_CPU_X86_FAMILY) || defined(CPU_ARM_NEON))
 double OscillatorHandler::ProcessARateVectorKernel(
    float* dest_p,
    double virtual_read_index,

--- a/third_party/blink/renderer/modules/webaudio/periodic_wave.cc
+++ b/third_party/blink/renderer/modules/webaudio/periodic_wave.cc
@@ -38,6 +38,10 @@
 #include "third_party/blink/renderer/platform/audio/vector_math.h"
 #include "third_party/blink/renderer/platform/bindings/exception_state.h"

+#if defined(CPU_ARM_NEON)
+#include <arm_neon.h>
+#endif
+
 namespace blink {

 // The number of bands per octave.  Each octave will have this many entries in
@@ -270,6 +274,64 @@ void PeriodicWave::WaveDataForFundamentalFrequency(
    higher_wave_data[k] = band_limited_tables_[range_index1[k]]->Data();
  }
 }
+#elif defined(CPU_ARM_NEON)
+void PeriodicWave::WaveDataForFundamentalFrequency(
+    const float fundamental_frequency[4],
+    float* lower_wave_data[4],
+    float* higher_wave_data[4],
+    float table_interpolation_factor[4]) {
+  // Negative frequencies are allowed, in which case we alias to the positive
+  // frequency.
+  float32x4_t frequency = vabsq_f32(vld1q_f32(fundamental_frequency));
+
+  // pos = 0xffffffff if frequency > 0; otherwise 0.
+  uint32x4_t pos = vcgtq_f32(frequency, vdupq_n_f32(0));
+
+  // v_ratio = frequency / lowest_fundamental_frequency_.  But NEON
+  // doesn't have a division instruction, so multiply by reciprocal.
+  // (Aarch64 does, though).
+  float32x4_t v_ratio =
+      vmulq_f32(frequency, vdupq_n_f32(1 / lowest_fundamental_frequency_));
+
+  // Select v_ratio or 0.5 depending on whether pos is all ones or all
+  // zeroes.
+  v_ratio = vbslq_f32(pos, v_ratio, vdupq_n_f32(0.5));
+
+  float ratio[4] __attribute__((aligned(16)));
+  vst1q_f32(ratio, v_ratio);
+
+  float cents_above_lowest_frequency[4] __attribute__((aligned(16)));
+
+  for (int k = 0; k < 4; ++k) {
+    cents_above_lowest_frequency[k] = log2f(ratio[k]) * 1200;
+  }
+
+  float32x4_t v_pitch_range = vaddq_f32(
+      vdupq_n_f32(1.0), vmulq_f32(vld1q_f32(cents_above_lowest_frequency),
+                                  vdupq_n_f32(1 / cents_per_range_)));
+
+  v_pitch_range = vmaxq_f32(v_pitch_range, vdupq_n_f32(0));
+  v_pitch_range = vminq_f32(v_pitch_range, vdupq_n_f32(NumberOfRanges() - 1));
+
+  const uint32x4_t v_index1 = vcvtq_u32_f32(v_pitch_range);
+  uint32x4_t v_index2 = vaddq_u32(v_index1, vdupq_n_u32(1));
+  v_index2 = vminq_u32(v_index2, vdupq_n_f32(NumberOfRanges() - 1));
+
+  uint32_t range_index1[4] __attribute__((aligned(16)));
+  uint32_t range_index2[4] __attribute__((aligned(16)));
+
+  vst1q_u32(range_index1, v_index1);
+  vst1q_u32(range_index2, v_index2);
+
+  const float32x4_t table_factor =
+      vsubq_f32(v_pitch_range, vcvtq_f32_u32(v_index1));
+  vst1q_f32(table_interpolation_factor, table_factor);
+
+  for (int k = 0; k < 4; ++k) {
+    lower_wave_data[k] = band_limited_tables_[range_index2[k]]->Data();
+    higher_wave_data[k] = band_limited_tables_[range_index1[k]]->Data();
+  }
+}
 #else
 void PeriodicWave::WaveDataForFundamentalFrequency(
    const float fundamental_frequency[4],

--- a/third_party/blink/web_tests/webaudio/Oscillator/osc-sweep-snr-custom.html
+++ b/third_party/blink/web_tests/webaudio/Oscillator/osc-sweep-snr-custom.html
@@ -22,7 +22,7 @@
            1, tester.sampleRate * tester.lengthInSeconds, tester.sampleRate);

        // The thresholds are experimentally determined.
-        tester.setThresholds({snr: 139.98, maxDiff: 2.2650e-6});
+        tester.setThresholds({snr: 130.47, maxDiff: 2.2576e-6});
        tester.runTest(
            context, 'custom', 'Custom Oscillator with Exponential Sweep', task,
            should);

--- a/third_party/blink/web_tests/webaudio/Oscillator/osc-sweep-snr-sawtooth.html
+++ b/third_party/blink/web_tests/webaudio/Oscillator/osc-sweep-snr-sawtooth.html
@@ -22,7 +22,7 @@
            1, tester.sampleRate * tester.lengthInSeconds, tester.sampleRate);

        // The thresholds are experimentally determined.
-        tester.setThresholds({snr: 134.34, maxDiff: 1.8925e-6});
+        tester.setThresholds({snr: 129.40, maxDiff: 1.9894e-6});
        tester.runTest(
            context, 'sawtooth', 'Sawtooth Oscillator with Exponential Sweep',
            task, should);

--- a/third_party/blink/web_tests/webaudio/Oscillator/osc-sweep-snr-sine.html
+++ b/third_party/blink/web_tests/webaudio/Oscillator/osc-sweep-snr-sine.html
@@ -22,7 +22,7 @@
            1, tester.sampleRate * tester.lengthInSeconds, tester.sampleRate);

        // The thresholds are experimentally determined.
-        tester.setThresholds({snr: 140.44, maxDiff: 3.4869e-6});
+        tester.setThresholds({snr: 129.72, maxDiff: 3.7551e-6});
        tester.runTest(
            context, 'sine', 'Sine Oscillator with Exponential Sweep', task,
            should);

--- a/third_party/blink/web_tests/webaudio/Oscillator/osc-sweep-snr-square.html
+++ b/third_party/blink/web_tests/webaudio/Oscillator/osc-sweep-snr-square.html
@@ -22,7 +22,7 @@
            1, tester.sampleRate * tester.lengthInSeconds, tester.sampleRate);

        // The thresholds are experimentally determined.
-        tester.setThresholds({snr: 137.20, maxDiff: 3.8147e-6});
+        tester.setThresholds({snr: 129.71, maxDiff: 3.9638e-6});
        tester.runTest(
            context, 'square', 'Square Oscillator with Exponential Sweep', task,
            should);

--- a/third_party/blink/web_tests/webaudio/Oscillator/osc-sweep-snr-triangle.html
+++ b/third_party/blink/web_tests/webaudio/Oscillator/osc-sweep-snr-triangle.html
@@ -22,7 +22,7 @@
            1, tester.sampleRate * tester.lengthInSeconds, tester.sampleRate);

        // The thresholds are experimentally determined.
-        tester.setThresholds({snr: 139.88, maxDiff: 2.8313e-6});
+        tester.setThresholds({snr: 129.66, maxDiff: 3.0995e-6});
        tester.runTest(
            context, 'triangle', 'Triangle Oscillator with Exponential Sweep ',
            task, should);