SSE2 Optimization for ParamEvent::SetTarget

Optimized with SSE2, obtaining a 4.37x speedup. Standard: 2.900553 seconds; SSE2 (fp32): 0.662973 seconds; speedup (fp32): 4.375067. Due to repeated multiplications by a small discreteTimeConstant, the results diverge slightly from the serial (non-vectorized) version. The test's maximum allowed error was adjusted accordingly. BUG=512376 TEST=Run webaudio/audioparam-setTargetAtTime.html Review URL: https://codereview.chromium.org/1288773003 git-svn-id: svn://svn.chromium.org/blink/trunk@200726 bbb929c8-8fbe-4397-9dbb-9b2b20218538

SSE2 Optimization for ParamEvent::SetTarget
Optimized with SSE2, obtaining a 4.37x speedup. Standard: 2.900553 seconds; SSE2 (fp32): 0.662973 seconds; speedup (fp32): 4.375067. Due to repeated multiplications by a small discreteTimeConstant, the results diverge slightly from the serial (non-vectorized) version. The test's maximum allowed error was adjusted accordingly. BUG=512376 TEST=Run webaudio/audioparam-setTargetAtTime.html Review URL: https://codereview.chromium.org/1288773003 git-svn-id: svn://svn.chromium.org/blink/trunk@200726 bbb929c8-8fbe-4397-9dbb-9b2b20218538
b8abb565 · adrian.belgun@intel.com · adb353b9 · b8abb565 · b8abb565
Commit b8abb565 authored Aug 18, 2015 by adrian.belgun@intel.com
2 changed files
--- a/third_party/WebKit/LayoutTests/webaudio/audioparam-setTargetAtTime.html
+++ b/third_party/WebKit/LayoutTests/webaudio/audioparam-setTargetAtTime.html
@@ -24,7 +24,7 @@ description("Test AudioParam setTargetAtTime() functionality.");
 var numberOfTests = 100;

 // Max allowed difference between the rendered data and the expected result.
-var maxAllowedError = 2.79e-5;
+var maxAllowedError = 3.9e-5;

 // The AudioGainNode starts with this value instead of the default value.
 var initialValue = 100;

--- a/third_party/WebKit/Source/modules/webaudio/AudioParamTimeline.cpp
+++ b/third_party/WebKit/Source/modules/webaudio/AudioParamTimeline.cpp
@@ -31,9 +31,14 @@
 #include "core/dom/ExceptionCode.h"
 #include "platform/FloatConversion.h"
 #include "platform/audio/AudioUtilities.h"
+#include "wtf/CPU.h"
 #include "wtf/MathExtras.h"
 #include <algorithm>

+#if CPU(X86) || CPU(X86_64)
+#include <emmintrin.h>
+#endif
+
 namespace blink {

 static bool isPositiveAudioParamValue(float value, ExceptionState& exceptionState)
@@ -407,6 +412,38 @@ float AudioParamTimeline::valuesForTimeRangeImpl(
                    float timeConstant = event.timeConstant();
                    float discreteTimeConstant = static_cast<float>(AudioUtilities::discreteTimeConstantForSampleRate(timeConstant, controlRate));

+#if CPU(X86) || CPU(X86_64)
+                    // Resolve recursion by expanding constants to achieve a 4-step loop unrolling.
+                    // v1 = v0 + (t - v0) * c
+                    // v2 = v1 + (t - v1) * c
+                    // v2 = v0 + (t - v0) * c + (t - (v0 + (t - v0) * c)) * c
+                    // v2 = v0 + (t - v0) * c + (t - v0) * c - (t - v0) * c * c
+                    // v2 = v0 + (t - v0) * c * (2 - c)
+                    // Thus c0 = c, c1 = c*(2-c). The same logic applies to c2 and c3.
+                    const float c0 = discreteTimeConstant;
+                    const float c1 = c0 * (2 - c0);
+                    const float c2 = c0 * ((c0 - 3) * c0 + 3);
+                    const float c3 = c0 * (c0 * ((4 - c0) * c0 - 6) + 4);
+
+                    float delta;
+                    __m128 vC = _mm_set_ps(c2, c1, c0, 0);
+                    __m128 vDelta, vValue, vResult;
+
+                    // Process 4 loop steps.
+                    unsigned fillToFrameTrunc = writeIndex + ((fillToFrame - writeIndex) / 4) * 4;
+                    for (; writeIndex < fillToFrameTrunc; writeIndex += 4) {
+                        delta = target - value;
+                        vDelta = _mm_set_ps1(delta);
+                        vValue = _mm_set_ps1(value);
+
+                        vResult = _mm_add_ps(vValue, _mm_mul_ps(vDelta, vC));
+                        _mm_storeu_ps(values + writeIndex, vResult);
+
+                        // Update value for next iteration.
+                        value += delta * c3;
+                    }
+#endif
+                    // Serially process remaining values
                    for (; writeIndex < fillToFrame; ++writeIndex) {
                        values[writeIndex] = value;
                        value += (target - value) * discreteTimeConstant;