SSE2 Optimization for ParamEvent::SetTarget

Optimized with SSE2 and obtained 4.37x speedup.
Standard:      	2.900553 seconds
SSE2 (fp32):   	0.662973 seconds
Speedup (fp32):	4.375067

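Because the unrolled SSE2 path multiplies by several precomputed constants derived from a small discreteTimeConstant, its output differs slightly from that of the serial version. The test's maxAllowedError is raised from 2.79e-5 to 3.9e-5 accordingly.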

BUG=512376
TEST=Run webaudio/audioparam-setTargetAtTime.html

Review URL: https://codereview.chromium.org/1288773003

git-svn-id: svn://svn.chromium.org/blink/trunk@200726 bbb929c8-8fbe-4397-9dbb-9b2b20218538
parent adb353b9
......@@ -24,7 +24,7 @@ description("Test AudioParam setTargetAtTime() functionality.");
var numberOfTests = 100;
// Max allowed difference between the rendered data and the expected result.
var maxAllowedError = 2.79e-5;
var maxAllowedError = 3.9e-5;
// The AudioGainNode starts with this value instead of the default value.
var initialValue = 100;
......
......@@ -31,9 +31,14 @@
#include "core/dom/ExceptionCode.h"
#include "platform/FloatConversion.h"
#include "platform/audio/AudioUtilities.h"
#include "wtf/CPU.h"
#include "wtf/MathExtras.h"
#include <algorithm>
#if CPU(X86) || CPU(X86_64)
#include <emmintrin.h>
#endif
namespace blink {
static bool isPositiveAudioParamValue(float value, ExceptionState& exceptionState)
......@@ -407,6 +412,38 @@ float AudioParamTimeline::valuesForTimeRangeImpl(
float timeConstant = event.timeConstant();
float discreteTimeConstant = static_cast<float>(AudioUtilities::discreteTimeConstantForSampleRate(timeConstant, controlRate));
#if CPU(X86) || CPU(X86_64)
// Resolve recursion by expanding constants to achieve a 4-step loop unrolling.
// v1 = v0 + (t - v0) * c
// v2 = v1 + (t - v1) * c
// v2 = v0 + (t - v0) * c + (t - (v0 + (t - v0) * c)) * c
// v2 = v0 + (t - v0) * c + (t - v0) * c - (t - v0) * c * c
// v2 = v0 + (t - v0) * c * (2 - c)
// Thus c0 = c and c1 = c*(2-c). Repeating the substitution gives the general
// form c_k = 1 - (1-c)^(k+1), which expands to the c2 and c3 expressions below.
const float c0 = discreteTimeConstant;
const float c1 = c0 * (2 - c0);
const float c2 = c0 * ((c0 - 3) * c0 + 3);
const float c3 = c0 * (c0 * ((4 - c0) * c0 - 6) + 4);
float delta;
__m128 vC = _mm_set_ps(c2, c1, c0, 0);
__m128 vDelta, vValue, vResult;
// Process 4 loop steps.
unsigned fillToFrameTrunc = writeIndex + ((fillToFrame - writeIndex) / 4) * 4;
for (; writeIndex < fillToFrameTrunc; writeIndex += 4) {
delta = target - value;
vDelta = _mm_set_ps1(delta);
vValue = _mm_set_ps1(value);
vResult = _mm_add_ps(vValue, _mm_mul_ps(vDelta, vC));
_mm_storeu_ps(values + writeIndex, vResult);
// Update value for next iteration.
value += delta * c3;
}
#endif
// Serially process remaining values
for (; writeIndex < fillToFrame; ++writeIndex) {
values[writeIndex] = value;
value += (target - value) * discreteTimeConstant;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment