[Reland] Use resampler for playback speeds close to 1.0

Commit 'ab98b39d' broke the MSAN bots and was reverted. This CL relands those changes, while fixing the UTs. Original description: The WSOLA algorithm introduces noticeable audio artifacts when doing small adjustments to playback rate (e.g. 1.03 playback speed). The distortions can be described as warbling or transient stuttering. This CL fixes this issue by using resampling instead, for speeds close to 1.00. The resampled audio doesn't have any distortions, but its pitch is shifted proportionally to the playback changes. The maximal slowdown/speed up we use resampling for is 5-6%, which corresponds to a pitch shift of roughly one semitone. Beyond that, the WSOLA artifacts become tolerable, and the pitch shifting doesn't. Bug: 920019, 1018617 Change-Id: Ide606af05253f02576f548f20dfc9007040cc106 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1884572 Auto-Submit: Thomas Guilbert <tguilbert@chromium.org> Reviewed-by: Dale Curtis <dalecurtis@chromium.org> Commit-Queue: Dale Curtis <dalecurtis@chromium.org> Commit-Queue: Thomas Guilbert <tguilbert@chromium.org> Cr-Commit-Position: refs/heads/master@{#710106}

[Reland] Use resampler for playback speeds close to 1.0
Commit 'ab98b39d' broke the MSAN bots and was reverted. This CL relands those changes, while fixing the UTs. Original description: The WSOLA algorithm introduces noticeable audio artifacts when doing small adjustments to playback rate (e.g. 1.03 playback speed). The distortions can be described as warbling or transient stuttering. This CL fixes this issue by using resampling instead, for speeds close to 1.00. The resampled audio doesn't have any distortions, but its pitch is shifted proportionally to the playback changes. The maximal slowdown/speed up we use resampling for is 5-6%, which corresponds to a pitch shift of roughly one semitone. Beyond that, the WSOLA artifacts become tolerable, and the pitch shifting doesn't. Bug: 920019, 1018617 Change-Id: Ide606af05253f02576f548f20dfc9007040cc106 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1884572 Auto-Submit: Thomas Guilbert <tguilbert@chromium.org> Reviewed-by: Dale Curtis <dalecurtis@chromium.org> Commit-Queue: Dale Curtis <dalecurtis@chromium.org> Commit-Queue: Thomas Guilbert <tguilbert@chromium.org> Cr-Commit-Position: refs/heads/master@{#710106}
86ca53e6 · Thomas Guilbert · Commit Bot · f6db3b85 · 86ca53e6 · 86ca53e6
Commit 86ca53e6 authored Oct 29, 2019 by Thomas Guilbert Committed by Commit Bot Oct 29, 2019
3 changed files
--- a/media/filters/audio_renderer_algorithm.cc
+++ b/media/filters/audio_renderer_algorithm.cc
@@ -7,11 +7,13 @@
 #include <algorithm>
 #include <cmath>

+#include "base/bind.h"
 #include "base/logging.h"
 #include "cc/base/math_util.h"
 #include "media/base/audio_bus.h"
 #include "media/base/audio_timestamp_helper.h"
 #include "media/base/limits.h"
+#include "media/base/multi_channel_resampler.h"
 #include "media/filters/wsola_internals.h"

 namespace media {
@@ -157,6 +159,73 @@ void AudioRendererAlgorithm::SetChannelMask(std::vector<bool> channel_mask) {
    CreateSearchWrappers();
 }

+void AudioRendererAlgorithm::OnResamplerRead(int frame_delay,
+                                             AudioBus* audio_bus) {
+  input_frames_read_or_buffered_ +=
+      audio_buffer_.ReadFrames(audio_bus->frames(), 0, audio_bus);
+}
+
+int AudioRendererAlgorithm::CalculateOutputFramesResampled(
+    double playback_rate,
+    int requested_frames) {
+  double input_frames_consumed =
+      input_frames_read_or_buffered_ - resampler_->BufferedFrames();
+
+  int output_frames =
+      static_cast<int>(input_frames_consumed / playback_rate + 0.5);
+
+  // The first or second call to resample appears to consume more input frames
+  // than it actually does. This is due to the internals of |resampler_|
+  // treating the first data read differently, to prime internal buffers. We
+  // therefore appear to have read up to SincResampler::kKernelSize more frames
+  // when using the difference between calls to |resampler_->BufferedFrames()|.
+  // |resampler_| never actually writes more frames than we request out of it,
+  // so we can safely cap this value here.
+  return std::min(output_frames, requested_frames);
+}
+
+int AudioRendererAlgorithm::ResampleAndFill(AudioBus* dest,
+                                            int dest_offset,
+                                            int requested_frames,
+                                            double playback_rate) {
+  if (!resampler_) {
+    resampler_ = std::make_unique<MultiChannelResampler>(
+        channels_, playback_rate, SincResampler::kDefaultRequestSize,
+        base::BindRepeating(&AudioRendererAlgorithm::OnResamplerRead,
+                            base::Unretained(this)));
+  }
+
+  resampler_->SetRatio(playback_rate);
+
+  // Reset with leftover frames from previous resampling iteration.
+  input_frames_read_or_buffered_ = resampler_->BufferedFrames();
+
+  // Directly use |dest| for the most common case of having 0 offset.
+  if (!dest_offset) {
+    resampler_->Resample(requested_frames, dest);
+
+    return CalculateOutputFramesResampled(playback_rate, requested_frames);
+  }
+
+  // This is only really used once, at the beginning of a stream, which means
+  // we can use a temporary variable, rather than saving it as a member.
+  // NOTE: We don't wrap |dest|'s channel data in an AudioBus wrapper, because
+  // |dest_offset| isn't always aligned with AudioBus::kChannelAlignment.
+  std::unique_ptr<AudioBus> resampler_output =
+      AudioBus::Create(channels_, requested_frames);
+
+  resampler_->Resample(requested_frames, resampler_output.get());
+
+  int output_frames =
+      CalculateOutputFramesResampled(playback_rate, requested_frames);
+
+  DCHECK_LE(output_frames, resampler_output->frames());
+
+  resampler_output->CopyPartialFramesTo(0, output_frames, dest_offset, dest);
+
+  return output_frames;
+}
+
 int AudioRendererAlgorithm::FillBuffer(AudioBus* dest,
                                       int dest_offset,
                                       int requested_frames,
@@ -185,6 +254,18 @@ int AudioRendererAlgorithm::FillBuffer(AudioBus* dest,
    return frames_read;
  }

+  // WSOLA at playback rates that are close to 1.0 produces noticeable
+  // warbling and stuttering. We prefer resampling the audio at these speeds.
+  // This does result in a noticeable pitch shift.
+  // NOTE: The cutoff values are arbitrary, and picked based off of a tradeoff
+  // between "resample pitch shift" vs "WSOLA distortions".
+  constexpr double kLowerResampleThreshold = 0.95;
+  constexpr double kUpperResampleThreshold = 1.06;
+  if (kLowerResampleThreshold <= playback_rate &&
+      playback_rate <= kUpperResampleThreshold) {
+    return ResampleAndFill(dest, dest_offset, requested_frames, playback_rate);
+  }
+
  // Allocate structures on first non-1.0 playback rate; these can eat a fair
  // chunk of memory. ~56kB for stereo 48kHz, up to ~765kB for 7.1 192kHz.
  if (!ola_window_) {
@@ -235,6 +316,8 @@ void AudioRendererAlgorithm::FlushBuffers() {
    wsola_output_->Zero();
  num_complete_frames_ = 0;

+  resampler_.reset();
+
  // Reset |capacity_| so growth triggered by underflows doesn't penalize seek
  // time.
  capacity_ = initial_capacity_;

--- a/media/filters/audio_renderer_algorithm.h
+++ b/media/filters/audio_renderer_algorithm.h
@@ -35,6 +35,7 @@
 namespace media {

 class AudioBus;
+class MultiChannelResampler;

 class MEDIA_EXPORT AudioRendererAlgorithm {
 public:
@@ -145,6 +146,24 @@ class MEDIA_EXPORT AudioRendererAlgorithm {
  // mask has been specified.
  void CreateSearchWrappers();

+  // Uses |resampler_| to speed up or slow down audio, by using a resampling
+  // ratio of |playback_rate|.
+  int ResampleAndFill(AudioBus* dest,
+                      int dest_offset,
+                      int requested_frames,
+                      double playback_rate);
+
+  // Called by |resampler_| to get more audio data.
+  void OnResamplerRead(int frame_delay, AudioBus* audio_bus);
+
+  // Calculate how many frames |resampler_| wrote to output, based off of
+  // |input_frames_read_or_buffered_| and |resampler_->BufferedFrames()|.
+  //
+  // NOTE: The return value is always <= |requested_frames|. See comment in the
+  //       implementation file.
+  int CalculateOutputFramesResampled(double playback_rate,
+                                     int requested_frames);
+
  // Parameters.
  AudioRendererAlgorithmParameters audio_renderer_algorithm_params_;

@@ -198,6 +217,14 @@ class MEDIA_EXPORT AudioRendererAlgorithm {
  // specifies the index where the next WSOLA window has to overlap-and-add.
  int num_complete_frames_;

+  // Used to replace WSOLA algorithm at playback speeds close to 1.0. This is to
+  // prevent noticeable audio artifacts introduced by WSOLA, at the expense of
+  // changing the pitch of the audio.
+  std::unique_ptr<MultiChannelResampler> resampler_;
+
+  // Number of input frames read or buffered by |resampler_|.
+  double input_frames_read_or_buffered_ = 0;
+
  // This stores a part of the output that is created but couldn't be rendered.
  // Output is generated frame-by-frame which at some point might exceed the
  // number of requested samples. Furthermore, due to overlap-and-add,

--- a/media/filters/audio_renderer_algorithm_unittest.cc
+++ b/media/filters/audio_renderer_algorithm_unittest.cc
@@ -171,8 +171,8 @@ class AudioRendererAlgorithmTest : public testing::Test {
    return true;
  }

-  bool AudioDataIsMuted(AudioBus* audio_data, int frames_written) {
-    return VerifyAudioData(audio_data, 0, frames_written, 0);
+  bool AudioDataIsMuted(AudioBus* audio_data, int frames_written, int offset) {
+    return VerifyAudioData(audio_data, offset, frames_written, 0);
  }

  int ComputeConsumedFrames(int initial_frames_enqueued,
@@ -189,18 +189,21 @@ class AudioRendererAlgorithmTest : public testing::Test {
    const int kDefaultFramesRequested = kOutputDurationInSec *
        algorithm_.samples_per_second();

-    TestPlaybackRate(
-        playback_rate, kDefaultBufferSize, kDefaultFramesRequested);
+    TestPlaybackRate(playback_rate, kDefaultBufferSize, kDefaultFramesRequested,
+                     0);
  }

  void TestPlaybackRate(double playback_rate,
                        int buffer_size_in_frames,
-                        int total_frames_requested) {
+                        int total_frames_requested,
+                        int dest_offset) {
    int initial_frames_enqueued = frames_enqueued_;
    int initial_frames_buffered = algorithm_.frames_buffered();

    std::unique_ptr<AudioBus> bus =
        AudioBus::Create(channels_, buffer_size_in_frames);
+    bus->ZeroFrames(dest_offset);
+
    if (playback_rate == 0.0) {
      int frames_written = algorithm_.FillBuffer(
          bus.get(), 0, buffer_size_in_frames, playback_rate);
@@ -211,9 +214,10 @@ class AudioRendererAlgorithmTest : public testing::Test {
    int frames_remaining = total_frames_requested;
    bool first_fill_buffer = true;
    while (frames_remaining > 0) {
-      int frames_requested = std::min(buffer_size_in_frames, frames_remaining);
-      int frames_written =
-          algorithm_.FillBuffer(bus.get(), 0, frames_requested, playback_rate);
+      int frames_requested =
+          std::min(buffer_size_in_frames - dest_offset, frames_remaining);
+      int frames_written = algorithm_.FillBuffer(
+          bus.get(), dest_offset, frames_requested, playback_rate);
      ASSERT_GT(frames_written, 0) << "Requested: " << frames_requested
                                   << ", playing at " << playback_rate;

@@ -223,7 +227,7 @@ class AudioRendererAlgorithmTest : public testing::Test {
      // if at very first buffer-fill only one frame is written, that is zero
      // which might cause exception in CheckFakeData().
      if (!first_fill_buffer || frames_written > 1)
-        ASSERT_FALSE(AudioDataIsMuted(bus.get(), frames_written));
+        ASSERT_FALSE(AudioDataIsMuted(bus.get(), frames_written, dest_offset));
      first_fill_buffer = false;
      frames_remaining -= frames_written;

@@ -356,7 +360,7 @@ TEST_F(AudioRendererAlgorithmTest, InitializeWithLargeParameters) {
 TEST_F(AudioRendererAlgorithmTest, FillBuffer_Bitstream) {
  Initialize(CHANNEL_LAYOUT_STEREO, kSampleFormatEac3, kSamplesPerSecond,
             kSamplesPerSecond / 100);
-  TestPlaybackRate(1.0, kFrameSize, 16 * kFrameSize);
+  TestPlaybackRate(1.0, kFrameSize, 16 * kFrameSize, /* dest_offset */ 0);
 }

 TEST_F(AudioRendererAlgorithmTest, FillBuffer_NormalRate) {
@@ -374,6 +378,36 @@ TEST_F(AudioRendererAlgorithmTest, FillBuffer_NearlyNormalSlowerRate) {
  TestPlaybackRate(0.9999);
 }

+// This test verifies that the resampling based time stretch algorithm works.
+// The range of playback rates in which we use resampling is [0.95, 1.06].
+TEST_F(AudioRendererAlgorithmTest, FillBuffer_ResamplingRates) {
+  Initialize();
+  TestPlaybackRate(0.94);  // WSOLA.
+  TestPlaybackRate(0.95);  // Lower limit of resampling.
+  TestPlaybackRate(0.97);
+  TestPlaybackRate(1.00);
+  TestPlaybackRate(1.04);
+  TestPlaybackRate(1.06);  // Upper limit of resampling.
+  TestPlaybackRate(1.07);  // WSOLA.
+}
+
+TEST_F(AudioRendererAlgorithmTest, FillBuffer_WithOffset) {
+  Initialize();
+  const int kBufferSize = algorithm_.samples_per_second() / 10;
+  const int kOffset = kBufferSize / 10;
+  const int kFramesRequested =
+      kOutputDurationInSec * algorithm_.samples_per_second();
+
+  // No time-stretch.
+  TestPlaybackRate(1.00, kBufferSize, kFramesRequested, kOffset);
+
+  // Resampling based time-stretch.
+  TestPlaybackRate(1.05, kBufferSize, kFramesRequested, kOffset);
+
+  // WSOLA based time-stretch.
+  TestPlaybackRate(1.25, kBufferSize, kFramesRequested, kOffset);
+}
+
 TEST_F(AudioRendererAlgorithmTest, FillBuffer_OneAndAQuarterRate) {
  Initialize();
  TestPlaybackRate(1.25);
@@ -447,9 +481,9 @@ TEST_F(AudioRendererAlgorithmTest, FillBuffer_SmallBufferSize) {
  Initialize();
  static const int kBufferSizeInFrames = 1;
  static const int kFramesRequested = kOutputDurationInSec * kSamplesPerSecond;
-  TestPlaybackRate(1.0, kBufferSizeInFrames, kFramesRequested);
-  TestPlaybackRate(0.5, kBufferSizeInFrames, kFramesRequested);
-  TestPlaybackRate(1.5, kBufferSizeInFrames, kFramesRequested);
+  TestPlaybackRate(1.0, kBufferSizeInFrames, kFramesRequested, 0);
+  TestPlaybackRate(0.5, kBufferSizeInFrames, kFramesRequested, 0);
+  TestPlaybackRate(1.5, kBufferSizeInFrames, kFramesRequested, 0);
 }

 TEST_F(AudioRendererAlgorithmTest, FillBuffer_LargeBufferSize) {