Commit ca885407 authored by Evan Liu's avatar Evan Liu Committed by Commit Bot

Reland "Add Live Caption support for WebRTC"

Patchset 1 contains the reland without any changes.

This CL integrates the WebRTC audio renderer with the live captioning
pipeline, allowing the Chrome browser to automatically generate captions
from the audio rendered by the WebRTCAudioRenderer. The high-level
design document for the Live Caption feature can be found at:
go/chrome-live-caption.

Bug: 1093096

Change-Id: I84f31a42aecb511f8d1c188eed23dd738f2cb2b1
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2250707
Commit-Queue: Evan Liu <evliu@google.com>
Reviewed-by: default avatarDale Curtis <dalecurtis@chromium.org>
Reviewed-by: default avatarKentaro Hara <haraken@chromium.org>
Cr-Commit-Position: refs/heads/master@{#781562}
parent 14a3f13a
......@@ -8,6 +8,8 @@
#include "base/metrics/field_trial_params.h"
#include "content/public/renderer/render_frame.h"
#include "media/base/audio_bus.h"
#include "media/base/audio_parameters.h"
#include "media/base/channel_mixer.h"
#include "media/base/media_switches.h"
#include "media/mojo/mojom/media_types.mojom.h"
......@@ -66,6 +68,18 @@ void ChromeSpeechRecognitionClient::AddAudio(
}
}
// Forwards one chunk of rendered audio to the speech recognition service.
// The bus is converted to the signed 16-bit mojo wire format
// (media::mojom::AudioDataS16) before being sent. Does nothing when speech
// recognition is currently unavailable (blocked site, no browser request,
// etc. — see IsSpeechRecognitionAvailable()).
void ChromeSpeechRecognitionClient::AddAudio(
    std::unique_ptr<media::AudioBus> audio_bus,
    int sample_rate,
    media::ChannelLayout channel_layout) {
  DCHECK(audio_bus);
  if (!IsSpeechRecognitionAvailable())
    return;
  speech_recognition_recognizer_->SendAudioToSpeechRecognitionService(
      ConvertToAudioDataS16(std::move(audio_bus), sample_rate,
                            channel_layout));
}
bool ChromeSpeechRecognitionClient::IsSpeechRecognitionAvailable() {
// TODO(evliu): Check if SODA is available.
return !is_website_blocked_ && is_browser_requesting_transcription_ &&
......@@ -100,17 +114,17 @@ void ChromeSpeechRecognitionClient::CopyBufferToTempAudioBus(
}
void ChromeSpeechRecognitionClient::ResetChannelMixer(
const media::AudioBuffer& buffer) {
if (!monaural_audio_bus_ ||
buffer.frame_count() != monaural_audio_bus_->frames()) {
int frame_count,
media::ChannelLayout channel_layout) {
if (!monaural_audio_bus_ || frame_count != monaural_audio_bus_->frames()) {
monaural_audio_bus_ =
media::AudioBus::Create(1 /* channels */, buffer.frame_count());
media::AudioBus::Create(1 /* channels */, frame_count);
}
if (buffer.channel_layout() != channel_layout_) {
channel_layout_ = buffer.channel_layout();
if (channel_layout != channel_layout_) {
channel_layout_ = channel_layout;
channel_mixer_ = std::make_unique<media::ChannelMixer>(
buffer.channel_layout(), media::CHANNEL_LAYOUT_MONO);
channel_layout, media::CHANNEL_LAYOUT_MONO);
}
}
......@@ -131,7 +145,7 @@ ChromeSpeechRecognitionClient::ConvertToAudioDataS16(
if (buffer->channel_count() > 1 && !is_multichannel_supported_) {
signed_buffer->channel_count = 1;
CopyBufferToTempAudioBus(*buffer);
ResetChannelMixer(*buffer);
ResetChannelMixer(buffer->frame_count(), buffer->channel_layout());
signed_buffer->data.resize(buffer->frame_count());
channel_mixer_->Transform(temp_audio_bus_.get(), monaural_audio_bus_.get());
monaural_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>(
......@@ -158,6 +172,40 @@ ChromeSpeechRecognitionClient::ConvertToAudioDataS16(
return signed_buffer;
}
// Converts |audio_bus| into the interleaved signed-16-bit sample format the
// speech recognition service consumes. When the service only accepts mono
// audio, multichannel input is first mixed down to a single channel via
// |channel_mixer_| and |monaural_audio_bus_|.
media::mojom::AudioDataS16Ptr
ChromeSpeechRecognitionClient::ConvertToAudioDataS16(
    std::unique_ptr<media::AudioBus> audio_bus,
    int sample_rate,
    media::ChannelLayout channel_layout) {
  DCHECK_GT(audio_bus->frames(), 0);
  DCHECK_GT(audio_bus->channels(), 0);

  const int frame_count = audio_bus->frames();
  auto result = media::mojom::AudioDataS16::New();
  result->sample_rate = sample_rate;
  result->frame_count = frame_count;

  // If multichannel audio is not supported by the speech recognition service,
  // mix the channels into a monaural channel before converting it.
  const bool mix_to_mono =
      audio_bus->channels() > 1 && !is_multichannel_supported_;
  if (mix_to_mono) {
    result->channel_count = 1;
    // (Re)create the mixer/mono bus if the frame count or layout changed.
    ResetChannelMixer(frame_count, channel_layout);
    result->data.resize(frame_count);
    channel_mixer_->Transform(audio_bus.get(), monaural_audio_bus_.get());
    monaural_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>(
        monaural_audio_bus_->frames(), &result->data[0]);
  } else {
    result->channel_count = audio_bus->channels();
    result->data.resize(frame_count * audio_bus->channels());
    audio_bus->ToInterleaved<media::SignedInt16SampleTypeTraits>(
        frame_count, &result->data[0]);
  }
  return result;
}
// Returns true when |url| appears in the set of sites for which live
// captioning is disabled.
bool ChromeSpeechRecognitionClient::IsUrlBlocked(const std::string& url) const {
  return blocked_urls_.count(url) != 0;
}
......@@ -21,6 +21,7 @@ class RenderFrame;
} // namespace content
namespace media {
class AudioBus;
class ChannelMixer;
} // namespace media
......@@ -38,6 +39,9 @@ class ChromeSpeechRecognitionClient
// media::SpeechRecognitionClient
void AddAudio(scoped_refptr<media::AudioBuffer> buffer) override;
void AddAudio(std::unique_ptr<media::AudioBus> audio_bus,
int sample_rate,
media::ChannelLayout channel_layout) override;
bool IsSpeechRecognitionAvailable() override;
// Callback executed when the recognizer is bound. Sets the flag indicating
......@@ -54,13 +58,20 @@ class ChromeSpeechRecognitionClient
// Called as a response to sending a transcription to the browser.
void OnTranscriptionCallback(bool success);
media::mojom::AudioDataS16Ptr ConvertToAudioDataS16(
std::unique_ptr<media::AudioBus> audio_bus,
int sample_rate,
media::ChannelLayout channel_layout);
// Recreates the temporary audio bus if the frame count or channel count
// changed and reads the frames from the buffer into the temporary audio bus.
void CopyBufferToTempAudioBus(const media::AudioBuffer& buffer);
// Resets the temporary monaural audio bus and the channel mixer used to
// combine multiple audio channels.
void ResetChannelMixer(const media::AudioBuffer& buffer);
void ResetChannelMixer(int frame_count, media::ChannelLayout channel_layout);
bool IsUrlBlocked(const std::string& url) const;
media::SpeechRecognitionClient::OnReadyCallback on_ready_callback_;
......
......@@ -614,7 +614,7 @@ class CONTENT_EXPORT RenderFrameImpl
CreateWorkerContentSettingsClient() override;
#if !defined(OS_ANDROID)
std::unique_ptr<media::SpeechRecognitionClient> CreateSpeechRecognitionClient(
media::SpeechRecognitionClient::OnReadyCallback callback);
media::SpeechRecognitionClient::OnReadyCallback callback) override;
#endif
scoped_refptr<blink::WebWorkerFetchContext> CreateWorkerFetchContext()
override;
......
......@@ -9,6 +9,7 @@
#include "base/callback.h"
#include "media/base/audio_buffer.h"
#include "media/base/audio_bus.h"
#include "media/base/media_export.h"
namespace media {
......@@ -23,6 +24,10 @@ class MEDIA_EXPORT SpeechRecognitionClient {
virtual void AddAudio(scoped_refptr<AudioBuffer> buffer) = 0;
virtual void AddAudio(std::unique_ptr<media::AudioBus> audio_bus,
int sample_rate,
media::ChannelLayout channel_layout) = 0;
virtual bool IsSpeechRecognitionAvailable() = 0;
};
......
......@@ -23,6 +23,7 @@ include_rules = [
"+cc/trees/layer_tree_host_client.h",
"+cc/trees/viewport_layers.h",
"+components/viz/common/surfaces/frame_sink_id.h",
"+media/base",
"+mojo/public",
"+net/cookies/site_for_cookies.h",
"+printing/mojom/print.mojom-shared.h",
......
......@@ -37,6 +37,7 @@
#include "base/i18n/rtl.h"
#include "base/optional.h"
#include "base/unguessable_token.h"
#include "media/base/speech_recognition_client.h"
#include "services/network/public/mojom/web_sandbox_flags.mojom-shared.h"
#include "third_party/blink/public/common/feature_policy/feature_policy.h"
#include "third_party/blink/public/common/loader/loading_behavior_flag.h"
......@@ -155,6 +156,13 @@ class BLINK_EXPORT WebLocalFrameClient {
return nullptr;
}
// Creates a client that forwards rendered audio to the speech recognition
// service for the Live Caption feature. |callback| presumably signals when
// the recognizer becomes usable (see media::SpeechRecognitionClient::
// OnReadyCallback) — confirm against the implementing embedder.
// May return null.
virtual std::unique_ptr<media::SpeechRecognitionClient>
CreateSpeechRecognitionClient(
media::SpeechRecognitionClient::OnReadyCallback callback) {
return nullptr;
}
// Returns a new WebWorkerFetchContext for a dedicated worker (in the
// non-PlzDedicatedWorker case) or worklet.
virtual scoped_refptr<WebWorkerFetchContext> CreateWorkerFetchContext() {
......
......@@ -1121,6 +1121,14 @@ LocalFrameClientImpl::CreateWorkerContentSettingsClient() {
return web_frame_->Client()->CreateWorkerContentSettingsClient();
}
// Delegates speech recognition client creation to the embedder's
// WebLocalFrameClient. May return null if the embedder does not provide one.
std::unique_ptr<media::SpeechRecognitionClient>
LocalFrameClientImpl::CreateSpeechRecognitionClient(
    media::SpeechRecognitionClient::OnReadyCallback callback) {
  auto* frame_client = web_frame_->Client();
  DCHECK(frame_client);
  return frame_client->CreateSpeechRecognitionClient(std::move(callback));
}
void LocalFrameClientImpl::SetMouseCapture(bool capture) {
web_frame_->Client()->SetMouseCapture(capture);
}
......
......@@ -266,6 +266,9 @@ class CORE_EXPORT LocalFrameClientImpl final : public LocalFrameClient {
std::unique_ptr<WebContentSettingsClient> CreateWorkerContentSettingsClient()
override;
std::unique_ptr<media::SpeechRecognitionClient> CreateSpeechRecognitionClient(
media::SpeechRecognitionClient::OnReadyCallback callback) override;
void SetMouseCapture(bool capture) override;
bool UsePrintingLayout() const override;
......
......@@ -403,6 +403,12 @@ class CORE_EXPORT LocalFrameClient : public FrameClient {
return nullptr;
}
// Creates a client used to transcribe rendered audio for live captioning.
// The default implementation returns null; callers must tolerate a null
// result.
virtual std::unique_ptr<media::SpeechRecognitionClient>
CreateSpeechRecognitionClient(
media::SpeechRecognitionClient::OnReadyCallback callback) {
return nullptr;
}
virtual void SetMouseCapture(bool) {}
// Returns whether we are associated with a print context who suggests to use
......
......@@ -10,6 +10,7 @@ include_rules = [
"+media/base/audio_parameters.h",
"+media/base/audio_pull_fifo.h",
"+media/base/audio_renderer_sink.h",
"+media/base/bind_to_current_loop.h",
"+media/base/channel_layout.h",
"+media/base/sample_rates.h",
......
......@@ -13,14 +13,18 @@
#include "base/threading/thread_checker.h"
#include "build/build_config.h"
#include "media/audio/audio_sink_parameters.h"
#include "media/base/audio_bus.h"
#include "media/base/audio_capturer_source.h"
#include "media/base/audio_latency.h"
#include "media/base/audio_parameters.h"
#include "media/base/bind_to_current_loop.h"
#include "media/base/channel_layout.h"
#include "media/base/sample_rates.h"
#include "third_party/blink/public/platform/modules/webrtc/webrtc_logging.h"
#include "third_party/blink/public/platform/platform.h"
#include "third_party/blink/public/platform/web_media_stream_track.h"
#include "third_party/blink/public/web/web_local_frame.h"
#include "third_party/blink/public/web/web_local_frame_client.h"
#include "third_party/blink/renderer/core/frame/local_frame.h"
#include "third_party/blink/renderer/platform/mediastream/media_stream_audio_track.h"
#include "third_party/blink/renderer/platform/scheduler/public/post_cross_thread_task.h"
......@@ -316,6 +320,15 @@ WebRtcAudioRenderer::WebRtcAudioRenderer(
sink_params_(kFormat, media::CHANNEL_LAYOUT_STEREO, 0, 0),
output_device_id_(device_id),
on_render_error_callback_(std::move(on_render_error_callback)) {
if (web_frame && web_frame->Client()) {
speech_recognition_client_ =
web_frame->Client()->CreateSpeechRecognitionClient(
media::BindToCurrentLoop(
ConvertToBaseOnceCallback(CrossThreadBindOnce(
&WebRtcAudioRenderer::EnableSpeechRecognition,
weak_factory_.GetWeakPtr()))));
}
SendLogMessage(
String::Format("%s({session_id=%s}, {device_id=%s})", __func__,
session_id.is_empty() ? "" : session_id.ToString().c_str(),
......@@ -589,6 +602,14 @@ void WebRtcAudioRenderer::SwitchOutputDevice(
std::move(callback).Run(media::OUTPUT_DEVICE_STATUS_OK);
}
// Forwards a copy of the rendered audio to the speech recognition client for
// caption transcription.
// NOTE(review): |speech_recognition_client_| is dereferenced without a null
// check; this method is only bound into |transcribe_audio_callback_| by
// EnableSpeechRecognition(), which first verifies the client exists —
// confirm no other code path can invoke this directly.
void WebRtcAudioRenderer::TranscribeAudio(
std::unique_ptr<media::AudioBus> audio_bus,
int sample_rate,
media::ChannelLayout channel_layout) {
speech_recognition_client_->AddAudio(std::move(audio_bus), sample_rate,
channel_layout);
}
int WebRtcAudioRenderer::Render(base::TimeDelta delay,
base::TimeTicks delay_timestamp,
int prior_frames_skipped,
......@@ -634,6 +655,15 @@ int WebRtcAudioRenderer::Render(base::TimeDelta delay,
audio_stream_tracker_->MeasurePower(*audio_bus, audio_bus->frames());
}
if (transcribe_audio_callback_) {
auto audio_bus_copy =
media::AudioBus::Create(audio_bus->channels(), audio_bus->frames());
audio_bus->CopyTo(audio_bus_copy.get());
transcribe_audio_callback_.Run(std::move(audio_bus_copy),
sink_params_.sample_rate(),
sink_params_.channel_layout());
}
return (state_ == PLAYING) ? audio_bus->frames() : 0;
}
......@@ -925,4 +955,14 @@ void WebRtcAudioRenderer::SendLogMessage(const WTF::String& message) {
.Utf8());
}
// Invoked once the speech recognition client reports it is ready. Installs
// |transcribe_audio_callback_| (bound to the current loop, holding a weak
// pointer to |this|) so that Render() starts shipping audio copies to
// TranscribeAudio(). Left unset when no client exists or recognition is
// unavailable.
void WebRtcAudioRenderer::EnableSpeechRecognition() {
  if (!speech_recognition_client_ ||
      !speech_recognition_client_->IsSpeechRecognitionAvailable()) {
    return;
  }
  transcribe_audio_callback_ =
      media::BindToCurrentLoop(ConvertToBaseRepeatingCallback(
          CrossThreadBindRepeating(&WebRtcAudioRenderer::TranscribeAudio,
                                   weak_factory_.GetWeakPtr())));
}
} // namespace blink
......@@ -35,6 +35,10 @@
#include "third_party/blink/renderer/platform/webrtc/webrtc_source.h"
#include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"
namespace media {
class SpeechRecognitionClient;
} // namespace media
namespace webrtc {
class AudioSourceInterface;
} // namespace webrtc
......@@ -50,6 +54,10 @@ class MODULES_EXPORT WebRtcAudioRenderer
: public media::AudioRendererSink::RenderCallback,
public blink::WebMediaStreamAudioRenderer {
public:
// Send the audio to the speech recognition service for caption transcription.
using TranscribeAudioCallback = base::RepeatingCallback<
void(std::unique_ptr<media::AudioBus>, int, media::ChannelLayout)>;
// This is a little utility class that holds the configured state of an audio
// stream.
// It is used by both WebRtcAudioRenderer and SharedAudioRenderer (see cc
......@@ -245,6 +253,10 @@ class MODULES_EXPORT WebRtcAudioRenderer
// Flag to keep track the state of the renderer.
State state_;
void TranscribeAudio(std::unique_ptr<media::AudioBus> audio_bus,
int sample_rate,
media::ChannelLayout channel_layout);
// media::AudioRendererSink::RenderCallback implementation.
// These two methods are called on the AudioOutputDevice worker thread.
int Render(base::TimeDelta delay,
......@@ -291,6 +303,8 @@ class MODULES_EXPORT WebRtcAudioRenderer
void SendLogMessage(const WTF::String& message);
void EnableSpeechRecognition();
// The WebLocalFrame in which the audio is rendered into |sink_|.
//
// TODO(crbug.com/704136): Replace |source_internal_frame_| with regular
......@@ -369,6 +383,11 @@ class MODULES_EXPORT WebRtcAudioRenderer
base::RepeatingCallback<void()> on_render_error_callback_;
std::unique_ptr<media::SpeechRecognitionClient> speech_recognition_client_;
TranscribeAudioCallback transcribe_audio_callback_;
base::WeakPtrFactory<WebRtcAudioRenderer> weak_factory_{this};
DISALLOW_IMPLICIT_CONSTRUCTORS(WebRtcAudioRenderer);
};
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment