Commit ca885407 authored by Evan Liu's avatar Evan Liu Committed by Commit Bot

Reland "Add Live Caption support for WebRTC"

Patchset 1 contains the reland without any changes.

This CL integrates the WebRTC audio renderer with the live captioning
pipeline, allowing the Chrome browser to automatically generate captions
from the audio rendered by the WebRTCAudioRenderer. The high-level
design document for the Live Caption feature can be found at:
go/chrome-live-caption.

Bug: 1093096

Change-Id: I84f31a42aecb511f8d1c188eed23dd738f2cb2b1
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2250707
Commit-Queue: Evan Liu <evliu@google.com>
Reviewed-by: default avatarDale Curtis <dalecurtis@chromium.org>
Reviewed-by: default avatarKentaro Hara <haraken@chromium.org>
Cr-Commit-Position: refs/heads/master@{#781562}
parent 14a3f13a
......@@ -8,6 +8,8 @@
#include "base/metrics/field_trial_params.h"
#include "content/public/renderer/render_frame.h"
#include "media/base/audio_bus.h"
#include "media/base/audio_parameters.h"
#include "media/base/channel_mixer.h"
#include "media/base/media_switches.h"
#include "media/mojo/mojom/media_types.mojom.h"
......@@ -66,6 +68,18 @@ void ChromeSpeechRecognitionClient::AddAudio(
}
}
// Forwards one chunk of rendered audio to the speech recognition service.
// The bus is converted to the signed 16-bit mojo wire format
// (media::mojom::AudioDataS16) before being sent. Does nothing when speech
// recognition is currently unavailable (blocked site, no browser request,
// etc. — see IsSpeechRecognitionAvailable()).
void ChromeSpeechRecognitionClient::AddAudio(
    std::unique_ptr<media::AudioBus> audio_bus,
    int sample_rate,
    media::ChannelLayout channel_layout) {
  DCHECK(audio_bus);
  if (!IsSpeechRecognitionAvailable())
    return;
  speech_recognition_recognizer_->SendAudioToSpeechRecognitionService(
      ConvertToAudioDataS16(std::move(audio_bus), sample_rate,
                            channel_layout));
}
bool ChromeSpeechRecognitionClient::IsSpeechRecognitionAvailable() {
// TODO(evliu): Check if SODA is available.
return !is_website_blocked_ && is_browser_requesting_transcription_ &&
......@@ -100,17 +114,17 @@ void ChromeSpeechRecognitionClient::CopyBufferToTempAudioBus(
}
void ChromeSpeechRecognitionClient::ResetChannelMixer(
const media::AudioBuffer& buffer) {
if (!monaural_audio_bus_ ||
buffer.frame_count() != monaural_audio_bus_->frames()) {
int frame_count,
media::ChannelLayout channel_layout) {
if (!monaural_audio_bus_ || frame_count != monaural_audio_bus_->frames()) {
monaural_audio_bus_ =
media::AudioBus::Create(1 /* channels */, buffer.frame_count());
media::AudioBus::Create(1 /* channels */, frame_count);
}
if (buffer.channel_layout() != channel_layout_) {
channel_layout_ = buffer.channel_layout();
if (channel_layout != channel_layout_) {
channel_layout_ = channel_layout;
channel_mixer_ = std::make_unique<media::ChannelMixer>(
buffer.channel_layout(), media::CHANNEL_LAYOUT_MONO);
channel_layout, media::CHANNEL_LAYOUT_MONO);
}
}
......@@ -131,7 +145,7 @@ ChromeSpeechRecognitionClient::ConvertToAudioDataS16(
if (buffer->channel_count() > 1 && !is_multichannel_supported_) {
signed_buffer->channel_count = 1;
CopyBufferToTempAudioBus(*buffer);
ResetChannelMixer(*buffer);
ResetChannelMixer(buffer->frame_count(), buffer->channel_layout());
signed_buffer->data.resize(buffer->frame_count());
channel_mixer_->Transform(temp_audio_bus_.get(), monaural_audio_bus_.get());
monaural_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>(
......@@ -158,6 +172,40 @@ ChromeSpeechRecognitionClient::ConvertToAudioDataS16(
return signed_buffer;
}
// Converts |audio_bus| into the interleaved signed-16-bit sample format the
// speech recognition service consumes. When the service only accepts mono
// audio, multichannel input is first mixed down to a single channel via
// |channel_mixer_| and |monaural_audio_bus_|.
media::mojom::AudioDataS16Ptr
ChromeSpeechRecognitionClient::ConvertToAudioDataS16(
    std::unique_ptr<media::AudioBus> audio_bus,
    int sample_rate,
    media::ChannelLayout channel_layout) {
  DCHECK_GT(audio_bus->frames(), 0);
  DCHECK_GT(audio_bus->channels(), 0);

  const int frame_count = audio_bus->frames();
  auto result = media::mojom::AudioDataS16::New();
  result->sample_rate = sample_rate;
  result->frame_count = frame_count;

  // If multichannel audio is not supported by the speech recognition service,
  // mix the channels into a monaural channel before converting it.
  const bool mix_to_mono =
      audio_bus->channels() > 1 && !is_multichannel_supported_;
  if (mix_to_mono) {
    result->channel_count = 1;
    // (Re)create the mixer/mono bus if the frame count or layout changed.
    ResetChannelMixer(frame_count, channel_layout);
    result->data.resize(frame_count);
    channel_mixer_->Transform(audio_bus.get(), monaural_audio_bus_.get());
    monaural_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>(
        monaural_audio_bus_->frames(), &result->data[0]);
  } else {
    result->channel_count = audio_bus->channels();
    result->data.resize(frame_count * audio_bus->channels());
    audio_bus->ToInterleaved<media::SignedInt16SampleTypeTraits>(
        frame_count, &result->data[0]);
  }
  return result;
}
// Returns true when |url| appears in the set of sites for which live
// captioning is disabled.
bool ChromeSpeechRecognitionClient::IsUrlBlocked(const std::string& url) const {
  return blocked_urls_.count(url) != 0;
}
......@@ -21,6 +21,7 @@ class RenderFrame;
} // namespace content
namespace media {
class AudioBus;
class ChannelMixer;
} // namespace media
......@@ -38,6 +39,9 @@ class ChromeSpeechRecognitionClient
// media::SpeechRecognitionClient
void AddAudio(scoped_refptr<media::AudioBuffer> buffer) override;
void AddAudio(std::unique_ptr<media::AudioBus> audio_bus,
int sample_rate,
media::ChannelLayout channel_layout) override;
bool IsSpeechRecognitionAvailable() override;
// Callback executed when the recognizer is bound. Sets the flag indicating
......@@ -54,13 +58,20 @@ class ChromeSpeechRecognitionClient
// Called as a response to sending a transcription to the browser.
void OnTranscriptionCallback(bool success);
media::mojom::AudioDataS16Ptr ConvertToAudioDataS16(
std::unique_ptr<media::AudioBus> audio_bus,
int sample_rate,
media::ChannelLayout channel_layout);
// Recreates the temporary audio bus if the frame count or channel count
// changed and reads the frames from the buffer into the temporary audio bus.
void CopyBufferToTempAudioBus(const media::AudioBuffer& buffer);
// Resets the temporary monaural audio bus and the channel mixer used to
// combine multiple audio channels.
void ResetChannelMixer(const media::AudioBuffer& buffer);
void ResetChannelMixer(int frame_count, media::ChannelLayout channel_layout);
bool IsUrlBlocked(const std::string& url) const;
media::SpeechRecognitionClient::OnReadyCallback on_ready_callback_;
......
......@@ -614,7 +614,7 @@ class CONTENT_EXPORT RenderFrameImpl
CreateWorkerContentSettingsClient() override;
#if !defined(OS_ANDROID)
std::unique_ptr<media::SpeechRecognitionClient> CreateSpeechRecognitionClient(
media::SpeechRecognitionClient::OnReadyCallback callback);
media::SpeechRecognitionClient::OnReadyCallback callback) override;
#endif
scoped_refptr<blink::WebWorkerFetchContext> CreateWorkerFetchContext()
override;
......
......@@ -9,6 +9,7 @@
#include "base/callback.h"
#include "media/base/audio_buffer.h"
#include "media/base/audio_bus.h"
#include "media/base/media_export.h"
namespace media {
......@@ -23,6 +24,10 @@ class MEDIA_EXPORT SpeechRecognitionClient {
virtual void AddAudio(scoped_refptr<AudioBuffer> buffer) = 0;
virtual void AddAudio(std::unique_ptr<media::AudioBus> audio_bus,
int sample_rate,
media::ChannelLayout channel_layout) = 0;
virtual bool IsSpeechRecognitionAvailable() = 0;
};
......
......@@ -23,6 +23,7 @@ include_rules = [
"+cc/trees/layer_tree_host_client.h",
"+cc/trees/viewport_layers.h",
"+components/viz/common/surfaces/frame_sink_id.h",
"+media/base",
"+mojo/public",
"+net/cookies/site_for_cookies.h",
"+printing/mojom/print.mojom-shared.h",
......
......@@ -37,6 +37,7 @@
#include "base/i18n/rtl.h"
#include "base/optional.h"
#include "base/unguessable_token.h"
#include "media/base/speech_recognition_client.h"
#include "services/network/public/mojom/web_sandbox_flags.mojom-shared.h"
#include "third_party/blink/public/common/feature_policy/feature_policy.h"
#include "third_party/blink/public/common/loader/loading_behavior_flag.h"
......@@ -155,6 +156,13 @@ class BLINK_EXPORT WebLocalFrameClient {
return nullptr;
}
// Creates a client that forwards rendered audio to the speech recognition
// service for the Live Caption feature. |callback| presumably signals when
// the recognizer becomes usable (see media::SpeechRecognitionClient::
// OnReadyCallback) — confirm against the implementing embedder.
// May return null.
virtual std::unique_ptr<media::SpeechRecognitionClient>
CreateSpeechRecognitionClient(
media::SpeechRecognitionClient::OnReadyCallback callback) {
return nullptr;
}
// Returns a new WebWorkerFetchContext for a dedicated worker (in the
// non-PlzDedicatedWorker case) or worklet.
virtual scoped_refptr<WebWorkerFetchContext> CreateWorkerFetchContext() {
......
......@@ -1121,6 +1121,14 @@ LocalFrameClientImpl::CreateWorkerContentSettingsClient() {
return web_frame_->Client()->CreateWorkerContentSettingsClient();
}
// Delegates speech recognition client creation to the embedder's
// WebLocalFrameClient. May return null if the embedder does not provide one.
std::unique_ptr<media::SpeechRecognitionClient>
LocalFrameClientImpl::CreateSpeechRecognitionClient(
    media::SpeechRecognitionClient::OnReadyCallback callback) {
  auto* frame_client = web_frame_->Client();
  DCHECK(frame_client);
  return frame_client->CreateSpeechRecognitionClient(std::move(callback));
}
void LocalFrameClientImpl::SetMouseCapture(bool capture) {
web_frame_->Client()->SetMouseCapture(capture);
}
......
......@@ -266,6 +266,9 @@ class CORE_EXPORT LocalFrameClientImpl final : public LocalFrameClient {
std::unique_ptr<WebContentSettingsClient> CreateWorkerContentSettingsClient()
override;
std::unique_ptr<media::SpeechRecognitionClient> CreateSpeechRecognitionClient(
media::SpeechRecognitionClient::OnReadyCallback callback) override;
void SetMouseCapture(bool capture) override;
bool UsePrintingLayout() const override;
......
......@@ -403,6 +403,12 @@ class CORE_EXPORT LocalFrameClient : public FrameClient {
return nullptr;
}
// Creates a client used to transcribe rendered audio for live captioning.
// The default implementation returns null; callers must tolerate a null
// result.
virtual std::unique_ptr<media::SpeechRecognitionClient>
CreateSpeechRecognitionClient(
media::SpeechRecognitionClient::OnReadyCallback callback) {
return nullptr;
}
virtual void SetMouseCapture(bool) {}
// Returns whether we are associated with a print context who suggests to use
......
......@@ -10,6 +10,7 @@ include_rules = [
"+media/base/audio_parameters.h",
"+media/base/audio_pull_fifo.h",
"+media/base/audio_renderer_sink.h",
"+media/base/bind_to_current_loop.h",
"+media/base/channel_layout.h",
"+media/base/sample_rates.h",
......
......@@ -13,14 +13,18 @@
#include "base/threading/thread_checker.h"
#include "build/build_config.h"
#include "media/audio/audio_sink_parameters.h"
#include "media/base/audio_bus.h"
#include "media/base/audio_capturer_source.h"
#include "media/base/audio_latency.h"
#include "media/base/audio_parameters.h"
#include "media/base/bind_to_current_loop.h"
#include "media/base/channel_layout.h"
#include "media/base/sample_rates.h"
#include "third_party/blink/public/platform/modules/webrtc/webrtc_logging.h"
#include "third_party/blink/public/platform/platform.h"
#include "third_party/blink/public/platform/web_media_stream_track.h"
#include "third_party/blink/public/web/web_local_frame.h"
#include "third_party/blink/public/web/web_local_frame_client.h"
#include "third_party/blink/renderer/core/frame/local_frame.h"
#include "third_party/blink/renderer/platform/mediastream/media_stream_audio_track.h"
#include "third_party/blink/renderer/platform/scheduler/public/post_cross_thread_task.h"
......@@ -316,6 +320,15 @@ WebRtcAudioRenderer::WebRtcAudioRenderer(
sink_params_(kFormat, media::CHANNEL_LAYOUT_STEREO, 0, 0),
output_device_id_(device_id),
on_render_error_callback_(std::move(on_render_error_callback)) {
if (web_frame && web_frame->Client()) {
speech_recognition_client_ =
web_frame->Client()->CreateSpeechRecognitionClient(
media::BindToCurrentLoop(
ConvertToBaseOnceCallback(CrossThreadBindOnce(
&WebRtcAudioRenderer::EnableSpeechRecognition,
weak_factory_.GetWeakPtr()))));
}
SendLogMessage(
String::Format("%s({session_id=%s}, {device_id=%s})", __func__,
session_id.is_empty() ? "" : session_id.ToString().c_str(),
......@@ -589,6 +602,14 @@ void WebRtcAudioRenderer::SwitchOutputDevice(
std::move(callback).Run(media::OUTPUT_DEVICE_STATUS_OK);
}
// Forwards a copy of the rendered audio to the speech recognition client for
// caption transcription.
// NOTE(review): |speech_recognition_client_| is dereferenced without a null
// check; this method is only bound into |transcribe_audio_callback_| by
// EnableSpeechRecognition(), which first verifies the client exists —
// confirm no other code path can invoke this directly.
void WebRtcAudioRenderer::TranscribeAudio(
std::unique_ptr<media::AudioBus> audio_bus,
int sample_rate,
media::ChannelLayout channel_layout) {
speech_recognition_client_->AddAudio(std::move(audio_bus), sample_rate,
channel_layout);
}
int WebRtcAudioRenderer::Render(base::TimeDelta delay,
base::TimeTicks delay_timestamp,
int prior_frames_skipped,
......@@ -634,6 +655,15 @@ int WebRtcAudioRenderer::Render(base::TimeDelta delay,
audio_stream_tracker_->MeasurePower(*audio_bus, audio_bus->frames());
}
if (transcribe_audio_callback_) {
auto audio_bus_copy =
media::AudioBus::Create(audio_bus->channels(), audio_bus->frames());
audio_bus->CopyTo(audio_bus_copy.get());
transcribe_audio_callback_.Run(std::move(audio_bus_copy),
sink_params_.sample_rate(),
sink_params_.channel_layout());
}
return (state_ == PLAYING) ? audio_bus->frames() : 0;
}
......@@ -925,4 +955,14 @@ void WebRtcAudioRenderer::SendLogMessage(const WTF::String& message) {
.Utf8());
}
// Invoked once the speech recognition client reports it is ready. Installs
// |transcribe_audio_callback_| (bound to the current loop, holding a weak
// pointer to |this|) so that Render() starts shipping audio copies to
// TranscribeAudio(). Left unset when no client exists or recognition is
// unavailable.
void WebRtcAudioRenderer::EnableSpeechRecognition() {
  if (!speech_recognition_client_ ||
      !speech_recognition_client_->IsSpeechRecognitionAvailable()) {
    return;
  }
  transcribe_audio_callback_ =
      media::BindToCurrentLoop(ConvertToBaseRepeatingCallback(
          CrossThreadBindRepeating(&WebRtcAudioRenderer::TranscribeAudio,
                                   weak_factory_.GetWeakPtr())));
}
} // namespace blink
......@@ -35,6 +35,10 @@
#include "third_party/blink/renderer/platform/webrtc/webrtc_source.h"
#include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"
namespace media {
class SpeechRecognitionClient;
} // namespace media
namespace webrtc {
class AudioSourceInterface;
} // namespace webrtc
......@@ -50,6 +54,10 @@ class MODULES_EXPORT WebRtcAudioRenderer
: public media::AudioRendererSink::RenderCallback,
public blink::WebMediaStreamAudioRenderer {
public:
// Send the audio to the speech recognition service for caption transcription.
using TranscribeAudioCallback = base::RepeatingCallback<
void(std::unique_ptr<media::AudioBus>, int, media::ChannelLayout)>;
// This is a little utility class that holds the configured state of an audio
// stream.
// It is used by both WebRtcAudioRenderer and SharedAudioRenderer (see cc
......@@ -245,6 +253,10 @@ class MODULES_EXPORT WebRtcAudioRenderer
// Flag to keep track the state of the renderer.
State state_;
void TranscribeAudio(std::unique_ptr<media::AudioBus> audio_bus,
int sample_rate,
media::ChannelLayout channel_layout);
// media::AudioRendererSink::RenderCallback implementation.
// These two methods are called on the AudioOutputDevice worker thread.
int Render(base::TimeDelta delay,
......@@ -291,6 +303,8 @@ class MODULES_EXPORT WebRtcAudioRenderer
void SendLogMessage(const WTF::String& message);
void EnableSpeechRecognition();
// The WebLocalFrame in which the audio is rendered into |sink_|.
//
// TODO(crbug.com/704136): Replace |source_internal_frame_| with regular
......@@ -369,6 +383,11 @@ class MODULES_EXPORT WebRtcAudioRenderer
base::RepeatingCallback<void()> on_render_error_callback_;
std::unique_ptr<media::SpeechRecognitionClient> speech_recognition_client_;
TranscribeAudioCallback transcribe_audio_callback_;
base::WeakPtrFactory<WebRtcAudioRenderer> weak_factory_{this};
DISALLOW_IMPLICIT_CONSTRUCTORS(WebRtcAudioRenderer);
};
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment