Commit 92468fa3 authored by evliu's avatar evliu Committed by Commit Bot

Create Cloud Speech Recognition Client

This CL adds a speech recognition client that will be used by the Live
Caption feature. This is a temporary implementation using the Open
Speech API that will allow testing and experimentation of the Live
Caption feature while the Speech On-Device API (SODA) is under
development. Once SODA development is completed, the Cloud client will
be replaced by the SodaClient.

Bug: 1076667
Change-Id: Ia4fd07ab3ec8a6c454a125addca7d55e600a57a0
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2194923
Commit-Queue: Evan Liu <evliu@google.com>
Reviewed-by: default avatarOlga Sharonova <olka@chromium.org>
Reviewed-by: default avatarDavid Roger <droger@chromium.org>
Reviewed-by: default avatarColin Blundell <blundell@chromium.org>
Reviewed-by: default avatarRobert Sesek <rsesek@chromium.org>
Reviewed-by: default avatarMatt Menke <mmenke@chromium.org>
Reviewed-by: default avatarDominic Mazzoni <dmazzoni@chromium.org>
Reviewed-by: default avatarRyan Sleevi <rsleevi@chromium.org>
Reviewed-by: default avatarNicolas Ouellet-Payeur <nicolaso@chromium.org>
Reviewed-by: default avatarDale Curtis <dalecurtis@chromium.org>
Cr-Commit-Position: refs/heads/master@{#776192}
parent 7767af0a
include_rules = [
"+chrome/services/speech/buildflags.h",
"+services/network/network_context.h",
"+services/network/public/cpp"
]
...@@ -6,13 +6,24 @@ ...@@ -6,13 +6,24 @@
#include "chrome/browser/service_sandbox_type.h" #include "chrome/browser/service_sandbox_type.h"
#include "chrome/grit/generated_resources.h" #include "chrome/grit/generated_resources.h"
#include "content/public/browser/browser_context.h"
#include "content/public/browser/service_process_host.h" #include "content/public/browser/service_process_host.h"
#include "content/public/browser/storage_partition.h"
#include "services/network/network_context.h"
#include "services/network/public/cpp/shared_url_loader_factory.h"
namespace speech { namespace speech {
constexpr base::TimeDelta kIdleProcessTimeout = base::TimeDelta::FromSeconds(5); constexpr base::TimeDelta kIdleProcessTimeout = base::TimeDelta::FromSeconds(5);
SpeechRecognitionService::SpeechRecognitionService() = default; SpeechRecognitionService::SpeechRecognitionService(
content::BrowserContext* context)
#if !BUILDFLAG(ENABLE_SODA)
: context_(context)
#endif // !BUILDFLAG(ENABLE_SODA)
{
}
SpeechRecognitionService::~SpeechRecognitionService() = default; SpeechRecognitionService::~SpeechRecognitionService() = default;
void SpeechRecognitionService::Create( void SpeechRecognitionService::Create(
...@@ -21,6 +32,27 @@ void SpeechRecognitionService::Create( ...@@ -21,6 +32,27 @@ void SpeechRecognitionService::Create(
speech_recognition_service_->BindContext(std::move(receiver)); speech_recognition_service_->BindContext(std::move(receiver));
} }
void SpeechRecognitionService::OnNetworkServiceDisconnect() {
#if !BUILDFLAG(ENABLE_SODA)
  // Without the Speech On-Device API (SODA), recognition goes through the
  // Open Speech web service, so the sandboxed speech process needs a URL
  // loader factory created from the browser context's network context.
  auto factory_params = network::mojom::URLLoaderFactoryParams::New();
  factory_params->process_id = network::mojom::kBrowserProcessId;
  factory_params->is_trusted = false;
  factory_params->automatically_assign_isolation_info = true;

  mojo::PendingRemote<network::mojom::URLLoaderFactory> pending_factory;
  content::BrowserContext::GetDefaultStoragePartition(context_)
      ->GetNetworkContext()
      ->CreateURLLoaderFactory(pending_factory.InitWithNewPipeAndPassReceiver(),
                               std::move(factory_params));
  speech_recognition_service_->SetUrlLoaderFactory(std::move(pending_factory));
#endif  // !BUILDFLAG(ENABLE_SODA)
}
void SpeechRecognitionService::LaunchIfNotRunning() { void SpeechRecognitionService::LaunchIfNotRunning() {
if (speech_recognition_service_.is_bound()) if (speech_recognition_service_.is_bound())
return; return;
...@@ -38,6 +70,10 @@ void SpeechRecognitionService::LaunchIfNotRunning() { ...@@ -38,6 +70,10 @@ void SpeechRecognitionService::LaunchIfNotRunning() {
// terminated if it isn't already. // terminated if it isn't already.
speech_recognition_service_.reset_on_disconnect(); speech_recognition_service_.reset_on_disconnect();
speech_recognition_service_.reset_on_idle_timeout(kIdleProcessTimeout); speech_recognition_service_.reset_on_idle_timeout(kIdleProcessTimeout);
}
speech_recognition_service_client_.reset();
speech_recognition_service_->BindSpeechRecognitionServiceClient(
speech_recognition_service_client_.BindNewPipeAndPassRemote());
OnNetworkServiceDisconnect();
}
} // namespace speech } // namespace speech
...@@ -5,18 +5,25 @@ ...@@ -5,18 +5,25 @@
#ifndef CHROME_BROWSER_SPEECH_SPEECH_RECOGNITION_SERVICE_H_ #ifndef CHROME_BROWSER_SPEECH_SPEECH_RECOGNITION_SERVICE_H_
#define CHROME_BROWSER_SPEECH_SPEECH_RECOGNITION_SERVICE_H_ #define CHROME_BROWSER_SPEECH_SPEECH_RECOGNITION_SERVICE_H_
#include "chrome/services/speech/buildflags.h"
#include "components/keyed_service/core/keyed_service.h" #include "components/keyed_service/core/keyed_service.h"
#include "media/mojo/mojom/speech_recognition_service.mojom.h" #include "media/mojo/mojom/speech_recognition_service.mojom.h"
#include "mojo/public/cpp/bindings/remote.h" #include "mojo/public/cpp/bindings/remote.h"
namespace content {
class BrowserContext;
} // namespace content
namespace speech { namespace speech {
// Provides a mojo endpoint in the browser that allows the renderer process to // Provides a mojo endpoint in the browser that allows the renderer process to
// launch and initialize the sandboxed speech recognition service // launch and initialize the sandboxed speech recognition service
// process. // process.
class SpeechRecognitionService : public KeyedService { class SpeechRecognitionService
: public KeyedService,
public media::mojom::SpeechRecognitionServiceClient {
public: public:
SpeechRecognitionService(); explicit SpeechRecognitionService(content::BrowserContext* context);
SpeechRecognitionService(const SpeechRecognitionService&) = delete; SpeechRecognitionService(const SpeechRecognitionService&) = delete;
SpeechRecognitionService& operator=(const SpeechRecognitionService&) = delete; SpeechRecognitionService& operator=(const SpeechRecognitionService&) = delete;
~SpeechRecognitionService() override; ~SpeechRecognitionService() override;
...@@ -24,14 +31,25 @@ class SpeechRecognitionService : public KeyedService { ...@@ -24,14 +31,25 @@ class SpeechRecognitionService : public KeyedService {
void Create( void Create(
mojo::PendingReceiver<media::mojom::SpeechRecognitionContext> receiver); mojo::PendingReceiver<media::mojom::SpeechRecognitionContext> receiver);
// media::mojom::SpeechRecognitionServiceClient
void OnNetworkServiceDisconnect() override;
private: private:
// Launches the speech recognition service in a sandboxed utility process. // Launches the speech recognition service in a sandboxed utility process.
void LaunchIfNotRunning(); void LaunchIfNotRunning();
#if !BUILDFLAG(ENABLE_SODA)
// The browser context associated with the keyed service.
content::BrowserContext* const context_;
#endif // !BUILDFLAG(ENABLE_SODA)
// The remote to the speech recognition service. The browser will not launch a // The remote to the speech recognition service. The browser will not launch a
// new speech recognition service process if this remote is already bound. // new speech recognition service process if this remote is already bound.
mojo::Remote<media::mojom::SpeechRecognitionService> mojo::Remote<media::mojom::SpeechRecognitionService>
speech_recognition_service_; speech_recognition_service_;
mojo::Receiver<media::mojom::SpeechRecognitionServiceClient>
speech_recognition_service_client_{this};
}; };
} // namespace speech } // namespace speech
......
...@@ -31,6 +31,6 @@ SpeechRecognitionServiceFactory::SpeechRecognitionServiceFactory() ...@@ -31,6 +31,6 @@ SpeechRecognitionServiceFactory::SpeechRecognitionServiceFactory()
SpeechRecognitionServiceFactory::~SpeechRecognitionServiceFactory() = default; SpeechRecognitionServiceFactory::~SpeechRecognitionServiceFactory() = default;
KeyedService* SpeechRecognitionServiceFactory::BuildServiceInstanceFor( KeyedService* SpeechRecognitionServiceFactory::BuildServiceInstanceFor(
content::BrowserContext* profile) const { content::BrowserContext* context) const {
return new speech::SpeechRecognitionService(); return new speech::SpeechRecognitionService(context);
} }
...@@ -34,7 +34,7 @@ class SpeechRecognitionServiceFactory ...@@ -34,7 +34,7 @@ class SpeechRecognitionServiceFactory
// BrowserContextKeyedServiceFactory: // BrowserContextKeyedServiceFactory:
KeyedService* BuildServiceInstanceFor( KeyedService* BuildServiceInstanceFor(
content::BrowserContext* profile) const override; content::BrowserContext* context) const override;
}; };
#endif // CHROME_BROWSER_SPEECH_SPEECH_RECOGNITION_SERVICE_FACTORY_H_ #endif // CHROME_BROWSER_SPEECH_SPEECH_RECOGNITION_SERVICE_FACTORY_H_
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include <utility> #include <utility>
#include "content/public/renderer/render_frame.h" #include "content/public/renderer/render_frame.h"
#include "media/base/channel_mixer.h"
#include "media/mojo/mojom/media_types.mojom.h" #include "media/mojo/mojom/media_types.mojom.h"
#include "third_party/blink/public/common/browser_interface_broker_proxy.h" #include "third_party/blink/public/common/browser_interface_broker_proxy.h"
...@@ -17,13 +18,20 @@ ChromeSpeechRecognitionClient::ChromeSpeechRecognitionClient( ...@@ -17,13 +18,20 @@ ChromeSpeechRecognitionClient::ChromeSpeechRecognitionClient(
speech_recognition_context_.BindNewPipeAndPassReceiver(); speech_recognition_context_.BindNewPipeAndPassReceiver();
speech_recognition_context_->BindRecognizer( speech_recognition_context_->BindRecognizer(
speech_recognition_recognizer_.BindNewPipeAndPassReceiver(), speech_recognition_recognizer_.BindNewPipeAndPassReceiver(),
speech_recognition_client_receiver_.BindNewPipeAndPassRemote()); speech_recognition_client_receiver_.BindNewPipeAndPassRemote(),
base::BindOnce(&ChromeSpeechRecognitionClient::OnRecognizerBound,
base::Unretained(this)));
render_frame->GetBrowserInterfaceBroker()->GetInterface( render_frame->GetBrowserInterfaceBroker()->GetInterface(
std::move(speech_recognition_context_receiver)); std::move(speech_recognition_context_receiver));
render_frame->GetBrowserInterfaceBroker()->GetInterface( render_frame->GetBrowserInterfaceBroker()->GetInterface(
caption_host_.BindNewPipeAndPassReceiver()); caption_host_.BindNewPipeAndPassReceiver());
} }
// Invoked once the recognizer receiver is bound in the service process.
// Records whether the service accepts multichannel audio so that
// ConvertToAudioDataS16() knows whether it must downmix to mono first.
void ChromeSpeechRecognitionClient::OnRecognizerBound(
    bool is_multichannel_supported) {
  is_multichannel_supported_ = is_multichannel_supported;
}
ChromeSpeechRecognitionClient::~ChromeSpeechRecognitionClient() = default; ChromeSpeechRecognitionClient::~ChromeSpeechRecognitionClient() = default;
void ChromeSpeechRecognitionClient::AddAudio( void ChromeSpeechRecognitionClient::AddAudio(
...@@ -54,6 +62,35 @@ void ChromeSpeechRecognitionClient::OnTranscriptionCallback(bool success) { ...@@ -54,6 +62,35 @@ void ChromeSpeechRecognitionClient::OnTranscriptionCallback(bool success) {
is_browser_requesting_transcription_ = success; is_browser_requesting_transcription_ = success;
} }
// Reads the frames of |buffer| into |temp_audio_bus_|, recreating the bus
// first whenever its channel or frame geometry no longer matches the buffer.
void ChromeSpeechRecognitionClient::CopyBufferToTempAudioBus(
    const media::AudioBuffer& buffer) {
  const int channels = buffer.channel_count();
  const int frames = buffer.frame_count();
  const bool bus_matches = temp_audio_bus_ &&
                           temp_audio_bus_->channels() == channels &&
                           temp_audio_bus_->frames() == frames;
  if (!bus_matches)
    temp_audio_bus_ = media::AudioBus::Create(channels, frames);

  buffer.ReadFrames(frames,
                    /* source_frame_offset */ 0, /* dest_frame_offset */ 0,
                    temp_audio_bus_.get());
}
// Ensures |monaural_audio_bus_| can hold |buffer|'s frame count and that
// |channel_mixer_| matches |buffer|'s channel layout; each is rebuilt only
// when its configuration has actually changed.
void ChromeSpeechRecognitionClient::ResetChannelMixer(
    const media::AudioBuffer& buffer) {
  const int frames = buffer.frame_count();
  if (!monaural_audio_bus_ || monaural_audio_bus_->frames() != frames)
    monaural_audio_bus_ = media::AudioBus::Create(1 /* channels */, frames);

  const media::ChannelLayout incoming_layout = buffer.channel_layout();
  if (incoming_layout != channel_layout_) {
    channel_layout_ = incoming_layout;
    channel_mixer_ = std::make_unique<media::ChannelMixer>(
        incoming_layout, media::CHANNEL_LAYOUT_MONO);
  }
}
media::mojom::AudioDataS16Ptr media::mojom::AudioDataS16Ptr
ChromeSpeechRecognitionClient::ConvertToAudioDataS16( ChromeSpeechRecognitionClient::ConvertToAudioDataS16(
scoped_refptr<media::AudioBuffer> buffer) { scoped_refptr<media::AudioBuffer> buffer) {
...@@ -66,6 +103,21 @@ ChromeSpeechRecognitionClient::ConvertToAudioDataS16( ...@@ -66,6 +103,21 @@ ChromeSpeechRecognitionClient::ConvertToAudioDataS16(
signed_buffer->frame_count = buffer->frame_count(); signed_buffer->frame_count = buffer->frame_count();
signed_buffer->sample_rate = buffer->sample_rate(); signed_buffer->sample_rate = buffer->sample_rate();
// If multichannel audio is not supported by the speech recognition service,
// mix the channels into a monaural channel before converting it.
if (buffer->channel_count() > 1 && !is_multichannel_supported_) {
signed_buffer->channel_count = 1;
CopyBufferToTempAudioBus(*buffer);
ResetChannelMixer(*buffer);
signed_buffer->data.resize(buffer->frame_count());
channel_mixer_->Transform(temp_audio_bus_.get(), monaural_audio_bus_.get());
monaural_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>(
monaural_audio_bus_->frames(), &signed_buffer->data[0]);
return signed_buffer;
}
// If the audio is already in the interleaved signed int 16 format, directly
// assign it to the buffer.
if (buffer->sample_format() == media::SampleFormat::kSampleFormatS16) { if (buffer->sample_format() == media::SampleFormat::kSampleFormatS16) {
int16_t* audio_data = reinterpret_cast<int16_t*>(buffer->channel_data()[0]); int16_t* audio_data = reinterpret_cast<int16_t*>(buffer->channel_data()[0]);
signed_buffer->data.assign( signed_buffer->data.assign(
...@@ -75,17 +127,7 @@ ChromeSpeechRecognitionClient::ConvertToAudioDataS16( ...@@ -75,17 +127,7 @@ ChromeSpeechRecognitionClient::ConvertToAudioDataS16(
} }
// Convert the raw audio to the interleaved signed int 16 sample type. // Convert the raw audio to the interleaved signed int 16 sample type.
if (!temp_audio_bus_ || CopyBufferToTempAudioBus(*buffer);
buffer->channel_count() != temp_audio_bus_->channels() ||
buffer->frame_count() != temp_audio_bus_->frames()) {
temp_audio_bus_ =
media::AudioBus::Create(buffer->channel_count(), buffer->frame_count());
}
buffer->ReadFrames(buffer->frame_count(),
/* source_frame_offset */ 0, /* dest_frame_offset */ 0,
temp_audio_bus_.get());
signed_buffer->data.resize(buffer->frame_count() * buffer->channel_count()); signed_buffer->data.resize(buffer->frame_count() * buffer->channel_count());
temp_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>( temp_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>(
temp_audio_bus_->frames(), &signed_buffer->data[0]); temp_audio_bus_->frames(), &signed_buffer->data[0]);
......
...@@ -19,6 +19,10 @@ namespace content { ...@@ -19,6 +19,10 @@ namespace content {
class RenderFrame; class RenderFrame;
} // namespace content } // namespace content
namespace media {
class ChannelMixer;
} // namespace media
class ChromeSpeechRecognitionClient class ChromeSpeechRecognitionClient
: public media::SpeechRecognitionClient, : public media::SpeechRecognitionClient,
public media::mojom::SpeechRecognitionRecognizerClient { public media::mojom::SpeechRecognitionRecognizerClient {
...@@ -33,6 +37,10 @@ class ChromeSpeechRecognitionClient ...@@ -33,6 +37,10 @@ class ChromeSpeechRecognitionClient
void AddAudio(scoped_refptr<media::AudioBuffer> buffer) override; void AddAudio(scoped_refptr<media::AudioBuffer> buffer) override;
bool IsSpeechRecognitionAvailable() override; bool IsSpeechRecognitionAvailable() override;
// Callback executed when the recognizer is bound. Sets the flag indicating
// whether the speech recognition service supports multichannel audio.
void OnRecognizerBound(bool is_multichannel_supported);
// media::mojom::SpeechRecognitionRecognizerClient // media::mojom::SpeechRecognitionRecognizerClient
void OnSpeechRecognitionRecognitionEvent( void OnSpeechRecognitionRecognitionEvent(
media::mojom::SpeechRecognitionResultPtr result) override; media::mojom::SpeechRecognitionResultPtr result) override;
...@@ -43,6 +51,13 @@ class ChromeSpeechRecognitionClient ...@@ -43,6 +51,13 @@ class ChromeSpeechRecognitionClient
// Called as a response to sending a transcription to the browser. // Called as a response to sending a transcription to the browser.
void OnTranscriptionCallback(bool success); void OnTranscriptionCallback(bool success);
// Recreates the temporary audio bus if the frame count or channel count
// changed and reads the frames from the buffer into the temporary audio bus.
void CopyBufferToTempAudioBus(const media::AudioBuffer& buffer);
// Resets the temporary monaural audio bus and the channel mixer used to
// combine multiple audio channels.
void ResetChannelMixer(const media::AudioBuffer& buffer);
mojo::Remote<media::mojom::SpeechRecognitionContext> mojo::Remote<media::mojom::SpeechRecognitionContext>
speech_recognition_context_; speech_recognition_context_;
...@@ -58,6 +73,19 @@ class ChromeSpeechRecognitionClient ...@@ -58,6 +73,19 @@ class ChromeSpeechRecognitionClient
// Whether the browser is still requesting transcriptions. // Whether the browser is still requesting transcriptions.
bool is_browser_requesting_transcription_ = true; bool is_browser_requesting_transcription_ = true;
// The temporary audio bus used to mix multichannel audio into a single
// channel.
std::unique_ptr<media::AudioBus> monaural_audio_bus_;
std::unique_ptr<media::ChannelMixer> channel_mixer_;
// The layout used to instantiate the channel mixer.
media::ChannelLayout channel_layout_ =
media::ChannelLayout::CHANNEL_LAYOUT_NONE;
// A flag indicating whether the speech recognition service supports
// multichannel audio.
bool is_multichannel_supported_ = false;
}; };
#endif // CHROME_RENDERER_MEDIA_CHROME_SPEECH_RECOGNITION_CLIENT_H_ #endif // CHROME_RENDERER_MEDIA_CHROME_SPEECH_RECOGNITION_CLIENT_H_
...@@ -12,6 +12,8 @@ buildflag_header("buildflags") { ...@@ -12,6 +12,8 @@ buildflag_header("buildflags") {
source_set("lib") { source_set("lib") {
sources = [ sources = [
"cloud_speech_recognition_client.cc",
"cloud_speech_recognition_client.h",
"speech_recognition_recognizer_impl.cc", "speech_recognition_recognizer_impl.cc",
"speech_recognition_recognizer_impl.h", "speech_recognition_recognizer_impl.h",
"speech_recognition_service_impl.cc", "speech_recognition_service_impl.cc",
...@@ -27,10 +29,32 @@ source_set("lib") { ...@@ -27,10 +29,32 @@ source_set("lib") {
deps = [ deps = [
":buildflags", ":buildflags",
"//base", "//base",
"//services/service_manager/public/cpp", "//components/speech",
"//content/browser/speech/proto",
"//mojo/public/cpp/bindings",
"//net",
"//services/network/public/cpp",
"//services/network/public/mojom",
] ]
if (enable_soda) { if (enable_soda) {
deps += [ "//chrome/services/soda/internal" ] deps += [ "//chrome/services/soda/internal" ]
} }
} }
source_set("unit_tests") {
testonly = true
sources = [ "cloud_speech_recognition_client_unittest.cc" ]
deps = [
":lib",
"//base",
"//base/test:test_support",
"//chrome/test:test_support",
"//components/speech",
"//content/browser/speech/proto",
"//testing/gmock",
"//testing/gtest",
]
}
include_rules = [ include_rules = [
"+chrome/services/soda/internal", "+chrome/services/soda/internal",
"+chrome/services/speech",
"+components/soda/constants.h", "+components/soda/constants.h",
"+components/speech",
"+content/browser/speech",
"+google_apis",
"+media", "+media",
"+services/network",
"+third_party/blink/public/mojom/speech",
] ]
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/services/speech/cloud_speech_recognition_client.h"
#include "base/memory/ptr_util.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "content/browser/speech/proto/google_streaming_api.pb.h"
#include "google_apis/google_api_keys.h"
#include "mojo/public/cpp/bindings/receiver_set.h"
#include "net/base/escape.h"
#include "net/traffic_annotation/network_traffic_annotation.h"
#include "services/network/public/cpp/shared_url_loader_factory.h"
#include "services/network/public/cpp/simple_url_loader.h"
#include "services/network/public/cpp/simple_url_loader_stream_consumer.h"
#include "services/network/public/mojom/chunked_data_pipe_getter.mojom.h"
#include "url/gurl.h"
namespace speech {
// The maximum duration a stream can be open for. The Open Speech API supports
// 5 minutes of continuous recognition, so reset slightly before that limit.
constexpr base::TimeDelta kStreamResetDuration =
    base::TimeDelta::FromSeconds(295);

// Base URL of the full-duplex Open Speech web service.
constexpr char kWebServiceBaseUrl[] =
    "https://www.google.com/speech-api/full-duplex/v1";

// Path suffix for the downstream (transcription results) channel.
constexpr char kDownstreamUrl[] = "/down";

// Path suffix for the upstream (audio upload) channel.
constexpr char kUpstreamUrl[] = "/up";
// |callback| is run for every transcription result received from the web
// service. |speech_recognition_service_impl| supplies the URL loader factory;
// it is held as a WeakPtr because the service impl may be destroyed while
// this client is still alive. Immediately binds the URL loader factory so
// network requests can be issued once Initialize() is called.
CloudSpeechRecognitionClient::CloudSpeechRecognitionClient(
    OnRecognitionEventCallback callback,
    base::WeakPtr<SpeechRecognitionServiceImpl> speech_recognition_service_impl)
    : recognition_event_callback_(callback),
      speech_recognition_service_impl_(
          std::move(speech_recognition_service_impl)) {
  ResetUrlLoaderFactory();
}
CloudSpeechRecognitionClient::~CloudSpeechRecognitionClient() = default;
// Returns true when either the sample rate or the channel count no longer
// matches the values the stream was last initialized with, signalling that
// the caller should re-Initialize() before sending more audio.
bool CloudSpeechRecognitionClient::DidAudioPropertyChange(int sample_rate,
                                                          int channel_count) {
  const bool properties_unchanged =
      sample_rate == sample_rate_ && channel_count == channel_count_;
  return !properties_unchanged;
}
// Captures the stream parameters from |config|, marks the client as
// initialized, and (re)establishes the upstream/downstream connections.
void CloudSpeechRecognitionClient::Initialize(const CloudSpeechConfig& config) {
  sample_rate_ = config.sample_rate;
  channel_count_ = config.channel_count;
  language_code_ = config.language_code;
  is_initialized_ = true;
  Reset();
}
void CloudSpeechRecognitionClient::OnDownstreamDataReceived(
    base::StringPiece new_response_data) {
  // The downstream response is organized in protocol chunks, each preceded by
  // a 4-byte size prefix that ChunkedByteBuffer handles transparently. These
  // protocol chunks bear no relation to the HTTP chunked-transfer chunks this
  // method is called with: one HTTP chunk may carry a fragment of a protocol
  // chunk or several complete protocol chunks. So: buffer everything first,
  // then drain whatever complete protocol chunks are available.
  chunked_byte_buffer_.Append(new_response_data);

  while (chunked_byte_buffer_.HasChunks()) {
    auto chunk = chunked_byte_buffer_.PopChunk();
    content::proto::SpeechRecognitionEvent event;
    if (!event.ParseFromArray(chunk->data(), chunk->size() * sizeof(uint8_t))) {
      DLOG(ERROR) << "Parsing of the recognition response failed.";
      return;
    }

    // Concatenate the transcripts of every alternative belonging to results
    // that carry a stability value.
    std::string result;
    for (const auto& recognition_result : event.result()) {
      if (!recognition_result.has_stability())
        continue;
      for (const auto& alternative : recognition_result.alternative()) {
        if (alternative.has_transcript())
          result += alternative.transcript();
      }
    }
    recognition_event_callback().Run(result, false);
  }
}
// (Re)establishes the paired upstream/downstream connections to the Open
// Speech API. The two requests share a randomly generated |pair| key so the
// server can correlate the uploaded audio with the downstream transcription
// response. No-op until a URL loader factory has been bound.
// NOTE(review): base::StringPrintf lives in "base/strings/stringprintf.h" —
// confirm it is included (directly or transitively).
void CloudSpeechRecognitionClient::Reset() {
  DCHECK(is_initialized_);

  // Return if the URL loader factory has not been set.
  if (!url_loader_factory_)
    return;

  // Record the reset time so AddAudio() can restart the stream before the
  // 5-minute continuous-recognition limit is reached.
  last_reset_ = base::Time::Now();
  const std::string request_key = base::UnguessableToken::Create().ToString();

  // Setup downstream fetcher. This request is issued first; it blocks until
  // the server has processed the audio sent on the upstream channel.
  GURL downstream_url(base::StringPrintf(
      "%s%s?key=%s&pair=%s&output=pb", kWebServiceBaseUrl, kDownstreamUrl,
      net::EscapeQueryParamValue(google_apis::GetAPIKey(), true).c_str(),
      net::EscapeQueryParamValue(request_key, true).c_str()));

  net::NetworkTrafficAnnotationTag traffic_annotation =
      net::DefineNetworkTrafficAnnotation("cloud_speech_recognition",
                                          R"(
        semantics {
          sender: "Speech Recognition"
          description:
            "Chrome provides transcription from output audio by using the "
            "Google speech recognition web service. Audio is sent to Google's "
            "servers (upstream) and text is returned (downstream). This "
            "network request (downstream) sends an id for getting the text "
            "response. Then the (upstream) request sends the audio data along "
            "with the id. When the server has finished processing the audio "
            "data and produced a text response, it replies to this request."
          trigger:
            "Generally triggered in direct response to a user playing a "
            "media with audio."
          data: "A unique random id for this speech recognition request and "
                "the audio output stream."
          destination: GOOGLE_OWNED_SERVICE
        }
        policy {
          cookies_allowed: NO
          setting:
            "The Live Caption feature can be enabled/disabled in the Chrome "
            "accessibility settings menu. The feature is disabled by default."
          chrome_policy {
            AudioCaptureAllowed {
              policy_options {mode: MANDATORY}
              AudioCaptureAllowed: false
            }
          }
          chrome_policy {
            AudioCaptureAllowedUrls {
              policy_options {mode: MANDATORY}
              AudioCaptureAllowedUrls: {}
            }
          }
        })");

  auto downstream_request = std::make_unique<network::ResourceRequest>();
  downstream_request->credentials_mode = network::mojom::CredentialsMode::kOmit;
  downstream_request->url = downstream_url;
  downstream_loader_ = std::make_unique<speech::DownstreamLoader>(
      std::move(downstream_request), traffic_annotation,
      url_loader_factory_.get(), this);

  // Setup upstream fetcher. The |pair| key matches the downstream request
  // above; audio is appended to this request by AddAudio().
  GURL upstream_url(base::StringPrintf(
      "%s%s?key=%s&pair=%s&output=pb&lang=%s&pFilter=0&maxAlternatives=1&app="
      "chrome&continuous&interim",
      kWebServiceBaseUrl, kUpstreamUrl,
      net::EscapeQueryParamValue(google_apis::GetAPIKey(), true).c_str(),
      net::EscapeQueryParamValue(request_key, true).c_str(),
      net::EscapeQueryParamValue(language_code_, true).c_str()));

  auto upstream_request = std::make_unique<network::ResourceRequest>();
  upstream_request->url = upstream_url;
  upstream_request->method = "POST";
  upstream_request->credentials_mode = network::mojom::CredentialsMode::kOmit;
  // Raw 16-bit linear PCM; the sample rate is carried in the Content-Type.
  upstream_request->headers.SetHeader(
      net::HttpRequestHeaders::kContentType,
      "audio/l16; rate=" + base::NumberToString(sample_rate_));

  upstream_loader_ = std::make_unique<speech::UpstreamLoader>(
      std::move(upstream_request), traffic_annotation,
      url_loader_factory_.get(), this);
}
// Feeds raw audio to the Open Speech API, restarting the stream first when it
// has been open longer than the API's continuous-recognition limit.
void CloudSpeechRecognitionClient::AddAudio(base::span<const char> chunk) {
  DCHECK(is_initialized_);
  if (base::Time::Now() - last_reset_ > kStreamResetDuration) {
    Reset();
  }

  // Reset() is a no-op (and the loaders stay null) while the URL loader
  // factory is unbound — e.g. before SetUrlLoaderFactory() has been called or
  // after a network service disconnect. Drop the audio instead of
  // dereferencing a null |upstream_loader_|.
  if (!upstream_loader_)
    return;

  upstream_loader_->AppendChunkToUpload(std::string(chunk.data(), chunk.size()),
                                        false);
}
// Test hook: binds |factory| as the URL loader factory directly, bypassing
// the SpeechRecognitionServiceImpl-supplied factory.
void CloudSpeechRecognitionClient::SetUrlLoaderFactoryForTesting(
    mojo::PendingRemote<network::mojom::URLLoaderFactory> factory) {
  url_loader_factory_.reset();
  url_loader_factory_.Bind(std::move(factory));
}
// Rebinds |url_loader_factory_| from the owning SpeechRecognitionServiceImpl,
// tearing down any in-flight loaders first (they hold raw pointers to the old
// factory). Also installed as the remote's disconnect handler so that a
// dropped factory pipe is transparently re-established. base::Unretained is
// safe because the handler is owned by |url_loader_factory_|, which is a
// member of |this| and cannot outlive it.
void CloudSpeechRecognitionClient::ResetUrlLoaderFactory() {
  downstream_loader_.reset();
  upstream_loader_.reset();
  url_loader_factory_.reset();

  // The service impl may already be gone; stay unbound in that case.
  if (!speech_recognition_service_impl_)
    return;

  url_loader_factory_ = mojo::Remote<network::mojom::URLLoaderFactory>(
      speech_recognition_service_impl_->GetUrlLoaderFactory());
  url_loader_factory_.set_disconnect_handler(
      base::BindOnce(&CloudSpeechRecognitionClient::ResetUrlLoaderFactory,
                     base::Unretained(this)));

  // Only restart the streams once the client has been configured via
  // Initialize(); Reset() DCHECKs on |is_initialized_|.
  if (!is_initialized_)
    return;

  Reset();
}
} // namespace speech
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROME_SERVICES_SPEECH_CLOUD_SPEECH_RECOGNITION_CLIENT_H_
#define CHROME_SERVICES_SPEECH_CLOUD_SPEECH_RECOGNITION_CLIENT_H_
#include "base/callback.h"
#include "base/containers/span.h"
#include "base/memory/weak_ptr.h"
#include "base/strings/string_piece.h"
#include "base/unguessable_token.h"
#include "chrome/services/speech/speech_recognition_service_impl.h"
#include "components/speech/downstream_loader.h"
#include "components/speech/downstream_loader_client.h"
#include "components/speech/upstream_loader.h"
#include "components/speech/upstream_loader_client.h"
#include "content/browser/speech/chunked_byte_buffer.h"
#include "services/network/public/mojom/url_loader_factory.mojom.h"
namespace speech {
// Encapsulates the configuration parameters used to initialize the stream.
struct CloudSpeechConfig {
  // Audio sample rate in Hz; sent in the upstream request's Content-Type.
  int sample_rate;
  // Number of channels in the incoming audio.
  int channel_count;
  // Language code forwarded as the web service's |lang| query parameter.
  std::string language_code;
};
// Streams audio to the Open Speech API to generate transcriptions. This is a
// temporary solution that will enable testing and experimentation of the Live
// Caption feature while the Speech On-Device API (SODA) is under development.
// Much of this implementation overlaps with that of the SpeechRecognitionEngine
// used by the WebSpeech API. This code is intentionally kept separate from the
// WebSpeech API implementation to reduce code churn once this client is removed
// and replaced with the SodaClient.
class CloudSpeechRecognitionClient : public speech::UpstreamLoaderClient,
                                     public speech::DownstreamLoaderClient {
 public:
  // Run for each transcription result received from the web service.
  using OnRecognitionEventCallback =
      base::RepeatingCallback<void(const std::string& result,
                                   const bool is_final)>;

  explicit CloudSpeechRecognitionClient(
      OnRecognitionEventCallback callback,
      base::WeakPtr<SpeechRecognitionServiceImpl>
          speech_recognition_service_impl);
  ~CloudSpeechRecognitionClient() override;

  // Checks whether the sample rate or channel count differs from the values
  // used to initialize the stream.
  bool DidAudioPropertyChange(int sample_rate, int channel_count);

  // Initializes the stream instance with the provided config.
  void Initialize(const CloudSpeechConfig& config);

  // speech::DownstreamLoaderClient
  void OnDownstreamDataReceived(base::StringPiece new_response_data) override;
  void OnDownstreamDataComplete(bool success, int response_code) override {}

  // speech::UpstreamLoaderClient
  void OnUpstreamDataComplete(bool success, int response_code) override {}

  // Resets the stream instance.
  void Reset();

  // Feeds raw audio to the Open Speech API.
  void AddAudio(base::span<const char> chunk);

  // Replaces the URL loader factory in tests, bypassing the one supplied by
  // SpeechRecognitionServiceImpl.
  void SetUrlLoaderFactoryForTesting(
      mojo::PendingRemote<network::mojom::URLLoaderFactory> factory);

  // Returns a flag indicating whether the stream has been initialized.
  bool IsInitialized() { return is_initialized_; }

  OnRecognitionEventCallback recognition_event_callback() {
    return recognition_event_callback_;
  }

 private:
  friend class speech::UpstreamLoader;
  friend class speech::DownstreamLoader;

  // Rebinds the URL loader factory from the service impl; also installed as
  // its disconnect handler.
  void ResetUrlLoaderFactory();

  // Whether Initialize() has been called.
  bool is_initialized_ = false;

  // Audio properties captured by the most recent Initialize() call.
  int sample_rate_ = 0;
  int channel_count_ = 0;
  std::string language_code_;

  // Stores the last time the stream was reset.
  base::Time last_reset_;

  OnRecognitionEventCallback recognition_event_callback_;

  // Loaders for the paired audio-upload and transcription-download requests.
  std::unique_ptr<speech::UpstreamLoader> upstream_loader_;
  std::unique_ptr<speech::DownstreamLoader> downstream_loader_;

  // Remote owned by the SpeechRecognitionServiceImpl.
  mojo::Remote<network::mojom::URLLoaderFactory> url_loader_factory_;

  base::WeakPtr<SpeechRecognitionServiceImpl> speech_recognition_service_impl_;

  // Reassembles the length-prefixed protocol chunks of the downstream
  // response.
  content::ChunkedByteBuffer chunked_byte_buffer_;
};
} // namespace speech
#endif // CHROME_SERVICES_SPEECH_CLOUD_SPEECH_RECOGNITION_CLIENT_H_
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include <utility> #include <utility>
#include "base/bind.h" #include "base/bind.h"
#include "base/containers/span.h"
#include "components/soda/constants.h" #include "components/soda/constants.h"
#include "media/base/audio_buffer.h" #include "media/base/audio_buffer.h"
#include "media/base/audio_sample_types.h" #include "media/base/audio_sample_types.h"
...@@ -22,6 +23,8 @@ ...@@ -22,6 +23,8 @@
namespace speech { namespace speech {
constexpr char kInvalidAudioDataError[] = "Invalid audio data received.";
namespace { namespace {
#if BUILDFLAG(ENABLE_SODA) #if BUILDFLAG(ENABLE_SODA)
...@@ -47,13 +50,23 @@ SpeechRecognitionRecognizerImpl::~SpeechRecognitionRecognizerImpl() = default; ...@@ -47,13 +50,23 @@ SpeechRecognitionRecognizerImpl::~SpeechRecognitionRecognizerImpl() = default;
void SpeechRecognitionRecognizerImpl::Create( void SpeechRecognitionRecognizerImpl::Create(
mojo::PendingReceiver<media::mojom::SpeechRecognitionRecognizer> receiver, mojo::PendingReceiver<media::mojom::SpeechRecognitionRecognizer> receiver,
mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> remote,
remote) { base::WeakPtr<SpeechRecognitionServiceImpl>
speech_recognition_service_impl) {
mojo::MakeSelfOwnedReceiver( mojo::MakeSelfOwnedReceiver(
base::WrapUnique(new SpeechRecognitionRecognizerImpl(std::move(remote))), std::make_unique<SpeechRecognitionRecognizerImpl>(
std::move(remote), std::move(speech_recognition_service_impl)),
std::move(receiver)); std::move(receiver));
} }
bool SpeechRecognitionRecognizerImpl::IsMultichannelSupported() {
#if BUILDFLAG(ENABLE_SODA)
return true;
#else
return false;
#endif // BUILDFLAG(ENABLE_SODA)
}
void SpeechRecognitionRecognizerImpl::OnRecognitionEvent( void SpeechRecognitionRecognizerImpl::OnRecognitionEvent(
const std::string& result, const std::string& result,
const bool is_final) { const bool is_final) {
...@@ -62,13 +75,17 @@ void SpeechRecognitionRecognizerImpl::OnRecognitionEvent( ...@@ -62,13 +75,17 @@ void SpeechRecognitionRecognizerImpl::OnRecognitionEvent(
} }
SpeechRecognitionRecognizerImpl::SpeechRecognitionRecognizerImpl( SpeechRecognitionRecognizerImpl::SpeechRecognitionRecognizerImpl(
mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> remote) mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> remote,
base::WeakPtr<SpeechRecognitionServiceImpl> speech_recognition_service_impl)
: client_remote_(std::move(remote)) { : client_remote_(std::move(remote)) {
recognition_event_callback_ = media::BindToCurrentLoop( recognition_event_callback_ = media::BindToCurrentLoop(
base::Bind(&SpeechRecognitionRecognizerImpl::OnRecognitionEvent, base::Bind(&SpeechRecognitionRecognizerImpl::OnRecognitionEvent,
weak_factory_.GetWeakPtr())); weak_factory_.GetWeakPtr()));
#if BUILDFLAG(ENABLE_SODA) #if BUILDFLAG(ENABLE_SODA)
soda_client_ = std::make_unique<soda::SodaClient>(GetSodaBinaryPath()); soda_client_ = std::make_unique<soda::SodaClient>(GetSodaBinaryPath());
#else
cloud_client_ = std::make_unique<CloudSpeechRecognitionClient>(
recognition_event_callback(), std::move(speech_recognition_service_impl));
#endif // BUILDFLAG(ENABLE_SODA) #endif // BUILDFLAG(ENABLE_SODA)
} }
...@@ -77,13 +94,27 @@ void SpeechRecognitionRecognizerImpl::SendAudioToSpeechRecognitionService( ...@@ -77,13 +94,27 @@ void SpeechRecognitionRecognizerImpl::SendAudioToSpeechRecognitionService(
int channel_count = buffer->channel_count; int channel_count = buffer->channel_count;
int frame_count = buffer->frame_count; int frame_count = buffer->frame_count;
int sample_rate = buffer->sample_rate; int sample_rate = buffer->sample_rate;
int num_samples; size_t num_samples = 0;
int data_size; size_t buffer_size = 0;
if (channel_count <= 0 || channel_count > media::limits::kMaxChannels ||
sample_rate <= 0 || frame_count <= 0 || // Verify the channel count.
if (channel_count <= 0 || channel_count > media::limits::kMaxChannels) {
mojo::ReportBadMessage(kInvalidAudioDataError);
return;
}
// Verify and calculate the number of samples.
if (sample_rate <= 0 || frame_count <= 0 ||
!base::CheckMul(frame_count, channel_count).AssignIfValid(&num_samples) || !base::CheckMul(frame_count, channel_count).AssignIfValid(&num_samples) ||
!base::CheckMul(num_samples, sizeof(int16_t)).AssignIfValid(&data_size)) { num_samples != buffer->data.size()) {
mojo::ReportBadMessage("Invalid audio data received."); mojo::ReportBadMessage(kInvalidAudioDataError);
return;
}
// Verify and calculate the buffer size.
if (!base::CheckMul(buffer->data.size(), sizeof(buffer->data[0]))
.AssignIfValid(&buffer_size)) {
mojo::ReportBadMessage(kInvalidAudioDataError);
return; return;
} }
...@@ -103,7 +134,21 @@ void SpeechRecognitionRecognizerImpl::SendAudioToSpeechRecognitionService( ...@@ -103,7 +134,21 @@ void SpeechRecognitionRecognizerImpl::SendAudioToSpeechRecognitionService(
} }
soda_client_->AddAudio(reinterpret_cast<char*>(buffer->data.data()), soda_client_->AddAudio(reinterpret_cast<char*>(buffer->data.data()),
data_size); buffer_size);
#else
DCHECK(cloud_client_);
if (!cloud_client_->IsInitialized() ||
cloud_client_->DidAudioPropertyChange(sample_rate, channel_count)) {
// Initialize the stream.
CloudSpeechConfig config;
config.sample_rate = sample_rate;
config.channel_count = channel_count;
config.language_code = "en-US";
cloud_client_->Initialize(config);
}
cloud_client_->AddAudio(base::span<const char>(
reinterpret_cast<char*>(buffer->data.data()), buffer_size));
#endif // BUILDFLAG(ENABLE_SODA) #endif // BUILDFLAG(ENABLE_SODA)
} }
......
...@@ -9,8 +9,8 @@ ...@@ -9,8 +9,8 @@
#include <string> #include <string>
#include "base/memory/weak_ptr.h" #include "base/memory/weak_ptr.h"
#include "build/branding_buildflags.h"
#include "chrome/services/speech/buildflags.h" #include "chrome/services/speech/buildflags.h"
#include "chrome/services/speech/cloud_speech_recognition_client.h"
#include "media/mojo/mojom/speech_recognition_service.mojom.h" #include "media/mojo/mojom/speech_recognition_service.mojom.h"
#include "mojo/public/cpp/bindings/receiver.h" #include "mojo/public/cpp/bindings/receiver.h"
#include "mojo/public/cpp/bindings/remote.h" #include "mojo/public/cpp/bindings/remote.h"
...@@ -20,6 +20,7 @@ class SodaClient; ...@@ -20,6 +20,7 @@ class SodaClient;
} // namespace soda } // namespace soda
namespace speech { namespace speech {
class SpeechRecognitionServiceImpl;
class SpeechRecognitionRecognizerImpl class SpeechRecognitionRecognizerImpl
: public media::mojom::SpeechRecognitionRecognizer { : public media::mojom::SpeechRecognitionRecognizer {
...@@ -27,23 +28,27 @@ class SpeechRecognitionRecognizerImpl ...@@ -27,23 +28,27 @@ class SpeechRecognitionRecognizerImpl
using OnRecognitionEventCallback = using OnRecognitionEventCallback =
base::RepeatingCallback<void(const std::string& result, base::RepeatingCallback<void(const std::string& result,
const bool is_final)>; const bool is_final)>;
SpeechRecognitionRecognizerImpl(
mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
remote,
base::WeakPtr<SpeechRecognitionServiceImpl>
speech_recognition_service_impl);
~SpeechRecognitionRecognizerImpl() override; ~SpeechRecognitionRecognizerImpl() override;
static void Create( static void Create(
mojo::PendingReceiver<media::mojom::SpeechRecognitionRecognizer> receiver, mojo::PendingReceiver<media::mojom::SpeechRecognitionRecognizer> receiver,
mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
remote); remote,
base::WeakPtr<SpeechRecognitionServiceImpl>
speech_recognition_service_impl);
static bool IsMultichannelSupported();
OnRecognitionEventCallback recognition_event_callback() const { OnRecognitionEventCallback recognition_event_callback() const {
return recognition_event_callback_; return recognition_event_callback_;
} }
private: private:
explicit SpeechRecognitionRecognizerImpl(
mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
remote);
// Convert the audio buffer into the appropriate format and feed the raw audio // Convert the audio buffer into the appropriate format and feed the raw audio
// into the speech recognition instance. // into the speech recognition instance.
void SendAudioToSpeechRecognitionService( void SendAudioToSpeechRecognitionService(
...@@ -61,6 +66,8 @@ class SpeechRecognitionRecognizerImpl ...@@ -61,6 +66,8 @@ class SpeechRecognitionRecognizerImpl
std::unique_ptr<soda::SodaClient> soda_client_; std::unique_ptr<soda::SodaClient> soda_client_;
#endif // BUILDFLAG(ENABLE_SODA) #endif // BUILDFLAG(ENABLE_SODA)
std::unique_ptr<CloudSpeechRecognitionClient> cloud_client_;
// The callback that is eventually executed on a speech recognition event // The callback that is eventually executed on a speech recognition event
// which passes the transcribed audio back to the caller via the speech // which passes the transcribed audio back to the caller via the speech
// recognition event client remote. // recognition event client remote.
......
...@@ -19,12 +19,43 @@ void SpeechRecognitionServiceImpl::BindContext( ...@@ -19,12 +19,43 @@ void SpeechRecognitionServiceImpl::BindContext(
speech_recognition_contexts_.Add(this, std::move(context)); speech_recognition_contexts_.Add(this, std::move(context));
} }
void SpeechRecognitionServiceImpl::SetUrlLoaderFactory(
mojo::PendingRemote<network::mojom::URLLoaderFactory> url_loader_factory) {
url_loader_factory_ = mojo::Remote<network::mojom::URLLoaderFactory>(
std::move(url_loader_factory));
url_loader_factory_.set_disconnect_handler(
base::BindOnce(&SpeechRecognitionServiceImpl::DisconnectHandler,
base::Unretained(this)));
}
void SpeechRecognitionServiceImpl::BindSpeechRecognitionServiceClient(
mojo::PendingRemote<media::mojom::SpeechRecognitionServiceClient> client) {
client_ = mojo::Remote<media::mojom::SpeechRecognitionServiceClient>(
std::move(client));
}
mojo::PendingRemote<network::mojom::URLLoaderFactory>
SpeechRecognitionServiceImpl::GetUrlLoaderFactory() {
mojo::PendingRemote<network::mojom::URLLoaderFactory> pending_factory_remote;
url_loader_factory_->Clone(
pending_factory_remote.InitWithNewPipeAndPassReceiver());
return pending_factory_remote;
}
void SpeechRecognitionServiceImpl::BindRecognizer( void SpeechRecognitionServiceImpl::BindRecognizer(
mojo::PendingReceiver<media::mojom::SpeechRecognitionRecognizer> receiver, mojo::PendingReceiver<media::mojom::SpeechRecognitionRecognizer> receiver,
mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> client,
client) { BindRecognizerCallback callback) {
SpeechRecognitionRecognizerImpl::Create(std::move(receiver), SpeechRecognitionRecognizerImpl::Create(
std::move(client)); std::move(receiver), std::move(client), weak_factory_.GetWeakPtr());
std::move(callback).Run(
SpeechRecognitionRecognizerImpl::IsMultichannelSupported());
}
void SpeechRecognitionServiceImpl::DisconnectHandler() {
if (client_.is_bound())
client_->OnNetworkServiceDisconnect();
} }
} // namespace speech } // namespace speech
...@@ -5,10 +5,12 @@ ...@@ -5,10 +5,12 @@
#ifndef CHROME_SERVICES_SPEECH_SPEECH_RECOGNITION_SERVICE_IMPL_H_ #ifndef CHROME_SERVICES_SPEECH_SPEECH_RECOGNITION_SERVICE_IMPL_H_
#define CHROME_SERVICES_SPEECH_SPEECH_RECOGNITION_SERVICE_IMPL_H_ #define CHROME_SERVICES_SPEECH_SPEECH_RECOGNITION_SERVICE_IMPL_H_
#include "base/memory/weak_ptr.h"
#include "media/mojo/mojom/speech_recognition_service.mojom.h" #include "media/mojo/mojom/speech_recognition_service.mojom.h"
#include "mojo/public/cpp/bindings/pending_receiver.h" #include "mojo/public/cpp/bindings/pending_receiver.h"
#include "mojo/public/cpp/bindings/receiver.h" #include "mojo/public/cpp/bindings/receiver.h"
#include "mojo/public/cpp/bindings/receiver_set.h" #include "mojo/public/cpp/bindings/receiver_set.h"
#include "services/network/url_loader_factory.h"
namespace speech { namespace speech {
...@@ -23,20 +25,37 @@ class SpeechRecognitionServiceImpl ...@@ -23,20 +25,37 @@ class SpeechRecognitionServiceImpl
// media::mojom::SpeechRecognitionService // media::mojom::SpeechRecognitionService
void BindContext(mojo::PendingReceiver<media::mojom::SpeechRecognitionContext> void BindContext(mojo::PendingReceiver<media::mojom::SpeechRecognitionContext>
context) override; context) override;
void SetUrlLoaderFactory(mojo::PendingRemote<network::mojom::URLLoaderFactory>
url_loader_factory) override;
void BindSpeechRecognitionServiceClient(
mojo::PendingRemote<media::mojom::SpeechRecognitionServiceClient> client)
override;
virtual mojo::PendingRemote<network::mojom::URLLoaderFactory>
GetUrlLoaderFactory();
// media::mojom::SpeechRecognitionContext // media::mojom::SpeechRecognitionContext
void BindRecognizer( void BindRecognizer(
mojo::PendingReceiver<media::mojom::SpeechRecognitionRecognizer> receiver, mojo::PendingReceiver<media::mojom::SpeechRecognitionRecognizer> receiver,
mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
client) override; client,
BindRecognizerCallback callback) override;
protected:
void DisconnectHandler();
private:
mojo::Receiver<media::mojom::SpeechRecognitionService> receiver_; mojo::Receiver<media::mojom::SpeechRecognitionService> receiver_;
// The set of receivers used to receive messages from the renderer clients. // The set of receivers used to receive messages from the renderer clients.
mojo::ReceiverSet<media::mojom::SpeechRecognitionContext> mojo::ReceiverSet<media::mojom::SpeechRecognitionContext>
speech_recognition_contexts_; speech_recognition_contexts_;
mojo::Remote<network::mojom::URLLoaderFactory> url_loader_factory_;
mojo::Remote<media::mojom::SpeechRecognitionServiceClient> client_;
base::WeakPtrFactory<SpeechRecognitionServiceImpl> weak_factory_{this};
DISALLOW_COPY_AND_ASSIGN(SpeechRecognitionServiceImpl); DISALLOW_COPY_AND_ASSIGN(SpeechRecognitionServiceImpl);
}; };
......
...@@ -4286,6 +4286,7 @@ test("unit_tests") { ...@@ -4286,6 +4286,7 @@ test("unit_tests") {
"//chrome/browser/resource_coordinator/tab_ranker:tab_features_test_helper", "//chrome/browser/resource_coordinator/tab_ranker:tab_features_test_helper",
"//chrome/services/sharing:unit_tests", "//chrome/services/sharing:unit_tests",
"//chrome/services/sharing/nearby_decoder:unit_tests", "//chrome/services/sharing/nearby_decoder:unit_tests",
"//chrome/services/speech:unit_tests",
"//components/chrome_cleaner/test:test_name_helper", "//components/chrome_cleaner/test:test_name_helper",
"//components/feature_engagement/test:test_support", "//components/feature_engagement/test:test_support",
"//components/safety_check:test_support", "//components/safety_check:test_support",
......
...@@ -62,6 +62,7 @@ mojom("mojom") { ...@@ -62,6 +62,7 @@ mojom("mojom") {
"//gpu/ipc/common:interfaces", "//gpu/ipc/common:interfaces",
"//media/learning/mojo/public/mojom", "//media/learning/mojo/public/mojom",
"//mojo/public/mojom/base", "//mojo/public/mojom/base",
"//services/network/public/mojom",
"//services/service_manager/public/mojom", "//services/service_manager/public/mojom",
"//ui/gfx/geometry/mojom", "//ui/gfx/geometry/mojom",
"//ui/gfx/mojom", "//ui/gfx/mojom",
......
...@@ -5,15 +5,18 @@ ...@@ -5,15 +5,18 @@
module media.mojom; module media.mojom;
import "media/mojo/mojom/media_types.mojom"; import "media/mojo/mojom/media_types.mojom";
import "services/network/public/mojom/url_loader_factory.mojom";
// The main interface a client uses to interact with a speech recognition // The main interface a client uses to interact with a speech recognition
// service process. Every renderer can own one or more // service process. Every renderer can own one or more
// Remote<SpeechRecognitionContext>, with the receiver bound through the // Remote<SpeechRecognitionContext>, with the receiver bound through the
// BrowserInterfaceBroker. // BrowserInterfaceBroker. Returns a flag indicating whether multichannel
// audio is supported by the speech recognition service.
interface SpeechRecognitionContext { interface SpeechRecognitionContext {
// Bind the recognizers to the speech recognition service. // Bind the recognizers to the speech recognition service.
BindRecognizer(pending_receiver<SpeechRecognitionRecognizer> receiver, BindRecognizer(pending_receiver<SpeechRecognitionRecognizer> receiver,
pending_remote<SpeechRecognitionRecognizerClient> client); pending_remote<SpeechRecognitionRecognizerClient> client)
=> (bool is_multichannel_supported);
}; };
// The main interface to a speech recognition service process. // The main interface to a speech recognition service process.
...@@ -22,6 +25,23 @@ interface SpeechRecognitionContext { ...@@ -22,6 +25,23 @@ interface SpeechRecognitionContext {
interface SpeechRecognitionService { interface SpeechRecognitionService {
// Bind the context to a new instance of the speech recognition. // Bind the context to a new instance of the speech recognition.
BindContext(pending_receiver<SpeechRecognitionContext> context); BindContext(pending_receiver<SpeechRecognitionContext> context);
// Sets the URL loader factory used to create network requests.
SetUrlLoaderFactory(
pending_remote<network.mojom.URLLoaderFactory> url_loader_factory);
// Binds the speech recognition service client used by the speech
// recognition service to send messages back to the client.
BindSpeechRecognitionServiceClient(
pending_remote<SpeechRecognitionServiceClient> client);
};
// The interface used to send messages from the speech recognition service
// back to the consumer of the service.
interface SpeechRecognitionServiceClient {
// Executed when the network service crashes, prompting the client to
// reset the URL loader factory.
OnNetworkServiceDisconnect();
}; };
// The interface used to pass raw audio from the renderer to the speech // The interface used to pass raw audio from the renderer to the speech
......
...@@ -5,7 +5,10 @@ ...@@ -5,7 +5,10 @@
static_library("google_trust_services") { static_library("google_trust_services") {
# This is currently only used by remoting. Changes to visibility require a # This is currently only used by remoting. Changes to visibility require a
# security review. # security review.
visibility = [ "//remoting/*" ] visibility = [
"//chrome/services/speech/*",
"//remoting/*",
]
sources = [ sources = [
"src/roots.cc", "src/roots.cc",
"src/roots.h", "src/roots.h",
......
...@@ -52,6 +52,7 @@ Refer to README.md for content description and update process. ...@@ -52,6 +52,7 @@ Refer to README.md for content description and update process.
<item id="cloud_print_privet_register" hash_code="24978481" type="0" content_hash_code="131359002" os_list="linux,windows" file_path="chrome/browser/printing/cloud_print/gcd_api_flow_impl.cc"/> <item id="cloud_print_privet_register" hash_code="24978481" type="0" content_hash_code="131359002" os_list="linux,windows" file_path="chrome/browser/printing/cloud_print/gcd_api_flow_impl.cc"/>
<item id="cloud_print_proxy" hash_code="50859288" type="1" second_id="111712433" content_hash_code="90868083" os_list="linux,windows" semantics_fields="2,3,4" file_path="chrome/service/cloud_print/cloud_print_proxy.cc"/> <item id="cloud_print_proxy" hash_code="50859288" type="1" second_id="111712433" content_hash_code="90868083" os_list="linux,windows" semantics_fields="2,3,4" file_path="chrome/service/cloud_print/cloud_print_proxy.cc"/>
<item id="cloud_print_search" hash_code="132055347" type="0" content_hash_code="123783474" os_list="linux,windows" file_path="chrome/browser/printing/cloud_print/gcd_api_flow_impl.cc"/> <item id="cloud_print_search" hash_code="132055347" type="0" content_hash_code="123783474" os_list="linux,windows" file_path="chrome/browser/printing/cloud_print/gcd_api_flow_impl.cc"/>
<item id="cloud_speech_recognition" hash_code="15750036" type="0" deprecated="2020-05-27" content_hash_code="114921835" file_path="chrome/services/speech/cloud_speech_recognition_client.cc"/>
<item id="content_hash_verification_job" hash_code="64733114" type="0" content_hash_code="127912411" os_list="linux,windows" file_path="extensions/browser/content_hash_fetcher.cc"/> <item id="content_hash_verification_job" hash_code="64733114" type="0" content_hash_code="127912411" os_list="linux,windows" file_path="extensions/browser/content_hash_fetcher.cc"/>
<item id="content_resource_fetcher" hash_code="70796791" type="0" deprecated="2017-09-16" content_hash_code="135648626" file_path=""/> <item id="content_resource_fetcher" hash_code="70796791" type="0" deprecated="2017-09-16" content_hash_code="135648626" file_path=""/>
<item id="content_suggestion_get_favicon" hash_code="16653985" type="0" content_hash_code="134280933" os_list="linux,windows" file_path="components/ntp_snippets/content_suggestions_service.cc"/> <item id="content_suggestion_get_favicon" hash_code="16653985" type="0" content_hash_code="134280933" os_list="linux,windows" file_path="components/ntp_snippets/content_suggestions_service.cc"/>
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment