Commit 3c53d30e authored by Abigail Klein, committed by Commit Bot

[Live Caption] Pass a message to the browser when the speech recognition service is ready.

As soon as a user presses play on a video or audio, the caption bubble
should appear with the text "Detecting speech...". To do this, this CL
introduces an OnSpeechRecognitionReady function to the CaptionHost mojo
interface, which is called the first time the
ChromeSpeechRecognitionClient receives audio to be sent to the speech
service. The ChromeSpeechRecognitionClient is constructed when the
media is first constructed, which is why this is called when audio is
received rather than when the recognizer is bound (which is when the
on_ready_callback is called back to the renderer).

The flow of information from the speech recognition service to the UI
is as follows:

ChromeSpeechRecognitionClient::SendAudioToSpeechRecognitionService()
-----chrome::mojom::CaptionHost OnSpeechRecognitionReady-----
CaptionHostImpl::OnSpeechRecognitionReady()
CaptionController::OnSpeechRecognitionReady(
      content::WebContents* web_contents)
CaptionBubbleControllerViews::OnSpeechRecognitionReady()
CaptionBubbleModel::OnReady()

Bug: 1055150
Change-Id: Ib5eff4034b236b1ee1c6747628b11f9f21506460
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2321406
Reviewed-by: Avi Drissman <avi@chromium.org>
Reviewed-by: Alex Gough <ajgo@chromium.org>
Reviewed-by: Katie Dektar <katie@chromium.org>
Commit-Queue: Abigail Klein <abigailbklein@google.com>
Cr-Commit-Position: refs/heads/master@{#793684}
parent df80f840
...@@ -161,6 +161,15 @@ void CaptionController::OnBrowserRemoved(Browser* browser) { ...@@ -161,6 +161,15 @@ void CaptionController::OnBrowserRemoved(Browser* browser) {
caption_bubble_controllers_.erase(browser); caption_bubble_controllers_.erase(browser);
} }
// Forwards the "speech recognition ready" notification for |web_contents| to
// the caption bubble controller registered for the browser that hosts it.
// Returns false when no browser is found for |web_contents| or when that
// browser has no registered caption bubble controller; otherwise returns the
// result reported by that controller.
bool CaptionController::OnSpeechRecognitionReady(
    content::WebContents* web_contents) {
  Browser* const owning_browser =
      chrome::FindBrowserWithWebContents(web_contents);
  if (!owning_browser)
    return false;

  // A single lookup both tests for membership and yields the controller.
  const auto entry = caption_bubble_controllers_.find(owning_browser);
  if (entry == caption_bubble_controllers_.end())
    return false;

  return entry->second->OnSpeechRecognitionReady(web_contents);
}
bool CaptionController::DispatchTranscription( bool CaptionController::DispatchTranscription(
content::WebContents* web_contents, content::WebContents* web_contents,
const chrome::mojom::TranscriptionResultPtr& transcription_result) { const chrome::mojom::TranscriptionResultPtr& transcription_result) {
......
...@@ -64,6 +64,12 @@ class CaptionController : public BrowserListObserver, public KeyedService { ...@@ -64,6 +64,12 @@ class CaptionController : public BrowserListObserver, public KeyedService {
void Init(); void Init();
// Alerts the CaptionBubbleController that belongs to the appropriate browser
// that speech recognition is ready to start for the given web contents.
// Returns whether this message was routed successfully. Transcriptions will
// not proceed if this returns false.
bool OnSpeechRecognitionReady(content::WebContents* web_contents);
// Routes a transcription to the CaptionBubbleController that belongs to the // Routes a transcription to the CaptionBubbleController that belongs to the
// appropriate browser. Returns whether the transcription result was routed // appropriate browser. Returns whether the transcription result was routed
// successfully. Transcriptions will halt if this returns false. // successfully. Transcriptions will halt if this returns false.
......
...@@ -39,6 +39,29 @@ CaptionHostImpl::CaptionHostImpl(content::RenderFrameHost* frame_host) ...@@ -39,6 +39,29 @@ CaptionHostImpl::CaptionHostImpl(content::RenderFrameHost* frame_host)
CaptionHostImpl::~CaptionHostImpl() = default; CaptionHostImpl::~CaptionHostImpl() = default;
// chrome::mojom::CaptionHost:
// Resolves the WebContents and Profile that own |frame_host_| and forwards the
// "speech recognition ready" notification to that profile's caption
// controller. |reply| is always run exactly once: with the controller's result
// on success, or with false when the frame host, web contents, profile, or
// caption controller is unavailable.
void CaptionHostImpl::OnSpeechRecognitionReady(
    OnSpeechRecognitionReadyCallback reply) {
  if (!frame_host_) {
    std::move(reply).Run(false);
    return;
  }

  content::WebContents* web_contents =
      content::WebContents::FromRenderFrameHost(frame_host_);
  if (!web_contents) {
    // Clear the frame host so subsequent calls take the early-out above.
    frame_host_ = nullptr;
    std::move(reply).Run(false);
    return;
  }

  Profile* profile =
      Profile::FromBrowserContext(web_contents->GetBrowserContext());
  if (!profile) {
    std::move(reply).Run(false);
    return;
  }

  // The factory may not have a controller for every profile; treat a missing
  // controller as a routing failure instead of dereferencing null.
  auto* caption_controller = CaptionControllerFactory::GetForProfile(profile);
  if (!caption_controller) {
    std::move(reply).Run(false);
    return;
  }

  std::move(reply).Run(
      caption_controller->OnSpeechRecognitionReady(web_contents));
}
void CaptionHostImpl::OnTranscription( void CaptionHostImpl::OnTranscription(
chrome::mojom::TranscriptionResultPtr transcription_result, chrome::mojom::TranscriptionResultPtr transcription_result,
OnTranscriptionCallback reply) { OnTranscriptionCallback reply) {
......
...@@ -37,6 +37,8 @@ class CaptionHostImpl : public chrome::mojom::CaptionHost, ...@@ -37,6 +37,8 @@ class CaptionHostImpl : public chrome::mojom::CaptionHost,
mojo::PendingReceiver<chrome::mojom::CaptionHost> receiver); mojo::PendingReceiver<chrome::mojom::CaptionHost> receiver);
// chrome::mojom::CaptionHost: // chrome::mojom::CaptionHost:
void OnSpeechRecognitionReady(
OnSpeechRecognitionReadyCallback reply) override;
void OnTranscription( void OnTranscription(
chrome::mojom::TranscriptionResultPtr transcription_result, chrome::mojom::TranscriptionResultPtr transcription_result,
OnTranscriptionCallback reply) override; OnTranscriptionCallback reply) override;
......
...@@ -37,6 +37,8 @@ class CaptionBubbleController { ...@@ -37,6 +37,8 @@ class CaptionBubbleController {
static std::unique_ptr<CaptionBubbleController> Create(Browser* browser); static std::unique_ptr<CaptionBubbleController> Create(Browser* browser);
// Called when speech recognition is ready to start for |web_contents|.
// Returns whether the notification was handled successfully. Transcriptions
// will not proceed if this returns false.
virtual bool OnSpeechRecognitionReady(content::WebContents* web_contents) = 0;
// Called when a transcription is received from the service. Returns whether // Called when a transcription is received from the service. Returns whether
// the transcription result was set on the caption bubble successfully. // the transcription result was set on the caption bubble successfully.
// Transcriptions will halt if this returns false. // Transcriptions will halt if this returns false.
......
...@@ -539,6 +539,16 @@ void CaptionBubble::OnErrorChanged() { ...@@ -539,6 +539,16 @@ void CaptionBubble::OnErrorChanged() {
Redraw(); Redraw();
} }
// Reacts to a change in the model's ready state by seeding the label with a
// placeholder and refreshing the bubble / wait-text visibility.
void CaptionBubble::OnReadyChanged() {
  // Workaround for a RenderText bug: if the label is empty when it is first
  // displayed, later line-count calculations
  // (CaptionBubble::GetNumLinesInLabel) come out wrong. A single space
  // character is used as the non-empty placeholder.
  // TODO(1055150): Fix the bug in RenderText and then remove this workaround.
  const auto placeholder_text = base::ASCIIToUTF16("\u0020");
  label_->SetText(placeholder_text);

  UpdateBubbleAndWaitTextVisibility();
}
void CaptionBubble::OnIsExpandedChanged() { void CaptionBubble::OnIsExpandedChanged() {
expand_button_->SetVisible(!is_expanded_); expand_button_->SetVisible(!is_expanded_);
collapse_button_->SetVisible(is_expanded_); collapse_button_->SetVisible(is_expanded_);
...@@ -566,14 +576,14 @@ void CaptionBubble::UpdateBubbleVisibility() { ...@@ -566,14 +576,14 @@ void CaptionBubble::UpdateBubbleVisibility() {
// Hide the widget if there is no room for it or the model is closed. // Hide the widget if there is no room for it or the model is closed.
if (GetWidget()->IsVisible()) if (GetWidget()->IsVisible())
GetWidget()->Hide(); GetWidget()->Hide();
} else if (label_->GetText().size() > 0 || model_->HasError()) { } else if (model_->IsReady() || model_->HasError()) {
// Show the widget if it has text or an error to display. Only show the // Show the widget if it is ready to receive transcriptions or it has an
// widget if it isn't already visible. Always calling Widget::Show() will // error to display. Only show the widget if it isn't already visible.
// mean the widget gets focus each time. // Always calling Widget::Show() will mean the widget gets focus each time.
if (!GetWidget()->IsVisible()) if (!GetWidget()->IsVisible())
GetWidget()->Show(); GetWidget()->Show();
} else if (GetWidget()->IsVisible()) { } else if (GetWidget()->IsVisible()) {
// No text and no error. Hide it. // Not ready and no error. Hide it.
GetWidget()->Hide(); GetWidget()->Hide();
} }
} }
......
...@@ -103,6 +103,11 @@ class CaptionBubble : public views::BubbleDialogDelegateView, ...@@ -103,6 +103,11 @@ class CaptionBubble : public views::BubbleDialogDelegateView,
// the model has an error, otherwise displays the latest text. // the model has an error, otherwise displays the latest text.
void OnErrorChanged(); void OnErrorChanged();
// Called by the CaptionBubbleModel to notify this object that the model's
// on ready state has changed. Makes the caption bubble become visible and
// show the wait text.
void OnReadyChanged();
// Called when the caption bubble expanded state has changed. Changes the // Called when the caption bubble expanded state has changed. Changes the
// number of lines displayed. // number of lines displayed.
void OnIsExpandedChanged(); void OnIsExpandedChanged();
......
...@@ -69,6 +69,18 @@ void CaptionBubbleControllerViews::OnCaptionBubbleDestroyed() { ...@@ -69,6 +69,18 @@ void CaptionBubbleControllerViews::OnCaptionBubbleDestroyed() {
browser_ = nullptr; browser_ = nullptr;
} }
// Marks the caption bubble model associated with |web_contents| as ready.
// Returns false, without touching any model, when there is no caption bubble,
// when no model is registered for |web_contents|, or when that model has been
// closed. Returns true after the model has been notified.
bool CaptionBubbleControllerViews::OnSpeechRecognitionReady(
    content::WebContents* web_contents) {
  if (!caption_bubble_)
    return false;

  const auto model_entry = caption_bubble_models_.find(web_contents);
  if (model_entry == caption_bubble_models_.end())
    return false;

  CaptionBubbleModel* const model = model_entry->second.get();
  if (model->IsClosed())
    return false;

  model->OnReady();
  return true;
}
bool CaptionBubbleControllerViews::OnTranscription( bool CaptionBubbleControllerViews::OnTranscription(
const chrome::mojom::TranscriptionResultPtr& transcription_result, const chrome::mojom::TranscriptionResultPtr& transcription_result,
content::WebContents* web_contents) { content::WebContents* web_contents) {
......
...@@ -38,6 +38,8 @@ class CaptionBubbleControllerViews : public CaptionBubbleController, ...@@ -38,6 +38,8 @@ class CaptionBubbleControllerViews : public CaptionBubbleController,
CaptionBubbleControllerViews& operator=(const CaptionBubbleControllerViews&) = CaptionBubbleControllerViews& operator=(const CaptionBubbleControllerViews&) =
delete; delete;
bool OnSpeechRecognitionReady(content::WebContents* web_contents) override;
// Called when a transcription is received from the service. Returns whether // Called when a transcription is received from the service. Returns whether
// the transcription result was set on the caption bubble successfully. // the transcription result was set on the caption bubble successfully.
// Transcriptions will halt if this returns false. // Transcriptions will halt if this returns false.
......
...@@ -29,6 +29,7 @@ void CaptionBubbleModel::SetObserver(CaptionBubble* observer) { ...@@ -29,6 +29,7 @@ void CaptionBubbleModel::SetObserver(CaptionBubble* observer) {
return; return;
observer_ = observer; observer_ = observer;
if (observer_) { if (observer_) {
observer_->OnReadyChanged();
observer_->OnTextChanged(); observer_->OnTextChanged();
observer_->OnErrorChanged(); observer_->OnErrorChanged();
} }
...@@ -52,9 +53,23 @@ void CaptionBubbleModel::Close() { ...@@ -52,9 +53,23 @@ void CaptionBubbleModel::Close() {
final_text_.clear(); final_text_.clear();
partial_text_.clear(); partial_text_.clear();
is_closed_ = true; is_closed_ = true;
is_ready_ = false;
OnTextChanged(); OnTextChanged();
} }
// Resets any accumulated caption text, flags the model as ready, and tells the
// observer (if one is attached) that the ready state changed.
void CaptionBubbleModel::OnReady() {
  is_ready_ = true;
  partial_text_.clear();
  final_text_.clear();

  if (!observer_)
    return;

  // The observer is notified through the dedicated OnReadyChanged() hook
  // rather than OnTextChanged() because the label text must not be empty when
  // it is displayed; CaptionBubble::OnReadyChanged() handles that case.
  // TODO(1055150): Fix the bug in RenderText and then change this to
  // OnTextChanged().
  observer_->OnReadyChanged();
}
void CaptionBubbleModel::SetHasError(bool has_error) { void CaptionBubbleModel::SetHasError(bool has_error) {
has_error_ = has_error; has_error_ = has_error;
if (observer_) if (observer_)
...@@ -70,6 +85,7 @@ void CaptionBubbleModel::DidFinishNavigation( ...@@ -70,6 +85,7 @@ void CaptionBubbleModel::DidFinishNavigation(
final_text_.clear(); final_text_.clear();
partial_text_.clear(); partial_text_.clear();
is_closed_ = false; is_closed_ = false;
is_ready_ = false;
has_error_ = false; has_error_ = false;
OnTextChanged(); OnTextChanged();
} }
......
...@@ -57,7 +57,10 @@ class CaptionBubbleModel : public content::WebContentsObserver { ...@@ -57,7 +57,10 @@ class CaptionBubbleModel : public content::WebContentsObserver {
// observer. // observer.
void Close(); void Close();
// Marks the model as ready to receive transcriptions: clears the final and
// partial text, sets the ready flag, and notifies the observer (if any) via
// OnReadyChanged().
void OnReady();
bool IsClosed() const { return is_closed_; } bool IsClosed() const { return is_closed_; }
bool IsReady() const { return is_ready_; }
bool HasError() const { return has_error_; } bool HasError() const { return has_error_; }
std::string GetFullText() const { return final_text_ + partial_text_; } std::string GetFullText() const { return final_text_ + partial_text_; }
...@@ -76,6 +79,9 @@ class CaptionBubbleModel : public content::WebContentsObserver { ...@@ -76,6 +79,9 @@ class CaptionBubbleModel : public content::WebContentsObserver {
// Whether the bubble has been closed by the user. // Whether the bubble has been closed by the user.
bool is_closed_ = false; bool is_closed_ = false;
// Whether the bubble is ready to receive transcriptions. Set by OnReady() and
// reset when the model is closed or a navigation finishes.
bool is_ready_ = false;
// Whether an error should be displayed on the bubble. // Whether an error should be displayed on the bubble.
bool has_error_ = false; bool has_error_ = false;
......
...@@ -397,9 +397,11 @@ IN_PROC_BROWSER_TEST_F(BrowserViewTest, F6CyclesThroughCaptionBubbleToo) { ...@@ -397,9 +397,11 @@ IN_PROC_BROWSER_TEST_F(BrowserViewTest, F6CyclesThroughCaptionBubbleToo) {
caption_controller->GetCaptionBubbleControllerForBrowser(browser())); caption_controller->GetCaptionBubbleControllerForBrowser(browser()));
EXPECT_FALSE(bubble_controller->GetFocusableCaptionBubble()); EXPECT_FALSE(bubble_controller->GetFocusableCaptionBubble());
content::WebContents* contents =
browser()->tab_strip_model()->GetActiveWebContents();
caption_controller->OnSpeechRecognitionReady(contents);
caption_controller->DispatchTranscription( caption_controller->DispatchTranscription(
browser()->tab_strip_model()->GetActiveWebContents(), contents, chrome::mojom::TranscriptionResult::New("Hello, world", false));
chrome::mojom::TranscriptionResult::New("Hello, world", false));
// Now the caption bubble exists but is not focused. // Now the caption bubble exists but is not focused.
views::View* bubble = bubble_controller->GetFocusableCaptionBubble(); views::View* bubble = bubble_controller->GetFocusableCaptionBubble();
EXPECT_TRUE(bubble); EXPECT_TRUE(bubble);
......
...@@ -7,6 +7,9 @@ module chrome.mojom; ...@@ -7,6 +7,9 @@ module chrome.mojom;
// Browser process interface exposed to the renderer for communication about // Browser process interface exposed to the renderer for communication about
// the Live Caption feature. // the Live Caption feature.
interface CaptionHost { interface CaptionHost {
// Called when the speech recognition service is ready to send transcriptions.
// Returns whether the message was routed successfully in the browser.
// Transcriptions will not proceed if this returns false.
OnSpeechRecognitionReady() => (bool success);
// Called when the speech recognition client receives a transcription from the // Called when the speech recognition client receives a transcription from the
// speech service. Returns whether the transcription result was received // speech service. Returns whether the transcription result was received
// successfully. Transcriptions will halt if this returns false. // successfully. Transcriptions will halt if this returns false.
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include "chrome/renderer/media/chrome_speech_recognition_client.h" #include "chrome/renderer/media/chrome_speech_recognition_client.h"
#include <utility> #include <utility>
#include <vector>
#include "base/metrics/field_trial_params.h" #include "base/metrics/field_trial_params.h"
#include "content/public/renderer/render_frame.h" #include "content/public/renderer/render_frame.h"
...@@ -57,7 +58,6 @@ void ChromeSpeechRecognitionClient::OnRecognizerBound( ...@@ -57,7 +58,6 @@ void ChromeSpeechRecognitionClient::OnRecognizerBound(
bool is_multichannel_supported) { bool is_multichannel_supported) {
is_multichannel_supported_ = is_multichannel_supported; is_multichannel_supported_ = is_multichannel_supported;
is_recognizer_bound_ = true; is_recognizer_bound_ = true;
if (on_ready_callback_) if (on_ready_callback_)
std::move(on_ready_callback_).Run(); std::move(on_ready_callback_).Run();
} }
...@@ -102,11 +102,11 @@ void ChromeSpeechRecognitionClient::OnSpeechRecognitionRecognitionEvent( ...@@ -102,11 +102,11 @@ void ChromeSpeechRecognitionClient::OnSpeechRecognitionRecognitionEvent(
caption_host_->OnTranscription( caption_host_->OnTranscription(
chrome::mojom::TranscriptionResult::New(result->transcription, chrome::mojom::TranscriptionResult::New(result->transcription,
result->is_final), result->is_final),
base::BindOnce(&ChromeSpeechRecognitionClient::OnTranscriptionCallback, base::BindOnce(&ChromeSpeechRecognitionClient::OnBrowserCallback,
base::Unretained(this))); base::Unretained(this)));
} }
void ChromeSpeechRecognitionClient::OnTranscriptionCallback(bool success) { void ChromeSpeechRecognitionClient::OnBrowserCallback(bool success) {
is_browser_requesting_transcription_ = success; is_browser_requesting_transcription_ = success;
} }
...@@ -145,6 +145,17 @@ void ChromeSpeechRecognitionClient::SendAudioToSpeechRecognitionService( ...@@ -145,6 +145,17 @@ void ChromeSpeechRecognitionClient::SendAudioToSpeechRecognitionService(
if (IsSpeechRecognitionAvailable()) { if (IsSpeechRecognitionAvailable()) {
speech_recognition_recognizer_->SendAudioToSpeechRecognitionService( speech_recognition_recognizer_->SendAudioToSpeechRecognitionService(
std::move(audio_data)); std::move(audio_data));
// When the speech recognition client receives speech, it alerts the
// live caption host in the browser that it is ready so that the UI can
// display a message. This happens at the time of playing the video and not
// at the time of construction of this object.
if (!on_ready_message_sent_to_caption_host_) {
caption_host_->OnSpeechRecognitionReady(
base::BindOnce(&ChromeSpeechRecognitionClient::OnBrowserCallback,
base::Unretained(this)));
on_ready_message_sent_to_caption_host_ = true;
}
} }
} }
......
...@@ -65,8 +65,8 @@ class ChromeSpeechRecognitionClient ...@@ -65,8 +65,8 @@ class ChromeSpeechRecognitionClient
media::mojom::AudioDataS16Ptr ConvertToAudioDataS16( media::mojom::AudioDataS16Ptr ConvertToAudioDataS16(
scoped_refptr<media::AudioBuffer> buffer); scoped_refptr<media::AudioBuffer> buffer);
// Called as a response to sending a transcription to the browser. // Called as a response to sending a message to the browser.
void OnTranscriptionCallback(bool success); void OnBrowserCallback(bool success);
media::mojom::AudioDataS16Ptr ConvertToAudioDataS16( media::mojom::AudioDataS16Ptr ConvertToAudioDataS16(
std::unique_ptr<media::AudioBus> audio_bus, std::unique_ptr<media::AudioBus> audio_bus,
...@@ -108,6 +108,9 @@ class ChromeSpeechRecognitionClient ...@@ -108,6 +108,9 @@ class ChromeSpeechRecognitionClient
bool is_recognizer_bound_ = false; bool is_recognizer_bound_ = false;
// Whether or not the on ready message has been sent to the caption host.
bool on_ready_message_sent_to_caption_host_ = false;
// The temporary audio bus used to mix multichannel audio into a single // The temporary audio bus used to mix multichannel audio into a single
// channel. // channel.
std::unique_ptr<media::AudioBus> monaural_audio_bus_; std::unique_ptr<media::AudioBus> monaural_audio_bus_;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment