Commit 47754cf1 authored by David Tseng, committed by Commit Bot

Native audio tts playback

Moves text-to-speech playback from JavaScript-based Web Audio to native C++.
See bug for more details.

Depends on
https://chromium-review.googlesource.com/c/chromiumos/overlays/chromiumos-overlay/+/2456208/1

Fixed: 1134289
Change-Id: I7ecfa800f6b688f6a3d0d655f557e86efa73188a
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2435919
Commit-Queue: David Tseng <dtseng@chromium.org>
Reviewed-by: Dominic Mazzoni <dmazzoni@chromium.org>
Reviewed-by: Robert Sesek <rsesek@chromium.org>
Reviewed-by: Dale Curtis <dalecurtis@chromium.org>
Cr-Commit-Position: refs/heads/master@{#814980}
parent 04ebb1f4
...@@ -148,7 +148,10 @@ void TtsEngineExtensionObserver::BindTtsStream( ...@@ -148,7 +148,10 @@ void TtsEngineExtensionObserver::BindTtsStream(
.WithDisplayName("TtsService") .WithDisplayName("TtsService")
.Pass()); .Pass());
tts_service_->BindTtsStream(std::move(receiver)); mojo::PendingRemote<audio::mojom::StreamFactory> factory_remote;
auto factory_receiver = factory_remote.InitWithNewPipeAndPassReceiver();
content::GetAudioService().BindStreamFactory(std::move(factory_receiver));
tts_service_->BindTtsStream(std::move(receiver), std::move(factory_remote));
} }
#endif // defined(OS_CHROMEOS) #endif // defined(OS_CHROMEOS)
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include "base/macros.h" #include "base/macros.h"
#include "base/scoped_observer.h" #include "base/scoped_observer.h"
#include "components/keyed_service/core/keyed_service.h" #include "components/keyed_service/core/keyed_service.h"
#include "content/public/browser/audio_service.h"
#include "extensions/browser/event_router.h" #include "extensions/browser/event_router.h"
#include "extensions/browser/extension_registry.h" #include "extensions/browser/extension_registry.h"
#include "extensions/browser/extension_registry_observer.h" #include "extensions/browser/extension_registry_observer.h"
......
...@@ -16,6 +16,7 @@ source_set("tts") { ...@@ -16,6 +16,7 @@ source_set("tts") {
":libchrometts", ":libchrometts",
"//base", "//base",
"//chromeos/services/tts/public/mojom", "//chromeos/services/tts/public/mojom",
"//services/audio/public/cpp:cpp",
] ]
} }
...@@ -54,5 +55,6 @@ generate_library_loader("libchrometts") { ...@@ -54,5 +55,6 @@ generate_library_loader("libchrometts") {
"GoogleTtsGetTimepointsCount", "GoogleTtsGetTimepointsCount",
"GoogleTtsGetTimepointsTimeInSecsAtIndex", "GoogleTtsGetTimepointsTimeInSecsAtIndex",
"GoogleTtsGetTimepointsCharIndexAtIndex", "GoogleTtsGetTimepointsCharIndexAtIndex",
"GoogleTtsGetFramesInAudioBuffer",
] ]
} }
include_rules = [ include_rules = [
"+media/base",
"+mojo/public", "+mojo/public",
"+sandbox", "+sandbox",
"+sandbox/policy", "+sandbox/policy",
"+services/audio/public/cpp",
] ]
...@@ -20,7 +20,7 @@ bool GoogleTtsInstallVoice(const char* voice_name, ...@@ -20,7 +20,7 @@ bool GoogleTtsInstallVoice(const char* voice_name,
bool GoogleTtsInitBuffered(const char* text_jspb, int text_jspb_len); bool GoogleTtsInitBuffered(const char* text_jspb, int text_jspb_len);
int GoogleTtsReadBuffered(); int GoogleTtsReadBuffered(float* audio_channel_buffer, size_t* frames_written);
void GoogleTtsFinalizeBuffered(); void GoogleTtsFinalizeBuffered();
...@@ -33,4 +33,7 @@ int GoogleTtsGetTimepointsCharIndexAtIndex(size_t index); ...@@ -33,4 +33,7 @@ int GoogleTtsGetTimepointsCharIndexAtIndex(size_t index);
char* GoogleTtsGetEventBufferPtr(); char* GoogleTtsGetEventBufferPtr();
size_t GoogleTtsGetEventBufferLen(); size_t GoogleTtsGetEventBufferLen();
size_t GoogleTtsGetFramesInAudioBuffer();
#endif // CHROMEOS_SERVICES_TTS_CHROME_TTS_H_ #endif // CHROMEOS_SERVICES_TTS_CHROME_TTS_H_
...@@ -6,4 +6,6 @@ import("//mojo/public/tools/bindings/mojom.gni") ...@@ -6,4 +6,6 @@ import("//mojo/public/tools/bindings/mojom.gni")
mojom("mojom") { mojom("mojom") {
sources = [ "tts_service.mojom" ] sources = [ "tts_service.mojom" ]
public_deps = [ "//services/audio/public/mojom" ]
} }
...@@ -4,26 +4,7 @@ ...@@ -4,26 +4,7 @@
module chromeos.tts.mojom; module chromeos.tts.mojom;
// Structure describing a point in time during speech synthesis. import "services/audio/public/mojom/stream_factory.mojom";
struct Timepoint {
// The time, in seconds.
float time_sec;
// The index in the text being spoken.
int32 char_index;
};
// A TTS event and associated metadata within a TTS stream.
struct TtsStreamItem {
// An internal serialized proto.speech.tts.TtsControllerEvent proto.
array<uint8> event_buffer_bytes;
// Whether streaming is complete.
bool done;
// A list of timepoints associated with the event above.
array<Timepoint> timepoints;
};
// The main interface to the TTS engine on Chrome OS. Only used by and private // The main interface to the TTS engine on Chrome OS. Only used by and private
// to the Chrome OS Google TTS engine component extension. TtsService lives in a // to the Chrome OS Google TTS engine component extension. TtsService lives in a
...@@ -32,12 +13,14 @@ struct TtsStreamItem { ...@@ -32,12 +13,14 @@ struct TtsStreamItem {
// and the Google TTS engine component extension through a TtsStream, but does // and the Google TTS engine component extension through a TtsStream, but does
// not participate otherwise. // not participate otherwise.
interface TtsService { interface TtsService {
// Binds a TtsStream to this service. // Binds a TtsStream to this service and returns an AudioOutputStream receiver
BindTtsStream(pending_receiver<TtsStream> receiver); // which this service uses to play audio.
BindTtsStream(pending_receiver<TtsStream> receiver,
pending_remote<audio.mojom.StreamFactory> stream_factory);
}; };
// Interface for the Google component TTS engine to control and consume a stream // Interface for the Google component TTS engine to control
// of TtsStreamItems produced by TtsService. There is only ever one TtsStream // the TtsService's production of audio. There is only ever one TtsStream
// owned by the TtsService. // owned by the TtsService.
// //
// The component extension sets up the stream's voice by doing: // The component extension sets up the stream's voice by doing:
...@@ -45,25 +28,18 @@ interface TtsService { ...@@ -45,25 +28,18 @@ interface TtsService {
// InstallVoice(other_data, "other_voice") // InstallVoice(other_data, "other_voice")
// SelectVoice("other_voice") // SelectVoice("other_voice")
// //
// After reading from the stream (see below), the component extension can do: // After speaking using the stream (see below), the component extension can do:
// SelectVoice("voice") // SelectVoice("voice")
// to change voices. // to change voices.
// //
// The component extension calls the following three methods repeatedly, in // The component extension calls the following two methods repeatedly and
// order to read from the stream given text. For example, // optionally observes events.
// //
// Init(<a proto containing text "Hello there.">) // Speak(<a proto containing text "Hello there.">)
// Read() // Speak(<proto containing text "Testing 1, 2, 3.")
// Read() // Stop()
// ...
// Finalize()
// Init(<proto containing text "Testing 1, 2, 3.")
// Read()
// Read()
// ...
// Finalize()
// //
// Note that the component extension may call Finalize() early, if the TTS api // Note that the component extension may call Stop() early, if the TTS api
// wants to, for example, stop speech. // wants to, for example, stop speech.
interface TtsStream { interface TtsStream {
// Forward and install the |voice_name| encoded by |voice_bytes|. // Forward and install the |voice_name| encoded by |voice_bytes|.
...@@ -73,13 +49,30 @@ interface TtsStream { ...@@ -73,13 +49,30 @@ interface TtsStream {
// Selects a voice for streaming given a |voice_name|. // Selects a voice for streaming given a |voice_name|.
SelectVoice(string voice_name) => (bool success); SelectVoice(string voice_name) => (bool success);
// Initialize a new TTS stream given a serialized proto.speech.tts.Text proto. // Speak text described by a serialized proto.speech.tts.Text proto.
Init(array<uint8> text_jspb) Speak(array<uint8> text_jspb)
=> (bool success); => (pending_receiver<TtsEventObserver> event_observer);
// Stop speaking the currently speaking text, if any.
Stop();
// Sets the volume of the tts playback (0.0 to 1.0).
SetVolume(float volume);
};
// Returned to callers of TtsStream.Speak(). It receives notable events
// pertaining to the text spoken.
interface TtsEventObserver {
// TtsStream.Speak started speech playback.
OnStart();
// TtsStream.Speak is playing text at |char_index| approximately at the
// current time.
OnTimepoint(int32 char_index);
// Read the next stream item. // TtsStream.Speak ended speech playback.
Read() => (TtsStreamItem item); OnEnd();
// Clean up and finish the current TTS stream. // TtsStream.Speak encountered an error.
Finalize(); OnError();
}; };
...@@ -5,9 +5,13 @@ ...@@ -5,9 +5,13 @@
#include "chromeos/services/tts/tts_service.h" #include "chromeos/services/tts/tts_service.h"
#include <dlfcn.h> #include <dlfcn.h>
#include <sys/resource.h>
#include "base/files/file_util.h" #include "base/files/file_util.h"
#include "chromeos/services/tts/constants.h" #include "chromeos/services/tts/constants.h"
#include "media/base/audio_parameters.h"
#include "media/base/audio_sample_types.h"
#include "services/audio/public/cpp/output_device.h"
namespace chromeos { namespace chromeos {
namespace tts { namespace tts {
...@@ -37,24 +41,46 @@ void HandleLibraryLogging(int severity, const char* message) { ...@@ -37,24 +41,46 @@ void HandleLibraryLogging(int severity, const char* message) {
// methods utilize C features only. // methods utilize C features only.
TtsService::TtsService(mojo::PendingReceiver<mojom::TtsService> receiver) TtsService::TtsService(mojo::PendingReceiver<mojom::TtsService> receiver)
: service_receiver_(this, std::move(receiver)), stream_receiver_(this) { : service_receiver_(this, std::move(receiver)),
stream_receiver_(this),
got_first_buffer_(false) {
if (setpriority(PRIO_PROCESS, 0, -10 /* real time audio */) != 0) {
PLOG(ERROR) << "Unable to request real time priority; performance will be "
"impacted.";
}
bool loaded = libchrometts_.Load(kLibchromettsPath); bool loaded = libchrometts_.Load(kLibchromettsPath);
if (!loaded) if (!loaded) {
LOG(ERROR) << "Unable to load libchrometts.so: " << dlerror(); LOG(ERROR) << "Unable to load libchrometts.so.";
else exit(0);
} else {
libchrometts_.GoogleTtsSetLogger(HandleLibraryLogging); libchrometts_.GoogleTtsSetLogger(HandleLibraryLogging);
}
} }
TtsService::~TtsService() = default; TtsService::~TtsService() = default;
void TtsService::BindTtsStream( void TtsService::BindTtsStream(
mojo::PendingReceiver<mojom::TtsStream> receiver) { mojo::PendingReceiver<mojom::TtsStream> receiver,
mojo::PendingRemote<audio::mojom::StreamFactory> factory) {
base::AutoLock al(state_lock_);
stream_receiver_.Bind(std::move(receiver)); stream_receiver_.Bind(std::move(receiver));
// TODO(accessibility): The sample rate below can change based on the audio
// data retrieved. Plumb this data through and re-create the output device if
// it changes.
media::AudioParameters params(
media::AudioParameters::AUDIO_PCM_LOW_LATENCY, media::CHANNEL_LAYOUT_MONO,
22050 /* sample rate */, libchrometts_.GoogleTtsGetFramesInAudioBuffer());
output_device_ = std::make_unique<audio::OutputDevice>(
std::move(factory), params, this, std::string());
} }
void TtsService::InstallVoice(const std::string& voice_name, void TtsService::InstallVoice(const std::string& voice_name,
const std::vector<uint8_t>& voice_bytes, const std::vector<uint8_t>& voice_bytes,
InstallVoiceCallback callback) { InstallVoiceCallback callback) {
base::AutoLock al(state_lock_);
// Create a directory to place extracted voice data. // Create a directory to place extracted voice data.
base::FilePath voice_data_path(kTempDataDirectory); base::FilePath voice_data_path(kTempDataDirectory);
voice_data_path = voice_data_path.Append(voice_name); voice_data_path = voice_data_path.Append(voice_name);
...@@ -75,6 +101,8 @@ void TtsService::InstallVoice(const std::string& voice_name, ...@@ -75,6 +101,8 @@ void TtsService::InstallVoice(const std::string& voice_name,
void TtsService::SelectVoice(const std::string& voice_name, void TtsService::SelectVoice(const std::string& voice_name,
SelectVoiceCallback callback) { SelectVoiceCallback callback) {
base::AutoLock al(state_lock_);
base::FilePath path_prefix = base::FilePath path_prefix =
base::FilePath(kTempDataDirectory).Append(voice_name); base::FilePath(kTempDataDirectory).Append(voice_name);
base::FilePath pipeline_path = path_prefix.Append("pipeline"); base::FilePath pipeline_path = path_prefix.Append("pipeline");
...@@ -82,39 +110,93 @@ void TtsService::SelectVoice(const std::string& voice_name, ...@@ -82,39 +110,93 @@ void TtsService::SelectVoice(const std::string& voice_name,
pipeline_path.value().c_str(), path_prefix.value().c_str())); pipeline_path.value().c_str(), path_prefix.value().c_str()));
} }
void TtsService::Init(const std::vector<uint8_t>& text_jspb, void TtsService::Speak(const std::vector<uint8_t>& text_jspb,
InitCallback callback) { SpeakCallback callback) {
std::move(callback).Run(libchrometts_.GoogleTtsInitBuffered( base::AutoLock al(state_lock_);
(char*)&text_jspb[0], text_jspb.size()));
}
void TtsService::Read(ReadCallback callback) { tts_event_observer_.reset();
int32_t status = libchrometts_.GoogleTtsReadBuffered(); auto pending_receiver = tts_event_observer_.BindNewPipeAndPassReceiver();
if (status == -1) { std::move(callback).Run(std::move(pending_receiver));
std::move(callback).Run(mojom::TtsStreamItem::New(
std::vector<uint8_t>(), true, std::vector<mojom::TimepointPtr>())); bool status = libchrometts_.GoogleTtsInitBuffered((char*)&text_jspb[0],
text_jspb.size());
if (!status) {
tts_event_observer_->OnError();
return; return;
} }
char* event = libchrometts_.GoogleTtsGetEventBufferPtr(); output_device_->Play();
std::vector<uint8_t> send_event(libchrometts_.GoogleTtsGetEventBufferLen()); }
for (size_t i = 0; i < send_event.size(); i++)
send_event[i] = event[i]; void TtsService::Stop() {
base::AutoLock al(state_lock_);
std::vector<mojom::TimepointPtr> timepoints( StopLocked();
libchrometts_.GoogleTtsGetTimepointsCount()); }
for (size_t i = 0; i < timepoints.size(); i++) {
timepoints[i] = mojom::Timepoint::New( void TtsService::SetVolume(float volume) {
libchrometts_.GoogleTtsGetTimepointsTimeInSecsAtIndex(i), base::AutoLock al(state_lock_);
libchrometts_.GoogleTtsGetTimepointsCharIndexAtIndex(i)); output_device_->SetVolume(volume);
}
int TtsService::Render(base::TimeDelta delay,
base::TimeTicks delay_timestamp,
int prior_frames_skipped,
media::AudioBus* dest) {
// Careful to not block the render callback. Only try to acquire the lock
// here, but early return if we are processing a series of other calls. This
// can be extremely important if there's a long queue of pending Speak/Stop
// pairs being processed on the main thread. This can occur if the tts api
// receives lots of tts requests.
if (!state_lock_.Try()) {
return 0;
}
size_t frames = 0;
int32_t status =
libchrometts_.GoogleTtsReadBuffered(dest->channel(0), &frames);
if (status <= 0) {
// -1 means an error, 0 means done.
if (status == -1)
tts_event_observer_->OnError();
dest->Zero();
StopLocked();
state_lock_.Release();
return 0;
}
if (frames == 0) {
state_lock_.Release();
return 0;
}
if (!got_first_buffer_) {
got_first_buffer_ = true;
tts_event_observer_->OnStart();
} }
std::move(callback).Run(mojom::TtsStreamItem::New(send_event, status == 0, // There's only really ever one timepoint since we play this buffer in one
std::move(timepoints))); // chunk.
int char_index = -1;
if (libchrometts_.GoogleTtsGetTimepointsCount() > 0)
char_index = libchrometts_.GoogleTtsGetTimepointsCharIndexAtIndex(0);
if (char_index != -1)
tts_event_observer_->OnTimepoint(char_index);
state_lock_.Release();
return frames;
} }
void TtsService::Finalize() { void TtsService::OnRenderError() {}
void TtsService::StopLocked() {
output_device_->Pause();
libchrometts_.GoogleTtsFinalizeBuffered(); libchrometts_.GoogleTtsFinalizeBuffered();
if (tts_event_observer_ && got_first_buffer_)
tts_event_observer_->OnEnd();
got_first_buffer_ = false;
} }
} // namespace tts } // namespace tts
......
...@@ -5,36 +5,77 @@ ...@@ -5,36 +5,77 @@
#ifndef CHROMEOS_SERVICES_TTS_TTS_SERVICE_H_ #ifndef CHROMEOS_SERVICES_TTS_TTS_SERVICE_H_
#define CHROMEOS_SERVICES_TTS_TTS_SERVICE_H_ #define CHROMEOS_SERVICES_TTS_TTS_SERVICE_H_
#include "base/synchronization/lock.h"
#include "base/thread_annotations.h"
#include "chromeos/services/tts/public/mojom/tts_service.mojom.h" #include "chromeos/services/tts/public/mojom/tts_service.mojom.h"
#include "library_loaders/libchrometts.h" #include "library_loaders/libchrometts.h"
#include "media/base/audio_renderer_sink.h"
#include "mojo/public/cpp/bindings/receiver.h" #include "mojo/public/cpp/bindings/receiver.h"
#include "mojo/public/cpp/bindings/remote.h"
namespace audio {
class OutputDevice;
}
namespace chromeos { namespace chromeos {
namespace tts { namespace tts {
class TtsService : public mojom::TtsService, public mojom::TtsStream { class TtsService : public mojom::TtsService,
public mojom::TtsStream,
public media::AudioRendererSink::RenderCallback {
public: public:
explicit TtsService(mojo::PendingReceiver<mojom::TtsService> receiver); explicit TtsService(mojo::PendingReceiver<mojom::TtsService> receiver);
~TtsService() override; ~TtsService() override;
private: private:
// TtsService: // mojom::TtsService:
void BindTtsStream(mojo::PendingReceiver<mojom::TtsStream> receiver) override; void BindTtsStream(
mojo::PendingReceiver<mojom::TtsStream> receiver,
mojo::PendingRemote<audio::mojom::StreamFactory> factory) override;
// TtsStream: // mojom::TtsStream:
void InstallVoice(const std::string& voice_name, void InstallVoice(const std::string& voice_name,
const std::vector<uint8_t>& voice_bytes, const std::vector<uint8_t>& voice_bytes,
InstallVoiceCallback callback) override; InstallVoiceCallback callback) override;
void SelectVoice(const std::string& voice_name, void SelectVoice(const std::string& voice_name,
SelectVoiceCallback callback) override; SelectVoiceCallback callback) override;
void Init(const std::vector<uint8_t>& text_jspb, void Speak(const std::vector<uint8_t>& text_jspb,
InitCallback callback) override; SpeakCallback callback) override;
void Read(ReadCallback callback) override; void Stop() override;
void Finalize() override; void SetVolume(float volume) override;
// media::AudioRendererSink::RenderCallback:
int Render(base::TimeDelta delay,
base::TimeTicks delay_timestamp,
int prior_frames_skipped,
media::AudioBus* dest) override;
void OnRenderError() override;
// Handles stopping tts.
void StopLocked() EXCLUSIVE_LOCKS_REQUIRED(state_lock_);
LibChromeTtsLoader libchrometts_; // Connection to tts in the browser.
mojo::Receiver<mojom::TtsService> service_receiver_; mojo::Receiver<mojom::TtsService> service_receiver_;
mojo::Receiver<mojom::TtsStream> stream_receiver_;
// Protects access to state from main thread and audio thread.
base::Lock state_lock_;
// Prebuilt.
LibChromeTtsLoader libchrometts_ GUARDED_BY(state_lock_);
// Connection to tts in the component extension.
mojo::Receiver<mojom::TtsStream> stream_receiver_ GUARDED_BY(state_lock_);
// Connection to send tts events to component extension.
mojo::Remote<mojom::TtsEventObserver> tts_event_observer_
GUARDED_BY(state_lock_);
// Outputs speech synthesis to audio.
std::unique_ptr<audio::OutputDevice> output_device_ GUARDED_BY(state_lock_);
// Tracks whether any audio data came as a result of |Speak|. Reset for every
// call to |Speak|.
bool got_first_buffer_ GUARDED_BY(state_lock_);
}; };
} // namespace tts } // namespace tts
......
...@@ -25,6 +25,13 @@ TtsProcessPolicy::TtsProcessPolicy() {} ...@@ -25,6 +25,13 @@ TtsProcessPolicy::TtsProcessPolicy() {}
TtsProcessPolicy::~TtsProcessPolicy() {} TtsProcessPolicy::~TtsProcessPolicy() {}
ResultExpr TtsProcessPolicy::EvaluateSyscall(int sysno) const { ResultExpr TtsProcessPolicy::EvaluateSyscall(int sysno) const {
switch (sysno) {
case __NR_sched_setscheduler:
return RestrictSchedTarget(GetPolicyPid(), sysno);
default:
break;
}
auto* sandbox_linux = SandboxLinux::GetInstance(); auto* sandbox_linux = SandboxLinux::GetInstance();
if (sandbox_linux->ShouldBrokerHandleSyscall(sysno)) if (sandbox_linux->ShouldBrokerHandleSyscall(sysno))
return sandbox_linux->HandleViaBroker(); return sandbox_linux->HandleViaBroker();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment