Commit 47754cf1 authored by David Tseng, committed by Commit Bot

Native audio tts playback

Moves text-to-speech playback from js-based web audio to C++.
See bug for more details.

Depends on
https://chromium-review.googlesource.com/c/chromiumos/overlays/chromiumos-overlay/+/2456208/1

Fixed: 1134289
Change-Id: I7ecfa800f6b688f6a3d0d655f557e86efa73188a
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2435919
Commit-Queue: David Tseng <dtseng@chromium.org>
Reviewed-by: Dominic Mazzoni <dmazzoni@chromium.org>
Reviewed-by: Robert Sesek <rsesek@chromium.org>
Reviewed-by: Dale Curtis <dalecurtis@chromium.org>
Cr-Commit-Position: refs/heads/master@{#814980}
parent 04ebb1f4
......@@ -148,7 +148,10 @@ void TtsEngineExtensionObserver::BindTtsStream(
.WithDisplayName("TtsService")
.Pass());
tts_service_->BindTtsStream(std::move(receiver));
mojo::PendingRemote<audio::mojom::StreamFactory> factory_remote;
auto factory_receiver = factory_remote.InitWithNewPipeAndPassReceiver();
content::GetAudioService().BindStreamFactory(std::move(factory_receiver));
tts_service_->BindTtsStream(std::move(receiver), std::move(factory_remote));
}
#endif // defined(OS_CHROMEOS)
......
......@@ -8,6 +8,7 @@
#include "base/macros.h"
#include "base/scoped_observer.h"
#include "components/keyed_service/core/keyed_service.h"
#include "content/public/browser/audio_service.h"
#include "extensions/browser/event_router.h"
#include "extensions/browser/extension_registry.h"
#include "extensions/browser/extension_registry_observer.h"
......
......@@ -16,6 +16,7 @@ source_set("tts") {
":libchrometts",
"//base",
"//chromeos/services/tts/public/mojom",
"//services/audio/public/cpp:cpp",
]
}
......@@ -54,5 +55,6 @@ generate_library_loader("libchrometts") {
"GoogleTtsGetTimepointsCount",
"GoogleTtsGetTimepointsTimeInSecsAtIndex",
"GoogleTtsGetTimepointsCharIndexAtIndex",
"GoogleTtsGetFramesInAudioBuffer",
]
}
include_rules = [
"+media/base",
"+mojo/public",
"+sandbox",
"+sandbox/policy",
"+services/audio/public/cpp",
]
......@@ -20,7 +20,7 @@ bool GoogleTtsInstallVoice(const char* voice_name,
bool GoogleTtsInitBuffered(const char* text_jspb, int text_jspb_len);
int GoogleTtsReadBuffered();
int GoogleTtsReadBuffered(float* audio_channel_buffer, size_t* frames_written);
void GoogleTtsFinalizeBuffered();
......@@ -33,4 +33,7 @@ int GoogleTtsGetTimepointsCharIndexAtIndex(size_t index);
char* GoogleTtsGetEventBufferPtr();
size_t GoogleTtsGetEventBufferLen();
size_t GoogleTtsGetFramesInAudioBuffer();
#endif // CHROMEOS_SERVICES_TTS_CHROME_TTS_H_
......@@ -6,4 +6,6 @@ import("//mojo/public/tools/bindings/mojom.gni")
mojom("mojom") {
sources = [ "tts_service.mojom" ]
public_deps = [ "//services/audio/public/mojom" ]
}
......@@ -4,26 +4,7 @@
module chromeos.tts.mojom;
// Structure describing a point in time during speech synthesis.
struct Timepoint {
// The time, in seconds.
float time_sec;
// The index in the text being spoken.
int32 char_index;
};
// A TTS event and associated metadata within a TTS stream.
struct TtsStreamItem {
// An internal serialized proto.speech.tts.TtsControllerEvent proto.
array<uint8> event_buffer_bytes;
// Whether streaming is complete.
bool done;
// A list of timepoints associated with the event above.
array<Timepoint> timepoints;
};
import "services/audio/public/mojom/stream_factory.mojom";
// The main interface to the TTS engine on Chrome OS. Only used by and private
// to the Chrome OS Google TTS engine component extension. TtsService lives in a
......@@ -32,12 +13,14 @@ struct TtsStreamItem {
// and the Google TTS engine component extension through a TtsStream, but does
// not participate otherwise.
interface TtsService {
// Binds a TtsStream to this service.
BindTtsStream(pending_receiver<TtsStream> receiver);
// Binds a TtsStream to this service. |stream_factory| is the audio service
// StreamFactory which this service uses to create its output and play audio.
BindTtsStream(pending_receiver<TtsStream> receiver,
pending_remote<audio.mojom.StreamFactory> stream_factory);
};
// Interface for the Google component TTS engine to control and consume a stream
// of TtsStreamItems produced by TtsService. There is only ever one TtsStream
// Interface for the Google component TTS engine to control
// the TtsService's production of audio. There is only ever one TtsStream
// owned by the TtsService.
//
// The component extension sets up the stream's voice by doing:
......@@ -45,25 +28,18 @@ interface TtsService {
// InstallVoice(other_data, "other_voice")
// SelectVoice("other_voice")
//
// After reading from the stream (see below), the component extension can do:
// After speaking using the stream (see below), the component extension can do:
// SelectVoice("voice")
// to change voices.
//
// The component extension calls the following three methods repeatedly, in
// order to read from the stream given text. For example,
// The component extension calls the following two methods repeatedly and
// optionally observes events.
//
// Init(<a proto containing text "Hello there.">)
// Read()
// Read()
// ...
// Finalize()
// Init(<proto containing text "Testing 1, 2, 3.")
// Read()
// Read()
// ...
// Finalize()
// Speak(<a proto containing text "Hello there.">)
// Speak(<a proto containing text "Testing 1, 2, 3.">)
// Stop()
//
// Note that the component extension may call Finalize() early, if the TTS api
// Note that the component extension may call Stop() early, if the TTS api
// wants to, for example, stop speech.
interface TtsStream {
// Forward and install the |voice_name| encoded by |voice_bytes|.
......@@ -73,13 +49,30 @@ interface TtsStream {
// Selects a voice for streaming given a |voice_name|.
SelectVoice(string voice_name) => (bool success);
// Initialize a new TTS stream given a serialized proto.speech.tts.Text proto.
Init(array<uint8> text_jspb)
=> (bool success);
// Speak text described by a serialized proto.speech.tts.Text proto.
Speak(array<uint8> text_jspb)
=> (pending_receiver<TtsEventObserver> event_observer);
// Stop speaking the currently speaking text, if any.
Stop();
// Sets the volume of the tts playback (0.0 to 1.0).
SetVolume(float volume);
};
// Returned to callers of TtsStream.Speak(). It receives notable events
// pertaining to the text spoken.
interface TtsEventObserver {
// TtsStream.Speak started speech playback.
OnStart();
// TtsStream.Speak is playing text at |char_index| approximately at the
// current time.
OnTimepoint(int32 char_index);
// Read the next stream item.
Read() => (TtsStreamItem item);
// TtsStream.Speak ended speech playback.
OnEnd();
// Clean up and finish the current TTS stream.
Finalize();
// TtsStream.Speak encountered an error.
OnError();
};
......@@ -5,9 +5,13 @@
#include "chromeos/services/tts/tts_service.h"
#include <dlfcn.h>
#include <sys/resource.h>
#include "base/files/file_util.h"
#include "chromeos/services/tts/constants.h"
#include "media/base/audio_parameters.h"
#include "media/base/audio_sample_types.h"
#include "services/audio/public/cpp/output_device.h"
namespace chromeos {
namespace tts {
......@@ -37,24 +41,46 @@ void HandleLibraryLogging(int severity, const char* message) {
// methods utilize C features only.
TtsService::TtsService(mojo::PendingReceiver<mojom::TtsService> receiver)
: service_receiver_(this, std::move(receiver)), stream_receiver_(this) {
: service_receiver_(this, std::move(receiver)),
stream_receiver_(this),
got_first_buffer_(false) {
if (setpriority(PRIO_PROCESS, 0, -10 /* real time audio */) != 0) {
PLOG(ERROR) << "Unable to request real time priority; performance will be "
"impacted.";
}
bool loaded = libchrometts_.Load(kLibchromettsPath);
if (!loaded)
LOG(ERROR) << "Unable to load libchrometts.so: " << dlerror();
else
if (!loaded) {
LOG(ERROR) << "Unable to load libchrometts.so.";
exit(0);
} else {
libchrometts_.GoogleTtsSetLogger(HandleLibraryLogging);
}
}
TtsService::~TtsService() = default;
void TtsService::BindTtsStream(
mojo::PendingReceiver<mojom::TtsStream> receiver) {
mojo::PendingReceiver<mojom::TtsStream> receiver,
mojo::PendingRemote<audio::mojom::StreamFactory> factory) {
base::AutoLock al(state_lock_);
stream_receiver_.Bind(std::move(receiver));
// TODO(accessibility): The sample rate below can change based on the audio
// data retrieved. Plumb this data through and re-create the output device if
// it changes.
media::AudioParameters params(
media::AudioParameters::AUDIO_PCM_LOW_LATENCY, media::CHANNEL_LAYOUT_MONO,
22050 /* sample rate */, libchrometts_.GoogleTtsGetFramesInAudioBuffer());
output_device_ = std::make_unique<audio::OutputDevice>(
std::move(factory), params, this, std::string());
}
void TtsService::InstallVoice(const std::string& voice_name,
const std::vector<uint8_t>& voice_bytes,
InstallVoiceCallback callback) {
base::AutoLock al(state_lock_);
// Create a directory to place extracted voice data.
base::FilePath voice_data_path(kTempDataDirectory);
voice_data_path = voice_data_path.Append(voice_name);
......@@ -75,6 +101,8 @@ void TtsService::InstallVoice(const std::string& voice_name,
void TtsService::SelectVoice(const std::string& voice_name,
SelectVoiceCallback callback) {
base::AutoLock al(state_lock_);
base::FilePath path_prefix =
base::FilePath(kTempDataDirectory).Append(voice_name);
base::FilePath pipeline_path = path_prefix.Append("pipeline");
......@@ -82,39 +110,93 @@ void TtsService::SelectVoice(const std::string& voice_name,
pipeline_path.value().c_str(), path_prefix.value().c_str()));
}
// Starts buffered synthesis for the serialized proto.speech.tts.Text held in
// |text_jspb| and reports the engine's success flag through |callback|.
void TtsService::Init(const std::vector<uint8_t>& text_jspb,
                      InitCallback callback) {
  // Use data() rather than &text_jspb[0]: indexing an empty vector is
  // undefined behavior, whereas data() is always valid to call. The engine
  // entry point takes a const char*, so no const needs to be cast away.
  const char* text_bytes = reinterpret_cast<const char*>(text_jspb.data());
  std::move(callback).Run(libchrometts_.GoogleTtsInitBuffered(
      text_bytes, static_cast<int>(text_jspb.size())));
}
void TtsService::Speak(const std::vector<uint8_t>& text_jspb,
SpeakCallback callback) {
base::AutoLock al(state_lock_);
void TtsService::Read(ReadCallback callback) {
int32_t status = libchrometts_.GoogleTtsReadBuffered();
if (status == -1) {
std::move(callback).Run(mojom::TtsStreamItem::New(
std::vector<uint8_t>(), true, std::vector<mojom::TimepointPtr>()));
tts_event_observer_.reset();
auto pending_receiver = tts_event_observer_.BindNewPipeAndPassReceiver();
std::move(callback).Run(std::move(pending_receiver));
bool status = libchrometts_.GoogleTtsInitBuffered((char*)&text_jspb[0],
text_jspb.size());
if (!status) {
tts_event_observer_->OnError();
return;
}
char* event = libchrometts_.GoogleTtsGetEventBufferPtr();
std::vector<uint8_t> send_event(libchrometts_.GoogleTtsGetEventBufferLen());
for (size_t i = 0; i < send_event.size(); i++)
send_event[i] = event[i];
std::vector<mojom::TimepointPtr> timepoints(
libchrometts_.GoogleTtsGetTimepointsCount());
for (size_t i = 0; i < timepoints.size(); i++) {
timepoints[i] = mojom::Timepoint::New(
libchrometts_.GoogleTtsGetTimepointsTimeInSecsAtIndex(i),
libchrometts_.GoogleTtsGetTimepointsCharIndexAtIndex(i));
output_device_->Play();
}
// mojom::TtsStream implementation: takes |state_lock_| and hands off to
// StopLocked(), which requires that lock to be held.
void TtsService::Stop() {
  base::AutoLock scoped_state_lock(state_lock_);
  StopLocked();
}
// mojom::TtsStream implementation: forwards |volume| unchanged to the audio
// output device while holding |state_lock_|.
void TtsService::SetVolume(float volume) {
  base::AutoLock scoped_state_lock(state_lock_);
  output_device_->SetVolume(volume);
}
// media::AudioRendererSink::RenderCallback implementation.
//
// Asks libchrometts for the next chunk of synthesized audio, written into
// channel 0 of |dest|, and returns the number of frames the engine reported.
// Returns 0 when |state_lock_| is contended, when the engine reports an error
// or completion, or when the engine produced no frames. |delay|,
// |delay_timestamp| and |prior_frames_skipped| are not used.
//
// Side effects: sends OnStart for the first non-empty buffer, OnTimepoint when
// the engine reports a timepoint, OnError on engine failure, and calls
// StopLocked() once the engine reports an error or completion.
//
// The lock is taken with Try() and is not scoped, so every exit path after a
// successful Try() must call state_lock_.Release() itself.
int TtsService::Render(base::TimeDelta delay,
base::TimeTicks delay_timestamp,
int prior_frames_skipped,
media::AudioBus* dest) {
// Careful to not block the render callback. Only try to acquire the lock
// here, but early return if we are processing a series of other calls. This
// can be extremely important if there's a long queue of pending Speak/Stop
// pairs being processed on the main thread. This can occur if the tts api
// receives lots of tts requests.
if (!state_lock_.Try()) {
return 0;
}
// |frames| is filled in by the engine with the number of frames written.
size_t frames = 0;
int32_t status =
libchrometts_.GoogleTtsReadBuffered(dest->channel(0), &frames);
if (status <= 0) {
// -1 means an error, 0 means done.
if (status == -1)
tts_event_observer_->OnError();
// Clear the destination bus and stop; StopLocked() needs the lock, which is
// still held here.
dest->Zero();
StopLocked();
state_lock_.Release();
return 0;
}
// The engine is still running but produced nothing for this callback.
if (frames == 0) {
state_lock_.Release();
return 0;
}
// Announce the start of speech exactly once per utterance; StopLocked()
// clears |got_first_buffer_| again.
if (!got_first_buffer_) {
got_first_buffer_ = true;
tts_event_observer_->OnStart();
}
// NOTE(review): the statement below references |callback|, |send_event| and
// |timepoints|, none of which are declared in this function. It looks like
// residue from the removed Read() implementation -- confirm and drop.
std::move(callback).Run(mojom::TtsStreamItem::New(send_event, status == 0,
std::move(timepoints)));
// There's only really ever one timepoint since we play this buffer in one
// chunk.
// -1 is used locally as the "no timepoint available" marker.
int char_index = -1;
if (libchrometts_.GoogleTtsGetTimepointsCount() > 0)
char_index = libchrometts_.GoogleTtsGetTimepointsCharIndexAtIndex(0);
if (char_index != -1)
tts_event_observer_->OnTimepoint(char_index);
state_lock_.Release();
// |frames| is a size_t and is implicitly converted to the int return type.
return frames;
}
void TtsService::Finalize() {
// media::AudioRendererSink::RenderCallback implementation. The body is empty,
// so render errors are not acted upon here.
void TtsService::OnRenderError() {}
// Pauses audio output and finalizes the engine's buffered synthesis. Callers
// must already hold |state_lock_|.
void TtsService::StopLocked() {
  output_device_->Pause();
  libchrometts_.GoogleTtsFinalizeBuffered();

  // Report the end of speech only if a start was reported, i.e. at least one
  // audio buffer was rendered, and an observer is bound to hear about it.
  const bool speech_had_started = got_first_buffer_;
  got_first_buffer_ = false;
  if (tts_event_observer_ && speech_had_started)
    tts_event_observer_->OnEnd();
}
} // namespace tts
......
......@@ -5,36 +5,77 @@
#ifndef CHROMEOS_SERVICES_TTS_TTS_SERVICE_H_
#define CHROMEOS_SERVICES_TTS_TTS_SERVICE_H_
#include "base/synchronization/lock.h"
#include "base/thread_annotations.h"
#include "chromeos/services/tts/public/mojom/tts_service.mojom.h"
#include "library_loaders/libchrometts.h"
#include "media/base/audio_renderer_sink.h"
#include "mojo/public/cpp/bindings/receiver.h"
#include "mojo/public/cpp/bindings/remote.h"
namespace audio {
class OutputDevice;
}
namespace chromeos {
namespace tts {
class TtsService : public mojom::TtsService, public mojom::TtsStream {
class TtsService : public mojom::TtsService,
public mojom::TtsStream,
public media::AudioRendererSink::RenderCallback {
public:
explicit TtsService(mojo::PendingReceiver<mojom::TtsService> receiver);
~TtsService() override;
private:
// TtsService:
void BindTtsStream(mojo::PendingReceiver<mojom::TtsStream> receiver) override;
// mojom::TtsService:
void BindTtsStream(
mojo::PendingReceiver<mojom::TtsStream> receiver,
mojo::PendingRemote<audio::mojom::StreamFactory> factory) override;
// TtsStream:
// mojom::TtsStream:
void InstallVoice(const std::string& voice_name,
const std::vector<uint8_t>& voice_bytes,
InstallVoiceCallback callback) override;
void SelectVoice(const std::string& voice_name,
SelectVoiceCallback callback) override;
void Init(const std::vector<uint8_t>& text_jspb,
InitCallback callback) override;
void Read(ReadCallback callback) override;
void Finalize() override;
void Speak(const std::vector<uint8_t>& text_jspb,
SpeakCallback callback) override;
void Stop() override;
void SetVolume(float volume) override;
// media::AudioRendererSink::RenderCallback:
int Render(base::TimeDelta delay,
base::TimeTicks delay_timestamp,
int prior_frames_skipped,
media::AudioBus* dest) override;
void OnRenderError() override;
// Handles stopping tts.
void StopLocked() EXCLUSIVE_LOCKS_REQUIRED(state_lock_);
LibChromeTtsLoader libchrometts_;
// Connection to tts in the browser.
mojo::Receiver<mojom::TtsService> service_receiver_;
mojo::Receiver<mojom::TtsStream> stream_receiver_;
// Protects access to state from main thread and audio thread.
base::Lock state_lock_;
// Prebuilt.
LibChromeTtsLoader libchrometts_ GUARDED_BY(state_lock_);
// Connection to tts in the component extension.
mojo::Receiver<mojom::TtsStream> stream_receiver_ GUARDED_BY(state_lock_);
// Connection to send tts events to component extension.
mojo::Remote<mojom::TtsEventObserver> tts_event_observer_
GUARDED_BY(state_lock_);
// Outputs speech synthesis to audio.
std::unique_ptr<audio::OutputDevice> output_device_ GUARDED_BY(state_lock_);
// Tracks whether any audio data came as a result of |Speak|. Reset for every
// call to |Speak|.
bool got_first_buffer_ GUARDED_BY(state_lock_);
};
} // namespace tts
......
......@@ -25,6 +25,13 @@ TtsProcessPolicy::TtsProcessPolicy() {}
TtsProcessPolicy::~TtsProcessPolicy() {}
ResultExpr TtsProcessPolicy::EvaluateSyscall(int sysno) const {
switch (sysno) {
case __NR_sched_setscheduler:
return RestrictSchedTarget(GetPolicyPid(), sysno);
default:
break;
}
auto* sandbox_linux = SandboxLinux::GetInstance();
if (sandbox_linux->ShouldBrokerHandleSyscall(sysno))
return sandbox_linux->HandleViaBroker();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment