Native audio tts playback

Moves text-to-speech playback from js-based web audio to C++. See bug for more details. Depends on https://chromium-review.googlesource.com/c/chromiumos/overlays/chromiumos-overlay/+/2456208/1 Fixed: 1134289 Change-Id: I7ecfa800f6b688f6a3d0d655f557e86efa73188a Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2435919 Commit-Queue: David Tseng <dtseng@chromium.org> Reviewed-by: Dominic Mazzoni <dmazzoni@chromium.org> Reviewed-by: Robert Sesek <rsesek@chromium.org> Reviewed-by: Dale Curtis <dalecurtis@chromium.org> Cr-Commit-Position: refs/heads/master@{#814980}

Native audio tts playback
Moves text-to-speech playback from js-based web audio to C++. See bug for more details. Depends on https://chromium-review.googlesource.com/c/chromiumos/overlays/chromiumos-overlay/+/2456208/1 Fixed: 1134289 Change-Id: I7ecfa800f6b688f6a3d0d655f557e86efa73188a Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2435919 Commit-Queue: David Tseng <dtseng@chromium.org> Reviewed-by: Dominic Mazzoni <dmazzoni@chromium.org> Reviewed-by: Robert Sesek <rsesek@chromium.org> Reviewed-by: Dale Curtis <dalecurtis@chromium.org> Cr-Commit-Position: refs/heads/master@{#814980}
47754cf1 · David Tseng · Commit Bot · 04ebb1f4 · 47754cf1 · 47754cf1
Commit 47754cf1 authored Oct 08, 2020 by David Tseng Committed by Commit Bot Oct 08, 2020
10 changed files
--- a/chrome/browser/speech/extension_api/tts_engine_extension_observer.cc
+++ b/chrome/browser/speech/extension_api/tts_engine_extension_observer.cc
@@ -148,7 +148,10 @@ void TtsEngineExtensionObserver::BindTtsStream(
              .WithDisplayName("TtsService")
              .Pass());

-  tts_service_->BindTtsStream(std::move(receiver));
+  mojo::PendingRemote<audio::mojom::StreamFactory> factory_remote;
+  auto factory_receiver = factory_remote.InitWithNewPipeAndPassReceiver();
+  content::GetAudioService().BindStreamFactory(std::move(factory_receiver));
+  tts_service_->BindTtsStream(std::move(receiver), std::move(factory_remote));
 }
 #endif  // defined(OS_CHROMEOS)


--- a/chrome/browser/speech/extension_api/tts_engine_extension_observer.h
+++ b/chrome/browser/speech/extension_api/tts_engine_extension_observer.h
@@ -8,6 +8,7 @@
 #include "base/macros.h"
 #include "base/scoped_observer.h"
 #include "components/keyed_service/core/keyed_service.h"
+#include "content/public/browser/audio_service.h"
 #include "extensions/browser/event_router.h"
 #include "extensions/browser/extension_registry.h"
 #include "extensions/browser/extension_registry_observer.h"

--- a/chromeos/services/tts/BUILD.gn
+++ b/chromeos/services/tts/BUILD.gn
@@ -16,6 +16,7 @@ source_set("tts") {
    ":libchrometts",
    "//base",
    "//chromeos/services/tts/public/mojom",
+    "//services/audio/public/cpp:cpp",
  ]
 }

@@ -54,5 +55,6 @@ generate_library_loader("libchrometts") {
    "GoogleTtsGetTimepointsCount",
    "GoogleTtsGetTimepointsTimeInSecsAtIndex",
    "GoogleTtsGetTimepointsCharIndexAtIndex",
+    "GoogleTtsGetFramesInAudioBuffer",
  ]
 }
--- a/chromeos/services/tts/DEPS
+++ b/chromeos/services/tts/DEPS
 include_rules = [
+  "+media/base",
  "+mojo/public",
  "+sandbox",
  "+sandbox/policy",
+  "+services/audio/public/cpp",
 ]
--- a/chromeos/services/tts/chrome_tts.h
+++ b/chromeos/services/tts/chrome_tts.h
@@ -20,7 +20,7 @@ bool GoogleTtsInstallVoice(const char* voice_name,

 bool GoogleTtsInitBuffered(const char* text_jspb, int text_jspb_len);

-int GoogleTtsReadBuffered();
+int GoogleTtsReadBuffered(float* audio_channel_buffer, size_t* frames_written);

 void GoogleTtsFinalizeBuffered();

@@ -33,4 +33,7 @@ int GoogleTtsGetTimepointsCharIndexAtIndex(size_t index);
 char* GoogleTtsGetEventBufferPtr();

 size_t GoogleTtsGetEventBufferLen();
+
+size_t GoogleTtsGetFramesInAudioBuffer();
+
 #endif  // CHROMEOS_SERVICES_TTS_CHROME_TTS_H_
--- a/chromeos/services/tts/public/mojom/BUILD.gn
+++ b/chromeos/services/tts/public/mojom/BUILD.gn
@@ -6,4 +6,6 @@ import("//mojo/public/tools/bindings/mojom.gni")

 mojom("mojom") {
  sources = [ "tts_service.mojom" ]
+
+  public_deps = [ "//services/audio/public/mojom" ]
 }
--- a/chromeos/services/tts/public/mojom/tts_service.mojom
+++ b/chromeos/services/tts/public/mojom/tts_service.mojom
@@ -4,26 +4,7 @@

 module chromeos.tts.mojom;

-// Structure describing a point in time during speech synthesis.
-struct Timepoint {
-  // The time, in seconds.
-  float time_sec;
-
-  // The index in the text being spoken.
-  int32 char_index;
-};
-
-// A TTS event and associated metadata within a TTS stream.
-struct TtsStreamItem {
-  // An internal serialized proto.speech.tts.TtsControllerEvent proto.
-  array<uint8> event_buffer_bytes;
-
-  // Whether streaming is complete.
-  bool done;
-
-  // A list of timepoints associated with the event above.
-  array<Timepoint> timepoints;
-};
+import "services/audio/public/mojom/stream_factory.mojom";

 // The main interface to the TTS engine on Chrome OS. Only used by and private
 // to the Chrome OS Google TTS engine component extension. TtsService lives in a
@@ -32,12 +13,14 @@ struct TtsStreamItem {
 // and the Google TTS engine component extension through a TtsStream, but does
 // not participate otherwise.
 interface TtsService {
-  // Binds a TtsStream to this service.
-  BindTtsStream(pending_receiver<TtsStream> receiver);
+  // Binds a TtsStream to this service. |stream_factory| is used by this
+  // service to create the audio output stream that plays synthesized speech.
+  BindTtsStream(pending_receiver<TtsStream> receiver,
+                pending_remote<audio.mojom.StreamFactory> stream_factory);
 };

-// Interface for the Google component TTS engine to control and consume a stream
-// of TtsStreamItems produced by TtsService. There is only ever one TtsStream
+// Interface for the Google component TTS engine to control
+// the TtsService's production of audio. There is only ever one TtsStream
 // owned by the TtsService.
 //
 // The component extension sets up the stream's voice by doing:
@@ -45,25 +28,18 @@ interface TtsService {
 // InstallVoice(other_data, "other_voice")
 // SelectVoice("other_voice")
 //
-// After reading from the stream (see below), the component extension can do:
+// After speaking using the stream (see below), the component extension can do:
 // SelectVoice("voice")
 // to change voices.
 //
-// The component extension calls the following three methods repeatedly, in
-// order to read from the stream given text. For example,
+// The component extension calls the following two methods repeatedly and
+// optionally observes events.
 //
-// Init(<a proto containing text "Hello there.">)
-// Read()
-// Read()
-// ...
-// Finalize()
-// Init(<proto containing text "Testing 1, 2, 3.")
-// Read()
-// Read()
-// ...
-// Finalize()
+// Speak(<a proto containing text "Hello there.">)
+// Speak(<a proto containing text "Testing 1, 2, 3.">)
+// Stop()
 //
-// Note that the component extension may call Finalize() early, if the TTS api
+// Note that the component extension may call Stop() early, if the TTS api
 // wants to, for example, stop speech.
 interface TtsStream {
  // Forward and install the |voice_name| encoded by |voice_bytes|.
@@ -73,13 +49,30 @@ interface TtsStream {
  // Selects a voice for streaming given a |voice_name|.
  SelectVoice(string voice_name) => (bool success);

-  // Initialize a new TTS stream given a serialized proto.speech.tts.Text proto.
-  Init(array<uint8> text_jspb)
-      => (bool success);
+  // Speak text described by a serialized proto.speech.tts.Text proto.
+  Speak(array<uint8> text_jspb)
+      => (pending_receiver<TtsEventObserver> event_observer);
+
+  // Stop speaking the currently speaking text, if any.
+  Stop();
+
+  // Sets the volume of the tts playback (0.0 to 1.0).
+  SetVolume(float volume);
+};
+
+// Returned to callers of TtsStream.Speak(). It receives notable events
+// pertaining to the text spoken.
+interface TtsEventObserver {
+  // TtsStream.Speak started speech playback.
+  OnStart();
+
+  // TtsStream.Speak is playing text at |char_index| approximately at the
+  // current time.
+  OnTimepoint(int32 char_index);

-  // Read the next stream item.
-  Read() => (TtsStreamItem item);
+  // TtsStream.Speak ended speech playback.
+  OnEnd();

-  // Clean up and finish the current TTS stream.
-  Finalize();
+  // TtsStream.Speak encountered an error.
+  OnError();
 };
--- a/chromeos/services/tts/tts_service.cc
+++ b/chromeos/services/tts/tts_service.cc
@@ -5,9 +5,13 @@
 #include "chromeos/services/tts/tts_service.h"

 #include <dlfcn.h>
+#include <sys/resource.h>

 #include "base/files/file_util.h"
 #include "chromeos/services/tts/constants.h"
+#include "media/base/audio_parameters.h"
+#include "media/base/audio_sample_types.h"
+#include "services/audio/public/cpp/output_device.h"

 namespace chromeos {
 namespace tts {
@@ -37,24 +41,46 @@ void HandleLibraryLogging(int severity, const char* message) {
 // methods utilize C features only.

 TtsService::TtsService(mojo::PendingReceiver<mojom::TtsService> receiver)
-    : service_receiver_(this, std::move(receiver)), stream_receiver_(this) {
+    : service_receiver_(this, std::move(receiver)),
+      stream_receiver_(this),
+      got_first_buffer_(false) {
+  if (setpriority(PRIO_PROCESS, 0, -10 /* real time audio */) != 0) {
+    PLOG(ERROR) << "Unable to request real time priority; performance will be "
+                   "impacted.";
+  }
  bool loaded = libchrometts_.Load(kLibchromettsPath);
-  if (!loaded)
-    LOG(ERROR) << "Unable to load libchrometts.so: " << dlerror();
-  else
+  if (!loaded) {
+    LOG(ERROR) << "Unable to load libchrometts.so.";
+    exit(0);
+  } else {
    libchrometts_.GoogleTtsSetLogger(HandleLibraryLogging);
+  }
 }

 TtsService::~TtsService() = default;

 void TtsService::BindTtsStream(
-    mojo::PendingReceiver<mojom::TtsStream> receiver) {
+    mojo::PendingReceiver<mojom::TtsStream> receiver,
+    mojo::PendingRemote<audio::mojom::StreamFactory> factory) {
+  base::AutoLock al(state_lock_);
  stream_receiver_.Bind(std::move(receiver));
+
+  // TODO(accessibility): The sample rate below can change based on the audio
+  // data retrieved. Plumb this data through and re-create the output device if
+  // it changes.
+  media::AudioParameters params(
+      media::AudioParameters::AUDIO_PCM_LOW_LATENCY, media::CHANNEL_LAYOUT_MONO,
+      22050 /* sample rate */, libchrometts_.GoogleTtsGetFramesInAudioBuffer());
+
+  output_device_ = std::make_unique<audio::OutputDevice>(
+      std::move(factory), params, this, std::string());
 }

 void TtsService::InstallVoice(const std::string& voice_name,
                              const std::vector<uint8_t>& voice_bytes,
                              InstallVoiceCallback callback) {
+  base::AutoLock al(state_lock_);
+
  // Create a directory to place extracted voice data.
  base::FilePath voice_data_path(kTempDataDirectory);
  voice_data_path = voice_data_path.Append(voice_name);
@@ -75,6 +101,8 @@ void TtsService::InstallVoice(const std::string& voice_name,

 void TtsService::SelectVoice(const std::string& voice_name,
                             SelectVoiceCallback callback) {
+  base::AutoLock al(state_lock_);
+
  base::FilePath path_prefix =
      base::FilePath(kTempDataDirectory).Append(voice_name);
  base::FilePath pipeline_path = path_prefix.Append("pipeline");
@@ -82,39 +110,93 @@ void TtsService::SelectVoice(const std::string& voice_name,
      pipeline_path.value().c_str(), path_prefix.value().c_str()));
 }

-void TtsService::Init(const std::vector<uint8_t>& text_jspb,
-                      InitCallback callback) {
-  std::move(callback).Run(libchrometts_.GoogleTtsInitBuffered(
-      (char*)&text_jspb[0], text_jspb.size()));
-}
+void TtsService::Speak(const std::vector<uint8_t>& text_jspb,
+                       SpeakCallback callback) {
+  base::AutoLock al(state_lock_);

-void TtsService::Read(ReadCallback callback) {
-  int32_t status = libchrometts_.GoogleTtsReadBuffered();
-  if (status == -1) {
-    std::move(callback).Run(mojom::TtsStreamItem::New(
-        std::vector<uint8_t>(), true, std::vector<mojom::TimepointPtr>()));
+  tts_event_observer_.reset();
+  auto pending_receiver = tts_event_observer_.BindNewPipeAndPassReceiver();
+  std::move(callback).Run(std::move(pending_receiver));
+
+  bool status = libchrometts_.GoogleTtsInitBuffered((char*)&text_jspb[0],
+                                                    text_jspb.size());
+  if (!status) {
+    tts_event_observer_->OnError();
    return;
  }

-  char* event = libchrometts_.GoogleTtsGetEventBufferPtr();
-  std::vector<uint8_t> send_event(libchrometts_.GoogleTtsGetEventBufferLen());
-  for (size_t i = 0; i < send_event.size(); i++)
-    send_event[i] = event[i];
-
-  std::vector<mojom::TimepointPtr> timepoints(
-      libchrometts_.GoogleTtsGetTimepointsCount());
-  for (size_t i = 0; i < timepoints.size(); i++) {
-    timepoints[i] = mojom::Timepoint::New(
-        libchrometts_.GoogleTtsGetTimepointsTimeInSecsAtIndex(i),
-        libchrometts_.GoogleTtsGetTimepointsCharIndexAtIndex(i));
+  output_device_->Play();
+}
+
+void TtsService::Stop() {
+  base::AutoLock al(state_lock_);
+  StopLocked();
+}
+
+void TtsService::SetVolume(float volume) {
+  base::AutoLock al(state_lock_);
+  output_device_->SetVolume(volume);
+}
+
+int TtsService::Render(base::TimeDelta delay,
+                       base::TimeTicks delay_timestamp,
+                       int prior_frames_skipped,
+                       media::AudioBus* dest) {
+  // Careful to not block the render callback. Only try to acquire the lock
+  // here, but early return if we are processing a series of other calls. This
+  // can be extremely important if there's a long queue of pending Speak/Stop
+  // pairs being processed on the main thread. This can occur if the tts api
+  // receives lots of tts requests.
+  if (!state_lock_.Try()) {
+    return 0;
+  }
+
+  size_t frames = 0;
+  int32_t status =
+      libchrometts_.GoogleTtsReadBuffered(dest->channel(0), &frames);
+
+  if (status <= 0) {
+    // -1 means an error, 0 means done.
+    if (status == -1)
+      tts_event_observer_->OnError();
+
+    dest->Zero();
+    StopLocked();
+    state_lock_.Release();
+    return 0;
+  }
+
+  if (frames == 0) {
+    state_lock_.Release();
+    return 0;
+  }
+
+  if (!got_first_buffer_) {
+    got_first_buffer_ = true;
+    tts_event_observer_->OnStart();
  }

-  std::move(callback).Run(mojom::TtsStreamItem::New(send_event, status == 0,
-                                                    std::move(timepoints)));
+  // There's only really ever one timepoint since we play this buffer in one
+  // chunk.
+  int char_index = -1;
+  if (libchrometts_.GoogleTtsGetTimepointsCount() > 0)
+    char_index = libchrometts_.GoogleTtsGetTimepointsCharIndexAtIndex(0);
+
+  if (char_index != -1)
+    tts_event_observer_->OnTimepoint(char_index);
+
+  state_lock_.Release();
+  return frames;
 }

-void TtsService::Finalize() {
+void TtsService::OnRenderError() {}
+
+void TtsService::StopLocked() {
+  output_device_->Pause();
  libchrometts_.GoogleTtsFinalizeBuffered();
+  if (tts_event_observer_ && got_first_buffer_)
+    tts_event_observer_->OnEnd();
+  got_first_buffer_ = false;
 }

 }  // namespace tts

--- a/chromeos/services/tts/tts_service.h
+++ b/chromeos/services/tts/tts_service.h
@@ -5,36 +5,77 @@
 #ifndef CHROMEOS_SERVICES_TTS_TTS_SERVICE_H_
 #define CHROMEOS_SERVICES_TTS_TTS_SERVICE_H_

+#include "base/synchronization/lock.h"
+#include "base/thread_annotations.h"
 #include "chromeos/services/tts/public/mojom/tts_service.mojom.h"
 #include "library_loaders/libchrometts.h"
+#include "media/base/audio_renderer_sink.h"
 #include "mojo/public/cpp/bindings/receiver.h"
+#include "mojo/public/cpp/bindings/remote.h"
+
+namespace audio {
+class OutputDevice;
+}

 namespace chromeos {
 namespace tts {

-class TtsService : public mojom::TtsService, public mojom::TtsStream {
+class TtsService : public mojom::TtsService,
+                   public mojom::TtsStream,
+                   public media::AudioRendererSink::RenderCallback {
 public:
  explicit TtsService(mojo::PendingReceiver<mojom::TtsService> receiver);
  ~TtsService() override;

 private:
-  // TtsService:
-  void BindTtsStream(mojo::PendingReceiver<mojom::TtsStream> receiver) override;
+  // mojom::TtsService:
+  void BindTtsStream(
+      mojo::PendingReceiver<mojom::TtsStream> receiver,
+      mojo::PendingRemote<audio::mojom::StreamFactory> factory) override;

-  // TtsStream:
+  // mojom::TtsStream:
  void InstallVoice(const std::string& voice_name,
                    const std::vector<uint8_t>& voice_bytes,
                    InstallVoiceCallback callback) override;
  void SelectVoice(const std::string& voice_name,
                   SelectVoiceCallback callback) override;
-  void Init(const std::vector<uint8_t>& text_jspb,
-            InitCallback callback) override;
-  void Read(ReadCallback callback) override;
-  void Finalize() override;
+  void Speak(const std::vector<uint8_t>& text_jspb,
+             SpeakCallback callback) override;
+  void Stop() override;
+  void SetVolume(float volume) override;
+
+  // media::AudioRendererSink::RenderCallback:
+  int Render(base::TimeDelta delay,
+             base::TimeTicks delay_timestamp,
+             int prior_frames_skipped,
+             media::AudioBus* dest) override;
+  void OnRenderError() override;
+
+  // Handles stopping tts.
+  void StopLocked() EXCLUSIVE_LOCKS_REQUIRED(state_lock_);

-  LibChromeTtsLoader libchrometts_;
+  // Connection to tts in the browser.
  mojo::Receiver<mojom::TtsService> service_receiver_;
-  mojo::Receiver<mojom::TtsStream> stream_receiver_;
+
+  // Protects access to state from main thread and audio thread.
+  base::Lock state_lock_;
+
+  // Prebuilt.
+  LibChromeTtsLoader libchrometts_ GUARDED_BY(state_lock_);
+
+  // Connection to tts in the component extension.
+  mojo::Receiver<mojom::TtsStream> stream_receiver_ GUARDED_BY(state_lock_);
+
+  // Connection to send tts events to component extension.
+  mojo::Remote<mojom::TtsEventObserver> tts_event_observer_
+      GUARDED_BY(state_lock_);
+
+  // Outputs speech synthesis to audio.
+  std::unique_ptr<audio::OutputDevice> output_device_ GUARDED_BY(state_lock_);
+
+  // Tracks whether any audio data came as a result of |Speak|. Reset whenever
+  // speech is stopped (see |StopLocked|).
+  bool got_first_buffer_ GUARDED_BY(state_lock_);
 };

 }  // namespace tts

--- a/sandbox/policy/linux/bpf_tts_policy_linux.cc
+++ b/sandbox/policy/linux/bpf_tts_policy_linux.cc
@@ -25,6 +25,13 @@ TtsProcessPolicy::TtsProcessPolicy() {}
 TtsProcessPolicy::~TtsProcessPolicy() {}

 ResultExpr TtsProcessPolicy::EvaluateSyscall(int sysno) const {
+  switch (sysno) {
+    case __NR_sched_setscheduler:
+      return RestrictSchedTarget(GetPolicyPid(), sysno);
+    default:
+      break;
+  }
+
  auto* sandbox_linux = SandboxLinux::GetInstance();
  if (sandbox_linux->ShouldBrokerHandleSyscall(sysno))
    return sandbox_linux->HandleViaBroker();