Commit b69cbcbf authored by Evan Liu, committed by Commit Bot

Revert "Add protos required by Cloud Speech-to-Text"

This reverts commit bfe52a91.

Reason for revert: The Chrome security team has prohibited the use of gRPC in its current state as it does not go through Chrome's network stack. The Cloud Speech-to-Text implementation will be replaced with an alternative REST-based API that goes through the network stack.

Original change's description:
> Add protos required by Cloud Speech-to-Text
> 
> This CL adds the protos required by the Cloud Speech-to-Text service
> used by the Live Caption feature. The proto files come from Google3 with
> the following modifications:
> 
> 1. The package names of some of the protos were changed to avoid
>    conflicting with identical protos in the third_party/grpc repository.
> 
> 2. The license text was changed to align with Chromium conventions.
> 
> 3. any.proto and status.proto were renamed to prevent conflict with
>    protos in components/offline_pages/core/prefetch/proto.
> 
> 4. All protos are optimized for LITE_RUNTIME.
> 
> Bug: 1071626
> Change-Id: I351d98c9272e7ec7991a8182dbc305673c8e239c
> Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2153759
> Reviewed-by: Tommy Nyquist <nyquist@chromium.org>
> Reviewed-by: Yuwei Huang <yuweih@chromium.org>
> Reviewed-by: Albert J. Wong <ajwong@chromium.org>
> Commit-Queue: Evan Liu <evliu@google.com>
> Cr-Commit-Position: refs/heads/master@{#764033}

TBR=ajwong@chromium.org,nyquist@chromium.org,yuweih@chromium.org,lgrey@chromium.org,evliu@google.com

# Not skipping CQ checks because original CL landed > 1 day ago.

Bug: 1071626
Change-Id: Ie0f8b65ed29217cbbedb28e8551e49189028203e
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2210799
Reviewed-by: Evan Liu <evliu@google.com>
Reviewed-by: Tommy Nyquist <nyquist@chromium.org>
Commit-Queue: Evan Liu <evliu@google.com>
Cr-Commit-Position: refs/heads/master@{#771079}
parent 5a0d81b4
@@ -2,6 +2,4 @@ include_rules = [
   "+chrome/services/soda/internal",
   "+components/soda/constants.h",
   "+media",
-  "+third_party/grpc",
-  "+third_party/protobuf",
 ]
# Copyright 2020 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import("//third_party/grpc/grpc_library.gni")
import("//third_party/protobuf/proto_library.gni")
proto_library("proto") {
sources = [
"any_speech.proto",
"duration.proto",
"status_speech.proto",
]
}
cc_grpc_library("cloud_speech_library") {
sources = [ "cloud_speech.proto" ]
deps = [ ":proto" ]
}
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
syntax = "proto3";
package cloud_speech.proto;
option csharp_namespace = "Google.Protobuf.WellKnownTypes";
option go_package = "github.com/golang/protobuf/ptypes/any";
option java_package = "com.google.protobuf";
option java_outer_classname = "AnyProto";
option java_multiple_files = true;
option objc_class_prefix = "GPB";
option optimize_for = LITE_RUNTIME;
// `Any` contains an arbitrary serialized protocol buffer message along with a
// URL that describes the type of the serialized message.
//
// Protobuf library provides support to pack/unpack Any values in the form
// of utility functions or additional generated methods of the Any type.
//
// Example 1: Pack and unpack a message in C++.
//
// Foo foo = ...;
// Any any;
// any.PackFrom(foo);
// ...
// if (any.UnpackTo(&foo)) {
// ...
// }
//
// Example 2: Pack and unpack a message in Java.
//
// Foo foo = ...;
// Any any = Any.pack(foo);
// ...
// if (any.is(Foo.class)) {
// foo = any.unpack(Foo.class);
// }
//
// Example 3: Pack and unpack a message in Python.
//
// foo = Foo(...)
// any = Any()
// any.Pack(foo)
// ...
// if any.Is(Foo.DESCRIPTOR):
// any.Unpack(foo)
// ...
//
// Example 4: Pack and unpack a message in Go
//
// foo := &pb.Foo{...}
// any, err := ptypes.MarshalAny(foo)
// ...
// foo := &pb.Foo{}
// if err := ptypes.UnmarshalAny(any, foo); err != nil {
// ...
// }
//
// The pack methods provided by protobuf library will by default use
// 'type.googleapis.com/full.type.name' as the type URL and the unpack
// methods only use the fully qualified type name after the last '/'
// in the type URL, for example "foo.bar.com/x/y.z" will yield type
// name "y.z".
//
//
// JSON
// ====
// The JSON representation of an `Any` value uses the regular
// representation of the deserialized, embedded message, with an
// additional field `@type` which contains the type URL. Example:
//
// package google.profile;
// message Person {
// string first_name = 1;
// string last_name = 2;
// }
//
// {
// "@type": "type.googleapis.com/google.profile.Person",
// "firstName": <string>,
// "lastName": <string>
// }
//
// If the embedded message type is well-known and has a custom JSON
// representation, that representation will be embedded adding a field
// `value` which holds the custom JSON in addition to the `@type`
// field. Example (for message [cloud_speech.proto.Duration][]):
//
// {
// "@type": "type.googleapis.com/cloud_speech.proto.Duration",
// "value": "1.212s"
// }
//
message Any {
// A URL/resource name that uniquely identifies the type of the serialized
// protocol buffer message. This string must contain at least
// one "/" character. The last segment of the URL's path must represent
// the fully qualified name of the type (as in
// `path/cloud_speech.proto.Duration`). The name should be in a canonical form
// (e.g., leading "." is not accepted).
//
// In practice, teams usually precompile into the binary all types that they
// expect it to use in the context of Any. However, for URLs which use the
// scheme `http`, `https`, or no scheme, one can optionally set up a type
// server that maps type URLs to message definitions as follows:
//
// * If no scheme is provided, `https` is assumed.
// * An HTTP GET on the URL must yield a [google.protobuf.Type][]
// value in binary format, or produce an error.
// * Applications are allowed to cache lookup results based on the
// URL, or have them precompiled into a binary to avoid any
// lookup. Therefore, binary compatibility needs to be preserved
// on changes to types. (Use versioned type names to manage
// breaking changes.)
//
// Note: this functionality is not currently available in the official
// protobuf release, and it is not used for type URLs beginning with
// type.googleapis.com.
//
// Schemes other than `http`, `https` (or the empty scheme) might be
// used with implementation specific semantics.
//
string type_url = 1;
// Must be a valid serialized protocol buffer of the above specified type.
bytes value = 2;
}
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
syntax = "proto3";
package google.cloud.speech.v1;
import "duration.proto";
import "status_speech.proto";
option cc_enable_arenas = true;
option go_package = "google.golang.org/genproto/googleapis/cloud/speech/v1;speech";
option java_multiple_files = true;
option java_outer_classname = "SpeechProto";
option java_package = "com.google.cloud.speech.v1";
option objc_class_prefix = "GCS";
option optimize_for = LITE_RUNTIME;
// Service that implements Google Cloud Speech API.
service Speech {
// Performs bidirectional streaming speech recognition: receive results while
// sending audio. This method is only available via the gRPC API (not REST).
rpc StreamingRecognize(stream StreamingRecognizeRequest)
returns (stream StreamingRecognizeResponse) {}
}
// The top-level message sent by the client for the `Recognize` method.
message RecognizeRequest {
// Required. Provides information to the recognizer that specifies how to
// process the request.
RecognitionConfig config = 1;
// Required. The audio data to be recognized.
RecognitionAudio audio = 2;
}
// The top-level message sent by the client for the `StreamingRecognize` method.
// Multiple `StreamingRecognizeRequest` messages are sent. The first message
// must contain a `streaming_config` message and must not contain
// `audio_content`. All subsequent messages must contain `audio_content` and
// must not contain a `streaming_config` message.
message StreamingRecognizeRequest {
// The streaming request, which is either a streaming config or audio content.
oneof streaming_request {
// Provides information to the recognizer that specifies how to process the
// request. The first `StreamingRecognizeRequest` message must contain a
// `streaming_config` message.
StreamingRecognitionConfig streaming_config = 1;
// The audio data to be recognized. Sequential chunks of audio data are sent
// in sequential `StreamingRecognizeRequest` messages. The first
// `StreamingRecognizeRequest` message must not contain `audio_content` data
// and all subsequent `StreamingRecognizeRequest` messages must contain
// `audio_content` data. The audio bytes must be encoded as specified in
// `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
// pure binary representation (not base64). See
// [content limits](https://cloud.google.com/speech-to-text/quotas#content).
bytes audio_content = 2;
}
}
// Provides information to the recognizer that specifies how to process the
// request.
message StreamingRecognitionConfig {
// Required. Provides information to the recognizer that specifies how to
// process the request.
RecognitionConfig config = 1;
// If `false` or omitted, the recognizer will perform continuous
// recognition (continuing to wait for and process audio even if the user
// pauses speaking) until the client closes the input stream (gRPC API) or
// until the maximum time limit has been reached. May return multiple
// `StreamingRecognitionResult`s with the `is_final` flag set to `true`.
//
// If `true`, the recognizer will detect a single spoken utterance. When it
// detects that the user has paused or stopped speaking, it will return an
// `END_OF_SINGLE_UTTERANCE` event and cease recognition. It will return no
// more than one `StreamingRecognitionResult` with the `is_final` flag set to
// `true`.
bool single_utterance = 2;
// If `true`, interim results (tentative hypotheses) may be
// returned as they become available (these interim results are indicated with
// the `is_final=false` flag).
// If `false` or omitted, only `is_final=true` result(s) are returned.
bool interim_results = 3;
}
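// Illustrative sketch of the request sequence described above, written
// against the protoc-generated C++ bindings that would be produced for this
// package. The namespace alias, accessor names, and `audio_chunk` variable
// below are assumptions following the standard protobuf C++ generator, not
// part of this file:
//
//   namespace speech = ::google::cloud::speech::v1;
//
//   // The first request on the stream carries only the streaming config.
//   speech::StreamingRecognizeRequest first_request;
//   speech::StreamingRecognitionConfig* streaming_config =
//       first_request.mutable_streaming_config();
//   streaming_config->set_interim_results(true);
//   streaming_config->mutable_config()->set_language_code("en-US");
//
//   // Every subsequent request carries a chunk of audio bytes and nothing
//   // else.
//   speech::StreamingRecognizeRequest audio_request;
//   audio_request.set_audio_content(audio_chunk);  // audio_chunk: std::string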
// Provides information to the recognizer that specifies how to process the
// request.
message RecognitionConfig {
// The encoding of the audio data sent in the request.
//
// All encodings support only 1 channel (mono) audio, unless the
// `audio_channel_count` and `enable_separate_recognition_per_channel` fields
// are set.
//
// For best results, the audio source should be captured and transmitted using
// a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
// recognition can be reduced if lossy codecs are used to capture or transmit
// audio, particularly if background noise is present. Lossy codecs include
// `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, and `MP3`.
//
// The `FLAC` and `WAV` audio file formats include a header that describes the
// included audio content. You can request recognition for `WAV` files that
// contain either `LINEAR16` or `MULAW` encoded audio.
// If you send `FLAC` or `WAV` audio file format in
// your request, you do not need to specify an `AudioEncoding`; the audio
// encoding format is determined from the file header. If you specify
// an `AudioEncoding` when you send `FLAC` or `WAV` audio, the
// encoding configuration must match the encoding described in the audio
// header; otherwise the request returns an
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error
// code.
enum AudioEncoding {
// Not specified.
ENCODING_UNSPECIFIED = 0;
// Uncompressed 16-bit signed little-endian samples (Linear PCM).
LINEAR16 = 1;
// `FLAC` (Free Lossless Audio
// Codec) is the recommended encoding because it is
// lossless--therefore recognition is not compromised--and
// requires only about half the bandwidth of `LINEAR16`. `FLAC` stream
// encoding supports 16-bit and 24-bit samples, however, not all fields in
// `STREAMINFO` are supported.
FLAC = 2;
// 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
MULAW = 3;
// Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
AMR = 4;
// Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
AMR_WB = 5;
// Opus encoded audio frames in Ogg container
// ([OggOpus](https://wiki.xiph.org/OggOpus)).
// `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000.
OGG_OPUS = 6;
// Although the use of lossy encodings is not recommended, if a very low
// bitrate encoding is required, `OGG_OPUS` is highly preferred over
// Speex encoding. The [Speex](https://speex.org/) encoding supported by
// Cloud Speech API has a header byte in each block, as in MIME type
// `audio/x-speex-with-header-byte`.
// It is a variant of the RTP Speex encoding defined in
// [RFC 5574](https://tools.ietf.org/html/rfc5574).
// The stream is a sequence of blocks, one block per RTP packet. Each block
// starts with a byte containing the length of the block, in bytes, followed
// by one or more frames of Speex data, padded to an integral number of
// bytes (octets) as specified in RFC 5574. In other words, each RTP header
// is replaced with a single byte containing the block length. Only Speex
// wideband is supported. `sample_rate_hertz` must be 16000.
SPEEX_WITH_HEADER_BYTE = 7;
}
// Encoding of audio data sent in all `RecognitionAudio` messages.
// This field is optional for `FLAC` and `WAV` audio files and required
// for all other audio formats. For details, see
// [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
AudioEncoding encoding = 1;
// Sample rate in Hertz of the audio data sent in all
// `RecognitionAudio` messages. Valid values are: 8000-48000.
// 16000 is optimal. For best results, set the sampling rate of the audio
// source to 16000 Hz. If that's not possible, use the native sample rate of
// the audio source (instead of re-sampling).
// This field is optional for FLAC and WAV audio files, but is
// required for all other audio formats. For details, see
// [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
int32 sample_rate_hertz = 2;
// The number of channels in the input audio data.
// ONLY set this for MULTI-CHANNEL recognition.
// Valid values for LINEAR16 and FLAC are `1`-`8`.
// Valid values for OGG_OPUS are '1'-'254'.
// Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`.
// If `0` or omitted, defaults to one channel (mono).
// Note: We only recognize the first channel by default.
// To perform independent recognition on each channel set
// `enable_separate_recognition_per_channel` to 'true'.
int32 audio_channel_count = 7;
// This needs to be set to `true` explicitly and `audio_channel_count` > 1
// to get each channel recognized separately. The recognition result will
// contain a `channel_tag` field to state which channel that result belongs
// to. If this is not true, we will only recognize the first channel. The
// request is billed cumulatively for all channels recognized:
// `audio_channel_count` multiplied by the length of the audio.
bool enable_separate_recognition_per_channel = 12;
// Required. The language of the supplied audio as a
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
// Example: "en-US".
// See [Language
// Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
// of the currently supported language codes.
string language_code = 3;
// Maximum number of recognition hypotheses to be returned.
// Specifically, the maximum number of `SpeechRecognitionAlternative` messages
// within each `SpeechRecognitionResult`.
// The server may return fewer than `max_alternatives`.
// Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
// one. If omitted, will return a maximum of one.
int32 max_alternatives = 4;
// If set to `true`, the server will attempt to filter out
// profanities, replacing all but the initial character in each filtered word
// with asterisks, e.g. "f***". If set to `false` or omitted, profanities
// won't be filtered out.
bool profanity_filter = 5;
// Array of [SpeechContext][google.cloud.speech.v1.SpeechContext].
// A means to provide context to assist the speech recognition. For more
// information, see
// [speech
// adaptation](https://cloud.google.com/speech-to-text/docs/context-strength).
repeated SpeechContext speech_contexts = 6;
// If `true`, the top result includes a list of words and
// the start and end time offsets (timestamps) for those words. If
// `false`, no word-level time offset information is returned. The default is
// `false`.
bool enable_word_time_offsets = 8;
// If 'true', adds punctuation to recognition result hypotheses.
// This feature is only available in select languages. Setting this for
// requests in other languages has no effect at all.
// The default 'false' value does not add punctuation to result hypotheses.
// Note: This is currently offered as an experimental service, complimentary
// to all users. In the future this may be exclusively available as a
// premium feature.
bool enable_automatic_punctuation = 11;
// Metadata regarding this request.
RecognitionMetadata metadata = 9;
// Which model to select for the given request. Select the model
// best suited to your domain to get best results. If a model is not
// explicitly specified, then we auto-select a model based on the parameters
// in the RecognitionConfig.
// <table>
// <tr>
// <td><b>Model</b></td>
// <td><b>Description</b></td>
// </tr>
// <tr>
// <td><code>command_and_search</code></td>
// <td>Best for short queries such as voice commands or voice search.</td>
// </tr>
// <tr>
// <td><code>phone_call</code></td>
// <td>Best for audio that originated from a phone call (typically
// recorded at an 8khz sampling rate).</td>
// </tr>
// <tr>
// <td><code>video</code></td>
// <td>Best for audio that originated from video or includes multiple
// speakers. Ideally the audio is recorded at a 16khz or greater
// sampling rate. This is a premium model that costs more than the
// standard rate.</td>
// </tr>
// <tr>
// <td><code>default</code></td>
// <td>Best for audio that is not one of the specific audio models.
// For example, long-form audio. Ideally the audio is high-fidelity,
// recorded at a 16khz or greater sampling rate.</td>
// </tr>
// </table>
string model = 13;
// Set to true to use an enhanced model for speech recognition.
// If `use_enhanced` is set to true and the `model` field is not set, then
// an appropriate enhanced model is chosen if an enhanced model exists for
// the audio.
//
// If `use_enhanced` is true and an enhanced version of the specified model
// does not exist, then the speech is recognized using the standard version
// of the specified model.
bool use_enhanced = 14;
}
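// A minimal sketch of a typical configuration, reusing the `speech` namespace
// alias from the earlier sketch and the standard protoc-generated C++
// accessors; the chosen values are illustrative only:
//
//   speech::RecognitionConfig config;
//   config.set_encoding(speech::RecognitionConfig::LINEAR16);
//   config.set_sample_rate_hertz(16000);        // optimal rate per the docs
//   config.set_language_code("en-US");          // required BCP-47 tag
//   config.set_enable_automatic_punctuation(true);
//   config.set_max_alternatives(1);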
// Description of audio data to be recognized.
message RecognitionMetadata {
// Use case categories that the audio recognition request can be described
// by.
enum InteractionType {
// Use case is either unknown or is something other than one of the other
// values below.
INTERACTION_TYPE_UNSPECIFIED = 0;
// Multiple people in a conversation or discussion. For example in a
// meeting with two or more people actively participating. Typically
// all the primary people speaking would be in the same room (if not,
// see PHONE_CALL)
DISCUSSION = 1;
// One or more persons lecturing or presenting to others, mostly
// uninterrupted.
PRESENTATION = 2;
// A phone-call or video-conference in which two or more people, who are
// not in the same room, are actively participating.
PHONE_CALL = 3;
// A recorded message intended for another person to listen to.
VOICEMAIL = 4;
// Professionally produced audio (eg. TV Show, Podcast).
PROFESSIONALLY_PRODUCED = 5;
// Transcribe spoken questions and queries into text.
VOICE_SEARCH = 6;
// Transcribe voice commands, such as for controlling a device.
VOICE_COMMAND = 7;
// Transcribe speech to text to create a written document, such as a
// text-message, email or report.
DICTATION = 8;
}
// The use case most closely describing the audio content to be recognized.
InteractionType interaction_type = 1;
// The industry vertical to which this speech recognition request most
// closely applies. This is most indicative of the topics contained
// in the audio. Use the 6-digit NAICS code to identify the industry
// vertical - see https://www.naics.com/search/.
uint32 industry_naics_code_of_audio = 3;
// The device used to make the recording. Examples 'Nexus 5X' or
// 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or
// 'Cardioid Microphone'.
string recording_device_name = 7;
// Mime type of the original audio file. For example `audio/m4a`,
// `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
// A list of possible audio mime types is maintained at
// http://www.iana.org/assignments/media-types/media-types.xhtml#audio
string original_mime_type = 8;
// Description of the content. Eg. "Recordings of federal supreme court
// hearings from 2012".
string audio_topic = 10;
}
// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
// A list of strings containing words and phrases "hints" so that
// the speech recognition is more likely to recognize them. This can be used
// to improve the accuracy for specific words and phrases, for example, if
// specific commands are typically spoken by the user. This can also be used
// to add additional words to the vocabulary of the recognizer. See
// [usage limits](https://cloud.google.com/speech-to-text/quotas#content).
//
// List items can also be set to classes for groups of words that represent
// common concepts that occur in natural language. For example, rather than
// providing phrase hints for every month of the year, using the $MONTH class
// improves the likelihood of correctly transcribing audio that includes
// months.
repeated string phrases = 1;
}
// Contains audio data in the encoding specified in the `RecognitionConfig`.
// Either `content` or `uri` must be supplied. Supplying both or neither
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
// See [content limits](https://cloud.google.com/speech-to-text/quotas#content).
message RecognitionAudio {
// The audio source, which is either inline content or a Google Cloud
// Storage uri.
oneof audio_source {
// The audio data bytes encoded as specified in
// `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
// pure binary representation, whereas JSON representations use base64.
bytes content = 1;
// URI that points to a file that contains audio data bytes as specified in
// `RecognitionConfig`. The file must not be compressed (for example, gzip).
// Currently, only Google Cloud Storage URIs are
// supported, which must be specified in the following format:
// `gs://bucket_name/object_name` (other URI formats return
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]).
// For more information, see [Request
// URIs](https://cloud.google.com/storage/docs/reference-uris).
string uri = 2;
}
}
// The only message returned to the client by the `Recognize` method. It
// contains the result as zero or more sequential `SpeechRecognitionResult`
// messages.
message RecognizeResponse {
// Sequential list of transcription results corresponding to
// sequential portions of audio.
repeated SpeechRecognitionResult results = 2;
}
// `StreamingRecognizeResponse` is the only message returned to the client by
// `StreamingRecognize`. A series of zero or more `StreamingRecognizeResponse`
// messages are streamed back to the client. If there is no recognizable
// audio, and `single_utterance` is set to false, then no messages are streamed
// back to the client.
//
// Here's an example of a series of seven `StreamingRecognizeResponse`s that might
// be returned while processing audio:
//
// 1. results { alternatives { transcript: "tube" } stability: 0.01 }
//
// 2. results { alternatives { transcript: "to be a" } stability: 0.01 }
//
// 3. results { alternatives { transcript: "to be" } stability: 0.9 }
// results { alternatives { transcript: " or not to be" } stability: 0.01 }
//
// 4. results { alternatives { transcript: "to be or not to be"
// confidence: 0.92 }
// alternatives { transcript: "to bee or not to bee" }
// is_final: true }
//
// 5. results { alternatives { transcript: " that's" } stability: 0.01 }
//
// 6. results { alternatives { transcript: " that is" } stability: 0.9 }
// results { alternatives { transcript: " the question" } stability: 0.01 }
//
// 7. results { alternatives { transcript: " that is the question"
// confidence: 0.98 }
// alternatives { transcript: " that was the question" }
// is_final: true }
//
// Notes:
//
// - Only two of the above responses #4 and #7 contain final results; they are
// indicated by `is_final: true`. Concatenating these together generates the
// full transcript: "to be or not to be that is the question".
//
// - The others contain interim `results`. #3 and #6 contain two interim
// `results`: the first portion has a high stability and is less likely to
// change; the second portion has a low stability and is very likely to
// change. A UI designer might choose to show only high stability `results`.
//
// - The specific `stability` and `confidence` values shown above are only for
// illustrative purposes. Actual values may vary.
//
// - In each response, only one of these fields will be set:
// `error`,
// `speech_event_type`, or
// one or more (repeated) `results`.
message StreamingRecognizeResponse {
// Indicates the type of speech event.
enum SpeechEventType {
// No speech event specified.
SPEECH_EVENT_UNSPECIFIED = 0;
// This event indicates that the server has detected the end of the user's
// speech utterance and expects no additional speech. Therefore, the server
// will not process additional audio (although it may subsequently return
// additional results). The client should stop sending additional audio
// data, half-close the gRPC connection, and wait for any additional results
// until the server closes the gRPC connection. This event is only sent if
// `single_utterance` was set to `true`, and is not used otherwise.
END_OF_SINGLE_UTTERANCE = 1;
}
// If set, returns a [google.rpc.Status][google.rpc.Status] message that
// specifies the error for the operation.
google.rpc.Status error = 1;
// This repeated list contains zero or more results that
// correspond to consecutive portions of the audio currently being processed.
// It contains zero or one `is_final=true` result (the newly settled portion),
// followed by zero or more `is_final=false` results (the interim results).
repeated StreamingRecognitionResult results = 2;
// Indicates the type of speech event.
SpeechEventType speech_event_type = 4;
}
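// Sketch of how a client might consume one received response (here named
// `response`) using the generated C++ bindings. The field accessors follow
// standard protobuf conventions; `ShowInterimCaption` is a hypothetical UI
// hook, not an API from this file:
//
//   std::string final_transcript;
//   for (const speech::StreamingRecognitionResult& result : response.results()) {
//     if (result.alternatives_size() == 0)
//       continue;
//     const std::string& text = result.alternatives(0).transcript();
//     if (result.is_final())
//       final_transcript += text;   // settled portion, safe to append
//     else
//       ShowInterimCaption(text);   // interim text may still change
//   }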
// A streaming speech recognition result corresponding to a portion of the audio
// that is currently being processed.
message StreamingRecognitionResult {
// May contain one or more recognition hypotheses (up to the
// maximum specified in `max_alternatives`).
// These alternatives are ordered in terms of accuracy, with the top (first)
// alternative being the most probable, as ranked by the recognizer.
repeated SpeechRecognitionAlternative alternatives = 1;
// If `false`, this `StreamingRecognitionResult` represents an
// interim result that may change. If `true`, this is the final time the
// speech service will return this particular `StreamingRecognitionResult`;
// the recognizer will not return any further hypotheses for this portion of
// the transcript and corresponding audio.
bool is_final = 2;
// An estimate of the likelihood that the recognizer will not
// change its guess about this interim result. Values range from 0.0
// (completely unstable) to 1.0 (completely stable).
// This field is only provided for interim results (`is_final=false`).
// The default of 0.0 is a sentinel value indicating `stability` was not set.
float stability = 3;
// Time offset of the end of this result relative to the
// beginning of the audio.
cloud_speech.proto.Duration result_end_time = 4;
// For multi-channel audio, this is the channel number corresponding to the
// recognized result for the audio from that channel.
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 5;
// The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of
// the language in this result. This language code was detected to have the
// most likelihood of being spoken in the audio.
string language_code = 6;
}
// A speech recognition result corresponding to a portion of the audio.
message SpeechRecognitionResult {
// May contain one or more recognition hypotheses (up to the
// maximum specified in `max_alternatives`).
// These alternatives are ordered in terms of accuracy, with the top (first)
// alternative being the most probable, as ranked by the recognizer.
repeated SpeechRecognitionAlternative alternatives = 1;
// For multi-channel audio, this is the channel number corresponding to the
// recognized result for the audio from that channel.
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 2;
}
// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
// Transcript text representing the words that the user spoke.
string transcript = 1;
// The confidence estimate between 0.0 and 1.0. A higher number
// indicates an estimated greater likelihood that the recognized words are
// correct. This field is set only for the top alternative of a non-streaming
// result or of a streaming result where `is_final=true`.
// This field is not guaranteed to be accurate and users should not rely on it
// to be always provided.
// The default of 0.0 is a sentinel value indicating `confidence` was not set.
float confidence = 2;
}
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
syntax = "proto3";
package cloud_speech.proto;
option csharp_namespace = "Google.Protobuf.WellKnownTypes";
option cc_enable_arenas = true;
option go_package = "github.com/golang/protobuf/ptypes/duration";
option java_package = "com.google.protobuf";
option java_outer_classname = "DurationProto";
option java_multiple_files = true;
option objc_class_prefix = "GPB";
option optimize_for = LITE_RUNTIME;
// A Duration represents a signed, fixed-length span of time represented
// as a count of seconds and fractions of seconds at nanosecond
// resolution. It is independent of any calendar and concepts like "day"
// or "month". It is related to Timestamp in that the difference between
// two Timestamp values is a Duration and it can be added or subtracted
// from a Timestamp. Range is approximately +-10,000 years.
//
// # Examples
//
// Example 1: Compute Duration from two Timestamps in pseudo code.
//
// Timestamp start = ...;
// Timestamp end = ...;
// Duration duration = ...;
//
// duration.seconds = end.seconds - start.seconds;
// duration.nanos = end.nanos - start.nanos;
//
// if (duration.seconds < 0 && duration.nanos > 0) {
// duration.seconds += 1;
// duration.nanos -= 1000000000;
// } else if (duration.seconds > 0 && duration.nanos < 0) {
// duration.seconds -= 1;
// duration.nanos += 1000000000;
// }
//
// Example 2: Compute Timestamp from Timestamp + Duration in pseudo code.
//
// Timestamp start = ...;
// Duration duration = ...;
// Timestamp end = ...;
//
// end.seconds = start.seconds + duration.seconds;
// end.nanos = start.nanos + duration.nanos;
//
// if (end.nanos < 0) {
// end.seconds -= 1;
// end.nanos += 1000000000;
// } else if (end.nanos >= 1000000000) {
// end.seconds += 1;
// end.nanos -= 1000000000;
// }
//
// Example 3: Compute Duration from datetime.timedelta in Python.
//
// td = datetime.timedelta(days=3, minutes=10)
// duration = Duration()
// duration.FromTimedelta(td)
//
// # JSON Mapping
//
// In JSON format, the Duration type is encoded as a string rather than an
// object, where the string ends in the suffix "s" (indicating seconds) and
// is preceded by the number of seconds, with nanoseconds expressed as
// fractional seconds. For example, 3 seconds with 0 nanoseconds should be
// encoded in JSON format as "3s", while 3 seconds and 1 nanosecond should
// be expressed in JSON format as "3.000000001s", and 3 seconds and 1
// microsecond should be expressed in JSON format as "3.000001s".
//
//
message Duration {
// Signed seconds of the span of time. Must be from -315,576,000,000
// to +315,576,000,000 inclusive. Note: these bounds are computed from:
// 60 sec/min * 60 min/hr * 24 hr/day * 365.25 days/year * 10000 years
int64 seconds = 1;
// Signed fractions of a second at nanosecond resolution of the span
// of time. Durations less than one second are represented with a 0
// `seconds` field and a positive or negative `nanos` field. For durations
// of one second or more, a non-zero value for the `nanos` field must be
// of the same sign as the `seconds` field. Must be from -999,999,999
// to +999,999,999 inclusive.
int32 nanos = 2;
}
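// For reference, converting a Duration to fractional seconds in C++ is a
// single expression over the two fields above (a sketch, assuming the
// protoc-generated accessors for this package):
//
//   double ToSeconds(const cloud_speech::proto::Duration& d) {
//     return static_cast<double>(d.seconds()) + d.nanos() / 1e9;
//   }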
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
syntax = "proto3";
package google.rpc;
import "any_speech.proto";
option go_package = "google.golang.org/genproto/googleapis/rpc/status;status";
option java_multiple_files = true;
option java_outer_classname = "StatusProto";
option java_package = "com.google.rpc";
option objc_class_prefix = "RPC";
option optimize_for = LITE_RUNTIME;
// The `Status` type defines a logical error model that is suitable for
// different programming environments, including REST APIs and RPC APIs. It is
// used by [gRPC](https://github.com/grpc). The error model is designed to be:
//
// - Simple to use and understand for most users
// - Flexible enough to meet unexpected needs
//
// # Overview
//
// The `Status` message contains three pieces of data: error code, error
// message, and error details. The error code should be an enum value of
// [google.rpc.Code][google.rpc.Code], but it may accept additional error codes
// if needed. The error message should be a developer-facing English message
// that helps developers *understand* and *resolve* the error. If a localized
// user-facing error message is needed, put the localized message in the error
// details or localize it in the client. The optional error details may contain
// arbitrary information about the error. There is a predefined set of error
// detail types in the package `google.rpc` that can be used for common error
// conditions.
//
// # Language mapping
//
// The `Status` message is the logical representation of the error model, but it
// is not necessarily the actual wire format. When the `Status` message is
// exposed in different client libraries and different wire protocols, it can be
// mapped differently. For example, it will likely be mapped to some exceptions
// in Java, but more likely mapped to some error codes in C.
//
// # Other uses
//
// The error model and the `Status` message can be used in a variety of
// environments, either with or without APIs, to provide a
// consistent developer experience across different environments.
//
// Example uses of this error model include:
//
// - Partial errors. If a service needs to return partial errors to the client,
// it may embed the `Status` in the normal response to indicate the partial
// errors.
//
// - Workflow errors. A typical workflow has multiple steps. Each step may
// have a `Status` message for error reporting.
//
// - Batch operations. If a client uses batch request and batch response, the
// `Status` message should be used directly inside batch response, one for
// each error sub-response.
//
// - Asynchronous operations. If an API call embeds asynchronous operation
// results in its response, the status of those operations should be
// represented directly using the `Status` message.
//
// - Logging. If some API errors are stored in logs, the message `Status` could
// be used directly after any stripping needed for security/privacy reasons.
message Status {
// The status code, which should be an enum value of
// [google.rpc.Code][google.rpc.Code].
int32 code = 1;
// A developer-facing error message, which should be in English. Any
// user-facing error message should be localized and sent in the
// [google.rpc.Status.details][google.rpc.Status.details] field, or localized
// by the client.
string message = 2;
// A list of messages that carry the error details. There is a common set of
// message types for APIs to use.
repeated cloud_speech.proto.Any details = 3;
}
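// Sketch of how a caller might surface this error from a received
// `StreamingRecognizeResponse` named `response`, assuming the generated C++
// accessors; `LogError` is a hypothetical helper:
//
//   if (response.has_error()) {
//     const google::rpc::Status& status = response.error();
//     LogError(status.code(), status.message());
//   }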