Commit

feat: add alternative_language_codes to RecognitionConfig (#824)
- [ ] Regenerate this pull request now.

PiperOrigin-RevId: 413453425

Source-Link: googleapis/googleapis@2b47b24

Source-Link: googleapis/googleapis-gen@7ffe6e0
Copy-Tag: eyJwIjoiLmdpdGh1Yi8uT3dsQm90LnlhbWwiLCJoIjoiN2ZmZTZlMGExYmY2M2Q4NTQwMDA5Y2U2OTg2NjBlYmI3MWM1NGZmMSJ9

feat: add WEBM_OPUS codec
feat: add SpeechAdaptation configuration
feat: add word confidence
feat: add spoken punctuation and spoken emojis
feat: add hint boost in SpeechContext
gcf-owl-bot[bot] authored Dec 6, 2021
1 parent e9bdfd7 commit 3860264
Showing 8 changed files with 3,086 additions and 417 deletions.
@@ -19,6 +19,7 @@ package google.cloud.speech.v1;
import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/cloud/speech/v1/resource.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/any.proto";
import "google/protobuf/duration.proto";
@@ -181,7 +182,8 @@ message RecognitionConfig {
// a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
// recognition can be reduced if lossy codecs are used to capture or transmit
// audio, particularly if background noise is present. Lossy codecs include
// `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, `MP3`.
// `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, `MP3`,
// and `WEBM_OPUS`.
//
// The `FLAC` and `WAV` audio file formats include a header that describes the
// included audio content. You can request recognition for `WAV` files that
@@ -236,6 +238,11 @@ message RecognitionConfig {
// is replaced with a single byte containing the block length. Only Speex
// wideband is supported. `sample_rate_hertz` must be 16000.
SPEEX_WITH_HEADER_BYTE = 7;

// Opus encoded audio frames in WebM container
// ([OggOpus](https://wiki.xiph.org/OggOpus)). `sample_rate_hertz` must be
// one of 8000, 12000, 16000, 24000, or 48000.
WEBM_OPUS = 9;
}
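
A minimal request-config sketch for the new codec, assuming a Python google-cloud-speech client generated from this proto (client setup and audio omitted; the enum name is taken from the value above):

```python
from google.cloud import speech  # assumed: google-cloud-speech 2.x client

# WEBM_OPUS only accepts the sample rates listed above; 48000 Hz is typical for Opus.
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.WEBM_OPUS,
    sample_rate_hertz=48000,
    language_code="en-US",
)
```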

// Encoding of audio data sent in all `RecognitionAudio` messages.
@@ -279,6 +286,20 @@ message RecognitionConfig {
// of the currently supported language codes.
string language_code = 3 [(google.api.field_behavior) = REQUIRED];

// A list of up to 3 additional
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags,
// listing possible alternative languages of the supplied audio.
// See [Language
// Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
// of the currently supported language codes. If alternative languages are
// listed, the recognition result will contain recognition in the most likely
// language detected, with the main language_code among the candidates. The
// recognition result will include the language tag of the language detected
// in the audio. Note:
// This feature is only supported for Voice Command and Voice Search use cases
// and performance may vary for other use cases (e.g., phone call
// transcription).
repeated string alternative_language_codes = 18;
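
A hedged sketch of the new field via the Python client, assuming the generated `RecognitionConfig` exposes `alternative_language_codes` as a repeated string:

```python
from google.cloud import speech  # assumed: google-cloud-speech 2.x client

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",                          # main language
    alternative_language_codes=["es-ES", "fr-FR"],  # up to 3 alternatives
)

# The detected language is reported per result, e.g.:
#   for result in response.results:
#       print(result.language_code, result.alternatives[0].transcript)
```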

// Maximum number of recognition hypotheses to be returned.
// Specifically, the maximum number of `SpeechRecognitionAlternative` messages
// within each `SpeechRecognitionResult`.
@@ -293,6 +314,13 @@ message RecognitionConfig {
// won't be filtered out.
bool profanity_filter = 5;

// Speech adaptation configuration improves the accuracy of speech
// recognition. For more information, see the [speech
// adaptation](https://cloud.google.com/speech-to-text/docs/adaptation)
// documentation.
// When speech adaptation is set, it supersedes the `speech_contexts` field.
SpeechAdaptation adaptation = 20;
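
A minimal sketch of wiring `adaptation` into a request with the Python client, assuming a pre-created PhraseSet resource (the resource name below is hypothetical):

```python
from google.cloud import speech  # assumed: google-cloud-speech 2.x client

# Referencing an existing PhraseSet resource by name; when adaptation is set,
# it supersedes any speech_contexts also present on the config.
config = speech.RecognitionConfig(
    language_code="en-US",
    adaptation=speech.SpeechAdaptation(
        phrase_set_references=[
            "projects/my-project/locations/global/phraseSets/my-phrase-set"  # hypothetical
        ]
    ),
)
```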

// Array of [SpeechContext][google.cloud.speech.v1.SpeechContext].
// A means to provide context to assist the speech recognition. For more
// information, see
@@ -306,12 +334,33 @@ message RecognitionConfig {
// `false`.
bool enable_word_time_offsets = 8;

// If `true`, the top result includes a list of words and the
// confidence for those words. If `false`, no word-level confidence
// information is returned. The default is `false`.
bool enable_word_confidence = 15;

// If 'true', adds punctuation to recognition result hypotheses.
// This feature is only available in select languages. Setting this for
// requests in other languages has no effect at all.
// The default 'false' value does not add punctuation to result hypotheses.
bool enable_automatic_punctuation = 11;

// The spoken punctuation behavior for the call.
// If not set, uses default behavior based on model of choice,
// e.g. command_and_search will enable spoken punctuation by default.
// If 'true', replaces spoken punctuation with the corresponding symbols in
// the request. For example, "how are you question mark" becomes "how are
// you?". See https://cloud.google.com/speech-to-text/docs/spoken-punctuation
// for support. If 'false', spoken punctuation is not replaced.
google.protobuf.BoolValue enable_spoken_punctuation = 22;

// The spoken emoji behavior for the call.
// If not set, uses default behavior based on model of choice.
// If 'true', adds spoken emoji formatting for the request. This will replace
// spoken emojis with the corresponding Unicode symbols in the final
// transcript. If 'false', spoken emojis are not replaced.
google.protobuf.BoolValue enable_spoken_emojis = 23;
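
A hedged sketch combining the three new flags with the Python client; `enable_spoken_punctuation` and `enable_spoken_emojis` are `BoolValue` wrappers, which the proto-plus layer is expected to accept as plain booleans:

```python
from google.cloud import speech  # assumed: google-cloud-speech 2.x client

config = speech.RecognitionConfig(
    language_code="en-US",
    enable_word_confidence=True,     # per-word confidence on the top alternative
    enable_spoken_punctuation=True,  # "question mark" -> "?"
    enable_spoken_emojis=True,       # spoken emojis -> Unicode symbols
)

# Word-level confidence then appears on the top alternative of final results:
#   for word in response.results[0].alternatives[0].words:
#       print(word.word, word.confidence)
```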

// Config to enable speaker diarization and set additional
// parameters to make diarization better suited for your application.
// Note: When this is enabled, we send all the words from the beginning of the
@@ -537,6 +586,16 @@ message SpeechContext {
// improves the likelihood of correctly transcribing audio that includes
// months.
repeated string phrases = 1;

// Hint Boost. Positive value will increase the probability that a specific
// phrase will be recognized over other similar sounding phrases. The higher
// the boost, the higher the chance of false positive recognition as well.
// Negative boost values would correspond to anti-biasing. Anti-biasing is not
// enabled, so negative boost will simply be ignored. Though `boost` can
// accept a wide range of positive values, most use cases are best served with
// values between 0 and 20. We recommend using a binary search approach to
// finding the optimal value for your use case.
float boost = 4;
}
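
A minimal sketch of a boosted `SpeechContext` with the Python client (phrase values are illustrative):

```python
from google.cloud import speech  # assumed: google-cloud-speech 2.x client

# Boost applies to the phrases in this context only; 0-20 is the recommended
# starting range, and negative values are ignored (no anti-biasing).
config = speech.RecognitionConfig(
    language_code="en-US",
    speech_contexts=[
        speech.SpeechContext(phrases=["weather", "forecast"], boost=10.0),
    ],
)
```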

// Contains audio data in the encoding specified in the `RecognitionConfig`.
@@ -587,6 +646,12 @@ message LongRunningRecognizeResponse {

// When available, billed audio seconds for the corresponding request.
google.protobuf.Duration total_billed_time = 3;

// Original output config if present in the request.
TranscriptOutputConfig output_config = 6;

// If the transcript output fails this field contains the relevant error.
google.rpc.Status output_error = 7;
}
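
A hedged sketch of inspecting the two new response fields once a `LongRunningRecognize` operation completes (how the operation is started is omitted; attribute names follow the proto above):

```python
from google.cloud import speech  # assumed: google-cloud-speech 2.x client

def check_transcript_output(response: speech.LongRunningRecognizeResponse) -> None:
    # output_error is a google.rpc.Status; a non-zero code signals that writing
    # the transcript to the requested destination failed.
    if response.output_error.code != 0:
        print("Transcript export failed:", response.output_error.message)
    elif response.output_config.gcs_uri:
        print("Transcript written to:", response.output_config.gcs_uri)
```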

// Describes the progress of a long-running `LongRunningRecognize` call. It is
@@ -723,11 +788,10 @@ message StreamingRecognitionResult {
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 5;

// The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of
// the language in this result. This language code was detected to have the
// most likelihood of being spoken in the audio.
string language_code = 6
[(google.api.field_behavior) = OUTPUT_ONLY];
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
// of the language in this result. This language code was detected to have
// the most likelihood of being spoken in the audio.
string language_code = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// A speech recognition result corresponding to a portion of the audio.
@@ -742,6 +806,15 @@ message SpeechRecognitionResult {
// recognized result for the audio from that channel.
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 2;

// Time offset of the end of this result relative to the
// beginning of the audio.
google.protobuf.Duration result_end_time = 4;

// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
// of the language in this result. This language code was detected to have
// the most likelihood of being spoken in the audio.
string language_code = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Alternative hypotheses (a.k.a. n-best list).
@@ -785,6 +858,15 @@ message WordInfo {
// The word corresponding to this set of information.
string word = 3;

// The confidence estimate between 0.0 and 1.0. A higher number
// indicates an estimated greater likelihood that the recognized words are
// correct. This field is set only for the top alternative of a non-streaming
// result or of a streaming result where `is_final=true`.
// This field is not guaranteed to be accurate and users should not rely on it
// to be always provided.
// The default of 0.0 is a sentinel value indicating `confidence` was not set.
float confidence = 4;

// Output only. A distinct integer value is assigned for every speaker within
// the audio. This field specifies which one of those speakers was detected to
// have spoken this word. Value ranges from '1' to diarization_speaker_count.
@@ -0,0 +1,140 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.speech.v1;

import "google/api/resource.proto";
import "google/protobuf/timestamp.proto";
import "google/api/annotations.proto";

option cc_enable_arenas = true;
option go_package = "google.golang.org/genproto/googleapis/cloud/speech/v1;speech";
option java_multiple_files = true;
option java_outer_classname = "SpeechResourceProto";
option java_package = "com.google.cloud.speech.v1";
option objc_class_prefix = "GCS";

// A set of words or phrases that represents a common concept likely to appear
// in your audio, for example a list of passenger ship names. CustomClass items
// can be substituted into placeholders that you set in PhraseSet phrases.
message CustomClass {
option (google.api.resource) = {
type: "speech.googleapis.com/CustomClass"
pattern: "projects/{project}/locations/{location}/customClasses/{custom_class}"
};

// An item of the class.
message ClassItem {
// The class item's value.
string value = 1;
}

// The resource name of the custom class.
string name = 1;

// If this custom class is a resource, the custom_class_id is the resource id
// of the CustomClass. Case sensitive.
string custom_class_id = 2;

// A collection of class items.
repeated ClassItem items = 3;
}

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message PhraseSet {
option (google.api.resource) = {
type: "speech.googleapis.com/PhraseSet"
pattern: "projects/{project}/locations/{location}/phraseSets/{phrase_set}"
};

// A phrase containing words and phrase "hints" so that
// the speech recognition is more likely to recognize them. This can be used
// to improve the accuracy for specific words and phrases, for example, if
// specific commands are typically spoken by the user. This can also be used
// to add additional words to the vocabulary of the recognizer. See
// [usage limits](https://cloud.google.com/speech-to-text/quotas#content).
//
// List items can also include pre-built or custom classes containing groups
// of words that represent common concepts that occur in natural language. For
// example, rather than providing a phrase hint for every month of the
// year (e.g. "i was born in january", "i was born in febuary", ...), use the
// pre-built `$MONTH` class improves the likelihood of correctly transcribing
// audio that includes months (e.g. "i was born in $month").
// To refer to pre-built classes, use the class' symbol prepended with `$`
// e.g. `$MONTH`. To refer to custom classes that were defined inline in the
// request, set the class's `custom_class_id` to a string unique to all class
// resources and inline classes. Then use the class' id wrapped in `${...}`
// e.g. "${my-months}". To refer to custom classes resources, use the class'
// id wrapped in `${}` (e.g. `${my-months}`).
//
// Speech-to-Text supports three locations: `global`, `us` (US North America),
// and `eu` (Europe). If you are calling the `speech.googleapis.com`
// endpoint, use the `global` location. To specify a region, use a
// [regional endpoint](/speech-to-text/docs/endpoints) with matching `us` or
// `eu` location value.
message Phrase {
// The phrase itself.
string value = 1;

// Hint Boost. Overrides the boost set at the phrase set level.
// Positive value will increase the probability that a specific phrase will
// be recognized over other similar sounding phrases. The higher the boost,
// the higher the chance of false positive recognition as well. Negative
// boost will simply be ignored. Though `boost` can accept a wide range of
// positive values, most use cases are best served
// with values between 0 and 20. We recommend using a binary search approach
// to finding the optimal value for your use case. Speech recognition
// will skip PhraseSets with a boost value of 0.
float boost = 2;
}

// The resource name of the phrase set.
string name = 1;

// A list of word and phrases.
repeated Phrase phrases = 2;

// Hint Boost. Positive value will increase the probability that a specific
// phrase will be recognized over other similar sounding phrases. The higher
// the boost, the higher the chance of false positive recognition as well.
// Negative boost values would correspond to anti-biasing. Anti-biasing is not
// enabled, so negative boost will simply be ignored. Though `boost` can
// accept a wide range of positive values, most use cases are best served with
// values between 0 (exclusive) and 20. We recommend using a binary search
// approach to finding the optimal value for your use case. Speech recognition
// will skip PhraseSets with a boost value of 0.
float boost = 4;
}

// Speech adaptation configuration.
message SpeechAdaptation {
// A collection of phrase sets. To specify the hints inline, leave the
// phrase set's `name` blank and fill in the rest of its fields. Any
// phrase set can use any custom class.
repeated PhraseSet phrase_sets = 1;

// A collection of phrase set resource names to use.
repeated string phrase_set_references = 2 [(google.api.resource_reference) = {
type: "speech.googleapis.com/PhraseSet"
}];

// A collection of custom classes. To specify the classes inline, leave the
// class' `name` blank and fill in the rest of its fields, giving it a unique
// `custom_class_id`. Refer to the inline defined class in phrase hints by its
// `custom_class_id`.
repeated CustomClass custom_classes = 3;
}
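
A hedged end-to-end sketch of the inline form with the Python client: a `CustomClass` defined in the request, referenced from a boosted phrase hint via `${...}` (names and phrases are illustrative; proto-plus is assumed to accept dicts for nested messages):

```python
from google.cloud import speech  # assumed: google-cloud-speech 2.x client

# Inline custom class; phrase hints refer to it by its id as ${ship-names}.
ships = speech.CustomClass(
    custom_class_id="ship-names",
    items=[{"value": "Queen Mary"}, {"value": "Titanic"}],
)

phrase_set = speech.PhraseSet(
    phrases=[{"value": "board the ${ship-names}", "boost": 15.0}],  # phrase-level boost
    boost=10.0,  # set-level boost, overridden by the phrase-level value above
)

config = speech.RecognitionConfig(
    language_code="en-US",
    adaptation=speech.SpeechAdaptation(
        phrase_sets=[phrase_set],
        custom_classes=[ships],
    ),
)
```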