samples: Add samples for enhanced models and metadata (#1093)

GoogleCloudPlatform · Aug 15, 2020 · feecc26 · feecc26
1 parent 6835abd
commit feecc26
Show file tree

Hide file tree

Showing 3 changed files with 118 additions and 1 deletion.
diff --git a/speech/snippets/resources/commercial_mono.wav b/speech/snippets/resources/commercial_mono.wav
diff --git a/speech/snippets/src/main/java/com/example/speech/Recognize.java b/speech/snippets/src/main/java/com/example/speech/Recognize.java
@@ -24,6 +24,10 @@
 import com.google.cloud.speech.v1p1beta1.RecognitionAudio;
 import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
 import com.google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.InteractionType;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.MicrophoneDistance;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.RecordingDeviceType;
 import com.google.cloud.speech.v1p1beta1.RecognizeResponse;
 import com.google.cloud.speech.v1p1beta1.SpeechClient;
 import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
@@ -53,7 +57,7 @@ public static void main(String... args) throws Exception {
           "\tjava %s \"<command>\" \"<path-to-image>\"\n"
           + "Commands:\n"
           + "\tsyncrecognize | asyncrecognize | streamrecognize | wordoffsets | model-selection\n"
-          + "\t| auto-punctuation | stream-punctuation\n"
+          + "\t| auto-punctuation | stream-punctuation | enhanced-model | metadata\n"
           + "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI "
           + "for a Cloud Storage resource (gs://...)\n",
           Recognize.class.getCanonicalName());
@@ -97,6 +101,10 @@ public static void main(String... args) throws Exception {
       }
     } else if (command.equals("stream-punctuation")) {
       streamingTranscribeWithAutomaticPunctuation(path);
+    } else if (command.equals("enhanced-model")) {
+      transcribeFileWithEnhancedModel(path);
+    } else if (command.equals("metadata")) {
+      transcribeFileWithMetadata(path);
     }
   }
 
@@ -678,4 +686,97 @@ public SettableFuture<List<T>> future() {
     }
   }
   // [END speech_stream_recognize_punctuation]
+
+  // [START speech_transcribe_file_with_enhanced_model]
+  /**
+   * Transcribe the given audio file using an enhanced model.
+   *
+   * @param fileName the path to a local audio file; its bytes are sent inline with the request.
+   */
+  public static void transcribeFileWithEnhancedModel(String fileName) throws Exception {
+    Path path = Paths.get(fileName);
+    byte[] content = Files.readAllBytes(path);
+
+    try (SpeechClient speechClient = SpeechClient.create()) {
+      // Wrap the bytes already read from the local file as the request's audio payload
+      RecognitionAudio recognitionAudio = RecognitionAudio.newBuilder()
+          .setContent(ByteString.copyFrom(content))
+          .build();
+
+      // Configure request to enable enhanced models
+      RecognitionConfig config = RecognitionConfig.newBuilder()
+          .setEncoding(AudioEncoding.LINEAR16)
+          .setLanguageCode("en-US")
+          .setSampleRateHertz(8000)
+          // Enhanced models are only available to projects that
+          // opt in for audio data collection.
+          .setUseEnhanced(true)
+          // A model must be specified to use enhanced model.
+          .setModel("phone_call")
+          .build();
+
+      // Perform the transcription request
+      RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
+
+      // Print one transcript per result in the response
+      for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
+        // There can be several alternative transcripts for a given chunk of speech. Just use the
+        // first (most likely) one here.
+        SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+        System.out.format("Transcript: %s\n\n", alternative.getTranscript());
+      }
+    }
+  }
+  // [END speech_transcribe_file_with_enhanced_model]
+
+  // [START speech_transcribe_file_with_metadata]
+  /**
+   * Transcribe the given audio file and include recognition metadata in the request.
+   *
+   * @param fileName the path to a local audio file; its bytes are sent inline with the request.
+   */
+  public static void transcribeFileWithMetadata(String fileName) throws Exception {
+    Path path = Paths.get(fileName);
+    byte[] content = Files.readAllBytes(path);
+
+    try (SpeechClient speechClient = SpeechClient.create()) {
+      // Wrap the bytes already read from the local file as the request's audio payload
+      RecognitionAudio recognitionAudio = RecognitionAudio.newBuilder()
+          .setContent(ByteString.copyFrom(content))
+          .build();
+
+      // Construct a recognition metadata object.
+      // Most metadata fields are enums nested inside RecognitionMetadata
+      // (InteractionType, MicrophoneDistance, RecordingDeviceType).
+      RecognitionMetadata metadata = RecognitionMetadata.newBuilder()
+          .setInteractionType(InteractionType.DISCUSSION)
+          .setMicrophoneDistance(MicrophoneDistance.NEARFIELD)
+          .setRecordingDeviceType(RecordingDeviceType.SMARTPHONE)
+          .setRecordingDeviceName("Pixel 2 XL") // Some metadata fields are free form strings
+          // And some are integers, for instance the 6 digit NAICS code
+          // https://www.naics.com/search/
+          .setIndustryNaicsCodeOfAudio(519190)
+          .build();
+
+      // Configure the request; no enhanced model is selected here, only metadata is attached
+      RecognitionConfig config = RecognitionConfig.newBuilder()
+          .setEncoding(AudioEncoding.LINEAR16)
+          .setLanguageCode("en-US")
+          .setSampleRateHertz(8000)
+          .setMetadata(metadata) // Add the metadata to the config
+          .build();
+
+      // Perform the transcription request
+      RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
+
+      // Print one transcript per result in the response
+      for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
+        // There can be several alternative transcripts for a given chunk of speech. Just use the
+        // first (most likely) one here.
+        SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+        System.out.format("Transcript: %s\n\n", alternative.getTranscript());
+      }
+    }
+  }
+  // [END speech_transcribe_file_with_metadata]
 }
diff --git a/speech/snippets/src/test/java/com/example/speech/RecognizeIT.java b/speech/snippets/src/test/java/com/example/speech/RecognizeIT.java
@@ -46,6 +46,8 @@ public class RecognizeIT {
   private String videoFileName = "./resources/Google_Gnome.wav";
   private String gcsVideoPath = "gs://" + BUCKET + "/speech/Google_Gnome.wav";
 
+  private String recognitionAudioFile = "./resources/commercial_mono.wav";
+
   @Before
   public void setUp() {
     bout = new ByteArrayOutputStream();
@@ -145,4 +147,18 @@ public void testStreamAutoPunctuation() throws Exception {
     String got = bout.toString();
     assertThat(got).contains("How old is the Brooklyn Bridge?");
   }
+
+  @Test
+  public void testEnhancedModel() throws Exception {
+    Recognize.transcribeFileWithEnhancedModel(recognitionAudioFile); // commercial_mono.wav
+    String got = bout.toString(); // NOTE(review): presumably setUp redirects System.out to bout
+    assertThat(got).contains("Chrome"); // word expected somewhere in the printed transcript
+  }
+
+  @Test
+  public void testMetadata() throws Exception {
+    Recognize.transcribeFileWithMetadata(recognitionAudioFile); // commercial_mono.wav
+    String got = bout.toString(); // NOTE(review): presumably setUp redirects System.out to bout
+    assertThat(got).contains("Chrome"); // word expected somewhere in the printed transcript
+  }
 }