diff --git a/speech/cloud-client/README.md b/speech/cloud-client/README.md
index 3f6cbc4a42e..4be5d92d080 100644
--- a/speech/cloud-client/README.md
+++ b/speech/cloud-client/README.md
@@ -91,3 +91,15 @@ Performing streaming speech transcription and punctuation on an audio file
 ```
 mvn exec:java -DRecognize -Dexec.args="stream-punctuation ./resources/audio.raw"
 ```
+
+## Enhanced Model
+Transcribe an audio file using an enhanced model
+```
+mvn exec:java -DRecognize -Dexec.args="enhanced-model ./resources/commercial_mono.wav"
+```
+
+## Recognition Metadata
+Transcribe an audio file with recognition metadata
+```
+mvn exec:java -DRecognize -Dexec.args="metadata ./resources/commercial_mono.wav"
+```
diff --git a/speech/cloud-client/pom.xml b/speech/cloud-client/pom.xml
index ca80e6989bc..a761ad332b5 100644
--- a/speech/cloud-client/pom.xml
+++ b/speech/cloud-client/pom.xml
@@ -40,7 +40,7 @@
     <dependency>
       <groupId>com.google.cloud</groupId>
       <artifactId>google-cloud-speech</artifactId>
-      <version>0.42.0-alpha</version>
+      <version>0.46.0-alpha</version>
     </dependency>
diff --git a/speech/cloud-client/resources/commercial_mono.wav b/speech/cloud-client/resources/commercial_mono.wav
new file mode 100644
index 00000000000..e6b9ed434f9
Binary files /dev/null and b/speech/cloud-client/resources/commercial_mono.wav differ
diff --git a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java
index dab73a3aa29..396cc5110ea 100644
--- a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java
+++ b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java
@@ -24,6 +24,10 @@
 import com.google.cloud.speech.v1p1beta1.RecognitionAudio;
 import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
 import com.google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.InteractionType;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.MicrophoneDistance;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.RecordingDeviceType;
 import com.google.cloud.speech.v1p1beta1.RecognizeResponse;
 import com.google.cloud.speech.v1p1beta1.SpeechClient;
 import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
@@ -53,7 +57,7 @@ public static void main(String... args) throws Exception {
         "\tjava %s \"<command>\" \"<path>\"\n"
             + "Commands:\n"
             + "\tsyncrecognize | asyncrecognize | streamrecognize | wordoffsets | model-selection\n"
-            + "\t| auto-punctuation | stream-punctuation\n"
+            + "\t| auto-punctuation | stream-punctuation | enhanced-model | metadata\n"
             + "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI "
             + "for a Cloud Storage resource (gs://...)\n",
         Recognize.class.getCanonicalName());
@@ -97,6 +101,10 @@ public static void main(String... args) throws Exception {
       }
     } else if (command.equals("stream-punctuation")) {
       streamingTranscribeWithAutomaticPunctuation(path);
+    } else if (command.equals("enhanced-model")) {
+      transcribeFileWithEnhancedModel(path);
+    } else if (command.equals("metadata")) {
+      transcribeFileWithMetadata(path);
     }
   }
@@ -678,4 +686,97 @@ public SettableFuture<List<T>> future() {
     }
   }
   // [END speech_stream_recognize_punctuation]
+
+  // [START speech_transcribe_file_with_enhanced_model]
+  /**
+   * Transcribe the given audio file using an enhanced model.
+   *
+   * @param fileName the path to an audio file.
+   */
+  public static void transcribeFileWithEnhancedModel(String fileName) throws Exception {
+    Path path = Paths.get(fileName);
+    byte[] content = Files.readAllBytes(path);
+
+    try (SpeechClient speechClient = SpeechClient.create()) {
+      // Get the contents of the local audio file
+      RecognitionAudio recognitionAudio = RecognitionAudio.newBuilder()
+          .setContent(ByteString.copyFrom(content))
+          .build();
+
+      // Configure request to enable enhanced models
+      RecognitionConfig config = RecognitionConfig.newBuilder()
+          .setEncoding(AudioEncoding.LINEAR16)
+          .setLanguageCode("en-US")
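+          // Note: the sample rate must match the rate of the source audio;
+          // this sample assumes commercial_mono.wav is 8 kHz mono phone audio.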
+          .setSampleRateHertz(8000)
+          // Enhanced models are only available to projects that
+          // opt in for audio data collection.
+          .setUseEnhanced(true)
+          // A model must also be specified to use an enhanced model.
+          .setModel("phone_call")
+          .build();
+
+      // Perform the transcription request
+      RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
+
+      // Print out the results
+      for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
+        // There can be several alternative transcripts for a given chunk of speech. Just use the
+        // first (most likely) one here.
+        SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+        System.out.format("Transcript: %s\n\n", alternative.getTranscript());
+      }
+    }
+  }
+  // [END speech_transcribe_file_with_enhanced_model]
+
+  // [START speech_transcribe_file_with_metadata]
+  /**
+   * Transcribe the given audio file and include recognition metadata in the request.
+   *
+   * @param fileName the path to an audio file.
+   */
+  public static void transcribeFileWithMetadata(String fileName) throws Exception {
+    Path path = Paths.get(fileName);
+    byte[] content = Files.readAllBytes(path);
+
+    try (SpeechClient speechClient = SpeechClient.create()) {
+      // Get the contents of the local audio file
+      RecognitionAudio recognitionAudio = RecognitionAudio.newBuilder()
+          .setContent(ByteString.copyFrom(content))
+          .build();
+
+      // Construct a recognition metadata object. Most metadata fields are
+      // specified as enums nested in RecognitionMetadata (see the imports above).
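+      // The metadata describes how the audio was captured: the kind of
+      // interaction, the speaker's distance from the microphone, and the
+      // device that made the recording.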
+      RecognitionMetadata metadata = RecognitionMetadata.newBuilder()
+          .setInteractionType(InteractionType.DISCUSSION)
+          .setMicrophoneDistance(MicrophoneDistance.NEARFIELD)
+          .setRecordingDeviceType(RecordingDeviceType.SMARTPHONE)
+          .setRecordingDeviceName("Pixel 2 XL") // Some metadata fields are free-form strings
+          // And some are integers, for instance the 6-digit NAICS code
+          // https://www.naics.com/search/
+          .setIndustryNaicsCodeOfAudio(519190)
+          .build();
+
+      // Configure request to include the recognition metadata
+      RecognitionConfig config = RecognitionConfig.newBuilder()
+          .setEncoding(AudioEncoding.LINEAR16)
+          .setLanguageCode("en-US")
+          .setSampleRateHertz(8000)
+          .setMetadata(metadata) // Add the metadata to the config
+          .build();
+
+      // Perform the transcription request
+      RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
+
+      // Print out the results
+      for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
+        // There can be several alternative transcripts for a given chunk of speech. Just use the
+        // first (most likely) one here.
+        SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+        System.out.format("Transcript: %s\n\n", alternative.getTranscript());
+      }
+    }
+  }
+  // [END speech_transcribe_file_with_metadata]
 }
diff --git a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java
index 022e24f7d1a..983d3a8724f 100644
--- a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java
+++ b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java
@@ -46,6 +46,8 @@ public class RecognizeIT {
   private String videoFileName = "./resources/Google_Gnome.wav";
   private String gcsVideoPath = "gs://" + BUCKET + "/speech/Google_Gnome.wav";
 
+  private String recognitionAudioFile = "./resources/commercial_mono.wav";
+
   @Before
   public void setUp() {
     bout = new ByteArrayOutputStream();
@@ -145,4 +147,18 @@ public void testStreamAutoPunctuation() throws Exception {
     String got = bout.toString();
     assertThat(got).contains("How old is the Brooklyn Bridge?");
   }
+
+  @Test
+  public void testEnhancedModel() throws Exception {
+    Recognize.transcribeFileWithEnhancedModel(recognitionAudioFile);
+    String got = bout.toString();
+    assertThat(got).contains("Chrome");
+  }
+
+  @Test
+  public void testMetadata() throws Exception {
+    Recognize.transcribeFileWithMetadata(recognitionAudioFile);
+    String got = bout.toString();
+    assertThat(got).contains("Chrome");
+  }
 }