diff --git a/speech/cloud-client/README.md b/speech/cloud-client/README.md index 9a7055f32ff..98d6c69cfc4 100644 --- a/speech/cloud-client/README.md +++ b/speech/cloud-client/README.md @@ -92,6 +92,11 @@ Performing streaming speech transcription and punctuation on an audio file mvn exec:java -DRecognize -Dexec.args="stream-punctuation ./resources/audio.raw" ``` +Perform microphone streaming speech recognition +``` +mvn exec:java -DRecognize -Dexec.args="micstreamrecognize" +``` + ## Enhanced Model Transcribe an audio file using an enhanced model ``` diff --git a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java index 9771ad2a8e9..15beaba3e1a 100644 --- a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java +++ b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java @@ -19,6 +19,9 @@ import com.google.api.gax.longrunning.OperationFuture; import com.google.api.gax.rpc.ApiStreamObserver; import com.google.api.gax.rpc.BidiStreamingCallable; +import com.google.api.gax.rpc.ClientStream; +import com.google.api.gax.rpc.ResponseObserver; +import com.google.api.gax.rpc.StreamController; import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeMetadata; import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeResponse; import com.google.cloud.speech.v1p1beta1.RecognitionAudio; @@ -47,6 +50,13 @@ import java.util.ArrayList; import java.util.List; +import javax.sound.sampled.AudioFormat; +import javax.sound.sampled.AudioInputStream; +import javax.sound.sampled.AudioSystem; +import javax.sound.sampled.DataLine; +import javax.sound.sampled.DataLine.Info; +import javax.sound.sampled.TargetDataLine; + public class Recognize { /** Run speech recognition tasks. */ @@ -56,9 +66,10 @@ public static void main(String... 
args) throws Exception { System.out.printf( "\tjava %s \"<command>\" \"<path-to-file>\"\n" + "Commands:\n" - + "\tsyncrecognize | asyncrecognize | streamrecognize | wordoffsets\n" - + "\t| model-selection | auto-punctuation | stream-punctuation | enhanced-model\n" - + "\t| metadata | diarization | multi-channel | multi-language | word-level-conf" + + "\tsyncrecognize | asyncrecognize | streamrecognize | micstreamrecognize \n" + + "\t| wordoffsets | model-selection | auto-punctuation | stream-punctuation \n" + + "\t| enhanced-model | metadata | diarization | multi-channel | multi-language \n" + + "\t| word-level-conf" + "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI " + "for a Cloud Storage resource (gs://...)\n", Recognize.class.getCanonicalName()); @@ -88,6 +99,8 @@ public static void main(String... args) throws Exception { } } else if (command.equals("streamrecognize")) { streamingRecognizeFile(path); + } else if (command.equals("micstreamrecognize")) { + streamingMicRecognize(); } else if (command.equals("model-selection")) { if (path.startsWith("gs://")) { transcribeModelSelectionGcs(path); @@ -704,6 +717,97 @@ public SettableFuture<List<T>> future() { } // [END speech_stream_recognize_punctuation] + // [START speech_streaming_mic_recognize] + /** Performs microphone streaming speech recognition with a duration of 1 minute. 
*/ + public static void streamingMicRecognize() throws Exception { + + ResponseObserver<StreamingRecognizeResponse> responseObserver = null; + try (SpeechClient client = SpeechClient.create()) { + + responseObserver = + new ResponseObserver<StreamingRecognizeResponse>() { + ArrayList<StreamingRecognizeResponse> responses = new ArrayList<>(); + + public void onStart(StreamController controller) {} + + public void onResponse(StreamingRecognizeResponse response) { + responses.add(response); + } + + public void onComplete() { + for (StreamingRecognizeResponse response : responses) { + StreamingRecognitionResult result = response.getResultsList().get(0); + SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); + System.out.printf("Transcript : %s\n", alternative.getTranscript()); + } + } + + public void onError(Throwable t) { + System.out.println(t); + } + }; + + ClientStream<StreamingRecognizeRequest> clientStream = + client.streamingRecognizeCallable().splitCall(responseObserver); + + RecognitionConfig recognitionConfig = + RecognitionConfig.newBuilder() + .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16) + .setLanguageCode("en-US") + .setSampleRateHertz(16000) + .build(); + StreamingRecognitionConfig streamingRecognitionConfig = + StreamingRecognitionConfig.newBuilder().setConfig(recognitionConfig).build(); + + StreamingRecognizeRequest request = + StreamingRecognizeRequest.newBuilder() + .setStreamingConfig(streamingRecognitionConfig) + .build(); // The first request in a streaming call has to be a config + + clientStream.send(request); + // SampleRate:16000Hz, SampleSizeInBits: 16, Number of channels: 1, Signed: true, + // bigEndian: false + AudioFormat audioFormat = new AudioFormat(16000, 16, 1, true, false); + DataLine.Info targetInfo = + new Info( + TargetDataLine.class, + audioFormat); // Set the system information to read from the microphone audio stream + + if (!AudioSystem.isLineSupported(targetInfo)) { + System.out.println("Microphone not supported"); + System.exit(0); + } + // Target data line captures the audio stream the microphone 
produces. + TargetDataLine targetDataLine = (TargetDataLine) AudioSystem.getLine(targetInfo); + targetDataLine.open(audioFormat); + targetDataLine.start(); + System.out.println("Start speaking"); + long startTime = System.currentTimeMillis(); + // Audio Input Stream + AudioInputStream audio = new AudioInputStream(targetDataLine); + while (true) { + long estimatedTime = System.currentTimeMillis() - startTime; + byte[] data = new byte[6400]; + audio.read(data); + if (estimatedTime > 60000) { // 60 seconds + System.out.println("Stop speaking."); + targetDataLine.stop(); + targetDataLine.close(); + break; + } + request = + StreamingRecognizeRequest.newBuilder() + .setAudioContent(ByteString.copyFrom(data)) + .build(); + clientStream.send(request); + } + } catch (Exception e) { + System.out.println(e); + } + responseObserver.onComplete(); + } + // [END speech_streaming_mic_recognize] + // [START speech_transcribe_file_with_enhanced_model] /** * Transcribe the given audio file using an enhanced model. @@ -833,8 +937,9 @@ public static void transcribeDiarization(String fileName) throws Exception { SpeechRecognitionAlternative alternative = result.getAlternatives(0); System.out.format("Transcript : %s\n", alternative.getTranscript()); // The words array contains the entire transcript up until that point. - //Referencing the last spoken word to get the associated Speaker tag - System.out.format("Speaker Tag %s: %s\n", + // Referencing the last spoken word to get the associated Speaker tag + System.out.format( + "Speaker Tag %s: %s\n", alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(), alternative.getTranscript()); } @@ -877,8 +982,9 @@ public static void transcribeDiarizationGcs(String gcsUri) throws Exception { // use the first (most likely) one here. SpeechRecognitionAlternative alternative = result.getAlternatives(0); // The words array contains the entire transcript up until that point. 
- //Referencing the last spoken word to get the associated Speaker tag - System.out.format("Speaker Tag %s:%s\n", + // Referencing the last spoken word to get the associated Speaker tag + System.out.format( + "Speaker Tag %s:%s\n", alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(), alternative.getTranscript()); }