Skip to content

Commit

Permalink
[DO_NOT_MERGE] Microphone streaming with a 1 minute duration. (#1185)
Browse files Browse the repository at this point in the history
* Microphone streaming with a 1 minute duration.

* Fixed audit issues.

* Fixing issues after review.

* Fixing review issues.
  • Loading branch information
nirupa-kumar authored and nnegrey committed Aug 17, 2018
1 parent a73dd58 commit a7690e7
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 7 deletions.
5 changes: 5 additions & 0 deletions speech/cloud-client/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,11 @@ Performing streaming speech transcription and punctuation on an audio file
mvn exec:java -DRecognize -Dexec.args="stream-punctuation ./resources/audio.raw"
```

Performing microphone streaming speech recognition
```
mvn exec:java -DRecognize -Dexec.args="micstreamrecognize"
```

## Enhanced Model
Transcribe an audio file using an enhanced model
```
Expand Down
120 changes: 113 additions & 7 deletions speech/cloud-client/src/main/java/com/example/speech/Recognize.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
import com.google.api.gax.longrunning.OperationFuture;
import com.google.api.gax.rpc.ApiStreamObserver;
import com.google.api.gax.rpc.BidiStreamingCallable;
import com.google.api.gax.rpc.ClientStream;
import com.google.api.gax.rpc.ResponseObserver;
import com.google.api.gax.rpc.StreamController;
import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeMetadata;
import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeResponse;
import com.google.cloud.speech.v1p1beta1.RecognitionAudio;
Expand Down Expand Up @@ -47,6 +50,13 @@
import java.util.ArrayList;
import java.util.List;

import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.DataLine;
import javax.sound.sampled.DataLine.Info;
import javax.sound.sampled.TargetDataLine;

public class Recognize {

/** Run speech recognition tasks. */
Expand All @@ -56,9 +66,10 @@ public static void main(String... args) throws Exception {
System.out.printf(
"\tjava %s \"<command>\" \"<path-to-image>\"\n"
+ "Commands:\n"
+ "\tsyncrecognize | asyncrecognize | streamrecognize | wordoffsets\n"
+ "\t| model-selection | auto-punctuation | stream-punctuation | enhanced-model\n"
+ "\t| metadata | diarization | multi-channel | multi-language | word-level-conf"
+ "\tsyncrecognize | asyncrecognize | streamrecognize | micstreamrecognize \n"
+ "\t| wordoffsets | model-selection | auto-punctuation | stream-punctuation \n"
+ "\t| enhanced-model| metadata | diarization | multi-channel | multi-language \n"
+ "\t | word-level-conf"
+ "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI "
+ "for a Cloud Storage resource (gs://...)\n",
Recognize.class.getCanonicalName());
Expand Down Expand Up @@ -88,6 +99,8 @@ public static void main(String... args) throws Exception {
}
} else if (command.equals("streamrecognize")) {
streamingRecognizeFile(path);
} else if (command.equals("micstreamrecognize")) {
streamingMicRecognize();
} else if (command.equals("model-selection")) {
if (path.startsWith("gs://")) {
transcribeModelSelectionGcs(path);
Expand Down Expand Up @@ -704,6 +717,97 @@ public SettableFuture<List<T>> future() {
}
// [END speech_stream_recognize_punctuation]

// [START speech_streaming_mic_recognize]
/**
 * Performs streaming speech recognition on audio captured from the microphone, for a
 * duration of one minute.
 *
 * <p>Audio is captured as LINEAR16 PCM at 16 kHz, mono, and streamed to the Cloud Speech
 * API in ~200 ms chunks. Responses are collected while streaming and the transcripts are
 * printed once capture ends.
 *
 * @throws Exception if the Speech client cannot be created or the microphone line cannot
 *     be opened
 */
public static void streamingMicRecognize() throws Exception {

  ResponseObserver<StreamingRecognizeResponse> responseObserver = null;
  try (SpeechClient client = SpeechClient.create()) {

    responseObserver =
        new ResponseObserver<StreamingRecognizeResponse>() {
          // Responses received so far; transcripts are printed in onComplete().
          final ArrayList<StreamingRecognizeResponse> responses = new ArrayList<>();

          @Override
          public void onStart(StreamController controller) {}

          @Override
          public void onResponse(StreamingRecognizeResponse response) {
            responses.add(response);
          }

          @Override
          public void onComplete() {
            for (StreamingRecognizeResponse response : responses) {
              // Use the first result and first alternative: the most likely transcription.
              StreamingRecognitionResult result = response.getResultsList().get(0);
              SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
              System.out.printf("Transcript : %s\n", alternative.getTranscript());
            }
          }

          @Override
          public void onError(Throwable t) {
            System.out.println(t);
          }
        };

    ClientStream<StreamingRecognizeRequest> clientStream =
        client.streamingRecognizeCallable().splitCall(responseObserver);

    RecognitionConfig recognitionConfig =
        RecognitionConfig.newBuilder()
            .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
            .setLanguageCode("en-US")
            .setSampleRateHertz(16000)
            .build();
    StreamingRecognitionConfig streamingRecognitionConfig =
        StreamingRecognitionConfig.newBuilder().setConfig(recognitionConfig).build();

    // The first request in a streaming call must carry only the configuration.
    StreamingRecognizeRequest request =
        StreamingRecognizeRequest.newBuilder()
            .setStreamingConfig(streamingRecognitionConfig)
            .build();
    clientStream.send(request);

    // SampleRate: 16000 Hz, SampleSizeInBits: 16, Channels: 1, Signed: true,
    // bigEndian: false -- must match the RecognitionConfig above.
    AudioFormat audioFormat = new AudioFormat(16000, 16, 1, true, false);
    // Describe the microphone line we want to read from.
    DataLine.Info targetInfo = new Info(TargetDataLine.class, audioFormat);

    if (!AudioSystem.isLineSupported(targetInfo)) {
      System.out.println("Microphone not supported");
      // Return instead of System.exit(0): don't kill the caller's JVM from a sample.
      return;
    }

    // Target data line captures the audio stream the microphone produces.
    TargetDataLine targetDataLine = (TargetDataLine) AudioSystem.getLine(targetInfo);
    targetDataLine.open(audioFormat);
    targetDataLine.start();
    System.out.println("Start speaking");
    long startTime = System.currentTimeMillis();
    try (AudioInputStream audio = new AudioInputStream(targetDataLine)) {
      byte[] data = new byte[6400]; // 6400 bytes == 200 ms of 16-bit mono 16 kHz audio
      while (true) {
        long estimatedTime = System.currentTimeMillis() - startTime;
        if (estimatedTime > 60000) { // 60 seconds
          System.out.println("Stop speaking.");
          break;
        }
        // read() may return fewer bytes than requested; send only what was actually read
        // instead of the whole (possibly stale) buffer.
        int bytesRead = audio.read(data);
        if (bytesRead < 0) {
          break; // Microphone line closed / end of stream.
        }
        request =
            StreamingRecognizeRequest.newBuilder()
                .setAudioContent(ByteString.copyFrom(data, 0, bytesRead))
                .build();
        clientStream.send(request);
      }
    } finally {
      // Always release the microphone, even if streaming failed mid-way.
      targetDataLine.stop();
      targetDataLine.close();
    }
  } catch (Exception e) {
    System.out.println(e);
  }
  // Print whatever transcripts were collected. Guard against the observer never having
  // been created (e.g. SpeechClient.create() threw), which previously caused an NPE here.
  if (responseObserver != null) {
    responseObserver.onComplete();
  }
}
// [END speech_streaming_mic_recognize]

// [START speech_transcribe_file_with_enhanced_model]
/**
* Transcribe the given audio file using an enhanced model.
Expand Down Expand Up @@ -833,8 +937,9 @@ public static void transcribeDiarization(String fileName) throws Exception {
SpeechRecognitionAlternative alternative = result.getAlternatives(0);
System.out.format("Transcript : %s\n", alternative.getTranscript());
// The words array contains the entire transcript up until that point.
//Referencing the last spoken word to get the associated Speaker tag
System.out.format("Speaker Tag %s: %s\n",
// Referencing the last spoken word to get the associated Speaker tag
System.out.format(
"Speaker Tag %s: %s\n",
alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(),
alternative.getTranscript());
}
Expand Down Expand Up @@ -877,8 +982,9 @@ public static void transcribeDiarizationGcs(String gcsUri) throws Exception {
// use the first (most likely) one here.
SpeechRecognitionAlternative alternative = result.getAlternatives(0);
// The words array contains the entire transcript up until that point.
//Referencing the last spoken word to get the associated Speaker tag
System.out.format("Speaker Tag %s:%s\n",
// Referencing the last spoken word to get the associated Speaker tag
System.out.format(
"Speaker Tag %s:%s\n",
alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(),
alternative.getTranscript());
}
Expand Down

0 comments on commit a7690e7

Please sign in to comment.