Updates to highlight word time offsets (#787)

GoogleCloudPlatform · Aug 3, 2017 · 14f31f0 · 14f31f0
1 parent 47dc3d2
commit 14f31f0
Show file tree

Hide file tree

Showing 4 changed files with 121 additions and 7 deletions.
diff --git a/speech/cloud-client/README.md b/speech/cloud-client/README.md
@@ -45,3 +45,15 @@ Build your project with:
     java -cp target/speech-google-cloud-samples-1.0.0-jar-with-dependencies.jar \
     com.example.speech.Recognize asyncrecognize gs://cloud-samples-tests/speech/vr.flac
 ```
+
+### Synchronously transcribe an audio file and print word offsets
+```
+    java -cp target/speech-google-cloud-samples-1.0.0-jar-with-dependencies.jar \
+    com.example.speech.Recognize wordoffsets ./resources/audio.raw
+```
+
+### Asynchronously transcribe a remote audio file and print word offsets
+```
+    java -cp target/speech-google-cloud-samples-1.0.0-jar-with-dependencies.jar \
+    com.example.speech.Recognize wordoffsets gs://cloud-samples-tests/speech/vr.flac
+```
diff --git a/speech/cloud-client/pom.xml b/speech/cloud-client/pom.xml
@@ -21,9 +21,10 @@
 
   <!-- Parent defines config for testing & linting. -->
   <parent>
-      <groupId>com.google.cloud.samples</groupId>
-      <artifactId>shared-configuration</artifactId>
-      <version>1.0.5</version>
+    <artifactId>doc-samples</artifactId>
+    <groupId>com.google.cloud</groupId>
+    <version>1.0.0</version>
+    <relativePath>../..</relativePath>
   </parent>
 
   <properties>

diff --git a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java
@@ -50,7 +50,7 @@ public static void main(String... args) throws Exception {
       System.out.printf(
           "\tjava %s \"<command>\" \"<path-to-image>\"\n"
           + "Commands:\n"
-          + "\tsyncrecognize | asyncrecognize | streamrecognize\n"
+          + "\tsyncrecognize | asyncrecognize | streamrecognize | wordoffsets\n"
           + "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI "
           + "for a Cloud Storage resource (gs://...)\n",
           Recognize.class.getCanonicalName());
@@ -66,6 +66,12 @@ public static void main(String... args) throws Exception {
       } else {
         syncRecognizeFile(path);
       }
+    } else if (command.equals("wordoffsets")) {
+      if (path.startsWith("gs://")) {
+        asyncRecognizeWords(path);
+      } else {
+        syncRecognizeWords(path);
+      }
     } else if (command.equals("asyncrecognize")) {
       if (path.startsWith("gs://")) {
         asyncRecognizeGcs(path);
@@ -113,6 +119,51 @@ public static void syncRecognizeFile(String fileName) throws Exception, IOExcept
     speech.close();
   }
 
+  /**
+   * Synchronously transcribes a local audio file and prints each word's time offsets.
+   *
+   * @param fileName the path to a raw PCM (LINEAR16, 16 kHz) audio file to transcribe.
+   * @throws Exception if the file cannot be read or the recognition request fails.
+   */
+  public static void syncRecognizeWords(String fileName) throws Exception, IOException {
+    ByteString audioBytes = ByteString.copyFrom(Files.readAllBytes(Paths.get(fileName)));
+
+    // Configure request with local raw PCM audio
+    RecognitionConfig config = RecognitionConfig.newBuilder()
+        .setEncoding(AudioEncoding.LINEAR16)
+        .setLanguageCode("en-US")
+        .setSampleRateHertz(16000)
+        .setEnableWordTimeOffsets(true)
+        .build();
+    RecognitionAudio audio = RecognitionAudio.newBuilder()
+        .setContent(audioBytes)
+        .build();
+
+    // try-with-resources closes the client even when recognize() throws.
+    try (SpeechClient speech = SpeechClient.create()) {
+      // Use blocking call to get audio transcript
+      RecognizeResponse response = speech.recognize(config, audio);
+      List<SpeechRecognitionResult> results = response.getResultsList();
+
+      for (SpeechRecognitionResult result : results) {
+        List<SpeechRecognitionAlternative> alternatives = result.getAlternativesList();
+        for (SpeechRecognitionAlternative alternative : alternatives) {
+          System.out.printf("Transcription: %s%n", alternative.getTranscript());
+          for (WordInfo wordInfo : alternative.getWordsList()) {
+            System.out.println(wordInfo.getWord());
+            // Integer division keeps only the tenths-of-a-second digit of the nanos.
+            System.out.printf("\t%s.%s sec - %s.%s sec\n",
+                wordInfo.getStartTime().getSeconds(),
+                wordInfo.getStartTime().getNanos() / 100000000,
+                wordInfo.getEndTime().getSeconds(),
+                wordInfo.getEndTime().getNanos() / 100000000);
+          }
+        }
+      }
+    }
+  }
+
+
   /**
    * Performs speech recognition on remote FLAC file and prints the transcription.
    *
@@ -193,11 +244,11 @@ public static void asyncRecognizeFile(String fileName) throws Exception, IOExcep
 
   /**
    * Performs non-blocking speech recognition on remote FLAC file and prints
-   * the transcription.
+   * the transcription as well as word time offsets.
    *
    * @param gcsUri the path to the remote LINEAR16 audio file to transcribe.
    */
-  public static void asyncRecognizeGcs(String gcsUri) throws Exception, IOException {
+  public static void asyncRecognizeWords(String gcsUri) throws Exception, IOException {
     // Instantiates a client with GOOGLE_APPLICATION_CREDENTIALS
     SpeechClient speech = SpeechClient.create();
 
@@ -240,6 +291,47 @@ public static void asyncRecognizeGcs(String gcsUri) throws Exception, IOExceptio
     speech.close();
   }
 
+  /**
+   * Performs non-blocking speech recognition on a remote FLAC file and prints
+   * the transcription.
+   *
+   * @param gcsUri the Cloud Storage URI (gs://...) of the remote FLAC audio file to transcribe.
+   */
+  public static void asyncRecognizeGcs(String gcsUri) throws Exception, IOException {
+    // Configure remote file request for FLAC
+    RecognitionConfig config = RecognitionConfig.newBuilder()
+        .setEncoding(AudioEncoding.FLAC)
+        .setLanguageCode("en-US")
+        .setSampleRateHertz(16000)
+        .build();
+    RecognitionAudio audio = RecognitionAudio.newBuilder()
+        .setUri(gcsUri)
+        .build();
+
+    // Instantiates a client with GOOGLE_APPLICATION_CREDENTIALS;
+    // try-with-resources closes it even if the request fails.
+    try (SpeechClient speech = SpeechClient.create()) {
+      // Use non-blocking call for getting file transcription
+      OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata,
+              Operation> response =
+          speech.longRunningRecognizeAsync(config, audio);
+      while (!response.isDone()) {
+        System.out.println("Waiting for response...");
+        Thread.sleep(10000);
+      }
+
+      List<SpeechRecognitionResult> results = response.get().getResultsList();
+
+      for (SpeechRecognitionResult result : results) {
+        List<SpeechRecognitionAlternative> alternatives = result.getAlternativesList();
+        for (SpeechRecognitionAlternative alternative : alternatives) {
+          System.out.printf("Transcription: %s\n", alternative.getTranscript());
+        }
+      }
+    }
+  }
+
+
   /**
    * Performs streaming speech recognition on raw PCM audio data.
    *

diff --git a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java
@@ -62,6 +62,14 @@ public void testRecognizeFile() throws Exception {
     assertThat(got).contains("how old is the Brooklyn Bridge");
   }
 
+  @Test
+  public void testRecognizeWordoffset() throws Exception {
+    Recognize.syncRecognizeWords(fileName);
+    // Expect both the transcript and a word whose start offset prints as 0.0 sec.
+    assertThat(bout.toString()).contains("how old is the Brooklyn Bridge");
+    assertThat(bout.toString()).contains("\t0.0 sec -");
+  }
+
   @Test
   public void testRecognizeGcs() throws Exception {
     Recognize.syncRecognizeGcs(gcsPath);
@@ -85,8 +93,9 @@ public void testAsyncRecognizeGcs() throws Exception {
 
   @Test
   public void testAsyncWordoffset() throws Exception {
-    Recognize.asyncRecognizeGcs(gcsPath);
+    Recognize.asyncRecognizeWords(gcsPath);
     String got = bout.toString();
+    assertThat(got).contains("how old is the Brooklyn Bridge");
     assertThat(got).contains("\t0.0 sec -");
   }