nvidia-riva · rmittal-github · Jun 27, 2024 · Jun 24, 2024 · Jun 25, 2024 · Jun 26, 2024
diff --git a/.gitmodules b/.gitmodules
@@ -1,4 +1,4 @@
 [submodule "common"]
 	path = common
-	url = https://github.com/nvidia-riva/common.git
-	branch = main
+	url = https://github.com/sarane22/common.git
+	branch = endpointing_stop_eou_threshold_param
diff --git a/common b/common
diff --git a/riva/client/argparse_utils.py b/riva/client/argparse_utils.py
@@ -52,7 +52,7 @@ def add_asr_config_argparse_parameters(
         "--start-history",
         default=-1,
         type=int,
-        help="Value to detect and initiate start of speech utterance",
+        help="Value (in milliseconds) to detect and initiate start of speech utterance",
     )
     parser.add_argument(
         "--start-threshold",
@@ -64,19 +64,25 @@ def add_asr_config_argparse_parameters(
         "--stop-history",
         default=-1,
         type=int,
-        help="Value to reset the endpoint detection history",
+        help="Value (in milliseconds) to detect end of utterance and reset decoder",
+    )
+    parser.add_argument(
+        "--stop-threshold",
+        default=-1.0,
+        type=float,
+        help="Threshold value for detecting the end of speech utterance",
     )
     parser.add_argument(
         "--stop-history-eou",
         default=-1,
         type=int,
-        help="Value to determine the response history for endpoint detection",
+        help="Value (in milliseconds) to detect end of utterance for the 1st pass and generate an intermediate final transcript",
     )
     parser.add_argument(
-        "--stop-threshold",
+        "--stop-threshold-eou",
         default=-1.0,
         type=float,
-        help="Threshold value for detecting the end of speech utterance",
+        help="Threshold value for likelihood of blanks before detecting end of utterance",
     )
     return parser
 

diff --git a/riva/client/asr.py b/riva/client/asr.py
@@ -130,8 +130,9 @@ def add_endpoint_parameters_to_config(
     stop_history: int,
     stop_history_eou: int,
     stop_threshold: float,
+    stop_threshold_eou: float,
 ) -> None:
-    if not (start_history > 0 or start_threshold > 0 or stop_history > 0 or stop_history_eou > 0 or stop_threshold > 0):
+    if not (start_history > 0 or start_threshold > 0 or stop_history > 0 or stop_history_eou > 0 or stop_threshold > 0 or stop_threshold_eou > 0):
         return 
 
     inner_config: rasr.RecognitionConfig = config if isinstance(config, rasr.RecognitionConfig) else config.config
@@ -146,6 +147,8 @@ def add_endpoint_parameters_to_config(
         endpointing_config.stop_history_eou = stop_history_eou
     if stop_threshold > 0:
         endpointing_config.stop_threshold = stop_threshold
+    if stop_threshold_eou > 0:
+        endpointing_config.stop_threshold_eou = stop_threshold_eou
     inner_config.endpointing_config.CopyFrom(endpointing_config)
 
 

diff --git a/scripts/asr/riva_streaming_asr_client.py b/scripts/asr/riva_streaming_asr_client.py
@@ -64,12 +64,13 @@ def streaming_transcription_worker(
             interim_results=True,
         )
         riva.client.add_endpoint_parameters_to_config(
-            config, 
-            args.start_history, 
-            args.start_threshold, 
-            args.stop_history, 
-            args.stop_history_eou, 
-            args.stop_threshold
+            config,
+            args.start_history,
+            args.start_threshold,
+            args.stop_history,
+            args.stop_history_eou,
+            args.stop_threshold,
+            args.stop_threshold_eou
         )
         riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
         for _ in range(args.num_iterations):

diff --git a/scripts/asr/transcribe_file.py b/scripts/asr/transcribe_file.py
@@ -80,12 +80,13 @@ def main() -> None:
     )
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
     riva.client.add_endpoint_parameters_to_config(
-        config, 
-        args.start_history, 
-        args.start_threshold, 
-        args.stop_history, 
-        args.stop_history_eou, 
-        args.stop_threshold
+        config,
+        args.start_history,
+        args.start_threshold,
+        args.stop_history,
+        args.stop_history_eou,
+        args.stop_threshold,
+        args.stop_threshold_eou
     )
     sound_callback = None
     try:

diff --git a/scripts/asr/transcribe_file_offline.py b/scripts/asr/transcribe_file_offline.py
@@ -39,12 +39,13 @@ def main() -> None:
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
     riva.client.add_speaker_diarization_to_config(config, args.speaker_diarization)
     riva.client.add_endpoint_parameters_to_config(
-        config, 
-        args.start_history, 
-        args.start_threshold, 
-        args.stop_history, 
-        args.stop_history_eou, 
-        args.stop_threshold
+        config,
+        args.start_history,
+        args.start_threshold,
+        args.stop_history,
+        args.stop_history_eou,
+        args.stop_threshold,
+        args.stop_threshold_eou
     )    
     with args.input_file.open('rb') as fh:
         data = fh.read()

diff --git a/scripts/asr/transcribe_mic.py b/scripts/asr/transcribe_mic.py
@@ -58,12 +58,13 @@ def main() -> None:
     )
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
     riva.client.add_endpoint_parameters_to_config(
-        config, 
-        args.start_history, 
-        args.start_threshold, 
-        args.stop_history, 
-        args.stop_history_eou, 
-        args.stop_threshold
+        config,
+        args.start_history,
+        args.start_threshold,
+        args.stop_history,
+        args.stop_history_eou,
+        args.stop_threshold,
+        args.stop_threshold_eou
     )
     with riva.client.audio_io.MicrophoneStream(
         args.sample_rate_hz,