diff --git a/.gitmodules b/.gitmodules index bfe31f8..5d0c8fe 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,4 @@ [submodule "common"] path = common url = https://github.com/nvidia-riva/common.git - branch = main + branch = release/2.16.0 diff --git a/common b/common index 9dfc052..a5707ad 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit 9dfc052bba9a0cc2cc4b4f156e0ca8a273e9444e +Subproject commit a5707ad2c4e3bf904905a9c5165fdecf9fab133b diff --git a/riva/client/argparse_utils.py b/riva/client/argparse_utils.py index 3d2b835..b92d657 100644 --- a/riva/client/argparse_utils.py +++ b/riva/client/argparse_utils.py @@ -52,7 +52,7 @@ def add_asr_config_argparse_parameters( "--start-history", default=-1, type=int, - help="Value to detect and initiate start of speech utterance", + help="Value (in milliseconds) to detect and initiate start of speech utterance", ) parser.add_argument( "--start-threshold", @@ -64,19 +64,25 @@ def add_asr_config_argparse_parameters( "--stop-history", default=-1, type=int, - help="Value to reset the endpoint detection history", + help="Value (in milliseconds) to detect end of utterance and reset decoder", + ) + parser.add_argument( + "--stop-threshold", + default=-1.0, + type=float, + help="Threshold value for detecting the end of speech utterance", ) parser.add_argument( "--stop-history-eou", default=-1, type=int, - help="Value to determine the response history for endpoint detection", + help="Value (in milliseconds) to detect end of utterance for the 1st pass and generate an intermediate final transcript", ) parser.add_argument( - "--stop-threshold", + "--stop-threshold-eou", default=-1.0, type=float, - help="Threshold value for detecting the end of speech utterance", + help="Threshold value for likelihood of blanks before detecting end of utterance", ) return parser diff --git a/riva/client/asr.py b/riva/client/asr.py index 12e5293..5095fd6 100644 --- a/riva/client/asr.py +++ b/riva/client/asr.py @@ -130,8 +130,9 @@ def add_endpoint_parameters_to_config( stop_history: int, stop_history_eou: int, stop_threshold: float, + stop_threshold_eou: float, ) -> None: - if not (start_history > 0 or start_threshold > 0 or stop_history > 0 or stop_history_eou > 0 or stop_threshold > 0): + if not (start_history > 0 or start_threshold > 0 or stop_history > 0 or stop_history_eou > 0 or stop_threshold > 0 or stop_threshold_eou > 0): return inner_config: rasr.RecognitionConfig = config if isinstance(config, rasr.RecognitionConfig) else config.config @@ -146,6 +147,8 @@ def add_endpoint_parameters_to_config( endpointing_config.stop_history_eou = stop_history_eou if stop_threshold > 0: endpointing_config.stop_threshold = stop_threshold + if stop_threshold_eou > 0: + endpointing_config.stop_threshold_eou = stop_threshold_eou inner_config.endpointing_config.CopyFrom(endpointing_config) diff --git a/scripts/asr/riva_streaming_asr_client.py b/scripts/asr/riva_streaming_asr_client.py index 8f47e6c..8d4fd9b 100644 --- a/scripts/asr/riva_streaming_asr_client.py +++ b/scripts/asr/riva_streaming_asr_client.py @@ -64,12 +64,13 @@ def streaming_transcription_worker( interim_results=True, ) riva.client.add_endpoint_parameters_to_config( - config, - args.start_history, - args.start_threshold, - args.stop_history, - args.stop_history_eou, - args.stop_threshold + config, + args.start_history, + args.start_threshold, + args.stop_history, + args.stop_history_eou, + args.stop_threshold, + args.stop_threshold_eou ) riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score) for _ in range(args.num_iterations): diff --git a/scripts/asr/transcribe_file.py b/scripts/asr/transcribe_file.py index c6550af..976cdb9 100644 --- a/scripts/asr/transcribe_file.py +++ b/scripts/asr/transcribe_file.py @@ -80,12 +80,13 @@ def main() -> None: ) riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score) riva.client.add_endpoint_parameters_to_config( - config, - args.start_history, - args.start_threshold, - args.stop_history, - args.stop_history_eou, - args.stop_threshold + config, + args.start_history, + args.start_threshold, + args.stop_history, + args.stop_history_eou, + args.stop_threshold, + args.stop_threshold_eou ) sound_callback = None try: diff --git a/scripts/asr/transcribe_file_offline.py b/scripts/asr/transcribe_file_offline.py index 203a622..5dcda00 100644 --- a/scripts/asr/transcribe_file_offline.py +++ b/scripts/asr/transcribe_file_offline.py @@ -39,12 +39,13 @@ def main() -> None: riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score) riva.client.add_speaker_diarization_to_config(config, args.speaker_diarization) riva.client.add_endpoint_parameters_to_config( - config, - args.start_history, - args.start_threshold, - args.stop_history, - args.stop_history_eou, - args.stop_threshold + config, + args.start_history, + args.start_threshold, + args.stop_history, + args.stop_history_eou, + args.stop_threshold, + args.stop_threshold_eou ) with args.input_file.open('rb') as fh: data = fh.read() diff --git a/scripts/asr/transcribe_mic.py b/scripts/asr/transcribe_mic.py index 2aa9ca4..595563b 100644 --- a/scripts/asr/transcribe_mic.py +++ b/scripts/asr/transcribe_mic.py @@ -58,12 +58,13 @@ def main() -> None: ) riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score) riva.client.add_endpoint_parameters_to_config( - config, - args.start_history, - args.start_threshold, - args.stop_history, - args.stop_history_eou, - args.stop_threshold + config, + args.start_history, + args.start_threshold, + args.stop_history, + args.stop_history_eou, + args.stop_threshold, + args.stop_threshold_eou ) with riva.client.audio_io.MicrophoneStream( args.sample_rate_hz,