Merge release/2.16.0 to main (#84)
* add list voices support to tts client (#78)

add --list-voices parameter to tts client to query supported voices

* Add ASR endpointing stop_threshold_eou parameter (#83)

* Exposing the 'stop_history_eou_th' parameter

* updating submodule

* Updating param name

* Updating help for VAD param

* Adding check for stop_threshold_eou

* Updating proto branch

* updating the submodule

---------

Co-authored-by: Viraj Karandikar <[email protected]>
Co-authored-by: sarane22 <[email protected]>
3 people authored Jul 3, 2024
1 parent 18d6f8f commit 330aa60
Showing 8 changed files with 73 additions and 39 deletions.
2 changes: 1 addition & 1 deletion common
16 changes: 11 additions & 5 deletions riva/client/argparse_utils.py
@@ -52,7 +52,7 @@ def add_asr_config_argparse_parameters(
"--start-history",
default=-1,
type=int,
help="Value to detect and initiate start of speech utterance",
help="Value (in milliseconds) to detect and initiate start of speech utterance",
)
parser.add_argument(
"--start-threshold",
@@ -64,19 +64,25 @@
"--stop-history",
default=-1,
type=int,
help="Value to reset the endpoint detection history",
help="Value (in milliseconds) to detect end of utterance and reset decoder",
)
parser.add_argument(
"--stop-threshold",
default=-1.0,
type=float,
help="Threshold value for detecting the end of speech utterance",
)
parser.add_argument(
"--stop-history-eou",
default=-1,
type=int,
help="Value to determine the response history for endpoint detection",
help="Value (in milliseconds) to detect end of utterance for the 1st pass and generate an intermediate final transcript",
)
parser.add_argument(
"--stop-threshold",
"--stop-threshold-eou",
default=-1.0,
type=float,
help="Threshold value for detecting the end of speech utterance",
help="Threshold value for likelihood of blanks before detecting end of utterance",
)
return parser
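
As a quick usage illustration (not part of this diff; the audio file name and numeric values are placeholders, not recommended settings, and the --input-file argument is assumed from the ASR example scripts), the new flag combines with the existing endpointing options:

python scripts/asr/transcribe_file.py \
    --input-file sample.wav \
    --stop-history 800 \
    --stop-history-eou 240 \
    --stop-threshold-eou 0.98

Values of -1 (the defaults) leave the corresponding server-side endpointing settings untouched, since add_endpoint_parameters_to_config only forwards values greater than zero.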

5 changes: 4 additions & 1 deletion riva/client/asr.py
@@ -130,8 +130,9 @@ def add_endpoint_parameters_to_config(
stop_history: int,
stop_history_eou: int,
stop_threshold: float,
stop_threshold_eou: float,
) -> None:
if not (start_history > 0 or start_threshold > 0 or stop_history > 0 or stop_history_eou > 0 or stop_threshold > 0):
if not (start_history > 0 or start_threshold > 0 or stop_history > 0 or stop_history_eou > 0 or stop_threshold > 0 or stop_threshold_eou > 0):
return

inner_config: rasr.RecognitionConfig = config if isinstance(config, rasr.RecognitionConfig) else config.config
@@ -146,6 +147,8 @@
endpointing_config.stop_history_eou = stop_history_eou
if stop_threshold > 0:
endpointing_config.stop_threshold = stop_threshold
if stop_threshold_eou > 0:
endpointing_config.stop_threshold_eou = stop_threshold_eou
inner_config.endpointing_config.CopyFrom(endpointing_config)
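
For programmatic use, a minimal sketch of calling the extended helper (the config fields and all numeric values are illustrative assumptions; only the argument order follows the updated call sites below):

import riva.client

# Hypothetical config; values <= 0 leave the server-side defaults untouched.
config = riva.client.RecognitionConfig(language_code="en-US", max_alternatives=1)
riva.client.add_endpoint_parameters_to_config(
    config,
    -1,    # start_history (ms)
    -1.0,  # start_threshold
    800,   # stop_history (ms): detect end of utterance and reset decoder
    240,   # stop_history_eou (ms): 1st-pass EOU for intermediate final transcripts
    -1.0,  # stop_threshold
    0.98,  # stop_threshold_eou: likelihood of blanks before detecting end of utterance
)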


13 changes: 7 additions & 6 deletions scripts/asr/riva_streaming_asr_client.py
@@ -64,12 +64,13 @@ def streaming_transcription_worker(
interim_results=True,
)
riva.client.add_endpoint_parameters_to_config(
config,
args.start_history,
args.start_threshold,
args.stop_history,
args.stop_history_eou,
args.stop_threshold
config,
args.start_history,
args.start_threshold,
args.stop_history,
args.stop_history_eou,
args.stop_threshold,
args.stop_threshold_eou
)
riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
for _ in range(args.num_iterations):
13 changes: 7 additions & 6 deletions scripts/asr/transcribe_file.py
@@ -80,12 +80,13 @@ def main() -> None:
)
riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
riva.client.add_endpoint_parameters_to_config(
config,
args.start_history,
args.start_threshold,
args.stop_history,
args.stop_history_eou,
args.stop_threshold
config,
args.start_history,
args.start_threshold,
args.stop_history,
args.stop_history_eou,
args.stop_threshold,
args.stop_threshold_eou
)
sound_callback = None
try:
13 changes: 7 additions & 6 deletions scripts/asr/transcribe_file_offline.py
@@ -39,12 +39,13 @@ def main() -> None:
riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
riva.client.add_speaker_diarization_to_config(config, args.speaker_diarization)
riva.client.add_endpoint_parameters_to_config(
config,
args.start_history,
args.start_threshold,
args.stop_history,
args.stop_history_eou,
args.stop_threshold
config,
args.start_history,
args.start_threshold,
args.stop_history,
args.stop_history_eou,
args.stop_threshold,
args.stop_threshold_eou
)
with args.input_file.open('rb') as fh:
data = fh.read()
13 changes: 7 additions & 6 deletions scripts/asr/transcribe_mic.py
@@ -58,12 +58,13 @@ def main() -> None:
)
riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
riva.client.add_endpoint_parameters_to_config(
config,
args.start_history,
args.start_threshold,
args.stop_history,
args.stop_history_eou,
args.stop_threshold
config,
args.start_history,
args.start_threshold,
args.stop_history,
args.stop_history_eou,
args.stop_threshold,
args.stop_threshold_eou
)
with riva.client.audio_io.MicrophoneStream(
args.sample_rate_hz,
37 changes: 29 additions & 8 deletions scripts/tts/talk.py
@@ -4,6 +4,7 @@
import argparse
import time
import wave
import json
from pathlib import Path

import riva.client
@@ -21,12 +22,12 @@ def parse_args() -> argparse.Namespace:
help="A voice name to use. If this parameter is missing, then the server will try a first available model "
"based on parameter `--language-code`.",
)
parser.add_argument("--text", type=str, required=True, help="Text input to synthesize.")
parser.add_argument("--text", type=str, required=False, help="Text input to synthesize.")
parser.add_argument(
"--audio_prompt_file",
type=Path,
help="An input audio prompt (.wav) file for zero shot model. This is required to do zero shot inferencing.")
parser.add_argument("-o", "--output", type=Path, help="Output file .wav file to write synthesized audio.")
parser.add_argument("-o", "--output", type=Path, default="output.wav", help="Output file .wav file to write synthesized audio.")
parser.add_argument("--quality", type=int, help="Number of times decoder should be run on the output audio. A higher number improves quality of the produced output but introduces latencies.")
parser.add_argument(
"--play-audio",
@@ -35,6 +36,7 @@
"then the default output audio device will be used.",
)
parser.add_argument("--list-devices", action="store_true", help="List output audio devices indices.")
parser.add_argument("--list-voices", action="store_true", help="List available voices.")
parser.add_argument("--output-device", type=int, help="Output device to use.")
parser.add_argument("--language-code", default='en-US', help="A language of input text.")
parser.add_argument(
@@ -49,11 +51,6 @@
)
parser = add_connection_argparse_parameters(parser)
args = parser.parse_args()
if args.output is None and not args.play_audio and args.output_device is None and not args.list_devices:
parser.error(
f"You have to provide at least one of arguments: `--play-audio`, `--output-device`, `--output`, "
f"`--list-devices`."
)
if args.output is not None:
args.output = args.output.expanduser()
if args.list_devices or args.output_device or args.play_audio:
@@ -65,12 +62,36 @@ def main() -> None:
args = parse_args()
if args.list_devices:
riva.client.audio_io.list_output_devices()
return

auth = riva.client.Auth(args.ssl_cert, args.use_ssl, args.server, args.metadata)
service = riva.client.SpeechSynthesisService(auth)
nchannels = 1
sampwidth = 2
sound_stream, out_f = None, None

if args.list_voices:
config_response = service.stub.GetRivaSynthesisConfig(
riva.client.proto.riva_tts_pb2.RivaSynthesisConfigRequest()
)
tts_models = dict()
for model_config in config_response.model_config:
language_code = model_config.parameters['language_code']
voice_name = model_config.parameters['voice_name']
subvoices = [voice.split(':')[0] for voice in model_config.parameters['subvoices'].split(',')]
full_voice_names = [voice_name + "." + subvoice for subvoice in subvoices]

if language_code in tts_models:
tts_models[language_code]['voices'].extend(full_voice_names)
else:
tts_models[language_code] = {"voices": full_voice_names}

tts_models = dict(sorted(tts_models.items()))
print(json.dumps(tts_models, indent=4))

if not args.text:
print("No input text provided")
return

try:
if args.output_device is not None or args.play_audio:
sound_stream = riva.client.audio_io.SoundCallBack(
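
A hypothetical invocation of the new TTS flag (the server address is a placeholder):

python scripts/tts/talk.py --server localhost:50051 --list-voices

With the changes above, the script prints the available voices as a JSON object keyed by language code, each entry holding a "voices" list of <voice_name>.<subvoice> strings built from the server's synthesis config. Because --text is now optional, it then exits with "No input text provided" unless synthesis text is supplied; when it is, --output defaults to output.wav.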
