-
Notifications
You must be signed in to change notification settings - Fork 0
/
benchmark.py
409 lines (331 loc) · 15.3 KB
/
benchmark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
import argparse
import asyncio
import json
import os
from dataclasses import dataclass
from typing import (
Any,
Callable,
Dict,
Optional,
Sequence,
Tuple,
)
import matplotlib.pyplot as plt
from tqdm import tqdm
from data import TextDatasets, TextDataset
from llm import LLMs, LLM
from tts import (
Synthesizers,
Synthesizer,
Timer,
)
DEFAULT_RESULTS_FOLDER = os.path.join(os.path.dirname(__file__), "results")
DEFAULT_DATASET = TextDatasets.TASKMASTER2
@dataclass
class TimingResult:
    """Timing measurements collected for a single benchmark interaction.

    Also serves as the container for field-wise aggregate statistics
    (mean / standard deviation) across many interactions.
    """

    voice_assistant_response_time: float
    time_to_first_token: float
    first_token_to_speech: float
    tts_process_seconds: float
    num_words: int
    num_tokens_per_second: float

    @staticmethod
    def _compute_statistics(results: Sequence['TimingResult'], fn: Callable) -> 'TimingResult':
        """Apply `fn` to each field's values across `results` and pack into a new TimingResult."""
        response_times = [item.voice_assistant_response_time for item in results]
        first_token_delays = [item.time_to_first_token for item in results]
        token_to_speech_delays = [item.first_token_to_speech for item in results]
        tts_durations = [item.tts_process_seconds for item in results]
        word_counts = [item.num_words for item in results]
        token_rates = [item.num_tokens_per_second for item in results]
        return TimingResult(
            voice_assistant_response_time=fn(response_times),
            time_to_first_token=fn(first_token_delays),
            first_token_to_speech=fn(token_to_speech_delays),
            tts_process_seconds=fn(tts_durations),
            num_words=int(fn(word_counts)),
            num_tokens_per_second=fn(token_rates),
        )

    @classmethod
    def mean_from_results(cls, results: Sequence['TimingResult']) -> 'TimingResult':
        """Field-wise arithmetic mean; all-zero result (with a warning) for empty input."""
        if not results:
            print("WARNING: Cannot compute mean of empty list")
            return TimingResult(0, 0, 0, 0, 0, 0)
        return cls._compute_statistics(results, lambda values: sum(values) / len(values))

    @classmethod
    def std_from_results(cls, results: Sequence['TimingResult']) -> 'TimingResult':
        """Field-wise population standard deviation; all-zero result (with a warning) for empty input."""
        if not results:
            print("WARNING: Cannot compute standard deviation of empty list")
            return TimingResult(0, 0, 0, 0, 0, 0)

        def _population_std(values: Sequence[Any]) -> float:
            average = sum(values) / len(values)
            variance = sum((v - average) ** 2 for v in values) / len(values)
            return variance ** 0.5

        return cls._compute_statistics(results, _population_std)
class Stats:
    """Accumulates per-interaction timing results and writes summary artifacts.

    Produces: console summary statistics, a histogram grid (PNG), and a JSON
    file of mean/std values, all under the configured results folder.
    """

    # Interactions whose LLM time-to-first-token exceeds this threshold are
    # treated as outliers and excluded from the summary statistics.
    MAX_LLM_DELAY_SECONDS = 0.6

    def __init__(self, tts: Synthesizers, results_folder: Optional[str] = None) -> None:
        self._results = []
        self._tts_type_string = tts.value
        self._output_folder = os.path.join(results_folder or DEFAULT_RESULTS_FOLDER)
        os.makedirs(self._output_folder, exist_ok=True)

    def accumulate(self, timing_result: TimingResult) -> None:
        """Record the timing result of one benchmark interaction."""
        self._results.append(timing_result)

    def _filter_outliers(self, results: Sequence[TimingResult]) -> Sequence[TimingResult]:
        """Return `results` without entries whose time-to-first-token exceeds the threshold."""
        filtered_results = []
        for result in results:
            if result.time_to_first_token > self.MAX_LLM_DELAY_SECONDS:
                continue
            filtered_results.append(result)
        return filtered_results

    def save_results(self) -> None:
        """Print summary statistics and save histogram PNG + summary JSON to the output folder."""
        results = self._filter_outliers(self._results)
        num_sentences = len(results)
        mean = TimingResult.mean_from_results(results)
        std = TimingResult.std_from_results(results)
        print("Summary statistics:")
        print(f"Total number of sentences: {num_sentences}")
        # BUG FIX: this line previously printed `std.first_token_to_speech` as the
        # spread of the response time; it now uses the matching field's std.
        print(
            "Voice Assistant Response Time: "
            f"{mean.voice_assistant_response_time:.2f} +- {std.voice_assistant_response_time:.2f} s")
        print(f"Time to First Token: {mean.time_to_first_token:.2f} +- {std.time_to_first_token:.2f} s")
        print(f"First Token to Speech: {mean.first_token_to_speech:.2f} +- {std.first_token_to_speech:.2f} s")
        print(f"TTS processing time: {mean.tts_process_seconds:.2f} +- {std.tts_process_seconds:.2f} s")
        print(f"Mean number of words per sentence: {mean.num_words:.1f} +- {std.num_words:.1f}")
        print(f"Mean tokens per second: {mean.num_tokens_per_second:.2f} +- {std.num_tokens_per_second:.2f}")

        # Histograms intentionally use the unfiltered results; the outlier cutoff
        # is drawn as a vertical line on the time-to-first-token plot.
        fig, axs = plt.subplots(3, 2, figsize=(14, 8))
        axs[0, 0].hist([r.voice_assistant_response_time for r in self._results], bins=10)
        axs[0, 0].set_title('voice_assistant_response_time')
        axs[0, 1].hist([r.time_to_first_token for r in self._results], bins=10)
        axs[0, 1].set_title('time_to_first_token')
        axs[0, 1].axvline(x=self.MAX_LLM_DELAY_SECONDS, color='r', linestyle='--')
        axs[1, 0].hist([r.first_token_to_speech for r in self._results], bins=10)
        axs[1, 0].set_title('first_token_to_speech')
        axs[1, 1].hist([r.num_words for r in self._results], bins=10)
        axs[1, 1].set_title('num_words')
        axs[2, 0].hist([r.num_tokens_per_second for r in self._results], bins=10)
        axs[2, 0].set_title('num_tokens_per_second')
        axs[2, 1].hist([r.tts_process_seconds for r in self._results], bins=10)
        axs[2, 1].set_title('tts_process_seconds')
        output_path = os.path.join(self._output_folder, f"hists_tts_{self._tts_type_string}.png")
        plt.savefig(output_path)
        plt.close()

        results_json_path = os.path.join(self._output_folder, f"results_tts_{self._tts_type_string}.json")
        results_dict = {
            "total_sentences": num_sentences,
            "mean_voice_assistant_response_time": mean.voice_assistant_response_time,
            "mean_time_to_first_token": mean.time_to_first_token,
            "mean_first_token_to_speech": mean.first_token_to_speech,
            "mean_tts_process_seconds": mean.tts_process_seconds,
            "mean_num_words": mean.num_words,
            "mean_num_tokens_per_second": mean.num_tokens_per_second,
            "std_voice_assistant_response_time": std.voice_assistant_response_time,
            "std_time_to_first_token": std.time_to_first_token,
            "std_first_token_to_speech": std.first_token_to_speech,
            "std_tts_process_seconds": std.tts_process_seconds,
            "std_num_words": std.num_words,
            "std_num_tokens_per_second": std.num_tokens_per_second,
        }
        with open(results_json_path, "w") as f:
            json.dump(results_dict, f, indent=4)
        print("Results saved to:", self._output_folder)

    @staticmethod
    def load_results(json_path: str, scale: float = 1.0) -> Tuple[Synthesizers, TimingResult, TimingResult]:
        """Load a previously saved results JSON.

        The TTS type is inferred from the file path (it must contain a
        `Synthesizers` enum value as a substring).

        Args:
            json_path: Path to a `results_tts_*.json` file written by `save_results`.
            scale: Multiplier applied to every loaded value (e.g. for unit conversion).

        Returns:
            Tuple of (synthesizer type, mean TimingResult, std TimingResult).

        Raises:
            ValueError: If no synthesizer value appears in `json_path`.
        """
        tts_type_string = None
        for synthesizer in Synthesizers:
            if synthesizer.value in json_path:
                tts_type_string = synthesizer.value
        if tts_type_string is None:
            raise ValueError(f"Could not determine TTS type from path: `{json_path}`")
        with open(json_path, "r") as f:
            results_dict = json.load(f)
        mean = TimingResult(
            voice_assistant_response_time=results_dict["mean_voice_assistant_response_time"] * scale,
            time_to_first_token=results_dict["mean_time_to_first_token"] * scale,
            first_token_to_speech=results_dict["mean_first_token_to_speech"] * scale,
            tts_process_seconds=results_dict["mean_tts_process_seconds"] * scale,
            num_words=results_dict["mean_num_words"] * scale,
            num_tokens_per_second=results_dict["mean_num_tokens_per_second"] * scale)
        std = TimingResult(
            voice_assistant_response_time=results_dict["std_voice_assistant_response_time"] * scale,
            time_to_first_token=results_dict["std_time_to_first_token"] * scale,
            first_token_to_speech=results_dict["std_first_token_to_speech"] * scale,
            tts_process_seconds=results_dict["std_tts_process_seconds"] * scale,
            num_words=results_dict["std_num_words"] * scale,
            num_tokens_per_second=results_dict["std_num_tokens_per_second"] * scale)
        return Synthesizers(tts_type_string), mean, std
def get_default_llm_type(tts_type: Synthesizers) -> LLMs:
    """Select the LLM client for a synthesizer: the websocket TTS uses the async OpenAI client."""
    if tts_type is Synthesizers.ELEVENLABS_WEBSOCKET:
        return LLMs.OPENAI_ASYNC
    return LLMs.OPENAI
def get_llm_init_kwargs(args: argparse.Namespace) -> Dict[str, str]:
    """Build the keyword arguments for `LLM.create()` from the parsed CLI arguments.

    Args:
        args: Parsed command-line arguments.

    Returns:
        Keyword arguments (currently just the OpenAI API key, when applicable).

    Raises:
        ValueError: If an OpenAI-based LLM is selected but no API key was given.
    """
    kwargs = dict()
    llm_type = get_default_llm_type(Synthesizers(args.synthesizer))
    if llm_type is LLMs.OPENAI or llm_type is LLMs.OPENAI_ASYNC:
        if args.openai_api_key is None:
            # Fixed: the original message was an f-string with no placeholders (F541).
            raise ValueError(
                "An OpenAI access key is required when using OpenAI models. Specify with `--openai-api-key`.")
        kwargs["api_key"] = args.openai_api_key
    return kwargs
def get_synthesizer_init_kwargs(args: argparse.Namespace) -> Dict[str, str]:
    """Build the keyword arguments for `Synthesizer.create()` from the parsed CLI arguments.

    Each synthesizer backend requires a different set of credentials; missing
    credentials for the selected backend raise an error up front.

    Args:
        args: Parsed command-line arguments.

    Returns:
        Backend-specific keyword arguments.

    Raises:
        ValueError: If a credential required by the selected synthesizer is missing.
    """
    kwargs = dict()
    synthesizer_type = Synthesizers(args.synthesizer)
    if synthesizer_type is Synthesizers.PICOVOICE_ORCA:
        if args.picovoice_access_key is None:
            raise ValueError(
                "Picovoice access key is required when using Picovoice TTS. Specify with `--picovoice-access-key`.")
        kwargs["access_key"] = args.picovoice_access_key
        kwargs["model_path"] = args.orca_model_path
        kwargs["library_path"] = args.orca_library_path
    elif synthesizer_type is Synthesizers.AZURE_TTS:
        if args.azure_speech_key is None or args.azure_speech_region is None:
            raise ValueError(
                "Azure speech key and region are required when using Azure TTS. "
                "Specify with `--azure-speech-key` and `--azure-speech-region`.")
        kwargs['speech_key'] = args.azure_speech_key
        kwargs['speech_region'] = args.azure_speech_region
    elif synthesizer_type is Synthesizers.AMAZON_POLLY:
        if args.aws_profile_name is None:
            raise ValueError(
                "AWS profile name is required when using AWS Polly. Specify with `--aws-profile-name`.")
        kwargs["aws_profile_name"] = args.aws_profile_name
    elif synthesizer_type is Synthesizers.ELEVENLABS or synthesizer_type is Synthesizers.ELEVENLABS_WEBSOCKET:
        if args.elevenlabs_api_key is None:
            raise ValueError(
                "Elevenlabs API key is required when using Elevenlabs TTS. Specify with `--elevenlabs-api-key`.")
        kwargs["api_key"] = args.elevenlabs_api_key
    elif synthesizer_type is Synthesizers.IBM_WATSON_TTS:
        if args.ibm_watson_api_key is None or args.ibm_watson_service_url is None:
            raise ValueError(
                "IBM Watson API key and service URL are required when using IBM Watson TTS. "
                "Specify with `--ibm-watson-api-key` and `--ibm-watson-service-url`.")
        kwargs["api_key"] = args.ibm_watson_api_key
        kwargs["service_url"] = args.ibm_watson_service_url
    elif synthesizer_type is Synthesizers.OPENAI_TTS:
        if args.openai_api_key is None:
            # Fixed: the original message was an f-string with no placeholders (F541).
            raise ValueError(
                "An OpenAI access key is required when using OpenAI models. Specify with `--openai-api-key`.")
        kwargs["api_key"] = args.openai_api_key
    return kwargs
async def _run_benchmark_iteration(
        llm: LLM,
        synthesizer: Synthesizer,
        sentence: str,
        timer: Timer,
        stats: Stats,
        results_folder: str,
        verbose: bool,
        counter: int) -> None:
    """Run one LLM -> TTS interaction and record its timing in `stats`.

    The statement order is timing-critical: the timer must be reset and the
    LLM request logged before synthesis starts, and first-audio must be
    awaited before timings are read.

    Args:
        llm: LLM used to generate the response text stream.
        synthesizer: TTS engine consuming the LLM's text stream.
        sentence: Prompt sent to the LLM.
        timer: Shared Timer instance (also used internally by the synthesizer).
        stats: Accumulator for the resulting TimingResult.
        results_folder: Folder where per-iteration audio is saved (verbose mode).
        verbose: If True, print per-iteration details and save the audio.
        counter: Iteration index, used to name the saved audio file.
    """
    timer.reset()
    timer.log_time_llm_request()
    # Async synthesizers consume the async token stream; sync ones use the blocking query.
    if synthesizer.is_async:
        await synthesizer.synthesize_async(text_stream=llm.query_async(sentence))
    else:
        synthesizer.synthesize(text_stream=llm.query(sentence))
    # Block until the first audio chunk is available so the timings below are final.
    timer.wait_for_first_audio()
    timing_result = TimingResult(
        voice_assistant_response_time=timer.voice_assistant_response_time(),
        time_to_first_token=timer.time_to_first_token(),
        first_token_to_speech=timer.first_token_to_speech(),
        tts_process_seconds=timer.tts_process_seconds(),
        num_words=len(llm.last_response.split()),
        num_tokens_per_second=timer.num_tokens_per_second())
    stats.accumulate(timing_result=timing_result)
    if verbose:
        print(f"Question: {sentence}")
        print(f"LLM response: {llm.last_response}")
        print(f"Voice Assistant Response Time: {timing_result.voice_assistant_response_time:.2f} s")
        print(f"Time to First Token: {timing_result.time_to_first_token:.2f} s")
        print(f"First Token to Speech: {timing_result.first_token_to_speech:.2f} s")
        # Wait for synthesis to fully finish before saving the complete audio.
        timer.wait_for_last_audio()
        audio_path = os.path.join(results_folder, f"audio_{counter}.wav")
        synthesizer.save_and_reset_last_audio(audio_path)
        print(f"Saved audio to `{audio_path}`")
        print()
async def main(args: argparse.Namespace) -> None:
    """Set up the LLM and synthesizer, run the benchmark loop, and save results."""
    tts_type = Synthesizers(args.synthesizer)
    llm_type = get_default_llm_type(tts_type)

    dataset = TextDataset.create(DEFAULT_DATASET)
    timer = Timer()

    synthesizer = Synthesizer.create(
        Synthesizers(args.synthesizer),
        timer=timer,
        **get_synthesizer_init_kwargs(args))
    llm = LLM.create(llm_type, **get_llm_init_kwargs(args))

    benchmark_sentences = dataset.get_random_sentences(num=args.num_interactions)
    stats = Stats(tts=tts_type, results_folder=args.results_folder)

    print("Running benchmark ...")
    for index, prompt in enumerate(tqdm(benchmark_sentences)):
        await _run_benchmark_iteration(
            llm=llm,
            synthesizer=synthesizer,
            sentence=prompt,
            timer=timer,
            stats=stats,
            results_folder=args.results_folder,
            verbose=args.verbose,
            counter=index)

    stats.save_results()
    synthesizer.terminate()
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Data-driven CLI spec: (flags, add_argument keyword arguments), in display order.
    cli_options = [
        (("--openai-api-key",), dict(
            required=True,
            help="Open AI API key. Needed when using openai models")),
        (("--engine",), dict(
            dest="synthesizer",
            default=Synthesizers.PICOVOICE_ORCA.value,
            choices=[s.value for s in Synthesizers],
            help="Choose voice synthesizer to use")),
        (("--picovoice-access-key",), dict(
            default=None,
            help="AccessKey obtained from Picovoice Console")),
        (("--orca-model-path",), dict(
            default=None,
            help="Path to the model parameters file")),
        (("--orca-library-path",), dict(
            default=None,
            help="Path to Orca's dynamic library")),
        (("--aws-profile-name",), dict(
            default=None,
            help="AWS profile name to use for AWS Polly")),
        (("--azure-speech-key",), dict(
            default=None,
            help="Azure access token")),
        (("--azure-speech-region",), dict(
            default=None,
            help="Azure speech location")),
        (("--ibm-watson-api-key",), dict(
            default=None,
            help="IBM Watson API key")),
        (("--ibm-watson-service-url",), dict(
            default=None,
            help="IBM Watson service URL")),
        (("--elevenlabs-api-key",), dict(
            default=None,
            help="Elevenlabs API key")),
        (("--num-interactions",), dict(
            type=int,
            default=200,
            help="Number of interactions to benchmark")),
        (("--results-folder",), dict(
            default=DEFAULT_RESULTS_FOLDER,
            help="Folder to save results")),
        (("--verbose",), dict(
            action="store_true",
            help="Print verbose output")),
    ]
    for flags, options in cli_options:
        parser.add_argument(*flags, **options)

    asyncio.run(main(parser.parse_args()))