forked from stlukey/whispercpp.py
-
Notifications
You must be signed in to change notification settings - Fork 0
/
whispercpp.pyx
115 lines (92 loc) · 3.47 KB
/
whispercpp.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!python
# cython: language_level=3
import ffmpeg
import numpy as np
import requests
import os
from pathlib import Path
# Directory that holds the downloaded ggml model files: `~/.ggml-models`
# with `~` expanded, stored as a plain string.
MODELS_DIR = str(Path('~/.ggml-models').expanduser())
# NOTE: this runs at import time, so importing the module prints the path.
print("Saving models to:", MODELS_DIR)
cimport numpy as cnp
# Default output sample rate that load_audio() requests from ffmpeg.
cdef int SAMPLE_RATE = 16000
# Default input file for Whisper.transcribe().
cdef char* TEST_FILE = 'test.wav'
# Default model size; Whisper.__init__() expands it to 'ggml-<size>.bin'.
cdef char* DEFAULT_MODEL = 'tiny'
# Language code assigned to whisper_full_params.language in default_params().
cdef char* LANGUAGE = b'fr'
# os.cpu_count() returns None when the CPU count cannot be determined, and
# coercing None to a C int raises TypeError while the module is being
# imported. Fall back to a single thread in that case.
cdef int N_THREADS = os.cpu_count() or 1
# Download URL of every supported ggml model, keyed by the model's file name.
# All files live under the same Hugging Face path and differ only in the size
# tag, so the table is generated from the list of tags.
MODELS = {
    f'ggml-{size}.bin':
        f'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-{size}.bin'
    for size in ('tiny', 'base', 'small', 'medium', 'large')
}
def model_exists(model):
    """Return True if the model file named *model* exists in MODELS_DIR."""
    model_path = Path(MODELS_DIR) / model
    return os.path.exists(model_path)
def download_model(model):
    """Download the model file *model* into MODELS_DIR unless already present.

    Args:
        model: File name of the model; must be a key of MODELS.

    Raises:
        KeyError: if *model* is not a key of MODELS.
        requests.HTTPError: if the server answers with an error status.
    """
    if model_exists(model):
        return

    print(f'Downloading {model}...')
    url = MODELS[model]
    os.makedirs(MODELS_DIR, exist_ok=True)

    target = Path(MODELS_DIR).joinpath(model)
    # Write to a side file and rename it only once the download has finished:
    # an interrupted transfer must not leave a truncated file under the final
    # name, because model_exists() would accept it on the next run.
    partial = target.with_name(target.name + '.part')

    # Stream the body in chunks instead of holding the whole model in memory.
    with requests.get(url, allow_redirects=True, stream=True) as r:
        # Without this check an HTTP error page would be saved as the model.
        r.raise_for_status()
        with open(partial, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)

    os.replace(partial, target)
cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] load_audio(bytes file, int sr = SAMPLE_RATE):
    """Decode an audio file with ffmpeg into mono float32 samples.

    ffmpeg is asked to write mono (ac=1) signed 16-bit little-endian PCM at
    sample rate `sr` to stdout; the samples are then divided by 2**15.

    Args:
        file: Path of the input file, handed to ffmpeg.input().
        sr: Output sample rate requested from ffmpeg.

    Returns:
        A 1-D, C-contiguous float32 array holding the scaled samples.

    Raises:
        RuntimeError: if building or running the ffmpeg command fails.
    """
    try:
        out = (
            ffmpeg.input(file, threads=0)
            .output(
                "-", format="s16le",
                acodec="pcm_s16le",
                ac=1, ar=sr
            )
            .run(
                cmd=["ffmpeg", "-nostdin"],
                capture_stdout=True,
                capture_stderr=True
            )
        )[0]
    except Exception as err:
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate; chaining keeps the real cause visible, since
        # the failure is not necessarily a missing file.
        raise RuntimeError(f"File '{file}' not found") from err

    # np.frombuffer already yields a 1-D array and astype() allocates a fresh
    # one, so no extra flatten() copy is needed.
    cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames = (
        np.frombuffer(out, np.int16).astype(np.float32) / 32768.0
    )
    return frames
cdef whisper_full_params default_params() nogil:
    # Build the whisper_full_params used by Whisper: the library defaults for
    # greedy sampling, with the module-level settings applied on top.
    cdef whisper_full_params params = whisper_full_default_params(
        whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY
    )
    params.print_realtime = True
    params.print_progress = True
    params.translate = False
    params.language = <const char *> LANGUAGE
    # This used to assign to a bare local `n_threads`, which was discarded on
    # return, so N_THREADS never reached the parameters. Set the struct field.
    params.n_threads = N_THREADS
    return params
cdef class Whisper:
    """Wrapper around a whisper context and its transcription parameters.

    Typical use: create an instance, call transcribe() on an audio file and
    pass the returned status to extract_text() to get the segment texts.
    """
    cdef whisper_context * ctx
    cdef whisper_full_params params

    def __init__(self, model=DEFAULT_MODEL, pb=None):
        """Download the model if needed and create the whisper context.

        Args:
            model: Model size tag (str or bytes); the file used is
                'ggml-<model>.bin' inside MODELS_DIR.
            pb: Unused; kept so existing callers keep working.

        Raises:
            KeyError: if 'ggml-<model>.bin' is not a key of MODELS and the
                file is not already in MODELS_DIR.
            RuntimeError: if the context could not be created.
        """
        cdef bytes model_b
        # DEFAULT_MODEL is a C `char*`, so the default value arrives here as
        # `bytes`; formatting it directly would give "ggml-b'tiny'.bin".
        if isinstance(model, bytes):
            model = model.decode('utf8')
        model_fullname = f'ggml-{model}.bin'
        download_model(model_fullname)
        model_path = Path(MODELS_DIR).joinpath(model_fullname)
        model_b = str(model_path).encode('utf8')
        self.ctx = whisper_init(model_b)
        # Fail here rather than hand a NULL context to whisper_full() later.
        if self.ctx == NULL:
            raise RuntimeError(f"Failed to load model '{model_path}'")
        self.params = default_params()
        # NOTE(review): any value returned by this call is discarded;
        # confirm whether it was meant to be printed.
        whisper_print_system_info()

    def __dealloc__(self):
        # ctx is still NULL when __init__ raised before creating the context.
        if self.ctx != NULL:
            whisper_free(self.ctx)
            self.ctx = NULL

    def transcribe(self, filename=TEST_FILE):
        """Load *filename* with load_audio() and run whisper_full() on it.

        Args:
            filename: Path of the audio file to transcribe.

        Returns:
            The integer result of whisper_full(); extract_text() treats any
            non-zero value as a failure.

        Raises:
            RuntimeError: if load_audio() cannot decode the file.
        """
        print("Loading data..")
        # NOTE(review): `<bytes>` is an unchecked cast, so a `str` filename is
        # passed through to load_audio() unchanged.
        cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames = load_audio(<bytes>filename)
        print("Transcribing..")
        return whisper_full(self.ctx, self.params, &frames[0], len(frames))

    def extract_text(self, int res):
        """Return the text of every segment of the last transcription.

        Args:
            res: Status returned by transcribe().

        Returns:
            A list with one decoded string per segment.

        Raises:
            RuntimeError: if *res* is non-zero.
        """
        print("Extracting text...")
        if res != 0:
            raise RuntimeError(f"Transcription failed with status {res}")
        cdef int n_segments = whisper_full_n_segments(self.ctx)
        return [
            whisper_full_get_segment_text(self.ctx, i).decode() for i in range(n_segments)
        ]