-
Notifications
You must be signed in to change notification settings - Fork 3
/
preprocess.py
110 lines (86 loc) · 4.56 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import argparse
from multiprocessing import cpu_count
import os
from tqdm import tqdm
from datasets import preprocessor
from hparams import hparams
def preprocess(args, input_folders, out_dir, hparams):
    """Build the training data under *out_dir* and write its metadata file.

    Creates the ``mels`` and ``audio`` sub-directories of *out_dir* (if they
    do not exist yet), hands them to ``preprocessor.build_from_path`` together
    with *hparams*, *input_folders* and ``args.n_jobs``, and forwards the
    returned metadata to ``write_metadata``.

    Args:
        args: Parsed command-line namespace; only ``args.n_jobs`` is read here.
        input_folders: Dataset folder(s) passed through to the preprocessor.
        out_dir: Root output directory for this preprocessing run.
        hparams: Hyperparameter object passed through to the preprocessor.
    """
    mel_dir, wav_dir = (os.path.join(out_dir, sub) for sub in ('mels', 'audio'))
    for folder in (mel_dir, wav_dir):
        os.makedirs(folder, exist_ok=True)

    entries = preprocessor.build_from_path(
        hparams, input_folders, mel_dir, wav_dir, args.n_jobs, tqdm=tqdm)
    write_metadata(entries, out_dir)
def write_metadata(metadata, out_dir):
    """Write ``train.txt`` into *out_dir* and print summary statistics.

    Every metadata record becomes one line of ``train.txt``: its fields are
    converted with ``str`` and joined by ``|``.

    Args:
        metadata: Sized, re-iterable collection of indexable records. Index 2
            is read as the audio timestep count, index 3 as the mel frame
            count and the last field as the text.
        out_dir: Existing directory that receives ``train.txt``.
    """
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        f.writelines('|'.join(str(x) for x in m) + '\n' for m in metadata)

    # Coerce once so the sums and the maxima are computed on the same integers
    # (the original compared raw m[2] values for the audio maximum).
    mel_lengths = [int(m[3]) for m in metadata]
    audio_lengths = [int(m[2]) for m in metadata]
    text_lengths = [len(m[-1]) for m in metadata]

    mel_frames = sum(mel_lengths)
    timesteps = sum(audio_lengths)
    # NOTE(review): this reads the module-level ``hparams``; confirm that
    # ``hparams.parse`` updates it in place, otherwise a command-line override
    # of ``sample_rate`` is not reflected in the reported hours.
    sr = hparams.sample_rate
    hours = timesteps / sr / 3600
    print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format(
        len(metadata), mel_frames, timesteps, hours))
    # default=0 keeps the summary working when no utterance was produced,
    # instead of failing with "max() arg is an empty sequence".
    print('Max input length (text chars): {}'.format(max(text_lengths, default=0)))
    print('Max mel frames length: {}'.format(max(mel_lengths, default=0)))
    print('Max audio timesteps length: {}'.format(max(audio_lengths, default=0)))
def norm_data(args):
    """Resolve the dataset folder(s) selected by the command-line arguments.

    For the LJSpeech datasets the single folder ``<input_dir>/<dataset>`` is
    returned. For M-AILABS the language, voice, reader and (unless books are
    merged) book are validated, the reader and book against the directories
    found on disk below ``<input_dir>/<language>/by_book/<voice>``.

    Args:
        args: Parsed namespace providing ``merge_books``, ``dataset`` and
            ``input_dir``, plus ``language``, ``voice``, ``reader`` and
            ``book`` when the dataset is M-AILABS.

    Returns:
        A list of folder paths: one entry, or one per book of the reader when
        ``args.merge_books`` is the string ``'True'``.

    Raises:
        ValueError: If the dataset, language, voice, reader or book is not
            among the supported / available values.
    """
    def list_subdirs(parent):
        # Names of the entries directly below `parent` that are directories.
        return [name for name in os.listdir(parent)
                if os.path.isdir(os.path.join(parent, name))]

    use_all_books = args.merge_books == 'True'

    print('Selecting data folders..')
    supported_datasets = ['LJSpeech-1.0', 'LJSpeech-1.1', 'M-AILABS']
    if not (args.dataset in supported_datasets):
        raise ValueError('dataset value entered {} does not belong to supported datasets: {}'.format(
            args.dataset, supported_datasets))

    if args.dataset.startswith('LJSpeech'):
        return [os.path.join(args.input_dir, args.dataset)]

    if args.dataset != 'M-AILABS':
        return None

    supported_languages = ['en_US', 'en_UK', 'fr_FR', 'it_IT', 'de_DE', 'es_ES', 'ru_RU',
                           'uk_UK', 'pl_PL', 'nl_NL', 'pt_PT', 'fi_FI', 'se_SE', 'tr_TR', 'ar_SA']
    if not (args.language in supported_languages):
        raise ValueError('Please enter a supported language to use from M-AILABS dataset! \n{}'.format(
            supported_languages))

    supported_voices = ['female', 'male', 'mix']
    if not (args.voice in supported_voices):
        raise ValueError('Please enter a supported voice option to use from M-AILABS dataset! \n{}'.format(
            supported_voices))

    voice_dir = os.path.join(args.input_dir, args.language, 'by_book', args.voice)
    supported_readers = list_subdirs(voice_dir)
    if not (args.reader in supported_readers):
        raise ValueError('Please enter a valid reader for your language and voice settings! \n{}'.format(
            supported_readers))

    reader_dir = os.path.join(voice_dir, args.reader)
    supported_books = list_subdirs(reader_dir)
    if use_all_books:
        return [os.path.join(reader_dir, title) for title in supported_books]

    if not (args.book in supported_books):
        raise ValueError('Please enter a valid book for your reader settings! \n{}'.format(
            supported_books))
    return [os.path.join(reader_dir, args.book)]
def run_preprocess(args, hparams):
    """Select the input folders and run preprocessing into the output folder.

    The input folders come from ``norm_data(args)``; the output folder is
    ``args.output_dir`` joined with ``args.output``.

    Args:
        args: Parsed command-line namespace.
        hparams: Hyperparameter object forwarded to ``preprocess``.
    """
    sources = norm_data(args)
    preprocess(args, sources, os.path.join(args.output_dir, args.output), hparams)
def main():
    """Parse the command-line options and run the preprocessing pipeline.

    Hyperparameter overrides given via ``--hparams`` are applied with
    ``hparams.parse`` and the result is passed to ``run_preprocess`` together
    with the parsed arguments.
    """
    print('initializing preprocessing..')
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_dir', default='')
    parser.add_argument('--output_dir', default='')
    parser.add_argument('--hparams', default='',
                        help='Hyperparameter overrides as a comma-separated list of name=value pairs')
    parser.add_argument('--dataset', default='LJSpeech-1.1')
    parser.add_argument('--language', default='en_US')
    parser.add_argument('--voice', default='female')
    parser.add_argument('--reader', default='mary_ann')
    # Let argparse reject anything but the two accepted strings; the former
    # `assert` check would be skipped when Python runs with -O.
    parser.add_argument('--merge_books', default='False', choices=('False', 'True'))
    parser.add_argument('--book', default='northandsouth')
    parser.add_argument('--output', default='training_data_dual_channels')
    # A single job is the default; the cpu_count()-based default was disabled.
    parser.add_argument('--n_jobs', type=int, default=1)
    args = parser.parse_args()

    modified_hp = hparams.parse(args.hparams)

    run_preprocess(args, modified_hp)
    print('Warning: preprocessed format is audio [T], mel [T, C] linear[T, C]')
# Run the command-line entry point only when this file is executed as a script.
if __name__ == '__main__':
    main()