-
Notifications
You must be signed in to change notification settings - Fork 65
/
Copy pathhparams.py
197 lines (142 loc) · 8.46 KB
/
hparams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np
hparams = tf.contrib.training.HParams(
name = "Tacotron-Wavenet-Vocoder",
# tacotron hyper parameter
cleaners = 'korean_cleaners', # 'korean_cleaners' or 'english_cleaners'
skip_path_filter = False, # npz파일에서 불필요한 것을 거르는 작업을 할지 말지 결정. receptive_field 보다 짧은 data를 걸러야 하기 때문에 해 줘야 한다.
use_lws = False,
# Audio
sample_rate = 24000, #
# shift can be specified by either hop_size(우선) or frame_shift_ms
hop_size = 300, # frame_shift_ms = 12.5ms
fft_size=2048, # n_fft. 주로 1024로 되어있는데, tacotron에서 2048사용
win_size = 1200, # 50ms
num_mels=80,
#Spectrogram Pre-Emphasis (Lfilter: Reduce spectrogram noise and helps model certitude levels. Also allows for better G&L phase reconstruction)
preemphasize = True, #whether to apply filter
preemphasis = 0.97,
min_level_db = -100,
ref_level_db = 20,
signal_normalization = True, #Whether to normalize mel spectrograms to some predefined range (following below parameters)
allow_clipping_in_normalization = True, #Only relevant if mel_normalization = True
symmetric_mels = True, #Whether to scale the data to be symmetric around 0. (Also multiplies the output range by 2, faster and cleaner convergence)
max_abs_value = 4., #max absolute value of data. If symmetric, data will be [-max, max] else [0, max] (Must not be too big to avoid gradient explosion, not too small for fast convergence)
rescaling=True,
rescaling_max=0.999,
trim_silence = True, #Whether to clip silence in Audio (at beginning and end of audio only, not the middle)
#M-AILABS (and other datasets) trim params (there parameters are usually correct for any data, but definitely must be tuned for specific speakers)
trim_fft_size = 512,
trim_hop_size = 128,
trim_top_db = 23,
clip_mels_length = True, #For cases of OOM (Not really recommended, only use if facing unsolvable OOM errors, also consider clipping your samples to smaller chunks)
max_mel_frames = 1000, #Only relevant when clip_mels_length = True, please only use after trying output_per_steps=3 and still getting OOM errors.
l2_regularization_strength = 0, # Coefficient in the L2 regularization.
sample_size = 15000, # Concatenate and cut audio samples to this many samples
silence_threshold = 0, # Volume threshold below which to trim the start and the end from the training set samples. e.g. 2
filter_width = 2,
gc_channels = 32, # global_condition_vector의 차원. 이것 지정함으로써, global conditioning을 모델에 반영하라는 의미가 된다.
input_type="raw", # 'mulaw-quantize', 'mulaw', 'raw', mulaw, raw 2가지는 scalar input
scalar_input = True, # input_type과 맞아야 함.
dilations = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512,
1, 2, 4, 8, 16, 32, 64, 128, 256, 512,
1, 2, 4, 8, 16, 32, 64, 128, 256, 512,
1, 2, 4, 8, 16, 32, 64, 128, 256, 512,
1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
residual_channels = 32,
dilation_channels = 32,
quantization_channels = 256,
out_channels = 30, # discretized_mix_logistic_loss를 적용하기 때문에, 3의 배수
skip_channels = 512,
use_biases = True,
initial_filter_width = 32,
upsample_factor=[5, 5, 12], # np.prod(upsample_factor) must equal to hop_size
# wavenet training hp
wavenet_batch_size = 8, # 16--> OOM. wavenet은 batch_size가 고정되어야 한다.
store_metadata = False,
num_steps = 200000, # Number of training steps
#Learning rate schedule
wavenet_learning_rate = 1e-3, #wavenet initial learning rate
wavenet_decay_rate = 0.5, #Only used with 'exponential' scheme. Defines the decay rate.
wavenet_decay_steps = 300000, #Only used with 'exponential' scheme. Defines the decay steps.
#Regularization parameters
wavenet_clip_gradients = False, #Whether the clip the gradients during wavenet training.
optimizer = 'adam',
momentum = 0.9, # 'Specify the momentum to be used by sgd or rmsprop optimizer. Ignored by the adam optimizer.
max_checkpoints = 3, # 'Maximum amount of checkpoints that will be kept alive. Default: '
####################################
####################################
####################################
# TACOTRON HYPERPARAMETERS
# Training
adam_beta1 = 0.9,
adam_beta2 = 0.999,
use_fixed_test_inputs = False,
tacotron_initial_learning_rate = 1e-3,
decay_learning_rate_mode = 0, # True in deepvoice2 paper
initial_data_greedy = True,
initial_phase_step = 8000, # 여기서 지정한 step 이전에는 data_dirs의 각각의 디렉토리에 대하여 같은 수의 example을 만들고, 이후, weght 비듈에 따라 ... 즉, 아래의 'main_data_greedy_factor'의 영향을 받는다.
main_data_greedy_factor = 0,
main_data = [''], # 이곳에 있는 directory 속에 있는 data는 가중치를 'main_data_greedy_factor' 만큼 더 준다.
prioritize_loss = False,
# Model
model_type = 'deepvoice', # [single, simple, deepvoice]
speaker_embedding_size = 16,
embedding_size = 256, # 'ᄀ', 'ᄂ', 'ᅡ' 에 대한 embedding dim
dropout_prob = 0.5,
# Encoder
enc_prenet_sizes = [256, 128],
enc_bank_size = 16, # cbhg에서 conv1d의 out kernel size를 1,2,..., enc_bank_size 까지 반복 적용
enc_bank_channel_size = 128, # cbhg에서 conv1d의 out channel size
enc_maxpool_width = 2, # cbhg에서 max pooling size
enc_highway_depth = 4,
enc_rnn_size = 128,
enc_proj_sizes = [128, 128], # cbhg, projection layer (2번째 conv1d), channel size
enc_proj_width = 3, #cbhg, projection layer (2번째 conv1d), kernel size
# Attention
attention_type = 'bah_mon_norm',
attention_size = 256,
attention_state_size = 256,
# Decoder recurrent network
dec_layer_num = 2,
dec_rnn_size = 256,
# Decoder
dec_prenet_sizes = [256, 128],
post_bank_size = 8,
post_bank_channel_size = 128,
post_maxpool_width = 2,
post_highway_depth = 4,
post_rnn_size = 128,
post_proj_sizes = [256, 80], # num_mels=80
post_proj_width = 3,
reduction_factor = 5,
# Eval
min_tokens = 30, #originally 50, 30 is good for korean, text를 token으로 쪼갰을 때, 최소 길이 이상되어야 train에 사용
min_iters = 30, # min_n_frame = reduction_factor * min_iters, reduction_factor와 곱해서 min_n_frame을 설정한다.
max_iters = 200,
skip_inadequate = False,
griffin_lim_iters = 60,
power = 1.5,
# 사용안되는 것들인데, error방지
recognition_loss_coeff = 0.2, # 이게 1이 아니면, 'ignore_recognition_level' = 1,2에 걸리는 data는 무시됨.
ignore_recognition_level = 0
)
if hparams.use_lws:
# Does not work if fft_size is not multiple of hop_size!!
# sample size = 20480, hop_size=256=12.5ms. fft_size는 window_size를 결정하는데, 2048을 시간으로 환산하면 2048/20480 = 0.1초=100ms
hparams.sample_rate = 20480 #
# shift can be specified by either hop_size(우선) or frame_shift_ms
hparams.hop_size = 256 # frame_shift_ms = 12.5ms
hparams.frame_shift_ms=None # hop_size= sample_rate * frame_shift_ms / 1000
hparams.fft_size=2048 # 주로 1024로 되어있는데, tacotron에서 2048사용==> output size = 1025
hparams.win_size = None # 256x4 --> 50ms
else:
# 미리 정의되 parameter들로 부터 consistant하게 정의해 준다.
hparams.num_freq = int(hparams.fft_size/2 + 1)
hparams.frame_shift_ms = hparams.hop_size * 1000.0/ hparams.sample_rate # hop_size= sample_rate * frame_shift_ms / 1000
hparams.frame_length_ms = hparams.win_size * 1000.0/ hparams.sample_rate
def hparams_debug_string():
values = hparams.values()
hp = [' %s: %s' % (name, values[name]) for name in sorted(values)]
return 'Hyperparameters:\n' + '\n'.join(hp)