-
Notifications
You must be signed in to change notification settings - Fork 803
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'modelscope:main' into main
- Loading branch information
Showing
34 changed files
with
5,126 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../MinMo_gitlab |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
121 changes: 121 additions & 0 deletions
121
examples/aishell/e_paraformer/conf/e_paraformer_conformer_12e_6d_2048_256.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
|
||
# E-Paraformer recipe configuration: Conformer encoder with 12 blocks,
# 6-block decoder, 2048 feed-forward units, 256-dim attention.
#
# NOTE(review): this text was recovered from a flattened diff rendering in
# which leading indentation had been stripped. The nesting below follows the
# "<component>: <Class>" / "<component>_conf: {...}" pairing visible in the
# key names. Confirm against the upstream file, in particular that
# `preprocessor_speech*` belongs under `dataset_conf` and that `normalize`
# is a top-level key rather than a member of `ctc_conf`.

# network architecture
model: EParaformer
model_conf:
    ctc_weight: 0.0
    lsm_weight: 0.1
    length_normalized_loss: false
    predictor_weight: 1.0
    predictor_bias: 2
    sampling_ratio: 0.4
    use_1st_decoder_loss: true

# encoder
encoder: ConformerEncoder
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder architecture type
    normalize_before: true
    pos_enc_layer_type: rel_pos
    selfattention_layer_type: rel_selfattn
    activation_type: swish
    macaron_style: true
    use_cnn_module: true
    cnn_module_kernel: 15

# decoder
decoder: ParaformerSANDecoder
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# predictor
predictor: PifPredictor
predictor_conf:
    idim: 256
    threshold: 1.0
    l_order: 1
    r_order: 1
    sigma: 0.5
    bias: 0.0
    sigma_heads: 4

# frontend related
frontend: WavFrontend
frontend_conf:
    fs: 16000
    window: hamming
    n_mels: 80
    frame_length: 25
    frame_shift: 10
    lfr_m: 1
    lfr_n: 1

specaug: SpecAug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2

train_conf:
    accum_grad: 4
    grad_clip: 5
    max_epoch: 150
    keep_nbest_models: 20
    avg_nbest_model: 15
    log_interval: 50

optim: adam
optim_conf:
    lr: 0.0005
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 30000

dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
    batch_sampler: EspnetStyleBatchSampler
    batch_type: length # example or length
    batch_size: 25000 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,
    buffer_size: 1024
    shuffle: True
    num_workers: 4
    preprocessor_speech: SpeechPreprocessSpeedPerturb
    preprocessor_speech_conf:
        speed_perturb: [0.9, 1.0, 1.1]

tokenizer: CharTokenizer
tokenizer_conf:
    unk_symbol: <unk>

ctc_conf:
    dropout_rate: 0.0
    ctc_type: builtin
    reduce: true
    ignore_nan_grad: true
normalize: null
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

# Runs funasr.bin.inference on a single wav file using a trained checkpoint
# and writes the result under ./outputs/debug.
# Every path below is machine-specific; edit them before running.

# Experiment directory: holds config.yaml and the checkpoint to load.
exp_dir="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3"
# Directory holding the token list and the CMVN statistics file.
feats_dir="/mnt/nfs/zhifu.gzf/data/AISHELL-1-feats/DATA/data"
# Audio file to decode.
input_wav="/mnt/nfs/zhifu.gzf/data/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav"

# The final argument deliberately has no trailing backslash, so that text
# appended after this command is not swallowed as an extra argument.
python -m funasr.bin.inference \
  --config-path="${exp_dir}" \
  --config-name="config.yaml" \
  ++init_param="${exp_dir}/model.pt.ep38" \
  ++tokenizer_conf.token_list="${feats_dir}/zh_token_list/char/tokens.txt" \
  ++frontend_conf.cmvn_file="${feats_dir}/train/am.mvn" \
  ++input="${input_wav}" \
  ++output_dir="./outputs/debug" \
  ++device="cuda:0"
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

# Launches single-node training / fine-tuning through torchrun, starting one
# process per GPU listed in CUDA_VISIBLE_DEVICES. Both stdout and stderr of
# the run are redirected to ${log_file}.
# Every path below is machine-specific; edit them before running.

# which gpu to train or finetune
export CUDA_VISIBLE_DEVICES="0,1"
# Number of comma-separated device ids.
gpu_num=$(printf '%s\n' "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')

# data dir, which contains: train.jsonl, val.jsonl, tokens.json, am.mvn
data_dir="/Users/zhifu/funasr1.0/data/list"

## generate jsonl from wav.scp and text.txt
#python -m funasr.datasets.audio_datasets.scp2jsonl \
#++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \
#++data_type_list='["source", "target"]' \
#++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl

train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
tokens="${data_dir}/tokens.json"
cmvn_file="${data_dir}/am.mvn"

# exp output dir
output_dir="/Users/zhifu/exp"
log_file="${output_dir}/log.txt"

# The config directory is resolved relative to the directory the script is
# launched from, so run it from the recipe directory.
workspace=$(pwd)
config="paraformer_conformer_12e_6d_2048_256.yaml"

# Checkpoint used to initialise the model.
init_param="${output_dir}/model.pt"

mkdir -p "${output_dir}"
echo "log_file: ${log_file}"

torchrun \
  --nnodes 1 \
  --nproc_per_node "${gpu_num}" \
  ../../../funasr/bin/train.py \
  --config-path "${workspace}/conf" \
  --config-name "${config}" \
  ++train_data_set_list="${train_data}" \
  ++valid_data_set_list="${val_data}" \
  ++tokenizer_conf.token_list="${tokens}" \
  ++frontend_conf.cmvn_file="${cmvn_file}" \
  ++dataset_conf.batch_size=32 \
  ++dataset_conf.batch_type="example" \
  ++dataset_conf.num_workers=4 \
  ++train_conf.max_epoch=150 \
  ++optim_conf.lr=0.0002 \
  ++init_param="${init_param}" \
  ++output_dir="${output_dir}" &> "${log_file}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
#!/bin/bash

# Copyright 2017 Xingyu Na
# Apache 2.0

# Builds wav.scp and text files for the AISHELL train, dev and test splits.
#
# Usage: <script> <audio-path> <text-path> <output-path>
#   <audio-path>   directory searched recursively for *.wav files
#   <text-path>    directory containing aishell_transcript_v0.8.txt
#   <output-path>  results are written to <output-path>/data/{train,dev,test}
#
# Requires utils/filter_scp.pl to be reachable from the current directory.

#. ./path.sh || exit 1;

if [ $# != 3 ]; then
  echo "Usage: $0 <audio-path> <text-path> <output-path>"
  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript data"
  exit 1;
fi

aishell_audio_dir=$1
aishell_text=$2/aishell_transcript_v0.8.txt
output_dir=$3

# data directory check
# Done before anything is created so that a bad invocation does not leave an
# empty directory tree behind under <output-path>.
if [ ! -d "$aishell_audio_dir" ] || [ ! -f "$aishell_text" ]; then
  echo "Error: $0 requires two directory arguments"
  exit 1;
fi

train_dir=$output_dir/data/local/train
dev_dir=$output_dir/data/local/dev
test_dir=$output_dir/data/local/test
tmp_dir=$output_dir/data/local/tmp

mkdir -p "$train_dir"
mkdir -p "$dev_dir"
mkdir -p "$test_dir"
mkdir -p "$tmp_dir"

# find wav audio file for train, dev and test resp.
find "$aishell_audio_dir" -iname "*.wav" > "$tmp_dir/wav.flist"
# Arithmetic expansion strips the padding some wc implementations emit.
n=$(( $(wc -l < "$tmp_dir/wav.flist") ))
if [ "$n" -ne 141925 ]; then
  echo "Warning: expected 141925 data files, found $n"
fi

# Split the file list by the split name embedded in each path.
grep -i "wav/train" "$tmp_dir/wav.flist" > "$train_dir/wav.flist" || exit 1;
grep -i "wav/dev" "$tmp_dir/wav.flist" > "$dev_dir/wav.flist" || exit 1;
grep -i "wav/test" "$tmp_dir/wav.flist" > "$test_dir/wav.flist" || exit 1;

# ':?' aborts instead of expanding to an empty path if tmp_dir is ever unset.
rm -r -- "${tmp_dir:?}"

# Transcriptions preparation
for dir in "$train_dir" "$dev_dir" "$test_dir"; do
  echo "Preparing $dir transcriptions"
  # Utterance id = wav file name without directory and without ".wav".
  sed -e 's/\.wav//' "$dir/wav.flist" | awk -F '/' '{print $NF}' > "$dir/utt.list"
  paste -d' ' "$dir/utt.list" "$dir/wav.flist" > "$dir/wav.scp_all"
  # Keep only transcript lines whose first field is one of this split's ids.
  utils/filter_scp.pl -f 1 "$dir/utt.list" "$aishell_text" > "$dir/transcripts.txt"
  # Rebuild the id list from the transcripts that were found, then use it to
  # drop wav entries that have no transcript.
  awk '{print $1}' "$dir/transcripts.txt" > "$dir/utt.list"
  utils/filter_scp.pl -f 1 "$dir/utt.list" "$dir/wav.scp_all" | sort -u > "$dir/wav.scp"
  sort -u "$dir/transcripts.txt" > "$dir/text"
done

mkdir -p "$output_dir/data/train" "$output_dir/data/dev" "$output_dir/data/test"

for f in wav.scp text; do
  cp "$train_dir/$f" "$output_dir/data/train/$f" || exit 1;
  cp "$dev_dir/$f" "$output_dir/data/dev/$f" || exit 1;
  cp "$test_dir/$f" "$output_dir/data/test/$f" || exit 1;
done

echo "$0: AISHELL data preparation succeeded"
exit 0;
105 changes: 105 additions & 0 deletions
105
examples/aishell/e_paraformer/local/download_and_untar.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
#!/usr/bin/env bash

# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
# 2017 Xingyu Na
# Apache 2.0

# Downloads <url-base>/<corpus-part>.tgz into <data-base> (unless an archive
# of a known size is already there), unpacks it, and for data_aishell also
# unpacks the nested ./*.tar.gz archives found under <corpus-part>/wav.
# A marker file <data-base>/<corpus-part>/.complete is written on success so
# that later runs become a no-op.
#
# Usage: <script> [--remove-archive] <data-base> <url-base> <corpus-part>
# Exit status: 0 on success or when already complete, 1 on any error.

remove_archive=false

if [ "$1" == --remove-archive ]; then
  remove_archive=true
  shift
fi

if [ $# -ne 3 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
  # Stop here; continuing with the wrong number of arguments only produces
  # a less helpful error further down.
  exit 1;
fi

data=$1
url=$2
part=$3

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data"
  exit 1;
fi

# Make the path absolute. The script changes directory into $data and keeps
# using $data afterwards, which would resolve incorrectly for a relative path.
data=$(cd "$data" && pwd) || exit 1

part_ok=false
list="data_aishell resource_aishell"
for x in $list; do
  if [ "$part" == "$x" ]; then part_ok=true; fi
done
if [ "$part_ok" != true ]; then
  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
  exit 1;
fi

if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1;
fi

archive=$data/$part.tgz

if [ -f "$data/$part/.complete" ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0;
fi

# sizes of the archive files in bytes.
sizes="15582913665 1246920"

if [ -f "$archive" ]; then
  # Byte count without parsing `ls`; tr strips padding some wc builds emit.
  size=$(wc -c < "$archive" | tr -d '[:space:]')
  size_ok=false
  for s in $sizes; do
    if [ "$s" == "$size" ]; then size_ok=true; fi
  done
  if [ "$size_ok" != true ]; then
    echo "$0: removing existing file $archive because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm -- "$archive"
  else
    echo "$archive exists and appears to be complete."
  fi
fi

if [ ! -f "$archive" ]; then
  if ! command -v wget >/dev/null; then
    echo "$0: wget is not installed."
    exit 1;
  fi
  full_url=$url/$part.tgz
  echo "$0: downloading data from $full_url. This may take some time, please be patient."

  # wget saves into the current directory.
  cd "$data" || exit 1
  if ! wget --no-check-certificate "$full_url"; then
    echo "$0: error executing wget $full_url"
    exit 1;
  fi
fi

cd "$data" || exit 1

if ! tar -xvzf "$part.tgz"; then
  echo "$0: error un-tarring archive $archive"
  exit 1;
fi

if [ "$part" == "data_aishell" ]; then
  cd "$data/$part/wav" || exit 1
  for wav in ./*.tar.gz; do
    # If the glob matched nothing it is left as a literal pattern; skip it.
    [ -e "$wav" ] || continue
    echo "Extracting wav from $wav"
    if ! tar -zxf "$wav"; then
      echo "$0: error un-tarring archive $wav"
      exit 1;
    fi
    rm -- "$wav"
  done
fi

# Written last, so an interrupted or failed unpack is never recorded as
# complete and the next run retries instead of exiting early.
touch "$data/$part/.complete"

echo "$0: Successfully downloaded and un-tarred $archive"

if [ "$remove_archive" == true ]; then
  echo "$0: removing $archive file since --remove-archive option was supplied."
  rm -- "$archive"
fi

exit 0;
Oops, something went wrong.