Merge branch 'main' of https://github.com/may-/joeynmt into joeys2t
may- committed Jan 20, 2024
2 parents 28c6251 + 8ebc0a0 commit eccf78d
Showing 99 changed files with 15,232 additions and 33,347 deletions.
13 changes: 8 additions & 5 deletions .github/workflows/main.yml
@@ -25,17 +25,20 @@ jobs:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- uses: actions/checkout@v3

# Set up Python 3.10
- name: Set up Python 3.10
# Set up Python 3.11
- name: Set up Python 3.11
uses: actions/setup-python@v3
with:
python-version: '3.10'
python-version: '3.11'

# Install packages
- name: Install dependencies
run: sudo apt-get install -y libsox-dev

- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install --upgrade torch torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
python -m pip install --upgrade pip sox soundfile ffmpeg
python -m pip install --upgrade torch torchaudio --index-url https://download.pytorch.org/whl/cpu
python -m pip install -e .
# Check code format
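As a quick local sanity check of the CPU-only torch/torchaudio stack installed above (a sketch, not part of the workflow; the set of available backends varies by platform):

```python
import torch
import torchaudio

# Confirm the CPU wheels installed and list the audio backends pulled in
# by libsox-dev / soundfile / ffmpeg; backend names vary by platform.
print(torch.__version__, torchaudio.__version__)
print(torchaudio.list_audio_backends())  # e.g. ['ffmpeg', 'soundfile', 'sox']
```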
2 changes: 1 addition & 1 deletion .pylintrc
@@ -396,4 +396,4 @@ exclude-protected=_asdict,_fields,_replace,_source,_make

# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=Exception
overgeneral-exceptions=builtins.BaseException,builtins.Exception
31 changes: 23 additions & 8 deletions .readthedocs.yml
@@ -1,13 +1,28 @@
formats:
- pdf
# Read the Docs configuration file for Sphinx projects
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

requirements_file: docs/requirements.txt
# Required
version: 2

# Set the OS, Python version and other tools you might need
build:
image: latest
os: ubuntu-22.04
tools:
python: "3.11"

python:
version: 3.6
setup_py_install: true
pip_install: false
# Build documentation in the "docs/" directory with Sphinx
sphinx:
configuration: docs/source/conf.py
fail_on_warning: false

# Optionally build your docs in additional formats such as PDF and ePub
formats:
- pdf

# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
install:
- requirements: docs/requirements.txt
- requirements: requirements.txt
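To reproduce this documentation build locally, a rough equivalent (assuming the two requirements files above are installed; the output directory is an arbitrary choice, not something Read the Docs mandates):

```python
import subprocess

# Rough local stand-in for the Read the Docs Sphinx build configured above.
subprocess.run(
    ["python", "-m", "sphinx", "-b", "html", "docs/source", "docs/_build/html"],
    check=True,
)
```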
26 changes: 12 additions & 14 deletions README.md
@@ -1,9 +1,9 @@
#   ![Joey-NMT](joey2-small.png) Joey S2T
#   ![Joey-S2T](joey2-small.png) Joey S2T
[![build](https://github.com/may-/joeys2t/actions/workflows/main.yml/badge.svg)](https://github.com/may-/joeys2t/actions/workflows/main.yml)
[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)


JoeyS2T is an extention of [JoeyNMT](https://github.com/joeynmt/joeynmt) for Speech-to-Text tasks.
JoeyS2T is an extension of [JoeyNMT](https://github.com/joeynmt/joeynmt) for Speech-to-Text tasks.


## What's new
@@ -15,7 +15,7 @@ JoeyS2T is an extention of [JoeyNMT](https://github.com/joeynmt/joeynmt) for Speech-to-Text tasks.
Joey S2T implements the following features:
- Transformer Encoder-Decoder
- 1d-Conv Subsampling
- Cross-entropy and CTC joint obvective
- Cross-entropy and CTC joint objective
- Mel filterbank spectrogram extraction
- CMVN, SpecAugment
- WER evaluation
@@ -28,16 +28,16 @@ Furthermore, all the functionalities in JoeyNMT v2 are also available from JoeyS2T:
- Attention visualization
- Learning curve plotting
- Scoring hypotheses and references

- Multilingual translation with language tags
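The mel filterbank and CMVN steps listed above can be sketched with torchaudio; this is a minimal illustration, and the 80 mel bins and utterance-level normalization are assumptions rather than JoeyS2T's actual defaults:

```python
import torchaudio

# Minimal sketch: mel filterbank features plus utterance-level CMVN.
# 80 bins and per-utterance statistics are illustrative choices.
wav, sr = torchaudio.load("test.wav")
fbank = torchaudio.compliance.kaldi.fbank(wav, num_mel_bins=80, sample_frequency=sr)
fbank = (fbank - fbank.mean(dim=0)) / (fbank.std(dim=0) + 1e-8)
print(fbank.shape)  # (num_frames, 80)
```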

## Installation

JoeyS2T is built on [PyTorch](https://pytorch.org/). Please make sure you have a compatible environment.
We tested JoeyS2T with
- python 3.10
- torch 2.0.0
- torchaudio 2.0.0
- cuda 11.7
- python 3.11
- torch 2.1.2
- torchaudio 2.1.2
- cuda 12.1

Clone this repository and install via pip:

@@ -54,27 +54,25 @@ Please check JoeyNMT's [documentation](https://joeynmt.readthedocs.io) first
For details, follow the tutorials in [notebooks](notebooks) dir.

- [quick-start-with-joeynmt2](notebooks/quick-start-with-joeynmt2.ipynb)
- [speech-to-text-with-joeynmt2](notebooks/joeyS2T_ASR_tutorial.ipynb)

- [speech-to-text-with-joeys2t](notebooks/joeyS2T_ASR_tutorial.ipynb)

## Benchmarks & pretrained models

We provide [benchmarks](benchmarks_s2t.md) and pretrained models for Speech recognition (ASR) and speech-to-text translation (ST) with JoeyS2T.
We provide [benchmarks](benchmarks_s2t.md) and pretrained models for Speech Recognition (ASR) and Speech Translation (ST) with JoeyS2T.

- [ASR on LibriSpeech](benchmarks_s2t.md#librispeech)
- [ST on MuST-C en-de](benchmarks_s2t.md#must-c-v2-en-de)


Models are also available via Torch Hub!
```python
import torch

model = torch.hub.load('may-/joeys2t', 'mustc_v2_ende_st')
translations = model.generate(['test.wav'])
print(translations[0])
# 'Hallo, world!'
# 'Hallo, Welt!'
```
> :warning: **Attention**
> :warning: **Warning**
> The 1d-conv layer may raise an error for too short audio inputs.
> (We cannot convolve the frames shorter than the kernel size!)
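One way to guard against this is to estimate the number of feature frames before calling `generate`; in this sketch the kernel size, stride, layer count, and 10 ms hop are illustrative assumptions, not the values hard-coded in JoeyS2T:

```python
import torchaudio

def long_enough(wav_path, kernel_size=5, stride=2, num_conv_layers=2, hop_ms=10):
    # Approximate the fbank frame count, then simulate the conv subsampling;
    # all parameters here are assumptions, not JoeyS2T's actual settings.
    info = torchaudio.info(wav_path)
    n_frames = int(info.num_frames / info.sample_rate * 1000 / hop_ms)
    for _ in range(num_conv_layers):
        if n_frames < kernel_size:
            return False  # fewer frames than the kernel: convolution would fail
        n_frames = (n_frames - kernel_size) // stride + 1
    return n_frames > 0
```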
14 changes: 7 additions & 7 deletions benchmarks_s2t.md
@@ -11,18 +11,18 @@ See [benchmarks](notebooks/benchmarks.ipynb) for details.
**Data Preparation:**
JoeyS2T reads its data from tsv-format input files. You can generate them with the following script:
```
$ python scripts/prepare_librispeech.py --data_root data/LibriSpeech
python scripts/prepare_librispeech.py --data-root data/LibriSpeech
```
Then specify the path to the tsv files generated above in the configuration file.
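To inspect what the script produced, something like the following works; the file name and the id/src/n_frames/trg column layout are assumptions, not the script's guaranteed schema:

```python
import pandas as pd

# Peek at a generated tsv; the path and column names are assumptions.
df = pd.read_csv("data/LibriSpeech/train-clean-100.tsv", sep="\t")
print(df.columns.tolist())
print(df.head())
```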

**Training:**
```
$ python -m joeynmt train configs/librispeech_{100h|960h}.yaml
python -m joeynmt train configs/librispeech_{100h|960h}.yaml
```

**Inference:**
```
$ python -m joeynmt test configs/librispeech_{100h|960h}.yaml --output_path models/librispeech_{100h|960h}/hyps
python -m joeynmt test configs/librispeech_{100h|960h}.yaml --output-path models/librispeech_{100h|960h}/hyps
```


@@ -54,21 +54,21 @@ JoeyS2T | Transformer | 3.50 | 8.44 | 3.78 | 8.32 | 102M | [librispeech960h.tar.
## MuST-C v2 en-de

**Data Preparation:**
First, download the dataset builder [here](https://github.com/may-/datasets/blob/main/datasets/mustc/mustc.py), and follow the instructions there to download the [data](https://ict.fbk.eu/must-c/).
First, download the dataset builder [here](https://huggingface.co/datasets/may-ohta/MUST-C), and follow the instructions there to download the [data](https://ict.fbk.eu/must-c/).
Second, run the following preparation script and generate the input tsv files.
```
$ python scripts/prepare_mustc.py --data_root data/MuSTC_v2.0 --trg_lang de
python scripts/prepare_mustc.py --data-root data/MuSTC_v2.0 --trg-lang de
```
Then specify the path to the tsv files generated above in the configuration file.

**Training:**
```
$ python -m joeynmt train configs/mustc_{asr|mt|st}.yaml
python -m joeynmt train configs/mustc_{asr|mt|st}.yaml
```

**Inference:**
```
$ python -m joeynmt test configs/mustc_{asr|mt|st}.yaml --output_path models/mustc_{asr|mt|st}/hyps
python -m joeynmt test configs/mustc_{asr|mt|st}.yaml --output-path models/mustc_{asr|mt|st}/hyps
```


36 changes: 32 additions & 4 deletions configs/iwslt14_deen_bpe.yaml
@@ -1,3 +1,22 @@
############## https://github.com/joeynmt/joeynmt/pull/216
# Install the latest JoeyNMT:
# $ git clone https://github.com/joeynmt/joeynmt.git
# $ cd joeynmt
# $ python -m pip install -e .
#
# Prepare data:
# $ cd /path/to/joeynmt/scripts # Call the bash script from /path/to/joeynmt/scripts dir.
# $ bash get_iwslt14_bpe.sh # This will create /path/to/joeynmt/test/data/iwslt14/{train | valid | test}.{en | de}
# # Make sure that /path/to/joeynmt/test/data/iwslt14/bpe.32000 exists, too.
# # No need to call `build_vocab.py` script!
# $ cd .. # now back to /path/to/joeynmt/
#
# Train: comment out the `voc_file` lines in the data section -> vocab files will be created in the training process
# $ python -m joeynmt train configs/iwslt14_deen_bpe.yaml --skip-test
#
# Test: uncomment the `voc_file` lines below -> make sure that src_vocab.txt and trg_vocab.txt exist in model_dir
# $ python -m joeynmt test configs/iwslt14_deen_bpe.yaml
#
name: "transformer_iwslt14_deen_bpe"
joeynmt_version: "2.3.0"
task: "MT"
@@ -19,7 +38,7 @@ data:
normalize: False
level: "bpe"
voc_min_freq: 1
voc_file: "test/data/iwslt14/bpe_vocab.txt"
#voc_file: "models/transformer_iwslt14_deen_bpe/src_vocab.txt"
tokenizer_type: "subword-nmt"
tokenizer_cfg:
num_merges: 32000
@@ -32,12 +51,21 @@ data:
normalize: False
level: "bpe"
voc_min_freq: 1
voc_file: "test/data/iwslt14/bpe_vocab.txt"
#voc_file: "models/transformer_iwslt14_deen_bpe/trg_vocab.txt"
tokenizer_type: "subword-nmt"
tokenizer_cfg:
num_merges: 32000
codes: "test/data/iwslt14/bpe.32000"
pretokenizer: "none"
special_symbols:
unk_token: "<unk>"
unk_id: 0
pad_token: "<pad>"
pad_id: 1
bos_token: "<s>"
bos_id: 2
eos_token: "</s>"
eos_id: 3

testing:
load_model: "models/transformer_iwslt14_deen_bpe/best.ckpt"
@@ -55,8 +83,8 @@ testing:
lowercase: True

training:
#load_model: "models/transformer_iwslt14_deen_bpe/best.ckpt"
optimizer: "adam"
#load_model: "models/transformer_iwslt14_deen_bpe/latest.ckpt"
optimizer: "adamw"
normalization: "tokens"
adam_betas: [0.9, 0.999]
scheduling: "plateau"
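The `special_symbols` block added above pins the four reserved tokens to ids 0-3. A quick hedged check that a generated vocabulary file agrees, assuming (as in JoeyNMT v2) that the vocab file lists the special tokens first:

```python
# Check that the vocab file starts with the reserved tokens in the
# configured order; the path matches the voc_file comment above.
specials = ["<unk>", "<pad>", "<s>", "</s>"]
with open("models/transformer_iwslt14_deen_bpe/src_vocab.txt", encoding="utf-8") as f:
    for expected, line in zip(specials, f):
        assert line.strip() == expected, (expected, line.strip())
```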
38 changes: 32 additions & 6 deletions configs/iwslt14_deen_sp.yaml
@@ -1,3 +1,19 @@
############## https://github.com/joeynmt/joeynmt/pull/216
# Install the latest JoeyNMT:
# $ git clone https://github.com/joeynmt/joeynmt.git
# $ cd joeynmt
# $ python -m pip install -e .
#
# Prepare data: call `build_vocab.py` script from the project root path
# This will generate sp.model and sp.vocab in /path/to/joeynmt/test/data/iwslt14/.
# $ python scripts/build_vocab.py configs/iwslt14_deen_sp.yaml --joint
#
# Train and test:
# $ python -m joeynmt train configs/iwslt14_deen_sp.yaml
#
# Note that the data splits and preprocessing here are different from
# the data splits created by scripts/get_iwslt14_sp.sh
#
name: "iwslt14_deen_sp"
joeynmt_version: "2.3.0"
task: "MT"
@@ -8,13 +24,13 @@ random_seed: 42
num_workers: 0

data:
# CAUTION: the data splits and preprocessing here are different from the data splits created by scripts/get_iwslt14_sp.sh
train: "iwslt14" # cf. https://github.com/may-/datasets/blob/master/datasets/iwslt14/iwslt14.py
dev: "iwslt14" # ['TED.dev2010', 'TEDX.dev2012']
test: "iwslt14" # ['TED.tst2010', 'TED.tst2011', 'TED.tst2012']
train: "may-ohta/iwslt14" # this downloads data from https://wit3.fbk.eu/2014-01
dev: "may-ohta/iwslt14" # ['TED.dev2010', 'TEDX.dev2012']
test: "may-ohta/iwslt14" # ['TED.tst2010', 'TED.tst2011', 'TED.tst2012']
dataset_type: "huggingface"
dataset_cfg:
name: "de-en"
trust_remote_code: True # cf. https://huggingface.co/datasets/may-ohta/iwslt14
src:
lang: "de"
max_length: 512
@@ -47,8 +63,18 @@ data:
character_coverage: 1.0
alpha: 0.1
pretokenizer: "moses"
special_symbols:
unk_token: "<unk>"
unk_id: 0
pad_token: "<pad>"
pad_id: 1
bos_token: "<s>"
bos_id: 2
eos_token: "</s>"
eos_id: 3

testing:
load_model: "models/iwslt14_deen_sp/best.ckpt"
n_best: 1
beam_size: 5
beam_alpha: 1.0
@@ -66,7 +92,7 @@ training:
#reset_scheduler: True
#reset_optimizer: True
#reset_iter_state: True
optimizer: "adam"
optimizer: "adamw"
normalization: "tokens"
adam_betas: [0.9, 0.999]
scheduling: "warmupinversesquareroot"
Expand All @@ -78,7 +104,7 @@ training:
loss: "crossentropy"
batch_size: 8192
batch_type: "token"
batch_multiplier: 1
batch_multiplier: 4
early_stopping_metric: "bleu"
epochs: 100
validation_freq: 1000
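Two of the training changes above are easy to unpack: `batch_multiplier: 4` accumulates gradients over four batches of 8,192 tokens, i.e. roughly 32k tokens per effective update, and `warmupinversesquareroot` ramps the learning rate up linearly before decaying it proportionally to 1/sqrt(step). A sketch of that schedule, with illustrative `peak_lr` and `warmup` values rather than this config's:

```python
def warmup_inverse_square_root(step, peak_lr=7e-4, warmup=4000):
    # Linear warmup to peak_lr, then 1/sqrt(step) decay; peak_lr and
    # warmup are illustrative, not values taken from this config.
    if step < warmup:
        return peak_lr * step / warmup
    return peak_lr * (warmup ** 0.5) * (step ** -0.5)
```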
13 changes: 11 additions & 2 deletions configs/jparacrawl_enja_sp.yaml
@@ -47,6 +47,15 @@ data:
character_coverage: 0.995
nbest_size: 10
alpha: 0.9
special_symbols:
unk_token: "<unk>"
unk_id: 0
pad_token: "<pad>"
pad_id: 1
bos_token: "<s>"
bos_id: 2
eos_token: "</s>"
eos_id: 3

testing:
load_model: "models/jparacrawl_enja/best.ckpt"
@@ -66,8 +75,8 @@ testing:
tokenize: "ja-mecab"

training:
#load_model: "models/jparacrawl_enja/best.ckpt"
optimizer: "adam"
#load_model: "models/jparacrawl_enja/latest.ckpt"
optimizer: "adamw"
normalization: "tokens"
adam_betas: [0.9, 0.98]
scheduling: "warmupinversesquareroot"
15 changes: 12 additions & 3 deletions configs/jparacrawl_jaen_sp.yaml
@@ -47,9 +47,18 @@ data:
character_coverage: 1.0
nbest_size: 10
alpha: 0.9
special_symbols:
unk_token: "<unk>"
unk_id: 0
pad_token: "<pad>"
pad_id: 1
bos_token: "<s>"
bos_id: 2
eos_token: "</s>"
eos_id: 3

testing:
load_model: "models/jparacrawl_jaen/best.ckpt"
load_model: "models/jparacrawl_enja/best.ckpt"
n_best: 1
beam_size: 5
beam_alpha: 1.0
@@ -66,8 +75,8 @@ testing:
tokenize: "intl"

training:
#load_model: "models/jparacrawl_jaen/best.ckpt"
optimizer: "adam"
#load_model: "models/jparacrawl_enja/latest.ckpt"
optimizer: "adamw"
normalization: "tokens"
adam_betas: [0.9, 0.98]
scheduling: "warmupinversesquareroot"