Merge branch 'main' into boxiangw/add-config-validation
BoxiangW committed Nov 12, 2024
2 parents 7789fbc + b26c220 commit 2e46fea
Showing 34 changed files with 935 additions and 355 deletions.
63 changes: 12 additions & 51 deletions .github/workflows/mcore-tag-bump-bot.yml
@@ -6,54 +6,15 @@ on:
    - cron: 0 0 * * *

jobs:
-  main:
-    runs-on: ubuntu-latest
-    environment: main
-    steps:
-      - name: Checkout NVIDIA/Megatron-LM
-        uses: actions/checkout@v4
-        with:
-          repository: NVIDIA/Megatron-LM
-          ref: main
-          path: ${{ github.run_id }}
-
-      - name: Get latest mcore commit
-        id: ref
-        run: |
-          cd ${{ github.run_id }}
-          sha=$(git rev-parse origin/main)
-          echo "sha=${sha}" >> "$GITHUB_OUTPUT"
-          echo "short_sha=${sha:0:7}" >> "$GITHUB_OUTPUT"
-          echo "date=$(date +%F)" >> "$GITHUB_OUTPUT"
-      - name: Checkout ${{ github.repository }}
-        uses: actions/checkout@v4
-        with:
-          path: ${{ github.run_id }}
-          token: ${{ secrets.PAT }}
-
-      - name: Bump MCORE_TAG
-        run: |
-          cd ${{ github.run_id }}
-          sed -i 's/^ARG MCORE_TAG=.*$/ARG MCORE_TAG=${{ steps.ref.outputs.sha }}/' Dockerfile.ci
-      - name: Create Bump PR
-        uses: peter-evans/create-pull-request@v6
-        id: create-pull-request
-        with:
-          path: ${{ github.run_id }}
-          branch: bump-ci-container-${{ steps.ref.outputs.date }}
-          base: main
-          title: 'Bump `Dockerfile.ci` (${{ steps.ref.outputs.date }})'
-          token: ${{ secrets.PAT }}
-          body: |
-            🚀 PR to Bump `Dockerfile.ci`.
-            📝 Please remember the following to-do's before merge:
-            - [ ] Verify the presubmit CI
-            🙏 Please merge this PR only if the CI workflow completed successfully.
-          commit-message: "[🤠]: Howdy folks, let's bump `Dockerfile.ci` to ${{ steps.ref.outputs.short_sha }} !"
-          signoff: true
-          reviewers: 'pablo-garay'
-          labels: 'Run CICD'
+  mcore:
+    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/[email protected]
+    with:
+      source-repository: NVIDIA/Megatron-LM
+      source-ref: main
+      build-arg: MCORE_TAG
+      dockerfile: Dockerfile.ci
+      base-branch: main
+      cicd-label: Run CICD
+      pr-reviewers: 'pablo-garay'
+    secrets:
+      PAT: ${{ secrets.PAT }}
17 changes: 14 additions & 3 deletions .github/workflows/secrets-detector.yml
@@ -25,13 +25,24 @@ jobs:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
-          path: ${{ github.run_id }}
+          # setup repository and ref for PRs, see
+          # https://github.com/EndBug/add-and-commit?tab=readme-ov-file#working-with-prs
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          ref: ${{ github.event.pull_request.head.ref }}
+          # custom token is required to trigger actions after reformatting + pushing
+          fetch-depth: 0
+          token: ${{ secrets.NEMO_REFORMAT_TOKEN }}

      - name: Install secrets detector
        run: pip install detect-secrets

      - name: Run on change-set
        run: |
-          cd ${{ github.run_id }}
-          git diff --name-only --diff-filter=d --merge-base origin/main -z | xargs -0 detect-secrets-hook --baseline .secrets.baseline
+          git diff --name-only --diff-filter=d --merge-base origin/main -z | xargs -0 detect-secrets-hook --baseline .secrets.baseline
+      - uses: EndBug/add-and-commit@v9
+        # Commit changes. Nothing is committed if no changes.
+        if: always()
+        with:
+          message: Update baseline
+          commit: --signoff
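The change-set step above feeds only the PR's changed files to `detect-secrets-hook` and compares findings against the checked-in `.secrets.baseline`. To reproduce a scan locally, detect-secrets also exposes a small Python API; a minimal sketch (the file path is illustrative):

import json

from detect_secrets import SecretsCollection
from detect_secrets.settings import default_settings

secrets = SecretsCollection()
with default_settings():
    secrets.scan_file("path/to/changed_file.py")  # scan one changed file

# findings keyed by filename; these are what get compared against .secrets.baseline
print(json.dumps(secrets.json(), indent=2))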
2 changes: 1 addition & 1 deletion Dockerfile.ci
@@ -54,7 +54,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.19.0
-ARG MCORE_TAG=bc8c4f356240ea4ccadce426251171e6e430c9d3
+ARG MCORE_TAG=47ff44e5b98061bf81295ce7df899ee62529d5e3

ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
35 changes: 32 additions & 3 deletions docs/source/asr/intro.rst
@@ -16,10 +16,39 @@ After :ref:`installing NeMo<installation>`, you can transcribe an audio file as
asr_model = nemo_asr.models.ASRModel.from_pretrained("stt_en_fastconformer_transducer_large")
transcript = asr_model.transcribe(["path/to/audio_file.wav"])
-Obtain word/segment timestamps
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Obtain timestamps
+^^^^^^^^^^^^^^^^^

-You can also obtain timestamps for each word or segment in the transcription as follows:
+Obtaining char (token), word, or segment timestamps is also possible with NeMo ASR models.
+
+Currently, timestamps are available for Parakeet models with all decoder types (CTC/RNNT/TDT). Support for AED models will be added soon.
+
+There are two ways to obtain timestamps:
+
+1. Pass the `timestamps=True` flag to the `transcribe` method.
+2. For more control, update the decoding config to specify the timestamp type (char, word, segment), as well as the segment separators or word separator used for segment- and word-level timestamps.
+
+With the `timestamps=True` flag, you can obtain timestamps for each char, word, and segment in the transcription as follows:
+
+.. code-block:: python
+
+    # import nemo_asr and instantiate asr_model as above
+    import nemo.collections.asr as nemo_asr
+    asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt_ctc-110m")
+
+    # specify flag `timestamps=True`
+    hypotheses = asr_model.transcribe(["path/to/audio_file.wav"], timestamps=True)
+
+    # by default, timestamps are enabled at the char, word, and segment level
+    word_timestamps = hypotheses[0][0].timestep['word']        # word-level timestamps for the first sample
+    segment_timestamps = hypotheses[0][0].timestep['segment']  # segment-level timestamps
+    char_timestamps = hypotheses[0][0].timestep['char']        # char-level timestamps
+
+    # if the model supports punctuation and capitalization, segments are split on
+    # punctuation; otherwise the complete transcription is a single segment
+    for stamp in segment_timestamps:
+        print(f"{stamp['start']}s - {stamp['end']}s : {stamp['segment']}")
+
+For more control over the timestamps, you can update the decoding config to specify the timestamp type (char, word, segment) and the segment separators or word separator for segment- and word-level timestamps, as follows:

.. code-block:: python
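    # A minimal sketch of the decoding-config route. The exact key names used
    # below (compute_timestamps, word_seperator, segment_seperators) are
    # assumptions; they can differ across NeMo versions and decoder types, so
    # check your model's decoding config before relying on them.
    from omegaconf import open_dict

    import nemo.collections.asr as nemo_asr

    asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt_ctc-110m")

    # fetch the model's current decoding config and enable timestamps in place
    decoding_cfg = asr_model.cfg.decoding
    with open_dict(decoding_cfg):
        decoding_cfg.compute_timestamps = True             # char/word/segment timestamps
        decoding_cfg.word_seperator = " "                  # separator used to group tokens into words
        decoding_cfg.segment_seperators = [".", "?", "!"]  # punctuation that closes a segment
    asr_model.change_decoding_strategy(decoding_cfg)

    hypotheses = asr_model.transcribe(["path/to/audio_file.wav"], return_hypotheses=True)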
@@ -13,11 +13,13 @@
# limitations under the License.

"""
-This script chunks long audios into non-overlapping segments of `chunk_len_in_secs` seconds and performs inference on each
+This script chunks long audios into non-overlapping segments of `chunk_len_in_secs`
+seconds and performs inference on each
segment individually. The results are then concatenated to form the final output.
Below is an example of how to run this script with the Canary-1b model.
-It's recommended to use manifest input, otherwise the model will perform English ASR with punctuations and capitalizations.
+It's recommended to use manifest input, otherwise the model will perform English ASR
+with punctuation and capitalization.
An example manifest line:
{
"audio_filepath": "/path/to/audio.wav", # path to the audio file
@@ -41,11 +43,10 @@
"""

import contextlib
import copy
import glob
import os
-from dataclasses import dataclass, is_dataclass
+from dataclasses import dataclass
from typing import Optional

import pytorch_lightning as pl
@@ -67,6 +68,10 @@

@dataclass
class TranscriptionConfig:
+    """
+    Transcription config
+    """
+
    # Required configs
    model_path: Optional[str] = None  # Path to a .nemo file
    pretrained_name: Optional[str] = None  # Name of a pretrained model
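These `TranscriptionConfig` dataclasses double as Hydra schemas: the `@hydra_runner` decorator merges the dataclass defaults with command-line overrides and passes the validated config to `main`. A minimal sketch of the pattern (the two fields shown are an illustrative subset, not the script's full schema):

from dataclasses import dataclass
from typing import Optional

from nemo.core.config import hydra_runner


@dataclass
class MiniConfig:
    model_path: Optional[str] = None  # illustrative subset of TranscriptionConfig
    chunk_len_in_secs: float = 40.0


@hydra_runner(config_name="MiniConfig", schema=MiniConfig)
def main(cfg: MiniConfig) -> None:
    # CLI overrides such as `python my_script.py chunk_len_in_secs=30`
    # are validated against the schema before main() runs
    print(cfg.model_path, cfg.chunk_len_in_secs)


if __name__ == '__main__':
    main()  # hydra supplies cfg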
@@ -116,6 +121,10 @@ class TranscriptionConfig:

@hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig)
def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
+    """
+    Transcribes the input audio and can be used to infer long audio files by chunking
+    them into smaller segments.
+    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
    torch.set_grad_enabled(False)

@@ -160,7 +169,8 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:

    if model_cfg.preprocessor.normalize != "per_feature":
        logging.error(
-            "Only EncDecMultiTaskModel models trained with per_feature normalization are supported currently"
+            "Only EncDecMultiTaskModel models trained with per_feature normalization are supported \
+currently"
        )

    # Disable config overwriting
@@ -206,7 +216,7 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
)

    output_filename, pred_text_attr_name = write_transcription(
-        hyps, cfg, model_name, filepaths=filepaths, compute_langs=False, compute_timestamps=False
+        hyps, cfg, model_name, filepaths=filepaths, compute_langs=False, timestamps=False
    )
    logging.info(f"Finished writing predictions to {output_filename}!")

@@ -35,12 +35,11 @@
You can use `DEBUG=1 python speech_to_text_buffered_infer_ctc.py ...` to print out the
predictions of the model, and ground-truth text if presents in manifest.
"""
-import contextlib
import copy
import glob
import math
import os
-from dataclasses import dataclass, is_dataclass
+from dataclasses import dataclass
from typing import Optional

import pytorch_lightning as pl
@@ -65,6 +64,10 @@

@dataclass
class TranscriptionConfig:
+    """
+    Transcription Configuration for buffered inference.
+    """
+
    # Required configs
    model_path: Optional[str] = None  # Path to a .nemo file
    pretrained_name: Optional[str] = None  # Name of a pretrained model
@@ -114,6 +117,10 @@ class TranscriptionConfig:

@hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig)
def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
+    """
+    Transcribes the input audio and can be used to infer long audio files by chunking
+    them into smaller segments.
+    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
    torch.set_grad_enabled(False)

@@ -221,7 +228,7 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
filepaths,
)
    output_filename, pred_text_attr_name = write_transcription(
-        hyps, cfg, model_name, filepaths=filepaths, compute_langs=False, compute_timestamps=False
+        hyps, cfg, model_name, filepaths=filepaths, compute_langs=False, timestamps=False
    )
    logging.info(f"Finished writing predictions to {output_filename}!")

@@ -61,7 +61,7 @@
import glob
import math
import os
-from dataclasses import dataclass, is_dataclass
+from dataclasses import dataclass
from typing import Optional

import pytorch_lightning as pl
@@ -87,6 +90,10 @@

@dataclass
class TranscriptionConfig:
+    """
+    Transcription Configuration for buffered inference.
+    """
+
    # Required configs
    model_path: Optional[str] = None  # Path to a .nemo file
    pretrained_name: Optional[str] = None  # Name of a pretrained model
@@ -143,6 +147,10 @@ class TranscriptionConfig:

@hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig)
def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
+    """
+    Transcribes the input audio and can be used to infer long audio files by chunking
+    them into smaller segments.
+    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
    torch.set_grad_enabled(False)

@@ -274,7 +282,7 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
)

    output_filename, pred_text_attr_name = write_transcription(
-        hyps, cfg, model_name, filepaths=filepaths, compute_langs=False, compute_timestamps=False
+        hyps, cfg, model_name, filepaths=filepaths, compute_langs=False, timestamps=False
    )
    logging.info(f"Finished writing predictions to {output_filename}!")

12 changes: 10 additions & 2 deletions examples/asr/speech_translation/translate_speech.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-import contextlib
import json
import os
from dataclasses import dataclass, is_dataclass
@@ -65,13 +64,19 @@

@dataclass
class ModelChangeConfig:
+    """
+    Sub-config for changes specific to the Conformer Encoder
+    """

-    # Sub-config for changes specific to the Conformer Encoder
    conformer: ConformerChangeConfig = ConformerChangeConfig()


@dataclass
class TranslationConfig:
+    """
+    Translation Configuration for audio to text translation.
+    """
+
    # Required configs
    model_path: Optional[str] = None  # Path to a .nemo file
    pretrained_name: Optional[str] = None  # Name of a pretrained model
@@ -106,6 +111,9 @@ class TranslationConfig:

@hydra_runner(config_name="TranslationConfig", schema=TranslationConfig)
def main(cfg: TranslationConfig) -> Union[TranslationConfig, List[str]]:
+    """
+    Main function to translate audio to text using a pretrained/finetuned model.
+    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    for key in cfg:
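For reference, translate_speech.py ultimately drives an AED model such as Canary. A rough sketch of the equivalent direct API call; the keyword arguments below are assumptions (recent NeMo versions accept them on `transcribe`, while older versions read them from fields in the input manifest instead):

import nemo.collections.asr as nemo_asr

model = nemo_asr.models.ASRModel.from_pretrained("nvidia/canary-1b")

# assumed kwargs; check the model card for the exact keys in your NeMo version
translations = model.transcribe(
    ["path/to/audio.wav"],
    source_lang="en",  # language spoken in the audio
    target_lang="de",  # language to translate into
)
print(translations[0])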