Merge branch 'main' into boxiangw/add-config-validation
BoxiangW committed Nov 12, 2024
2 parents 7789fbc + b26c220 commit 2e46fea
Showing 34 changed files with 935 additions and 355 deletions.
63 changes: 12 additions & 51 deletions .github/workflows/mcore-tag-bump-bot.yml
@@ -6,54 +6,15 @@ on:
    - cron: 0 0 * * *

jobs:
-  main:
-    runs-on: ubuntu-latest
-    environment: main
-    steps:
-      - name: Checkout NVIDIA/Megatron-LM
-        uses: actions/checkout@v4
-        with:
-          repository: NVIDIA/Megatron-LM
-          ref: main
-          path: ${{ github.run_id }}
-
-      - name: Get latest mcore commit
-        id: ref
-        run: |
-          cd ${{ github.run_id }}
-          sha=$(git rev-parse origin/main)
-          echo "sha=${sha}" >> "$GITHUB_OUTPUT"
-          echo "short_sha=${sha:0:7}" >> "$GITHUB_OUTPUT"
-          echo "date=$(date +%F)" >> "$GITHUB_OUTPUT"
-      - name: Checkout ${{ github.repository }}
-        uses: actions/checkout@v4
-        with:
-          path: ${{ github.run_id }}
-          token: ${{ secrets.PAT }}
-
-      - name: Bump MCORE_TAG
-        run: |
-          cd ${{ github.run_id }}
-          sed -i 's/^ARG MCORE_TAG=.*$/ARG MCORE_TAG=${{ steps.ref.outputs.sha }}/' Dockerfile.ci
-      - name: Create Bump PR
-        uses: peter-evans/create-pull-request@v6
-        id: create-pull-request
-        with:
-          path: ${{ github.run_id }}
-          branch: bump-ci-container-${{ steps.ref.outputs.date }}
-          base: main
-          title: 'Bump `Dockerfile.ci` (${{ steps.ref.outputs.date }})'
-          token: ${{ secrets.PAT }}
-          body: |
-            🚀 PR to Bump `Dockerfile.ci`.
-            📝 Please remember the following to-do's before merge:
-            - [ ] Verify the presubmit CI
-            🙏 Please merge this PR only if the CI workflow completed successfully.
-          commit-message: "[🤠]: Howdy folks, let's bump `Dockerfile.ci` to ${{ steps.ref.outputs.short_sha }} !"
-          signoff: true
-          reviewers: 'pablo-garay'
-          labels: 'Run CICD'
+  mcore:
+    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/[email protected]
+    with:
+      source-repository: NVIDIA/Megatron-LM
+      source-ref: main
+      build-arg: MCORE_TAG
+      dockerfile: Dockerfile.ci
+      base-branch: main
+      cicd-label: Run CICD
+      pr-reviewers: 'pablo-garay'
+    secrets:
+      PAT: ${{ secrets.PAT }}
17 changes: 14 additions & 3 deletions .github/workflows/secrets-detector.yml
@@ -25,13 +25,24 @@ jobs:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
-          path: ${{ github.run_id }}
+          # setup repository and ref for PRs, see
+          # https://github.com/EndBug/add-and-commit?tab=readme-ov-file#working-with-prs
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          ref: ${{ github.event.pull_request.head.ref }}
+          # custom token is required to trigger actions after reformatting + pushing
+          fetch-depth: 0
+          token: ${{ secrets.NEMO_REFORMAT_TOKEN }}

      - name: Install secrets detector
        run: pip install detect-secrets

      - name: Run on change-set
        run: |
-          cd ${{ github.run_id }}
-          git diff --name-only --diff-filter=d --merge-base origin/main -z | xargs -0 detect-secrets-hook --baseline .secrets.baseline
+          git diff --name-only --diff-filter=d --merge-base origin/main -z | xargs -0 detect-secrets-hook --baseline .secrets.baseline
+      - uses: EndBug/add-and-commit@v9
+        # Commit changes. Nothing is committed if no changes.
+        if: always()
+        with:
+          message: Update baseline
+          commit: --signoff
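The change-set step above feeds only the PR's changed files to `detect-secrets-hook` and compares findings against the checked-in `.secrets.baseline`. To reproduce a scan locally, detect-secrets also exposes a small Python API; a minimal sketch (the file path is illustrative):

import json

from detect_secrets import SecretsCollection
from detect_secrets.settings import default_settings

secrets = SecretsCollection()
with default_settings():
    secrets.scan_file("path/to/changed_file.py")  # scan one changed file

# findings keyed by filename; these are what get compared against .secrets.baseline
print(json.dumps(secrets.json(), indent=2))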
2 changes: 1 addition & 1 deletion Dockerfile.ci
@@ -54,7 +54,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.19.0
-ARG MCORE_TAG=bc8c4f356240ea4ccadce426251171e6e430c9d3
+ARG MCORE_TAG=47ff44e5b98061bf81295ce7df899ee62529d5e3

ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
35 changes: 32 additions & 3 deletions docs/source/asr/intro.rst
@@ -16,10 +16,39 @@ After :ref:`installing NeMo<installation>`, you can transcribe an audio file as
asr_model = nemo_asr.models.ASRModel.from_pretrained("stt_en_fastconformer_transducer_large")
transcript = asr_model.transcribe(["path/to/audio_file.wav"])
-Obtain word/segment timestamps
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Obtain timestamps
+^^^^^^^^^^^^^^^^^

-You can also obtain timestamps for each word or segment in the transcription as follows:
+Obtaining char (token), word, or segment timestamps is also possible with NeMo ASR models.
+
+Currently, timestamps are available for Parakeet models with all decoder types (CTC/RNNT/TDT). Support for AED models will be added soon.
+
+There are two ways to obtain timestamps:
+
+1. Pass the `timestamps=True` flag to the `transcribe` method.
+2. For more control, update the decoding config to specify the timestamp type (char, word, segment), as well as the segment separators or word separator used for segment- and word-level timestamps.
+
+With the `timestamps=True` flag, you can obtain timestamps for each char, word, and segment in the transcription as follows:
+
+.. code-block:: python
+
+    # import nemo_asr and instantiate asr_model as above
+    import nemo.collections.asr as nemo_asr
+    asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt_ctc-110m")
+
+    # specify flag `timestamps=True`
+    hypotheses = asr_model.transcribe(["path/to/audio_file.wav"], timestamps=True)
+
+    # by default, timestamps are enabled at the char, word, and segment level
+    word_timestamps = hypotheses[0][0].timestep['word']        # word-level timestamps for the first sample
+    segment_timestamps = hypotheses[0][0].timestep['segment']  # segment-level timestamps
+    char_timestamps = hypotheses[0][0].timestep['char']        # char-level timestamps
+
+    # if the model supports punctuation and capitalization, segments are split on
+    # punctuation; otherwise the complete transcription is a single segment
+    for stamp in segment_timestamps:
+        print(f"{stamp['start']}s - {stamp['end']}s : {stamp['segment']}")
+
+For more control over the timestamps, you can update the decoding config to specify the timestamp type (char, word, segment) and the segment separators or word separator for segment- and word-level timestamps, as follows:

.. code-block:: python
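    # A minimal sketch of the decoding-config route. The exact key names used
    # below (compute_timestamps, word_seperator, segment_seperators) are
    # assumptions; they can differ across NeMo versions and decoder types, so
    # check your model's decoding config before relying on them.
    from omegaconf import open_dict

    import nemo.collections.asr as nemo_asr

    asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt_ctc-110m")

    # fetch the model's current decoding config and enable timestamps in place
    decoding_cfg = asr_model.cfg.decoding
    with open_dict(decoding_cfg):
        decoding_cfg.compute_timestamps = True             # char/word/segment timestamps
        decoding_cfg.word_seperator = " "                  # separator used to group tokens into words
        decoding_cfg.segment_seperators = [".", "?", "!"]  # punctuation that closes a segment
    asr_model.change_decoding_strategy(decoding_cfg)

    hypotheses = asr_model.transcribe(["path/to/audio_file.wav"], return_hypotheses=True)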
@@ -13,11 +13,13 @@
# limitations under the License.

"""
-This script chunks long audios into non-overlapping segments of `chunk_len_in_secs` seconds and performs inference on each
+This script chunks long audios into non-overlapping segments of `chunk_len_in_secs`
+seconds and performs inference on each
segment individually. The results are then concatenated to form the final output.
Below is an example of how to run this script with the Canary-1b model.
-It's recommended to use manifest input, otherwise the model will perform English ASR with punctuations and capitalizations.
+It's recommended to use manifest input, otherwise the model will perform English ASR
+with punctuation and capitalization.
An example manifest line:
{
"audio_filepath": "/path/to/audio.wav", # path to the audio file
@@ -41,11 +43,10 @@
"""

import contextlib
import copy
import glob
import os
-from dataclasses import dataclass, is_dataclass
+from dataclasses import dataclass
from typing import Optional

import pytorch_lightning as pl
@@ -67,6 +68,10 @@

@dataclass
class TranscriptionConfig:
+    """
+    Transcription config
+    """
+
    # Required configs
    model_path: Optional[str] = None  # Path to a .nemo file
    pretrained_name: Optional[str] = None  # Name of a pretrained model
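These `TranscriptionConfig` dataclasses double as Hydra schemas: the `@hydra_runner` decorator merges the dataclass defaults with command-line overrides and passes the validated config to `main`. A minimal sketch of the pattern (the two fields shown are an illustrative subset, not the script's full schema):

from dataclasses import dataclass
from typing import Optional

from nemo.core.config import hydra_runner


@dataclass
class MiniConfig:
    model_path: Optional[str] = None  # illustrative subset of TranscriptionConfig
    chunk_len_in_secs: float = 40.0


@hydra_runner(config_name="MiniConfig", schema=MiniConfig)
def main(cfg: MiniConfig) -> None:
    # CLI overrides such as `python my_script.py chunk_len_in_secs=30`
    # are validated against the schema before main() runs
    print(cfg.model_path, cfg.chunk_len_in_secs)


if __name__ == '__main__':
    main()  # hydra supplies cfg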
@@ -116,6 +121,10 @@ class TranscriptionConfig:

@hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig)
def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
+    """
+    Transcribes the input audio and can be used to infer long audio files by chunking
+    them into smaller segments.
+    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
    torch.set_grad_enabled(False)

@@ -160,7 +169,8 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:

    if model_cfg.preprocessor.normalize != "per_feature":
        logging.error(
-            "Only EncDecMultiTaskModel models trained with per_feature normalization are supported currently"
+            "Only EncDecMultiTaskModel models trained with per_feature normalization are supported \
+currently"
        )

    # Disable config overwriting
@@ -206,7 +216,7 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
)

    output_filename, pred_text_attr_name = write_transcription(
-        hyps, cfg, model_name, filepaths=filepaths, compute_langs=False, compute_timestamps=False
+        hyps, cfg, model_name, filepaths=filepaths, compute_langs=False, timestamps=False
    )
    logging.info(f"Finished writing predictions to {output_filename}!")

@@ -35,12 +35,11 @@
You can use `DEBUG=1 python speech_to_text_buffered_infer_ctc.py ...` to print out the
predictions of the model, and ground-truth text if presents in manifest.
"""
-import contextlib
import copy
import glob
import math
import os
-from dataclasses import dataclass, is_dataclass
+from dataclasses import dataclass
from typing import Optional

import pytorch_lightning as pl
@@ -65,6 +64,10 @@

@dataclass
class TranscriptionConfig:
+    """
+    Transcription Configuration for buffered inference.
+    """
+
    # Required configs
    model_path: Optional[str] = None  # Path to a .nemo file
    pretrained_name: Optional[str] = None  # Name of a pretrained model
@@ -114,6 +117,10 @@ class TranscriptionConfig:

@hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig)
def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
+    """
+    Transcribes the input audio and can be used to infer long audio files by chunking
+    them into smaller segments.
+    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
    torch.set_grad_enabled(False)

@@ -221,7 +228,7 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
filepaths,
)
    output_filename, pred_text_attr_name = write_transcription(
-        hyps, cfg, model_name, filepaths=filepaths, compute_langs=False, compute_timestamps=False
+        hyps, cfg, model_name, filepaths=filepaths, compute_langs=False, timestamps=False
    )
    logging.info(f"Finished writing predictions to {output_filename}!")

@@ -61,7 +61,7 @@
import glob
import math
import os
-from dataclasses import dataclass, is_dataclass
+from dataclasses import dataclass
from typing import Optional

import pytorch_lightning as pl
@@ -87,6 +90,10 @@

@dataclass
class TranscriptionConfig:
+    """
+    Transcription Configuration for buffered inference.
+    """
+
    # Required configs
    model_path: Optional[str] = None  # Path to a .nemo file
    pretrained_name: Optional[str] = None  # Name of a pretrained model
@@ -143,6 +147,10 @@ class TranscriptionConfig:

@hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig)
def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
+    """
+    Transcribes the input audio and can be used to infer long audio files by chunking
+    them into smaller segments.
+    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
    torch.set_grad_enabled(False)

@@ -274,7 +282,7 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
)

    output_filename, pred_text_attr_name = write_transcription(
-        hyps, cfg, model_name, filepaths=filepaths, compute_langs=False, compute_timestamps=False
+        hyps, cfg, model_name, filepaths=filepaths, compute_langs=False, timestamps=False
    )
    logging.info(f"Finished writing predictions to {output_filename}!")

12 changes: 10 additions & 2 deletions examples/asr/speech_translation/translate_speech.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-import contextlib
import json
import os
from dataclasses import dataclass, is_dataclass
@@ -65,13 +64,19 @@

@dataclass
class ModelChangeConfig:
+    """
+    Sub-config for changes specific to the Conformer Encoder
+    """

-    # Sub-config for changes specific to the Conformer Encoder
    conformer: ConformerChangeConfig = ConformerChangeConfig()


@dataclass
class TranslationConfig:
+    """
+    Translation Configuration for audio to text translation.
+    """
+
    # Required configs
    model_path: Optional[str] = None  # Path to a .nemo file
    pretrained_name: Optional[str] = None  # Name of a pretrained model
@@ -106,6 +111,9 @@ class TranslationConfig:

@hydra_runner(config_name="TranslationConfig", schema=TranslationConfig)
def main(cfg: TranslationConfig) -> Union[TranslationConfig, List[str]]:
+    """
+    Main function to translate audio to text using a pretrained/finetuned model.
+    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    for key in cfg:
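For reference, translate_speech.py ultimately drives an AED model such as Canary. A rough sketch of the equivalent direct API call; the keyword arguments below are assumptions (recent NeMo versions accept them on `transcribe`, while older versions read them from fields in the input manifest instead):

import nemo.collections.asr as nemo_asr

model = nemo_asr.models.ASRModel.from_pretrained("nvidia/canary-1b")

# assumed kwargs; check the model card for the exact keys in your NeMo version
translations = model.transcribe(
    ["path/to/audio.wav"],
    source_lang="en",  # language spoken in the audio
    target_lang="de",  # language to translate into
)
print(translations[0])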