From bcbc4f7ea9ff6ea0c4961c8dcff4e2881e2b96af Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 28 Oct 2022 15:10:31 -0700 Subject: [PATCH] Pcla tutorial fixes (#5271) (#5273) * Fixed typos Signed-off-by: Matvei Novikov * Fixed cell type and tatoeba reference Signed-off-by: Matvei Novikov * Fixed typo Signed-off-by: Matvei Novikov * Fixed branch variable Signed-off-by: Matvei Novikov Signed-off-by: Matvei Novikov Signed-off-by: Matvei Novikov Co-authored-by: Matvei Novikov --- ...ion_and_Capitalization_Lexical_Audio.ipynb | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb b/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb index ef8f0bd33353..1125a89d1fd9 100644 --- a/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb +++ b/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb @@ -99,7 +99,7 @@ "- whether the word should be capitalized\n", "\n", "\n", - "In some cases lexical only model can't predict punctutation correctly without audio. It is especially hard for conversational speech.\n", + "In some cases lexical only model can't predict punctuation correctly without audio. It is especially hard for conversational speech.\n", "\n", "For example:\n", "\n", @@ -119,7 +119,7 @@ "## Architecture\n", "Punctuation and capitaalization lexical audio model is based on [Multimodal Semi-supervised Learning Framework for Punctuation Prediction in Conversational Speech](https://arxiv.org/pdf/2008.00702.pdf). Model consists of lexical encoder (BERT-like model), acoustic encoder (i.e. Conformer's audio encoder), fusion of lexical and audio features (attention based fusion) and prediction layers.\n", "\n", - "Fusion is needed because encoded text and audio might have different length therfore can't be alligned one-to-one. As model predicts punctuation and capitalization per text token we use cross-attention between encoded lexical and encoded audio input." + "Fusion is needed because encoded text and audio might have different length therefore can't be aligned one-to-one. As model predicts punctuation and capitalization per text token we use cross-attention between encoded lexical and encoded audio input." ] }, { @@ -279,22 +279,23 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "outputs": [], "source": [ - "## download get_tatoeba_data.py script to download and preprocess the Tatoeba data\n", + "## download get_libritts_data.py script to download and preprocess the LibriTTS data\n", "os.makedirs(WORK_DIR, exist_ok=True)\n", "if not os.path.exists(WORK_DIR + '/get_libritts_data.py'):\n", " print('Downloading get_libritts_data.py...')\n", " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/token_classification/data/get_libritts_data.py', WORK_DIR)\n", "else:\n", " print ('get_libritts_data.py already exists')" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } + ] }, { "cell_type": "code",