diff --git a/docs/source/asr/api.rst b/docs/source/asr/api.rst index a35ea49ea2c4..c99d92c0371a 100644 --- a/docs/source/asr/api.rst +++ b/docs/source/asr/api.rst @@ -276,21 +276,6 @@ RNNT Decoding :show-inheritance: :members: -TDT Decoding -~~~~~~~~~~~~~ - -.. autoclass:: nemo.collections.asr.parts.submodules.rnnt_greedy_decoding.GreedyTDTInfer - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.parts.submodules.rnnt_greedy_decoding.GreedyBatchedTDTInfer - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.parts.submodules.tdt_beam_decoding.BeamTDTInfer - :show-inheritance: - :members: - Hypotheses ~~~~~~~~~~ diff --git a/examples/multimodal_autoregressive/README.md b/examples/multimodal_autoregressive/README.md new file mode 100644 index 000000000000..5934074a7d17 --- /dev/null +++ b/examples/multimodal_autoregressive/README.md @@ -0,0 +1,3 @@ +### MULTIMODAL AUTOREGRESSIVE GENERTION + +For information on how to get started with autoregressive generation for multimodal datasets using discrete tokenizers follow this [guide](nemo/collections/multimodal_autoregressive/data/README.md) diff --git a/examples/multimodal_autoregressive/conf/megatron_mm_ar_inference_image_generation.yaml b/examples/multimodal_autoregressive/conf/megatron_mm_ar_inference_image_generation.yaml new file mode 100644 index 000000000000..806800c96155 --- /dev/null +++ b/examples/multimodal_autoregressive/conf/megatron_mm_ar_inference_image_generation.yaml @@ -0,0 +1,36 @@ +inference: + greedy: True # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + add_BOS: True # add the bos token at the begining of the prompt + tokens_to_generate: 30 # The minimum length of the sequence to be generated. + all_probs: False # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. + compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False + end_strings: ["<|extra_204|>"] # generation will stop when one of these tokens is generated + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + logger: False # logger provided by exp_manager + precision: bf16 # 16, 32, or bf16 + use_distributed_sampler: False + +tensor_model_parallel_size: -1 +pipeline_model_parallel_size: -1 +pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) +megatron_amp_O2: False # Enable O2-level automatic mixed precision to save memory +image_encoder: Cosmos-Tokenizer-DV8x16x16 +gpt_model_file: null # GPT nemo file path +checkpoint_dir: null # checkpoint file dir. 
This is used to load the PTL checkpoint generated during the GPT training +checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading +hparams_file: null # model configuration file, only used for PTL checkpoint loading +captions: # prompts for GPT inference + - "a drawing of a green pokemon with red eyes" + - "a red pokemon with green eyes" + - "a cartoon fish with a big smile" +images_output_path: null # Path to the directory to store the output images + diff --git a/examples/multimodal_autoregressive/conf/megatron_mm_ar_inference_vision_understanding.yaml b/examples/multimodal_autoregressive/conf/megatron_mm_ar_inference_vision_understanding.yaml new file mode 100644 index 000000000000..c392f5dcc5c2 --- /dev/null +++ b/examples/multimodal_autoregressive/conf/megatron_mm_ar_inference_vision_understanding.yaml @@ -0,0 +1,32 @@ +inference: + greedy: True # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + add_BOS: False # add the bos token at the begining of the prompt + tokens_to_generate: 30 # The minimum length of the sequence to be generated. + all_probs: False # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. + compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False + end_strings: ["<|extra_204|>"] # generation will stop when one of these tokens is generated + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + logger: False # logger provided by exp_manager + precision: bf16 # 16, 32, or bf16 + use_distributed_sampler: False + +tensor_model_parallel_size: -1 +pipeline_model_parallel_size: -1 +pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) +megatron_amp_O2: False # Enable O2-level automatic mixed precision to save memory +gpt_model_file: null # GPT nemo file path +checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the GPT training +checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading +hparams_file: null # model configuration file, only used for PTL checkpoint loading +images_path: # prompts for GPT inference + - "/path/to/image1" + - "/path/to/image2" diff --git a/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_image_generation.py b/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_image_generation.py new file mode 100644 index 000000000000..ae8dddb29553 --- /dev/null +++ b/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_image_generation.py @@ -0,0 +1,196 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import math +import os +import re + +import torch +import torchvision +from examples.nlp.language_modeling.megatron_gpt_eval import ( + load_model_from_config, + remove_padded_prompts, + round_to_mult, +) +from pytorch_lightning.trainer.trainer import Trainer + +# pylint: disable=line-too-long +from nemo.collections.common.video_tokenizers.cosmos_tokenizer import CausalVideoTokenizer +from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam +from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy +from nemo.core.config import hydra_runner + +""" +This is the script to run multimodal autoregresssive text generation. + +Make sure you install tiktoken==0.6.0 + +Usage: + Assume the model has TP=1, PP=1 in the following use cases. + a. run greedy inference from a nemo file: + python megatron_mm_autoregresssive_eval.py \ + gpt_model_file=PATH_TO_MODEL \ + inference.greedy=True \ + inference.add_BOS=True \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + tensor_model_parallel_size=-1 \ + pipeline_model_parallel_size=-1 \ + captions=[caption1,caption2] + + b. run greedy inference from a PTL checkpoint file: + python megatron_mm_autoregresssive_eval.py \ + checkpoint_dir=PATH_TO_CHECKPOINT_FILE \ + checkpoint_name=CHECKPOINT_FILE_NAME \ + hparams_file=HPARAMS_FILE \ + inference.greedy=True \ + inference.add_BOS=True \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + tensor_model_parallel_size=-1 \ + pipeline_model_parallel_size=-1 \ + captions=[caption1,caption2] + + c. run top_p inference from a nemo file: + python megatron_mm_autoregresssive_eval.py \ + gpt_model_file=PATH_TO_MODEL \ + inference.greedy=False \ + inference.top_k=0 \ + inference.top_p=0.9 \ + inference.repetition_penalty=1.2 \ + inference.add_BOS=True \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + tensor_model_parallel_size=-1 \ + pipeline_model_parallel_size=-1 \ + captions=[caption1,caption2] + + d. If you don't need to generate tokens and need model to compute logprobs: + python megatron_mm_autoregresssive_eval.py \ + gpt_model_file=PATH_TO_MODEL \ + inference.compute_logprob=True \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + tensor_model_parallel_size=-1 \ + pipeline_model_parallel_size=-1 \ + captions=[caption1,caption2] +""" + + +def to_img(tokens_string, image_tokenizer): + """Converts visual tokens to images + + Given input visual tokens, we extract the indices, pass it to the decoder to get the image + """ + visual_token_pattern = r"<\|visual token (\d+)\|>" + visual_tokens = [int(match) for match in re.findall(visual_token_pattern, tokens_string)] + # We assume image is square. 
So if 64 tokensa are present, we reshape it to 8x8 and then pass it to decoder + dim = int(math.sqrt(len(visual_tokens))) + visual_tokens_tensor = torch.tensor(visual_tokens[: dim * dim]) + # Decoder accepts input of the following format [bs, channel_dim, h, w] + visual_tokens_tensor_reshaped = visual_tokens_tensor.reshape((dim, dim)).unsqueeze(0).unsqueeze(0) + visual_tokens_final = visual_tokens_tensor_reshaped.to(image_tokenizer._device) + img = image_tokenizer.decode(visual_tokens_final) + + # Convert from bf16 to 16 and to format [channel_dim, h, w] + image = torchvision.transforms.functional.to_pil_image(img.float().squeeze()) + return image + + +def load_prompts(cfg): + """Function to return the prompts passed into the model""" + prompts = [] + for caption in cfg.captions: + prompt = f'You are a helpful assistant. Draw a picture for the caption given by the user. USER: {caption}. ASSISTANT: ' + prompts.append(prompt) + return prompts + + +if not torch.cuda.is_available(): + raise EnvironmentError("GPU is needed for the inference") + + +@hydra_runner(config_path="conf", config_name="megatron_mm_ar_inference_image_generation") +def main(cfg) -> None: + """Main function""" + + callbacks = [] + # enable_progress_bar is True by default. If cfg.trainer.enable_progress_bar=False, CustomProgressBar is not appended to callbacks + if 'enable_progress_bar' not in cfg.trainer or cfg.trainer.enable_progress_bar: + callbacks.append(CustomProgressBar()) + # trainer required for restoring model parallel models + trainer = Trainer( + strategy=NLPDDPStrategy(timeout=datetime.timedelta(seconds=18000)), + **cfg.trainer, + callbacks=callbacks, + ) + + image_tokenizer = CausalVideoTokenizer.from_pretrained( + tokenizer_type=cfg.image_encoder, load_encoder=False, load_decoder=True, load_full_model=False + ) + + model = load_model_from_config(trainer, cfg) + model.freeze() + + # Have to turn off activations_checkpoint_method for inference + try: + model.model.language_model.encoder.activations_checkpoint_method = None + except AttributeError: + pass + + length_params: LengthParam = { + "max_length": cfg.inference.tokens_to_generate, + "min_length": cfg.inference.min_tokens_to_generate, + } + + sampling_params: SamplingParam = { + "use_greedy": cfg.inference.greedy, + "temperature": cfg.inference.temperature, + "top_k": cfg.inference.top_k, + "top_p": cfg.inference.top_p, + "repetition_penalty": cfg.inference.repetition_penalty, + "add_BOS": cfg.inference.add_BOS, + "all_probs": cfg.inference.all_probs, + "compute_logprob": cfg.inference.compute_logprob, + "end_strings": cfg.inference.end_strings, + } + + prompts = [] + with torch.no_grad(): + prompts = load_prompts(cfg) + + fp8_enabled = hasattr(model.cfg, "fp8") and (model.cfg.fp8 == True) + if fp8_enabled and len(prompts) > 0: + padded_len = round_to_mult(len(prompts), 8) + nb_paddings = padded_len - len(prompts) + if nb_paddings > 0: + nb_paddings += [''] * nb_paddings + + # First method of running text generation, call model.generate method + response = model.generate(inputs=prompts, length_params=length_params, sampling_params=sampling_params) + + if fp8_enabled: + response = remove_padded_prompts(response, nb_paddings) + + output_tokens_strings = response['sentences'] + for idx, output_token_string in enumerate(output_tokens_strings): + image = to_img(output_token_string, image_tokenizer) + image.save(os.path.join(cfg.images_output_path, f'{idx}.jpg')) + + print(f'Images saved to {cfg.images_output_path}') + + +if __name__ == '__main__': + main() # 
noqa pylint: disable=no-value-for-parameter diff --git a/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_vision_understanding.py b/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_vision_understanding.py new file mode 100644 index 000000000000..4aea4d9898ae --- /dev/null +++ b/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_vision_understanding.py @@ -0,0 +1,220 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import datetime + +import torch +import torchvision +from examples.nlp.language_modeling.megatron_gpt_eval import ( + RequestDataSet, + load_model_from_config, + remove_padded_prompts, + round_to_mult, +) +from omegaconf import OmegaConf +from PIL import Image +from pytorch_lightning.trainer.trainer import Trainer +from torch.utils.data import DataLoader +from transformers import AutoModel, AutoTokenizer + +# pylint: disable=line-too-long +from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam +from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy +from nemo.core.config import hydra_runner + +""" +This is the script to run multimodal autoregresssive text generation. + +Make sure you install tiktoken==0.6.0 + +Usage: + Assume the model has TP=1, PP=1 in the following use cases. + a. run greedy inference from a nemo file: + python megatron_mm_autoregresssive_eval.py \ + gpt_model_file=PATH_TO_MODEL \ + inference.greedy=True \ + inference.add_BOS=True \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + tensor_model_parallel_size=-1 \ + pipeline_model_parallel_size=-1 \ + images_path=[image_path1,image_path2] + + b. run greedy inference from a PTL checkpoint file: + python megatron_mm_autoregresssive_eval.py \ + checkpoint_dir=PATH_TO_CHECKPOINT_FILE \ + checkpoint_name=CHECKPOINT_FILE_NAME \ + hparams_file=HPARAMS_FILE \ + inference.greedy=True \ + inference.add_BOS=True \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + tensor_model_parallel_size=-1 \ + pipeline_model_parallel_size=-1 \ + images_path=[image_path1,image_path2] + + c. run top_p inference from a nemo file: + python megatron_mm_autoregresssive_eval.py \ + gpt_model_file=PATH_TO_MODEL \ + inference.greedy=False \ + inference.top_k=0 \ + inference.top_p=0.9 \ + inference.repetition_penalty=1.2 \ + inference.add_BOS=True \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + tensor_model_parallel_size=-1 \ + pipeline_model_parallel_size=-1 \ + images_path=[image_path1,image_path2] + + d. 
If you don't need to generate tokens and need model to compute logprobs: + python megatron_mm_autoregresssive_eval.py \ + gpt_model_file=PATH_TO_MODEL \ + inference.compute_logprob=True \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + tensor_model_parallel_size=-1 \ + pipeline_model_parallel_size=-1 \ + images_path=[image_path1,image_path2] +""" + +EMU_HUB = "BAAI/Emu3-Gen" +VQ_HUB = "BAAI/Emu3-VisionTokenizer" + + +def to_imgstr(image_tokens, tokenizer): + """Convert integer image tokens to visual tokens string""" + image_tokens = image_tokens.cpu().numpy().tolist() + image_token_str = [ + ['<|visual token {token_id:0>6d}|>'.format(token_id=token_id) for token_id in token_row] + for token_row in image_tokens + ] + image_row_str = ["".join(token_row) for token_row in image_token_str] + imgstr = tokenizer.eol_token.join(image_row_str) + return imgstr + + +def load_prompts(cfg, image_tokenizer, tokenizer): + """Function to generate prompts + + The prompts generated here are fed to the model. + """ + prompts = [] + text = "Please describe the image" + for image_path in cfg.images_path: + image = Image.open(image_path) + image_tensor = torchvision.transforms.functional.pil_to_tensor(image).unsqueeze(0) + image_tokens = image_tokenizer.encode(image_tensor.to(image_tokenizer.device, image_tokenizer.dtype)) + bs, h, w = image_tokens.shape + imgstr = to_imgstr(image_tokens[0], tokenizer=tokenizer) + image_prompt = ( + tokenizer.boi_token + + f'{h}*{w}' + + tokenizer.img_token + + imgstr + + tokenizer.eol_token + + tokenizer.eof_token + + tokenizer.eoi_token + ) + prompt = f'{tokenizer.bos_token}You are a helpful assistant. USER: {image_prompt}{text}. ASSISTANT:' + prompts.append(prompt) + return prompts + + +if not torch.cuda.is_available(): + raise EnvironmentError("GPU is needed for the inference") + + +@hydra_runner(config_path="conf", config_name="megatron_mm_ar_inference_vision_understanding") +def main(cfg) -> None: + """Main function""" + + callbacks = [] + # enable_progress_bar is True by default. 
If cfg.trainer.enable_progress_bar=False, CustomProgressBar is not appended to callbacks + if 'enable_progress_bar' not in cfg.trainer or cfg.trainer.enable_progress_bar: + callbacks.append(CustomProgressBar()) + # trainer required for restoring model parallel models + trainer = Trainer( + strategy=NLPDDPStrategy(timeout=datetime.timedelta(seconds=18000)), + **cfg.trainer, + callbacks=callbacks, + ) + + tokenizer = AutoTokenizer.from_pretrained(EMU_HUB, trust_remote_code=True) + image_tokenizer = AutoModel.from_pretrained(VQ_HUB, device_map="cuda", trust_remote_code=True).eval() + + model = load_model_from_config(trainer, cfg) + model.freeze() + + # Have to turn off activations_checkpoint_method for inference + try: + model.model.language_model.encoder.activations_checkpoint_method = None + except AttributeError: + pass + + length_params: LengthParam = { + "max_length": cfg.inference.tokens_to_generate, + "min_length": cfg.inference.min_tokens_to_generate, + } + + sampling_params: SamplingParam = { + "use_greedy": cfg.inference.greedy, + "temperature": cfg.inference.temperature, + "top_k": cfg.inference.top_k, + "top_p": cfg.inference.top_p, + "repetition_penalty": cfg.inference.repetition_penalty, + "add_BOS": cfg.inference.add_BOS, + "all_probs": cfg.inference.all_probs, + "compute_logprob": cfg.inference.compute_logprob, + "end_strings": cfg.inference.end_strings, + } + + prompts = [] + with torch.no_grad(): + prompts = load_prompts(cfg, image_tokenizer, tokenizer) + + fp8_enabled = hasattr(model.cfg, "fp8") and (model.cfg.fp8 == True) + if fp8_enabled and len(prompts) > 0: + padded_len = round_to_mult(len(prompts), 8) + nb_paddings = padded_len - len(prompts) + if nb_paddings > 0: + nb_paddings += [''] * nb_paddings + + # First method of running text generation, call model.generate method + response = model.generate(inputs=prompts, length_params=length_params, sampling_params=sampling_params) + + if fp8_enabled: + response = remove_padded_prompts(response, nb_paddings) + print("***************************") + print(response) + print("***************************") + + # Second method of running text generation, call trainer.predict [recommended] + bs = 8 if fp8_enabled else 2 + ds = RequestDataSet(prompts) + request_dl = DataLoader(dataset=ds, batch_size=bs) + config = OmegaConf.to_container(cfg.inference) + model.set_inference_config(config) + response = trainer.predict(model, request_dl) + + if fp8_enabled: + response[-1] = remove_padded_prompts(response[-1], nb_paddings) + print("***************************") + print(response) + print("***************************") + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter diff --git a/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py index e0bd47bb8ce0..c01f2363db75 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py @@ -55,20 +55,6 @@ def pack_hypotheses(hypotheses: List[Hypothesis]) -> List[Hypothesis]: - """ - Packs a list of hypotheses into a tensor and prepares decoder states. - - This function takes a list of token sequences (hypotheses) and converts - it into a tensor format. If any decoder states are on the GPU, they - are moved to the CPU. Additionally, the function removes any timesteps - with a value of -1 from the sequences. - - Args: - hypotheses (list): A list of token sequences representing hypotheses. 
- - Returns: - list: A list of packed hypotheses in tensor format. - """ for idx, hyp in enumerate(hypotheses): # type: rnnt_utils.Hypothesis hyp.y_sequence = torch.tensor(hyp.y_sequence, dtype=torch.long) @@ -83,18 +69,6 @@ def pack_hypotheses(hypotheses: List[Hypothesis]) -> List[Hypothesis]: def _states_to_device(dec_state, device='cpu'): - """ - Transfers decoder states to the specified device. - - This function moves the provided decoder states to the specified device (e.g., 'cpu' or 'cuda'). - - Args: - dec_state (Tensor): The decoder states to be transferred. - device (str): The target device to which the decoder states should be moved. Defaults to 'cpu'. - - Returns: - Tensor: The decoder states on the specified device. - """ if torch.is_tensor(dec_state): dec_state = dec_state.to(device) @@ -132,8 +106,7 @@ class BeamRNNTInfer(Typing): however the time required for the search also grows steadily. `tsd` - time synchronous decoding. Please refer to the paper: - [Alignment-Length Synchronous Decoding for RNN Transducer] - (https://ieeexplore.ieee.org/document/9053040) + [Alignment-Length Synchronous Decoding for RNN Transducer](https://ieeexplore.ieee.org/document/9053040) for details on the algorithm implemented. Time synchronous decoding (TSD) execution time grows by the factor T * max_symmetric_expansions. @@ -141,8 +114,7 @@ class BeamRNNTInfer(Typing): good results. This also requires greater memory to execute. `alsd` - alignment-length synchronous decoding. Please refer to the paper: - [Alignment-Length Synchronous Decoding for RNN Transducer] - (https://ieeexplore.ieee.org/document/9053040) + [Alignment-Length Synchronous Decoding for RNN Transducer](https://ieeexplore.ieee.org/document/9053040) for details on the algorithm implemented. Alignment-length synchronous decoding (ALSD) execution time is faster than TSD, with growth @@ -155,8 +127,7 @@ class BeamRNNTInfer(Typing): For a given decoding accuracy, it is possible to attain faster decoding via ALSD than TSD. `maes` = modified adaptive expansion searcn. Please refer to the paper: - [Accelerating RNN Transducer Inference via Adaptive Expansion Search] - (https://ieeexplore.ieee.org/document/9250505) + [Accelerating RNN Transducer Inference via Adaptive Expansion Search](https://ieeexplore.ieee.org/document/9250505) Modified Adaptive Synchronous Decoding (mAES) execution time is adaptive w.r.t the number of expansions (for tokens) required per timestep. The number of expansions can usually @@ -198,10 +169,10 @@ class BeamRNNTInfer(Typing): and affects the speed of inference since large values will perform large beam search in the next step. maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when computing the expansions. - The default (2.3) is selected from the paper. It performs a comparison - (max_log_prob - gamma <= log_prob[v]) where v is all vocabulary indices in the Vocab set and max_log_prob - is the "most" likely token to be predicted. Gamma therefore provides a margin of additional tokens which - can be potential candidates for expansion apart from the "most likely" candidate. + The default (2.3) is selected from the paper. It performs a comparison (max_log_prob - gamma <= log_prob[v]) + where v is all vocabulary indices in the Vocab set and max_log_prob is the "most" likely token to be + predicted. Gamma therefore provides a margin of additional tokens which can be potential candidates for + expansion apart from the "most likely" candidate. 
Lower values will reduce the number of expansions (by increasing pruning-by-value, thereby improving speed but hurting accuracy). Higher values will increase the number of expansions (by reducing pruning-by-value, thereby reducing speed but potentially improving accuracy). This is a hyper parameter to be experimentally @@ -211,7 +182,7 @@ class BeamRNNTInfer(Typing): preserve_alignments: Bool flag which preserves the history of alignments generated during beam decoding (sample). When set to true, the Hypothesis will contain - the non-null value for `alignments` in it. Here, `alignments` is a List of List of Tensor (of length V + 1) + the non-null value for `alignments` in it. Here, `alignments` is a List of List of Tensor (of length V + 1). The length of the list corresponds to the Acoustic Length (T). Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more targets from a vocabulary. @@ -1485,11 +1456,8 @@ def compute_ngram_score(self, current_lm_state: "kenlm.State", label: int) -> Tu return lm_score, next_state def set_decoding_type(self, decoding_type: str): - """ - Sets decoding type. Please check train_kenlm.py in scripts/asr_language_modeling/ to find out why we need - Args: - decoding_type: decoding type - """ + + # Please check train_kenlm.py in scripts/asr_language_modeling/ to find out why we need # TOKEN_OFFSET for BPE-based models if decoding_type == 'subword': from nemo.collections.asr.parts.submodules.ctc_beam_decoding import DEFAULT_TOKEN_OFFSET @@ -1499,10 +1467,6 @@ def set_decoding_type(self, decoding_type: str): @dataclass class BeamRNNTInferConfig: - """ - Beam RNNT Inference config. - """ - beam_size: int search_type: str = 'default' score_norm: bool = True diff --git a/nemo/collections/asr/parts/submodules/rnnt_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_decoding.py index d3a63467c485..da280a0c6b3c 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_decoding.py @@ -23,7 +23,7 @@ import torch from omegaconf import OmegaConf -from nemo.collections.asr.parts.submodules import rnnt_beam_decoding, rnnt_greedy_decoding, tdt_beam_decoding +from nemo.collections.asr.parts.submodules import rnnt_beam_decoding, rnnt_greedy_decoding from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceConfig, ConfidenceMixin from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis, NBestHypotheses from nemo.collections.common.tokenizers.aggregate_tokenizer import AggregateTokenizer @@ -67,15 +67,15 @@ class AbstractRNNTDecoding(ConfidenceMixin): rnnt_timestamp_type: A str value, which represents the types of timestamps that should be calculated. Can take the following values - "char" for character/subword time stamps, "word" for word level - time stamps, "segment" for segment level time stamps and "all" (default), for character, word and - segment level time stamps. + time stamps, "segment" for segment level time stamps and "all" (default), for character, + word and segment level time stamps. word_seperator: Str token representing the seperator between words. segment_seperators: List containing tokens representing the seperator(s) between segments. - segment_gap_threshold: The threshold (in frames) that caps the gap between two words necessary for forming - the segments. + segment_gap_threshold: The threshold (in frames) that caps the gap between two words necessary + for forming the segments. 
preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores generated during decoding (sample / batched). When set to true, the Hypothesis will contain @@ -106,8 +106,8 @@ class AbstractRNNTDecoding(ConfidenceMixin): from the `token_confidence`. aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. Valid options are `mean`, `min`, `max`, `prod`. - tdt_include_duration: Bool flag indicating that the duration confidence scores are to be calculated and - attached to the regular frame confidence, + tdt_include_duration: Bool flag indicating that the duration confidence scores are to be calculated + and attached to the regular frame confidence, making TDT frame confidence element a pair: (`prediction_confidence`, `duration_confidence`). method_cfg: A dict-like object which contains the method name and settings to compute per-frame confidence scores. @@ -179,23 +179,23 @@ class AbstractRNNTDecoding(ConfidenceMixin): maes_num_steps: Number of adaptive steps to take. From the paper, 2 steps is generally sufficient, and can be reduced to 1 to improve decoding speed while sacrificing some accuracy. int > 0. - maes_prefix_alpha: Maximum prefix length in prefix search. Must be an integer, and is advised to keep - this as 1 in order to reduce expensive beam search cost later. int >= 0. + maes_prefix_alpha: Maximum prefix length in prefix search. Must be an integer, and is advised to + keep this as 1 in order to reduce expensive beam search cost later. int >= 0. maes_expansion_beta: Maximum number of prefix expansions allowed, in addition to the beam size. Effectively, the number of hypothesis = beam_size + maes_expansion_beta. Must be an int >= 0, - and affects the speed of inference since large values will perform large beam search in the next - step. + and affects the speed of inference since large values will perform large beam search in the + next step. maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when computing the expansions. The default (2.3) is selected from the paper. It performs a comparison - (max_log_prob - gamma <= log_prob[v]) where v is all vocabulary indices in the Vocab set and - max_log_prob is the "most" likely token to be predicted. Gamma therefore provides a margin of - additional tokens which can be potential candidates for expansion apart from the "most likely" + (max_log_prob - gamma <= log_prob[v]) where v is all vocabulary indices in the Vocab set + and max_log_prob is the "most" likely token to be predicted. Gamma therefore provides a margin + of additional tokens which can be potential candidates for expansion apart from the "most likely" candidate. Lower values will reduce the number of expansions (by increasing pruning-by-value, thereby improving speed but hurting accuracy). Higher values will increase the number of expansions - (by reducing pruning-by-value, thereby reducing speed but potentially improving accuracy). This is - a hyper parameter to be experimentally tuned on a validation set. + (by reducing pruning-by-value, thereby reducing speed but potentially improving accuracy). + This is a hyper parameter to be experimentally tuned on a validation set. softmax_temperature: Scales the logits of the joint prior to computing log_softmax. 
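The prune-by-value comparison described for `maes_expansion_gamma` above can be illustrated with a short, self-contained sketch (the function name, tensor shape, and standalone usage are illustrative assumptions, not NeMo's internal implementation):

import torch

def select_maes_expansions(joint_logits: torch.Tensor, gamma: float = 2.3, temperature: float = 1.0) -> torch.Tensor:
    # Temperature-scale the joint logits before log_softmax, mirroring `softmax_temperature`.
    log_probs = torch.log_softmax(joint_logits / temperature, dim=-1)
    max_log_prob = log_probs.max()
    # Keep every vocabulary index v that satisfies: max_log_prob - gamma <= log_prob[v].
    keep_mask = log_probs >= (max_log_prob - gamma)
    return torch.nonzero(keep_mask, as_tuple=False).squeeze(-1)

# A smaller gamma prunes more aggressively (fewer expansions, faster decoding);
# a larger gamma keeps more candidate tokens at extra cost.
candidate_token_ids = select_maes_expansions(torch.randn(129), gamma=2.3)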
@@ -234,10 +234,8 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int, supported_punctu raise ValueError("blank_id must equal len(non_blank_vocabs) for TDT models") if self.big_blank_durations is not None and self.big_blank_durations != []: raise ValueError("duration and big_blank_durations can't both be not None") - if self.cfg.strategy not in ['greedy', 'greedy_batch', 'beam', 'maes']: - raise ValueError( - "currently only greedy, greedy_batch, beam and maes inference is supported for TDT models" - ) + if self.cfg.strategy not in ['greedy', 'greedy_batch']: + raise ValueError("currently only greedy and greedy_batch inference is supported for TDT models") if ( self.big_blank_durations is not None and self.big_blank_durations != [] @@ -388,32 +386,20 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int, supported_punctu ) elif self.cfg.strategy == 'beam': - if self.big_blank_durations is None or self.big_blank_durations == []: - if not self._is_tdt: - self.decoding = rnnt_beam_decoding.BeamRNNTInfer( - decoder_model=decoder, - joint_model=joint, - beam_size=self.cfg.beam.beam_size, - return_best_hypothesis=decoding_cfg.beam.get('return_best_hypothesis', True), - search_type='default', - score_norm=self.cfg.beam.get('score_norm', True), - softmax_temperature=self.cfg.beam.get('softmax_temperature', 1.0), - preserve_alignments=self.preserve_alignments, - ) - else: - self.decoding = tdt_beam_decoding.BeamTDTInfer( - decoder_model=decoder, - joint_model=joint, - durations=self.durations, - beam_size=self.cfg.beam.beam_size, - return_best_hypothesis=decoding_cfg.beam.get('return_best_hypothesis', True), - search_type='default', - score_norm=self.cfg.beam.get('score_norm', True), - softmax_temperature=self.cfg.beam.get('softmax_temperature', 1.0), - preserve_alignments=self.preserve_alignments, - ) + + self.decoding = rnnt_beam_decoding.BeamRNNTInfer( + decoder_model=decoder, + joint_model=joint, + beam_size=self.cfg.beam.beam_size, + return_best_hypothesis=decoding_cfg.beam.get('return_best_hypothesis', True), + search_type='default', + score_norm=self.cfg.beam.get('score_norm', True), + softmax_temperature=self.cfg.beam.get('softmax_temperature', 1.0), + preserve_alignments=self.preserve_alignments, + ) elif self.cfg.strategy == 'tsd': + self.decoding = rnnt_beam_decoding.BeamRNNTInfer( decoder_model=decoder, joint_model=joint, @@ -427,6 +413,7 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int, supported_punctu ) elif self.cfg.strategy == 'alsd': + self.decoding = rnnt_beam_decoding.BeamRNNTInfer( decoder_model=decoder, joint_model=joint, @@ -440,44 +427,26 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int, supported_punctu ) elif self.cfg.strategy == 'maes': - if self.big_blank_durations is None or self.big_blank_durations == []: - if not self._is_tdt: - self.decoding = rnnt_beam_decoding.BeamRNNTInfer( - decoder_model=decoder, - joint_model=joint, - beam_size=self.cfg.beam.beam_size, - return_best_hypothesis=decoding_cfg.beam.get('return_best_hypothesis', True), - search_type='maes', - score_norm=self.cfg.beam.get('score_norm', True), - maes_num_steps=self.cfg.beam.get('maes_num_steps', 2), - maes_prefix_alpha=self.cfg.beam.get('maes_prefix_alpha', 1), - maes_expansion_gamma=self.cfg.beam.get('maes_expansion_gamma', 2.3), - maes_expansion_beta=self.cfg.beam.get('maes_expansion_beta', 2.0), - softmax_temperature=self.cfg.beam.get('softmax_temperature', 1.0), - preserve_alignments=self.preserve_alignments, - 
ngram_lm_model=self.cfg.beam.get('ngram_lm_model', None), - ngram_lm_alpha=self.cfg.beam.get('ngram_lm_alpha', 0.0), - hat_subtract_ilm=self.cfg.beam.get('hat_subtract_ilm', False), - hat_ilm_weight=self.cfg.beam.get('hat_ilm_weight', 0.0), - ) - else: - self.decoding = tdt_beam_decoding.BeamTDTInfer( - decoder_model=decoder, - joint_model=joint, - durations=self.durations, - beam_size=self.cfg.beam.beam_size, - return_best_hypothesis=decoding_cfg.beam.get('return_best_hypothesis', True), - search_type='maes', - score_norm=self.cfg.beam.get('score_norm', True), - maes_num_steps=self.cfg.beam.get('maes_num_steps', 2), - maes_prefix_alpha=self.cfg.beam.get('maes_prefix_alpha', 1), - maes_expansion_gamma=self.cfg.beam.get('maes_expansion_gamma', 2.3), - maes_expansion_beta=self.cfg.beam.get('maes_expansion_beta', 2.0), - softmax_temperature=self.cfg.beam.get('softmax_temperature', 1.0), - preserve_alignments=self.preserve_alignments, - ngram_lm_model=self.cfg.beam.get('ngram_lm_model', None), - ngram_lm_alpha=self.cfg.beam.get('ngram_lm_alpha', 0.3), - ) + + self.decoding = rnnt_beam_decoding.BeamRNNTInfer( + decoder_model=decoder, + joint_model=joint, + beam_size=self.cfg.beam.beam_size, + return_best_hypothesis=decoding_cfg.beam.get('return_best_hypothesis', True), + search_type='maes', + score_norm=self.cfg.beam.get('score_norm', True), + maes_num_steps=self.cfg.beam.get('maes_num_steps', 2), + maes_prefix_alpha=self.cfg.beam.get('maes_prefix_alpha', 1), + maes_expansion_gamma=self.cfg.beam.get('maes_expansion_gamma', 2.3), + maes_expansion_beta=self.cfg.beam.get('maes_expansion_beta', 2.0), + softmax_temperature=self.cfg.beam.get('softmax_temperature', 1.0), + preserve_alignments=self.preserve_alignments, + ngram_lm_model=self.cfg.beam.get('ngram_lm_model', None), + ngram_lm_alpha=self.cfg.beam.get('ngram_lm_alpha', 0.0), + hat_subtract_ilm=self.cfg.beam.get('hat_subtract_ilm', False), + hat_ilm_weight=self.cfg.beam.get('hat_ilm_weight', 0.0), + ) + else: raise ValueError( @@ -759,15 +728,6 @@ def decode_ids_to_langs(self, tokens: List[int]) -> List[str]: raise NotImplementedError() def update_joint_fused_batch_size(self): - """ " - Updates the fused batch size for the joint module if applicable. - - If `joint_fused_batch_size` is set, verifies that the joint module has - the required `set_fused_batch_size` and `set_fuse_loss_wer` functions. - If present, updates the batch size; otherwise, logs a warning. - - If `joint_fused_batch_size` is <= 0, disables fused batch processing. - """ if self.joint_fused_batch_size is None: # do nothing and let the Joint itself handle setting up of the fused batch return @@ -794,21 +754,6 @@ def update_joint_fused_batch_size(self): self.decoding.joint.set_fuse_loss_wer(False) def compute_rnnt_timestamps(self, hypothesis: Hypothesis, timestamp_type: str = "all"): - """ - Computes character, word, and segment timestamps for an RNN-T hypothesis. - - This function generates timestamps for characters, words, and segments within - a hypothesis sequence. The type of timestamps computed depends on `timestamp_type`, - which can be 'char', 'word', 'segment', or 'all'. - - Args: - hypothesis (Hypothesis): Hypothesis. - timestamp_type (str): Type of timestamps to compute. Options are 'char', 'word', 'segment', or 'all'. - Defaults to 'all'. - - Returns: - Hypothesis: The updated hypothesis with computed timestamps for characters, words, and/or segments. 
- """ assert timestamp_type in ['char', 'word', 'segment', 'all'] # Unpack the temporary storage @@ -945,7 +890,7 @@ def _compute_offsets( # Construct the start and end indices brackets end_indices = np.asarray(token_repetitions).cumsum() - start_indices = np.concatenate(([start_index], end_indices[:-1])) + start_indices = np.concatenate(([int(start_index)], end_indices[:-1])) # Process the TxU dangling alignment tensor, containing pairs of (logits, label) alignment_labels = [al_logits_labels for al_logits_labels in hypothesis.text[1]] @@ -1008,8 +953,8 @@ def _refine_timestamps_tdt( # Check if token is a punctuation mark # If so, set its start and end offset as start and end of the previous token - # This is done because there was observed a behaviour, when punctuation marks are - # predicted long after preceding token (i.e. after silence) + # This is done because there was observed a behaviour, when punctuation marks are predicted long + # after preceding token (i.e. after silence) if offset['char'][0] in supported_punctuation and i > 0: encoded_char_offsets[i]['start_offset'] = offset['start_offset'] = char_offsets[i - 1]['end_offset'] encoded_char_offsets[i]['end_offset'] = offset['end_offset'] = offset['start_offset'] @@ -1169,8 +1114,7 @@ def _get_segment_offsets( offsets: A list of dictionaries, each containing "word", "start_offset" and "end_offset". segments_delimiter_tokens: List containing tokens representing the seperator(s) between segments. supported_punctuation: Set containing punctuation marks in the vocabulary. - segment_gap_threshold: Number of frames between 2 consecutive words necessary to form segments out of plain - text. + segment_gap_threshold: Number of frames between 2 consecutive words necessary to form segments out of plain text. Returns: A list of dictionaries containing the segment offsets. Each item contains "segment", "start_offset" and "end_offset". @@ -1298,10 +1242,9 @@ class RNNTDecoding(AbstractRNNTDecoding): exclude_blank: Bool flag indicating that blank token confidence scores are to be excluded from the `token_confidence`. aggregation: Which aggregation type to use for collapsing per-token confidence into per-word - confidence. - Valid options are `mean`, `min`, `max`, `prod`. - tdt_include_duration: Bool flag indicating that the duration confidence scores are to be calculated and - attached to the regular frame confidence, + confidence. Valid options are `mean`, `min`, `max`, `prod`. + tdt_include_duration: Bool flag indicating that the duration confidence scores are to be calculated + and attached to the regular frame confidence, making TDT frame confidence element a pair: (`prediction_confidence`, `duration_confidence`). method_cfg: A dict-like object which contains the method name and settings to compute per-frame confidence scores. @@ -1388,7 +1331,7 @@ class RNNTDecoding(AbstractRNNTDecoding): and can be reduced to 1 to improve decoding speed while sacrificing some accuracy. int > 0. maes_prefix_alpha: Maximum prefix length in prefix search. Must be an integer, and is advised to - keep this as 1 in order to reduce expensive beam search cost later. int >= 0. + keep this as 1 in order to reduce expensive beam search cost later. int >= 0. maes_expansion_beta: Maximum number of prefix expansions allowed, in addition to the beam size. Effectively, the number of hypothesis = beam_size + maes_expansion_beta. Must be an int >= 0, @@ -1396,7 +1339,8 @@ class RNNTDecoding(AbstractRNNTDecoding): next step. 
maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when computing the - expansions. The default (2.3) is selected from the paper. It performs a comparison + expansions. + The default (2.3) is selected from the paper. It performs a comparison (max_log_prob - gamma <= log_prob[v]) where v is all vocabulary indices in the Vocab set and max_log_prob is the "most" likely token to be predicted. Gamma therefore provides a margin of additional tokens which can be potential candidates for expansion apart from the "most likely" @@ -1438,9 +1382,7 @@ def __init__( supported_punctuation=supported_punctuation, ) - if isinstance(self.decoding, rnnt_beam_decoding.BeamRNNTInfer) or isinstance( - self.decoding, tdt_beam_decoding.BeamTDTInfer - ): + if isinstance(self.decoding, rnnt_beam_decoding.BeamRNNTInfer): self.decoding.set_decoding_type('char') def _aggregate_token_confidence(self, hypothesis: Hypothesis) -> List[float]: @@ -1556,8 +1498,8 @@ class RNNTBPEDecoding(AbstractRNNTDecoding): segment_seperators: List containing tokens representing the seperator(s) between segments. - segment_gap_threshold: The threshold (in frames) that caps the gap between two words necessary for forming - the segments. + segment_gap_threshold: The threshold (in frames) that caps the gap between two words necessary for + forming the segments. preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores generated during decoding (sample / batched). When set to true, the Hypothesis will contain @@ -1588,8 +1530,8 @@ class RNNTBPEDecoding(AbstractRNNTDecoding): from the `token_confidence`. aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. Valid options are `mean`, `min`, `max`, `prod`. - tdt_include_duration: Bool flag indicating that the duration confidence scores are to be calculated and - attached to the regular frame confidence, + tdt_include_duration: Bool flag indicating that the duration confidence scores are to be + calculated and attached to the regular frame confidence, making TDT frame confidence element a pair: (`prediction_confidence`, `duration_confidence`). method_cfg: A dict-like object which contains the method name and settings to compute per-frame confidence scores. @@ -1660,7 +1602,7 @@ class RNNTBPEDecoding(AbstractRNNTDecoding): at increased cost to execution time. alsd_max_target_len: optional int or float, determines the potential maximum target sequence - length.If an integer is provided, it can decode sequences of that particular maximum length. + length. If an integer is provided, it can decode sequences of that particular maximum length. If a float is provided, it can decode sequences of int(alsd_max_target_len * seq_len), where seq_len is the length of the acoustic model output (T). @@ -1680,15 +1622,16 @@ class RNNTBPEDecoding(AbstractRNNTDecoding): and affects the speed of inference since large values will perform large beam search in the next step. - maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when computing the - expansions. The default (2.3) is selected from the paper. It performs a comparison - (max_log_prob - gamma <= log_prob[v]) where v is all vocabulary indices in the Vocab set and - max_log_prob is the "most" likely token to be predicted. Gamma therefore provides a margin of - additional tokens which can be potential candidates for expansion apart from the "most likely" - candidate. 
Lower values will reduce the number of expansions (by increasing pruning-by-value, - thereby improving speed but hurting accuracy). Higher values will increase the number of - expansions (by reducing pruning-by-value, thereby reducing speed but potentially improving - accuracy). This is a hyper parameter to be experimentally tuned on a validation set. + maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when + computing the expansions. The default (2.3) is selected from the paper. It performs a + comparison (max_log_prob - gamma <= log_prob[v]) where v is all vocabulary indices in the + Vocab set and max_log_prob is the "most" likely token to be predicted. Gamma therefore + provides a margin of additional tokens which can be potential candidates for expansion + apart from the "most likely" candidate. Lower values will reduce the number of expansions + (by increasing pruning-by-value, thereby improving speed but hurting accuracy). Higher + values will increase the number of expansions (by reducing pruning-by-value, thereby + reducing speed but potentially improving accuracy). This is a hyper parameter to be + experimentally tuned on a validation set. softmax_temperature: Scales the logits of the joint prior to computing log_softmax. @@ -1715,9 +1658,7 @@ def __init__(self, decoding_cfg, decoder, joint, tokenizer: TokenizerSpec): supported_punctuation=supported_punctuation, ) - if isinstance(self.decoding, rnnt_beam_decoding.BeamRNNTInfer) or isinstance( - self.decoding, tdt_beam_decoding.BeamTDTInfer - ): + if isinstance(self.decoding, rnnt_beam_decoding.BeamRNNTInfer): self.decoding.set_decoding_type('subword') def _aggregate_token_confidence(self, hypothesis: Hypothesis) -> List[float]: @@ -1818,8 +1759,8 @@ def decode_hypothesis(self, hypotheses_list: List[Hypothesis]) -> List[Union[Hyp hypotheses[ind].langs_chars = self.decode_ids_to_langs(prediction) else: logging.warning( - "Ignoring request for lang output in hypotheses since the model does not use an aggregate \ - tokenizer" + "Ignoring request for lang output in hypotheses since the model does not use an aggregate\ + tokenizer" ) return hypotheses @@ -1827,10 +1768,6 @@ def decode_hypothesis(self, hypotheses_list: List[Hypothesis]) -> List[Union[Hyp @dataclass class RNNTDecodingConfig: - """ - RNNT Decoding config - """ - model_type: str = "rnnt" # one of "rnnt", "multiblank" or "tdt" strategy: str = "greedy_batch" @@ -1888,8 +1825,4 @@ class RNNTDecodingConfig: @dataclass class RNNTBPEDecodingConfig(RNNTDecodingConfig): - """ - RNNT BPE Decoding Config - """ - pass diff --git a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py index bd169d0d224e..f9cf368fe405 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py @@ -49,20 +49,7 @@ def pack_hypotheses( hypotheses: List[rnnt_utils.Hypothesis], logitlen: torch.Tensor, ) -> List[rnnt_utils.Hypothesis]: - """ - Packs a list of hypotheses into a tensor and prepares decoder states. - - This function takes a list of token sequences (hypotheses) and converts - it into a tensor format. If any decoder states are on the GPU, they - are moved to the CPU. Additionally, the function removes any timesteps - with a value of -1 from the sequences. - - Args: - hypotheses (list): A list of token sequences representing hypotheses. - Returns: - list: A list of packed hypotheses in tensor format. 
- """ if hasattr(logitlen, 'cpu'): logitlen_cpu = logitlen.to('cpu') else: @@ -591,8 +578,7 @@ class GreedyBatchedRNNTInfer(_GreedyRNNTInfer, WithOptionalCudaGraphs): (evaluating Joint multiple times in inner loop); It uses a minimal possible amount of calls to prediction network (with maximum possible batch size), which makes it especially useful for scaling the prediction network. - use_cuda_graph_decoder: if CUDA graphs should be enabled for decoding - (currently recommended only for inference) + use_cuda_graph_decoder: if CUDA graphs should be enabled for decoding (currently recommended only for inference) """ def __init__( @@ -1183,10 +1169,6 @@ def _greedy_decode_masked( class ExportedModelGreedyBatchedRNNTInfer: - """ - Exported Model Greedy Batched RNNT Infer class - """ - def __init__(self, encoder_model: str, decoder_joint_model: str, max_symbols_per_step: Optional[int] = None): self.encoder_model_path = encoder_model self.decoder_joint_model_path = decoder_joint_model @@ -1362,25 +1344,9 @@ def _setup_blank_index(self): raise NotImplementedError() def run_encoder(self, audio_signal, length): - """ - Runs encoder network: - - Args: - audio_signal: audio signal - length: audio length - """ raise NotImplementedError() def run_decoder_joint(self, enc_logits, targets, target_length, *states): - """ - Runs decoder joint networks. - - Args: - enc_logits: encoder logits - targets: targets - target_length: target length - states: states - """ raise NotImplementedError() def _get_initial_states(self, batchsize): @@ -1388,10 +1354,6 @@ def _get_initial_states(self, batchsize): class ONNXGreedyBatchedRNNTInfer(ExportedModelGreedyBatchedRNNTInfer): - """ - ONNX Greedy Batched RNNT Infer class - """ - def __init__(self, encoder_model: str, decoder_joint_model: str, max_symbols_per_step: Optional[int] = 10): super().__init__( encoder_model=encoder_model, @@ -1471,8 +1433,7 @@ def _setup_blank_index(self): self._blank_index = log_probs.shape[-1] - 1 # last token of vocab size is blank token logging.info( - f"Enc-Dec-Joint step was evaluated, \ - blank token id = {self._blank_index}; vocab size = {log_probs.shape[-1]}" + f"Enc-Dec-Joint step was evaluated, blank token id = {self._blank_index}; vocab size = {log_probs.shape[-1]}" ) def run_encoder(self, audio_signal, length): @@ -1551,10 +1512,6 @@ def _get_initial_states(self, batchsize): class TorchscriptGreedyBatchedRNNTInfer(ExportedModelGreedyBatchedRNNTInfer): - """ - Torchscript Greedy Batched RNNT Infer - """ - def __init__( self, encoder_model: str, @@ -2379,8 +2336,6 @@ def _greedy_decode_masked( @dataclass class GreedyRNNTInferConfig: - """Greedy RNNT Infer Config""" - max_symbols_per_step: Optional[int] = 10 preserve_alignments: bool = False preserve_frame_confidence: bool = False @@ -2399,8 +2354,6 @@ def __post_init__(self): @dataclass class GreedyBatchedRNNTInferConfig: - """Greedy Batched RNNT Infer Config""" - max_symbols_per_step: Optional[int] = 10 preserve_alignments: bool = False preserve_frame_confidence: bool = False @@ -2755,8 +2708,7 @@ class GreedyBatchedTDTInfer(_GreedyRNNTInfer, WithOptionalCudaGraphs): - 'lin' for using the linear mapping. - 'exp' for using exponential mapping with linear shift. 
- use_cuda_graph_decoder: if CUDA graphs should be enabled for decoding - (currently recommended only for inference) + use_cuda_graph_decoder: if CUDA graphs should be enabled for decoding (currently recommended only for inference) """ def __init__( diff --git a/nemo/collections/asr/parts/submodules/tdt_beam_decoding.py b/nemo/collections/asr/parts/submodules/tdt_beam_decoding.py deleted file mode 100644 index 908fc1c13d19..000000000000 --- a/nemo/collections/asr/parts/submodules/tdt_beam_decoding.py +++ /dev/null @@ -1,800 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Optional, Tuple - -import numpy as np -import torch -from tqdm import tqdm - -from nemo.collections.asr.modules import rnnt_abstract -from nemo.collections.asr.parts.submodules.rnnt_beam_decoding import pack_hypotheses -from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis, NBestHypotheses, is_prefix -from nemo.core.classes import Typing, typecheck -from nemo.core.neural_types import AcousticEncodedRepresentation, HypothesisType, LengthsType, NeuralType -from nemo.utils import logging - -try: - import kenlm - - KENLM_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - KENLM_AVAILABLE = False - - -class BeamTDTInfer(Typing): - """ - Beam search implementation for Token-andDuration Transducer (TDT) models. - - Sequence level beam decoding or batched-beam decoding, performed auto-repressively - depending on the search type chosen. - - Args: - decoder_model: rnnt_utils.AbstractRNNTDecoder implementation. - joint_model: rnnt_utils.AbstractRNNTJoint implementation. - durations: list of duration values from TDT model. - - beam_size: number of beams for beam search. Must be a positive integer >= 1. - If beam size is 1, defaults to stateful greedy search. - For accurate greedy results, please use GreedyRNNTInfer or GreedyBatchedRNNTInfer. - - search_type: str representing the type of beam search to perform. - Must be one of ['beam', 'maes']. - - Algorithm used: - - `default` - basic beam search strategy. Larger beams generally result in better decoding, - however the time required for the search also grows steadily. - - `maes` = modified adaptive expansion search. 
Please refer to the paper: - [Accelerating RNN Transducer Inference via Adaptive Expansion Search] - (https://ieeexplore.ieee.org/document/9250505) - - Modified Adaptive Synchronous Decoding (mAES) execution time is adaptive w.r.t the - number of expansions (for tokens) required per timestep. The number of expansions can usually - be constrained to 1 or 2, and in most cases 2 is sufficient. - - This beam search technique can possibly obtain superior WER while sacrificing some evaluation time. - - score_norm: bool, whether to normalize the scores of the log probabilities. - - return_best_hypothesis: bool, decides whether to return a single hypothesis (the best out of N), - or return all N hypothesis (sorted with best score first). The container class changes based - this flag - - When set to True (default), returns a single Hypothesis. - When set to False, returns a NBestHypotheses container, which contains a list of Hypothesis. - - # The following arguments are specific to the chosen `search_type` - - # mAES flags - maes_num_steps: Number of adaptive steps to take. From the paper, 2 steps is generally sufficient. int > 1. - - maes_prefix_alpha: Maximum prefix length in prefix search. Must be an integer, and is advised to keep this as 1 - in order to reduce expensive beam search cost later. int >= 0. - - maes_expansion_beta: Maximum number of prefix expansions allowed, in addition to the beam size. - Effectively, the number of hypothesis = beam_size + maes_expansion_beta. Must be an int >= 0, - and affects the speed of inference since large values will perform large beam search in the next step. - - maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when computing the expansions. - The default (2.3) is selected from the paper. It performs a comparison - (max_log_prob - gamma <= log_prob[v]) where v is all vocabulary indices in the Vocab set and max_log_prob - is the "most" likely token to be predicted. Gamma therefore provides a margin of additional tokens which - can be potential candidates for expansion apart from the "most likely" candidate. - Lower values will reduce the number of expansions (by increasing pruning-by-value, thereby improving speed - but hurting accuracy). Higher values will increase the number of expansions (by reducing pruning-by-value, - thereby reducing speed but potentially improving accuracy). This is a hyper parameter to be experimentally - tuned on a validation set. - - softmax_temperature: Scales the logits of the joint prior to computing log_softmax. - - preserve_alignments: Bool flag which preserves the history of alignments generated during - beam decoding (sample). When set to true, the Hypothesis will contain - the non-null value for `alignments` in it. Here, `alignments` is a List of List of Tensor (of length V + 1) - - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more targets from a vocabulary. - U is the number of target tokens for the current timestep Ti. - - NOTE: `preserve_alignments` is an invalid argument for any `search_type` - other than basic beam search. - - ngram_lm_model: str - The path to the N-gram LM. - ngram_lm_alpha: float - Alpha weight of N-gram LM. 
- """ - - @property - def input_types(self): - """Returns definitions of module input ports.""" - return { - "encoder_output": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "encoded_lengths": NeuralType(tuple('B'), LengthsType()), - "partial_hypotheses": [NeuralType(elements_type=HypothesisType(), optional=True)], # must always be last - } - - @property - def output_types(self): - """Returns definitions of module output ports.""" - return {"predictions": [NeuralType(elements_type=HypothesisType())]} - - def __init__( - self, - decoder_model: rnnt_abstract.AbstractRNNTDecoder, - joint_model: rnnt_abstract.AbstractRNNTJoint, - durations: list, - beam_size: int, - search_type: str = 'default', - score_norm: bool = True, - return_best_hypothesis: bool = True, - maes_num_steps: int = 2, - maes_prefix_alpha: int = 1, - maes_expansion_gamma: float = 2.3, - maes_expansion_beta: int = 2, - softmax_temperature: float = 1.0, - preserve_alignments: bool = False, - ngram_lm_model: Optional[str] = None, - ngram_lm_alpha: float = 0.3, - ): - self.joint = joint_model - self.decoder = decoder_model - self.durations = durations - - self.token_offset = 0 - self.search_type = search_type - self.blank = decoder_model.blank_idx - self.vocab_size = decoder_model.vocab_size - self.return_best_hypothesis = return_best_hypothesis - - self.beam_size = beam_size - self.score_norm = score_norm - self.max_candidates = beam_size - self.softmax_temperature = softmax_temperature - self.preserve_alignments = preserve_alignments - - if preserve_alignments: - raise ValueError("Alignment preservation has not been implemented.") - if beam_size < 1: - raise ValueError("Beam search size cannot be less than 1!") - - if self.preserve_alignments: - raise NotImplementedError("Preserving alignments is not implemented.") - - if search_type == "default": - if self.beam_size == 1: - logging.info( - """If beam size is 1, defaults to stateful greedy search. - For accurate greedy results, please use GreedyTDTInfer or GreedyBatchedTDTInfer.""" - ) - self.search_algorithm = self.default_beam_search - elif search_type == "tsd": - raise NotImplementedError("`tsd` (Time Synchronous Decoding) has not been implemented.") - elif search_type == "alsd": - raise NotImplementedError("`alsd` (Alignment Length Synchronous Decoding) has not been implemented.") - elif search_type == "nsc": - raise NotImplementedError("`nsc` (Constrained Beam Search) has not been implemented.") - elif search_type == "maes": - self.search_algorithm = self.modified_adaptive_expansion_search - else: - raise NotImplementedError( - f"The search type ({search_type}) supplied is not supported!\n" f"Please use one of : (default, maes)" - ) - - if self.search_type == 'maes': - self.maes_num_steps = int(maes_num_steps) - self.maes_prefix_alpha = int(maes_prefix_alpha) - self.maes_expansion_beta = int(maes_expansion_beta) - self.maes_expansion_gamma = float(maes_expansion_gamma) - - self.max_candidates += maes_expansion_beta - - if self.maes_prefix_alpha < 0: - raise ValueError("`maes_prefix_alpha` must be a positive integer.") - - if self.vocab_size < beam_size + maes_expansion_beta: - raise ValueError( - f"beam_size ({beam_size}) + expansion_beta ({maes_expansion_beta}) " - f"should be smaller or equal to vocabulary size ({self.vocab_size})." 
- ) - - if self.maes_num_steps < 1: - raise ValueError("`maes_num_steps` must be greater than 0.") - - try: - self.zero_duration_idx = self.durations.index(0) - except ValueError: - self.zero_duration_idx = None - self.min_non_zero_duration_idx = int( - np.argmin(np.ma.masked_where(np.array(self.durations) == 0, self.durations)) - ) - - if ngram_lm_model: - if search_type != "maes": - raise ValueError("For decoding with language model `maes` decoding strategy must be chosen.") - - if KENLM_AVAILABLE: - self.ngram_lm = kenlm.Model(ngram_lm_model) - self.ngram_lm_alpha = ngram_lm_alpha - else: - raise ImportError( - "KenLM package (https://github.com/kpu/kenlm) is not installed. " "Use ngram_lm_model=None." - ) - else: - self.ngram_lm = None - - @typecheck() - def __call__( - self, - encoder_output: torch.Tensor, - encoded_lengths: torch.Tensor, - partial_hypotheses: tuple[list[Hypothesis | NBestHypotheses],] = None, - ) -> tuple[list[Hypothesis | NBestHypotheses],]: - """Perform general beam search. - - Args: - encoder_output: encoder outputs (batch, features, timesteps). - encoded_lengths: lengths of the encoder outputs. - - Returns: - Either a list containing a single Hypothesis (when `return_best_hypothesis=True`, - otherwise a list containing a single NBestHypotheses, which itself contains a list of - Hypothesis. This list is sorted such that the best hypothesis is the first element. - """ - # Preserve decoder and joint training state - decoder_training_state = self.decoder.training - joint_training_state = self.joint.training - - with torch.inference_mode(): - # Apply optional preprocessing - encoder_output = encoder_output.transpose(1, 2) # (B, T, D) - - self.decoder.eval() - self.joint.eval() - - hypotheses = [] - with tqdm( - range(encoder_output.size(0)), - desc='Beam search progress:', - total=encoder_output.size(0), - unit='sample', - ) as idx_gen: - - _p = next(self.joint.parameters()) - dtype = _p.dtype - - # Decode every sample in the batch independently. - for batch_idx in idx_gen: - inseq = encoder_output[batch_idx : batch_idx + 1, : encoded_lengths[batch_idx], :] # [1, T, D] - logitlen = encoded_lengths[batch_idx] - - if inseq.dtype != dtype: - inseq = inseq.to(dtype=dtype) - - # Extract partial hypothesis if exists - partial_hypothesis = partial_hypotheses[batch_idx] if partial_hypotheses is not None else None - - # Execute the specific search strategy - nbest_hyps = self.search_algorithm( - inseq, logitlen, partial_hypotheses=partial_hypothesis - ) # sorted list of hypothesis - - # Prepare the list of hypotheses - nbest_hyps = pack_hypotheses(nbest_hyps) - - # Pack the result - if self.return_best_hypothesis: - best_hypothesis: Hypothesis = nbest_hyps[0] - else: - best_hypothesis: NBestHypotheses = NBestHypotheses(nbest_hyps) - hypotheses.append(best_hypothesis) - - self.decoder.train(decoder_training_state) - self.joint.train(joint_training_state) - - return (hypotheses,) - - def default_beam_search( - self, - encoder_outputs: torch.Tensor, - encoded_lengths: torch.Tensor, - partial_hypotheses: Optional[Hypothesis] = None, - ) -> List[Hypothesis]: - """Default Beam search implementation for TDT models. - - Args: - encoder_outputs: encoder outputs (batch, features, timesteps). - encoded_lengths: lengths of the encoder outputs. - partial_hypotheses: partial hypoteses. 
- - Returns: - nbest_hyps: N-best decoding results - """ - if partial_hypotheses is not None: - raise NotImplementedError("Support for `partial_hypotheses` is not implemented.") - - beam = min(self.beam_size, self.vocab_size) - beam_k = min(beam, (self.vocab_size - 1)) - durations_beam_k = min(beam, len(self.durations)) - - # Initialize zero vector states. - decoder_state = self.decoder.initialize_state(encoder_outputs) - # Cache decoder results to avoid duplicate computations. - cache = {} - - # Initialize hypothesis array with blank hypothesis. - start_hyp = Hypothesis( - score=0.0, y_sequence=[self.blank], dec_state=decoder_state, timestep=[-1], length=0, last_frame=0 - ) - kept_hyps = [start_hyp] - - for time_idx in range(int(encoded_lengths)): - # Retrieve hypotheses for current and future frames - hyps = [hyp for hyp in kept_hyps if hyp.last_frame == time_idx] # hypotheses for current frame - kept_hyps = [hyp for hyp in kept_hyps if hyp.last_frame > time_idx] # hypothesis for future frames - - # Loop over hypotheses of current frame - while len(hyps) > 0: - max_hyp = max(hyps, key=lambda x: x.score) - hyps.remove(max_hyp) - - # Update decoder state and get probability distribution over vocabulary and durations. - encoder_output = encoder_outputs[:, time_idx : time_idx + 1, :] # [1, 1, D] - decoder_output, decoder_state, _ = self.decoder.score_hypothesis(max_hyp, cache) # [1, 1, D] - logits = ( - self.joint.joint(encoder_output, decoder_output) / self.softmax_temperature - ) # [1, 1, 1, V + NUM_DURATIONS + 1] - logp = torch.log_softmax(logits[0, 0, 0, : -len(self.durations)], dim=-1) # [V + 1] - durations_logp = torch.log_softmax(logits[0, 0, 0, -len(self.durations) :], dim=-1) # [NUM_DURATIONS] - - # Proccess non-blank tokens - # Retrieve the top `beam_k` most probable tokens and the top `duration_beam_k` most probable durations. - # Then, select the top `beam_k` pairs of (token, duration) based on the highest combined probabilities. - # Note that indices are obtained in the flattened array. 
- logp_topks, logp_topk_idxs = logp[:-1].topk(beam_k, dim=-1) # topk of tokens without blank token - durations_logp_topks, durations_logp_topk_idxs = durations_logp.topk(durations_beam_k, dim=-1) - total_logp_topks, total_logp_topk_idxs = ( - torch.cartesian_prod(durations_logp_topks, logp_topks).sum(dim=-1).topk(beam_k, dim=-1) - ) - - # Loop over pairs of (token, duration) with highest combined log prob - for total_logp_topk, total_logp_topk_idx in zip(total_logp_topks, total_logp_topk_idxs): - # Restore indices from flattened array indices - token_idx = int(logp_topk_idxs[total_logp_topk_idx % beam_k]) - duration_idx = int(durations_logp_topk_idxs[total_logp_topk_idx // beam_k]) - - duration = self.durations[duration_idx] - # Construct hypothesis for non-blank token - new_hyp = Hypothesis( - score=float(max_hyp.score + total_logp_topk), # update score - y_sequence=max_hyp.y_sequence + [token_idx], # update hypothesis sequence - dec_state=decoder_state, # update decoder state - timestep=max_hyp.timestep + [time_idx + duration], # update timesteps - length=encoded_lengths, - last_frame=max_hyp.last_frame + duration, - ) # update frame idx where last token appeared - - # Update current frame hypotheses if duration is zero and future frame hypotheses otherwise - if duration == 0: - hyps.append(new_hyp) - else: - kept_hyps.append(new_hyp) - - # Update future frames with blank tokens - # Note: blank token can have only non-zero duration - for duration_idx in durations_logp_topk_idxs: - duration_idx = int(duration_idx) - # If zero is the only duration in topk, switch to closest non-zero duration to continue - if duration_idx == self.zero_duration_idx: - if durations_logp_topk_idxs.shape[0] == 1: - duration_idx = self.min_non_zero_duration_idx - else: - continue - - duration = self.durations[duration_idx] - new_hyp = Hypothesis( - score=float(max_hyp.score + logp[self.blank] + durations_logp[duration_idx]), # update score - y_sequence=max_hyp.y_sequence[:], # no need to update sequence - dec_state=max_hyp.dec_state, # no need to update decoder state - timestep=max_hyp.timestep[:], # no need to update timesteps - length=encoded_lengths, - last_frame=max_hyp.last_frame + duration, - ) # update frame idx where last token appeared - kept_hyps.append(new_hyp) - - # Merge duplicate hypotheses. - # If two consecutive blank tokens are predicted and their duration values sum up to the same number, - # it will produce two hypotheses with the same token sequence but different scores. - kept_hyps = self.merge_duplicate_hypotheses(kept_hyps) - - if len(hyps) > 0: - # Keep those hypothesis that have scores greater than next search generation - hyps_max = float(max(hyps, key=lambda x: x.score).score) - kept_most_prob = sorted( - [hyp for hyp in kept_hyps if hyp.score > hyps_max], - key=lambda x: x.score, - ) - # If enough hypotheses have scores greater than next search generation, - # stop beam search. - if len(kept_most_prob) >= beam: - kept_hyps = kept_most_prob - break - else: - # If there are no hypotheses in a current frame, - # keep only `beam` best hypotheses for the next search generation. - kept_hyps = sorted(kept_hyps, key=lambda x: x.score, reverse=True)[:beam] - return self.sort_nbest(kept_hyps) - - def modified_adaptive_expansion_search( - self, - encoder_outputs: torch.Tensor, - encoded_lengths: torch.Tensor, - partial_hypotheses: Optional[Hypothesis] = None, - ) -> List[Hypothesis]: - """ - Modified Adaptive Exoansion Search algorithm for TDT models. 
- Based on/modified from https://ieeexplore.ieee.org/document/9250505. - Supports N-gram language model shallow fusion. - - Args: - encoder_outputs: encoder outputs (batch, features, timesteps). - encoded_lengths: lengths of the encoder outputs. - partial_hypotheses: partial hypotheses. - - Returns: - nbest_hyps: N-best decoding results - """ - if partial_hypotheses is not None: - raise NotImplementedError("Support for `partial_hypotheses` is not implemented.") - - beam = min(self.beam_size, self.vocab_size) - beam_state = self.decoder.initialize_state( - torch.zeros(1, device=encoder_outputs.device, dtype=encoder_outputs.dtype) - ) # [L, B, H], [L, B, H] for LSTMS - - # Initialize first hypothesis for the beam (blank). - start_hyp = Hypothesis( - y_sequence=[self.blank], - score=0.0, - dec_state=self.decoder.batch_select_state(beam_state, 0), - timestep=[-1], - length=0, - last_frame=0, - ) - init_tokens = [start_hyp] - - # Cache decoder results to avoid duplicate computations. - cache = {} - - # Decode a batch of beam states and scores - beam_decoder_output, beam_state = self.decoder.batch_score_hypothesis(init_tokens, cache) - state = beam_state[0] - - # Initialize first hypothesis for the beam (blank) for kept hypotheses - start_hyp_kept = Hypothesis( - y_sequence=[self.blank], - score=0.0, - dec_state=state, - dec_out=[beam_decoder_output[0]], - timestep=[-1], - length=0, - last_frame=0, - ) - - kept_hyps = [start_hyp_kept] - - # Setup ngram LM: - if self.ngram_lm: - init_lm_state = kenlm.State() - self.ngram_lm.BeginSentenceWrite(init_lm_state) - start_hyp_kept.ngram_lm_state = init_lm_state - - for time_idx in range(encoded_lengths): - # Select current iteration hypotheses - hyps = [x for x in kept_hyps if x.last_frame == time_idx] - kept_hyps = [x for x in kept_hyps if x.last_frame > time_idx] - - if len(hyps) == 0: - continue - - beam_encoder_output = encoder_outputs[:, time_idx : time_idx + 1] # [1, 1, D] - # Perform prefix search to update hypothesis scores. - if self.zero_duration_idx is not None: - hyps = self.prefix_search( - sorted(hyps, key=lambda x: len(x.y_sequence), reverse=True), - beam_encoder_output, - prefix_alpha=self.maes_prefix_alpha, - ) - - list_b = [] # List that contains the blank token emissions - list_nb = [] # List that contains the non-zero duration non-blank token emissions - # Repeat for number of mAES steps - for n in range(self.maes_num_steps): - # Pack the decoder logits for all current hypotheses - beam_decoder_output = torch.stack([h.dec_out[-1] for h in hyps]) # [H, 1, D] - - # Extract the log probabilities - beam_logits = self.joint.joint(beam_encoder_output, beam_decoder_output) / self.softmax_temperature - beam_logp = torch.log_softmax(beam_logits[:, 0, 0, : -len(self.durations)], dim=-1) - beam_duration_logp = torch.log_softmax(beam_logits[:, 0, 0, -len(self.durations) :], dim=-1) - - # Retrieve the top `max_candidades` most probable tokens. - # Then, select the top `max_candidates` pairs of (token, duration) - # based on the highest combined probabilities. - # Note that indices are obtained in flattened array. 
- beam_logp_topks, beam_idx_topks = beam_logp.topk(self.max_candidates, dim=-1) - beam_total_logp = (beam_duration_logp[:, :, None] + beam_logp_topks[:, None, :]).view( - len(hyps), -1 - ) # [B, MAX_CANDIDATES*DURATION_BEAM] - beam_total_logp_topks, beam_total_logp_topk_idxs = beam_total_logp.topk( - self.max_candidates, dim=-1 - ) # [B, MAX_CANDIDATES] - - # Prune hypothesis to obtain k expansions - beam_best_expansion_scores = beam_total_logp_topks.max(dim=-1, keepdim=True).values - beam_masks = beam_total_logp_topks >= beam_best_expansion_scores - self.maes_expansion_gamma - beam_kexpansions_idxs = [ - sum_logp_topk_idxs[mask] for sum_logp_topk_idxs, mask in zip(beam_total_logp_topk_idxs, beam_masks) - ] - - list_exp = [] # List that contains the hypothesis expansion - list_nb_exp = [] # List that contains the hypothesis expansion - for hyp_idx, hyp in enumerate(hyps): # For all hypothesis - for idx in beam_kexpansions_idxs[hyp_idx]: # For all expansions within this hypothesis - # Restore indices in logp and durations_logp arrays from flattened indices. - k = int(beam_idx_topks[hyp_idx][idx % self.max_candidates]) - duration = self.durations[int(idx // self.max_candidates)] - total_logp = float(beam_total_logp[hyp_idx][idx]) - - # Forcing blank token to have non-zero duration - if k == self.blank and duration == 0: - duration = self.durations[self.min_non_zero_duration_idx] - - new_hyp = Hypothesis( - score=hyp.score + total_logp, - y_sequence=hyp.y_sequence[:], - dec_out=hyp.dec_out[:], - dec_state=hyp.dec_state, - timestep=hyp.timestep[:], - length=time_idx, - last_frame=hyp.last_frame + duration, - ) - - if self.ngram_lm: - new_hyp.ngram_lm_state = hyp.ngram_lm_state - - # If the expansion was for blank - if k == self.blank: - list_b.append(new_hyp) - else: - new_hyp.y_sequence.append(k) - new_hyp.timestep.append(time_idx + duration) - - if self.ngram_lm: - lm_score, new_hyp.ngram_lm_state = self.compute_ngram_score(hyp.ngram_lm_state, int(k)) - new_hyp.score += self.ngram_lm_alpha * lm_score - - # If token duration is 0 adding to expansions list - if duration == 0: - list_exp.append(new_hyp) - else: - list_nb_exp.append(new_hyp) - - # Update states for hypothesis that do not end with blank - hyps_to_update = list_nb_exp + list_exp - if len(hyps_to_update) > 0: - # Decode a batch of beam states and scores - beam_decoder_output, beam_state = self.decoder.batch_score_hypothesis( - hyps_to_update, - cache, - ) - for hyp_idx, hyp in enumerate(hyps_to_update): - # Preserve the decoder logits for the current beam - hyp.dec_out.append(beam_decoder_output[hyp_idx]) - hyp.dec_state = beam_state[hyp_idx] - - # If there were no token expansions in any of the hypotheses, - # Early exit - list_nb += list_nb_exp - if not list_exp: - kept_hyps = kept_hyps + list_b + list_nb - kept_hyps = self.merge_duplicate_hypotheses(kept_hyps) - kept_hyps = sorted(kept_hyps, key=lambda x: x.score, reverse=True)[:beam] - - break - else: - # If this isn't the last mAES step - if n < (self.maes_num_steps - 1): - # Copy the expanded hypothesis for the next iteration - hyps = self.merge_duplicate_hypotheses(list_exp) - else: - # If this is the last mAES step add probabilities of the blank token to the end. 
- # Extract the log probabilities - beam_decoder_output = torch.stack([h.dec_out[-1] for h in list_exp]) # [H, 1, D] - beam_logits = ( - self.joint.joint(beam_encoder_output, beam_decoder_output) / self.softmax_temperature - ) - beam_logp = torch.log_softmax(beam_logits[:, 0, 0, : -len(self.durations)], dim=-1) - - # Get most probable durations - beam_duration_logp = torch.log_softmax(beam_logits[:, 0, 0, -len(self.durations) :], dim=-1) - _, beam_max_duration_idx = torch.max(beam_duration_logp, dim=-1) - - # For all expansions, add the score for the blank label - for hyp_idx, hyp in enumerate(list_exp): - # If zero duration was obtained, change to the closest non-zero duration - duration_idx = int(beam_max_duration_idx[hyp_idx]) - if duration_idx == self.zero_duration_idx: - duration_idx = self.min_non_zero_duration_idx - - total_logp = float( - beam_logp[hyp_idx, self.blank] + beam_duration_logp[hyp_idx, duration_idx] - ) - hyp.score += total_logp - hyp.last_frame += self.durations[duration_idx] - - # Finally, update the kept hypothesis of sorted top Beam candidates - kept_hyps = kept_hyps + list_b + list_exp + list_nb - kept_hyps = self.merge_duplicate_hypotheses(kept_hyps) - kept_hyps = sorted(kept_hyps, key=lambda x: x.score, reverse=True)[:beam] - - # Sort the hypothesis with best scores - return self.sort_nbest(kept_hyps) - - def merge_duplicate_hypotheses(self, hypotheses): - """ - Merges hypotheses with identical token sequences and lengths. - The combined hypothesis's probability is the sum of the probabilities of all duplicates. - Duplicate hypotheses occur when two consecutive blank tokens are predicted - and their duration values sum up to the same number. - - Args: - hypotheses: list of hypotheses. - - Returns: - hypotheses: list if hypotheses without duplicates. - """ - sorted_hyps = sorted(hypotheses, key=lambda x: x.score, reverse=True) - kept_hyps = {} - for hyp in sorted_hyps: - hyp_key = (tuple(hyp.y_sequence), int(hyp.last_frame)) - if hyp_key in kept_hyps: - kept_hyp = kept_hyps[hyp_key] - kept_hyp.score = float(torch.logaddexp(torch.tensor(kept_hyp.score), torch.tensor(hyp.score))) - else: - kept_hyps[hyp_key] = hyp - return list(kept_hyps.values()) - - def set_decoding_type(self, decoding_type: str): - """ - Sets decoding type. Please check train_kenlm.py in scripts/asr_language_modeling/ to find out why we need - Args: - decoding_type: decoding type - """ - # TOKEN_OFFSET for BPE-based models - if decoding_type == 'subword': - from nemo.collections.asr.parts.submodules.ctc_beam_decoding import DEFAULT_TOKEN_OFFSET - - self.token_offset = DEFAULT_TOKEN_OFFSET - - def prefix_search( - self, hypotheses: List[Hypothesis], encoder_output: torch.Tensor, prefix_alpha: int - ) -> List[Hypothesis]: - """ - Performs a prefix search and updates the scores of the hypotheses in place. - Based on https://arxiv.org/pdf/1211.3711.pdf. - - Args: - hypotheses: a list of hypotheses sorted by the length from the longest to the shortest. - encoder_output: encoder output. - prefix_alpha: maximum allowable length difference between hypothesis and a prefix. - - Returns: - hypotheses: list of hypotheses with updated scores. - """ - # Iterate over hypotheses. - for curr_idx, curr_hyp in enumerate(hypotheses[:-1]): - # For each hypothesis, iterate over the subsequent hypotheses. - # If a hypothesis is a prefix of the current one, update current score. 
- for pref_hyp in hypotheses[(curr_idx + 1) :]: - curr_hyp_length = len(curr_hyp.y_sequence) - pref_hyp_length = len(pref_hyp.y_sequence) - - if ( - is_prefix(curr_hyp.y_sequence, pref_hyp.y_sequence) - and (curr_hyp_length - pref_hyp_length) <= prefix_alpha - ): - # Compute the score of the first token - # that follows the prefix hypothesis tokens in current hypothesis. - # Use the decoder output, which is stored in the prefix hypothesis. - logits = self.joint.joint(encoder_output, pref_hyp.dec_out[-1]) / self.softmax_temperature - logp = torch.log_softmax(logits[0, 0, 0, : -len(self.durations)], dim=-1) - duration_logp = torch.log_softmax(logits[0, 0, 0, -len(self.durations) :], dim=-1) - curr_score = pref_hyp.score + float( - logp[curr_hyp.y_sequence[pref_hyp_length]] + duration_logp[self.zero_duration_idx] - ) - - if self.ngram_lm: - lm_score, next_state = self.compute_ngram_score( - pref_hyp.ngram_lm_state, int(curr_hyp.y_sequence[pref_hyp_length]) - ) - curr_score += self.ngram_lm_alpha * lm_score - - for k in range(pref_hyp_length, (curr_hyp_length - 1)): - # Compute the score of the next token. - # Approximate decoder output with the one that is stored in current hypothesis. - logits = self.joint.joint(encoder_output, curr_hyp.dec_out[k]) / self.softmax_temperature - logp = torch.log_softmax(logits[0, 0, 0, : -len(self.durations)], dim=-1) - duration_logp = torch.log_softmax(logits[0, 0, 0, -len(self.durations) :], dim=-1) - curr_score += float(logp[curr_hyp.y_sequence[k + 1]] + duration_logp[self.zero_duration_idx]) - - if self.ngram_lm: - lm_score, next_state = self.compute_ngram_score( - next_state, int(curr_hyp.y_sequence[k + 1]) - ) - curr_score += self.ngram_lm_alpha * lm_score - - # Update current hypothesis score - curr_hyp.score = np.logaddexp(curr_hyp.score, curr_score) - return hypotheses - - def compute_ngram_score(self, current_lm_state: "kenlm.State", label: int) -> Tuple[float, "kenlm.State"]: - """ - Computes the score for KenLM Ngram language model. - - Args: - current_lm_state: current state of the KenLM language model. - label: next label. - - Returns: - lm_score: score for `label`. - """ - if self.token_offset: - label = chr(label + self.token_offset) - else: - label = str(label) - - next_state = kenlm.State() - lm_score = self.ngram_lm.BaseScore(current_lm_state, label, next_state) - lm_score *= 1.0 / np.log10(np.e) - - return lm_score, next_state - - def sort_nbest(self, hyps: List[Hypothesis]) -> List[Hypothesis]: - """Sort hypotheses by score or score given sequence length. - - Args: - hyps: list of hypotheses - - Return: - hyps: sorted list of hypotheses - """ - if self.score_norm: - return sorted(hyps, key=lambda x: x.score / len(x.y_sequence), reverse=True) - else: - return sorted(hyps, key=lambda x: x.score, reverse=True) diff --git a/nemo/collections/asr/parts/utils/rnnt_utils.py b/nemo/collections/asr/parts/utils/rnnt_utils.py index 8d2755fcc0ae..76e9da6087ed 100644 --- a/nemo/collections/asr/parts/utils/rnnt_utils.py +++ b/nemo/collections/asr/parts/utils/rnnt_utils.py @@ -85,8 +85,6 @@ class Hypothesis: tokens: (Optional) A list of decoded tokens (can be characters or word-pieces. last_token (Optional): A token or batch of tokens which was predicted in the last step. - - last_frame (Optional): Index of the last decoding step hypothesis was updated including blank token prediction. 
""" score: float @@ -107,7 +105,6 @@ class Hypothesis: tokens: Optional[Union[List[int], torch.Tensor]] = None last_token: Optional[torch.Tensor] = None token_duration: Optional[List[int]] = None - last_frame: Optional[int] = None @property def non_blank_frame_confidence(self) -> List[float]: @@ -247,8 +244,7 @@ def __init__( Args: batch_size: batch size for hypotheses - init_length: initial estimate for the length of hypotheses (if the real length is higher, - tensors will be reallocated) + init_length: initial estimate for the length of hypotheses (if the real length is higher, tensors will be reallocated) device: device for storing hypotheses float_dtype: float type for scores """ @@ -278,9 +274,6 @@ def __init__( self._ones_batch = torch.ones_like(self._batch_indices) def clear_(self): - """ - Clears batched hypotheses state. - """ self.current_lengths.fill_(0) self.transcript.fill_(0) self.timesteps.fill_(0) @@ -504,9 +497,6 @@ def __init__( self._batch_indices = torch.arange(batch_size, device=device) def clear_(self): - """ - Clears batched hypotheses state. - """ self.current_lengths.fill_(0) self.timesteps.fill_(0) self.logits.fill_(0.0) diff --git a/nemo/collections/multimodal_autoregressive/data/README.md b/nemo/collections/multimodal_autoregressive/data/README.md index c4814ad267f8..3f6d5a6c6a81 100644 --- a/nemo/collections/multimodal_autoregressive/data/README.md +++ b/nemo/collections/multimodal_autoregressive/data/README.md @@ -8,27 +8,7 @@ This is an example of how to do autoregressive generation for multiple modalitie ### 1. Vision Understanding using EMU3 Tokenizer #### Download and Extract data -We will be working with coyo dataset which has 700 million images. - -First create credentials for rclone . Create this file at `~/.config/rclone/rclone.conf` -``` -[pbss-team-vfm-share-ro-s3] -type = s3 -env_auth = true -access_key_id = -secret_access_key = -region = us-east-1 -endpoint = https://pdx.s8k.io -``` -To download the images -``` -rclone copy pbss-team-vfm-share-ro-s3:webdataset_images/webdataset_edify_image_v3/coyo_700m/resolution_lt_720/aspect_ratio_16_9/images images --transfers=16 --multi-thread-streams=16 --checkers=8 -P --stats 5s -``` - -To download the captions -``` -rclone copy pbss-team-vfm-share-ro-s3:webdataset_images/webdataset_edify_image_v3/coyo_700m/resolution_lt_720/aspect_ratio_16_9/captions_ai_v3p1 captions_ai_v3p1 --transfers=16 --multi-thread-streams=16 --checkers=8 -P --stats 5s -``` +Download the [COYO700M dataset](https://github.com/kakaobrain/coyo-dataset) Once downloaded extract the data using tar utilities. @@ -70,13 +50,13 @@ Follow usual nemo instructions to train any autoregressive model. ``` #### Inference -To run inference edit the [inference config file](examples/multimodal_autoregressive/conf/megatron_mm_ar_inference.yaml) +To run inference edit the [inference config file](examples/multimodal_autoregressive/conf/megatron_mm_ar_inference_vision_understanding.yaml) *NOTE* Make sure you have a .nemo file (checkpoint). 
If you just have a regular megatron checkpoint you have to do a conversion as shown in [this doc](https://docs.nvidia.com/nemo-framework/user-guide/latest/llms/gpt/checkpointconversion.html?highlight=convert) Run inference as follows ``` -torchrun --nproc-per-node 2 examples/multimodal_autoregressive/megatron_mm_autoregressive_eval.py +torchrun --nproc-per-node 2 examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_vision_understanding.py ``` @@ -116,13 +96,11 @@ Follow usual nemo instructions to train any autoregressive model. ``` #### Inference -To run inference edit the [inference config file](examples/multimodal_autoregressive/conf/megatron_mm_ar_inference.yaml) +To run inference edit the [inference config file](examples/multimodal_autoregressive/conf/megatron_mm_ar_inference_image_generation.yaml) *NOTE* Make sure you have a .nemo file (checkpoint). If you just have a regular megatron checkpoint you have to do a conversion as shown in [this doc](https://docs.nvidia.com/nemo-framework/user-guide/latest/llms/gpt/checkpointconversion.html?highlight=convert) Run inference as follows ``` -torchrun --nproc-per-node 2 examples/multimodal_autoregressive/megatron_mm_autoregressive_eval.py -``` - -TODO : Instructions to convert visual tokens to images coming soon. \ No newline at end of file +torchrun --nproc-per-node 2 examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_image_generation.py +``` \ No newline at end of file diff --git a/tests/collections/asr/decoding/test_rnnt_decoding.py b/tests/collections/asr/decoding/test_rnnt_decoding.py index b5250ad5f144..82b5d00bede6 100644 --- a/tests/collections/asr/decoding/test_rnnt_decoding.py +++ b/tests/collections/asr/decoding/test_rnnt_decoding.py @@ -22,9 +22,8 @@ from nemo.collections.asr.models import ASRModel from nemo.collections.asr.modules import RNNTDecoder, RNNTJoint from nemo.collections.asr.parts.mixins import mixins -from nemo.collections.asr.parts.submodules import rnnt_beam_decoding +from nemo.collections.asr.parts.submodules import rnnt_beam_decoding as beam_decode from nemo.collections.asr.parts.submodules import rnnt_greedy_decoding as greedy_decode -from nemo.collections.asr.parts.submodules import tdt_beam_decoding from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTBPEDecoding, RNNTDecoding, RNNTDecodingConfig from nemo.collections.asr.parts.utils import rnnt_utils from nemo.core.utils import numba_utils @@ -167,39 +166,6 @@ def check_subword_timestamps(hyp: rnnt_utils.Hypothesis, decoding: RNNTBPEDecodi assert len(hyp.timestep['segment']) == segments_count -def check_beam_decoding(test_data_dir, beam_config): - beam_size = beam_config.pop("beam_size", 1) - model, encoded, encoded_len = get_model_encoder_output(test_data_dir, 'nvidia/parakeet-tdt_ctc-110m') - - model_config = model.to_config_dict() - durations = list(model_config["model_defaults"]["tdt_durations"]) - - beam = tdt_beam_decoding.BeamTDTInfer( - model.decoder, - model.joint, - beam_size=beam_size, - return_best_hypothesis=False, - durations=durations, - **beam_config, - ) - - enc_out = encoded - enc_len = encoded_len - - with torch.no_grad(): - hyps: rnnt_utils.Hypothesis = beam(encoder_output=enc_out, encoded_lengths=enc_len)[0] - _, all_hyps = decode_text_from_nbest_hypotheses(hyps, model.decoding) - all_hyps = all_hyps[0] - - print("Beam search algorithm :", beam_config['search_type']) - for idx, hyp_ in enumerate(all_hyps): - print("Hyp index", idx + 1, "text :", hyp_.text) - - assert len(hyp_.timestep) > 0 - 
print("Timesteps", hyp_.timestep) - print() - - class TestRNNTDecoding: @pytest.mark.unit def test_constructor(self): @@ -346,10 +312,10 @@ def test_batched_greedy_decoding_preserve_alignments(self, test_data_dir, loop_l {"search_type": "maes", "maes_num_steps": 3, "maes_expansion_beta": 1, "beam_size": 2}, ], ) - def test_rnnt_beam_decoding_preserve_alignments(self, test_data_dir, beam_config): + def test_beam_decoding_preserve_alignments(self, test_data_dir, beam_config): beam_size = beam_config.pop("beam_size", 1) model, encoded, encoded_len = get_model_encoder_output(test_data_dir, 'stt_en_conformer_transducer_small') - beam = rnnt_beam_decoding.BeamRNNTInfer( + beam = beam_decode.BeamRNNTInfer( model.decoder, model.joint, beam_size=beam_size, @@ -476,51 +442,3 @@ def test_char_decoding_compute_timestamps(self, test_data_dir, decoding_strategy hyps, _ = decoding.rnnt_decoder_predictions_tensor(encoded, encoded_len, return_hypotheses=True) check_char_timestamps(hyps[0], decoding) - - @pytest.mark.skipif( - not NUMBA_RNNT_LOSS_AVAILABLE, - reason='RNNTLoss has not been compiled with appropriate numba version.', - ) - @pytest.mark.with_downloads - @pytest.mark.unit - @pytest.mark.parametrize( - "beam_config", - [ - { - "search_type": "default", - "beam_size": 2, - }, - {"search_type": "maes", "maes_num_steps": 2, "maes_expansion_beta": 2, "beam_size": 2}, - {"search_type": "maes", "maes_num_steps": 2, "maes_expansion_beta": 1, "beam_size": 4}, - ], - ) - def test_tdt_beam_decoding(self, test_data_dir, beam_config): - check_beam_decoding(test_data_dir, beam_config) - - @pytest.mark.skipif( - not NUMBA_RNNT_LOSS_AVAILABLE, - reason='RNNTLoss has not been compiled with appropriate numba version.', - ) - @pytest.mark.with_downloads - @pytest.mark.unit - @pytest.mark.parametrize( - "beam_config", - [ - { - "search_type": "maes", - "maes_num_steps": 2, - "maes_expansion_beta": 1, - "beam_size": 4, - "ngram_lm_alpha": 0.3, - }, - ], - ) - def test_tdt_beam_decoding_with_kenlm(self, test_data_dir, beam_config): - # skipping if kenlm is not installed - pytest.importorskip("kenlm", reason="Skipping test because 'kenlm' is not installed.") - - kenlm_model_path = os.path.join( - test_data_dir, "asr", "kenlm_ngram_lm", "parakeet-tdt_ctc-110m-libri-1024.kenlm.tmp.arpa" - ) - beam_config["ngram_lm_model"] = kenlm_model_path - check_beam_decoding(test_data_dir, beam_config) diff --git a/tutorials/llm/llama-3/README.rst b/tutorials/llm/llama-3/README.rst index 3bb1a0896b82..bb6171e6f582 100755 --- a/tutorials/llm/llama-3/README.rst +++ b/tutorials/llm/llama-3/README.rst @@ -17,6 +17,6 @@ This repository contains jupyter notebook tutorials using NeMo Framework for Lla * - `Llama 3.1 Law-Domain LoRA Fine-Tuning and Deployment with NeMo Framework and NVIDIA NIM <./sdg-law-title-generation>`_ - `Law StackExchange `_ - Perform LoRA PEFT on Llama 3.1 8B Instruct using a synthetically augmented version of Law StackExchange with NeMo Framework, followed by deployment with NVIDIA NIM. As a pre-requisite, follow the tutorial for `data curation using NeMo Curator `__. - * - `Llama 3.1 Pruning and Distillation with NeMo Framework <./pruning-distillation>`_ + * - `Llama 3.1 WikiText Pruning and Distillation with NeMo Framework <./pruning-distillation>`_ - `WikiText-103-v1 `_ - - Perform pruning and distillation on Llama 3.1 8B using the WikiText-103-v1 dataset with NeMo Framework. + - Perform pruning and distillation on Llama 3.1 8B Instruct using the WikiText-103-v1 dataset with NeMo Framework. 
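The tutorial row above points at the pruning-distillation notebooks, whose overall flow is: preprocess WikiText-103, finetune the Llama 3.1 8B teacher, prune it (by depth or by width), then distill into the pruned 4B student. As a minimal sketch of the pruning step, the depth-pruning command from those notebooks looks like the following; the paths, the 8-GPU parallelism settings, and the dropped layer range 16-31 are the tutorial's illustrative defaults, not requirements:

```
# Drop layers 16-31 from the finetuned teacher to produce the 4B depth-pruned student
python -m torch.distributed.launch --nproc_per_node=8 \
    /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_drop_layers.py \
    --path_to_nemo "./distill_trainings/megatron_llama_ft/checkpoints/megatron_llama_ft.nemo" \
    --path_to_save "/workspace/4b_depth_pruned_model.nemo" \
    --tensor_model_parallel_size 8 \
    --pipeline_model_parallel_size 1 \
    --gpus_per_node 8 \
    --drop_layers 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
```

The resulting `4b_depth_pruned_model.nemo` is then passed as the `STUDENT` checkpoint to `megatron_gpt_distillation.py`, as shown in the distillation notebooks further below.
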
diff --git a/tutorials/llm/llama-3/pruning-distillation/01_data_preparation.ipynb b/tutorials/llm/llama-3/pruning-distillation/01_data_preparation.ipynb deleted file mode 100644 index 1f84dd2719e6..000000000000 --- a/tutorials/llm/llama-3/pruning-distillation/01_data_preparation.ipynb +++ /dev/null @@ -1,102 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "ab9e2e97-7f10-4353-859e-693842bde465", - "metadata": {}, - "source": [ - "### Step 1: Prepare the dataset\n", - "\n", - "The dataset has to be preprocessed using the [preprocess_data_for_megatron.py](https://github.com/NVIDIA/NeMo/blob/main/scripts/nlp_language_modeling/preprocess_data_for_megatron.py) script included in the NeMo Framework. This step will also tokenize data using the `meta-llama/Meta-Llama-3.1-8B` tokenizer model to convert the data into a memory map format.\n", - "\n", - "> `NOTE:` In the block of code below, pass the paths to your train, test and validation data files." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6505c00b-9eb4-4087-9e49-423f6228e690", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "!python /opt/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \\\n", - "--input=\"./wikitext-data/wikitext-train.jsonl\" \\\n", - "--tokenizer-library='huggingface' \\\n", - "--tokenizer-type='meta-llama/Meta-Llama-3.1-8B' \\\n", - "--output-prefix=wikitext_tokenized_train \\\n", - "--append-eod \\\n", - "--workers=32" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb1aa80f-70bc-4dff-8b08-3bff48d9a1c3", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "!python /opt/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \\\n", - "--input=\"./wikitext-data/wikitext-test.jsonl\" \\\n", - "--tokenizer-library='huggingface' \\\n", - "--tokenizer-type='meta-llama/Meta-Llama-3.1-8B' \\\n", - "--output-prefix=wikitext_tokenized_test \\\n", - "--append-eod \\\n", - "--workers=32" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "42bec54a-94f6-4c87-8e14-2726ef6c2625", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "!python /opt/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \\\n", - "--input=\"./wikitext-data/wikitext-val.jsonl\" \\\n", - "--tokenizer-library='huggingface' \\\n", - "--tokenizer-type='meta-llama/Meta-Llama-3.1-8B' \\\n", - "--output-prefix=wikitext_tokenized_val \\\n", - "--append-eod \\\n", - "--workers=32" - ] - }, - { - "cell_type": "markdown", - "id": "5d77ee8a-e0dc-44f7-b5e8-3b6025d979d7", - "metadata": {}, - "source": [ - "After running the above scripts, you will see the preprocesed `wikitext_tokenized_{train/val/test}_text_document.{idx/bin}`files. These output files will be used in the next step." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tutorials/llm/llama-3/pruning-distillation/02_teacher_finetuning.ipynb b/tutorials/llm/llama-3/pruning-distillation/02_teacher_finetuning.ipynb deleted file mode 100644 index 8d08793bbe9a..000000000000 --- a/tutorials/llm/llama-3/pruning-distillation/02_teacher_finetuning.ipynb +++ /dev/null @@ -1,153 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "84b146ba-08b6-4adb-a858-8e4294c5e781", - "metadata": {}, - "source": [ - "\n", - "### Step 2: Finetune the teacher on the dataset\n", - "\n", - "NeMo framework includes a standard python script [megatron_gpt_pretraining.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/megatron_gpt_pretraining.py) for training a model. Once you have your model downloaded and the dataset ready, fine-tuning the teacher model with NeMo is essentially just running this script!\n", - "\n", - "We finetune the unpruned model on our dataset to correct the distribution shift across the original dataset the model was trained on. Per the [blog](https://developer.nvidia.com/blog/how-to-prune-and-distill-llama-3-1-8b-to-an-nvidia-llama-3-1-minitron-4b-model/) and [tech report](https://arxiv.org/pdf/2408.11796), experiments showed that, without correcting for the distribution shift, the teacher provides suboptimal guidance on the dataset when being distilled.\n", - "\n", - "For this demonstration, this training run is capped by `STEPS`, and validation is carried out every `VAL_INTERVAL` steps.\n", - "\n", - "> `NOTE:` In the block of code below, pass the paths to your pre-processed train, test and validation data files as well as path to the teacher .nemo model." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12007ac8-2fd5-4de8-8964-97821c2198c0", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "%%bash \n", - "\n", - "export CUDA_DEVICE_MAX_CONNECTIONS=1\n", - "\n", - "# Set path(s) if different:\n", - "\n", - "MODEL=\"/workspace/llama-3_1-8b-nemo_v1.0/llama3_1_8b.nemo\"\n", - "\n", - "# Can change these to accommodate resources:\n", - "\n", - "TENSOR_PARALLEL_SIZE=8\n", - "NODES=1\n", - "MICRO_BATCH_SIZE=4\n", - "\n", - "# Don't change the following:\n", - "\n", - "EXPERIMENT_DIR=\"distill_trainings\"\n", - "EXPERIMENT_NAME=\"megatron_llama_ft\"\n", - "\n", - "DATA_TRAIN='wikitext_tokenized_train_text_document'\n", - "DATA_VAL='wikitext_tokenized_test_text_document'\n", - "DATA_TEST='wikitext_tokenized_val_text_document'\n", - "\n", - "STEPS=30\n", - "GLOBAL_BATCH_SIZE=128\n", - "\n", - "LOG_INTERVAL=1\n", - "VAL_INTERVAL=10\n", - "NUM_VAL_BATCHES=5\n", - "\n", - "LR=1e-4\n", - "MIN_LR=1e-5\n", - "WARMUP_STEPS=2\n", - "\n", - "cmd=\"torchrun --nproc-per-node=${TENSOR_PARALLEL_SIZE}\"\n", - "\n", - "${cmd} /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \\\n", - " --config-path /opt/NeMo/examples/nlp/language_modeling/conf/ \\\n", - " --config-name megatron_llama_distill.yaml \\\n", - " \\\n", - " name=${EXPERIMENT_NAME} \\\n", - " \\\n", - " exp_manager.exp_dir=${EXPERIMENT_DIR} \\\n", - " exp_manager.checkpoint_callback_params.save_top_k=1 \\\n", - " exp_manager.checkpoint_callback_params.save_nemo_on_train_end=True \\\n", - " \\\n", - " trainer.max_steps=${STEPS} \\\n", - " trainer.log_every_n_steps=${LOG_INTERVAL} \\\n", - " trainer.val_check_interval=${VAL_INTERVAL} \\\n", - " trainer.limit_val_batches=${NUM_VAL_BATCHES} \\\n", - " +trainer.num_sanity_val_steps=0 \\\n", - " \\\n", - " trainer.precision=bf16 \\\n", - " trainer.devices=${TENSOR_PARALLEL_SIZE} \\\n", - " trainer.num_nodes=${NODES} \\\n", - " \\\n", - " \"model.data.data_prefix={train:[1.0,$DATA_TRAIN],validation:[$DATA_VAL],test:[$DATA_TEST]}\" \\\n", - " \\\n", - " model.restore_from_path=${MODEL} \\\n", - " +model.dist_ckpt_load_strictness=log_all \\\n", - " \\\n", - " ~model.tokenizer \\\n", - " +model.tokenizer='{library: huggingface, type: meta-llama/Meta-Llama-3.1-8B, use_fast: True}' \\\n", - " \\\n", - " model.tensor_model_parallel_size=${TENSOR_PARALLEL_SIZE} \\\n", - " model.sequence_parallel=True \\\n", - " model.micro_batch_size=${MICRO_BATCH_SIZE} \\\n", - " model.global_batch_size=${GLOBAL_BATCH_SIZE} \\\n", - " \\\n", - " model.encoder_seq_length=8192 \\\n", - " model.num_layers=32 \\\n", - " model.hidden_size=4096 \\\n", - " model.ffn_hidden_size=14336 \\\n", - " model.num_attention_heads=32 \\\n", - " model.hidden_dropout=0.0 \\\n", - " model.attention_dropout=0.0 \\\n", - " model.apply_query_key_layer_scaling=True \\\n", - " model.normalization='rmsnorm' \\\n", - " model.bias=False \\\n", - " model.activation='fast-swiglu' \\\n", - " model.position_embedding_type='rope' \\\n", - " model.share_embeddings_and_output_weights=False \\\n", - " model.num_query_groups=8 \\\n", - " ++model.scale_positional_embedding=True \\\n", - " ++model.rotary_base=500000.0 \\\n", - " \\\n", - " model.optim.name=distributed_fused_adam \\\n", - " model.optim.lr=${LR} \\\n", - " model.optim.sched.min_lr=${MIN_LR} \\\n", - " model.optim.sched.warmup_steps=${WARMUP_STEPS}" - ] - }, - { - "cell_type": "markdown", - "id": "3040a993-8423-475f-8bc6-d1dd1ce16a83", - "metadata": {}, - "source": [ - "This will 
create a finetuned teacher model named `megatron_llama_ft.nemo` in `./distill_trainings/megatron_llama_ft/checkpoints/`. We'll use this later.\n", - "> `NOTE:`This script takes at least 20 minutes to run (depending on GPU) and will generate the finetuned teacher model." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tutorials/llm/llama-3/pruning-distillation/03_a_depth_pruning.ipynb b/tutorials/llm/llama-3/pruning-distillation/03_a_depth_pruning.ipynb deleted file mode 100644 index a195c2f3a405..000000000000 --- a/tutorials/llm/llama-3/pruning-distillation/03_a_depth_pruning.ipynb +++ /dev/null @@ -1,77 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "8bc99d2f-9ac6-40c2-b072-12b6cb7b9aca", - "metadata": {}, - "source": [ - "### Step 3: Prune the finetuned-teacher model to create a student\n", - "In this step, we will explore two methods to prune the finetuned teacher model. Refer to the ``NOTE`` in the **_step-by-step instructions_** section of [introduction.ipynb](./introduction.ipynb) to decide which pruning techniques you would like to explore.\n", - "\n", - "In the first method, depth-pruning, we trim the layers of the model." - ] - }, - { - "cell_type": "markdown", - "id": "72fa494e-6268-4044-a1d6-c0518d450cfd", - "metadata": {}, - "source": [ - "#### Step 3.a.: Using depth-pruning \n", - "To depth-prune, we will trim the last 16 layers in the finetined teacher model. For depth-pruning, we would be using the [megatron_gpt_drop_layers](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/megatron_gpt_drop_layers.py) script. \n", - "\n", - "Per the [blog](https://developer.nvidia.com/blog/how-to-prune-and-distill-llama-3-1-8b-to-an-nvidia-llama-3-1-minitron-4b-model/) and [tech report](https://arxiv.org/pdf/2408.11796), removing contiguous layers from the second last block (layers 16 to 31 continuously) yields the best overall results. \n", - "\n", - "> `NOTE:` In the block of code below, pass the paths to your finetuned teacher .nemo model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "60cae073-a192-4d47-b220-b09736d39a93", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "!python -m torch.distributed.launch --nproc_per_node=8 \\\n", - " /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_drop_layers.py \\\n", - " --path_to_nemo \"./distill_trainings/megatron_llama_ft/checkpoints/megatron_llama_ft.nemo\" \\\n", - " --path_to_save \"/workspace/4b_depth_pruned_model.nemo\" \\\n", - " --tensor_model_parallel_size 8 \\\n", - " --pipeline_model_parallel_size 1 \\\n", - " --gpus_per_node 8 \\\n", - " --drop_layers 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31" - ] - }, - { - "cell_type": "markdown", - "id": "375f298a-0363-4f44-b40c-2c8e9bab7d76", - "metadata": {}, - "source": [ - "Running this script will save the depth-pruned model `4b_depth_pruned_model.nemo` to your workspace." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tutorials/llm/llama-3/pruning-distillation/03_b_width_pruning.ipynb b/tutorials/llm/llama-3/pruning-distillation/03_b_width_pruning.ipynb deleted file mode 100644 index 7d91d36cbb32..000000000000 --- a/tutorials/llm/llama-3/pruning-distillation/03_b_width_pruning.ipynb +++ /dev/null @@ -1,92 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "8bc99d2f-9ac6-40c2-b072-12b6cb7b9aca", - "metadata": {}, - "source": [ - "### Step 3: Prune the finetuned-teacher model to create a student\n", - "In the second method, we will width-prune. In width-pruning, we trim the neurons, attention heads and embedding channels. \n", - "\n", - "Refer to the ``NOTE`` in the **_step-by-step instructions_** section of [introduction.ipynb](./introduction.ipynb) to decide which pruning techniques you would like to explore." - ] - }, - { - "cell_type": "markdown", - "id": "9207ed14-2f37-4712-88f3-543a128663ac", - "metadata": { - "tags": [] - }, - "source": [ - "#### Step 3.b.: Using width-pruning\n", - "To width-prune the model, we do the following:\n", - "- prune (trim) the MLP intermediate dimension from 14336 to 9216.\n", - "- prune the hidden size from 4096 to 3072.\n", - "- and retrain the attention headcount and number of layers\n", - "\n", - "For width-pruning we will use the [megatron_gpt_prune.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/megatron_gpt_prune.py) script in the NeMo Framework. To see the detailed list of parameters for width-pruning, you can view the [megatron_gpt_prune.yaml](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml) file.\n", - "\n", - "We use the above parameters to get a competitive model for this demonstration. You can use other strategies or parameters from the [blog](https://developer.nvidia.com/blog/how-to-prune-and-distill-llama-3-1-8b-to-an-nvidia-llama-3-1-minitron-4b-model/) or the [tech report](https://arxiv.org/pdf/2408.11796) for your experiments. \n", - "\n", - "> `NOTE:` In the block of code below, pass the paths to your finetuned teacher .nemo model.\n", - "\n", - "> `TIP:` You can increase the ``batch_size`` (upto 1024) to speed up the width-pruning script execution." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "571d1483-dd4c-403e-b321-293342e7a62a", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "!torchrun --nproc-per-node=8 /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_prune.py \\\n", - " model.restore_from_path=\"./distill_trainings/megatron_llama_ft/checkpoints/megatron_llama_ft.nemo\" \\\n", - " model.tensor_model_parallel_size=1 \\\n", - " model.pipeline_model_parallel_size=8 \\\n", - " +model.dist_ckpt_load_strictness=log_all \\\n", - " inference.batch_size=64 \\\n", - " trainer.num_nodes=1 \\\n", - " trainer.precision=bf16 \\\n", - " trainer.devices=8 \\\n", - " prune.ffn_hidden_size=9216 \\\n", - " prune.num_attention_heads=null \\\n", - " prune.num_query_groups=null \\\n", - " prune.hidden_size=3072 \\\n", - " export.save_path=\"/workspace/4b_width_pruned_model.nemo\"" - ] - }, - { - "cell_type": "markdown", - "id": "e9fb0977-5c02-4ecc-b602-54d74b2e2184", - "metadata": {}, - "source": [ - "Running this script will save the width-pruned model `4b_width_pruned_model.nemo` to your workspace." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tutorials/llm/llama-3/pruning-distillation/04_a_distilling_depth_pruned_student.ipynb b/tutorials/llm/llama-3/pruning-distillation/04_a_distilling_depth_pruned_student.ipynb deleted file mode 100644 index ccbe1cbf394b..000000000000 --- a/tutorials/llm/llama-3/pruning-distillation/04_a_distilling_depth_pruned_student.ipynb +++ /dev/null @@ -1,136 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "09d30e35-8e9d-4d2e-bd14-738c627a3963", - "metadata": {}, - "source": [ - "### Step 4: Distill knowledge from teacher into student\n", - "Distillation of a model with NeMo Framework is also possible using a python script: [megatron_gpt_distillation.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/megatron_gpt_distillation.py). In this notebook, we will explore distillation with the depth-pruned model as the `STUDENT` model. \n", - "\n", - "For this demonstration, the `TEACHER` would be the finetuned teacher model `megatron_llama_ft.nemo` and the `STUDENT` model would be the pruned 4B model. This training run is capped by `STEPS`, and validation is carried out every `VAL_INTERVAL` steps." - ] - }, - { - "cell_type": "markdown", - "id": "c33cf641-0d27-417f-b3ee-c06701698184", - "metadata": {}, - "source": [ - "#### Step 4.a.: Using depth-pruned student\n", - "While distilling knowledge from the teacher to depth-pruned model, the `STUDENT` model would be `4b_depth_pruned_model.nemo` as produced by the [depth-pruning](./03_a_depth_pruning.ipynb) notebook. This training run is capped by `STEPS`, and validation is carried out every `VAL_INTERVAL` steps.\n", - "\n", - "> `NOTE:` In the block of code below, pass the paths to your pre-processed train, test and validation data files as well as path to the teacher and student .nemo models." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d23a01e-4912-47cb-bf21-b4fd72007ec1", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "%%bash \n", - "\n", - "export CUDA_DEVICE_MAX_CONNECTIONS=1\n", - "\n", - "# Can change these to accommodate resources:\n", - "\n", - "TENSOR_PARALLEL_SIZE=8\n", - "NODES=1\n", - "MICRO_BATCH_SIZE=4\n", - "\n", - "# Don't change the following:\n", - "\n", - "EXPERIMENT_DIR=\"distill_trainings\"\n", - "EXPERIMENT_NAME=\"megatron_llama_distill_depth_pruned_student\"\n", - "\n", - "TEACHER=\"${EXPERIMENT_DIR}/megatron_llama_ft/checkpoints/megatron_llama_ft.nemo\"\n", - "STUDENT=\"/workspace/4b_depth_pruned_model.nemo\"\n", - "\n", - "FINAL_MODEL_PATH=\"${EXPERIMENT_DIR}/${EXPERIMENT_NAME}/checkpoints/depth_pruned_distilled_4b_model.nemo\"\n", - "\n", - "DATA_TRAIN='wikitext_tokenized_train_text_document'\n", - "DATA_VAL='wikitext_tokenized_test_text_document'\n", - "DATA_TEST='wikitext_tokenized_val_text_document'\n", - "\n", - "STEPS=30\n", - "GLOBAL_BATCH_SIZE=128\n", - "\n", - "LOG_INTERVAL=1\n", - "VAL_INTERVAL=10\n", - "NUM_VAL_BATCHES=5\n", - "\n", - "LR=1e-4\n", - "MIN_LR=1e-5\n", - "WARMUP_STEPS=2\n", - "\n", - "cmd=\"torchrun --nproc-per-node=${TENSOR_PARALLEL_SIZE}\"\n", - "\n", - "${cmd} /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_distillation.py \\\n", - " name=${EXPERIMENT_NAME} \\\n", - " \\\n", - " exp_manager.exp_dir=${EXPERIMENT_DIR} \\\n", - " exp_manager.checkpoint_callback_params.save_top_k=1 \\\n", - " \\\n", - " trainer.max_steps=${STEPS} \\\n", - " trainer.log_every_n_steps=${LOG_INTERVAL} \\\n", - " trainer.val_check_interval=${VAL_INTERVAL} \\\n", - " trainer.limit_val_batches=${NUM_VAL_BATCHES} \\\n", - " +trainer.num_sanity_val_steps=0 \\\n", - " \\\n", - " trainer.precision=bf16 \\\n", - " trainer.devices=${TENSOR_PARALLEL_SIZE} \\\n", - " trainer.num_nodes=${NODES} \\\n", - " \\\n", - " \"model.data.data_prefix={train:[1.0,$DATA_TRAIN],validation:[$DATA_VAL],test:[$DATA_TEST]}\" \\\n", - " \\\n", - " model.restore_from_path=${STUDENT} \\\n", - " model.kd_teacher_restore_from_path=${TEACHER} \\\n", - " model.nemo_path=${FINAL_MODEL_PATH} \\\n", - " \\\n", - " model.tensor_model_parallel_size=${TENSOR_PARALLEL_SIZE} \\\n", - " model.sequence_parallel=True \\\n", - " model.micro_batch_size=${MICRO_BATCH_SIZE} \\\n", - " model.global_batch_size=${GLOBAL_BATCH_SIZE} \\\n", - " \\\n", - " model.optim.name=distributed_fused_adam \\\n", - " model.optim.lr=${LR} \\\n", - " model.optim.sched.min_lr=${MIN_LR} \\\n", - " model.optim.sched.warmup_steps=${WARMUP_STEPS}" - ] - }, - { - "cell_type": "markdown", - "id": "42d910d9-14dd-44ba-bf2c-0064737c70fa", - "metadata": {}, - "source": [ - "This will create the final distilled model named `depth_pruned_distilled_4b_model.nemo` in `./distill_trainings/megatron_llama_distill_depth_pruned_student/checkpoints`.\n", - "> `NOTE:`This script takes at least 35 minutes to run (depends on GPU) and generate the final distilled model." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tutorials/llm/llama-3/pruning-distillation/04_b_distilling_width_pruned_student.ipynb b/tutorials/llm/llama-3/pruning-distillation/04_b_distilling_width_pruned_student.ipynb deleted file mode 100644 index 48e81c96cdcf..000000000000 --- a/tutorials/llm/llama-3/pruning-distillation/04_b_distilling_width_pruned_student.ipynb +++ /dev/null @@ -1,138 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "d5062f23-c604-479b-9a4e-69989598b131", - "metadata": {}, - "source": [ - "### Step 4: Distill knowledge from teacher into student\n", - "Distillation of a model with NeMo Framework is also possible using a python script: [megatron_gpt_distillation.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/megatron_gpt_distillation.py). \n", - "In this notebook, we will explore distillation with the width-pruned model as the `STUDENT` model.\n", - "\n", - "For this demonstration, the `TEACHER` would be the finetuned teacher model `megatron_llama_ft.nemo` and the `STUDENT` model would be the pruned 4B model. This training run is capped by `STEPS`, and validation is carried out every `VAL_INTERVAL` steps." - ] - }, - { - "cell_type": "markdown", - "id": "be7de691-dd1d-4719-9872-98501a22e3c9", - "metadata": {}, - "source": [ - "#### Step 4.b.: Using width-pruned student\n", - "While distilling knowledge from the teacher to width-pruned model, the `STUDENT` model would be `4b_width_pruned_model.nemo` as produced by the [width-pruning](./03_b_width_pruning.ipynb) notebook. This training run is capped by `STEPS`, and validation is carried out every `VAL_INTERVAL` steps.\n", - "\n", - "> `NOTE:` In the block of code below, pass the paths to your pre-processed train, test and validation data files as well as path to the teacher and student .nemo models." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0070b526-771a-4a8d-b0ba-ab218b382bd9", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "%%bash \n", - "\n", - "export CUDA_DEVICE_MAX_CONNECTIONS=1\n", - "\n", - "# Can change these to accommodate resources:\n", - "\n", - "TENSOR_PARALLEL_SIZE=8\n", - "NODES=1\n", - "MICRO_BATCH_SIZE=4\n", - "\n", - "# Don't change the following:\n", - "\n", - "EXPERIMENT_DIR=\"distill_trainings\"\n", - "EXPERIMENT_NAME=\"megatron_llama_distill_width_pruned_student\"\n", - "\n", - "TEACHER=\"${EXPERIMENT_DIR}/megatron_llama_ft/checkpoints/megatron_llama_ft.nemo\"\n", - "STUDENT=\"/workspace/4b_width_pruned_model.nemo\"\n", - "\n", - "FINAL_MODEL_PATH=\"${EXPERIMENT_DIR}/${EXPERIMENT_NAME}/checkpoints/width_pruned_distilled_4b_model.nemo\"\n", - "\n", - "DATA_TRAIN='wikitext_tokenized_train_text_document'\n", - "DATA_VAL='wikitext_tokenized_test_text_document'\n", - "DATA_TEST='wikitext_tokenized_val_text_document'\n", - "\n", - "STEPS=30\n", - "GLOBAL_BATCH_SIZE=128\n", - "\n", - "LOG_INTERVAL=1\n", - "VAL_INTERVAL=10\n", - "NUM_VAL_BATCHES=5\n", - "\n", - "LR=1e-4\n", - "MIN_LR=1e-5\n", - "WARMUP_STEPS=2\n", - "\n", - "cmd=\"torchrun --nproc-per-node=${TENSOR_PARALLEL_SIZE}\"\n", - "\n", - "${cmd} /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_distillation.py \\\n", - " name=${EXPERIMENT_NAME} \\\n", - " \\\n", - " exp_manager.exp_dir=${EXPERIMENT_DIR} \\\n", - " exp_manager.checkpoint_callback_params.save_top_k=1 \\\n", - " \\\n", - " trainer.max_steps=${STEPS} \\\n", - " trainer.log_every_n_steps=${LOG_INTERVAL} \\\n", - " trainer.val_check_interval=${VAL_INTERVAL} \\\n", - " trainer.limit_val_batches=${NUM_VAL_BATCHES} \\\n", - " +trainer.num_sanity_val_steps=0 \\\n", - " \\\n", - " trainer.precision=bf16 \\\n", - " trainer.devices=${TENSOR_PARALLEL_SIZE} \\\n", - " trainer.num_nodes=${NODES} \\\n", - " \\\n", - " \"model.data.data_prefix={train:[1.0,$DATA_TRAIN],validation:[$DATA_VAL],test:[$DATA_TEST]}\" \\\n", - " \\\n", - " model.restore_from_path=${STUDENT} \\\n", - " model.kd_teacher_restore_from_path=${TEACHER} \\\n", - " model.nemo_path=${FINAL_MODEL_PATH} \\\n", - " \\\n", - " model.tensor_model_parallel_size=${TENSOR_PARALLEL_SIZE} \\\n", - " model.sequence_parallel=True \\\n", - " model.micro_batch_size=${MICRO_BATCH_SIZE} \\\n", - " model.global_batch_size=${GLOBAL_BATCH_SIZE} \\\n", - " \\\n", - " model.optim.name=distributed_fused_adam \\\n", - " model.optim.lr=${LR} \\\n", - " model.optim.sched.min_lr=${MIN_LR} \\\n", - " model.optim.sched.warmup_steps=${WARMUP_STEPS} \\\n", - " +model.dist_ckpt_load_strictness=log_all" - ] - }, - { - "cell_type": "markdown", - "id": "d9dbc377-e19a-49e0-b245-fa828cca415a", - "metadata": {}, - "source": [ - "This will create the final width-pruned distilled model named `width_pruned_distilled_4b_model.nemo` in `./distill_trainings/megatron_llama_distill_width_pruned_student/checkpoints`.\n", - "> `NOTE:`This script takes at least 20 minutes to run (depends on GPU) and generate the final distilled model." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tutorials/llm/llama-3/pruning-distillation/05_display_results.ipynb b/tutorials/llm/llama-3/pruning-distillation/05_display_results.ipynb deleted file mode 100644 index 0264cc288957..000000000000 --- a/tutorials/llm/llama-3/pruning-distillation/05_display_results.ipynb +++ /dev/null @@ -1,168 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "6c91263b-b312-4ab2-b13f-0ee4b6e8bd0f", - "metadata": {}, - "source": [ - "### Step 5: Display the validation loss\n", - "\n", - "Now that the results are in, let's visualize the validation loss of the two distilled models using the `tensorboard` library. \n", - "> `NOTE:` This notebook demonstrates the use of the teacher finetuning, pruning and the distillation script. These scripts should ideally be run on a multi-node cluster with a larger `GLOBAL_BATCH_SIZE` and `STEPS` to see improvement in the validation loss." - ] - }, - { - "cell_type": "markdown", - "id": "b5822d62-8131-4046-8c22-0bf0fce81df7", - "metadata": {}, - "source": [ - "#### Validation Loss using depth-pruned model as student in distillation script\n", - "Here is an image of the validation loss over 30 steps of running the training step in the distillation script when we distill the knowledge from the finetuned teacher model to the depth-pruned student." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0a665fe1-df45-4126-8694-f182af113133", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "%load_ext tensorboard\n", - "%tensorboard --logdir \"distill_trainings/megatron_llama_distill_depth_pruned_student/\" --port=6007" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "db6fcf26-8ae8-40e1-875a-0a10bf85be81", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
Validation Loss over 30 Training Steps with Depth-Pruned model as Student
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from IPython.display import Image, display, HTML\n", - "title = \"Validation Loss over 30 Training Steps with Depth-Pruned model as Student\"\n", - "display(HTML(f\"
{title}
\"))\n", - "display(Image(url=\"https://github.com/NVIDIA/NeMo/releases/download/r2.0.0rc1/val_loss_depth_pruned_student_distillation.png\", width=400))" - ] - }, - { - "cell_type": "markdown", - "id": "f10041ae-6533-47de-9f76-f97d4469c27a", - "metadata": {}, - "source": [ - "#### Validation Loss using width-pruned model as student in distillation script\n", - "Here is an image of the validation loss over 30 steps of running the training step in the distillation script when we distill the knowledge from the finetuned teacher model to the width-pruned student." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7b0c3118-4987-4df3-88bd-fcffdb521c5d", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "%load_ext tensorboard\n", - "%tensorboard --logdir \"distill_trainings/megatron_llama_distill_width_pruned_student/\" --port=6008" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "ecd79583-f662-40c6-a690-9f4bb847de4e", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
Validation Loss over 30 Training Steps with Width-Pruned model as Student
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from IPython.display import Image, display, HTML\n", - "title = \"Validation Loss over 30 Training Steps with Width-Pruned model as Student\"\n", - "display(HTML(f\"
{title}
\"))\n", - "display(Image(url=\"https://github.com/NVIDIA/NeMo/releases/download/r2.0.0rc1/val_loss_width_pruned_student_distillation.png\", width=400))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ab6ed6f-8bc3-4188-919f-7cee842635ed", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tutorials/llm/llama-3/pruning-distillation/README.rst b/tutorials/llm/llama-3/pruning-distillation/README.rst index 34febcffa366..9d4207a5c968 100644 --- a/tutorials/llm/llama-3/pruning-distillation/README.rst +++ b/tutorials/llm/llama-3/pruning-distillation/README.rst @@ -1,26 +1,18 @@ -Llama 3.1 Pruning and Distillation with NeMo Framework +Llama 3.1 WikiText Pruning and Distillation with NeMo Framework ======================================================================================= `Llama 3.1 `_ are open-source large language models by Meta that deliver state-of-the-art performance on popular industry benchmarks. They have been pretrained on over 15 trillion tokens, and support a 128K token context length. They are available in three sizes, 8B, 70B, and 405B, and each size has two variants—base pretrained and instruction tuned. `NVIDIA NeMo Framework `_ provides tools to perform teacher finetuning, pruning and distillation on Llama 3.1 to fit your use case. -`NVIDIA TensorRT Model Optimizer `_ is a library (referred to as **Model Optimizer**, or **ModelOpt**) comprising state-of-the-art model optimization techniques including `quantization `_, `sparsity `_, `distillation `_, and `pruning `_ to compress models. - `LLM Pruning and Distillation in Practice: The Minitron Approach `_ provides tools to perform teacher finetuning, pruning and distillation on Llama 3.1 as described in the `tech report `_. -`How to Prune and Distill Llama-3.1 8B to an NVIDIA Llama-3.1-Minitron 4B Model `_ provides practical and effective structured compression best practices for LLMs that combine depth, width, attention, and MLP pruning with knowledge distillation-based retraining. These strategies are presented in the `Compact Language Models via Pruning and Knowledge Distillation `_ paper. - -`Mistral-NeMo-Minitron 8B Model Delivers Unparalleled Accuracy `_ introduces the Mistral-NeMo-Minitron 8B, a state-of-the-art 8 billion parameter language model created by pruning and distilling the larger Mistral NeMo 12B model. - Objectives ---------- -This tutorial shows how to perform depth-pruning, teacher finetuning and distillation on **Llama 3.1 8B** using the `WikiText-103-v1 `_ dataset with NeMo Framework. The `WikiText-103-v1 `_ language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia. For this demonstration, we will perform teacher correction by running a light finetuning procedure on the ``Meta Llama 3.1 8B`` teacher model to generate a finetuned teacher model ``megatron_llama_ft.nemo`` needed for optimal distillation. This finetuned teacher model is then trimmed. There are two methods to prune a model: depth-pruning and width-pruning. 
We will be exploring both pruning techniques which will yield ``4b_depth_pruned_model.nemo`` and ``4b_width_pruned_model.nemo`` respectively. These models will serve as a starting point for distillation to create the final distilled 4B models. +This tutorial shows how to perform depth-pruning, teacher finetuning and distillation on **Llama 3.1 8B Instruct** using the `WikiText-103-v1 `_ dataset with NeMo Framework. The `WikiText-103-v1 `_ language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia. For this demonstration, we will perform a light finetuning procedure on the ``Meta Llama 3.1 8B Instruct`` teacher model to generate a finetuned teacher model ``megatron_llama_ft.nemo`` needed for optimal distillation. This finetuned teacher model is then depth-pruned to create a trimmed model ``4b_trimmed_model.nemo``. These models will serve as a starting point for distillation to create a final distilled 4B model. We are using models utilizing the ``meta-llama/Meta-Llama-3.1-8B`` tokenizer for this demonstration. -``NOTE:`` A subset of functions is being demonstrated in the notebooks. Some features like Neural Architecture Search (NAS) are unavailable but will be supported in future releases. - Requirements ------------- @@ -39,16 +31,14 @@ Create a pruned and distilled model with NeMo Framework For pruning and distilling the model, you will use the NeMo Framework which is available as a `docker container `_. -``NOTE:`` These notebooks use `NVIDIA TensorRT Model Optimizer `_ under the hood for pruning and distillation. - -1. Download the `Llama 3.1 8B .nemo `_ from NVIDIA NGC using the `NGC CLI `_. Generate the ``NGC_API_KEY`` following these `instructions `_. The following command saves the ``.nemo`` format model in a folder named ``llama-3_1-8b-nemo_v1.0`` in the current directory. You can specify another path using the ``-d`` option in the CLI tool. +1. Download the `Llama 3.1 8B Instruct .nemo `_ from NVIDIA NGC using the `NGC CLI `_. Generate the ``NGC_API_KEY`` following these `instructions `_. The following command saves the ``.nemo`` format model in a folder named ``llama-3_1-8b-instruct-nemo_v1.0`` in the current directory. You can specify another path using the ``-d`` option in the CLI tool. .. code:: bash - ngc registry model download-version "nvidia/nemo/llama-3_1-8b-nemo:1.0" + ngc registry model download-version "nvidia/nemo/llama-3_1-8b-instruct-nemo:1.0" -2. Run the container using the following command. It is assumed that you have the dataset, notebook(s), and the ``llama3_1_8b.nemo`` model available in the current directory. If not, mount the appropriate folder to ``/workspace``. +2. Run the container using the following command. It is assumed that you have the dataset, notebook(s), and the ``llama-3.1-8b-instruct`` model available in the current directory. If not, mount the appropriate folder to ``/workspace``. .. code:: bash @@ -73,38 +63,17 @@ For pruning and distilling the model, you will use the NeMo Framework which is a jupyter lab --ip 0.0.0.0 --port=8888 --allow-root -4. Then, navigate to `this notebook <./introduction.ipynb>`_ to get started. +4. Then, navigate to `this notebook <./llama3-pruning-distillation-nemofw.ipynb>`_. -This directory contains a list of notebooks which will go over all the steps to create a distilled 4B model. 
- -:: - - <$pruning_distillation> - └── introduction.ipynb - └── 01_data_preparation.ipynb - └── 02_teacher_finetuning.ipynb - └── 03_a_depth_pruning.ipynb - └── 03_b_width_pruning.ipynb - └── 04_a_distilling_depth_pruned_student.ipynb - └── 04_b_distilling_width_pruned_student.ipynb - └── 05_display_results.ipynb - Results ------------------------------------------------------------------------------ -``NOTE:`` This notebook demonstrates the use of the teacher finetuning, pruning and the distillation scripts. These scripts should ideally be run on a multi-node cluster with a larger ``GLOBAL_BATCH_SIZE`` and ``STEPS`` to see improvement in the validation loss. - -Here are the validation loss plots over 30 steps of running the training step in the distillation script (at the end of the `notebook <./05_display_results.ipynb>`_). +``NOTE:`` This notebook demonstrates the use of the teacher finetuning, pruning and the distillation script. These scripts should ideally be run on a multi-node cluster with a larger ``GLOBAL_BATCH_SIZE`` and ``STEPS`` to see improvement in the validation loss. -.. figure:: https://github.com/NVIDIA/NeMo/releases/download/r2.0.0rc1/val_loss_depth_pruned_student_distillation.png - :width: 400px - :alt: Diagram showing the validation loss over 30 steps of running the training step in the distillation script when using the depth-pruned model as the student - :align: center +Here is the validation loss over 30 steps of running the training step in the distillation script (at the end of the `notebook <./llama3-pruning-distillation-nemofw.ipynb>`_). - Figure 1: Validation Loss Plot when using the depth-pruned model as the student - -.. figure:: https://github.com/NVIDIA/NeMo/releases/download/r2.0.0rc1/val_loss_width_pruned_student_distillation.png +.. figure:: https://github.com/NVIDIA/NeMo/releases/download/r2.0.0rc1/val_loss_distillation.png :width: 400px - :alt: Diagram showing the validation loss over 30 steps of running the training step in the distillation script when using the width-pruned model as the student + :alt: Diagram showing the validation loss over 30 steps of running the training step in the distillation script :align: center - Figure 2: Validation Loss Plot when using the width-pruned model as the student \ No newline at end of file + Figure 1: Validation Loss Plot \ No newline at end of file diff --git a/tutorials/llm/llama-3/pruning-distillation/introduction.ipynb b/tutorials/llm/llama-3/pruning-distillation/introduction.ipynb deleted file mode 100644 index 1a3efc9f5f1e..000000000000 --- a/tutorials/llm/llama-3/pruning-distillation/introduction.ipynb +++ /dev/null @@ -1,190 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "411e6711-60fc-4488-8aa1-c6463cac8695", - "metadata": { - "tags": [] - }, - "source": [ - "# Pruning and Distillation of Llama 3.1 model with NeMo Framework" - ] - }, - { - "cell_type": "markdown", - "id": "03fd1cf4-c67a-4b8d-a5e5-46531be0f991", - "metadata": {}, - "source": [ - "This demonstration showcases performing pruning and distillation on **Llama 3.1-8B** with the [WikiText-103-v1](https://huggingface.co/datasets/Salesforce/wikitext/viewer/wikitext-103-v1) dataset using NeMo Framework. The [WikiText-103-v1](https://huggingface.co/datasets/Salesforce/wikitext/viewer/wikitext-103-v1) language modeling dataset is a collection of over 100 million tokens extracted from the set of verified 'Good' and 'Featured' articles on Wikipedia. 
\n", - "\n", - "For this demonstration, we will perform a light finetuning procedure on the `Meta Llama 3.1 8B` teacher model to generate a finetuned teacher model. This finetuned teacher model will then be trimmed. There are two methods to prune a model: depth-pruning and width-pruning. This workflow will showcase both methods which will yield `4b_depth_pruned_model.nemo` and `4b_width_pruned_model.nemo` respectively, that will serve as a starting point for distillation to the final 4B models. \n", - "\n", - "> We are using models utilizing the `meta-llama/Meta-Llama-3.1-8B` tokenizer for this demonstration.\n", - "\n", - "> `NOTE:` Ensure that you run this notebook inside the [NeMo Framework container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) which has all the required dependencies. \n", - "\n", - "**Instructions are available in the associated tutorial README to download the model and the container.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5a5026ce-39f1-43e3-93af-4c4f1e9da1f2", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "!pip install --upgrade ipywidgets notebook\n", - "!pip install datasets" - ] - }, - { - "cell_type": "markdown", - "id": "afe59b07-bb48-4913-90cc-bb416b48196c", - "metadata": { - "tags": [] - }, - "source": [ - "---\n", - "## Prerequisites\n", - "Ensure you have the following -\n", - "1. **Get the teacher model**: Download the `Meta Llama 3.1 8B .nemo` model. You must follow the instructions in the associated README to download and mount the folder to the NeMo FW container." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b9d48b81-e978-4894-8ba4-4f183f698bb1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!ls /workspace/llama-3_1-8b-nemo_v1.0/llama3_1_8b.nemo" - ] - }, - { - "cell_type": "markdown", - "id": "7129d44e-0536-4e62-bdbc-0f1ad44dc84a", - "metadata": {}, - "source": [ - "2. **Set the Hugging Face Access Token**: You can obtain this from your [Hugging Face account](https://huggingface.co/docs/hub/en/security-tokens). " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "481417ed-1456-4962-8f67-4350bde1aabd", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from huggingface_hub import login\n", - "login(token=\"\")" - ] - }, - { - "cell_type": "markdown", - "id": "245eda8d-c999-431e-9ebc-5c92c4f21f3b", - "metadata": {}, - "source": [ - "3. **Obtain the dataset**: Generate the `wikitext-{train/val/test}.jsonl` splits after loading the [WikiText-103-v1](https://huggingface.co/datasets/Salesforce/wikitext/viewer/wikitext-103-v1) dataset." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eaef2c7d-41f7-41ad-a76a-2d714e9c35de", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "# Split into train, test and val files\n", - "\n", - "import json\n", - "import os\n", - "from datasets import load_dataset\n", - "\n", - "# Load the WikiText-103 dataset\n", - "dataset = load_dataset(\"wikitext\", \"wikitext-103-v1\")\n", - "\n", - "# Define the destination folder\n", - "data_folder = 'wikitext-data'\n", - "os.makedirs(data_folder, exist_ok=True)\n", - "\n", - "# Define file paths and destination paths\n", - "file_paths = {\n", - " 'train': os.path.join(data_folder, 'wikitext-train.jsonl'),\n", - " 'validation': os.path.join(data_folder, 'wikitext-val.jsonl'),\n", - " 'test': os.path.join(data_folder, 'wikitext-test.jsonl')\n", - "}\n", - "\n", - "# Function to save dataset split to a JSONL file\n", - "def save_to_jsonl(file_path, data):\n", - " with open(file_path, 'w') as file:\n", - " for item in data:\n", - " file.write(json.dumps(item) + '\\n')\n", - "\n", - "# Define splits\n", - "splits = [\"train\", \"validation\", \"test\"]\n", - "\n", - "# Save splits to JSONL files and calculate their sizes\n", - "for split in splits:\n", - " if split in dataset:\n", - " save_to_jsonl(file_paths[split], dataset[split])\n", - " else:\n", - " print(f\"Split {split} not found in the dataset.\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "2d0cc359-0598-40aa-af80-9503ecd4dac1", - "metadata": { - "tags": [] - }, - "source": [ - "---\n", - "## Step-by-step instructions\n", - "\n", - "This workflow is structured into seven notebooks:\n", - "1. [Prepare the dataset](./01_data_preparation.ipynb)\n", - "2. [Finetune the teacher on the dataset](./02_teacher_finetuning.ipynb)\n", - "3. Prune the finetuned-teacher model to create a student \n", - " - 3.a. [Using depth-pruning](./03_a_depth_pruning.ipynb)\n", - " - 3.b. [Using width-pruning](./03_b_width_pruning.ipynb)\n", - "4. Distill knowledge from teacher into student\n", - " - 4.a. [Using depth-pruned student](./04_a_distilling_depth_pruned_student.ipynb)\n", - " - 4.b. [Using width-pruned student](./04_b_distilling_width_pruned_student.ipynb)\n", - "5. [Display the validation loss](./05_display_results.ipynb)\n", - "\n", - "> `NOTE:` We are exploring two methods to prune the finetuned teacher model: [depth-pruning](./03_a_depth_pruning.ipynb) and [width-pruning](./03_b_width_pruning.ipynb). Per the [tech report](https://arxiv.org/pdf/2408.11796), we can observe that width-pruning generally outperforms depth-pruning so users can choose to perform either [depth-pruning](./03_a_depth_pruning.ipynb) or [width-pruning](./03_b_width_pruning.ipynb) or both methods." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tutorials/llm/llama-3/pruning-distillation/llama3-pruning-distillation-nemofw.ipynb b/tutorials/llm/llama-3/pruning-distillation/llama3-pruning-distillation-nemofw.ipynb new file mode 100644 index 000000000000..8b31ad4de018 --- /dev/null +++ b/tutorials/llm/llama-3/pruning-distillation/llama3-pruning-distillation-nemofw.ipynb @@ -0,0 +1,587 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "363a6974-810c-41c5-84da-4751a92fb72b", + "metadata": { + "tags": [] + }, + "source": [ + "# Pruning and Distillation of Llama 3.1 model with NeMo Framework" + ] + }, + { + "cell_type": "markdown", + "id": "c6d4ed6d-8ecd-4647-bd0a-e48fec64c199", + "metadata": {}, + "source": [ + "This notebook showcases performing pruning and distillation on **Llama 3.1-8B-Instruct** with the [WikiText-103-v1](https://huggingface.co/datasets/Salesforce/wikitext/viewer/wikitext-103-v1) dataset using NeMo Framework. The [WikiText-103-v1](https://huggingface.co/datasets/Salesforce/wikitext/viewer/wikitext-103-v1) language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia. \n", + "\n", + "For this demonstration, we will perform a light finetuning procedure on the `Meta Llama 3.1 8B Instruct` teacher model to generate a finetuned teacher model. This finetuned teacher model will then be trimmed to create a depth-pruned model `4b_trimmed_model.nemo` that will serve as a starting point for distillation to a final 4B model. \n", + "\n", + "> We are using models utilizing the `meta-llama/Meta-Llama-3.1-8B` tokenizer for this demonstration.\n", + "\n", + "> `NOTE:` Ensure that you run this notebook inside the [NeMo Framework container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) which has all the required dependencies. \n", + "\n", + "**Instructions are available in the associated tutorial README to download the model and the container.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d0dc714-5bbf-4266-805a-9841ff486c05", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install --upgrade ipywidgets notebook\n", + "!pip install datasets" + ] + }, + { + "cell_type": "markdown", + "id": "2658505d-7990-40a5-a269-866ddd8a0181", + "metadata": { + "tags": [] + }, + "source": [ + "---\n", + "## Prerequisites\n", + "Ensure you have the following -\n", + "1. **Get the teacher model**: Download the `Meta Llama 3.1 8B Instruct .nemo` model. You must follow the instructions in the associated README to download and mount the folder to the NeMo FW container." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a30cfe8a-87a8-4511-be5f-e20d7fe558d4", + "metadata": {}, + "outputs": [], + "source": [ + "!ls /workspace/llama-3_1-8b-instruct-nemo_v1.0" + ] + }, + { + "cell_type": "markdown", + "id": "251a670e-9636-4807-bc98-a91c6137454d", + "metadata": {}, + "source": [ + "2. 
**Set the Hugging Face Access Token**: You can obtain this from your [Hugging Face account](https://huggingface.co/docs/hub/en/security-tokens). "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "47d7887d-b582-4a1e-81cd-fdc1be8d9afb",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import login\n",
+    "login(token=\"\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b5384e9a-6c40-4454-abe8-413ad9d5db96",
+   "metadata": {},
+   "source": [
+    "3. **Obtain the dataset**: Generate the `wikitext-{train/val/test}.jsonl` splits after loading the [WikiText-103-v1](https://huggingface.co/datasets/Salesforce/wikitext/viewer/wikitext-103-v1) dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b420bd44-3628-45e2-92e7-df38f72a658a",
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Split into train, test and val files\n",
+    "\n",
+    "import json\n",
+    "import os\n",
+    "from datasets import load_dataset\n",
+    "\n",
+    "# Load the WikiText-103 dataset\n",
+    "dataset = load_dataset(\"wikitext\", \"wikitext-103-v1\")\n",
+    "\n",
+    "# Define the destination folder\n",
+    "data_folder = 'wikitext-data'\n",
+    "os.makedirs(data_folder, exist_ok=True)\n",
+    "\n",
+    "# Define file paths and destination paths\n",
+    "file_paths = {\n",
+    "    'train': os.path.join(data_folder, 'wikitext-train.jsonl'),\n",
+    "    'validation': os.path.join(data_folder, 'wikitext-val.jsonl'),\n",
+    "    'test': os.path.join(data_folder, 'wikitext-test.jsonl')\n",
+    "}\n",
+    "\n",
+    "# Function to save a dataset split to a JSONL file\n",
+    "def save_to_jsonl(file_path, data):\n",
+    "    with open(file_path, 'w') as file:\n",
+    "        for item in data:\n",
+    "            file.write(json.dumps(item) + '\\n')\n",
+    "\n",
+    "# Define splits\n",
+    "splits = [\"train\", \"validation\", \"test\"]\n",
+    "\n",
+    "# Save each split to its JSONL file\n",
+    "for split in splits:\n",
+    "    if split in dataset:\n",
+    "        save_to_jsonl(file_paths[split], dataset[split])\n",
+    "    else:\n",
+    "        print(f\"Split {split} not found in the dataset.\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0185a0a9-904d-46de-a450-db4c84c4cde4",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "---\n",
+    "## Step-by-step instructions\n",
+    "\n",
+    "This notebook is structured into five steps:\n",
+    "1. Prepare the dataset\n",
+    "2. Finetune the teacher on the dataset\n",
+    "3. Prune the finetuned-teacher model to create a student\n",
+    "4. Distill knowledge from teacher into student\n",
+    "5. Display the validation loss"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cf1d41ff-2cba-4efc-84e3-7d713df0cdb8",
+   "metadata": {},
+   "source": [
+    "### Step 1: Prepare the dataset\n",
+    "\n",
+    "The dataset has to be preprocessed using the [preprocess_data_for_megatron.py](https://github.com/NVIDIA/NeMo/blob/main/scripts/nlp_language_modeling/preprocess_data_for_megatron.py) script included in the NeMo Framework. This step will also tokenize the data using the `meta-llama/Meta-Llama-3.1-8B` tokenizer model to convert it into a memory-map format.\n",
+    "\n",
+    "> `NOTE:` In the block of code below, pass the paths to your train, test and validation data files."
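
Because the `meta-llama/Meta-Llama-3.1-8B` tokenizer is gated, it can be worth confirming that the token set above actually grants access before launching the three preprocessing runs below. A minimal sanity check, assuming the `transformers` package that ships in the NeMo Framework container is available:

```python
# Optional check: confirm the gated Llama 3.1 tokenizer can be fetched with the
# Hugging Face token set earlier, before running the preprocessing script.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B")
sample = "The quick brown fox jumps over the lazy dog."
print(f"vocab size: {tokenizer.vocab_size}, tokens in sample: {len(tokenizer.encode(sample))}")
```

If this call fails with an authorization error, revisit the Hugging Face login step before proceeding.
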
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2c49c1b8-2447-426c-9f24-bf5956aa2941",
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "!python /opt/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \\\n",
+    "--input=\"./wikitext-data/wikitext-train.jsonl\" \\\n",
+    "--tokenizer-library='huggingface' \\\n",
+    "--tokenizer-type='meta-llama/Meta-Llama-3.1-8B' \\\n",
+    "--output-prefix=wikitext_tokenized_train \\\n",
+    "--append-eod \\\n",
+    "--workers=32"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "72d14fd7-702f-4b74-a6e5-af3a60eef3a9",
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "!python /opt/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \\\n",
+    "--input=\"./wikitext-data/wikitext-test.jsonl\" \\\n",
+    "--tokenizer-library='huggingface' \\\n",
+    "--tokenizer-type='meta-llama/Meta-Llama-3.1-8B' \\\n",
+    "--output-prefix=wikitext_tokenized_test \\\n",
+    "--append-eod \\\n",
+    "--workers=32"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1338a1ce-f0e2-4151-ad3d-d34db75ea1bd",
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "!python /opt/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \\\n",
+    "--input=\"./wikitext-data/wikitext-val.jsonl\" \\\n",
+    "--tokenizer-library='huggingface' \\\n",
+    "--tokenizer-type='meta-llama/Meta-Llama-3.1-8B' \\\n",
+    "--output-prefix=wikitext_tokenized_val \\\n",
+    "--append-eod \\\n",
+    "--workers=32"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eb80e212-c343-4e51-a92d-184db43df011",
+   "metadata": {},
+   "source": [
+    "After running the above scripts, you will see the preprocessed `wikitext_tokenized_{train/val/test}_text_document.{idx/bin}` files. These output files will be used in the next step."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e9f30c0a-4315-4017-b014-add4291a3fde",
+   "metadata": {},
+   "source": [
+    "\n",
+    "### Step 2: Finetune the teacher on the dataset\n",
+    "\n",
+    "The NeMo Framework includes a standard Python script, [megatron_gpt_pretraining.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/megatron_gpt_pretraining.py), for training a model. Once you have your model downloaded and the dataset ready, fine-tuning the teacher model with NeMo is essentially just running this script!\n",
+    "\n",
+    "For this demonstration, this training run is capped by `STEPS`, and validation is carried out every `VAL_INTERVAL` steps.\n",
+    "\n",
+    "> `NOTE:` In the block of code below, pass the paths to your pre-processed train, test and validation data files as well as the path to the teacher .nemo model."
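
Since the finetuning command below runs for a while before it would fail on a bad path, a quick pre-flight check that the tokenized dataset files and the teacher `.nemo` checkpoint exist can save a wasted launch. A small sketch, assuming the default paths used in this notebook:

```python
import os

# Paths assumed by the training command below; adjust if your layout differs.
expected = ["/workspace/llama-3_1-8b-instruct-nemo_v1.0/llama3_1_8b_instruct.nemo"]
for prefix in ("wikitext_tokenized_train", "wikitext_tokenized_test", "wikitext_tokenized_val"):
    expected += [f"{prefix}_text_document.bin", f"{prefix}_text_document.idx"]

for path in expected:
    print(("OK      " if os.path.exists(path) else "MISSING ") + path)
```
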
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c31fd642-0304-43ed-9211-041dc36f22c3", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "%%bash \n", + "\n", + "export CUDA_DEVICE_MAX_CONNECTIONS=1\n", + "\n", + "\n", + "# Set path(s) if different:\n", + "\n", + "MODEL=\"/workspace/llama-3_1-8b-instruct-nemo_v1.0/llama3_1_8b_instruct.nemo\"\n", + "\n", + "# Can change these to accommodate resources:\n", + "\n", + "TENSOR_PARALLEL_SIZE=8\n", + "NODES=1\n", + "MICRO_BATCH_SIZE=4\n", + "\n", + "# Don't change the following:\n", + "\n", + "EXPERIMENT_DIR=\"distill_trainings\"\n", + "EXPERIMENT_NAME=\"megatron_llama_ft\"\n", + "\n", + "DATA_TRAIN='wikitext_tokenized_train_text_document'\n", + "DATA_VAL='wikitext_tokenized_test_text_document'\n", + "DATA_TEST='wikitext_tokenized_val_text_document'\n", + "\n", + "STEPS=30\n", + "GLOBAL_BATCH_SIZE=128\n", + "\n", + "LOG_INTERVAL=1\n", + "VAL_INTERVAL=10\n", + "NUM_VAL_BATCHES=5\n", + "\n", + "LR=1e-4\n", + "MIN_LR=1e-5\n", + "WARMUP_STEPS=2\n", + "\n", + "\n", + "cmd=\"torchrun --nproc-per-node=${TENSOR_PARALLEL_SIZE}\"\n", + "\n", + "${cmd} /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \\\n", + " --config-path /opt/NeMo/examples/nlp/language_modeling/conf/ \\\n", + " --config-name megatron_llama_distill.yaml \\\n", + " \\\n", + " name=${EXPERIMENT_NAME} \\\n", + " \\\n", + " exp_manager.exp_dir=${EXPERIMENT_DIR} \\\n", + " exp_manager.checkpoint_callback_params.save_top_k=1 \\\n", + " exp_manager.checkpoint_callback_params.save_nemo_on_train_end=True \\\n", + " \\\n", + " trainer.max_steps=${STEPS} \\\n", + " trainer.log_every_n_steps=${LOG_INTERVAL} \\\n", + " trainer.val_check_interval=${VAL_INTERVAL} \\\n", + " trainer.limit_val_batches=${NUM_VAL_BATCHES} \\\n", + " +trainer.num_sanity_val_steps=0 \\\n", + " \\\n", + " trainer.precision=bf16 \\\n", + " trainer.devices=${TENSOR_PARALLEL_SIZE} \\\n", + " trainer.num_nodes=${NODES} \\\n", + " \\\n", + " \"model.data.data_prefix={train:[1.0,$DATA_TRAIN],validation:[$DATA_VAL],test:[$DATA_TEST]}\" \\\n", + " \\\n", + " model.restore_from_path=${MODEL} \\\n", + " \\\n", + " ~model.tokenizer \\\n", + " +model.tokenizer='{library: huggingface, type: meta-llama/Meta-Llama-3.1-8B, use_fast: True}' \\\n", + " \\\n", + " model.tensor_model_parallel_size=${TENSOR_PARALLEL_SIZE} \\\n", + " model.sequence_parallel=True \\\n", + " model.micro_batch_size=${MICRO_BATCH_SIZE} \\\n", + " model.global_batch_size=${GLOBAL_BATCH_SIZE} \\\n", + " \\\n", + " model.encoder_seq_length=8192 \\\n", + " model.num_layers=32 \\\n", + " model.hidden_size=4096 \\\n", + " model.ffn_hidden_size=14336 \\\n", + " model.num_attention_heads=32 \\\n", + " model.hidden_dropout=0.0 \\\n", + " model.attention_dropout=0.0 \\\n", + " model.apply_query_key_layer_scaling=True \\\n", + " model.normalization='rmsnorm' \\\n", + " model.bias=False \\\n", + " model.activation='fast-swiglu' \\\n", + " model.position_embedding_type='rope' \\\n", + " model.share_embeddings_and_output_weights=False \\\n", + " model.num_query_groups=8 \\\n", + " ++model.scale_positional_embedding=True \\\n", + " ++model.rotary_base=500000.0 \\\n", + " \\\n", + " model.optim.name=distributed_fused_adam \\\n", + " model.optim.lr=${LR} \\\n", + " model.optim.sched.min_lr=${MIN_LR} \\\n", + " model.optim.sched.warmup_steps=${WARMUP_STEPS}" + ] + }, + { + "cell_type": "markdown", + "id": "8aaf604a-efc0-4908-9055-5cf3bb0a05ae", + "metadata": {}, + "source": [ + "This will create a finetuned 
teacher model named `megatron_llama_ft.nemo` in `./distill_trainings/megatron_llama_ft/checkpoints/`. We'll use this later.\n",
+    "> `NOTE:` This script takes at least 20 minutes to run (depending on the GPU) and will generate the finetuned teacher model."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2709ccc0-bbb8-44ba-b00d-15b1dc5d60a7",
+   "metadata": {},
+   "source": [
+    "### Step 3: Prune the finetuned-teacher model to create a student\n",
+    "\n",
+    "The next step is to trim the last 16 layers in the finetuned teacher model. In this notebook, we are using depth-pruning and will use the [megatron_gpt_drop_layers](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/megatron_gpt_drop_layers.py) script.\n",
+    "> `NOTE:` In the block of code below, pass the path to your finetuned teacher .nemo model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a9715a1b-7a23-437f-b5e1-feec8e6c68e0",
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "!python -m torch.distributed.launch --nproc_per_node=8 \\\n",
+    "    /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_drop_layers.py \\\n",
+    "    --path_to_nemo \"./distill_trainings/megatron_llama_ft/checkpoints/megatron_llama_ft.nemo\" \\\n",
+    "    --path_to_save \"/workspace/4b_trimmed_model.nemo\" \\\n",
+    "    --tensor_model_parallel_size 8 \\\n",
+    "    --pipeline_model_parallel_size 1 \\\n",
+    "    --gpus_per_node 8 \\\n",
+    "    --drop_layers 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1e9553db-9478-4074-9de1-1fa01a0e835c",
+   "metadata": {},
+   "source": [
+    "Running this script will save the depth-pruned model `4b_trimmed_model.nemo` to your workspace."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b8ada696-5d77-4113-9d15-a603113fdd58",
+   "metadata": {},
+   "source": [
+    "\n",
+    "### Step 4: Distill knowledge from teacher into student\n",
+    "\n",
+    "Distillation of a model with NeMo Framework is also possible using a Python script: [megatron_gpt_distillation.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/megatron_gpt_distillation.py).\n",
+    "\n",
+    "For this demonstration, the `TEACHER` is the finetuned teacher model `megatron_llama_ft.nemo` and the `STUDENT` is the pruned 4B model `4b_trimmed_model.nemo`. This training run is capped by `STEPS`, and validation is carried out every `VAL_INTERVAL` steps.\n",
+    "\n",
+    "> `NOTE:` In the block of code below, pass the paths to your pre-processed train, test and validation data files as well as the paths to the teacher and student .nemo models."
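
To put the length of this run in perspective: with the values used below, the student only sees `STEPS x GLOBAL_BATCH_SIZE = 30 x 128 = 3,840` packed sequences, roughly 31M tokens assuming the 8192-token sequence length carried over from the teacher finetuning above, which is enough to demonstrate the workflow but not to converge. A quick back-of-the-envelope calculation:

```python
# Rough scale of the demonstration distillation run (values from the command below).
steps = 30
global_batch_size = 128
seq_length = 8192  # assumed to carry over from the teacher finetuning config above

samples = steps * global_batch_size
tokens = samples * seq_length
print(f"{samples:,} sequences = approx. {tokens / 1e6:.1f}M tokens")  # 3,840 sequences, ~31.5M tokens
```
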
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61c0c69d-9401-4355-8725-78aa72eee8da", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "%%bash \n", + "\n", + "export CUDA_DEVICE_MAX_CONNECTIONS=1\n", + "\n", + "\n", + "# Can change these to accommodate resources:\n", + "\n", + "TENSOR_PARALLEL_SIZE=8\n", + "NODES=1\n", + "MICRO_BATCH_SIZE=4\n", + "\n", + "# Don't change the following:\n", + "\n", + "EXPERIMENT_DIR=\"distill_trainings\"\n", + "EXPERIMENT_NAME=\"megatron_llama_distill\"\n", + "\n", + "TEACHER=\"${EXPERIMENT_DIR}/megatron_llama_ft/checkpoints/megatron_llama_ft.nemo\"\n", + "STUDENT=\"/workspace/4b_trimmed_model.nemo\"\n", + "\n", + "FINAL_MODEL_PATH=\"${EXPERIMENT_DIR}/${EXPERIMENT_NAME}/checkpoints/distilled_4b_model.nemo\"\n", + "\n", + "DATA_TRAIN='wikitext_tokenized_train_text_document'\n", + "DATA_VAL='wikitext_tokenized_test_text_document'\n", + "DATA_TEST='wikitext_tokenized_val_text_document'\n", + "\n", + "STEPS=30\n", + "GLOBAL_BATCH_SIZE=128\n", + "\n", + "LOG_INTERVAL=1\n", + "VAL_INTERVAL=10\n", + "NUM_VAL_BATCHES=5\n", + "\n", + "LR=1e-4\n", + "MIN_LR=1e-5\n", + "WARMUP_STEPS=2\n", + "\n", + "\n", + "cmd=\"torchrun --nproc-per-node=${TENSOR_PARALLEL_SIZE}\"\n", + "\n", + "${cmd} /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_distillation.py \\\n", + " name=${EXPERIMENT_NAME} \\\n", + " \\\n", + " exp_manager.exp_dir=${EXPERIMENT_DIR} \\\n", + " exp_manager.checkpoint_callback_params.save_top_k=1 \\\n", + " \\\n", + " trainer.max_steps=${STEPS} \\\n", + " trainer.log_every_n_steps=${LOG_INTERVAL} \\\n", + " trainer.val_check_interval=${VAL_INTERVAL} \\\n", + " trainer.limit_val_batches=${NUM_VAL_BATCHES} \\\n", + " +trainer.num_sanity_val_steps=0 \\\n", + " \\\n", + " trainer.precision=bf16 \\\n", + " trainer.devices=${TENSOR_PARALLEL_SIZE} \\\n", + " trainer.num_nodes=${NODES} \\\n", + " \\\n", + " \"model.data.data_prefix={train:[1.0,$DATA_TRAIN],validation:[$DATA_VAL],test:[$DATA_TEST]}\" \\\n", + " \\\n", + " model.restore_from_path=${STUDENT} \\\n", + " model.kd_teacher_restore_from_path=${TEACHER} \\\n", + " model.nemo_path=${FINAL_MODEL_PATH} \\\n", + " \\\n", + " model.tensor_model_parallel_size=${TENSOR_PARALLEL_SIZE} \\\n", + " model.sequence_parallel=True \\\n", + " model.micro_batch_size=${MICRO_BATCH_SIZE} \\\n", + " model.global_batch_size=${GLOBAL_BATCH_SIZE} \\\n", + " \\\n", + " model.optim.name=distributed_fused_adam \\\n", + " model.optim.lr=${LR} \\\n", + " model.optim.sched.min_lr=${MIN_LR} \\\n", + " model.optim.sched.warmup_steps=${WARMUP_STEPS}\n" + ] + }, + { + "cell_type": "markdown", + "id": "fe7034ba-8c69-4edb-8c0f-84fdca43c152", + "metadata": {}, + "source": [ + "This will create the final distilled model named `distilled_4b_model.nemo` in `./distill_trainings/megatron_llama_distill/checkpoints`.\n", + "> `NOTE:`This script takes at least 35 minutes to run and generate the final distilled model." + ] + }, + { + "cell_type": "markdown", + "id": "c9a66d44-5028-47f9-9df3-9f07692e9461", + "metadata": {}, + "source": [ + "### Step 5: Display the validation loss\n", + "\n", + "Now that the results are in, let's visualize the validation loss of the distilled model using the `tensorboard` library. \n", + "> `NOTE:` This notebook demonstrates the use of the teacher finetuning, pruning and the distillation script. These scripts should ideally be run on a multi-node cluster with a larger `GLOBAL_BATCH_SIZE` and `STEPS` to see improvement in the validation loss." 
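
If you prefer to pull the numbers out programmatically, in addition to the interactive TensorBoard view in the next cell, the event files written under the experiment directory can also be read directly. A sketch using TensorBoard's `EventAccumulator`; the exact scalar tag name may differ between NeMo versions, so this matches any tag containing `val_loss`:

```python
import glob
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# Scan the distillation experiment directory for TensorBoard event files.
for event_file in glob.glob("distill_trainings/megatron_llama_distill/**/events.out.tfevents.*", recursive=True):
    acc = EventAccumulator(event_file)
    acc.Reload()
    for tag in acc.Tags().get("scalars", []):
        if "val_loss" in tag:
            points = acc.Scalars(tag)
            print(tag, [(p.step, round(p.value, 3)) for p in points])
```
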
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be4da14c-c03f-4c28-accd-8f676dbef8a9", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext tensorboard\n", + "%tensorboard --logdir \"distill_trainings/megatron_llama_distill/\" --port=6007" + ] + }, + { + "cell_type": "markdown", + "id": "08c63b80-0f24-4dde-b5d6-11db444726ed", + "metadata": {}, + "source": [ + "Here is an image of the validation loss over 30 steps of running the training step in the distillation script." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "648424fc-6a51-43ca-8f19-6ad05f949054", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from IPython.display import Image, display\n", + "display(Image(url=\"https://github.com/NVIDIA/NeMo/releases/download/r2.0.0rc1/val_loss_distillation.png\", width=400))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}