
Commit 03838bf

Apply isort and black reformatting
Signed-off-by: yaoyu-33 <[email protected]>
yaoyu-33 committed Nov 5, 2024
1 parent dcf115c commit 03838bf
Showing 2 changed files with 58 additions and 62 deletions.
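The two diffs below contain only mechanical formatter changes: isort collapses the duplicated `from typing import ...` and `from torch.utils.data import ...` statements into single lines, and black normalizes whitespace (for example, `str|List[str]` becomes `str | List[str]` and slice spacing is adjusted). As a rough illustration, not part of the commit, the same kind of rewrite can be reproduced with the isort and black Python APIs (assuming both packages are installed; the snippet and its sample source string are invented for this example):

```python
# Illustrative sketch only: run isort and black on a snippet that mirrors the
# pre-formatting state of the imports and annotations changed in this commit.
import black
import isort

SRC = '''\
from typing import Any, Dict, List, Sequence
from typing import Optional
from torch.utils.data import DataLoader
from torch.utils.data import default_collate


def build_loader(dataset, paths: str|List[str]) -> DataLoader:
    return DataLoader(dataset, collate_fn=default_collate)
'''

# isort merges imports from the same module into one statement; the "black"
# profile keeps its wrapping decisions compatible with black.
sorted_src = isort.code(SRC, profile="black")

# black then normalizes operator spacing, e.g. `str|List[str]` -> `str | List[str]`.
formatted_src = black.format_str(sorted_src, mode=black.Mode())
print(formatted_src)
```

Running isort before black mirrors the usual pre-commit ordering; isort's black profile exists so that black leaves the sorted imports untouched.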
nemo/collections/vlm/mllama/data/lazy.py (54 changes: 26 additions & 28 deletions)
@@ -16,16 +16,14 @@
 import logging
 import os
 import re
-from typing import Any, Dict, List, Sequence
-from typing import Optional
+from typing import Any, Dict, List, Optional, Sequence

 import pytorch_lightning as pl
 import torch
 import torch.nn.functional as F
 from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
 from torch.utils import data
-from torch.utils.data import DataLoader
-from torch.utils.data import default_collate
+from torch.utils.data import DataLoader, default_collate

 from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids
 from nemo.collections.vlm.mllama.model.utils import create_vision_mask_tensor
@@ -38,12 +36,12 @@ class MLlamaDataset(LazySupervisedDataset):
     """Dataset for supervised fine-tuning."""

     def __init__(
-        self,
-        data_path,
-        data_config,
-        tokenizer,
-        image_processor,
-        sequence_length,
+        self,
+        data_path,
+        data_config,
+        tokenizer,
+        image_processor,
+        sequence_length,
     ):

         if data_path.endswith(".json"):
@@ -174,24 +172,24 @@ def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:

 class MLlamaLazyDataModule(pl.LightningDataModule):
     def __init__(
-        self,
-        paths: str|List[str],
-        weights: Optional[List[float]] = None,
-        data_config: Optional[DataConfig] = ImageDataConfig,
-        seq_length: int = 2048,
-        decoder_seq_length: Optional[int] = None,
-        tokenizer: Optional = None,
-        image_processor: Optional = None,
-        micro_batch_size: int = 4,
-        global_batch_size: int = 8,
-        num_train_samples: int = 10_000,
-        num_val_samples: int = 10_000,
-        num_test_samples: int = 10_000,
-        num_workers: int = 8,
-        pin_memory: bool = True,
-        persistent_workers: bool = False,
-        use_packed_sequence: bool = False,
-        seed: int = 1234,
+        self,
+        paths: str | List[str],
+        weights: Optional[List[float]] = None,
+        data_config: Optional[DataConfig] = ImageDataConfig,
+        seq_length: int = 2048,
+        decoder_seq_length: Optional[int] = None,
+        tokenizer: Optional = None,
+        image_processor: Optional = None,
+        micro_batch_size: int = 4,
+        global_batch_size: int = 8,
+        num_train_samples: int = 10_000,
+        num_val_samples: int = 10_000,
+        num_test_samples: int = 10_000,
+        num_workers: int = 8,
+        pin_memory: bool = True,
+        persistent_workers: bool = False,
+        use_packed_sequence: bool = False,
+        seed: int = 1234,
     ) -> None:
         super().__init__()
         if not isinstance(paths, (list, tuple)):
nemo/collections/vlm/neva/data/lazy.py (66 changes: 32 additions & 34 deletions)
@@ -17,8 +16,7 @@
 import os
 import re
 import tarfile
-from typing import Any, Dict, List, Sequence
-from typing import Optional
+from typing import Any, Dict, List, Optional, Sequence

 import decord
 import numpy as np
@@ -28,8 +27,7 @@
 from PIL import Image
 from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
 from torch.utils import data
-from torch.utils.data import DataLoader
-from torch.utils.data import Dataset, default_collate
+from torch.utils.data import DataLoader, Dataset, default_collate
 from transformers import CLIPImageProcessor, SiglipImageProcessor

 from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids
@@ -239,7 +237,7 @@ def find_pattern_indices(template, pattern, search_start_index=0, allow_first_to
     template_len = len(template)
     pattern_len = len(pattern)
     for i in range(search_start_index, template_len - pattern_len + 1):
-        match = template[i: i + pattern_len] == pattern
+        match = template[i : i + pattern_len] == pattern
         if torch.all(match) or (allow_first_token_mismatch and torch.all(match[1:])):
             return i, i + pattern_len
     return -1, -1
@@ -248,12 +246,12 @@
 class LazySupervisedDataset(Dataset):

     def __init__(
-        self,
-        data_path,
-        data_config,
-        tokenizer,
-        image_processor,
-        sequence_length,
+        self,
+        data_path,
+        data_config,
+        tokenizer,
+        image_processor,
+        sequence_length,
     ):
         super().__init__()
         if data_path is not None:
@@ -352,7 +350,7 @@ def _tokenize_and_label(self, conversations):
         for i in range(1, len(self.conv.messages), 2):
             stop_str = getattr(self.conv, "stop_str", None)
             assert (
-                stop_str is not None
+                stop_str is not None
             ), "If `stop_str` is not provided, issues might occur in labeling the answer tokens."
             answer_tokens = self.tokenizer.encode(
                 self.conv.messages[i][1] + ("" if stop_str is None else stop_str),
@@ -378,11 +376,11 @@ class NevaDataset(LazySupervisedDataset):
     """Dataset for supervised fine-tuning."""

     def __init__(
-        self,
-        data_path,
-        data_config,
-        tokenizer,
-        image_processor,
+        self,
+        data_path,
+        data_config,
+        tokenizer,
+        image_processor,
     ):

         if data_path.endswith(".json"):
@@ -494,23 +492,23 @@ def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:

 class NevaLazyDataModule(pl.LightningDataModule):
     def __init__(
-        self,
-        paths: str|List[str],
-        weights: Optional[List[float]] = None,
-        data_config: Optional[DataConfig] = ImageDataConfig,
-        seq_length: int = 2048,
-        tokenizer: Optional = None,
-        image_processor: Optional = None,
-        micro_batch_size: int = 4,
-        global_batch_size: int = 8,
-        num_train_samples: int = 10_000,
-        num_val_samples: int = 10_000,
-        num_test_samples: int = 10_000,
-        num_workers: int = 8,
-        pin_memory: bool = True,
-        persistent_workers: bool = False,
-        use_packed_sequence: bool = False,
-        seed: int = 1234,
+        self,
+        paths: str | List[str],
+        weights: Optional[List[float]] = None,
+        data_config: Optional[DataConfig] = ImageDataConfig,
+        seq_length: int = 2048,
+        tokenizer: Optional = None,
+        image_processor: Optional = None,
+        micro_batch_size: int = 4,
+        global_batch_size: int = 8,
+        num_train_samples: int = 10_000,
+        num_val_samples: int = 10_000,
+        num_test_samples: int = 10_000,
+        num_workers: int = 8,
+        pin_memory: bool = True,
+        persistent_workers: bool = False,
+        use_packed_sequence: bool = False,
+        seed: int = 1234,
     ) -> None:
         super().__init__()
         if not isinstance(paths, (list, tuple)):
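The `NevaLazyDataModule.__init__` signature that black re-indents above also serves as a quick reference for constructing the data module. A minimal, hypothetical usage sketch follows (the JSON path is a placeholder, not a value from this commit, and the keyword arguments simply repeat defaults visible in the diff):

```python
# Hypothetical usage of the constructor shown in the diff above; the data path
# is a placeholder, and tokenizer/image_processor are left at their None
# defaults purely for brevity.
from nemo.collections.vlm.neva.data.lazy import NevaLazyDataModule

data_module = NevaLazyDataModule(
    paths="/data/neva/train_conversations.json",  # placeholder path
    seq_length=2048,
    micro_batch_size=4,
    global_batch_size=8,
    num_workers=8,
    pin_memory=True,
)
```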
