Skip to content

Commit

Permalink
[Feature] Support load_json_file with json.load (#610)
Browse files Browse the repository at this point in the history
support load_json_file with json.load
  • Loading branch information
HIT-cwh authored Apr 24, 2024
1 parent 0e6241f commit 649cab9
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 14 deletions.
21 changes: 7 additions & 14 deletions xtuner/dataset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .intern_repo import (build_packed_dataset,
load_intern_repo_tokenized_dataset,
load_intern_repo_untokenized_dataset)
from .json_dataset import load_json_file
from .llava import LLaVADataset
from .modelscope import process_ms_dataset
from .moss_sft import MOSSSFTDataset
Expand All @@ -17,19 +18,11 @@
warnings.simplefilter(action='ignore', category=FutureWarning)

__all__ = [
'process_hf_dataset',
'ConcatDataset',
'MOSSSFTDataset',
'process_ms_dataset',
'LLaVADataset',
'expand2square',
'decode_base64_to_image',
'load_image',
'process_ms_dataset',
'process_hf_dataset', 'ConcatDataset', 'MOSSSFTDataset',
'process_ms_dataset', 'LLaVADataset', 'expand2square',
'decode_base64_to_image', 'load_image', 'process_ms_dataset',
'load_intern_repo_tokenized_dataset',
'load_intern_repo_untokenized_dataset',
'build_packed_dataset',
'RefCOCOJsonDataset',
'RefCOCOJsonEvalDataset',
'InvRefCOCOJsonDataset',
'load_intern_repo_untokenized_dataset', 'build_packed_dataset',
'RefCOCOJsonDataset', 'RefCOCOJsonEvalDataset', 'InvRefCOCOJsonDataset',
'load_json_file'
]
24 changes: 24 additions & 0 deletions xtuner/dataset/json_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import json
import os

from datasets import Dataset, concatenate_datasets


def load_json_file(data_files=None, data_dir=None, suffix=None):
    """Load one or more JSON files into a single ``datasets.Dataset``.

    Exactly one of *data_files* and *data_dir* must be provided.

    Args:
        data_files (str | list[str] | None): A path, or list of paths, to
            JSON files. Each file must contain a JSON array of records
            (i.e. ``json.load`` yields a list of dicts).
        data_dir (str | None): A directory whose direct entries are treated
            as JSON files. The listing is not recursive.
        suffix (str | None): When *data_dir* is given, keep only entries
            whose name ends with this suffix (e.g. ``'.json'``).

    Returns:
        datasets.Dataset: The concatenation of all loaded files, in
        processing order.

    Raises:
        ValueError: If neither or both of *data_files* and *data_dir*
            are given.
    """
    # Explicit check instead of `assert`: asserts are stripped under
    # `python -O`, which would silently disable this validation.
    if (data_files is None) == (data_dir is None):
        raise ValueError(
            'Exactly one of `data_files` and `data_dir` must be specified.')
    if data_dir is not None:
        # Sort for a deterministic dataset order; os.listdir order is
        # arbitrary and filesystem-dependent.
        data_files = sorted(os.listdir(data_dir))
        data_files = [os.path.join(data_dir, fn) for fn in data_files]
        if suffix is not None:
            data_files = [fp for fp in data_files if fp.endswith(suffix)]
    elif isinstance(data_files, str):
        # Allow a single path to be passed without wrapping it in a list.
        data_files = [data_files]

    dataset_list = []
    for fp in data_files:
        with open(fp, encoding='utf-8') as file:
            data = json.load(file)
        ds = Dataset.from_list(data)
        dataset_list.append(ds)
    dataset = concatenate_datasets(dataset_list)
    return dataset

0 comments on commit 649cab9

Please sign in to comment.