feat(client): support dataset build from jsonl files (#2606)
tianweidut authored Aug 11, 2023
1 parent 7dc8468 commit 69d4303
Showing 13 changed files with 260 additions and 153 deletions.
136 changes: 112 additions & 24 deletions client/starwhale/api/_impl/dataset/model.py
@@ -73,6 +73,7 @@
_DType = t.TypeVar("_DType", bound="Dataset")
_ItemType = t.Union[str, int, slice]
_GItemType = t.Optional[t.Union[DataRow, t.List[DataRow]]]
+ _IterFeatureDict = t.Iterable[t.Dict[str, t.Any]]

_DEFAULT_LOADER_WORKERS = 2
_DEFAULT_LOADER_CACHE_SIZE = 20
@@ -1266,36 +1267,76 @@ def from_huggingface(
def from_json(
cls,
name: str,
- json_text: str,
+ path: PathLike | t.List[PathLike] | None = None,
+ text: str | None = None,
field_selector: str = "",
alignment_size: int | str = D_ALIGNMENT_SIZE,
volume_size: int | str = D_FILE_VOLUME_SIZE,
mode: DatasetChangeMode | str = DatasetChangeMode.PATCH,
tags: t.List[str] | None = None,
+ encoding: str | None = None,
) -> Dataset:
"""Create a new dataset from a json text.
The dataset created by the json text will use the auto increment index as the row index.
path and text arguments are mutually exclusive, one of them must be specified.
Arguments:
name: (str, required) The dataset name you would like to use.
- json_text: (str, required) The json text from which you would like to create this dataset.
+ path: (str|Path|List[str]|List[Path], optional) Json or json line files.
+ text: (str, optional) The json text from which you would like to create this dataset.
field_selector: (str, optional) The field from which you would like to extract dataset array items.
The default value is "" which indicates that the json object is an array containing all the items.
alignment_size: (int|str, optional) The blob alignment size. The default value is 128.
volume_size: (int|str, optional) The blob volume size. The default value is 64MB.
mode: (str|DatasetChangeMode, optional) The dataset change mode. The default value is `patch`. Mode choices are `patch` and `overwrite`.
tags: (list(str), optional) The tags for the dataset version. `latest` and `^v\d+$` tags are reserved tags.
encoding: (str, optional) The encoding used to decode the input files. The default is None.
The encoding argument does not apply to the text parameter.
Returns:
A Dataset Object
Json text format:
```json
[
{"a": 1, "b": 2},
{"a": 10, "b": 20},
]
```
Using field_selector: p1.p2.p3 to extract dataset array items:
```json
{
"p1": {
"p2":{
"p3": [
{"a": 1, "b": 2},
{"a": 10, "b": 20},
]
}
}
}
```
Json line text format:
```jsonl
{"a": 1, "b": 2}
{"a": 10, "b": 20}
```
Using field_selector: p1.p2.p3 to extract the dataset item from each line:
```jsonl
{"p1": {"p2": {"p3": {"a": 1, "b": 2}}}}
{"p1": {"p2": {"p3": {"a": 10, "b": 20}}}}
```
Examples:
```python
from starwhale import Dataset
myds = Dataset.from_json(
name="translation",
json_text='[{"en":"hello","zh-cn":"你好"},{"en":"how are you","zh-cn":"最近怎么样"}]'
text='[{"en":"hello","zh-cn":"你好"},{"en":"how are you","zh-cn":"最近怎么样"}]'
)
print(myds[0].features.en)
```
@@ -1304,33 +1345,79 @@ def from_json(
from starwhale import Dataset
myds = Dataset.from_json(
name="translation",
json_text='{"content":{"child_content":[{"en":"hello","zh-cn":"你好"},{"en":"how are you","zh-cn":"最近怎么样"}]}}',
text='{"content":{"child_content":[{"en":"hello","zh-cn":"你好"},{"en":"how are you","zh-cn":"最近怎么样"}]}}',
field_selector="content.child_content"
)
print(myds[0].features["zh-cn"])
```
```python
from starwhale import Dataset
# create a dataset from /path/to/data.json file.
Dataset.from_json(path="/path/to/data.json", name="myds"))
# create a dataset from /path/to/dir folder.
Dataset.from_json(path="/path/to/dir", name="myds")
# create a dataset from /path/to/data1.json, /path/to/data2.json
Dataset.from_json(path=["/path/to/data1.json", "/path/to/data2.json"], name="myds")
# create a dataset from http://example.com/data.json file.
Dataset.from_json(path="http://example.com/data.json", name="myds")
```
"""
mode = DatasetChangeMode(mode)
- data_items = json.loads(json_text)
-
- if field_selector:
-     # Split field selector by dots
-     fields = field_selector.split(".")
-     # Iterate over selected fields
-     for field in fields:
-         if field in data_items:
-             data_items = data_items[field]
-         else:
-             raise ValueError(
-                 f"The field_selector {field_selector} isn't in json_text: {json_text}"
-             )
- if not isinstance(data_items, list):
-     raise ValueError(
-         f"The field selected by field_selector {field_selector} isn't an array: {data_items}"
-     )
+
+ def _selector(_data: t.Sequence | t.Dict) -> t.Sequence | t.Dict:
+     if field_selector:
+         fields = field_selector.split(".")
+         for field in fields:
+             if not isinstance(_data, dict):
+                 raise NoSupportError(
+                     f"field_selector only supports dict type: {_data}"
+                 )
+             if field in _data:
+                 _data = _data[field]
+             else:
+                 raise ValueError(
+                     f"The field_selector {field_selector} isn't in json text: {_data}"
+                 )
+     return _data
+
+ def _decode() -> t.Iterable[t.Sequence | t.Dict]:
+     if path is not None and text is not None:
+         raise ValueError("path and text arguments are mutually exclusive")
+     elif path:
+         for fp, suffix in iter_pathlike_io(
+             path, encoding=encoding, accepted_file_types=[".json", ".jsonl"]
+         ):
+             if suffix == ".json":
+                 yield _selector(json.load(fp))
+             elif suffix == ".jsonl":
+                 for line in fp.readlines():
+                     _r = json.loads(line)
+                     _r = _selector(_r)
+                     if isinstance(_r, dict):
+                         _r = [_r]
+                     yield _r
+             else:
+                 raise ValueError(f"unsupported file type: {suffix}")
+     elif text:
+         yield _selector(json.loads(text))
+     else:
+         raise ValueError("path or text argument must be specified")
+
+ def _iter(_data_iter: t.Iterable[t.Sequence | t.Dict]) -> _IterFeatureDict:
+     for i in _data_iter:
+         if isinstance(i, (list, tuple)):
+             for j in i:
+                 yield j
+         elif isinstance(i, dict):
+             yield i
+         else:
+             raise ValueError(f"json text: {i} must be dict, list or tuple type")

return cls.from_dict_items(
- data_items,
+ _iter(_decode()),
name=name,
volume_size=volume_size,
alignment_size=alignment_size,
@@ -1404,8 +1491,8 @@ def from_csv(
```
"""

- def _iter_records() -> t.Iterator[t.Dict[str, t.Any]]:
-     for fp in iter_pathlike_io(
+ def _iter_records() -> _IterFeatureDict:
+     for fp, _ in iter_pathlike_io(
path=path, encoding=encoding, newline="", accepted_file_types=[".csv"]
):
for record in csv.DictReader(
@@ -1473,6 +1560,7 @@ def _iter_records():
ds = Dataset.from_dict_items(iter([(1, {"a": 1, "b": 2, "c": 3})]), name="my-dataset")
```
"""
+ mode = DatasetChangeMode(mode)
StandaloneTag.check_tags_validation(tags)

with cls.dataset(name) as ds:
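Taken together, the new `from_json` surface can be exercised as below — a minimal sketch against the signature diffed above, with placeholder file paths and dataset name rather than code from this commit:

```python
from starwhale import Dataset

# Build one dataset from two local jsonl files (placeholder paths);
# every line of each file becomes one row with an auto-increment index.
ds = Dataset.from_json(
    name="my-jsonl-ds",
    path=["/path/to/test01.jsonl", "/path/to/test02.jsonl"],
    encoding="utf-8",
)
print(ds[0].features)
```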
55 changes: 30 additions & 25 deletions client/starwhale/core/dataset/cli.py
@@ -71,12 +71,14 @@ def dataset_cmd(ctx: click.Context) -> None:
help="Build dataset from dataset.yaml file. Default uses dataset.yaml in the work directory(pwd).",
)
@optgroup.option( # type: ignore[no-untyped-call]
"json_file",
"json_files",
"-jf",
"--json-file",
"--json",
multiple=True,
help=(
"Build dataset from json file, the json file option is a json file path or a http downloaded url."
"The json content structure should be a list[dict] or tuple[dict]."
"Build dataset from json or json line files, local path or http downloaded url is supported."
"For the json file: the json content structure should be a list[dict] or tuple[dict] from the original format or ingest format by field-selector."
"For the json line file: each line is a json dict."
),
)
@optgroup.option( # type: ignore[no-untyped-call]
@@ -141,6 +143,11 @@ def dataset_cmd(ctx: click.Context) -> None:
multiple=True,
help="dataset tags, the option can be used multiple times. `latest` and `^v\d+$` tags are reserved tags.",
)
+ @optgroup.option(  # type: ignore[no-untyped-call]
+     "file_encoding",
+     "--encoding",
+     help="The csv/json/jsonl file encoding.",
+ )
@optgroup.option("-r", "--runtime", help="runtime uri") # type: ignore[no-untyped-call]
@optgroup.group("\n ** Handler Build Source Configurations")
@optgroup.option("-w", "--workdir", default=".", help="work dir to search handler, the option only works for the handler build source.") # type: ignore[no-untyped-call]
@@ -152,8 +159,9 @@ def dataset_cmd(ctx: click.Context) -> None:
default=True,
help="Whether to auto label by the sub-folder name. The default value is True",
)
@optgroup.group("\n ** JsonFile Build Source Configurations")
@optgroup.group("\n ** Json Build Source Configurations")
@optgroup.option( # type: ignore[no-untyped-call]
"json_field_selector",
"--field-selector",
default="",
help=(
@@ -244,11 +252,6 @@ def dataset_cmd(ctx: click.Context) -> None:
show_default=True,
help="When True, raise exception Error if the csv is not well formed.",
)
- @optgroup.option(  # type: ignore[no-untyped-call]
-     "csv_encoding",
-     "--encoding",
-     help="The csv file encoding.",
- )
@click.pass_obj
def _build(
view: DatasetTermView,
@@ -265,8 +268,8 @@
audio_folder: str,
video_folder: str,
auto_label: bool,
- json_file: str,
- field_selector: str,
+ json_files: t.List[str],
+ json_field_selector: str,
mode: str,
hf_repo: str,
hf_subsets: t.List[str],
@@ -281,7 +284,7 @@ def _build(
csv_quotechar: str,
csv_skipinitialspace: bool,
csv_strict: bool,
- csv_encoding: str,
+ file_encoding: str,
) -> None:
"""Build Starwhale Dataset.
This command only supports building a standalone dataset.
@@ -328,11 +331,13 @@ def _build(
swcli dataset build --video-folder /path/to/video/folder # build dataset from /path/to/video/folder, search all video type files.
\b
- - from json file
- swcli dataset build --json-file /path/to/example.json
- swcli dataset build --json-file http://example.com/example.json
- swcli dataset build --json-file /path/to/example.json --field-selector a.b.c # extract the json_content["a"]["b"]["c"] field from the json file.
- swcli dataset build --name qald9 --json-file https://raw.githubusercontent.com/ag-sc/QALD/master/9/data/qald-9-test-multilingual.json --field-selector questions
+ - from json or json line files
+ swcli dataset build --json /path/to/example.json
+ swcli dataset build --json http://example.com/example.json
+ swcli dataset build --json /path/to/example.json --field-selector a.b.c # extract the json_content["a"]["b"]["c"] field from the json file.
+ swcli dataset build --name qald9 --json https://raw.githubusercontent.com/ag-sc/QALD/master/9/data/qald-9-test-multilingual.json --field-selector questions
+ swcli dataset build --json /path/to/test01.jsonl --json /path/to/test02.jsonl
+ swcli dataset build --json https://modelscope.cn/api/v1/datasets/damo/100PoisonMpts/repo\?Revision\=master\&FilePath\=train.jsonl
\b
- from huggingface dataset
@@ -373,17 +378,17 @@ def _build(
mode=mode_type,
tags=tags,
)
- elif json_file:
-     json_file = json_file.strip().rstrip("/")
-     view.build_from_json_file(
-         json_file,
-         name=name or json_file.split("/")[-1].split(".")[0],
+ elif json_files:
+     view.build_from_json_files(
+         json_files,
+         name=name or f"json-{random_str()}",
project_uri=project,
volume_size=volume_size,
alignment_size=alignment_size,
- field_selector=field_selector,
+ field_selector=json_field_selector,
mode=mode_type,
tags=tags,
+ encoding=file_encoding,
)
elif csv_files:
view.build_from_csv_files(
@@ -399,7 +404,7 @@ def _build(
quotechar=csv_quotechar,
skipinitialspace=csv_skipinitialspace,
strict=csv_strict,
- encoding=csv_encoding,
+ encoding=file_encoding,
)
elif python_handler:
_workdir = Path(workdir).absolute()
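For reference, the dotted-path lookup behind `--field-selector` can be sketched standalone; this mirrors the `_selector` helper in the model.py diff above, but the `select` function is an illustration, not the shipped code:

```python
import typing as t

def select(data: t.Any, field_selector: str) -> t.Any:
    # Walk a dotted path such as "p1.p2.p3" one key at a time.
    for field in field_selector.split("."):
        if not isinstance(data, dict):
            raise TypeError(f"field_selector only supports dict type: {data!r}")
        if field not in data:
            raise ValueError(f"{field_selector} isn't in json text: {data!r}")
        data = data[field]
    return data

assert select({"p1": {"p2": {"p3": [{"a": 1}]}}}, "p1.p2.p3") == [{"a": 1}]
```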
25 changes: 4 additions & 21 deletions client/starwhale/core/dataset/model.py
@@ -1,14 +1,12 @@
from __future__ import annotations

- import os
import typing as t
from abc import ABCMeta, abstractmethod
from http import HTTPStatus
from pathlib import Path
from collections import defaultdict

import yaml
- import requests

from starwhale.utils import console, load_yaml
from starwhale.consts import (
@@ -33,7 +31,6 @@
from starwhale.utils.http import ignore_error
from starwhale.base.bundle import BaseBundle, LocalStorageBundleMixin
from starwhale.utils.error import NoSupportError
- from starwhale.utils.retry import http_retry
from starwhale.base.uri.project import Project
from starwhale.base.uri.resource import Resource, ResourceType
from starwhale.core.dataset.copy import DatasetCopy
@@ -81,7 +78,7 @@ def build_from_folder(
) -> None:
raise NotImplementedError

- def build_from_json_file(self, json_file_path: str, **kwargs: t.Any) -> None:
+ def build_from_json_files(self, paths: t.List[PathLike], **kwargs: t.Any) -> None:
raise NotImplementedError

def build_from_csv_files(self, paths: t.List[PathLike], **kwargs: t.Any) -> None:
@@ -268,26 +265,12 @@ def build_from_huggingface(self, repo: str, **kwargs: t.Any) -> None:
f"[red bold blink] swcli dataset info {self.name}/version/{ds.committed_version[:SHORT_VERSION_CNT]}[/]"
)

- def build_from_json_file(self, json_file_path: str | Path, **kwargs: t.Any) -> None:
+ def build_from_json_files(self, paths: t.List[PathLike], **kwargs: t.Any) -> None:
     from starwhale.api._impl.dataset.model import Dataset as SDKDataset

-     json_file_path = str(json_file_path)
-     if json_file_path.startswith(("http://", "https://")):
-
-         @http_retry
-         def _r(url: str) -> str:
-             return requests.get(url, verify=False, timeout=90).text
-
-         json_text = _r(json_file_path)
-     elif os.path.isfile(json_file_path):
-         json_text = Path(json_file_path).read_text()
-     else:
-         raise RuntimeError(f"json file path:{json_file_path} not exists")
-
-     ds = SDKDataset.from_json(name=self.name, json_text=json_text, **kwargs)
+     ds = SDKDataset.from_json(name=self.name, path=paths, **kwargs)

     console.print(
-         f":hibiscus: congratulation! dataset build from {json_file_path} has been built. You can run "
+         f":hibiscus: congratulations! the dataset built from {paths} has been committed. You can run "
f"[red bold blink] swcli dataset info {self.name}/version/{ds.committed_version[:SHORT_VERSION_CNT]}[/]"
)

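One consequence of this hunk: the hand-rolled requests/http_retry download path is gone, and http(s) urls now flow through the same `iter_pathlike_io` handling as local files inside `Dataset.from_json`. A sketch of the resulting call, with a placeholder url:

```python
from starwhale import Dataset

# Remote json/jsonl files are fetched by the shared path-handling layer,
# not by an ad-hoc requests call in build_from_json_files.
ds = Dataset.from_json(name="remote-ds", path="https://example.com/data.jsonl")
```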