feat(client): support dataset build from jsonl files (#2606)
tianweidut authored Aug 11, 2023
1 parent 7dc8468 commit 69d4303
Showing 13 changed files with 260 additions and 153 deletions.
136 changes: 112 additions & 24 deletions client/starwhale/api/_impl/dataset/model.py
@@ -73,6 +73,7 @@
_DType = t.TypeVar("_DType", bound="Dataset")
_ItemType = t.Union[str, int, slice]
_GItemType = t.Optional[t.Union[DataRow, t.List[DataRow]]]
+ _IterFeatureDict = t.Iterable[t.Dict[str, t.Any]]

_DEFAULT_LOADER_WORKERS = 2
_DEFAULT_LOADER_CACHE_SIZE = 20
@@ -1266,36 +1267,76 @@ def from_huggingface(
def from_json(
cls,
name: str,
- json_text: str,
+ path: PathLike | t.List[PathLike] | None = None,
+ text: str | None = None,
field_selector: str = "",
alignment_size: int | str = D_ALIGNMENT_SIZE,
volume_size: int | str = D_FILE_VOLUME_SIZE,
mode: DatasetChangeMode | str = DatasetChangeMode.PATCH,
tags: t.List[str] | None = None,
+ encoding: str | None = None,
) -> Dataset:
"""Create a new dataset from a json text.
The dataset created by the json text will use the auto increment index as the row index.
path and text arguments are mutually exclusive, one of them must be specified.
Arguments:
name: (str, required) The dataset name you would like to use.
- json_text: (str, required) The json text from which you would like to create this dataset.
+ path: (str|Path|List[str]|List[Path], optional) Json or json line files.
+ text: (str, optional) The json text from which you would like to create this dataset.
field_selector: (str, optional) The field from which you would like to extract dataset array items.
The default value is "" which indicates that the json object is an array containing all the items.
alignment_size: (int|str, optional) The blob alignment size. The default value is 128.
volume_size: (int|str, optional) The blob volume size. The default value is 64MB.
mode: (str|DatasetChangeMode, optional) The dataset change mode. The default value is `patch`. Mode choices are `patch` and `overwrite`.
tags: (list(str), optional) The tags for the dataset version. `latest` and `^v\d+$` tags are reserved tags.
encoding: (str, optional) The encoding used to decode the input files. The default is None.
The encoding argument does not apply to the text parameter.
Returns:
A Dataset Object
Json text format:
```json
[
{"a": 1, "b": 2},
{"a": 10, "b": 20},
]
```
Using field_selector: p1.p2.p3 to extract dataset array items:
```json
{
"p1": {
"p2":{
"p3": [
{"a": 1, "b": 2},
{"a": 10, "b": 20},
]
}
}
}
```
Json line text format:
```jsonl
{"a": 1, "b": 2}
{"a": 10, "b": 20}
```
Using field_selector: p1.p2.p3 to extract the dataset item from each line:
```jsonl
{"p1": {"p2": {"p3": {"a": 1, "b": 2}}}}
{"p1": {"p2": {"p3": {"a": 10, "b": 20}}}}
```
Examples:
```python
from starwhale import Dataset
myds = Dataset.from_json(
name="translation",
json_text='[{"en":"hello","zh-cn":"你好"},{"en":"how are you","zh-cn":"最近怎么样"}]'
text='[{"en":"hello","zh-cn":"你好"},{"en":"how are you","zh-cn":"最近怎么样"}]'
)
print(myds[0].features.en)
```
@@ -1304,33 +1345,79 @@ def from_json(
from starwhale import Dataset
myds = Dataset.from_json(
name="translation",
json_text='{"content":{"child_content":[{"en":"hello","zh-cn":"你好"},{"en":"how are you","zh-cn":"最近怎么样"}]}}',
text='{"content":{"child_content":[{"en":"hello","zh-cn":"你好"},{"en":"how are you","zh-cn":"最近怎么样"}]}}',
field_selector="content.child_content"
)
print(myds[0].features["zh-cn"])
```
```python
from starwhale import Dataset
# create a dataset from /path/to/data.json file.
Dataset.from_json(path="/path/to/data.json", name="myds"))
# create a dataset from /path/to/dir folder.
Dataset.from_json(path="/path/to/dir", name="myds")
# create a dataset from /path/to/data1.json, /path/to/data2.json
Dataset.from_json(path=["/path/to/data1.json", "/path/to/data2.json"], name="myds")
# create a dataset from http://example.com/data.json file.
Dataset.from_json(path="http://example.com/data.json", name="myds")
```
"""
mode = DatasetChangeMode(mode)
- data_items = json.loads(json_text)
-
- if field_selector:
-     # Split field selector by dots
-     fields = field_selector.split(".")
-     # Iterate over selected fields
-     for field in fields:
-         if field in data_items:
-             data_items = data_items[field]
-         else:
-             raise ValueError(
-                 f"The field_selector {field_selector} isn't in json_text: {json_text}"
-             )
- if not isinstance(data_items, list):
-     raise ValueError(
-         f"The field selected by field_selector {field_selector} isn't an array: {data_items}"
-     )
+
+ def _selector(_data: t.Sequence | t.Dict) -> t.Sequence | t.Dict:
+     if field_selector:
+         fields = field_selector.split(".")
+         for field in fields:
+             if not isinstance(_data, dict):
+                 raise NoSupportError(
+                     f"field_selector only supports dict type: {_data}"
+                 )
+             if field in _data:
+                 _data = _data[field]
+             else:
+                 raise ValueError(
+                     f"The field_selector {field_selector} isn't in json text: {_data}"
+                 )
+     return _data
+
+ def _decode() -> t.Iterable[t.Sequence | t.Dict]:
+     if path is not None and text is not None:
+         raise ValueError("path and text arguments are mutually exclusive")
+     elif path:
+         for fp, suffix in iter_pathlike_io(
+             path, encoding=encoding, accepted_file_types=[".json", ".jsonl"]
+         ):
+             if suffix == ".json":
+                 yield _selector(json.load(fp))
+             elif suffix == ".jsonl":
+                 for line in fp.readlines():
+                     _r = json.loads(line)
+                     _r = _selector(_r)
+                     if isinstance(_r, dict):
+                         _r = [_r]
+                     yield _r
+             else:
+                 raise ValueError(f"unsupported file type: {suffix}")
+     elif text:
+         yield _selector(json.loads(text))
+     else:
+         raise ValueError("path or text argument must be specified")
+
+ def _iter(_data_iter: t.Iterable[t.Sequence | t.Dict]) -> _IterFeatureDict:
+     for i in _data_iter:
+         if isinstance(i, (list, tuple)):
+             for j in i:
+                 yield j
+         elif isinstance(i, dict):
+             yield i
+         else:
+             raise ValueError(f"json text: {i} must be dict, list or tuple type")

return cls.from_dict_items(
- data_items,
+ _iter(_decode()),
name=name,
volume_size=volume_size,
alignment_size=alignment_size,
@@ -1404,8 +1491,8 @@ def from_csv(
```
"""

- def _iter_records() -> t.Iterator[t.Dict[str, t.Any]]:
-     for fp in iter_pathlike_io(
+ def _iter_records() -> _IterFeatureDict:
+     for fp, _ in iter_pathlike_io(
path=path, encoding=encoding, newline="", accepted_file_types=[".csv"]
):
for record in csv.DictReader(
@@ -1473,6 +1560,7 @@ def _iter_records():
ds = Dataset.from_dict_items(iter([(1, {"a": 1, "b": 2, "c": 3})]), name="my-dataset")
```
"""
+ mode = DatasetChangeMode(mode)
StandaloneTag.check_tags_validation(tags)

with cls.dataset(name) as ds:
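Taken together, the new `from_json` surface can be exercised as below — a minimal sketch against the signature diffed above, with placeholder file paths and dataset name rather than code from this commit:

```python
from starwhale import Dataset

# Build one dataset from two local jsonl files (placeholder paths);
# every line of each file becomes one row with an auto-increment index.
ds = Dataset.from_json(
    name="my-jsonl-ds",
    path=["/path/to/test01.jsonl", "/path/to/test02.jsonl"],
    encoding="utf-8",
)
print(ds[0].features)
```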
55 changes: 30 additions & 25 deletions client/starwhale/core/dataset/cli.py
@@ -71,12 +71,14 @@ def dataset_cmd(ctx: click.Context) -> None:
help="Build dataset from dataset.yaml file. Default uses dataset.yaml in the work directory(pwd).",
)
@optgroup.option( # type: ignore[no-untyped-call]
"json_file",
"json_files",
"-jf",
"--json-file",
"--json",
multiple=True,
help=(
"Build dataset from json file, the json file option is a json file path or a http downloaded url."
"The json content structure should be a list[dict] or tuple[dict]."
"Build dataset from json or json line files, local path or http downloaded url is supported."
"For the json file: the json content structure should be a list[dict] or tuple[dict] from the original format or ingest format by field-selector."
"For the json line file: each line is a json dict."
),
)
@optgroup.option( # type: ignore[no-untyped-call]
@@ -141,6 +143,11 @@ def dataset_cmd(ctx: click.Context) -> None:
multiple=True,
help="dataset tags, the option can be used multiple times. `latest` and `^v\d+$` tags are reserved tags.",
)
+ @optgroup.option(  # type: ignore[no-untyped-call]
+     "file_encoding",
+     "--encoding",
+     help="The csv/json/jsonl file encoding.",
+ )
@optgroup.option("-r", "--runtime", help="runtime uri") # type: ignore[no-untyped-call]
@optgroup.group("\n ** Handler Build Source Configurations")
@optgroup.option("-w", "--workdir", default=".", help="work dir to search handler, the option only works for the handler build source.") # type: ignore[no-untyped-call]
@@ -152,8 +159,9 @@ def dataset_cmd(ctx: click.Context) -> None:
default=True,
help="Whether to auto label by the sub-folder name. The default value is True",
)
@optgroup.group("\n ** JsonFile Build Source Configurations")
@optgroup.group("\n ** Json Build Source Configurations")
@optgroup.option( # type: ignore[no-untyped-call]
"json_field_selector",
"--field-selector",
default="",
help=(
@@ -244,11 +252,6 @@ def dataset_cmd(ctx: click.Context) -> None:
show_default=True,
help="When True, raise exception Error if the csv is not well formed.",
)
- @optgroup.option(  # type: ignore[no-untyped-call]
-     "csv_encoding",
-     "--encoding",
-     help="The csv file encoding.",
- )
@click.pass_obj
def _build(
view: DatasetTermView,
@@ -265,8 +268,8 @@
audio_folder: str,
video_folder: str,
auto_label: bool,
- json_file: str,
- field_selector: str,
+ json_files: t.List[str],
+ json_field_selector: str,
mode: str,
hf_repo: str,
hf_subsets: t.List[str],
@@ -281,7 +284,7 @@ def _build(
csv_quotechar: str,
csv_skipinitialspace: bool,
csv_strict: bool,
- csv_encoding: str,
+ file_encoding: str,
) -> None:
"""Build Starwhale Dataset.
This command only supports building a standalone dataset.
@@ -328,11 +331,13 @@ def _build(
swcli dataset build --video-folder /path/to/video/folder # build dataset from /path/to/video/folder, search all video type files.
\b
- - from json file
- swcli dataset build --json-file /path/to/example.json
- swcli dataset build --json-file http://example.com/example.json
- swcli dataset build --json-file /path/to/example.json --field-selector a.b.c # extract the json_content["a"]["b"]["c"] field from the json file.
- swcli dataset build --name qald9 --json-file https://raw.githubusercontent.com/ag-sc/QALD/master/9/data/qald-9-test-multilingual.json --field-selector questions
+ - from json or json line files
+ swcli dataset build --json /path/to/example.json
+ swcli dataset build --json http://example.com/example.json
+ swcli dataset build --json /path/to/example.json --field-selector a.b.c # extract the json_content["a"]["b"]["c"] field from the json file.
+ swcli dataset build --name qald9 --json https://raw.githubusercontent.com/ag-sc/QALD/master/9/data/qald-9-test-multilingual.json --field-selector questions
+ swcli dataset build --json /path/to/test01.jsonl --json /path/to/test02.jsonl
+ swcli dataset build --json https://modelscope.cn/api/v1/datasets/damo/100PoisonMpts/repo\?Revision\=master\&FilePath\=train.jsonl
\b
- from huggingface dataset
@@ -373,17 +378,17 @@ def _build(
mode=mode_type,
tags=tags,
)
- elif json_file:
-     json_file = json_file.strip().rstrip("/")
-     view.build_from_json_file(
-         json_file,
-         name=name or json_file.split("/")[-1].split(".")[0],
+ elif json_files:
+     view.build_from_json_files(
+         json_files,
+         name=name or f"json-{random_str()}",
project_uri=project,
volume_size=volume_size,
alignment_size=alignment_size,
- field_selector=field_selector,
+ field_selector=json_field_selector,
mode=mode_type,
tags=tags,
+ encoding=file_encoding,
)
elif csv_files:
view.build_from_csv_files(
@@ -399,7 +404,7 @@ def _build(
quotechar=csv_quotechar,
skipinitialspace=csv_skipinitialspace,
strict=csv_strict,
- encoding=csv_encoding,
+ encoding=file_encoding,
)
elif python_handler:
_workdir = Path(workdir).absolute()
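For reference, the dotted-path lookup behind `--field-selector` can be sketched standalone; this mirrors the `_selector` helper in the model.py diff above, but the `select` function is an illustration, not the shipped code:

```python
import typing as t

def select(data: t.Any, field_selector: str) -> t.Any:
    # Walk a dotted path such as "p1.p2.p3" one key at a time.
    for field in field_selector.split("."):
        if not isinstance(data, dict):
            raise TypeError(f"field_selector only supports dict type: {data!r}")
        if field not in data:
            raise ValueError(f"{field_selector} isn't in json text: {data!r}")
        data = data[field]
    return data

assert select({"p1": {"p2": {"p3": [{"a": 1}]}}}, "p1.p2.p3") == [{"a": 1}]
```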
25 changes: 4 additions & 21 deletions client/starwhale/core/dataset/model.py
@@ -1,14 +1,12 @@
from __future__ import annotations

- import os
import typing as t
from abc import ABCMeta, abstractmethod
from http import HTTPStatus
from pathlib import Path
from collections import defaultdict

import yaml
- import requests

from starwhale.utils import console, load_yaml
from starwhale.consts import (
@@ -33,7 +31,6 @@
from starwhale.utils.http import ignore_error
from starwhale.base.bundle import BaseBundle, LocalStorageBundleMixin
from starwhale.utils.error import NoSupportError
- from starwhale.utils.retry import http_retry
from starwhale.base.uri.project import Project
from starwhale.base.uri.resource import Resource, ResourceType
from starwhale.core.dataset.copy import DatasetCopy
@@ -81,7 +78,7 @@ def build_from_folder(
) -> None:
raise NotImplementedError

- def build_from_json_file(self, json_file_path: str, **kwargs: t.Any) -> None:
+ def build_from_json_files(self, paths: t.List[PathLike], **kwargs: t.Any) -> None:
raise NotImplementedError

def build_from_csv_files(self, paths: t.List[PathLike], **kwargs: t.Any) -> None:
@@ -268,26 +265,12 @@ def build_from_huggingface(self, repo: str, **kwargs: t.Any) -> None:
f"[red bold blink] swcli dataset info {self.name}/version/{ds.committed_version[:SHORT_VERSION_CNT]}[/]"
)

- def build_from_json_file(self, json_file_path: str | Path, **kwargs: t.Any) -> None:
+ def build_from_json_files(self, paths: t.List[PathLike], **kwargs: t.Any) -> None:
     from starwhale.api._impl.dataset.model import Dataset as SDKDataset

-     json_file_path = str(json_file_path)
-     if json_file_path.startswith(("http://", "https://")):
-
-         @http_retry
-         def _r(url: str) -> str:
-             return requests.get(url, verify=False, timeout=90).text
-
-         json_text = _r(json_file_path)
-     elif os.path.isfile(json_file_path):
-         json_text = Path(json_file_path).read_text()
-     else:
-         raise RuntimeError(f"json file path:{json_file_path} not exists")
-
-     ds = SDKDataset.from_json(name=self.name, json_text=json_text, **kwargs)
+     ds = SDKDataset.from_json(name=self.name, path=paths, **kwargs)

     console.print(
-         f":hibiscus: congratulation! dataset build from {json_file_path} has been built. You can run "
+         f":hibiscus: congratulations! the dataset built from {paths} has been committed. You can run "
f"[red bold blink] swcli dataset info {self.name}/version/{ds.committed_version[:SHORT_VERSION_CNT]}[/]"
)

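One consequence of this hunk: the hand-rolled requests/http_retry download path is gone, and http(s) urls now flow through the same `iter_pathlike_io` handling as local files inside `Dataset.from_json`. A sketch of the resulting call, with a placeholder url:

```python
from starwhale import Dataset

# Remote json/jsonl files are fetched by the shared path-handling layer,
# not by an ad-hoc requests call in build_from_json_files.
ds = Dataset.from_json(name="remote-ds", path="https://example.com/data.jsonl")
```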