openvinotoolkit · zhiltsov-max · Sep 29, 2021 · Sep 8, 2021 · Sep 23, 2021 · Sep 23, 2021
@@ -16,11 +16,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - "dataset revpath" concept in CLI, allowing to pass a dataset path with
   the dataset format in `diff`, `merge`, `explain` and `info` CLI commands
   (<https://github.com/openvinotoolkit/datumaro/pull/238>)
-- `add`, `remove`, `commit`, `checkout`, `log`, `status`, `info` CLI commands
+- `import`, `remove`, `commit`, `checkout`, `log`, `status`, `info` CLI commands
   (<https://github.com/openvinotoolkit/datumaro/pull/238>)
 - `Coco*Extractor` classes now have an option to preserve label IDs from the
   original annotation file
   (<https://github.com/openvinotoolkit/datumaro/pull/453>)
+- `patch` CLI command to patch datasets
+  (<https://github.com/openvinotoolkit/datumaro/pull/401>)
 
 ### Changed
 - A project can contain and manage multiple datasets instead of a single one.
@@ -46,6 +48,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `Importer`s do not create `Project`s anymore and just return a list of
   extractor configurations
   (<https://github.com/openvinotoolkit/datumaro/pull/238>)
+- `ProjectLabels` transform (library API only) to arrange dataset labels
+  for merging etc. (<https://github.com/openvinotoolkit/datumaro/pull/401>)
 
 ### Deprecated
 - TBD

@@ -81,6 +81,7 @@ def _get_known_commands():
         ('filter', commands.filter, "Filter dataset items"),
         ('transform', commands.transform, "Modify dataset items"),
         ('merge', commands.merge, "Merge datasets"),
+        ('patch', commands.patch, "Update dataset from another one"),
         ('convert', commands.convert, "Convert dataset between formats"),
         ('diff', commands.diff, "Compare datasets"),
         ('stats', commands.stats, "Compute dataset statistics"),

@@ -6,5 +6,5 @@
 
 from . import (
     checkout, commit, convert, create, diff, explain, export, filter, import_,
-    info, log, merge, remove, stats, status, transform, validate,
+    info, log, merge, patch, remove, stats, status, transform, validate,
 )
@@ -134,7 +134,7 @@ def _parse_comparison_method(s):
 
 def get_sensitive_args():
     return {
-        diff_command: ['first_taret', 'second_target', 'dst_dir', 'project_dir',],
+        diff_command: ['first_target', 'second_target', 'dst_dir', 'project_dir',],
     }
 
 @scoped

@@ -0,0 +1,147 @@
+# Copyright (C) 2021 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+import argparse
+import logging as log
+import os
+import os.path as osp
+
+from datumaro.components.environment import Environment
+from datumaro.components.errors import ProjectNotFoundError
+from datumaro.util.scope import scope_add, scoped
+
+from ..util import MultilineFormatter
+from ..util.errors import CliException
+from ..util.project import load_project, parse_full_revpath
+
+
+def build_parser(parser_ctor=argparse.ArgumentParser):
+    parser = parser_ctor(help="Updates dataset from another one",
+        description="""
+        Updates items of the first dataset with items from the second one.
+        By default, datasets are updated in-place. The '-o/--output-dir'
+        option can be used to specify another output directory. When
+        updating inplace, use the '--overwrite' parameter along with the
+        '--save-images' export option (inplace updates fail by default
+        to prevent data loss).|n
+        |n
+        Unlike the regular project data source joining, the datasets are not
+        required to have the same labels. The labels from the "patch"
+        dataset are projected onto the labels of the patched dataset,
+        so only the annotations with the matching labels are used, i.e.
+        all the annotations having unknown labels are ignored. Currently,
+        this command doesn't allow to update the label information in the
+        patched dataset.|n
+        |n
+        The command supports passing extra exporting options for the output
+        dataset. The extra options should be passed after the main arguments
+        and after the '--' separator. Particularly, this is useful to include
+        images in the output dataset with '--save-images'.|n
+        |n
+        This command can be applied to the current project targets or
+        arbitrary datasets outside a project. Note that if the destination
+        is read-only (e.g. if it is a project, stage or a cache entry),
+        the output directory must be provided.|n
+        |n
+        This command has the following invocation syntax:
+        - %(prog)s <target dataset revpath> <patch dataset revpath>|n
+        |n
+        <revpath> - either a dataset path or a revision path. The full
+        syntax is:|n
+        - Dataset paths:|n
+        |s|s- <dataset path>[ :<format> ]|n
+        - Revision paths:|n
+        |s|s- <project path> [ @<rev> ] [ :<target> ]|n
+        |s|s- <rev> [ :<target> ]|n
+        |s|s- <target>|n
+        |n
+        The current project (-p/--project) is also used as a context for
+        plugins, so it can be useful for dataset paths having custom formats.
+        When not specified, the current project's working tree is used.|n
+        |n
+        Examples:|n
+        - Update a VOC-like dataset with a COCO-like annotations:|n
+        |s|s%(prog)s --overwrite dataset1/:voc dataset2/:coco -- --save-images|n
+        |n
+        - Generate a patched dataset, based on a project:|n
+        |s|s%(prog)s -o patched_proj1/ proj1/ proj2/|n
+        |n
+        - Update the "source1" the current project with a dataset:|n
+        |s|s%(prog)s -p proj/ --overwrite source1 path/to/dataset2:coco|n
+        |n
+        - Generate a patched source from a previous revision and a dataset:|n
+        |s|s%(prog)s -o new_src2/ HEAD~2:source-2 path/to/dataset2:yolo|n
+        |n
+        - Update a dataset in a custom format, described in a project plugin:|n
+        |s|s%(prog)s -p proj/ --overwrite dataset/:my_format dataset2/:coco
+        """,
+        formatter_class=MultilineFormatter)
+
+    parser.add_argument('target', help="Target dataset revpath")
+    parser.add_argument('patch', help="Patch dataset revpath")
+    parser.add_argument('-o', '--output-dir', dest='dst_dir', default=None,
+        help="Output directory (default: save in-place)")
+    parser.add_argument('--overwrite', action='store_true',
+        help="Overwrite existing files in the save directory, "
+            "if it is not empty")
+    parser.add_argument('-p', '--project', dest='project_dir',
+        help="Directory of the 'current' project (default: current dir)")
+    parser.add_argument('extra_args', nargs=argparse.REMAINDER,
+        help="Additional arguments for exporting (pass '-- -h' for help). "
+            "Must be specified after the main command arguments and after "
+            "the '--' separator")
+    parser.set_defaults(command=patch_command)
+
+    return parser
+
+def get_sensitive_args():
+    return {
+        patch_command: ['target', 'patch', 'dst_dir', 'project_dir',],
+    }
+
+@scoped
+def patch_command(args):
+    project = None
+    try:
+        project = scope_add(load_project(args.project_dir))
+    except ProjectNotFoundError:
+        if args.project_dir:
+            raise
+
+    if project is not None:
+        env = project.env
+    else:
+        env = Environment()
+
+    target_dataset, _project = parse_full_revpath(args.target, project)
+    if _project is not None:
+        scope_add(_project)
+
+    try:
+        converter = env.converters[target_dataset.format]
+    except KeyError:
+        raise CliException("Converter for format '%s' is not found" % \
+            args.format)
+
+    extra_args = converter.parse_cmdline(args.extra_args)
+
+    dst_dir = args.dst_dir
+    if not dst_dir:
+        dst_dir = target_dataset.data_path
+    if not args.overwrite and osp.isdir(dst_dir) and os.listdir(dst_dir):
+        raise CliException("Directory '%s' already exists "
+            "(pass --overwrite to overwrite)" % dst_dir)
+    dst_dir = osp.abspath(dst_dir)
+
+    patch_dataset, _project = parse_full_revpath(args.patch, project)
+    if _project is not None:
+        scope_add(_project)
+
+    target_dataset.update(patch_dataset)
+
+    target_dataset.save(save_dir=dst_dir, **extra_args)
+
+    log.info("Patched dataset has been saved to '%s'" % dst_dir)
+
+    return 0
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: MIT
 
 from enum import Enum, auto
-from typing import Tuple
+from typing import Tuple, Union
 
 from attr import attrib, attrs
 import attr
@@ -110,8 +110,11 @@ def find(self, name: str):
     def __getitem__(self, idx):
         return self.items[idx]
 
-    def __contains__(self, idx):
-        return 0 <= idx and idx < len(self.items)
+    def __contains__(self, value: Union[int, str]):
+        if isinstance(value, str):
+            return self.find(value)[1] is not None
+        else:
+            return 0 <= value and value < len(self.items)
 
     def __len__(self):
         return len(self.items)

@@ -19,13 +19,15 @@
 )
 from datumaro.components.environment import Environment
 from datumaro.components.errors import (
-    CategoriesRedefinedError, MultipleFormatsMatchError, NoMatchingFormatsError,
-    RepeatedItemError, UnknownFormatError,
+    CategoriesRedefinedError, ConflictingCategoriesError,
+    MultipleFormatsMatchError, NoMatchingFormatsError, RepeatedItemError,
+    UnknownFormatError,
 )
 from datumaro.components.extractor import (
     DEFAULT_SUBSET_NAME, CategoriesInfo, DatasetItem, Extractor, IExtractor,
     ItemTransform, Transform,
 )
+from datumaro.plugins.transforms import ProjectLabels
 from datumaro.util import is_method_redefined
 from datumaro.util.log_utils import logging_disabled
 from datumaro.util.os_util import rmtree
@@ -552,7 +554,7 @@ def transform(self, method: Type[Transform], *args, **kwargs):
         self._length = None
 
     def has_updated_items(self):
-        return self._transforms or self._updated_items
+        return bool(self._transforms) or bool(self._updated_items)
 
     def get_patch(self):
         # Patch includes only added or modified items.
@@ -572,14 +574,33 @@ def get_patch(self):
             else:
                 patch.put(self._storage.get(item_id, subset))
 
-        return DatasetPatch(patch, self._categories,
-            self._updated_items)
+        return DatasetPatch(patch, self._categories, self._updated_items)
 
     def flush_changes(self):
         self._updated_items = {}
         if not (self.is_cache_initialized() or self._is_unchanged_wrapper):
             self._flush_changes = True
 
+    def update(self,
+            source: Union[DatasetPatch, IExtractor, Iterable[DatasetItem]]):
+        # TODO: provide a more efficient implementation with patch reuse
+
+        if isinstance(source, DatasetPatch):
+            if source.categories() != self.categories():
+                raise ConflictingCategoriesError()
+
+            for item_id, status in source.updated_items.items():
+                if status == ItemStatus.removed:
+                    self.remove(*item_id)
+                else:
+                    self.put(source.data.get(*item_id))
+        elif isinstance(source, IExtractor):
+            for item in ProjectLabels(source, self.categories().get(
+                    AnnotationType.label, LabelCategories())):
+                self.put(item)
+        else:
+            for item in source:
+                self.put(item)
 
 class Dataset(IDataset):
     _global_eager = False
@@ -691,9 +712,22 @@ def filter(self, expr: str, filter_annotations: bool = False,
         else:
             return self.transform(XPathDatasetFilter, expr)
 
-    def update(self, items: Iterable[DatasetItem]) -> 'Dataset':
-        for item in items:
-            self.put(item)
+    def update(self,
+            source: Union[DatasetPatch, IExtractor, Iterable[DatasetItem]]) \
+                -> 'Dataset':
+        """
+        Updates items of the current dataset from other dataset or an
+        iterable (the source). Items from the source overwrite matching
+        items in the current dataset. Unmatched items are just appended.
+
+        If the source is a DatasetPatch, the removed items in the patch
+        will be removed in the current dataset.
+
+        If the source is a dataset, labels are matched. If the labels match,
+        but the order is different, the annotation labels will be remapped to
+        the current dataset label order during updating.
+        """
+        self._data.update(source)
         return self
 
     def transform(self, method: Union[str, Type[Transform]],
@@ -793,6 +827,9 @@ def flush_changes(self):
 
     @scoped
     def export(self, save_dir: str, format, **kwargs):
+        if not save_dir:
+            raise ValueError("Dataset export path is not specified")
+
         inplace = (save_dir == self._source_path and format == self._format)
 
         if isinstance(format, str):