Skip to content

Commit

Permalink
Refactor and simplify plugins (#22)
Browse files Browse the repository at this point in the history
- Extracted common code for Importers, Transforms, Extractors, removed unnecessary code
- Improved caching in transforms. Previously, creating a transform could lead to an unnecessary traversal of the source dataset, with a corresponding performance cost
- Dataset class moved to its own file
- All DatasetItems and Extractors now use default subset name "default" instead of "None". The values are interchangeable
  • Loading branch information
Maxim Zhiltsov authored Oct 1, 2020
1 parent 785b4d9 commit 48105f1
Show file tree
Hide file tree
Showing 42 changed files with 546 additions and 1,018 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
-

### Changed
-
- Implementation of format plugins simplified (<https://github.com/openvinotoolkit/datumaro/pull/22>)
- `default` is now the default subset name, instead of `None`. The two values are interchangeable. (<https://github.com/openvinotoolkit/datumaro/pull/22>)
- Improved performance of transforms (<https://github.com/openvinotoolkit/datumaro/pull/22>)

### Deprecated
-
Expand Down
9 changes: 2 additions & 7 deletions datumaro/components/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,7 @@
from datumaro.util.image import save_image


class IConverter:
@classmethod
def convert(cls, extractor, save_dir, **options):
raise NotImplementedError("Should be implemented in a subclass")

class Converter(IConverter, CliPlugin):
class Converter(CliPlugin):
DEFAULT_IMAGE_EXT = None

@classmethod
Expand Down Expand Up @@ -65,7 +60,7 @@ def _save_image(self, item, path=None):
image = item.image.data
if image is None:
log.warning("Item '%s' has no image", item.id)
return item.image.path
return

path = path or self._make_image_filename(item)

Expand Down
186 changes: 186 additions & 0 deletions datumaro/components/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
# Copyright (C) 2020 Intel Corporation
#
# SPDX-License-Identifier: MIT

from collections import OrderedDict, defaultdict
from typing import Iterable, Union, Dict, List

from datumaro.components.extractor import (Extractor, LabelCategories,
AnnotationType, DatasetItem, DEFAULT_SUBSET_NAME)
from datumaro.components.dataset_filter import \
XPathDatasetFilter, XPathAnnotationsFilter


class Dataset(Extractor):
    """
    An in-memory dataset: items are grouped into named subsets, and
    every subset keeps its items in insertion order, keyed by item id.

    Categories are stored once on the dataset and are shared by all of
    its subsets.
    """

    class Subset(Extractor):
        """
        A view over the items of a single subset of the parent dataset.

        Items live in ``self.items`` (an ``OrderedDict`` keyed by item id);
        categories are always delegated to the parent dataset.
        """

        def __init__(self, parent):
            self.parent = parent
            self.items = OrderedDict()

        def __iter__(self):
            yield from self.items.values()

        def __len__(self):
            return len(self.items)

        def categories(self):
            return self.parent.categories()

    @classmethod
    def from_iterable(cls, iterable: Iterable[DatasetItem],
            categories: Union[Dict, List[str]] = None):
        """
        Builds a dataset from a plain iterable of items.

        Parameters:
            iterable: the items to put into the dataset. It is iterated
                once, by from_extractors().
            categories: either a ready categories dict, or a list of
                label names, which is converted to label categories with
                LabelCategories.from_iterable(). A missing or empty value
                results in an empty dict.

        Returns the dataset produced by from_extractors().
        """
        if isinstance(categories, list):
            categories = { AnnotationType.label:
                LabelCategories.from_iterable(categories)
            }

        if not categories:
            categories = {}

        # A minimal adapter, so that the iterable can go through the same
        # merging code path as regular extractors
        class _extractor(Extractor):
            def __iter__(self):
                return iter(iterable)

            def categories(self):
                return categories

        return cls.from_extractors(_extractor())

    @classmethod
    def from_extractors(cls, *sources):
        """
        Builds a dataset by merging the categories and the items of
        the given sources.

        Items having the same subset and id are merged into one item
        with _merge_items(). The merged item keeps its path only if both
        source items have the same path, otherwise the path is reset
        to None.
        """
        categories = cls._merge_categories(s.categories() for s in sources)
        dataset = Dataset(categories=categories)

        # merge items
        subsets = defaultdict(lambda: cls.Subset(dataset))
        for source in sources:
            for item in source:
                existing_item = subsets[item.subset].items.get(item.id)
                if existing_item is not None:
                    path = existing_item.path
                    if item.path != path:
                        path = None
                    item = cls._merge_items(existing_item, item, path=path)

                subsets[item.subset].items[item.id] = item

        # Store a plain dict, so that later lookups of unknown subsets
        # raise KeyError instead of silently creating empty subsets
        dataset._subsets = dict(subsets)
        return dataset

    def __init__(self, categories=None):
        """
        Creates an empty dataset.

        Parameters:
            categories: the categories dict of the dataset. A missing
                or empty value results in an empty dict.
        """
        super().__init__()

        self._subsets = {}

        if not categories:
            categories = {}
        self._categories = categories

    def __iter__(self):
        for subset in self._subsets.values():
            yield from subset

    def __len__(self):
        # The total is computed lazily and cached; put() resets the cache.
        # NOTE(review): self._length is not assigned in this class before
        # the first read - presumably it is initialized by
        # Extractor.__init__(), confirm
        if self._length is None:
            self._length = sum(len(s) for s in self._subsets.values())
        return self._length

    def get_subset(self, name):
        """Returns the subset with the given name, or raises KeyError."""
        return self._subsets[name]

    def subsets(self):
        """Returns the mapping of subset names to subsets."""
        return self._subsets

    def categories(self):
        """Returns the categories dict of the dataset."""
        return self._categories

    def get(self, item_id, subset=None, path=None):
        """
        Returns the item with the given id from the given subset.

        Parameters:
            item_id: the item id, it is converted to a string for lookup.
            subset: the subset name; a missing or empty value means
                DEFAULT_SUBSET_NAME.
            path: must be empty, nested paths are not supported here.

        Raises KeyError if a path is requested, or if there is no such
        subset or item.
        """
        if path:
            raise KeyError("Requested dataset item path is not found")
        item_id = str(item_id)
        subset = subset or DEFAULT_SUBSET_NAME
        subset = self._subsets[subset]
        return subset.items[item_id]

    def put(self, item, item_id=None, subset=None, path=None):
        """
        Adds an item to the dataset, replacing an existing item with
        the same subset and id.

        Parameters:
            item: the item to store.
            item_id: the id to store the item under; the id of the item
                is used by default. It is converted to a string.
            subset: the subset to put the item into; the subset of
                the item is used by default. A missing subset is created.
            path: must be empty, nested paths are not supported here.

        Returns the stored item, which is a copy of the input item made
        with item.wrap() with the final id and subset, and no path.

        Raises KeyError if a path is requested.
        """
        if path:
            raise KeyError("Requested dataset item path is not found")

        if item_id is None:
            item_id = item.id
        # get() looks items up by str(item_id), so the key has to be
        # normalized the same way here. Otherwise, an item stored with
        # a non-string id could never be retrieved with get()
        item_id = str(item_id)
        if subset is None:
            subset = item.subset

        item = item.wrap(id=item_id, subset=subset, path=None)
        if subset not in self._subsets:
            self._subsets[subset] = self.Subset(self)
        self._subsets[subset].items[item_id] = item
        # The cached length is not valid anymore
        self._length = None

        return item

    def filter(self, expr, filter_annotations=False, remove_empty=False):
        """
        Applies an XPath filter to the dataset and returns the result
        of self.transform().

        Parameters:
            expr: the filter expression.
            filter_annotations: if set, annotations are filtered
                (XPathAnnotationsFilter) instead of items
                (XPathDatasetFilter).
            remove_empty: passed to the annotations filter only.
        """
        if filter_annotations:
            return self.transform(XPathAnnotationsFilter, expr, remove_empty)
        else:
            return self.transform(XPathDatasetFilter, expr)

    def update(self, items):
        """Puts all the given items into the dataset. Returns self."""
        for item in items:
            self.put(item)
        return self

    def define_categories(self, categories):
        """
        Sets the categories of the dataset. The categories are expected
        to be undefined (empty) at this point.
        """
        assert not self._categories
        self._categories = categories

    @staticmethod
    def _lazy_image(item):
        # NOTE: avoid https://docs.python.org/3/faq/programming.html#why-do-lambdas-defined-in-a-loop-with-different-values-all-return-the-same-result
        return lambda: item.image

    @classmethod
    def _merge_items(cls, existing_item, current_item, path=None):
        """
        Returns a copy of the existing item with the given path, and
        the images and annotations merged from both items.
        """
        return existing_item.wrap(path=path,
            image=cls._merge_images(existing_item, current_item),
            annotations=cls._merge_anno(
                existing_item.annotations, current_item.annotations))

    @staticmethod
    def _merge_images(existing_item, current_item):
        """
        Selects the image for a merged item.

        If only one item has an image, that image is used. If both have
        one, the existing image is preferred when it has data, otherwise
        the current one is taken; the missing path and size information
        is then filled in from the other image.

        Note that the selected image object is modified in place
        (its _path and _size attributes), it is not copied.

        The sizes of the images must be equal, if both are known.
        """
        image = None
        if existing_item.has_image and current_item.has_image:
            if existing_item.image.has_data:
                image = existing_item.image
            else:
                image = current_item.image

            # Take the path of the current image only if the existing
            # image has none
            if existing_item.image.path != current_item.image.path:
                if not existing_item.image.path:
                    image._path = current_item.image.path

            if all([existing_item.image._size, current_item.image._size]):
                assert existing_item.image._size == current_item.image._size, \
                    "Image info differs for item '%s'" % existing_item.id
            elif existing_item.image._size:
                image._size = existing_item.image._size
            else:
                image._size = current_item.image._size
        elif existing_item.has_image:
            image = existing_item.image
        else:
            image = current_item.image

        return image

    @staticmethod
    def _merge_anno(a, b):
        # TODO: implement properly with merging and annotations remapping
        # Imported locally, as in the original code (presumably to avoid
        # a circular import - confirm)
        from .operations import merge_annotations_equal
        return merge_annotations_equal(a, b)

    @staticmethod
    def _merge_categories(sources):
        # TODO: implement properly with merging and annotations remapping
        from .operations import merge_categories
        return merge_categories(sources)
Loading

0 comments on commit 48105f1

Please sign in to comment.