Skip to content

Commit

Permalink
Support for LFW dataset format (cvat-ai#110)
Browse files Browse the repository at this point in the history
* add support for LFW dataset format

* update documentation

* update Changelog

Co-authored-by: Maxim Zhiltsov <[email protected]>
  • Loading branch information
yasakova-anastasia and Maxim Zhiltsov authored Feb 26, 2021
1 parent 1325eef commit dad5c05
Show file tree
Hide file tree
Showing 10 changed files with 295 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `Icdar13/15` dataset format (<https://github.com/openvinotoolkit/datumaro/pull/96>)
- Laziness, source caching, tracking of changes and partial updating for `Dataset` (<https://github.com/openvinotoolkit/datumaro/pull/102>)
- `Market-1501` dataset format (<https://github.com/openvinotoolkit/datumaro/pull/108>)
- `LFW` dataset format (<https://github.com/openvinotoolkit/datumaro/pull/110>)

### Changed
- OpenVINO model launcher is updated for OpenVINO r2021.1 (<https://github.com/openvinotoolkit/datumaro/pull/100>)
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ CVAT annotations ---> Publication, statistics etc.
- [LabelMe](http://labelme.csail.mit.edu/Release3.0)
- [ICDAR13/15](https://rrc.cvc.uab.es/?ch=2) (`word_recognition`, `text_localization`, `text_segmentation`)
- [Market-1501](https://www.aitribune.com/dataset/2018051063) (`person re-identification`)
- [LFW](http://vis-www.cs.umass.edu/lfw/) (`person re-identification`, `landmarks`)
- Dataset building
- Merging multiple datasets into one
- Dataset filtering by custom criteria:
Expand Down
135 changes: 135 additions & 0 deletions datumaro/plugins/lfw_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# Copyright (C) 2020 Intel Corporation
#
# SPDX-License-Identifier: MIT

import os
import os.path as osp
import re

from datumaro.components.converter import Converter
from datumaro.components.extractor import (AnnotationType, DatasetItem,
Importer, Points, SourceExtractor)


class LfwPath:
    """File names, extension and id pattern shared by the LFW classes below."""
    # Sub-directory of a subset directory that holds the images.
    IMAGES_DIR = 'images'
    # Optional per-subset file with tab-separated landmark coordinates.
    LANDMARKS_FILE = 'landmarks.txt'
    # Per-subset file listing positive (3 fields) and negative (4 fields) pairs.
    PAIRS_FILE = 'pairs.txt'
    # Extension appended to an item id to form its image file name.
    IMAGE_EXT = '.jpg'
    # Captures two groups, "<word chars>_<digits or dashes>"; the converter
    # uses them as the person name and the image number of an item id.
    PATTERN = re.compile(r'([\w]+)_([-\d]+)')

class LfwExtractor(SourceExtractor):
    """Reads one subset of an LFW-style dataset, starting from its pairs file.

    The layout is derived from the path of the pairs file:

        <dataset_dir>/<subset>/pairs.txt
        <dataset_dir>/<subset>/landmarks.txt    (optional)
        <dataset_dir>/<subset>/images/<person>/<person>_<NNNN>.jpg

    Every image mentioned in the pairs file or in the landmarks file becomes
    a DatasetItem with the 'positive_pairs' and 'negative_pairs' attributes
    (lists of item ids). Landmarks are attached as Points annotations.
    """

    def __init__(self, path):
        """
        Args:
            path: path to the pairs file of a subset.

        Raises:
            NotADirectoryError: if `path` is not an existing file. The
                exception type is kept as is for backward compatibility.
        """
        if not osp.isfile(path):
            raise NotADirectoryError("Can't read annotation file '%s'" % path)
        super().__init__(subset=osp.basename(osp.dirname(path)))
        self._dataset_dir = osp.dirname(osp.dirname(path))
        self._items = list(self._load_items(path).values())

    def _load_items(self, path):
        """Parses the pairs file and the optional landmarks file.

        Returns:
            A dict mapping item id -> DatasetItem, in order of first mention.
        """
        items = {}
        images_dir = osp.join(self._dataset_dir, self._subset,
            LfwPath.IMAGES_DIR)

        with open(path, encoding='utf-8') as f:
            for line in f:
                pair = line.strip().split()
                if len(pair) == 3:
                    # "<person> <n1> <n2>": two images of the same person
                    image1 = self.get_image_name(pair[0], pair[1])
                    image2 = self.get_image_name(pair[0], pair[2])
                    pair_kind = 'positive_pairs'
                elif len(pair) == 4:
                    # "<person1> <n1> <person2> <n2>": different persons
                    image1 = self.get_image_name(pair[0], pair[1])
                    image2 = self.get_image_name(pair[2], pair[3])
                    pair_kind = 'negative_pairs'
                else:
                    # Header lines (e.g. counts) and blank lines carry no pair
                    continue

                for image in (image1, image2):
                    self._get_or_add_item(items, image,
                        osp.join(images_dir, image + LfwPath.IMAGE_EXT))

                # A pair is recorded only on its first image
                items[image1].attributes[pair_kind].append(image2)

        landmarks_file = osp.join(self._dataset_dir, self._subset,
            LfwPath.LANDMARKS_FILE)
        if osp.isfile(landmarks_file):
            with open(landmarks_file, encoding='utf-8') as f:
                for line in f:
                    # Strip the line terminator (and stray trailing tabs)
                    # first, so that blank lines are skipped instead of
                    # producing an item with a bogus id such as '\n'
                    line = line.strip()
                    if not line:
                        continue
                    fields = line.split('\t')

                    item_id = fields[0]
                    if item_id.endswith(LfwPath.IMAGE_EXT):
                        item_id = item_id[:-len(LfwPath.IMAGE_EXT)]
                    item = self._get_or_add_item(items, item_id,
                        osp.join(images_dir, fields[0]))

                    coordinates = [float(p) for p in fields[1:]]
                    if coordinates:
                        # A line with a name only carries no landmarks
                        item.annotations.append(Points(coordinates))
        return items

    def _get_or_add_item(self, items, item_id, image_path):
        """Returns items[item_id], creating an item without pairs if missing."""
        item = items.get(item_id)
        if item is None:
            item = DatasetItem(id=item_id, subset=self._subset,
                image=image_path,
                attributes={'positive_pairs': [], 'negative_pairs': []})
            items[item_id] = item
        return item

    @staticmethod
    def get_image_name(person, image_id):
        """Builds an item id: '<person>/<person>_<image_id as 4 digits>'."""
        return '{}/{}_{:04d}'.format(person, person, int(image_id))

class LfwImporter(Importer):
    """Importer that registers sources of the 'lfw' format."""
    @classmethod
    def find_sources(cls, path):
        # Delegates to the base-class helper with the pairs file name and the
        # 'lfw' format name. NOTE(review): presumably it searches `path`
        # recursively for files named 'pairs.txt' - confirm in Importer.
        return cls._find_sources_recursive(path, LfwPath.PAIRS_FILE, 'lfw')

class LfwConverter(Converter):
    """Writes a dataset in the LFW layout, one directory per subset.

    For every subset, the following is produced in <save_dir>/<subset>/:
      - images/<item id>.jpg, when image saving is enabled;
      - pairs.txt with the positive pairs followed by the negative pairs;
      - landmarks.txt, only if some item has Points annotations.
    """

    DEFAULT_IMAGE_EXT = '.jpg'

    def apply(self):
        """Exports all the subsets of the source dataset.

        Raises:
            ValueError: if an item that takes part in a pair has an id from
                which a person name and an image number can't be extracted.
        """
        for subset_name, subset in self._extractor.subsets().items():
            positive_pairs = []
            negative_pairs = []
            landmarks = []
            for item in subset:
                if item.has_image and self._save_images:
                    self._save_image(item, osp.join(self._save_dir,
                        subset_name, LfwPath.IMAGES_DIR,
                        item.id + LfwPath.IMAGE_EXT))

                same_person = item.attributes.get('positive_pairs') or ()
                other_persons = item.attributes.get('negative_pairs') or ()
                if same_person or other_persons:
                    # The id is parsed only when there are pairs to write, so
                    # items with landmarks only are not limited in naming
                    person1, num1 = self._split_item_id(item.id)

                for pair in same_person:
                    # A positive pair shares the person with the item,
                    # only the image number of the other image is written
                    num2 = self._split_item_id(pair)[1]
                    positive_pairs.append('%s\t%s\t%s' % \
                        (person1, num1, num2))
                for pair in other_persons:
                    person2, num2 = self._split_item_id(pair)
                    negative_pairs.append('%s\t%s\t%s\t%s' % \
                        (person1, num1, person2, num2))

                for landmark in item.annotations:
                    if landmark.type != AnnotationType.points:
                        continue
                    landmarks.append('%s\t%s' % (item.id + LfwPath.IMAGE_EXT,
                        '\t'.join(str(p) for p in landmark.points)))

            pairs_file = osp.join(self._save_dir, subset_name,
                LfwPath.PAIRS_FILE)
            os.makedirs(osp.dirname(pairs_file), exist_ok=True)
            with open(pairs_file, 'w', encoding='utf-8') as f:
                f.writelines('%s\n' % pair for pair in positive_pairs)
                f.writelines('%s\n' % pair for pair in negative_pairs)

            if landmarks:
                landmarks_file = osp.join(self._save_dir, subset_name,
                    LfwPath.LANDMARKS_FILE)
                with open(landmarks_file, 'w', encoding='utf-8') as f:
                    f.writelines('%s\n' % landmark for landmark in landmarks)

    @staticmethod
    def _split_item_id(item_id):
        """Splits an item id into (person name, image number as int).

        The file name part of the id is split at its last underscore, which
        inverts the '<person>/<person>_<number>' ids built by the extractor.
        Unlike a pattern search over the whole id, this can't match inside
        the directory part and keeps person names containing '-'.
        Ids that can't be split this way fall back to LfwPath.PATTERN.

        Raises:
            ValueError: if neither way yields a name and a number.
        """
        person, _, number = osp.basename(item_id).rpartition('_')
        if person:
            try:
                return person, int(number)
            except ValueError:
                pass # not a number, try the pattern below

        match = LfwPath.PATTERN.search(item_id)
        if match:
            person, number = match.groups()
            try:
                return person, int(number)
            except ValueError:
                pass # e.g. the number group consists of dashes only

        raise ValueError("Can't extract a person name and an image number "
            "from the item id '%s', expected '<person>/<person>_<number>'" % \
            item_id)
3 changes: 3 additions & 0 deletions docs/user_manual.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,9 @@ List of supported formats:
- Market-1501 (`person re-identification`)
- [Format specification](https://www.aitribune.com/dataset/2018051063)
- [Dataset example](../tests/assets/market1501_dataset)
- LFW (`person re-identification`, `landmarks`)
- [Format specification](http://vis-www.cs.umass.edu/lfw/)
- [Dataset example](../tests/assets/lfw_dataset)

List of supported annotation types:
- Labels
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3 changes: 3 additions & 0 deletions tests/assets/lfw_dataset/test/landmarks.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
name0/name0_0001.jpg 0 4 3 3 2 2 1 0 3 0
name1/name1_0001.jpg 1 6 4 6 3 3 2 1 4 1
name1/name1_0002.jpg 0 5 3 5 2 2 1 0 3 0
5 changes: 5 additions & 0 deletions tests/assets/lfw_dataset/test/pairs.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
1 2
name1 1 2
name0 1 name1 1
name0 1 name1 2

147 changes: 147 additions & 0 deletions tests/test_lfw_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import os.path as osp
from unittest import TestCase

import numpy as np
from datumaro.components.dataset import Dataset
from datumaro.components.extractor import DatasetItem, Points
from datumaro.plugins.lfw_format import LfwConverter, LfwImporter
from datumaro.util.test_utils import TestDir, compare_datasets


class LfwFormatTest(TestCase):
    """Checks that datasets survive an export to LFW and a re-import."""

    @staticmethod
    def _make_item(item_id, positive, negative, points=None, subset=None):
        # The optional arguments are passed to DatasetItem only when given,
        # so that its own defaults apply otherwise
        extra = {}
        if subset is not None:
            extra['subset'] = subset
        if points is not None:
            extra['annotations'] = [Points(points)]
        return DatasetItem(id=item_id, image=np.ones((2, 5, 3)),
            attributes={
                'positive_pairs': positive,
                'negative_pairs': negative,
            },
            **extra)

    def _check_save_and_load(self, source_dataset):
        # The comparison stays inside the directory scope, while the saved
        # files still exist
        with TestDir() as test_dir:
            LfwConverter.convert(source_dataset, test_dir, save_images=True)
            parsed_dataset = Dataset.import_from(test_dir, 'lfw')

            compare_datasets(self, source_dataset, parsed_dataset)

    def test_can_save_and_load(self):
        make = self._make_item
        self._check_save_and_load(Dataset.from_iterable([
            make('name0/name0_0001', ['name0/name0_0002'], [],
                subset='test'),
            make('name0/name0_0002', [], ['name1/name1_0001'],
                subset='test'),
            make('name1/name1_0001', ['name1/name1_0002'], [],
                subset='test'),
            make('name1/name1_0002', [], ['name0/name0_0001'],
                subset='test'),
        ]))

    def test_can_save_and_load_with_landmarks(self):
        make = self._make_item
        self._check_save_and_load(Dataset.from_iterable([
            make('name0/name0_0001', ['name0/name0_0002'], [],
                points=[0, 4, 3, 3, 2, 2, 1, 0, 3, 0], subset='test'),
            make('name0/name0_0002', [], [],
                points=[0, 5, 3, 5, 2, 2, 1, 0, 3, 0], subset='test'),
        ]))

    def test_can_save_and_load_with_no_subsets(self):
        make = self._make_item
        self._check_save_and_load(Dataset.from_iterable([
            make('name0/name0_0001', ['name0/name0_0002'], []),
            make('name0/name0_0002', [], []),
        ]))

# Sample LFW dataset stored next to this test file, in assets/lfw_dataset
DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'lfw_dataset')

class LfwImporterTest(TestCase):
    """Checks detection and import of the sample LFW dataset."""

    def test_can_detect(self):
        detected = LfwImporter.detect(DUMMY_DATASET_DIR)
        self.assertTrue(detected)

    def test_can_import(self):
        # (item id, positive pairs, negative pairs, landmark coordinates)
        expected_items = [
            ('name0/name0_0001',
                [],
                ['name1/name1_0001', 'name1/name1_0002'],
                [0, 4, 3, 3, 2, 2, 1, 0, 3, 0]),
            ('name1/name1_0001',
                ['name1/name1_0002'],
                [],
                [1, 6, 4, 6, 3, 3, 2, 1, 4, 1]),
            ('name1/name1_0002',
                [],
                [],
                [0, 5, 3, 5, 2, 2, 1, 0, 3, 0]),
        ]
        expected_dataset = Dataset.from_iterable([
            DatasetItem(id=item_id,
                subset='test', image=np.ones((2, 5, 3)),
                attributes={
                    'positive_pairs': positive,
                    'negative_pairs': negative,
                },
                annotations=[Points(points)],
            )
            for item_id, positive, negative, points in expected_items
        ])

        dataset = Dataset.import_from(DUMMY_DATASET_DIR, 'lfw')

        compare_datasets(self, expected_dataset, dataset)

0 comments on commit dad5c05

Please sign in to comment.