From 057740e608a3fea06736799bc6b77ef0a1158f3b Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Thu, 3 Jun 2021 19:54:01 +0300 Subject: [PATCH] Release 0.1.9 (#277) * Rename 'openvino' plugin to 'openvino_plugin' (#205) Co-authored-by: Jihyeon Yi * Make remap labels more accurate, allow explicit label deletion, add docs, update tests (#203) * Kate/handling multiple attributes and speed up detection split (#207) * better handling multi-attributes for classification_split * handling multi-attributes better for detection * bugfix in calculating required number of images for splitting 2 correct side effect of the changes for re-id split * allow multiple subsets with arbitrary names * rename _is_number to _is_float and improve it * Fix voc to coco example (#209) * Fix export filtering * update example in readme * Fix export filename for LabelMe format (#200) * change export filename for LabelMe format * Allow simple merge for datasets with no labels * Add a more complex test on relative paths * Support escaping in attributes * update changelog Co-authored-by: Maxim Zhiltsov * split unlabeled data into subsets for task-specific splitters (#211) * split unlabeled data into subsets for classification, detection. 
for re-id, 'not-supported' subsets for this data * Fix image ext on saving in cvat format (#214) * fix image saving in cvat format * update changelog * Label "face" for bounding boxes in Wider Face (#215) * add face label * update changelog * Adding "difficult", "truncated", "occluded" attributes when converting to Pascal VOC if they are not present (#216) * remove check for 'difficult' attribute * remove check for 'truncated' and 'occluded' attributes * update changelog * Ignore empty lines in YOLO annotations (#221) * Ignore empty lines in yolo annotations * Add type hints for image class, catch image opening errors in image.size * update changelog * Classification task in LFW dataset format (#222) * add classification * update changelog * update documentation * Add splitter for segmentation task (#223) * added segmentation_split * updated changelog * rename reidentification to reid * Support for CIFAR-10/100 format (#225) * add CIFAR dataset format * add CIFAR to documentation * update Changelog * add validation item for instance segmentation (#227) * add validation item for instance segmentation * Add panoptic and stuff COCO format (#210) * add coco stuff and panoptic formats * update CHANGELOG Co-authored-by: Maxim Zhiltsov * update detection splitter algorithm from # of samples to # of instances (#235) * add documentation for validator (#233) * add documentation for validator * add validation item description (#237) * Fix converter for Pascal VOC format (#239) * User documentation for Pascal VOC format (#228) * add user documentation for Pascal VOC format * add integration tests * update changelog * Support for MNIST dataset format (#234) * add mnist format * add mnist csv format * add mnist to documentation * make formats docs folder, create COCO format documentation (#241) * Make formats docs folder, move format docs * Create COCO format documentation * Fixes in CIFAR dataset format (#243) * Add folder creation * Update changelog * Add user documentation 
file and integration tests for YOLO format (#246) * add user documentation file for yolo * add integraion tests * update user manual * update changelog * Add Cityscapes format (#249) * add cityscapes format * add format docs * update changelog * Fix saving attribute in WiderFace extractor (#251) * add fixes * update changelog * Fix spelling errors (#252) * Configurable Threshold CLI support (#250) * add validator cli * add configurable validator threshold * update changelog * CI. Move to GitHub actions. (#263) * Moving to GitHub Actions * Sending a coverage report if python3.6 (#264) * Rename workflows (#265) * Rename workflows * Update repo config and badge (#266) * Update PR template * Update build status badge * Fix deprecation warnings (#270) * Update RISE docs (#255) * Update rise docs * Update cli help * Pytest related changes (#248) * Tests moved to pytest. Updated CI. Updated requirements. * Updated contribution guide * Added annotations for tests * Updated tests * Added code style guide * Fix CI (#272) * Fix script call * change script call to binary call * Fix help program name, add mark_bug (#275) * Fix prog name * Add mark_bug test annotation * Fix labelmap parameter in CamVid (#262) * Fix labelmap parameter in camvid * Release 0.1.9 (dev) (#276) * Update version * Update changelog * Fix numpy conflict (#278) Co-authored-by: Emily Chun Co-authored-by: Jihyeon Yi Co-authored-by: Kirill Sizov Co-authored-by: Anastasia Yasakova Co-authored-by: Harim Kang Co-authored-by: Zoya Maslova Co-authored-by: Roman Donchenko Co-authored-by: Seungyoon Woo Co-authored-by: Dmitry Kruchinin <33020454+dvkruchinin@users.noreply.github.com> Co-authored-by: Slawomir Strehlke --- .github/pull_request_template.md | 6 +- .github/workflows/health_check.yml | 30 + .github/workflows/pr_checks.yml | 29 + .gitignore | 5 +- .travis.yml | 38 - CHANGELOG.md | 41 +- CONTRIBUTING.md | 191 ++- README.md | 22 +- datumaro/cli/__main__.py | 5 +- datumaro/cli/commands/explain.py | 31 +- 
datumaro/cli/contexts/project/__init__.py | 28 +- datumaro/components/environment.py | 6 +- datumaro/components/errors.py | 35 +- datumaro/components/extractor.py | 21 +- datumaro/components/operations.py | 26 +- datumaro/components/validator.py | 1248 +++++++++++------ .../accuracy_checker_plugin/details/ac.py | 4 +- .../details/representation.py | 4 +- datumaro/plugins/camvid_format.py | 10 + datumaro/plugins/cifar_format.py | 183 +++ datumaro/plugins/cityscapes_format.py | 357 +++++ datumaro/plugins/coco_format/converter.py | 85 +- datumaro/plugins/coco_format/extractor.py | 97 +- datumaro/plugins/coco_format/format.py | 5 +- datumaro/plugins/coco_format/importer.py | 9 +- datumaro/plugins/cvat_format/converter.py | 2 +- datumaro/plugins/datumaro_format/converter.py | 7 +- datumaro/plugins/labelme_format.py | 187 ++- datumaro/plugins/lfw_format.py | 242 +++- datumaro/plugins/mnist_csv_format.py | 170 +++ datumaro/plugins/mnist_format.py | 209 +++ datumaro/plugins/ndr.py | 24 +- .../{openvino => openvino_plugin}/README.md | 4 +- .../{openvino => openvino_plugin}/__init__.py | 0 .../{openvino => openvino_plugin}/launcher.py | 0 .../samples/coco.class | 0 .../samples/imagenet.class | 0 .../samples/mobilenet_v2_pytorch_interp.py | 0 .../samples/ssd_face_detection_interp.py | 0 .../ssd_mobilenet_coco_detection_interp.py | 0 .../samples/ssd_person_detection_interp.py | 0 ...sd_person_vehicle_bike_detection_interp.py | 0 .../samples/ssd_vehicle_detection_interp.py | 0 datumaro/plugins/splitter.py | 644 ++++++--- datumaro/plugins/transforms.py | 37 +- datumaro/plugins/voc_format/converter.py | 22 +- datumaro/plugins/widerface_format.py | 43 +- datumaro/plugins/yolo_format/converter.py | 2 +- datumaro/plugins/yolo_format/extractor.py | 6 +- datumaro/util/__init__.py | 21 +- datumaro/util/image.py | 44 +- datumaro/util/mask_tools.py | 13 +- datumaro/version.py | 2 +- docs/cli_design.mm | 2 +- docs/design.md | 2 +- docs/developer_guide.md | 2 +- 
docs/formats/cityscapes_user_manual.md | 176 +++ docs/formats/coco_user_manual.md | 218 +++ docs/formats/mnist_user_manual.md | 179 +++ docs/formats/pascal_voc_user_manual.md | 317 +++++ docs/formats/yolo_user_manual.md | 210 +++ docs/user_manual.md | 322 ++++- pytest.ini | 3 + requirements.txt | 3 +- setup.py | 11 +- tests/assets/cifar_dataset/batches.meta | Bin 0 -> 70 bytes tests/assets/cifar_dataset/data_batch_1 | Bin 0 -> 3317 bytes tests/assets/cifar_dataset/test_batch | Bin 0 -> 9494 bytes ...tcity_000001_000031_gtFine_instanceIds.png | Bin 0 -> 76 bytes ...tcity_000001_000032_gtFine_instanceIds.png | Bin 0 -> 76 bytes ...tcity_000002_000045_gtFine_instanceIds.png | Bin 0 -> 76 bytes ...tcity_000001_000019_gtFine_instanceIds.png | Bin 0 -> 76 bytes .../defaultcity_000001_000031_leftImg8bit.png | Bin 0 -> 70 bytes .../defaultcity_000001_000032_leftImg8bit.png | Bin 0 -> 70 bytes .../defaultcity_000002_000045_leftImg8bit.png | Bin 0 -> 70 bytes .../defaultcity_000001_000019_leftImg8bit.png | Bin 0 -> 70 bytes .../annotations/panoptic_val.json | 75 + .../annotations/panoptic_val/000000000001.png | Bin 0 -> 78 bytes .../coco_panoptic/images/val/000000000001.jpg | Bin 0 -> 631 bytes .../coco_stuff/annotations/stuff_val.json | 50 + .../coco_stuff/images/val/000000000001.jpg | Bin 0 -> 631 bytes tests/assets/lfw_dataset/test/people.txt | 3 + tests/assets/mnist_csv_dataset/mnist_test.csv | 3 + .../assets/mnist_csv_dataset/mnist_train.csv | 2 + .../mnist_dataset/t10k-images-idx3-ubyte.gz | Bin 0 -> 74 bytes .../mnist_dataset/t10k-labels-idx1-ubyte.gz | Bin 0 -> 54 bytes .../mnist_dataset/train-images-idx3-ubyte.gz | Bin 0 -> 70 bytes .../mnist_dataset/train-labels-idx1-ubyte.gz | Bin 0 -> 54 bytes .../Annotations/2007_000001.xml | 0 .../ImageSets/Action/test.txt | 0 .../ImageSets/Action/train.txt | 0 .../ImageSets/Layout/test.txt | 0 .../ImageSets/Layout/train.txt | 0 .../ImageSets/Main/aeroplane_train.txt | 0 .../ImageSets/Main/background_train.txt | 0 
.../ImageSets/Main/bicycle_train.txt | 0 .../ImageSets/Main/bird_train.txt | 0 .../ImageSets/Main/boat_train.txt | 0 .../ImageSets/Main/bottle_train.txt | 0 .../ImageSets/Main/bus_train.txt | 0 .../ImageSets/Main/car_train.txt | 0 .../ImageSets/Main/cat_train.txt | 0 .../ImageSets/Main/chair_train.txt | 0 .../ImageSets/Main/cow_train.txt | 0 .../ImageSets/Main/diningtable_train.txt | 0 .../ImageSets/Main/dog_train.txt | 0 .../ImageSets/Main/horse_train.txt | 0 .../ImageSets/Main/ignored_train.txt | 0 .../ImageSets/Main/motorbike_train.txt | 0 .../ImageSets/Main/person_train.txt | 0 .../ImageSets/Main/pottedplant_train.txt | 0 .../ImageSets/Main/sheep_train.txt | 0 .../ImageSets/Main/sofa_train.txt | 0 .../ImageSets/Main/test.txt | 0 .../ImageSets/Main/train.txt | 0 .../ImageSets/Main/train_train.txt | 0 .../ImageSets/Main/tvmonitor_train.txt | 0 .../ImageSets/Segmentation/test.txt | 0 .../ImageSets/Segmentation/train.txt | 0 .../voc_dataset1/JPEGImages/2007_000001.jpg | Bin 0 -> 336 bytes .../JPEGImages/2007_000002.jpg | Bin .../SegmentationClass/2007_000001.png | Bin .../SegmentationObject/2007_000001.png | Bin .../voc_dataset2/Annotations/a.xml | 22 + .../voc_dataset2/Annotations/b.xml | 22 + .../voc_dataset2/Annotations/c.xml | 22 + .../voc_dataset2/Annotations/d.xml | 22 + .../ImageSets/Action/trainval.txt | 4 + .../ImageSets/Layout/trainval.txt | 4 + .../ImageSets/Main/aeroplane_trainval.txt | 0 .../ImageSets/Main/background_trainval.txt | 0 .../ImageSets/Main/bicycle_trainval.txt | 0 .../ImageSets/Main/bird_trainval.txt | 0 .../ImageSets/Main/boat_trainval.txt | 0 .../ImageSets/Main/bottle_trainval.txt | 0 .../ImageSets/Main/bus_trainval.txt | 0 .../ImageSets/Main/car_trainval.txt | 0 .../ImageSets/Main/cat_trainval.txt | 0 .../ImageSets/Main/chair_trainval.txt | 0 .../ImageSets/Main/cow_trainval.txt | 0 .../ImageSets/Main/diningtable_trainval.txt | 0 .../ImageSets/Main/dog_trainval.txt | 0 .../ImageSets/Main/horse_trainval.txt | 0 
.../ImageSets/Main/ignored_trainval.txt | 0 .../ImageSets/Main/motorbike_trainval.txt | 0 .../ImageSets/Main/person_trainval.txt | 0 .../ImageSets/Main/pottedplant_trainval.txt | 0 .../ImageSets/Main/sheep_trainval.txt | 0 .../ImageSets/Main/sofa_trainval.txt | 0 .../ImageSets/Main/train_trainval.txt | 0 .../voc_dataset2/ImageSets/Main/trainval.txt | 4 + .../ImageSets/Main/tvmonitor_trainval.txt | 0 .../ImageSets/Segmentation/trainval.txt | 0 .../voc_dataset/voc_dataset2/labelmap.txt | 23 + tests/cli/test_diff.py | 2 + tests/cli/test_voc_format.py | 291 ++++ tests/cli/test_yolo_format.py | 163 +++ tests/conftest.py | 17 + tests/requirements.py | 49 + tests/test_RISE.py | 3 + tests/test_camvid_format.py | 13 + tests/test_cifar_format.py | 158 +++ tests/test_cityscapes_format.py | 350 +++++ tests/test_coco_format.py | 171 ++- tests/test_command_targets.py | 13 + tests/test_config.py | 2 + tests/test_cvat_format.py | 18 +- tests/test_dataset.py | 66 +- tests/test_datumaro_format.py | 8 + tests/test_diff.py | 11 + tests/test_icdar_format.py | 14 +- tests/test_image.py | 5 + tests/test_image_dir_format.py | 6 + tests/test_imagenet_format.py | 8 + tests/test_imagenet_txt_format.py | 8 + tests/test_images.py | 8 + tests/test_labelme_format.py | 108 +- tests/test_lfw_format.py | 185 +-- tests/test_market1501_format.py | 9 + tests/test_masks.py | 9 + tests/test_mnist_csv_format.py | 195 +++ tests/test_mnist_format.py | 194 +++ tests/test_mot_format.py | 5 + tests/test_mots_format.py | 6 + tests/test_ndr.py | 13 + tests/test_ops.py | 11 + tests/test_project.py | 21 + tests/test_sampler.py | 8 + tests/test_splitter.py | 819 ++++++++--- tests/test_tfrecord_format.py | 13 + tests/test_transforms.py | 61 +- tests/test_util.py | 8 + tests/test_validator.py | 372 ++++- tests/test_vgg_face2_format.py | 10 + tests/test_voc_format.py | 56 +- tests/test_widerface_format.py | 67 +- tests/test_yolo_format.py | 10 + 197 files changed, 8057 insertions(+), 1403 deletions(-) create mode 
100644 .github/workflows/health_check.yml create mode 100644 .github/workflows/pr_checks.yml delete mode 100644 .travis.yml create mode 100644 datumaro/plugins/cifar_format.py create mode 100644 datumaro/plugins/cityscapes_format.py create mode 100644 datumaro/plugins/mnist_csv_format.py create mode 100644 datumaro/plugins/mnist_format.py rename datumaro/plugins/{openvino => openvino_plugin}/README.md (98%) rename datumaro/plugins/{openvino => openvino_plugin}/__init__.py (100%) rename datumaro/plugins/{openvino => openvino_plugin}/launcher.py (100%) rename datumaro/plugins/{openvino => openvino_plugin}/samples/coco.class (100%) rename datumaro/plugins/{openvino => openvino_plugin}/samples/imagenet.class (100%) rename datumaro/plugins/{openvino => openvino_plugin}/samples/mobilenet_v2_pytorch_interp.py (100%) rename datumaro/plugins/{openvino => openvino_plugin}/samples/ssd_face_detection_interp.py (100%) rename datumaro/plugins/{openvino => openvino_plugin}/samples/ssd_mobilenet_coco_detection_interp.py (100%) rename datumaro/plugins/{openvino => openvino_plugin}/samples/ssd_person_detection_interp.py (100%) rename datumaro/plugins/{openvino => openvino_plugin}/samples/ssd_person_vehicle_bike_detection_interp.py (100%) rename datumaro/plugins/{openvino => openvino_plugin}/samples/ssd_vehicle_detection_interp.py (100%) create mode 100644 docs/formats/cityscapes_user_manual.md create mode 100644 docs/formats/coco_user_manual.md create mode 100644 docs/formats/mnist_user_manual.md create mode 100644 docs/formats/pascal_voc_user_manual.md create mode 100644 docs/formats/yolo_user_manual.md create mode 100644 pytest.ini create mode 100644 tests/assets/cifar_dataset/batches.meta create mode 100644 tests/assets/cifar_dataset/data_batch_1 create mode 100644 tests/assets/cifar_dataset/test_batch create mode 100644 tests/assets/cityscapes_dataset/gtFine/test/defaultcity/defaultcity_000001_000031_gtFine_instanceIds.png create mode 100644 
tests/assets/cityscapes_dataset/gtFine/test/defaultcity/defaultcity_000001_000032_gtFine_instanceIds.png create mode 100644 tests/assets/cityscapes_dataset/gtFine/train/defaultcity/defaultcity_000002_000045_gtFine_instanceIds.png create mode 100644 tests/assets/cityscapes_dataset/gtFine/val/defaultcity/defaultcity_000001_000019_gtFine_instanceIds.png create mode 100644 tests/assets/cityscapes_dataset/imgsFine/leftImg8bit/test/defaultcity/defaultcity_000001_000031_leftImg8bit.png create mode 100644 tests/assets/cityscapes_dataset/imgsFine/leftImg8bit/test/defaultcity/defaultcity_000001_000032_leftImg8bit.png create mode 100644 tests/assets/cityscapes_dataset/imgsFine/leftImg8bit/train/defaultcity/defaultcity_000002_000045_leftImg8bit.png create mode 100644 tests/assets/cityscapes_dataset/imgsFine/leftImg8bit/val/defaultcity/defaultcity_000001_000019_leftImg8bit.png create mode 100644 tests/assets/coco_dataset/coco_panoptic/annotations/panoptic_val.json create mode 100644 tests/assets/coco_dataset/coco_panoptic/annotations/panoptic_val/000000000001.png create mode 100644 tests/assets/coco_dataset/coco_panoptic/images/val/000000000001.jpg create mode 100644 tests/assets/coco_dataset/coco_stuff/annotations/stuff_val.json create mode 100644 tests/assets/coco_dataset/coco_stuff/images/val/000000000001.jpg create mode 100644 tests/assets/lfw_dataset/test/people.txt create mode 100644 tests/assets/mnist_csv_dataset/mnist_test.csv create mode 100644 tests/assets/mnist_csv_dataset/mnist_train.csv create mode 100644 tests/assets/mnist_dataset/t10k-images-idx3-ubyte.gz create mode 100644 tests/assets/mnist_dataset/t10k-labels-idx1-ubyte.gz create mode 100644 tests/assets/mnist_dataset/train-images-idx3-ubyte.gz create mode 100644 tests/assets/mnist_dataset/train-labels-idx1-ubyte.gz rename tests/assets/voc_dataset/{ => voc_dataset1}/Annotations/2007_000001.xml (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Action/test.txt (100%) rename 
tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Action/train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Layout/test.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Layout/train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/aeroplane_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/background_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/bicycle_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/bird_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/boat_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/bottle_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/bus_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/car_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/cat_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/chair_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/cow_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/diningtable_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/dog_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/horse_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/ignored_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/motorbike_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/person_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/pottedplant_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/sheep_train.txt (100%) rename tests/assets/voc_dataset/{ => 
voc_dataset1}/ImageSets/Main/sofa_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/test.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/train_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Main/tvmonitor_train.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Segmentation/test.txt (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/ImageSets/Segmentation/train.txt (100%) create mode 100644 tests/assets/voc_dataset/voc_dataset1/JPEGImages/2007_000001.jpg rename tests/assets/voc_dataset/{ => voc_dataset1}/JPEGImages/2007_000002.jpg (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/SegmentationClass/2007_000001.png (100%) rename tests/assets/voc_dataset/{ => voc_dataset1}/SegmentationObject/2007_000001.png (100%) create mode 100644 tests/assets/voc_dataset/voc_dataset2/Annotations/a.xml create mode 100644 tests/assets/voc_dataset/voc_dataset2/Annotations/b.xml create mode 100644 tests/assets/voc_dataset/voc_dataset2/Annotations/c.xml create mode 100644 tests/assets/voc_dataset/voc_dataset2/Annotations/d.xml create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Action/trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Layout/trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/aeroplane_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/background_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/bicycle_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/bird_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/boat_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/bottle_trainval.txt create mode 100644 
tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/bus_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/car_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/cat_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/chair_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/cow_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/diningtable_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/dog_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/horse_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/ignored_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/motorbike_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/person_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/pottedplant_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/sheep_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/sofa_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/train_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/tvmonitor_trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/ImageSets/Segmentation/trainval.txt create mode 100644 tests/assets/voc_dataset/voc_dataset2/labelmap.txt create mode 100644 tests/cli/test_voc_format.py create mode 100644 tests/cli/test_yolo_format.py create mode 100644 tests/conftest.py create mode 100644 tests/requirements.py create mode 100644 tests/test_cifar_format.py create mode 100644 tests/test_cityscapes_format.py create mode 100644 
tests/test_mnist_csv_format.py create mode 100644 tests/test_mnist_format.py diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index d40cfb94da00..9708adff8087 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -24,17 +24,17 @@ not fully covered by unit tests or manual testing can be complicated. --> https://github.com/openvinotoolkit/datumaro/tree/develop/docs) accordingly - [ ] I have added tests to cover my changes - [ ] I have [linked related issues]( - https://help.github.com/en/github/managing-your-work-on-github/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword)) + https://help.github.com/en/github/managing-your-work-on-github/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword) ### License - [ ] I submit _my code changes_ under the same [MIT License]( - https://github.com/opencv/cvat/blob/develop/LICENSE) that covers the project. + https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE) that covers the project. Feel free to contact the maintainers if that's a concern. 
- [ ] I have updated the license header for each file (see an example below) ```python -# Copyright (C) 2020 Intel Corporation +# Copyright (C) 2021 Intel Corporation # # SPDX-License-Identifier: MIT ``` diff --git a/.github/workflows/health_check.yml b/.github/workflows/health_check.yml new file mode 100644 index 000000000000..611e6c0a1761 --- /dev/null +++ b/.github/workflows/health_check.yml @@ -0,0 +1,30 @@ +name: Build +on: + push: + branches: + - 'develop' +jobs: + coverage_tests_on_python: + strategy: + fail-fast: false + matrix: + python-version: ['3.6', '3.7', '3.8', '3.9'] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Installing python + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Installing dependencies + run: | + pip install tensorflow pytest pytest-cov + pip install -e ./ + - name: Code instrumentation + run: | + pytest -v --cov --cov-report xml:coverage.xml + datum -h + - name: Sending coverage results + if: matrix.python-version == '3.6' + run: | + bash <(curl -Ls https://coverage.codacy.com/get.sh) report -r coverage.xml -t ${{ secrets.CODACY_PROJECT_TOKEN }} diff --git a/.github/workflows/pr_checks.yml b/.github/workflows/pr_checks.yml new file mode 100644 index 000000000000..0201fb6f6936 --- /dev/null +++ b/.github/workflows/pr_checks.yml @@ -0,0 +1,29 @@ +name: CI +on: + push: + branches: + - 'master' + pull_request: + branches: + - '*' +jobs: + build_and_tests_on_python: + strategy: + fail-fast: false + matrix: + python-version: ['3.6', '3.7', '3.8', '3.9'] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Installing python + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Installing dependencies + run: | + pip install tensorflow pytest + pip install -e ./ + - name: Unit testing + run: | + pytest -v + datum -h diff --git a/.gitignore b/.gitignore index 79362022a996..78d001033601 100644 --- 
a/.gitignore +++ b/.gitignore @@ -54,4 +54,7 @@ coverage.xml cover/ # Sphinx documentation -docs/_build/ \ No newline at end of file +docs/_build/ + +#Pycharm config files +.idea/ \ No newline at end of file diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 6b9f252237a1..000000000000 --- a/.travis.yml +++ /dev/null @@ -1,38 +0,0 @@ -language: python - -cache: pip - -python: - - '3.6' - - '3.7' - - '3.8' - -matrix: - include: - - dist: xenial - - # measure coverage here - - dist: bionic - python: '3.6' - before_install: - - pip install coverage - script: - - coverage run -m unittest discover -v - - coverage run -a datum.py -h - after_success: - - coverage xml - - bash <(curl -Ls https://coverage.codacy.com/get.sh) report -r coverage.xml - - - dist: bionic - python: '3.7' - - dist: bionic - python: '3.8' - -install: - - pip install -e ./ - - pip install tensorflow - - pip install pandas - -script: - - python -m unittest discover -v - - datum -h \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 560e81d12e4b..94b8d7b49d64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,45 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## 03/06/2021 - Release v0.1.9 +### Added +- Support for escaping in attribute values in LabelMe format () +- Support for Segmentation Splitting () +- Support for CIFAR-10/100 dataset format (, ) +- Support for COCO panoptic and stuff format () +- Documentation file and integration tests for Pascal VOC format () +- Support for MNIST and MNIST in CSV dataset formats () +- Documentation file for COCO format () +- Documentation file and integration tests for YOLO format () +- Support for Cityscapes dataset format () +- Support for Validator configurable threshold () + +### Changed +- LabelMe format saves dataset items with their relative paths by subsets without changing names () +- Allowed arbitrary subset count and names in classification and detection splitters () +- Annotation-less dataset elements now participate in subset splitting () +- Classification task in LFW dataset format () +- Testing is now performed with pytest instead of unittest () + +### Deprecated +- + +### Removed +- + +### Fixed +- Added support for auto-merging (joining) of datasets with no labels and having labels () +- Allowed explicit label removal in `remap_labels` transform () +- Image extension in CVAT format export () +- Added a label "face" for bounding boxes in Wider Face () +- Allowed adding "difficult", "truncated", "occluded" attributes when converting to Pascal VOC if these attributes are not present () +- Empty lines in YOLO annotations are ignored () +- Export in VOC format when no image info is available () +- Fixed saving attribute in WiderFace extractor () + +### Security +- + ## 31/03/2021 - Release v0.1.8 ### Added - @@ -139,7 +178,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - `CamVid` dataset format () - Ability to install `opencv-python-headless` dependency with `DATUMARO_HEADLESS=1` - enviroment variable instead of `opencv-python` () + environment variable instead of `opencv-python` () ### Changed - Allow empty 
supercategory in COCO () diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4822c000757b..5d7787182011 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,9 +1,12 @@ ## Table of Contents +- [Design document](docs/design.md) +- [Developer guide](docs/developer_guide.md) - [Installation](#installation) - [Usage](#usage) +- [Code style](#code-style) +- [Development environment](#environment) - [Testing](#testing) -- [Design](#design-and-code-structure) ## Installation @@ -64,15 +67,73 @@ python datum.py --help import datumaro ``` -## Testing +## Code style + +Try to be readable and consistent with the existing codebase. + +The project mostly follows PEP8 with little differences. +Continuation lines have a standard indentation step by default, +or any other, if it improves readability. For long conditionals use 2 steps. +No trailing whitespaces, 80 characters per line. + +Example: + +```python +def do_important_work(parameter1, parameter2, parameter3, + option1=None, option2=None, option3=None) -> str: + """ + Optional description. Mandatory for API. + Use comments for implementation specific information, use docstrings + to give information to user / developer. + + Returns: status (str) - Possible values: 'done', 'failed' + """ + + ... do stuff ... + + # Use +1 level of indentation for continuation lines + variable_with_a_long_but_meaningful_name = \ + function_with_a_long_but_meaningful_name(arg1, arg2, arg3, + kwarg1=value_with_a_long_name, kwarg2=value_with_a_long_name) + + # long conditions, loops, with etc. also use +1 level of indentation + if condition1 and long_condition2 or \ + not condition3 and condition4 and condition5 or \ + condition6 and condition7: + + ... do other stuff ... + + elif other_conditions: + + ... some other things ... 
+ + # in some cases special formatting can improve code readability + specific_case_formatting = np.array([ + [0, 1, 1, 0], + [1, 1, 0, 0], + [1, 1, 0, 1], + ], dtype=np.int32) + + return status +``` + +## Environment + +The recommended editor is VS Code with the Python language plugin. + +## Testing It is expected that all Datumaro functionality is covered and checked by unit tests. Tests are placed in `tests/` directory. +Currently, we use [`pytest`](https://docs.pytest.org/) for testing, but we +are also compatible with `unittest`. To run tests use: ``` bash -python -m unittest discover -s tests +pytest -v +# or +python -m pytest -v ``` If you're working inside of a CVAT environment, you can also use: @@ -81,19 +142,121 @@ If you're working inside of a CVAT environment, you can also use: python manage.py test datumaro/ ``` -## Design and code structure -- [Design document](docs/design.md) -- [Developer guide](docs/developer_guide.md) +### Test cases -## Code style +### Test marking -Try to be readable and consistent with the existing codebase. -The project mostly follows PEP8 with little differences. -Continuation lines have a standard indentation step by default, -or any other, if it improves readability. For long conditionals use 2 steps. -No trailing whitespaces, 80 characters per line. +For better integration with CI and requirements tracking, +we use special annotations for tests. -## Environment +A test needs to be marked with a requirement it is related to. To mark a test, use: + +```python +from unittest import TestCase +from .requirements import Requirements, mark_requirement + +class MyTests(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_my_requirement(self): + ... do stuff ... +``` + +Such marking will apply markings from the requirement specified. 
+They can be overridden for a specific test: + +```python +import pytest + + @pytest.mark.priority_low + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_my_requirement(self): + ... do stuff ... +``` + +#### Requirements + +Requirements and other links need to be added to [`tests/requirements.py`](tests/requirements.py): + +```python +DATUM_244 = "Add Snyk integration" +DATUM_BUG_219 = "Return format is not uniform" +``` + +```python +# Fully defined in GitHub issues: +@pytest.mark.reqids(Requirements.DATUM_244, Requirements.DATUM_333) + +# And defined any other way: +@pytest.mark.reqids(Requirements.DATUM_GENERAL_REQ) +``` + + +##### Available annotations for tests and requirements + +Markings are defined in [`tests/conftest.py`](tests/conftest.py). + +**A list of requirements and bugs** +```python +@pytest.mark.reqids(Requirements.DATUM_123) +@pytest.mark.bugs(Requirements.DATUM_BUG_456) +``` + +**A priority** +```python +@pytest.mark.priority_low +@pytest.mark.priority_medium +@pytest.mark.priority_high +``` + +**Component** +The marking used for indication of different system components + +```python +@pytest.mark.components(DatumaroComponent.Datumaro) +``` + +**Skipping tests** + +```python +@pytest.mark.skip(SkipMessages.NOT_IMPLEMENTED) +``` + +**Parametrized runs** + +Parameters are used for running the same test with different parameters e.g. + +```python +@pytest.mark.parametrize("numpy_array, batch_size", [ + (np.zeros([2]), 0), + (np.zeros([2]), 1), + (np.zeros([2]), 2), + (np.zeros([2]), 5), + (np.zeros([5]), 2), +]) +``` + +### Test documentation + +Tests are documented with docstrings. Test descriptions must contain +the following sections: `Description`, `Expected results` and `Steps`. + +```python +def test_can_convert_polygons_to_mask(self): + """ + Description: + Ensure that the dataset polygon annotation can be properly converted + into dataset segmentation mask.
+ + Expected results: + Dataset segmentation mask converted from dataset polygon annotation + is equal to an expected mask. -The recommended editor is VS Code with the Python plugin. \ No newline at end of file + Steps: + 1. Prepare dataset with polygon annotation + 2. Prepare dataset with expected mask segmentation mode + 3. Convert source dataset to target, with conversion of annotation + from polygon to mask. + 4. Verify that resulting segmentation mask is equal to the expected mask. + """ +``` \ No newline at end of file diff --git a/README.md b/README.md index 65aa2817c275..725c671f1fa6 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Dataset Management Framework (Datumaro) -[![Build Status](https://travis-ci.org/openvinotoolkit/datumaro.svg?branch=develop)](https://travis-ci.org/openvinotoolkit/datumaro) +[![Build status](https://github.com/openvinotoolkit/datumaro/actions/workflows/health_check.yml/badge.svg)](https://github.com/openvinotoolkit/datumaro/actions/workflows/health_check.yml) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/759d2d873b59495aa3d3f8c51b786246)](https://app.codacy.com/gh/openvinotoolkit/datumaro?utm_source=github.com&utm_medium=referral&utm_content=openvinotoolkit/datumaro&utm_campaign=Badge_Grade_Dashboard) [![Codacy Badge](https://app.codacy.com/project/badge/Coverage/9511b691ff134e739ea6fc524f7cc760)](https://www.codacy.com/gh/openvinotoolkit/datumaro?utm_source=github.com&utm_medium=referral&utm_content=openvinotoolkit/datumaro&utm_campaign=Badge_Coverage) @@ -38,7 +38,8 @@ CVAT annotations ---> Publication, statistics etc. 
# http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar datum convert --input-format voc --input-path \ --output-format coco \ - --filter '/item[annotation/label="cat"]' + --filter '/item[annotation/label="cat"]' \ + -- --reindex 1 # avoid annotation id conflicts ``` - Convert only non-`occluded` annotations from a [CVAT](https://github.com/opencv/cvat) project to TFrecord: @@ -123,7 +124,7 @@ CVAT annotations ---> Publication, statistics etc. [(Back to top)](#table-of-contents) - Dataset reading, writing, conversion in any direction. [Supported formats](docs/user_manual.md#supported-formats): - - [COCO](http://cocodataset.org/#format-data) (`image_info`, `instances`, `person_keypoints`, `captions`, `labels`*) + - [COCO](http://cocodataset.org/#format-data) (`image_info`, `instances`, `person_keypoints`, `captions`, `labels`, `panoptic`, `stuff`) - [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/htmldoc/index.html) (`classification`, `detection`, `segmentation`, `action_classification`, `person_layout`) - [YOLO](https://github.com/AlexeyAB/darknet#how-to-train-pascal-voc-data) (`bboxes`) - [TF Detection API](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/using_your_own_dataset.md) (`bboxes`, `masks`) @@ -132,12 +133,16 @@ CVAT annotations ---> Publication, statistics etc. 
- [MOT sequences](https://arxiv.org/pdf/1906.04567.pdf) - [MOTS PNG](https://www.vision.rwth-aachen.de/page/mots) - [ImageNet](http://image-net.org/) + - [CIFAR-10/100](https://www.cs.toronto.edu/~kriz/cifar.html) (`classification`) + - [MNIST](http://yann.lecun.com/exdb/mnist/) (`classification`) + - [MNIST in CSV](https://pjreddie.com/projects/mnist-in-csv/) (`classification`) - [CamVid](http://mi.eng.cam.ac.uk/research/projects/VideoRec/CamVid/) + - [Cityscapes](https://www.cityscapes-dataset.com/) - [CVAT](https://github.com/opencv/cvat/blob/develop/cvat/apps/documentation/xml_format.md) - [LabelMe](http://labelme.csail.mit.edu/Release3.0) - [ICDAR13/15](https://rrc.cvc.uab.es/?ch=2) (`word_recognition`, `text_localization`, `text_segmentation`) - [Market-1501](https://www.aitribune.com/dataset/2018051063) (`person re-identification`) - - [LFW](http://vis-www.cs.umass.edu/lfw/) (`person re-identification`, `landmarks`) + - [LFW](http://vis-www.cs.umass.edu/lfw/) (`classification`, `person re-identification`, `landmarks`) - Dataset building - Merging multiple datasets into one - Dataset filtering by a custom criteria: @@ -147,7 +152,7 @@ CVAT annotations ---> Publication, statistics etc. - keep only vertically-oriented images - remove small area bounding boxes from annotations - Annotation conversions, for instance: - - polygons to instance masks and vise-versa + - polygons to instance masks and vice-versa - apply a custom colormap for mask annotations - rename or remove dataset labels - Splitting a dataset into multiple subsets like `train`, `val`, and `test`: @@ -159,19 +164,22 @@ CVAT annotations ---> Publication, statistics etc. - for re-identification task, based on labels, avoiding having same IDs in training and test splits - Sampling a dataset - - analyzes inference result from the given dataset + - analyzes inference result from the given dataset and selects the ‘best’ and the ‘least amount of’ samples for annotation. 
- Select the sample that best suits model training. - sampling with Entropy based algorithm - Dataset quality checking - Simple checking for errors - - Comparison with model infernece + - Comparison with model inference - Merging and comparison of multiple datasets + - Annotation validation based on the task type(classification, etc) - Dataset comparison - Dataset statistics (image mean and std, annotation statistics) - Model integration - Inference (OpenVINO, Caffe, PyTorch, TensorFlow, MxNet, etc.) - Explainable AI ([RISE algorithm](https://arxiv.org/abs/1806.07421)) + - RISE for classification + - RISE for object detection > Check [the design document](docs/design.md) for a full list of features. > Check [the user manual](docs/user_manual.md) for usage instructions. diff --git a/datumaro/cli/__main__.py b/datumaro/cli/__main__.py index 2ecf9f7a788d..528c6d019e95 100644 --- a/datumaro/cli/__main__.py +++ b/datumaro/cli/__main__.py @@ -5,6 +5,7 @@ import argparse import logging as log +import os.path as osp import sys from . import contexts, commands @@ -50,9 +51,11 @@ def _make_subcommands_help(commands, help_line_start=0): return desc def make_parser(): - parser = argparse.ArgumentParser(prog="datumaro", + parser = argparse.ArgumentParser( description="Dataset Framework", formatter_class=argparse.RawDescriptionHelpFormatter) + if parser.prog == osp.basename(__file__): # python -m datumaro ... 
+ parser.prog = 'datumaro' parser.add_argument('--version', action='version', version=VERSION) _LogManager._define_loglevel_option(parser) diff --git a/datumaro/cli/commands/explain.py b/datumaro/cli/commands/explain.py index 9c3e1d147a67..dc4256194ced 100644 --- a/datumaro/cli/commands/explain.py +++ b/datumaro/cli/commands/explain.py @@ -17,13 +17,38 @@ def build_parser(parser_ctor=argparse.ArgumentParser): parser = parser_ctor(help="Run Explainable AI algorithm", - description="Runs an explainable AI algorithm for a model.") + description=""" + Runs an explainable AI algorithm for a model.|n + |n + This tool is supposed to help an AI developer to debug + a model and a dataset. Basically, it executes inference and + tries to find problems in the trained model - determine decision + boundaries and belief intervals for the classifier.|n + |n + Currently, the only available algorithm is + RISE (https://arxiv.org/pdf/1806.07421.pdf), which runs + inference and then re-runs a model multiple times + on each image to produce a heatmap of activations for + each output of the first inference. As a result, we obtain + few heatmaps, which shows, how image pixels affected + the inference result. This algorithm doesn't require any special + information about the model, but it requires the model to + return all the outputs and confidences. 
Check the User Manual + for usage examples.|n + Supported scenarios:|n + - RISE for classification|n + - RISE for Object Detection|n + |n + Examples:|n + - Run RISE on an image, display results:|n + |s|s%(prog)s -t path/to/image.jpg -m mymodel rise --max-samples 50 + """, formatter_class=MultilineFormatter) parser.add_argument('-m', '--model', required=True, help="Model to use for inference") parser.add_argument('-t', '--target', default=None, help="Inference target - image, source, project " - "(default: current dir)") + "(default: current project)") parser.add_argument('-o', '--output-dir', dest='save_dir', default=None, help="Directory to save output (default: display only)") @@ -152,7 +177,7 @@ def explain_command(args): for item in dataset: image = item.image.data if image is None: - log.warn( + log.warning( "Dataset item %s does not have image data. Skipping." % \ (item.id)) continue diff --git a/datumaro/cli/contexts/project/__init__.py b/datumaro/cli/contexts/project/__init__.py index 44e9c82529e8..ff4dfb10bd69 100644 --- a/datumaro/cli/contexts/project/__init__.py +++ b/datumaro/cli/contexts/project/__init__.py @@ -8,6 +8,7 @@ import os import os.path as osp import shutil +import numpy as np from enum import Enum from datumaro.components.dataset_filter import DatasetItemEncoder @@ -345,7 +346,8 @@ def export_command(args): if args.filter: dataset = dataset.filter(args.filter, **filter_args) - dataset.export(format=args.format, save_dir=dst_dir, **extra_args) + converter = project.env.converters[args.format] + converter.convert(dataset, save_dir=dst_dir, **extra_args) log.info("Project exported to '%s' as '%s'" % (dst_dir, args.format)) @@ -806,6 +808,8 @@ def build_validate_parser(parser_ctor=argparse.ArgumentParser): help="Subset to validate (default: None)") parser.add_argument('-p', '--project', dest='project_dir', default='.', help="Directory of the project to validate (default: current dir)") + parser.add_argument('extra_args', 
nargs=argparse.REMAINDER, default=None, + help="Optional arguments for validator (pass '-- -h' for help)") parser.set_defaults(command=validate_command) return parser @@ -814,28 +818,38 @@ def validate_command(args): project = load_project(args.project_dir) task_type = args.task_type subset_name = args.subset_name - dst_file_name = 'validation_results' + dst_file_name = f'validation_results-{task_type}' dataset = project.make_dataset() if subset_name is not None: dataset = dataset.get_subset(subset_name) dst_file_name += f'-{subset_name}' - validation_results = validate_annotations(dataset, task_type) - def _convert_tuple_keys_to_str(d): + extra_args = {} + from datumaro.components.validator import _Validator + extra_args = _Validator.parse_cmdline(args.extra_args) + validation_results = validate_annotations(dataset, task_type, **extra_args) + + def numpy_encoder(obj): + if isinstance(obj, np.generic): + return obj.item() + + def _make_serializable(d): for key, val in list(d.items()): + # tuple key to str if isinstance(key, tuple): d[str(key)] = val d.pop(key) if isinstance(val, dict): - _convert_tuple_keys_to_str(val) + _make_serializable(val) - _convert_tuple_keys_to_str(validation_results) + _make_serializable(validation_results) dst_file = generate_next_file_name(dst_file_name, ext='.json') log.info("Writing project validation results to '%s'" % dst_file) with open(dst_file, 'w') as f: - json.dump(validation_results, f, indent=4, sort_keys=True) + json.dump(validation_results, f, indent=4, sort_keys=True, + default=numpy_encoder) def build_parser(parser_ctor=argparse.ArgumentParser): parser = parser_ctor( diff --git a/datumaro/components/environment.py b/datumaro/components/environment.py index 3db6b1b4008a..c27131a84143 100644 --- a/datumaro/components/environment.py +++ b/datumaro/components/environment.py @@ -146,12 +146,12 @@ def __init__(self, config=None): custom = self._load_plugins2(osp.join(env_dir, config.plugins_dir)) select = lambda seq, t: [e for e 
in seq if issubclass(e, t)] from datumaro.components.converter import Converter - from datumaro.components.extractor import (Importer, SourceExtractor, + from datumaro.components.extractor import (Importer, Extractor, Transform) from datumaro.components.launcher import Launcher self.extractors = PluginRegistry( - builtin=select(builtin, SourceExtractor), - local=select(custom, SourceExtractor) + builtin=select(builtin, Extractor), + local=select(custom, Extractor) ) self.extractors.register(self.PROJECT_EXTRACTOR_NAME, load_project_as_dataset) diff --git a/datumaro/components/errors.py b/datumaro/components/errors.py index 3d8da0629bb8..717399b060d3 100644 --- a/datumaro/components/errors.py +++ b/datumaro/components/errors.py @@ -119,10 +119,13 @@ def __str__(self): return "Metadata (ex. LabelCategories) should be defined" \ " to validate a dataset." + @attrs -class MissingLabelAnnotation(DatasetItemValidationError): +class MissingAnnotation(DatasetItemValidationError): + ann_type = attrib() def __str__(self): - return "Item needs a label, but not found." + return f"Item needs '{self.ann_type}' annotation(s), " \ + "but not found." @attrs class MultiLabelAnnotations(DatasetItemValidationError): @@ -228,32 +231,26 @@ def __str__(self): f" '{self. attr_name}' for the label '{self.label_name}'." @attrs -class ImbalancedBboxDistInLabel(DatasetValidationError): +class ImbalancedDistInLabel(DatasetValidationError): label_name = attrib() prop = attrib() def __str__(self): - return f"Values of bbox '{self.prop}' are not evenly " \ + return f"Values of '{self.prop}' are not evenly " \ f"distributed for '{self.label_name}' label." 
@attrs -class ImbalancedBboxDistInAttribute(DatasetValidationError): +class ImbalancedDistInAttribute(DatasetValidationError): label_name = attrib() attr_name = attrib() attr_value = attrib() prop = attrib() def __str__(self): - return f"Values of bbox '{self.prop}' are not evenly " \ + return f"Values of '{self.prop}' are not evenly " \ f"distributed for '{self.attr_name}' = '{self.attr_value}' for " \ f"the '{self.label_name}' label." -@attrs -class MissingBboxAnnotation(DatasetItemValidationError): - def __str__(self): - return 'Item needs one or more bounding box annotations, ' \ - 'but not found.' - @attrs class NegativeLength(DatasetItemValidationError): ann_id = attrib() @@ -261,7 +258,7 @@ class NegativeLength(DatasetItemValidationError): val = attrib() def __str__(self): - return f"Bounding box annotation '{self.ann_id}' in " \ + return f"Annotation '{self.ann_id}' in " \ "the item should have a positive value of " \ f"'{self.prop}' but got '{self.val}'." @@ -271,9 +268,9 @@ class InvalidValue(DatasetItemValidationError): prop = attrib() def __str__(self): - return f"Bounding box annotation '{self.ann_id}' in " \ + return f"Annotation '{self.ann_id}' in " \ 'the item has an inf or a NaN value of ' \ - f"bounding box '{self.prop}'." + f"'{self.prop}'." @attrs class FarFromLabelMean(DatasetItemValidationError): @@ -284,8 +281,8 @@ class FarFromLabelMean(DatasetItemValidationError): val = attrib() def __str__(self): - return f"Bounding box annotation '{self.ann_id}' in " \ - f"the item has a value of bounding box '{self.prop}' that " \ + return f"Annotation '{self.ann_id}' in " \ + f"the item has a value of '{self.prop}' that " \ "is too far from the label average. (mean of " \ f"'{self.label_name}' label: {self.mean}, got '{self.val}')." 
@@ -300,8 +297,8 @@ class FarFromAttrMean(DatasetItemValidationError): val = attrib() def __str__(self): - return f"Bounding box annotation '{self.ann_id}' in the " \ - f"item has a value of bounding box '{self.prop}' that " \ + return f"Annotation '{self.ann_id}' in the " \ + f"item has a value of '{self.prop}' that " \ "is too far from the attribute average. (mean of " \ f"'{self.attr_name}' = '{self.attr_value}' for the " \ f"'{self.label_name}' label: {self.mean}, got '{self.val}')." diff --git a/datumaro/components/extractor.py b/datumaro/components/extractor.py index b913dece132b..e8cc4f89c99b 100644 --- a/datumaro/components/extractor.py +++ b/datumaro/components/extractor.py @@ -1,5 +1,5 @@ -# Copyright (C) 2019-2020 Intel Corporation +# Copyright (C) 2019-2021 Intel Corporation # # SPDX-License-Identifier: MIT @@ -7,6 +7,7 @@ from glob import iglob from typing import Iterable, List, Dict, Optional import numpy as np +import os import os.path as osp import attr @@ -68,7 +69,7 @@ def from_iterable(cls, iterable): iterable ([type]): This iterable object can be: 1)simple str - will generate one Category with str as name 2)list of str - will interpreted as list of Category names - 3)list of positional argumetns - will generate Categories + 3)list of positional arguments - will generate Categories with this arguments @@ -236,7 +237,7 @@ def __eq__(self, other): class CompiledMask: @staticmethod def from_instance_masks(instance_masks, - instance_ids=None, instance_labels=None): + instance_ids=None, instance_labels=None, dtype=None): from datumaro.util.mask_tools import make_index_mask if instance_ids is not None: @@ -266,7 +267,7 @@ def from_instance_masks(instance_masks, m, idx, instance_id, class_id = next(it) if not class_id: idx = 0 - index_mask = make_index_mask(m, idx) + index_mask = make_index_mask(m, idx, dtype=dtype) instance_map.append(instance_id) class_map.append(class_id) @@ -282,8 +283,8 @@ def from_instance_masks(instance_masks, else: 
merged_instance_mask = np.array(instance_map, dtype=np.min_scalar_type(instance_map))[index_mask] - merged_class_mask = np.array(class_map, - dtype=np.min_scalar_type(class_map))[index_mask] + dtype_mask = dtype if dtype else np.min_scalar_type(class_map) + merged_class_mask = np.array(class_map, dtype=dtype_mask)[index_mask] return __class__(class_mask=merged_class_mask, instance_mask=merged_instance_mask) @@ -447,7 +448,7 @@ def from_iterable(cls, iterable): Args: iterable ([type]): This iterable object can be: - 1) list of positional argumetns - will generate Categories + 1) list of positional arguments - will generate Categories with these arguments Returns: @@ -673,7 +674,11 @@ def __call__(self, path, **extra_params): @classmethod def _find_sources_recursive(cls, path, ext, extractor_name, filename='*', dirname='', file_filter=None, max_depth=3): - if path.endswith(ext) and osp.isfile(path): + + if (path.endswith(ext) and osp.isfile(path)) or \ + (not ext and osp.isdir(path) and dirname and \ + os.sep + osp.normpath(dirname) + os.sep in \ + osp.abspath(path) + os.sep): sources = [{'url': path, 'format': extractor_name}] else: sources = [] diff --git a/datumaro/components/operations.py b/datumaro/components/operations.py index 5482b5ed1778..002e76f62386 100644 --- a/datumaro/components/operations.py +++ b/datumaro/components/operations.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Intel Corporation +# Copyright (C) 2020-2021 Intel Corporation # # SPDX-License-Identifier: MIT @@ -55,10 +55,13 @@ def merge_categories(sources): for source in sources: for cat_type, source_cat in source.items(): existing_cat = categories.setdefault(cat_type, source_cat) - if existing_cat != source_cat: - raise DatumaroError( - "Merging of datasets with different categories is " - "only allowed in 'merge' command.") + if existing_cat != source_cat and len(source_cat) != 0: + if len(existing_cat) == 0: + categories[cat_type] = source_cat + else: + raise DatumaroError( + "Merging of 
datasets with different categories is " + "only allowed in 'merge' command.") return categories class MergingStrategy(CliPlugin): @@ -150,7 +153,7 @@ class Conf: quorum = attrib(converter=int, default=0) ignored_attributes = attrib(converter=set, factory=set) - def _groups_conveter(value): + def _groups_converter(value): result = [] for group in value: rg = set() @@ -160,7 +163,7 @@ def _groups_conveter(value): rg.add((name, optional)) result.append(rg) return result - groups = attrib(converter=_groups_conveter, factory=list) + groups = attrib(converter=_groups_converter, factory=list) close_distance = attrib(converter=float, default=0.75) conf = attrib(converter=ensure_cls(Conf), factory=Conf) @@ -954,6 +957,11 @@ def mean_std(dataset): var = lambda i, s: s[i][1] for i, item in enumerate(dataset): + size = item.image.size + if size is None: + log.warning("Item %s: can't detect image size, " + "the image will be skipped from pixel statistics", item.id) + continue counts[i] = np.prod(item.image.size) image = item.image.data @@ -1024,8 +1032,8 @@ def _extractor_stats(extractor): for item in extractor: if not (item.has_image and item.image.has_data): available = False - log.warn("Item %s has no image. 
Image stats won't be computed", - item.id) + log.warning("Item %s has no image, it will be excluded from " + "image stats", item.id) break stats = { diff --git a/datumaro/components/validator.py b/datumaro/components/validator.py index dfa524d7529e..1e910029f8a2 100644 --- a/datumaro/components/validator.py +++ b/datumaro/components/validator.py @@ -10,38 +10,46 @@ from datumaro.components.dataset import IDataset from datumaro.components.errors import (MissingLabelCategories, - MissingLabelAnnotation, MultiLabelAnnotations, MissingAttribute, + MissingAnnotation, MultiLabelAnnotations, MissingAttribute, UndefinedLabel, UndefinedAttribute, LabelDefinedButNotFound, AttributeDefinedButNotFound, OnlyOneLabel, FewSamplesInLabel, FewSamplesInAttribute, ImbalancedLabels, ImbalancedAttribute, - ImbalancedBboxDistInLabel, ImbalancedBboxDistInAttribute, - MissingBboxAnnotation, NegativeLength, InvalidValue, FarFromLabelMean, + ImbalancedDistInLabel, ImbalancedDistInAttribute, + NegativeLength, InvalidValue, FarFromLabelMean, FarFromAttrMean, OnlyOneAttributeValue) from datumaro.components.extractor import AnnotationType, LabelCategories +from datumaro.components.cli_plugin import CliPlugin from datumaro.util import parse_str_enum_value Severity = Enum('Severity', ['warning', 'error']) -TaskType = Enum('TaskType', ['classification', 'detection']) - +TaskType = Enum('TaskType', ['classification', 'detection', 'segmentation']) + + +class _Validator(CliPlugin): + # statistics templates + numerical_stat_template = { + 'items_far_from_mean': {}, + 'mean': None, + 'stdev': None, + 'min': None, + 'max': None, + 'median': None, + 'histogram': { + 'bins': [], + 'counts': [], + }, + 'distribution': np.array([]) + } -class _Validator: - DEFAULT_FEW_SAMPLES = 1 - DEFAULT_IMBALANCE_RATIO = 50 """ A base class for task-specific validators. - ... - Attributes ---------- task_type : str or TaskType - task type (ie. classification, detection etc.) 
- ann_type : str or AnnotationType - annotation type to validate (default is AnnotationType.label) - far_from_mean_thr : float - constant used to define mean +/- k * stdev (default is None) + task type (ie. classification, detection, segmentation) Methods ------- @@ -51,42 +59,78 @@ class _Validator: Abstract method that must be implemented in a subclass. """ - def __init__(self, task_type=None, ann_type=None, far_from_mean_thr=None): - task_type = parse_str_enum_value(task_type, TaskType, - default=TaskType.classification) - ann_type = parse_str_enum_value(ann_type, AnnotationType, - default=AnnotationType.label) - - self.task_type = task_type - self.ann_type = ann_type - - self.far_from_mean_thr = far_from_mean_thr - self.imbalance_ratio_thr = self.DEFAULT_IMBALANCE_RATIO - self.few_samples_thr = self.DEFAULT_FEW_SAMPLES - - def compute_statistics(self, dataset): + @classmethod + def build_cmdline_parser(cls, **kwargs): + parser = super().build_cmdline_parser(**kwargs) + parser.add_argument('-fs', '--few_samples_thr', default=1, type=int, + help="Threshold for giving a warning for minimum number of" + "samples per class") + parser.add_argument('-ir', '--imbalance_ratio_thr', default=50, type=int, + help="Threshold for giving data imbalance warning;" + "IR(imbalance ratio) = majority/minority") + parser.add_argument('-m', '--far_from_mean_thr', default=5.0, type=float, + help="Threshold for giving a warning that data is far from mean;" + "A constant used to define mean +/- k * standard deviation;") + parser.add_argument('-dr', '--dominance_ratio_thr', default=0.8, type=float, + help="Threshold for giving a warning for bounding box imbalance;" + "Dominace_ratio = ratio of Top-k bin to total in histogram;") + parser.add_argument('-k', '--topk_bins', default=0.1, type=float, + help="Ratio of bins with the highest number of data" + "to total bins in the histogram; [0, 1]; 0.1 = 10%;") + return parser + + def __init__(self, task_type, few_samples_thr=None, + 
imbalance_ratio_thr=None, far_from_mean_thr=None, + dominance_ratio_thr=None, topk_bins=None): """ - Computes various statistics of the dataset based on task type. + Validator Parameters - ---------- - dataset : IDataset object - - Returns - ------- - stats (dict): A dict object containing statistics of the dataset. + --------------- + few_samples_thr: int + minimum number of samples per class + warn user when samples per class is less than threshold + imbalance_ratio_thr: int + ratio of majority attribute to minority attribute + warn user when annotations are unevenly distributed + far_from_mean_thr: float + constant used to define mean +/- m * stddev + warn user when there are too big or small values + dominance_ratio_thr: float + ratio of Top-k bin to total + warn user when dominance ratio is over threshold + topk_bins: float + ratio of selected bins with most item number to total bins + warn user when values are not evenly distributed """ + self.task_type = parse_str_enum_value(task_type, TaskType, + default=TaskType.classification) + + if self.task_type == TaskType.classification: + self.ann_types = {AnnotationType.label} + self.str_ann_type = "label" + elif self.task_type == TaskType.detection: + self.ann_types = {AnnotationType.bbox} + self.str_ann_type = "bounding box" + elif self.task_type == TaskType.segmentation: + self.ann_types = {AnnotationType.mask, AnnotationType.polygon} + self.str_ann_type = "mask or polygon" + + self.few_samples_thr = few_samples_thr + self.imbalance_ratio_thr = imbalance_ratio_thr + self.far_from_mean_thr = far_from_mean_thr + self.dominance_thr = dominance_ratio_thr + self.topk_bins_ratio = topk_bins + def _compute_common_statistics(self, dataset): defined_attr_template = { 'items_missing_attribute': [], 'distribution': {} } - undefined_attr_template = { 'items_with_undefined_attr': [], 'distribution': {} } - undefined_label_template = { 'count': 0, 'items_with_undefined_label': [], @@ -102,6 +146,8 @@ def 
compute_statistics(self, dataset): 'undefined_attributes': {} }, } + stats['total_ann_count'] = 0 + stats['items_missing_annotation'] = [] label_dist = stats['label_distribution'] attr_dist = stats['attribute_distribution'] @@ -114,292 +160,129 @@ def compute_statistics(self, dataset): LabelCategories()) base_valid_attrs = label_categories.attributes - if self.task_type == TaskType.classification: - stats['total_label_count'] = 0 - stats['items_missing_label'] = [] - stats['items_with_multiple_labels'] = [] - - elif self.task_type == TaskType.detection: - bbox_info_template = { - 'items_far_from_mean': {}, - 'mean': None, - 'stdev': None, - 'min': None, - 'max': None, - 'median': None, - 'histogram': { - 'bins': [], - 'counts': [], - }, - 'distribution': np.array([]) - } - - bbox_template = { - 'width': deepcopy(bbox_info_template), - 'height': deepcopy(bbox_info_template), - 'area(wxh)': deepcopy(bbox_info_template), - 'ratio(w/h)': deepcopy(bbox_info_template), - 'short': deepcopy(bbox_info_template), - 'long': deepcopy(bbox_info_template) - } - - stats['total_bbox_count'] = 0 - stats['items_missing_bbox'] = [] - stats['items_with_negative_length'] = {} - stats['items_with_invalid_value'] = {} - stats['bbox_distribution_in_label'] = {} - stats['bbox_distribution_in_attribute'] = {} - stats['bbox_distribution_in_dataset_item'] = {} - - bbox_dist_by_label = stats['bbox_distribution_in_label'] - bbox_dist_by_attr = stats['bbox_distribution_in_attribute'] - bbox_dist_in_item = stats['bbox_distribution_in_dataset_item'] - items_w_neg_len = stats['items_with_negative_length'] - items_w_invalid_val = stats['items_with_invalid_value'] - _k = self.far_from_mean_thr - - def _update_prop_distributions(ann_bbox_info, target_stats): - for prop, val in ann_bbox_info.items(): - prop_stats = target_stats[prop] - prop_dist = prop_stats['distribution'] - prop_stats['distribution'] = np.append(prop_dist, val) - - def _generate_ann_bbox_info(_x, _y, _w, _h, area, - ratio, _short, 
_long): - return { - 'x': _x, - 'y': _y, - 'width': _w, - 'height': _h, - 'area(wxh)': area, - 'ratio(w/h)': ratio, - 'short': _short, - 'long': _long, - } - - def _update_bbox_stats_by_label(item, ann, bbox_label_stats): - bbox_has_error = False - - _x, _y, _w, _h = ann.get_bbox() - area = ann.get_area() - - if _h != 0 and _h != float('inf'): - ratio = _w / _h - else: - ratio = float('nan') - - _short = _w if _w < _h else _h - _long = _w if _w > _h else _h - - ann_bbox_info = _generate_ann_bbox_info( - _x, _y, _w, _h, area, ratio, _short, _long) - - for prop, val in ann_bbox_info.items(): - if val == float('inf') or np.isnan(val): - bbox_has_error = True - anns_w_invalid_val = items_w_invalid_val.setdefault( - (item.id, item.subset), {}) - invalid_props = anns_w_invalid_val.setdefault( - ann.id, []) - invalid_props.append(prop) - - for prop in ['width', 'height']: - val = ann_bbox_info[prop] - if val < 1: - bbox_has_error = True - anns_w_neg_len = items_w_neg_len.setdefault( - (item.id, item.subset), {}) - neg_props = anns_w_neg_len.setdefault(ann.id, {}) - neg_props[prop] = val - - if not bbox_has_error: - ann_bbox_info.pop('x') - ann_bbox_info.pop('y') - _update_prop_distributions(ann_bbox_info, bbox_label_stats) - - return ann_bbox_info, bbox_has_error - - def _compute_prop_stats_from_dist(): - for label_name, bbox_stats in bbox_dist_by_label.items(): - prop_stats_list = list(bbox_stats.values()) - bbox_attr_label = bbox_dist_by_attr.get(label_name, {}) - for vals in bbox_attr_label.values(): - for val_stats in vals.values(): - prop_stats_list += list(val_stats.values()) - - for prop_stats in prop_stats_list: - prop_dist = prop_stats.pop('distribution', []) - if len(prop_dist) > 0: - prop_stats['mean'] = np.mean(prop_dist) - prop_stats['stdev'] = np.std(prop_dist) - prop_stats['min'] = np.min(prop_dist) - prop_stats['max'] = np.max(prop_dist) - prop_stats['median'] = np.median(prop_dist) - - counts, bins = np.histogram(prop_dist) - 
prop_stats['histogram']['bins'] = bins.tolist() - prop_stats['histogram']['counts'] = counts.tolist() - - def _is_valid_bbox(item, ann): - is_bbox = ann.type == self.ann_type - has_defined_label = 0 <= ann.label < len(label_categories) - if not is_bbox or not has_defined_label: - return False - - bbox_has_neg_len = ann.id in items_w_neg_len.get( - (item.id, item.subset), {}) - bbox_has_invalid_val = ann.id in items_w_invalid_val.get( - (item.id, item.subset), {}) - return not (bbox_has_neg_len or bbox_has_invalid_val) - - def _far_from_mean(val, mean, stdev): - return val > mean + (_k * stdev) or val < mean - (_k * stdev) - - def _update_props_far_from_mean(item, ann): - valid_attrs = base_valid_attrs.union( - label_categories[ann.label].attributes) - label_name = label_categories[ann.label].name - bbox_label_stats = bbox_dist_by_label[label_name] - - _x, _y, _w, _h = ann.get_bbox() - area = ann.get_area() - ratio = _w / _h - _short = _w if _w < _h else _h - _long = _w if _w > _h else _h - - ann_bbox_info = _generate_ann_bbox_info( - _x, _y, _w, _h, area, ratio, _short, _long) - ann_bbox_info.pop('x') - ann_bbox_info.pop('y') - - for prop, val in ann_bbox_info.items(): - prop_stats = bbox_label_stats[prop] - items_far_from_mean = prop_stats['items_far_from_mean'] - mean = prop_stats['mean'] - stdev = prop_stats['stdev'] - - if _far_from_mean(val, mean, stdev): - bboxs_far_from_mean = items_far_from_mean.setdefault( - (item.id, item.subset), {}) - bboxs_far_from_mean[ann.id] = val - - for attr, value in ann.attributes.items(): - if attr in valid_attrs: - bbox_attr_stats = bbox_dist_by_attr[label_name][attr] - bbox_val_stats = bbox_attr_stats[str(value)] - - for prop, val in ann_bbox_info.items(): - prop_stats = bbox_val_stats[prop] - items_far_from_mean = \ - prop_stats['items_far_from_mean'] - mean = prop_stats['mean'] - stdev = prop_stats['stdev'] - - if _far_from_mean(val, mean, stdev): - bboxs_far_from_mean = \ - items_far_from_mean.setdefault( - (item.id, 
item.subset), {}) - bboxs_far_from_mean[ann.id] = val - for category in label_categories: defined_label_dist[category.name] = 0 + filtered_anns = [] for item in dataset: - ann_count = [ann.type == self.ann_type - for ann in item.annotations].count(True) - - if self.task_type == TaskType.classification: - if ann_count == 0: - stats['items_missing_label'].append((item.id, item.subset)) - elif ann_count > 1: - stats['items_with_multiple_labels'].append( - (item.id, item.subset)) - stats['total_label_count'] += ann_count - - elif self.task_type == TaskType.detection: - if ann_count < 1: - stats['items_missing_bbox'].append((item.id, item.subset)) - stats['total_bbox_count'] += ann_count - bbox_dist_in_item[(item.id, item.subset)] = ann_count - + item_key = (item.id, item.subset) + annotations = [] for ann in item.annotations: - if ann.type == self.ann_type: - if not 0 <= ann.label < len(label_categories): - label_name = ann.label - - label_stats = undefined_label_dist.setdefault( - ann.label, deepcopy(undefined_label_template)) - label_stats['items_with_undefined_label'].append( - (item.id, item.subset)) - - label_stats['count'] += 1 - valid_attrs = set() - missing_attrs = set() - else: - label_name = label_categories[ann.label].name - defined_label_dist[label_name] += 1 + if ann.type in self.ann_types: + annotations.append(ann) + ann_count = len(annotations) + filtered_anns.append((item_key, annotations)) + + if ann_count == 0: + stats['items_missing_annotation'].append(item_key) + stats['total_ann_count'] += ann_count + + for ann in annotations: + if not 0 <= ann.label < len(label_categories): + label_name = ann.label + + label_stats = undefined_label_dist.setdefault( + ann.label, deepcopy(undefined_label_template)) + label_stats['items_with_undefined_label'].append( + item_key) + + label_stats['count'] += 1 + valid_attrs = set() + missing_attrs = set() + else: + label_name = label_categories[ann.label].name + defined_label_dist[label_name] += 1 - defined_attr_stats 
= defined_attr_dist.setdefault( - label_name, {}) + defined_attr_stats = defined_attr_dist.setdefault( + label_name, {}) - valid_attrs = base_valid_attrs.union( - label_categories[ann.label].attributes) - ann_attrs = getattr(ann, 'attributes', {}).keys() - missing_attrs = valid_attrs.difference(ann_attrs) + valid_attrs = base_valid_attrs.union( + label_categories[ann.label].attributes) + ann_attrs = getattr(ann, 'attributes', {}).keys() + missing_attrs = valid_attrs.difference(ann_attrs) - for attr in valid_attrs: - defined_attr_stats.setdefault( - attr, deepcopy(defined_attr_template)) + for attr in valid_attrs: + defined_attr_stats.setdefault( + attr, deepcopy(defined_attr_template)) - if self.task_type == TaskType.detection: - bbox_label_stats = bbox_dist_by_label.setdefault( - label_name, deepcopy(bbox_template)) - ann_bbox_info, bbox_has_error = \ - _update_bbox_stats_by_label( - item, ann, bbox_label_stats) + for attr in missing_attrs: + attr_dets = defined_attr_stats[attr] + attr_dets['items_missing_attribute'].append( + item_key) - for attr in missing_attrs: + for attr, value in ann.attributes.items(): + if attr not in valid_attrs: + undefined_attr_stats = \ + undefined_attr_dist.setdefault( + label_name, {}) + attr_dets = undefined_attr_stats.setdefault( + attr, deepcopy(undefined_attr_template)) + attr_dets['items_with_undefined_attr'].append( + item_key) + else: attr_dets = defined_attr_stats[attr] - attr_dets['items_missing_attribute'].append( - (item.id, item.subset)) - - for attr, value in ann.attributes.items(): - if attr not in valid_attrs: - undefined_attr_stats = \ - undefined_attr_dist.setdefault( - label_name, {}) - attr_dets = undefined_attr_stats.setdefault( - attr, deepcopy(undefined_attr_template)) - attr_dets['items_with_undefined_attr'].append( - (item.id, item.subset)) - else: - attr_dets = defined_attr_stats[attr] - - if self.task_type == TaskType.detection and \ - ann.type == self.ann_type: - bbox_attr_label = 
bbox_dist_by_attr.setdefault( - label_name, {}) - bbox_attr_stats = bbox_attr_label.setdefault( - attr, {}) - bbox_val_stats = bbox_attr_stats.setdefault( - str(value), deepcopy(bbox_template)) - - if not bbox_has_error: - _update_prop_distributions( - ann_bbox_info, bbox_val_stats) - - attr_dets['distribution'].setdefault(str(value), 0) - attr_dets['distribution'][str(value)] += 1 - - if self.task_type == TaskType.detection: - _compute_prop_stats_from_dist() - - for item in dataset: - for ann in item.annotations: - if _is_valid_bbox(item, ann): - _update_props_far_from_mean(item, ann) - return stats + attr_dets['distribution'].setdefault(str(value), 0) + attr_dets['distribution'][str(value)] += 1 + + return stats, filtered_anns + + @staticmethod + def _update_prop_distributions(curr_prop_stats, target_stats): + for prop, val in curr_prop_stats.items(): + prop_stats = target_stats[prop] + prop_dist = prop_stats['distribution'] + prop_stats['distribution'] = np.append(prop_dist, val) + + @staticmethod + def _compute_prop_stats_from_dist(dist_by_label, dist_by_attr): + for label_name, stats in dist_by_label.items(): + prop_stats_list = list(stats.values()) + attr_label = dist_by_attr.get(label_name, {}) + for vals in attr_label.values(): + for val_stats in vals.values(): + prop_stats_list += list(val_stats.values()) + + for prop_stats in prop_stats_list: + prop_dist = prop_stats.pop('distribution', []) + if len(prop_dist) > 0: + prop_stats['mean'] = np.mean(prop_dist) + prop_stats['stdev'] = np.std(prop_dist) + prop_stats['min'] = np.min(prop_dist) + prop_stats['max'] = np.max(prop_dist) + prop_stats['median'] = np.median(prop_dist) + + counts, bins = np.histogram(prop_dist) + prop_stats['histogram']['bins'] = bins.tolist() + prop_stats['histogram']['counts'] = counts.tolist() + + def _compute_far_from_mean(self, prop_stats, val, item_key, ann): + def _far_from_mean(val, mean, stdev): + thr = self.far_from_mean_thr + return val > mean + (thr * stdev) or val < mean - 
(thr * stdev) + + mean = prop_stats['mean'] + stdev = prop_stats['stdev'] + + if _far_from_mean(val, mean, stdev): + items_far_from_mean = prop_stats['items_far_from_mean'] + far_from_mean = items_far_from_mean.setdefault( + item_key, {}) + far_from_mean[ann.id] = val + + def compute_statistics(self, dataset): + """ + Computes statistics of the dataset based on task type. + + Parameters + ---------- + dataset : IDataset object + + Returns + ------- + stats (dict): A dict object containing statistics of the dataset. + """ + return NotImplementedError def _check_missing_label_categories(self, stats): validation_reports = [] @@ -410,6 +293,17 @@ def _check_missing_label_categories(self, stats): return validation_reports + def _check_missing_annotation(self, stats): + validation_reports = [] + + items_missing = stats['items_missing_annotation'] + for item_id, item_subset in items_missing: + validation_reports += self._generate_validation_report( + MissingAnnotation, Severity.warning, item_id, item_subset, + self.str_ann_type) + + return validation_reports + def _check_missing_attribute(self, label_name, attr_name, attr_dets): validation_reports = [] @@ -561,6 +455,114 @@ def _check_imbalanced_attribute(self, label_name, attr_name, attr_dets): return validation_reports + def _check_imbalanced_dist_in_label(self, label_name, label_stats): + validation_reports = [] + thr = self.dominance_thr + topk_ratio = self.topk_bins_ratio + + for prop, prop_stats in label_stats.items(): + value_counts = prop_stats['histogram']['counts'] + n_bucket = len(value_counts) + if n_bucket < 2: + continue + topk = max(1, int(np.around(n_bucket * topk_ratio))) + + if topk > 0: + topk_values = np.sort(value_counts)[-topk:] + ratio = np.sum(topk_values) / np.sum(value_counts) + if ratio >= thr: + details = (label_name, f"{self.str_ann_type} {prop}") + validation_reports += self._generate_validation_report( + ImbalancedDistInLabel, Severity.warning, *details) + + return validation_reports + + def 
_check_imbalanced_dist_in_attr(self, label_name, attr_name, attr_stats): + validation_reports = [] + thr = self.dominance_thr + topk_ratio = self.topk_bins_ratio + + for attr_value, value_stats in attr_stats.items(): + for prop, prop_stats in value_stats.items(): + value_counts = prop_stats['histogram']['counts'] + n_bucket = len(value_counts) + if n_bucket < 2: + continue + topk = max(1, int(np.around(n_bucket * topk_ratio))) + + if topk > 0: + topk_values = np.sort(value_counts)[-topk:] + ratio = np.sum(topk_values) / np.sum(value_counts) + if ratio >= thr: + details = (label_name, attr_name, attr_value, + f"{self.str_ann_type} {prop}") + validation_reports += self._generate_validation_report( + ImbalancedDistInAttribute, + Severity.warning, + *details + ) + + return validation_reports + + def _check_invalid_value(self, stats): + validation_reports = [] + + items_w_invalid_val = stats['items_with_invalid_value'] + for item_dets, anns_w_invalid_val in items_w_invalid_val.items(): + item_id, item_subset = item_dets + for ann_id, props in anns_w_invalid_val.items(): + for prop in props: + details = (item_subset, ann_id, + f"{self.str_ann_type} {prop}") + validation_reports += self._generate_validation_report( + InvalidValue, Severity.error, item_id, *details) + + return validation_reports + + def _check_far_from_label_mean(self, label_name, label_stats): + validation_reports = [] + + for prop, prop_stats in label_stats.items(): + items_far_from_mean = prop_stats['items_far_from_mean'] + if prop_stats['mean'] is not None: + mean = round(prop_stats['mean'], 2) + + for item_dets, anns_far in items_far_from_mean.items(): + item_id, item_subset = item_dets + for ann_id, val in anns_far.items(): + val = round(val, 2) + details = (item_subset, label_name, ann_id, + f"{self.str_ann_type} {prop}", mean, val) + validation_reports += self._generate_validation_report( + FarFromLabelMean, Severity.warning, item_id, *details) + + return validation_reports + + def 
_check_far_from_attr_mean(self, label_name, attr_name, attr_stats): + validation_reports = [] + + for attr_value, value_stats in attr_stats.items(): + for prop, prop_stats in value_stats.items(): + items_far_from_mean = prop_stats['items_far_from_mean'] + if prop_stats['mean'] is not None: + mean = round(prop_stats['mean'], 2) + + for item_dets, anns_far in items_far_from_mean.items(): + item_id, item_subset = item_dets + for ann_id, val in anns_far.items(): + val = round(val, 2) + details = (item_subset, label_name, ann_id, attr_name, + attr_value, f"{self.str_ann_type} {prop}", + mean, val) + validation_reports += self._generate_validation_report( + FarFromAttrMean, + Severity.warning, + item_id, + *details + ) + + return validation_reports + def generate_reports(self, stats): raise NotImplementedError('Should be implemented in a subclass.') @@ -573,18 +575,13 @@ class ClassificationValidator(_Validator): A validator class for classification tasks. """ - def __init__(self): - super().__init__(TaskType.classification, AnnotationType.label) - - def _check_missing_label_annotation(self, stats): - validation_reports = [] - - items_missing_label = stats['items_missing_label'] - for item_id, item_subset in items_missing_label: - validation_reports += self._generate_validation_report( - MissingLabelAnnotation, Severity.warning, item_id, item_subset) - - return validation_reports + def __init__(self, few_samples_thr, imbalance_ratio_thr, + far_from_mean_thr, dominance_ratio_thr, topk_bins): + super().__init__(task_type=TaskType.classification, + few_samples_thr=few_samples_thr, + imbalance_ratio_thr=imbalance_ratio_thr, + far_from_mean_thr=far_from_mean_thr, + dominance_ratio_thr=dominance_ratio_thr, topk_bins=topk_bins) def _check_multi_label_annotations(self, stats): validation_reports = [] @@ -596,6 +593,30 @@ def _check_multi_label_annotations(self, stats): return validation_reports + def compute_statistics(self, dataset): + """ + Computes statistics of the dataset 
for the classification task. + + Parameters + ---------- + dataset : IDataset object + + Returns + ------- + stats (dict): A dict object containing statistics of the dataset. + """ + + stats, filtered_anns = self._compute_common_statistics(dataset) + + stats['items_with_multiple_labels'] = [] + + for item_key, anns in filtered_anns: + ann_count = len(anns) + if ann_count > 1: + stats['items_with_multiple_labels'].append(item_key) + + return stats + def generate_reports(self, stats): """ Validates the dataset for classification tasks based on its statistics. @@ -613,7 +634,7 @@ def generate_reports(self, stats): reports = [] reports += self._check_missing_label_categories(stats) - reports += self._check_missing_label_annotation(stats) + reports += self._check_missing_annotation(stats) reports += self._check_multi_label_annotations(stats) reports += self._check_label_defined_but_not_found(stats) reports += self._check_only_one_label(stats) @@ -658,76 +679,13 @@ class DetectionValidator(_Validator): """ A validator class for detection tasks. 
""" - - DEFAULT_FAR_FROM_MEAN = 5.0 - DEFAULT_BBOX_IMBALANCE = 0.8 - DEFAULT_BBOX_TOPK_BINS = 0.1 - - def __init__(self): - super().__init__(TaskType.detection, AnnotationType.bbox, - far_from_mean_thr=self.DEFAULT_FAR_FROM_MEAN) - self.bbox_imbalance_thr = self.DEFAULT_BBOX_IMBALANCE - self.bbox_topk_bins_ratio = self.DEFAULT_BBOX_TOPK_BINS - - def _check_imbalanced_bbox_dist_in_label(self, label_name, - bbox_label_stats): - validation_reports = [] - thr = self.bbox_imbalance_thr - topk_ratio = self.bbox_topk_bins_ratio - - for prop, prop_stats in bbox_label_stats.items(): - value_counts = prop_stats['histogram']['counts'] - n_bucket = len(value_counts) - if n_bucket < 2: - continue - topk = max(1, int(np.around(n_bucket * topk_ratio))) - - if topk > 0: - topk_values = np.sort(value_counts)[-topk:] - ratio = np.sum(topk_values) / np.sum(value_counts) - if ratio >= thr: - details = (label_name, prop) - validation_reports += self._generate_validation_report( - ImbalancedBboxDistInLabel, Severity.warning, *details) - - return validation_reports - - def _check_imbalanced_bbox_dist_in_attr(self, label_name, attr_name, - bbox_attr_stats): - validation_reports = [] - thr = self.bbox_imbalance_thr - topk_ratio = self.bbox_topk_bins_ratio - - for attr_value, value_stats in bbox_attr_stats.items(): - for prop, prop_stats in value_stats.items(): - value_counts = prop_stats['histogram']['counts'] - n_bucket = len(value_counts) - if n_bucket < 2: - continue - topk = max(1, int(np.around(n_bucket * topk_ratio))) - - if topk > 0: - topk_values = np.sort(value_counts)[-topk:] - ratio = np.sum(topk_values) / np.sum(value_counts) - if ratio >= thr: - details = (label_name, attr_name, attr_value, prop) - validation_reports += self._generate_validation_report( - ImbalancedBboxDistInAttribute, - Severity.warning, - *details - ) - - return validation_reports - - def _check_missing_bbox_annotation(self, stats): - validation_reports = [] - - items_missing_bbox = 
stats['items_missing_bbox'] - for item_id, item_subset in items_missing_bbox: - validation_reports += self._generate_validation_report( - MissingBboxAnnotation, Severity.warning, item_id, item_subset) - - return validation_reports + def __init__(self, few_samples_thr, imbalance_ratio_thr, + far_from_mean_thr, dominance_ratio_thr, topk_bins): + super().__init__(task_type=TaskType.detection, + few_samples_thr=few_samples_thr, + imbalance_ratio_thr=imbalance_ratio_thr, + far_from_mean_thr=far_from_mean_thr, + dominance_ratio_thr=dominance_ratio_thr, topk_bins=topk_bins) def _check_negative_length(self, stats): validation_reports = [] @@ -738,67 +696,193 @@ def _check_negative_length(self, stats): for ann_id, props in anns_w_neg_len.items(): for prop, val in props.items(): val = round(val, 2) - details = (item_subset, ann_id, prop, val) + details = (item_subset, ann_id, + f"{self.str_ann_type} {prop}", val) validation_reports += self._generate_validation_report( NegativeLength, Severity.error, item_id, *details) return validation_reports - def _check_invalid_value(self, stats): - validation_reports = [] + def compute_statistics(self, dataset): + """ + Computes statistics of the dataset for the detection task. + Parameters + ---------- + dataset : IDataset object + + Returns + ------- + stats (dict): A dict object containing statistics of the dataset. 
+ """ + + stats, filtered_anns = self._compute_common_statistics(dataset) + + # detection-specific + bbox_template = { + 'width': deepcopy(self.numerical_stat_template), + 'height': deepcopy(self.numerical_stat_template), + 'area(wxh)': deepcopy(self.numerical_stat_template), + 'ratio(w/h)': deepcopy(self.numerical_stat_template), + 'short': deepcopy(self.numerical_stat_template), + 'long': deepcopy(self.numerical_stat_template) + } + + stats['items_with_negative_length'] = {} + stats['items_with_invalid_value'] = {} + stats['bbox_distribution_in_label'] = {} + stats['bbox_distribution_in_attribute'] = {} + stats['bbox_distribution_in_dataset_item'] = {} + + dist_by_label = stats['bbox_distribution_in_label'] + dist_by_attr = stats['bbox_distribution_in_attribute'] + bbox_dist_in_item = stats['bbox_distribution_in_dataset_item'] + items_w_neg_len = stats['items_with_negative_length'] items_w_invalid_val = stats['items_with_invalid_value'] - for item_dets, anns_w_invalid_val in items_w_invalid_val.items(): - item_id, item_subset = item_dets - for ann_id, props in anns_w_invalid_val.items(): - for prop in props: - details = (item_subset, ann_id, prop) - validation_reports += self._generate_validation_report( - InvalidValue, Severity.error, item_id, *details) - return validation_reports + def _generate_ann_bbox_info(_x, _y, _w, _h, area, + ratio, _short, _long): + return { + 'x': _x, + 'y': _y, + 'width': _w, + 'height': _h, + 'area(wxh)': area, + 'ratio(w/h)': ratio, + 'short': _short, + 'long': _long, + } - def _check_far_from_label_mean(self, label_name, bbox_label_stats): - validation_reports = [] + def _update_bbox_stats_by_label(item_key, ann, bbox_label_stats): + bbox_has_error = False - for prop, prop_stats in bbox_label_stats.items(): - items_far_from_mean = prop_stats['items_far_from_mean'] - if prop_stats['mean'] is not None: - mean = round(prop_stats['mean'], 2) + _x, _y, _w, _h = ann.get_bbox() + area = ann.get_area() - for item_dets, anns_far in 
items_far_from_mean.items(): - item_id, item_subset = item_dets - for ann_id, val in anns_far.items(): - val = round(val, 2) - details = (item_subset, label_name, ann_id, prop, mean, val) - validation_reports += self._generate_validation_report( - FarFromLabelMean, Severity.warning, item_id, *details) + if _h != 0 and _h != float('inf'): + ratio = _w / _h + else: + ratio = float('nan') + + _short = _w if _w < _h else _h + _long = _w if _w > _h else _h + + ann_bbox_info = _generate_ann_bbox_info( + _x, _y, _w, _h, area, ratio, _short, _long) + + for prop, val in ann_bbox_info.items(): + if val == float('inf') or np.isnan(val): + bbox_has_error = True + anns_w_invalid_val = items_w_invalid_val.setdefault( + item_key, {}) + invalid_props = anns_w_invalid_val.setdefault( + ann.id, []) + invalid_props.append(prop) + + for prop in ['width', 'height']: + val = ann_bbox_info[prop] + if val < 1: + bbox_has_error = True + anns_w_neg_len = items_w_neg_len.setdefault( + item_key, {}) + neg_props = anns_w_neg_len.setdefault(ann.id, {}) + neg_props[prop] = val + + if not bbox_has_error: + ann_bbox_info.pop('x') + ann_bbox_info.pop('y') + self._update_prop_distributions(ann_bbox_info, bbox_label_stats) - return validation_reports + return ann_bbox_info, bbox_has_error - def _check_far_from_attr_mean(self, label_name, attr_name, bbox_attr_stats): - validation_reports = [] + label_categories = dataset.categories().get(AnnotationType.label, + LabelCategories()) + base_valid_attrs = label_categories.attributes - for attr_value, value_stats in bbox_attr_stats.items(): - for prop, prop_stats in value_stats.items(): - items_far_from_mean = prop_stats['items_far_from_mean'] - if prop_stats['mean'] is not None: - mean = round(prop_stats['mean'], 2) + for item_key, annotations in filtered_anns: + ann_count = len(annotations) - for item_dets, anns_far in items_far_from_mean.items(): - item_id, item_subset = item_dets - for ann_id, val in anns_far.items(): - val = round(val, 2) - details = 
(item_subset, label_name, ann_id, attr_name, - attr_value, prop, mean, val) - validation_reports += self._generate_validation_report( - FarFromAttrMean, - Severity.warning, - item_id, - *details - ) + bbox_dist_in_item[item_key] = ann_count - return validation_reports + for ann in annotations: + if not 0 <= ann.label < len(label_categories): + label_name = ann.label + valid_attrs = set() + else: + label_name = label_categories[ann.label].name + valid_attrs = base_valid_attrs.union( + label_categories[ann.label].attributes) + + bbox_label_stats = dist_by_label.setdefault( + label_name, deepcopy(bbox_template)) + ann_bbox_info, bbox_has_error = \ + _update_bbox_stats_by_label( + item_key, ann, bbox_label_stats) + + for attr, value in ann.attributes.items(): + if attr in valid_attrs: + bbox_attr_label = dist_by_attr.setdefault( + label_name, {}) + bbox_attr_stats = bbox_attr_label.setdefault( + attr, {}) + bbox_val_stats = bbox_attr_stats.setdefault( + str(value), deepcopy(bbox_template)) + + if not bbox_has_error: + self._update_prop_distributions( + ann_bbox_info, bbox_val_stats) + + # Compute prop stats from distribution + self._compute_prop_stats_from_dist(dist_by_label, dist_by_attr) + + def _is_valid_ann(item_key, ann): + has_defined_label = 0 <= ann.label < len(label_categories) + if not has_defined_label: + return False + + bbox_has_neg_len = ann.id in items_w_neg_len.get( + item_key, {}) + bbox_has_invalid_val = ann.id in items_w_invalid_val.get( + item_key, {}) + return not (bbox_has_neg_len or bbox_has_invalid_val) + + def _update_props_far_from_mean(item_key, ann): + valid_attrs = base_valid_attrs.union( + label_categories[ann.label].attributes) + label_name = label_categories[ann.label].name + bbox_label_stats = dist_by_label[label_name] + + _x, _y, _w, _h = ann.get_bbox() + area = ann.get_area() + ratio = _w / _h + _short = _w if _w < _h else _h + _long = _w if _w > _h else _h + + ann_bbox_info = _generate_ann_bbox_info( + _x, _y, _w, _h, area, ratio, 
_short, _long) + ann_bbox_info.pop('x') + ann_bbox_info.pop('y') + + for prop, val in ann_bbox_info.items(): + prop_stats = bbox_label_stats[prop] + self._compute_far_from_mean(prop_stats, val, item_key, ann) + + for attr, value in ann.attributes.items(): + if attr in valid_attrs: + bbox_attr_stats = dist_by_attr[label_name][attr] + bbox_val_stats = bbox_attr_stats[str(value)] + + for prop, val in ann_bbox_info.items(): + prop_stats = bbox_val_stats[prop] + self._compute_far_from_mean(prop_stats, val, + item_key, ann) + + for item_key, annotations in filtered_anns: + for ann in annotations: + if _is_valid_ann(item_key, ann): + _update_props_far_from_mean(item_key, ann) + + return stats def generate_reports(self, stats): """ @@ -817,7 +901,7 @@ def generate_reports(self, stats): reports = [] reports += self._check_missing_label_categories(stats) - reports += self._check_missing_bbox_annotation(stats) + reports += self._check_missing_annotation(stats) reports += self._check_label_defined_but_not_found(stats) reports += self._check_only_one_label(stats) reports += self._check_few_samples_in_label(stats) @@ -831,8 +915,8 @@ def generate_reports(self, stats): undefined_label_dist = label_dist['undefined_labels'] undefined_attr_dist = attr_dist['undefined_attributes'] - bbox_dist_by_label = stats['bbox_distribution_in_label'] - bbox_dist_by_attr = stats['bbox_distribution_in_attribute'] + dist_by_label = stats['bbox_distribution_in_label'] + dist_by_attr = stats['bbox_distribution_in_attribute'] defined_labels = defined_attr_dist.keys() for label_name in defined_labels: @@ -851,18 +935,18 @@ def generate_reports(self, stats): reports += self._check_missing_attribute( label_name, attr_name, attr_dets) - bbox_label_stats = bbox_dist_by_label[label_name] - bbox_attr_label = bbox_dist_by_attr.get(label_name, {}) + bbox_label_stats = dist_by_label[label_name] + bbox_attr_label = dist_by_attr.get(label_name, {}) reports += self._check_far_from_label_mean( label_name, 
bbox_label_stats) - reports += self._check_imbalanced_bbox_dist_in_label( + reports += self._check_imbalanced_dist_in_label( label_name, bbox_label_stats) for attr_name, bbox_attr_stats in bbox_attr_label.items(): reports += self._check_far_from_attr_mean( label_name, attr_name, bbox_attr_stats) - reports += self._check_imbalanced_bbox_dist_in_attr( + reports += self._check_imbalanced_dist_in_attr( label_name, attr_name, bbox_attr_stats) for label_name, label_stats in undefined_label_dist.items(): @@ -876,14 +960,256 @@ def generate_reports(self, stats): return reports -def validate_annotations(dataset: IDataset, task_type: Union[str, TaskType]): +class SegmentationValidator(_Validator): + """ + A validator class for (instance) segmentation tasks. + """ + + def __init__(self, few_samples_thr, imbalance_ratio_thr, + far_from_mean_thr, dominance_ratio_thr, topk_bins): + super().__init__(task_type=TaskType.segmentation, + few_samples_thr=few_samples_thr, + imbalance_ratio_thr=imbalance_ratio_thr, + far_from_mean_thr=far_from_mean_thr, + dominance_ratio_thr=dominance_ratio_thr, topk_bins=topk_bins) + + def compute_statistics(self, dataset): + """ + Computes statistics of the dataset for the segmentation task. + + Parameters + ---------- + dataset : IDataset object + + Returns + ------- + stats (dict): A dict object containing statistics of the dataset. 
+ """ + + stats, filtered_anns = self._compute_common_statistics(dataset) + + # segmentation-specific + mask_template = { + 'area': deepcopy(self.numerical_stat_template), + 'width': deepcopy(self.numerical_stat_template), + 'height': deepcopy(self.numerical_stat_template) + } + + stats['items_with_invalid_value'] = {} + stats['mask_distribution_in_label'] = {} + stats['mask_distribution_in_attribute'] = {} + stats['mask_distribution_in_dataset_item'] = {} + + dist_by_label = stats['mask_distribution_in_label'] + dist_by_attr = stats['mask_distribution_in_attribute'] + mask_dist_in_item = stats['mask_distribution_in_dataset_item'] + items_w_invalid_val = stats['items_with_invalid_value'] + + def _generate_ann_mask_info(area, _w, _h): + return { + 'area': area, + 'width': _w, + 'height': _h, + } + + def _update_mask_stats_by_label(item_key, ann, mask_label_stats): + mask_has_error = False + + _x, _y, _w, _h = ann.get_bbox() + + # Detete the following block when #226 is resolved + # https://github.com/openvinotoolkit/datumaro/issues/226 + if ann.type == AnnotationType.mask: + _w += 1 + _h += 1 + + area = ann.get_area() + + ann_mask_info = _generate_ann_mask_info(area, _w, _h) + + for prop, val in ann_mask_info.items(): + if val == float('inf') or np.isnan(val): + mask_has_error = True + anns_w_invalid_val = items_w_invalid_val.setdefault( + item_key, {}) + invalid_props = anns_w_invalid_val.setdefault( + ann.id, []) + invalid_props.append(prop) + + if not mask_has_error: + self._update_prop_distributions(ann_mask_info, mask_label_stats) + + return ann_mask_info, mask_has_error + + label_categories = dataset.categories().get(AnnotationType.label, + LabelCategories()) + base_valid_attrs = label_categories.attributes + + for item_key, annotations in filtered_anns: + ann_count = len(annotations) + mask_dist_in_item[item_key] = ann_count + + for ann in annotations: + if not 0 <= ann.label < len(label_categories): + label_name = ann.label + valid_attrs = set() + else: + 
label_name = label_categories[ann.label].name + valid_attrs = base_valid_attrs.union( + label_categories[ann.label].attributes) + + mask_label_stats = dist_by_label.setdefault( + label_name, deepcopy(mask_template)) + ann_mask_info, mask_has_error = \ + _update_mask_stats_by_label( + item_key, ann, mask_label_stats) + + for attr, value in ann.attributes.items(): + if attr in valid_attrs: + mask_attr_label = dist_by_attr.setdefault( + label_name, {}) + mask_attr_stats = mask_attr_label.setdefault( + attr, {}) + mask_val_stats = mask_attr_stats.setdefault( + str(value), deepcopy(mask_template)) + + if not mask_has_error: + self._update_prop_distributions( + ann_mask_info, mask_val_stats) + + # compute prop stats from dist. + self._compute_prop_stats_from_dist(dist_by_label, dist_by_attr) + + def _is_valid_ann(item_key, ann): + has_defined_label = 0 <= ann.label < len(label_categories) + if not has_defined_label: + return False + + mask_has_invalid_val = ann.id in items_w_invalid_val.get( + item_key, {}) + return not mask_has_invalid_val + + def _update_props_far_from_mean(item_key, ann): + valid_attrs = base_valid_attrs.union( + label_categories[ann.label].attributes) + label_name = label_categories[ann.label].name + mask_label_stats = dist_by_label[label_name] + + _x, _y, _w, _h = ann.get_bbox() + + # Detete the following block when #226 is resolved + # https://github.com/openvinotoolkit/datumaro/issues/226 + if ann.type == AnnotationType.mask: + _w += 1 + _h += 1 + area = ann.get_area() + + ann_mask_info = _generate_ann_mask_info(area, _w, _h) + + for prop, val in ann_mask_info.items(): + prop_stats = mask_label_stats[prop] + self._compute_far_from_mean(prop_stats, val, item_key, ann) + + for attr, value in ann.attributes.items(): + if attr in valid_attrs: + mask_attr_stats = dist_by_attr[label_name][attr] + mask_val_stats = mask_attr_stats[str(value)] + + for prop, val in ann_mask_info.items(): + prop_stats = mask_val_stats[prop] + 
self._compute_far_from_mean(prop_stats, val, + item_key, ann) + + for item_key, annotations in filtered_anns: + for ann in annotations: + if _is_valid_ann(item_key, ann): + _update_props_far_from_mean(item_key, ann) + + return stats + + def generate_reports(self, stats): + """ + Validates the dataset for segmentation tasks based on its statistics. + + Parameters + ---------- + dataset : IDataset object + stats : Dict object + + Returns + ------- + reports (list): List of validation reports (DatasetValidationError). + """ + + reports = [] + + reports += self._check_missing_label_categories(stats) + reports += self._check_missing_annotation(stats) + reports += self._check_label_defined_but_not_found(stats) + reports += self._check_only_one_label(stats) + reports += self._check_few_samples_in_label(stats) + reports += self._check_imbalanced_labels(stats) + reports += self._check_invalid_value(stats) + + label_dist = stats['label_distribution'] + attr_dist = stats['attribute_distribution'] + defined_attr_dist = attr_dist['defined_attributes'] + undefined_label_dist = label_dist['undefined_labels'] + undefined_attr_dist = attr_dist['undefined_attributes'] + + dist_by_label = stats['mask_distribution_in_label'] + dist_by_attr = stats['mask_distribution_in_attribute'] + + defined_labels = defined_attr_dist.keys() + for label_name in defined_labels: + attr_stats = defined_attr_dist[label_name] + + reports += self._check_attribute_defined_but_not_found( + label_name, attr_stats) + + for attr_name, attr_dets in attr_stats.items(): + reports += self._check_few_samples_in_attribute( + label_name, attr_name, attr_dets) + reports += self._check_imbalanced_attribute( + label_name, attr_name, attr_dets) + reports += self._check_only_one_attribute_value( + label_name, attr_name, attr_dets) + reports += self._check_missing_attribute( + label_name, attr_name, attr_dets) + + mask_label_stats = dist_by_label[label_name] + mask_attr_label = dist_by_attr.get(label_name, {}) + + reports 
+= self._check_far_from_label_mean( + label_name, mask_label_stats) + reports += self._check_imbalanced_dist_in_label( + label_name, mask_label_stats) + + for attr_name, mask_attr_stats in mask_attr_label.items(): + reports += self._check_far_from_attr_mean( + label_name, attr_name, mask_attr_stats) + reports += self._check_imbalanced_dist_in_attr( + label_name, attr_name, mask_attr_stats) + + for label_name, label_stats in undefined_label_dist.items(): + reports += self._check_undefined_label(label_name, label_stats) + + for label_name, attr_stats in undefined_attr_dist.items(): + for attr_name, attr_dets in attr_stats.items(): + reports += self._check_undefined_attribute( + label_name, attr_name, attr_dets) + + return reports + + +def validate_annotations(dataset: IDataset, task_type: Union[str, TaskType], **extra_args): """ Returns the validation results of a dataset based on task type. Args: dataset (IDataset): Dataset to be validated task_type (str or TaskType): Type of the task - (classification, detection etc.) 
+ (classification, detection, segmentation) Raises: ValueError @@ -894,13 +1220,33 @@ def validate_annotations(dataset: IDataset, task_type: Union[str, TaskType]): """ + few_samples_thr = extra_args['few_samples_thr'] + imbalance_ratio_thr = extra_args['imbalance_ratio_thr'] + far_from_mean_thr = extra_args['far_from_mean_thr'] + dominance_ratio_thr = extra_args['dominance_ratio_thr'] + topk_bins = extra_args['topk_bins'] + validation_results = {} task_type = parse_str_enum_value(task_type, TaskType) if task_type == TaskType.classification: - validator = ClassificationValidator() + validator = ClassificationValidator(few_samples_thr=few_samples_thr, + imbalance_ratio_thr=imbalance_ratio_thr, + far_from_mean_thr=far_from_mean_thr, + dominance_ratio_thr=dominance_ratio_thr, + topk_bins=topk_bins) elif task_type == TaskType.detection: - validator = DetectionValidator() + validator = DetectionValidator(few_samples_thr=few_samples_thr, + imbalance_ratio_thr=imbalance_ratio_thr, + far_from_mean_thr=far_from_mean_thr, + dominance_ratio_thr=dominance_ratio_thr, + topk_bins=topk_bins) + elif task_type == TaskType.segmentation: + validator = SegmentationValidator(few_samples_thr=few_samples_thr, + imbalance_ratio_thr=imbalance_ratio_thr, + far_from_mean_thr=far_from_mean_thr, + dominance_ratio_thr=dominance_ratio_thr, + topk_bins=topk_bins) if not isinstance(dataset, IDataset): raise TypeError("Invalid dataset type '%s'" % type(dataset)) diff --git a/datumaro/plugins/accuracy_checker_plugin/details/ac.py b/datumaro/plugins/accuracy_checker_plugin/details/ac.py index 4fc2ffb5c696..b235e5784869 100644 --- a/datumaro/plugins/accuracy_checker_plugin/details/ac.py +++ b/datumaro/plugins/accuracy_checker_plugin/details/ac.py @@ -1,10 +1,10 @@ -# Copyright (C) 2020 Intel Corporation +# Copyright (C) 2020-2021 Intel Corporation # # SPDX-License-Identifier: MIT from datumaro.util.tf_util import import_tf -import_tf() # prevent TF loading and potential interpeter crash +import_tf() # 
prevent TF loading and potential interpreter crash from itertools import groupby diff --git a/datumaro/plugins/accuracy_checker_plugin/details/representation.py b/datumaro/plugins/accuracy_checker_plugin/details/representation.py index d7007806bfde..76da49ee0a56 100644 --- a/datumaro/plugins/accuracy_checker_plugin/details/representation.py +++ b/datumaro/plugins/accuracy_checker_plugin/details/representation.py @@ -1,10 +1,10 @@ -# Copyright (C) 2020 Intel Corporation +# Copyright (C) 2020-2021 Intel Corporation # # SPDX-License-Identifier: MIT from datumaro.util.tf_util import import_tf -import_tf() # prevent TF loading and potential interpeter crash +import_tf() # prevent TF loading and potential interpreter crash import accuracy_checker.representation as ac diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index 76de81819675..0abca1751b5a 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -218,6 +218,16 @@ def find_sources(cls, path): class CamvidConverter(Converter): DEFAULT_IMAGE_EXT = CamvidPath.IMAGE_EXT + @staticmethod + def _get_labelmap(s): + if osp.isfile(s): + return s + try: + return LabelmapType[s].name + except KeyError: + import argparse + raise argparse.ArgumentTypeError() + @classmethod def build_cmdline_parser(cls, **kwargs): parser = super().build_cmdline_parser(**kwargs) diff --git a/datumaro/plugins/cifar_format.py b/datumaro/plugins/cifar_format.py new file mode 100644 index 000000000000..b6f3a01a2343 --- /dev/null +++ b/datumaro/plugins/cifar_format.py @@ -0,0 +1,183 @@ +# Copyright (C) 2020-2021 Intel Corporation +# +# SPDX-License-Identifier: MIT + +import os +import os.path as osp +import pickle + +import numpy as np +from datumaro.components.converter import Converter +from datumaro.components.extractor import (AnnotationType, DatasetItem, + Importer, Label, LabelCategories, SourceExtractor) +from datumaro.util import cast + + +class CifarPath: + BATCHES_META = 
'batches.meta' + TRAIN_ANNOTATION_FILE = 'data_batch_' + IMAGES_DIR = 'images' + IMAGE_SIZE = 32 + +CifarLabel = ['airplane', 'automobile', 'bird', 'cat', + 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'] + +# Support for Python version CIFAR-10/100 + +class CifarExtractor(SourceExtractor): + def __init__(self, path, subset=None): + if not osp.isfile(path): + raise FileNotFoundError("Can't read annotation file '%s'" % path) + + if not subset: + file_name = osp.splitext(osp.basename(path))[0] + if file_name.startswith(CifarPath.TRAIN_ANNOTATION_FILE): + subset = 'train_%s' % file_name.split('_')[-1] + else: + subset = file_name.rsplit('_', maxsplit=1)[0] + + super().__init__(subset=subset) + + batches_meta_file = osp.join(osp.dirname(path), CifarPath.BATCHES_META) + self._categories = self._load_categories(batches_meta_file) + + self._items = list(self._load_items(path).values()) + + def _load_categories(self, path): + label_cat = LabelCategories() + + if osp.isfile(path): + # num_cases_per_batch: 1000 + # label_names: ['airplane', 'automobile', 'bird', 'cat', 'deer', + # 'dog', 'frog', 'horse', 'ship', 'truck'] + # num_vis: 3072 + with open(path, 'rb') as labels_file: + data = pickle.load(labels_file) + for label in data['label_names']: + label_cat.add(label) + else: + for label in CifarLabel: + label_cat.add(label) + + return { AnnotationType.label: label_cat } + + def _load_items(self, path): + items = {} + + # 'batch_label': 'training batch 1 of 5' + # 'data': ndarray + # 'filenames': list + # 'labels': list + with open(path, 'rb') as anno_file: + annotation_dict = pickle.load(anno_file) + + labels = annotation_dict.get('labels', []) + filenames = annotation_dict.get('filenames', []) + images_data = annotation_dict.get('data') + size = annotation_dict.get('image_sizes') + + if len(labels) != len(filenames): + raise Exception("The sizes of the arrays 'filenames', " \ + "'labels' don't match.") + + if 0 < len(images_data) and len(images_data) != len(filenames): + 
raise Exception("The sizes of the arrays 'data', " \ + "'filenames', 'labels' don't match.") + + for i, (filename, label) in enumerate(zip(filenames, labels)): + item_id = osp.splitext(filename)[0] + annotations = [] + if label != None: + annotations.append(Label(label)) + + image = None + if 0 < len(images_data): + image = images_data[i] + if size is not None and image is not None: + image = image.reshape(size[i][0], + size[i][1], 3).astype(np.uint8) + elif image is not None: + image = image.reshape(CifarPath.IMAGE_SIZE, + CifarPath.IMAGE_SIZE, 3).astype(np.uint8) + + items[item_id] = DatasetItem(id=item_id, subset=self._subset, + image=image, annotations=annotations) + + return items + + +class CifarImporter(Importer): + @classmethod + def find_sources(cls, path): + return cls._find_sources_recursive(path, '', 'cifar', + file_filter=lambda p: osp.basename(p) not in + {CifarPath.BATCHES_META, CifarPath.IMAGES_DIR}) + + +class CifarConverter(Converter): + DEFAULT_IMAGE_EXT = '.png' + + def apply(self): + os.makedirs(self._save_dir, exist_ok=True) + + label_categories = self._extractor.categories()[AnnotationType.label] + label_names = [] + for label in label_categories: + label_names.append(label.name) + labels_dict = { 'label_names': label_names } + batches_meta_file = osp.join(self._save_dir, CifarPath.BATCHES_META) + with open(batches_meta_file, 'wb') as labels_file: + pickle.dump(labels_dict, labels_file) + + for subset_name, subset in self._extractor.subsets().items(): + filenames = [] + labels = [] + data = [] + image_sizes = {} + for item in subset: + filenames.append(item.id + self._find_image_ext(item)) + + anns = [a.label for a in item.annotations + if a.type == AnnotationType.label] + label = None + if anns: + label = anns[0] + labels.append(label) + + if item.has_image and self._save_images: + image = item.image + if not image.has_data: + data.append(None) + else: + image = image.data + data.append(image.reshape(-1).astype(np.uint8)) + if image.shape[0] 
!= CifarPath.IMAGE_SIZE or \ + image.shape[1] != CifarPath.IMAGE_SIZE: + image_sizes[len(data) - 1] = (image.shape[0], image.shape[1]) + + annotation_dict = {} + annotation_dict['filenames'] = filenames + annotation_dict['labels'] = labels + annotation_dict['data'] = np.array(data, dtype=object) + if len(image_sizes): + size = (CifarPath.IMAGE_SIZE, CifarPath.IMAGE_SIZE) + # 'image_sizes' isn't included in the standard format, + # needed for different image sizes + annotation_dict['image_sizes'] = [image_sizes.get(p, size) + for p in range(len(data))] + + filename = '%s_batch' % subset_name + batch_label = None + if subset_name.startswith('train_') and \ + cast(subset_name.split('_')[1], int) is not None: + num = subset_name.split('_')[1] + filename = CifarPath.TRAIN_ANNOTATION_FILE + num + batch_label = 'training batch %s of 5' % (num, ) + if subset_name == 'test': + batch_label = 'testing batch 1 of 1' + if batch_label: + annotation_dict['batch_label'] = batch_label + + annotation_file = osp.join(self._save_dir, filename) + with open(annotation_file, 'wb') as labels_file: + pickle.dump(annotation_dict, labels_file) diff --git a/datumaro/plugins/cityscapes_format.py b/datumaro/plugins/cityscapes_format.py new file mode 100644 index 000000000000..34aca8bd1cbf --- /dev/null +++ b/datumaro/plugins/cityscapes_format.py @@ -0,0 +1,357 @@ + +# Copyright (C) 2020 Intel Corporation +# +# SPDX-License-Identifier: MIT + +import logging as log +import os +import os.path as osp +from collections import OrderedDict +from enum import Enum +from glob import iglob + +import numpy as np + +from datumaro.components.converter import Converter +from datumaro.components.extractor import (AnnotationType, CompiledMask, + DatasetItem, Importer, LabelCategories, Mask, + MaskCategories, SourceExtractor) +from datumaro.util import str_to_bool +from datumaro.util.annotation_util import make_label_id_mapping +from datumaro.util.image import save_image, load_image +from 
datumaro.util.mask_tools import generate_colormap, paint_mask + + +CityscapesLabelMap = OrderedDict([ + ('unlabeled', (0, 0, 0)), + ('egovehicle', (0, 0, 0)), + ('rectificationborder', (0, 0, 0)), + ('outofroi', (0, 0, 0)), + ('static', (0, 0, 0)), + ('dynamic', (111, 74, 0)), + ('ground', (81, 0, 81)), + ('road', (128, 64, 128)), + ('sidewalk', (244, 35, 232)), + ('parking', (250, 170, 160)), + ('railtrack', (230, 150, 140)), + ('building', (70, 70, 70)), + ('wall', (102, 102, 156)), + ('fence', (190, 153, 153)), + ('guardrail', (180, 165, 180)), + ('bridge', (150, 100, 100)), + ('tunnel', (150, 120, 90)), + ('pole', (153, 153, 153)), + ('polegroup', (153, 153, 153)), + ('trafficlight', (250, 170, 30)), + ('trafficsign', (220, 220, 0)), + ('vegetation', (107, 142, 35)), + ('terrain', (152, 251, 152)), + ('sky', (70, 130, 180)), + ('person', (220, 20, 60)), + ('rider', (255, 0, 0)), + ('car', (0, 0, 142)), + ('truck', (0, 0, 70)), + ('bus', (0, 60, 100)), + ('caravan', (0, 0, 90)), + ('trailer', (0, 0, 110)), + ('train', (0, 80, 100)), + ('motorcycle', (0, 0, 230)), + ('bicycle', (119, 11, 32)), + ('licenseplate', (0, 0, 142)), +]) + +class CityscapesPath: + GT_FINE_DIR = 'gtFine' + IMGS_FINE_DIR = 'imgsFine' + ORIGINAL_IMAGE_DIR = 'leftImg8bit' + ORIGINAL_IMAGE = '_%s.png' % ORIGINAL_IMAGE_DIR + INSTANCES_IMAGE = '_instanceIds.png' + COLOR_IMAGE = '_color.png' + LABELIDS_IMAGE = '_labelIds.png' + + LABELMAP_FILE = 'label_colors.txt' + +def make_cityscapes_categories(label_map=None): + if label_map is None: + label_map = CityscapesLabelMap + + categories = {} + label_categories = LabelCategories() + for label in label_map: + label_categories.add(label) + categories[AnnotationType.label] = label_categories + + has_colors = any(v is not None for v in label_map.values()) + if not has_colors: # generate new colors + colormap = generate_colormap(len(label_map)) + else: # only copy defined colors + label_id = lambda label: label_categories.find(label)[0] + colormap = { 
label_id(name): (desc[0], desc[1], desc[2]) + for name, desc in label_map.items() } + mask_categories = MaskCategories(colormap) + mask_categories.inverse_colormap # pylint: disable=pointless-statement + categories[AnnotationType.mask] = mask_categories + return categories + +def parse_label_map(path): + if not path: + return None + + label_map = OrderedDict() + with open(path, 'r') as f: + for line in f: + # skip empty and commented lines + line = line.strip() + if not line or line and line[0] == '#': + continue + + # color, name + label_desc = line.strip().split() + + if 2 < len(label_desc): + name = label_desc[3] + color = tuple([int(c) for c in label_desc[:-1]]) + else: + name = label_desc[0] + color = None + + if name in label_map: + raise ValueError("Label '%s' is already defined" % name) + + label_map[name] = color + return label_map + +def write_label_map(path, label_map): + with open(path, 'w') as f: + for label_name, label_desc in label_map.items(): + if label_desc: + color_rgb = ' '.join(str(c) for c in label_desc) + else: + color_rgb = '' + f.write('%s %s\n' % (color_rgb, label_name)) + +class CityscapesExtractor(SourceExtractor): + def __init__(self, path, subset=None): + assert osp.isdir(path), path + self._path = path + + if not subset: + subset = osp.splitext(osp.basename(path))[0] + self._subset = subset + super().__init__(subset=subset) + + self._categories = self._load_categories(osp.join(self._path, '../../../')) + self._items = list(self._load_items().values()) + + def _load_categories(self, path): + label_map = None + label_map_path = osp.join(path, CityscapesPath.LABELMAP_FILE) + if osp.isfile(label_map_path): + label_map = parse_label_map(label_map_path) + else: + label_map = CityscapesLabelMap + self._labels = [label for label in label_map] + return make_cityscapes_categories(label_map) + + def _load_items(self): + items = {} + annotations_path = osp.normpath(osp.join(self._path, '../../../', + CityscapesPath.GT_FINE_DIR, self._subset)) + + 
for image_path in iglob( + osp.join(self._path, '**', '*' + CityscapesPath.ORIGINAL_IMAGE), + recursive=True): + sample_id = osp.relpath(image_path, self._path) \ + .replace(CityscapesPath.ORIGINAL_IMAGE, '') + anns = [] + instances_path = osp.join(annotations_path, sample_id + '_' + + CityscapesPath.GT_FINE_DIR + CityscapesPath.INSTANCES_IMAGE) + if osp.isfile(instances_path): + instances_mask = load_image(instances_path, dtype=np.int32) + segm_ids = np.unique(instances_mask) + for segm_id in segm_ids: + if segm_id < 1000: + semanticId = segm_id + isCrowd = True + ann_id = segm_id + else: + semanticId = segm_id // 1000 + isCrowd = False + ann_id = segm_id % 1000 + anns.append(Mask( + image=self._lazy_extract_mask(instances_mask, segm_id), + label=semanticId, id=ann_id, + attributes = { 'is_crowd': isCrowd })) + items[sample_id] = DatasetItem(id=sample_id, subset=self._subset, + image=image_path, annotations=anns) + return items + + @staticmethod + def _lazy_extract_mask(mask, c): + return lambda: mask == c + + +class CityscapesImporter(Importer): + @classmethod + def find_sources(cls, path): + return cls._find_sources_recursive(path, '', 'cityscapes', + dirname=osp.join(CityscapesPath.IMGS_FINE_DIR, + CityscapesPath.ORIGINAL_IMAGE_DIR), + max_depth=1) + + +LabelmapType = Enum('LabelmapType', ['cityscapes', 'source']) + +class CityscapesConverter(Converter): + DEFAULT_IMAGE_EXT = '.png' + + @staticmethod + def _get_labelmap(s): + if osp.isfile(s): + return s + try: + return LabelmapType[s].name + except KeyError: + import argparse + raise argparse.ArgumentTypeError() + + @classmethod + def build_cmdline_parser(cls, **kwargs): + parser = super().build_cmdline_parser(**kwargs) + + parser.add_argument('--apply-colormap', type=str_to_bool, default=True, + help="Use colormap for class masks (default: %(default)s)") + parser.add_argument('--label-map', type=cls._get_labelmap, default=None, + help="Labelmap file path or one of %s" % \ + ', '.join(t.name for t in 
LabelmapType)) + return parser + + def __init__(self, extractor, save_dir, + apply_colormap=True, label_map=None, **kwargs): + super().__init__(extractor, save_dir, **kwargs) + + self._apply_colormap = apply_colormap + + if label_map is None: + label_map = LabelmapType.source.name + self._load_categories(label_map) + + def apply(self): + os.makedirs(self._save_dir, exist_ok=True) + + for subset_name, subset in self._extractor.subsets().items(): + for item in subset: + image_path = osp.join(CityscapesPath.IMGS_FINE_DIR, + CityscapesPath.ORIGINAL_IMAGE_DIR, subset_name, + item.id + CityscapesPath.ORIGINAL_IMAGE) + if self._save_images: + self._save_image(item, osp.join(self._save_dir, image_path)) + + common_folder_path = osp.join(CityscapesPath.GT_FINE_DIR, + subset_name) + + masks = [a for a in item.annotations + if a.type == AnnotationType.mask] + if not masks: + continue + + common_image_name = item.id + '_' + CityscapesPath.GT_FINE_DIR + + compiled_class_mask = CompiledMask.from_instance_masks(masks, + instance_labels=[self._label_id_mapping(m.label) + for m in masks]) + color_mask_path = osp.join(common_folder_path, + common_image_name + CityscapesPath.COLOR_IMAGE) + self.save_mask(osp.join(self._save_dir, color_mask_path), + compiled_class_mask.class_mask) + + labelids_mask_path = osp.join(common_folder_path, + common_image_name + CityscapesPath.LABELIDS_IMAGE) + self.save_mask(osp.join(self._save_dir, labelids_mask_path), + compiled_class_mask.class_mask, apply_colormap=False, + dtype=np.int32) + + compiled_instance_mask = CompiledMask.from_instance_masks(masks, + instance_labels=[m.id if m.attributes.get('is_crowd', True) + else m.label * 1000 + m.id for m in masks]) + inst_path = osp.join(common_folder_path, + common_image_name + CityscapesPath.INSTANCES_IMAGE) + self.save_mask(osp.join(self._save_dir, inst_path), + compiled_instance_mask.class_mask, apply_colormap=False, + dtype=np.int32) + self.save_label_map() + + def save_label_map(self): + path = 
osp.join(self._save_dir, CityscapesPath.LABELMAP_FILE) + write_label_map(path, self._label_map) + + def _load_categories(self, label_map_source): + if label_map_source == LabelmapType.cityscapes.name: + # use the default Cityscapes colormap + label_map = CityscapesLabelMap + + elif label_map_source == LabelmapType.source.name and \ + AnnotationType.mask not in self._extractor.categories(): + # generate colormap for input labels + labels = self._extractor.categories() \ + .get(AnnotationType.label, LabelCategories()) + label_map = OrderedDict((item.name, None) + for item in labels.items) + + elif label_map_source == LabelmapType.source.name and \ + AnnotationType.mask in self._extractor.categories(): + # use source colormap + labels = self._extractor.categories()[AnnotationType.label] + colors = self._extractor.categories()[AnnotationType.mask] + label_map = OrderedDict() + for idx, item in enumerate(labels.items): + color = colors.colormap.get(idx) + if color is not None: + label_map[item.name] = color + + elif isinstance(label_map_source, dict): + label_map = OrderedDict( + sorted(label_map_source.items(), key=lambda e: e[0])) + + elif isinstance(label_map_source, str) and osp.isfile(label_map_source): + label_map = parse_label_map(label_map_source) + + else: + raise Exception("Wrong labelmap specified, " + "expected one of %s or a file path" % \ + ', '.join(t.name for t in LabelmapType)) + + self._categories = make_cityscapes_categories(label_map) + self._label_map = label_map + self._label_id_mapping = self._make_label_id_map() + + def _make_label_id_map(self): + map_id, id_mapping, src_labels, dst_labels = make_label_id_mapping( + self._extractor.categories().get(AnnotationType.label), + self._categories[AnnotationType.label]) + + void_labels = [src_label for src_id, src_label in src_labels.items() + if src_label not in dst_labels] + if void_labels: + log.warning("The following labels are remapped to background: %s" % + ', '.join(void_labels)) + 
log.debug("Saving segmentations with the following label mapping: \n%s" % + '\n'.join(["#%s '%s' -> #%s '%s'" % + ( + src_id, src_label, id_mapping[src_id], + self._categories[AnnotationType.label] \ + .items[id_mapping[src_id]].name + ) + for src_id, src_label in src_labels.items() + ]) + ) + + return map_id + + def save_mask(self, path, mask, colormap=None, apply_colormap=True, + dtype=np.uint8): + if self._apply_colormap and apply_colormap: + if colormap is None: + colormap = self._categories[AnnotationType.mask].colormap + mask = paint_mask(mask, colormap) + save_image(path, mask, create_dir=True, dtype=dtype) diff --git a/datumaro/plugins/coco_format/converter.py b/datumaro/plugins/coco_format/converter.py index 0caf89de3b35..f7caa56b965d 100644 --- a/datumaro/plugins/coco_format/converter.py +++ b/datumaro/plugins/coco_format/converter.py @@ -5,6 +5,7 @@ import json import logging as log +import numpy as np import os import os.path as osp from enum import Enum @@ -19,6 +20,7 @@ _COORDINATE_ROUNDING_DIGITS, AnnotationType, Points) from datumaro.components.dataset import ItemStatus from datumaro.util import cast, find, str_to_bool +from datumaro.util.image import save_image from .format import CocoPath, CocoTask @@ -263,7 +265,7 @@ def save_annotations(self, item): return if not item.has_image: - log.warn("Item '%s': skipping writing instances " + log.warning("Item '%s': skipping writing instances " "since no image info available" % item.id) return h, w = item.image.size @@ -451,6 +453,67 @@ def save_annotations(self, item): self.annotations.append(elem) +class _StuffConverter(_InstancesConverter): + pass + +class _PanopticConverter(_TaskConverter): + def write(self, path): + with open(path, 'w') as outfile: + json.dump(self._data, outfile) + + def save_categories(self, dataset): + label_categories = dataset.categories().get(AnnotationType.label) + if label_categories is None: + return + + for idx, cat in enumerate(label_categories.items): + 
self.categories.append({ + 'id': 1 + idx, + 'name': cast(cat.name, str, ''), + 'supercategory': cast(cat.parent, str, ''), + 'isthing': 0, # TODO: can't represent this information yet + }) + + def save_annotations(self, item): + if not item.has_image: + return + + ann_filename = item.id + CocoPath.PANOPTIC_EXT + + segments_info = list() + masks = [] + next_id = self._min_ann_id + for ann in item.annotations: + if ann.type != AnnotationType.mask: + continue + + if not ann.id: + ann.id = next_id + next_id += 1 + + segment_info = {} + segment_info['id'] = ann.id + segment_info['category_id'] = cast(ann.label, int, -1) + 1 + segment_info['area'] = float(ann.get_area()) + segment_info['bbox'] = [float(p) for p in ann.get_bbox()] + segment_info['iscrowd'] = cast(ann.attributes.get("is_crowd"), int, 0) + segments_info.append(segment_info) + masks.append(ann) + + if masks: + pan_format = mask_tools.merge_masks( + ((m.image, m.id) for m in masks), + start=np.zeros(item.image.size, dtype=np.uint32)) + save_image(osp.join(self._context._segmentation_dir, ann_filename), + mask_tools.index2bgr(pan_format), create_dir=True) + + elem = { + 'image_id': self._get_image_id(item), + 'file_name': ann_filename, + 'segments_info': segments_info + } + self.annotations.append(elem) + class CocoConverter(Converter): @staticmethod def _split_tasks_string(s): @@ -497,6 +560,8 @@ def build_cmdline_parser(cls, **kwargs): CocoTask.person_keypoints: _KeypointsConverter, CocoTask.captions: _CaptionsConverter, CocoTask.labels: _LabelsConverter, + CocoTask.panoptic: _PanopticConverter, + CocoTask.stuff: _StuffConverter, } def __init__(self, extractor, save_dir, @@ -541,6 +606,11 @@ def _make_dirs(self): self._ann_dir = osp.join(self._save_dir, CocoPath.ANNOTATIONS_DIR) os.makedirs(self._ann_dir, exist_ok=True) + def _make_segmentation_dir(self, subset_name): + self._segmentation_dir = osp.join(self._save_dir, + CocoPath.ANNOTATIONS_DIR, 'panoptic_'+ subset_name) + 
os.makedirs(self._segmentation_dir, exist_ok=True) + def _make_task_converter(self, task): if task not in self._TASK_CONVERTER: raise NotImplementedError() @@ -568,6 +638,8 @@ def apply(self): task_converters = self._make_task_converters() for task_conv in task_converters.values(): task_conv.save_categories(subset) + if CocoTask.panoptic in task_converters: + self._make_segmentation_dir(subset_name) for item in subset: if self._save_images: @@ -637,3 +709,14 @@ class CocoLabelsConverter(CocoConverter): def __init__(self, *args, **kwargs): kwargs['tasks'] = CocoTask.labels super().__init__(*args, **kwargs) + +class CocoPanopticConverter(CocoConverter): + def __init__(self, *args, **kwargs): + kwargs['tasks'] = CocoTask.panoptic + super().__init__(*args, **kwargs) + +class CocoStuffConverter(CocoConverter): + def __init__(self, *args, **kwargs): + kwargs['tasks'] = CocoTask.stuff + kwargs['segmentation_mode'] = SegmentationMode.mask + super().__init__(*args, **kwargs) diff --git a/datumaro/plugins/coco_format/extractor.py b/datumaro/plugins/coco_format/extractor.py index 29b97f7e2734..faecf79f8e28 100644 --- a/datumaro/plugins/coco_format/extractor.py +++ b/datumaro/plugins/coco_format/extractor.py @@ -4,18 +4,20 @@ # SPDX-License-Identifier: MIT from collections import OrderedDict +import json import logging as log import os.path as osp from pycocotools.coco import COCO import pycocotools.mask as mask_utils -from datumaro.components.extractor import (SourceExtractor, +from datumaro.components.extractor import (CompiledMask, Mask, SourceExtractor, DEFAULT_SUBSET_NAME, DatasetItem, AnnotationType, Label, RleMask, Points, Polygon, Bbox, Caption, LabelCategories, PointsCategories ) -from datumaro.util.image import Image +from datumaro.util.image import Image, lazy_image, load_image +from datumaro.util.mask_tools import bgr2index from .format import CocoTask, CocoPath @@ -42,16 +44,24 @@ def __init__(self, path, task, merge_instance_polygons=False, subset=None): 
self._merge_instance_polygons = merge_instance_polygons - loader = self._make_subset_loader(path) - self._load_categories(loader) - self._items = list(self._load_items(loader).values()) + if self._task == CocoTask.panoptic: + #panoptic is not added to pycocotools + panoptic_config = self._load_panoptic_config(path) + panoptic_images = osp.splitext(path)[0] + + self._load_panoptic_categories(panoptic_config) + self._items = list(self._load_panoptic_items(panoptic_config, + panoptic_images).values()) + else: + loader = self._make_subset_loader(path) + self._load_categories(loader) + self._items = list(self._load_items(loader).values()) @staticmethod def _make_subset_loader(path): # COCO API has an 'unclosed file' warning coco_api = COCO() with open(path, 'r') as f: - import json dataset = json.load(f) coco_api.dataset = dataset @@ -62,9 +72,7 @@ def _load_categories(self, loader): self._categories = {} if self._task in [CocoTask.instances, CocoTask.labels, - CocoTask.person_keypoints, - # TODO: Task.stuff, CocoTask.panoptic - ]: + CocoTask.person_keypoints, CocoTask.stuff]: label_categories, label_map = self._load_label_categories(loader) self._categories[AnnotationType.label] = label_categories self._label_map = label_map @@ -100,6 +108,22 @@ def _load_person_kp_categories(self, loader): return categories + @staticmethod + def _load_panoptic_config(path): + with open(path, 'r') as f: + return json.load(f) + + def _load_panoptic_categories(self, config): + label_categories = LabelCategories() + label_map = {} + for idx, cat in enumerate(config['categories']): + label_map[cat['id']] = idx + label_categories.add(name=cat['name'], + parent=cat.get('supercategory')) + + self._categories[AnnotationType.label] = label_categories + self._label_map = label_map + def _load_items(self, loader): items = OrderedDict() @@ -124,6 +148,48 @@ def _load_items(self, loader): return items + def _load_panoptic_items(self, config, panoptic_images): + items = OrderedDict() + + imgs_info = 
{} + for img in config['images']: + imgs_info[img['id']] = img + + for ann in config['annotations']: + img_id = int(ann['image_id']) + image_path = osp.join(self._images_dir, imgs_info[img_id]['file_name']) + image_size = (imgs_info[img_id].get('height'), + imgs_info[img_id].get('width')) + if all(image_size): + image_size = (int(image_size[0]), int(image_size[1])) + else: + image_size = None + image = Image(path=image_path, size=image_size) + anns = [] + + mask_path = osp.join(panoptic_images, ann['file_name']) + mask = lazy_image(mask_path, loader=self._load_pan_mask) + mask = CompiledMask(instance_mask=mask) + for segm_info in ann['segments_info']: + cat_id = self._get_label_id(segm_info) + segm_id = segm_info['id'] + attributes = { 'is_crowd': bool(segm_info['iscrowd']) } + anns.append(Mask(image=mask.lazy_extract(segm_id), + label=cat_id, id=segm_id, + group=segm_id, attributes=attributes)) + + items[img_id] = DatasetItem( + id=osp.splitext(imgs_info[img_id]['file_name'])[0], + subset=self._subset, image=image, + annotations=anns, attributes={'id': img_id}) + return items + + @staticmethod + def _load_pan_mask(path): + mask = load_image(path) + mask = bgr2index(mask) + return mask + def _get_label_id(self, ann): cat_id = ann.get('category_id') if cat_id in [0, None]: @@ -147,7 +213,8 @@ def _load_annotations(self, ann, image_info=None): group = ann_id # make sure all tasks' annotations are merged - if self._task in [CocoTask.instances, CocoTask.person_keypoints]: + if self._task in [CocoTask.instances, CocoTask.person_keypoints, + CocoTask.stuff]: x, y, w, h = ann['bbox'] label_id = self._get_label_id(ann) @@ -250,3 +317,13 @@ class CocoLabelsExtractor(_CocoExtractor): def __init__(self, path, **kwargs): kwargs['task'] = CocoTask.labels super().__init__(path, **kwargs) + +class CocoPanopticExtractor(_CocoExtractor): + def __init__(self, path, **kwargs): + kwargs['task'] = CocoTask.panoptic + super().__init__(path, **kwargs) + +class 
CocoStuffExtractor(_CocoExtractor): + def __init__(self, path, **kwargs): + kwargs['task'] = CocoTask.stuff + super().__init__(path, **kwargs) diff --git a/datumaro/plugins/coco_format/format.py b/datumaro/plugins/coco_format/format.py index 5129d49d9a9c..7a37bb709c63 100644 --- a/datumaro/plugins/coco_format/format.py +++ b/datumaro/plugins/coco_format/format.py @@ -12,8 +12,8 @@ 'captions', 'labels', # extension, does not exist in the original COCO format 'image_info', - # 'panoptic', - # 'stuff', + 'panoptic', + 'stuff', ]) class CocoPath: @@ -21,3 +21,4 @@ class CocoPath: ANNOTATIONS_DIR = 'annotations' IMAGE_EXT = '.jpg' + PANOPTIC_EXT = '.png' diff --git a/datumaro/plugins/coco_format/importer.py b/datumaro/plugins/coco_format/importer.py index f613143e157e..8d41376ea89a 100644 --- a/datumaro/plugins/coco_format/importer.py +++ b/datumaro/plugins/coco_format/importer.py @@ -21,6 +21,8 @@ class CocoImporter(Importer): CocoTask.captions: 'coco_captions', CocoTask.labels: 'coco_labels', CocoTask.image_info: 'coco_image_info', + CocoTask.panoptic: 'coco_panoptic', + CocoTask.stuff: 'coco_stuff', } @classmethod @@ -39,7 +41,8 @@ def __call__(self, path, **extra_params): # TODO: should be removed when proper label merging is implemented conflicting_types = {CocoTask.instances, - CocoTask.person_keypoints, CocoTask.labels} + CocoTask.person_keypoints, CocoTask.labels, + CocoTask.panoptic, CocoTask.stuff} ann_types = set(t for s in subsets.values() for t in s) \ & conflicting_types if 1 <= len(ann_types): @@ -85,10 +88,10 @@ def find_sources(path): try: ann_type = CocoTask[ann_type] except KeyError: - log.warn("Skipping '%s': unknown subset " + log.warning("Skipping '%s': unknown subset " "type '%s', the only known are: %s" % \ (subset_path, ann_type, - ', '.join([e.name for e in CocoTask]))) + ', '.join(e.name for e in CocoTask))) continue subset_name = name_parts[1] subsets[subset_name][ann_type] = subset_path diff --git a/datumaro/plugins/cvat_format/converter.py 
b/datumaro/plugins/cvat_format/converter.py index 5a9f64469bcf..4b8c07982bbc 100644 --- a/datumaro/plugins/cvat_format/converter.py +++ b/datumaro/plugins/cvat_format/converter.py @@ -164,7 +164,7 @@ def _write_item(self, item, index): if not self._context._reindex: index = cast(item.attributes.get('frame'), int, index) image_info = OrderedDict([ ("id", str(index)), ]) - filename = item.id + CvatPath.IMAGE_EXT + filename = self._context._make_image_filename(item) image_info["name"] = filename if item.has_image: size = item.image.size diff --git a/datumaro/plugins/datumaro_format/converter.py b/datumaro/plugins/datumaro_format/converter.py index 6e9de7142cd1..18d16c14286b 100644 --- a/datumaro/plugins/datumaro_format/converter.py +++ b/datumaro/plugins/datumaro_format/converter.py @@ -131,11 +131,16 @@ def _convert_mask_object(self, obj): rle = mask_utils.encode( np.require(obj.image, dtype=np.uint8, requirements='F')) + if isinstance(rle['counts'], str): + counts = rle['counts'] + else: + counts = rle['counts'].decode('ascii') + converted.update({ 'label_id': cast(obj.label, int), 'rle': { # serialize as compressed COCO mask - 'counts': rle['counts'].decode('ascii'), + 'counts': counts, 'size': list(int(c) for c in rle['size']), }, 'z_order': obj.z_order, diff --git a/datumaro/plugins/labelme_format.py b/datumaro/plugins/labelme_format.py index 5580dbc77bf1..80d9d0d44f2e 100644 --- a/datumaro/plugins/labelme_format.py +++ b/datumaro/plugins/labelme_format.py @@ -4,15 +4,18 @@ from collections import defaultdict from defusedxml import ElementTree +from functools import partial +from glob import glob, iglob import logging as log import numpy as np import os import os.path as osp -from datumaro.components.extractor import (SourceExtractor, Importer, - DatasetItem, AnnotationType, Mask, Bbox, Polygon, LabelCategories -) +from datumaro.components.extractor import (Extractor, Importer, + DatasetItem, AnnotationType, Mask, Bbox, Polygon, LabelCategories) from 
datumaro.components.converter import Converter +from datumaro.util import cast, escape, unescape +from datumaro.util.os_util import split_path from datumaro.util.image import Image, save_image from datumaro.util.mask_tools import load_mask, find_mask_bbox @@ -21,70 +24,109 @@ class LabelMePath: MASKS_DIR = 'Masks' IMAGE_EXT = '.jpg' -class LabelMeExtractor(SourceExtractor): - def __init__(self, path, subset=None): + ATTR_IMPORT_ESCAPES = [ + ('\\=', r'%%{eq}%%'), + ('\\"', r'%%{doublequote}%%'), + ('\\,', r'%%{comma}%%'), + ('\\\\', r'%%{backslash}%%'), # keep last + ] + ATTR_EXPORT_ESCAPES = [ + ('\\', '\\\\'), # keep first + ('=', '\\='), + ('"', '\\"'), + (',', '\\,'), + ] + +class LabelMeExtractor(Extractor): + def __init__(self, path): assert osp.isdir(path), path - super().__init__(subset=subset) + super().__init__() - items, categories = self._parse(path) - self._categories = categories - self._items = items + self._items, self._categories, self._subsets = self._parse(path) + self._length = len(self._items) - def _parse(self, path): - categories = { - AnnotationType.label: LabelCategories(attributes={ - 'occluded', 'username' - }) + def _parse(self, dataset_root): + items = [] + subsets = set() + categories = { AnnotationType.label: + LabelCategories(attributes={ 'occluded', 'username' }) } - items = [] - for p in os.listdir(path): - if not p.endswith('.xml'): - continue - root = ElementTree.parse(osp.join(path, p)) + for xml_path in sorted( + glob(osp.join(dataset_root, '**', '*.xml'), recursive=True)): + item_path = osp.relpath(xml_path, dataset_root) + path_parts = split_path(item_path) + subset = '' + if 1 < len(path_parts): + subset = path_parts[0] + item_path = osp.join(*path_parts[1:]) + + root = ElementTree.parse(xml_path) item_id = osp.join(root.find('folder').text or '', - root.find('filename').text) - image_path = osp.join(path, item_id) + root.find('filename').text) or \ + item_path + image_path = osp.join(osp.dirname(xml_path), 
osp.basename(item_id)) + item_id = osp.splitext(item_id)[0] + image_size = None imagesize_elem = root.find('imagesize') if imagesize_elem is not None: width_elem = imagesize_elem.find('ncols') height_elem = imagesize_elem.find('nrows') image_size = (int(height_elem.text), int(width_elem.text)) + image = Image(path=image_path, size=image_size) - annotations = self._parse_annotations(root, path, categories) + annotations = self._parse_annotations(root, + osp.join(dataset_root, subset), categories) + + items.append(DatasetItem(id=item_id, subset=subset, + image=image, annotations=annotations)) + subsets.add(subset) + return items, categories, subsets + + def _escape(s): + return escape(s, LabelMePath.ATTR_IMPORT_ESCAPES) - items.append(DatasetItem(id=osp.splitext(item_id)[0], - subset=self._subset, image=image, annotations=annotations)) - return items, categories + def _unescape(s): + s = unescape(s, LabelMePath.ATTR_IMPORT_ESCAPES) + s = unescape(s, LabelMePath.ATTR_EXPORT_ESCAPES) + return s @classmethod - def _parse_annotations(cls, xml_root, dataset_root, categories): - def parse_attributes(attr_str): + def _parse_annotations(cls, xml_root, subset_root, categories): + def _parse_attributes(attr_str): parsed = [] if not attr_str: return parsed - for attr in [a.strip() for a in attr_str.split(',') if a.strip()]: + for attr in [a.strip() for a in cls._escape(attr_str).split(',')]: + if not attr: + continue + if '=' in attr: name, value = attr.split('=', maxsplit=1) if value.lower() in {'true', 'false'}: value = value.lower() == 'true' + elif 1 < len(value) and value[0] == '"' and value[-1] == '"': + value = value[1:-1] else: - try: - value = float(value) - except ValueError: - pass - parsed.append((name, value)) + for t in [int, float]: + casted = cast(value, t) + if casted is not None and str(casted) == value: + value = casted + break + if isinstance(value, str): + value = cls._unescape(value) + parsed.append((cls._unescape(name), value)) else: - 
parsed.append((attr, True)) + parsed.append((cls._unescape(attr), True)) return parsed label_cat = categories[AnnotationType.label] - def get_label_id(label): + def _get_label_id(label): if not label: return None idx, _ = label_cat.find(label) @@ -102,12 +144,12 @@ def get_label_id(label): ann_items = [] - label = get_label_id(obj_elem.find('name').text) + label = _get_label_id(obj_elem.find('name').text) attributes = [] attributes_elem = obj_elem.find('attributes') if attributes_elem is not None and attributes_elem.text: - attributes = parse_attributes(attributes_elem.text) + attributes = _parse_attributes(attributes_elem.text) occluded = False occluded_elem = obj_elem.find('occluded') @@ -156,7 +198,7 @@ def get_label_id(label): user = user_elem.text attributes.append(('username', user)) - mask_path = osp.join(dataset_root, LabelMePath.MASKS_DIR, + mask_path = osp.join(subset_root, LabelMePath.MASKS_DIR, segm_elem.find('mask').text) if not osp.isfile(mask_path): raise Exception("Can't find mask at '%s'" % mask_path) @@ -220,32 +262,28 @@ def get_label_id(label): return image_annotations + def categories(self): + return self._categories + + def __iter__(self): + yield from self._items + class LabelMeImporter(Importer): EXTRACTOR = 'label_me' @classmethod def find_sources(cls, path): - subset_paths = [] + subsets = [] if not osp.isdir(path): return [] - path = osp.normpath(path) - - def has_annotations(d): - return len([p for p in os.listdir(d) if p.endswith('.xml')]) != 0 - - if has_annotations(path): - subset_paths.append({'url': path, 'format': cls.EXTRACTOR}) - else: - for d in os.listdir(path): - subset = d - d = osp.join(path, d) - if osp.isdir(d) and has_annotations(d): - subset_paths.append({'url': d, 'format': cls.EXTRACTOR, - 'options': {'subset': subset} - }) - return subset_paths + try: + next(iglob(osp.join(path, '**', '*.xml'), recursive=True)) + subsets.append({'url': osp.normpath(path), 'format': cls.EXTRACTOR}) + except StopIteration: + pass + 
return subsets class LabelMeConverter(Converter): @@ -255,18 +293,18 @@ def apply(self): for subset_name, subset in self._extractor.subsets().items(): subset_dir = osp.join(self._save_dir, subset_name) os.makedirs(subset_dir, exist_ok=True) - os.makedirs(osp.join(subset_dir, LabelMePath.MASKS_DIR), - exist_ok=True) - for index, item in enumerate(subset): - self._save_item(item, subset_dir, index) + for item in subset: + self._save_item(item, subset_dir) def _get_label(self, label_id): if label_id is None: return '' return self._extractor.categories()[AnnotationType.label][label_id].name - def _save_item(self, item, subset_dir, index): + _escape = partial(escape, escapes=LabelMePath.ATTR_EXPORT_ESCAPES) + + def _save_item(self, item, subset_dir): from lxml import etree as ET log.debug("Converting item '%s'", item.id) @@ -305,7 +343,7 @@ def _save_item(self, item, subset_dir, index): ET.SubElement(obj_elem, 'deleted').text = '0' ET.SubElement(obj_elem, 'verified').text = '0' ET.SubElement(obj_elem, 'occluded').text = \ - 'yes' if ann.attributes.pop('occluded', '') == True else 'no' + 'yes' if ann.attributes.get('occluded') == True else 'no' ET.SubElement(obj_elem, 'date').text = '' ET.SubElement(obj_elem, 'id').text = str(obj_id) @@ -328,7 +366,7 @@ def _save_item(self, item, subset_dir, index): ET.SubElement(point_elem, 'y').text = '%.2f' % y ET.SubElement(poly_elem, 'username').text = \ - str(ann.attributes.pop('username', '')) + str(ann.attributes.get('username', '')) elif ann.type == AnnotationType.polygon: poly_elem = ET.SubElement(obj_elem, 'polygon') for x, y in zip(ann.points[::2], ann.points[1::2]): @@ -337,13 +375,12 @@ def _save_item(self, item, subset_dir, index): ET.SubElement(point_elem, 'y').text = '%.2f' % y ET.SubElement(poly_elem, 'username').text = \ - str(ann.attributes.pop('username', '')) + str(ann.attributes.get('username', '')) elif ann.type == AnnotationType.mask: - mask_filename = '%s_mask_%s.png' % \ - (item.id.replace('/', '_'), obj_id) + 
mask_filename = '%s_mask_%s.png' % (item.id, obj_id) save_image(osp.join(subset_dir, LabelMePath.MASKS_DIR, mask_filename), - self._paint_mask(ann.image)) + self._paint_mask(ann.image), create_dir=True) segm_elem = ET.SubElement(obj_elem, 'segm') ET.SubElement(segm_elem, 'mask').text = mask_filename @@ -358,13 +395,21 @@ def _save_item(self, item, subset_dir, index): '%.2f' % (bbox[1] + bbox[3]) ET.SubElement(segm_elem, 'username').text = \ - str(ann.attributes.pop('username', '')) + str(ann.attributes.get('username', '')) else: raise NotImplementedError("Unknown shape type '%s'" % ann.type) attrs = [] for k, v in ann.attributes.items(): - attrs.append('%s=%s' % (k, v)) + if k in { 'username' , 'occluded' }: + continue + if isinstance(v, str): + if cast(v, float) is not None and str(float(v)) == v or \ + cast(v, int) is not None and str(int(v)) == v: + v = f'"{v}"' # add escaping for string values + else: + v = self._escape(v) + attrs.append('%s=%s' % (self._escape(k), v)) ET.SubElement(obj_elem, 'attributes').text = ', '.join(attrs) obj_id += 1 @@ -380,7 +425,11 @@ def _save_item(self, item, subset_dir, index): ET.SubElement(parts_elem, 'hasparts').text = '' ET.SubElement(parts_elem, 'ispartof').text = str(leader_id) - xml_path = osp.join(subset_dir, 'item_%09d.xml' % index) + os.makedirs(osp.join(subset_dir, osp.dirname(image_filename)), + exist_ok=True) + xml_path = osp.join(subset_dir, osp.splitext(image_filename)[0] + '.xml') + if osp.exists(xml_path): + xml_path = osp.join(subset_dir, image_filename + '.xml') with open(xml_path, 'w', encoding='utf-8') as f: xml_data = ET.tostring(root_elem, encoding='unicode', pretty_print=True) diff --git a/datumaro/plugins/lfw_format.py b/datumaro/plugins/lfw_format.py index 5799ad87e587..c4806647cbaf 100644 --- a/datumaro/plugins/lfw_format.py +++ b/datumaro/plugins/lfw_format.py @@ -8,7 +8,7 @@ from datumaro.components.converter import Converter from datumaro.components.extractor import (AnnotationType, DatasetItem, - 
Importer, Points, SourceExtractor) + Importer, Label, LabelCategories, Points, SourceExtractor) from datumaro.util.image import find_images @@ -16,8 +16,9 @@ class LfwPath: IMAGES_DIR = 'images' LANDMARKS_FILE = 'landmarks.txt' PAIRS_FILE = 'pairs.txt' + PEOPLE_FILE = 'people.txt' IMAGE_EXT = '.jpg' - PATTERN = re.compile(r'([\w]+)_([-\d]+)') + PATTERN = re.compile(r'([\w-]+)_([-\d]+)') class LfwExtractor(SourceExtractor): def __init__(self, path, subset=None): @@ -29,14 +30,29 @@ def __init__(self, path, subset=None): super().__init__(subset=subset) self._dataset_dir = osp.dirname(osp.dirname(path)) + + people_file = osp.join(osp.dirname(path), LfwPath.PEOPLE_FILE) + self._categories = self._load_categories(people_file) + self._items = list(self._load_items(path).values()) + def _load_categories(self, path): + label_cat = LabelCategories() + if osp.isfile(path): + with open(path, encoding='utf-8') as labels_file: + for line in labels_file: + objects = line.strip().split('\t') + if len(objects) == 2: + label_cat.add(objects[0]) + return { AnnotationType.label: label_cat } + def _load_items(self, path): items = {} + label_categories = self._categories.get(AnnotationType.label) images_dir = osp.join(self._dataset_dir, self._subset, LfwPath.IMAGES_DIR) if osp.isdir(images_dir): - images = { osp.splitext(osp.relpath(p, images_dir))[0]: p + images = { osp.splitext(osp.relpath(p, images_dir))[0].replace('\\', '/'): p for p in find_images(images_dir, recursive=True) } else: images = {} @@ -44,44 +60,71 @@ def _load_items(self, path): with open(path, encoding='utf-8') as f: for line in f: pair = line.strip().split('\t') - if len(pair) == 3: - if pair[0] == '-': - image1 = pair[1] - image2 = pair[2] - else: - image1 = self.get_image_name(pair[0], pair[1]) - image2 = self.get_image_name(pair[0], pair[2]) - if image1 not in items: - items[image1] = DatasetItem(id=image1, subset=self._subset, - image=images.get(image1), - attributes={'positive_pairs': [], 'negative_pairs': 
[]}) - if image2 not in items: - items[image2] = DatasetItem(id=image2, subset=self._subset, - image=images.get(image2), - attributes={'positive_pairs': [], 'negative_pairs': []}) + if len(pair) == 1 and pair[0] != '': + annotations = [] + image = pair[0] + item_id = pair[0] + objects = item_id.split('/') + if 1 < len(objects): + label_name = objects[0] + label = label_categories.find(label_name)[0] + if label != None: + annotations.append(Label(label)) + item_id = item_id[len(label_name) + 1:] + if item_id not in items: + items[item_id] = DatasetItem(id=item_id, subset=self._subset, + image=images.get(image), annotations=annotations) + elif len(pair) == 3: + image1, id1 = self.get_image_name(pair[0], pair[1]) + image2, id2 = self.get_image_name(pair[0], pair[2]) + label = label_categories.find(pair[0])[0] + if label == None: + raise Exception("Line %s: people file doesn't " + "contain person %s " % (line, pair[0])) + if id1 not in items: + annotations = [] + annotations.append(Label(label)) + items[id1] = DatasetItem(id=id1, subset=self._subset, + image=images.get(image1), annotations=annotations) + if id2 not in items: + annotations = [] + annotations.append(Label(label)) + items[id2] = DatasetItem(id=id2, subset=self._subset, + image=images.get(image2), annotations=annotations) # pairs form a directed graph - items[image1].attributes['positive_pairs'].append(image2) + if not items[id1].annotations[0].attributes.get('positive_pairs'): + items[id1].annotations[0].attributes['positive_pairs'] = [] + items[id1].annotations[0].attributes['positive_pairs'].append(image2) + elif len(pair) == 4: - if pair[0] == '-': - image1 = pair[1] - else: - image1 = self.get_image_name(pair[0], pair[1]) + image1, id1 = self.get_image_name(pair[0], pair[1]) if pair[2] == '-': image2 = pair[3] + id2 = pair[3] else: - image2 = self.get_image_name(pair[2], pair[3]) - if image1 not in items: - items[image1] = DatasetItem(id=image1, subset=self._subset, - image=images.get(image1), - 
attributes={'positive_pairs': [], 'negative_pairs': []}) - if image2 not in items: - items[image2] = DatasetItem(id=image2, subset=self._subset, - image=images.get(image2), - attributes={'positive_pairs': [], 'negative_pairs': []}) + image2, id2 = self.get_image_name(pair[2], pair[3]) + if id1 not in items: + annotations = [] + label = label_categories.find(pair[0])[0] + if label == None: + raise Exception("Line %s: people file doesn't " + "contain person %s " % (line, pair[0])) + annotations.append(Label(label)) + items[id1] = DatasetItem(id=id1, subset=self._subset, + image=images.get(image1), annotations=annotations) + if id2 not in items: + annotations = [] + label = label_categories.find(pair[2])[0] + if label != None: + annotations.append(Label(label)) + items[id2] = DatasetItem(id=id2, subset=self._subset, + image=images.get(image2), annotations=annotations) # pairs form a directed graph - items[image1].attributes['negative_pairs'].append(image2) + if not items[id1].annotations[0].attributes.get('negative_pairs'): + items[id1].annotations[0].attributes['negative_pairs'] = [] + items[id1].annotations[0].attributes['negative_pairs'].append(image2) landmarks_file = osp.join(self._dataset_dir, self._subset, LfwPath.LANDMARKS_FILE) @@ -91,10 +134,15 @@ def _load_items(self, path): line = line.split('\t') item_id = osp.splitext(line[0])[0] + objects = item_id.split('/') + if 1 < len(objects): + label_name = objects[0] + label = label_categories.find(label_name)[0] + if label != None: + item_id = item_id[len(label_name) + 1:] if item_id not in items: items[item_id] = DatasetItem(id=item_id, subset=self._subset, - image=osp.join(images_dir, line[0]), - attributes={'positive_pairs': [], 'negative_pairs': []}) + image=osp.join(images_dir, line[0])) annotations = items[item_id].annotations annotations.append(Points([float(p) for p in line[1:]])) @@ -103,7 +151,15 @@ def _load_items(self, path): @staticmethod def get_image_name(person, image_id): - return 
'{}/{}_{:04d}'.format(person, person, int(image_id)) + image, item_id = '', '' + try: + image_id = int(image_id) + image = '{}/{}_{:04d}'.format(person, person, image_id) + item_id = '{}_{:04d}'.format(person, image_id) + except ValueError: + image = '{}/{}'.format(person, image_id) + item_id = image_id + return image, item_id class LfwImporter(Importer): @classmethod @@ -115,42 +171,90 @@ class LfwConverter(Converter): def apply(self): for subset_name, subset in self._extractor.subsets().items(): + label_categories = self._extractor.categories()[AnnotationType.label] + labels = {} + for label in label_categories: + f = label.name + labels[label.name] = 0 + positive_pairs = [] negative_pairs = [] + neutral_items = [] landmarks = [] + included_items = [] for item in subset: + anns = [ann for ann in item.annotations + if ann.type == AnnotationType.label] + label, label_name = None, None + if anns: + label = anns[0] + label_name = label_categories[anns[0].label].name + labels[label_name] += 1 + if self._save_images and item.has_image: - self._save_image(item, - subdir=osp.join(subset_name, LfwPath.IMAGES_DIR)) - - search = LfwPath.PATTERN.search(item.id) - if search: - person1, num1 = search.groups() - num1 = int(num1) - else: - person1 = '-' + subdir=osp.join(subset_name, LfwPath.IMAGES_DIR) + if label_name: + subdir=osp.join(subdir, label_name) + self._save_image(item, subdir=subdir) + + if label != None: + person1 = label_name num1 = item.id - if 'positive_pairs' in item.attributes: - for pair in item.attributes['positive_pairs']: - search = LfwPath.PATTERN.search(pair) - if search: - num2 = search.groups()[1] - num2 = int(num2) - else: - num2 = pair - positive_pairs.append('%s\t%s\t%s' % (person1, num1, num2)) - if 'negative_pairs' in item.attributes: - for pair in item.attributes['negative_pairs']: - search = LfwPath.PATTERN.search(pair) - if search: - person2, num2 = search.groups() - num2 = int(num2) - else: - person2 = '-' - num2 = pair - 
negative_pairs.append('%s\t%s\t%s\t%s' % \ - (person1, num1, person2, num2)) + if num1.startswith(person1): + num1 = int(num1.replace(person1, '')[1:]) + curr_item = person1 + '/' + str(num1) + + if 'positive_pairs' in label.attributes: + if curr_item not in included_items: + included_items.append(curr_item) + for pair in label.attributes['positive_pairs']: + search = LfwPath.PATTERN.search(pair) + if search: + num2 = search.groups()[1] + num2 = int(num2) + else: + num2 = pair + if num2.startswith(person1): + num2 = num2.replace(person1, '')[1:] + curr_item = person1 + '/' + str(num2) + if curr_item not in included_items: + included_items.append(curr_item) + positive_pairs.append('%s\t%s\t%s' % (person1, num1, num2)) + + if 'negative_pairs' in label.attributes: + if curr_item not in included_items: + included_items.append(curr_item) + for pair in label.attributes['negative_pairs']: + search = LfwPath.PATTERN.search(pair) + curr_item = '' + if search: + person2, num2 = search.groups() + num2 = int(num2) + curr_item += person2 + '/' + else: + person2 = '-' + num2 = pair + objects = pair.split('/') + if 1 < len(objects) and objects[0] in labels: + person2 = objects[0] + num2 = pair.replace(person2, '')[1:] + curr_item += person2 + '/' + curr_item += str(num2) + if curr_item not in included_items: + included_items.append(curr_item) + negative_pairs.append('%s\t%s\t%s\t%s' % \ + (person1, num1, person2, num2)) + + if 'positive_pairs' not in label.attributes and \ + 'negative_pairs' not in label.attributes and \ + curr_item not in included_items: + neutral_items.append('%s/%s' % (person1, item.id)) + included_items.append(curr_item) + + elif item.id not in included_items: + neutral_items.append(item.id) + included_items.append(item.id) item_landmarks = [p for p in item.annotations if p.type == AnnotationType.points] @@ -163,9 +267,17 @@ def apply(self): with open(pairs_file, 'w', encoding='utf-8') as f: f.writelines(['%s\n' % pair for pair in positive_pairs]) 
f.writelines(['%s\n' % pair for pair in negative_pairs]) + f.writelines(['%s\n' % item for item in neutral_items]) if landmarks: landmarks_file = osp.join(self._save_dir, subset_name, LfwPath.LANDMARKS_FILE) with open(landmarks_file, 'w', encoding='utf-8') as f: f.writelines(['%s\n' % landmark for landmark in landmarks]) + + if labels: + people_file = osp.join(self._save_dir, subset_name, + LfwPath.PEOPLE_FILE) + with open(people_file, 'w', encoding='utf-8') as f: + f.writelines(['%s\t%d\n' % (label, labels[label]) + for label in labels]) diff --git a/datumaro/plugins/mnist_csv_format.py b/datumaro/plugins/mnist_csv_format.py new file mode 100644 index 000000000000..ae0fa8bf8c80 --- /dev/null +++ b/datumaro/plugins/mnist_csv_format.py @@ -0,0 +1,170 @@ +# Copyright (C) 2021 Intel Corporation +# +# SPDX-License-Identifier: MIT + +import os +import os.path as osp + +import numpy as np +from datumaro.components.converter import Converter +from datumaro.components.extractor import (AnnotationType, DatasetItem, + Importer, Label, LabelCategories, SourceExtractor) + + +class MnistCsvPath: + IMAGE_SIZE = 28 + NONE_LABEL = -1 + +class MnistCsvExtractor(SourceExtractor): + def __init__(self, path, subset=None): + if not osp.isfile(path): + raise FileNotFoundError("Can't read annotation file '%s'" % path) + + if not subset: + file_name = osp.splitext(osp.basename(path))[0] + subset = file_name.rsplit('_', maxsplit=1)[-1] + + super().__init__(subset=subset) + self._dataset_dir = osp.dirname(path) + + self._categories = self._load_categories() + + self._items = list(self._load_items(path).values()) + + def _load_categories(self): + label_cat = LabelCategories() + + labels_file = osp.join(self._dataset_dir, 'labels.txt') + if osp.isfile(labels_file): + with open(labels_file, encoding='utf-8') as f: + for line in f: + line = line.strip() + if not line: + continue + label_cat.add(line) + else: + for i in range(10): + label_cat.add(str(i)) + + return { AnnotationType.label: 
label_cat } + + def _load_items(self, path): + items = {} + with open(path, 'r', encoding='utf-8') as f: + annotation_table = f.readlines() + + metafile = osp.join(self._dataset_dir, 'meta_%s.csv' % self._subset) + meta = [] + if osp.isfile(metafile): + with open(metafile, 'r', encoding='utf-8') as f: + meta = f.readlines() + + for i, data in enumerate(annotation_table): + data = data.split(',') + item_anno = [] + label = int(data[0]) + if label != MnistCsvPath.NONE_LABEL: + item_anno.append(Label(label)) + + if 0 < len(meta): + meta[i] = meta[i].strip().split(',') + + # support for single-channel image only + image = None + if 1 < len(data): + if 0 < len(meta) and 1 < len(meta[i]): + image = np.array([int(pix) for pix in data[1:]], + dtype='uint8').reshape(int(meta[i][-2]), int(meta[i][-1])) + else: + image = np.array([int(pix) for pix in data[1:]], + dtype='uint8').reshape(28, 28) + + if 0 < len(meta) and len(meta[i]) in [1, 3]: + i = meta[i][0] + + items[i] = DatasetItem(id=i, subset=self._subset, + image=image, annotations=item_anno) + return items + +class MnistCsvImporter(Importer): + @classmethod + def find_sources(cls, path): + return cls._find_sources_recursive(path, '.csv', 'mnist_csv', + file_filter=lambda p: not osp.basename(p).startswith('meta')) + +class MnistCsvConverter(Converter): + DEFAULT_IMAGE_EXT = '.png' + + def apply(self): + os.makedirs(self._save_dir, exist_ok=True) + for subset_name, subset in self._extractor.subsets().items(): + data = [] + item_ids = {} + image_sizes = {} + for item in subset: + anns = [a.label for a in item.annotations + if a.type == AnnotationType.label] + label = MnistCsvPath.NONE_LABEL + if anns: + label = anns[0] + + if item.has_image and self._save_images: + image = item.image + if not image.has_data: + data.append([label, None]) + else: + if image.data.shape[0] != MnistCsvPath.IMAGE_SIZE or \ + image.data.shape[1] != MnistCsvPath.IMAGE_SIZE: + image_sizes[len(data)] = [image.data.shape[0], + image.data.shape[1]] + 
image = image.data.reshape(-1).astype(np.uint8).tolist() + image.insert(0, label) + data.append(image) + else: + data.append([label]) + + if item.id != str(len(data) - 1): + item_ids[len(data) - 1] = item.id + + anno_file = osp.join(self._save_dir, 'mnist_%s.csv' % subset_name) + self.save_in_csv(anno_file, data) + + # it isn't in the original format, + # this is for storing other names and sizes of images + if len(item_ids) or len(image_sizes): + meta = [] + if len(item_ids) and len(image_sizes): + # other names and sizes of images + size = [MnistCsvPath.IMAGE_SIZE, MnistCsvPath.IMAGE_SIZE] + for i in range(len(data)): + w, h = image_sizes.get(i, size) + meta.append([item_ids.get(i, i), w, h]) + + elif len(item_ids): + # other names of images + for i in range(len(data)): + meta.append([item_ids.get(i, i)]) + + elif len(image_sizes): + # other sizes of images + size = [MnistCsvPath.IMAGE_SIZE, MnistCsvPath.IMAGE_SIZE] + for i in range(len(data)): + meta.append(image_sizes.get(i, size)) + + metafile = osp.join(self._save_dir, 'meta_%s.csv' % subset_name) + self.save_in_csv(metafile, meta) + + self.save_labels() + + def save_in_csv(self, path, data): + with open(path, 'w', encoding='utf-8') as f: + for row in data: + f.write(','.join([str(p) for p in row]) + "\n") + + def save_labels(self): + labels_file = osp.join(self._save_dir, 'labels.txt') + with open(labels_file, 'w', encoding='utf-8') as f: + f.writelines(l.name + '\n' + for l in self._extractor.categories().get( + AnnotationType.label, LabelCategories()) + ) diff --git a/datumaro/plugins/mnist_format.py b/datumaro/plugins/mnist_format.py new file mode 100644 index 000000000000..0cd97b06dfde --- /dev/null +++ b/datumaro/plugins/mnist_format.py @@ -0,0 +1,209 @@ +# Copyright (C) 2021 Intel Corporation +# +# SPDX-License-Identifier: MIT + +import gzip +import os +import os.path as osp + +import numpy as np +from datumaro.components.converter import Converter +from datumaro.components.extractor import 
(AnnotationType, DatasetItem, + Importer, Label, LabelCategories, SourceExtractor) + + +class MnistPath: + TEST_LABELS_FILE = 't10k-labels-idx1-ubyte.gz' + TEST_IMAGES_FILE = 't10k-images-idx3-ubyte.gz' + LABELS_FILE = '-labels-idx1-ubyte.gz' + IMAGES_FILE = '-images-idx3-ubyte.gz' + IMAGE_SIZE = 28 + NONE_LABEL = 255 + +class MnistExtractor(SourceExtractor): + def __init__(self, path, subset=None): + if not osp.isfile(path): + raise FileNotFoundError("Can't read annotation file '%s'" % path) + + if not subset: + file_name = osp.splitext(osp.basename(path))[0] + if file_name.startswith('t10k'): + subset = 'test' + else: + subset = file_name.split('-', maxsplit=1)[0] + + super().__init__(subset=subset) + self._dataset_dir = osp.dirname(path) + + self._categories = self._load_categories() + + self._items = list(self._load_items(path).values()) + + def _load_categories(self): + label_cat = LabelCategories() + + labels_file = osp.join(self._dataset_dir, 'labels.txt') + if osp.isfile(labels_file): + with open(labels_file, encoding='utf-8') as f: + for line in f: + line = line.strip() + if not line: + continue + label_cat.add(line) + else: + for i in range(10): + label_cat.add(str(i)) + + return { AnnotationType.label: label_cat } + + def _load_items(self, path): + items = {} + with gzip.open(path, 'rb') as lbpath: + labels = np.frombuffer(lbpath.read(), dtype=np.uint8, offset=8) + + meta = [] + metafile = osp.join(self._dataset_dir, self._subset + '-meta.gz') + if osp.isfile(metafile): + with gzip.open(metafile, 'rb') as f: + meta = np.frombuffer(f.read(), dtype=' ssd_mobilenet_v2_coco source /opt/intel/openvino/bin/setupvars.sh - cd datumaro/plugins/openvino + cd datumaro/plugins/openvino_plugin datum create -o proj_ssd_mobilenet_v2_coco_detection datum model add -l openvino -p proj_ssd_mobilenet_v2_coco_detection --copy -- \ --output-layers=do_ExpandDims_conf/sigmoid \ @@ -79,7 +79,7 @@ You need to implement your own interpreter samples to support the other OpenVINO # 
Classification> mobilenet-v2-pytorch source /opt/intel/openvino/bin/setupvars.sh - cd datumaro/plugins/openvino + cd datumaro/plugins/openvino_plugin datum create -o proj_mobilenet_v2_classification datum model add -l openvino -p proj_mobilenet_v2_classification --copy -- \ -d model/mobilenet-v2-pytorch.xml \ diff --git a/datumaro/plugins/openvino/__init__.py b/datumaro/plugins/openvino_plugin/__init__.py similarity index 100% rename from datumaro/plugins/openvino/__init__.py rename to datumaro/plugins/openvino_plugin/__init__.py diff --git a/datumaro/plugins/openvino/launcher.py b/datumaro/plugins/openvino_plugin/launcher.py similarity index 100% rename from datumaro/plugins/openvino/launcher.py rename to datumaro/plugins/openvino_plugin/launcher.py diff --git a/datumaro/plugins/openvino/samples/coco.class b/datumaro/plugins/openvino_plugin/samples/coco.class similarity index 100% rename from datumaro/plugins/openvino/samples/coco.class rename to datumaro/plugins/openvino_plugin/samples/coco.class diff --git a/datumaro/plugins/openvino/samples/imagenet.class b/datumaro/plugins/openvino_plugin/samples/imagenet.class similarity index 100% rename from datumaro/plugins/openvino/samples/imagenet.class rename to datumaro/plugins/openvino_plugin/samples/imagenet.class diff --git a/datumaro/plugins/openvino/samples/mobilenet_v2_pytorch_interp.py b/datumaro/plugins/openvino_plugin/samples/mobilenet_v2_pytorch_interp.py similarity index 100% rename from datumaro/plugins/openvino/samples/mobilenet_v2_pytorch_interp.py rename to datumaro/plugins/openvino_plugin/samples/mobilenet_v2_pytorch_interp.py diff --git a/datumaro/plugins/openvino/samples/ssd_face_detection_interp.py b/datumaro/plugins/openvino_plugin/samples/ssd_face_detection_interp.py similarity index 100% rename from datumaro/plugins/openvino/samples/ssd_face_detection_interp.py rename to datumaro/plugins/openvino_plugin/samples/ssd_face_detection_interp.py diff --git 
a/datumaro/plugins/openvino/samples/ssd_mobilenet_coco_detection_interp.py b/datumaro/plugins/openvino_plugin/samples/ssd_mobilenet_coco_detection_interp.py similarity index 100% rename from datumaro/plugins/openvino/samples/ssd_mobilenet_coco_detection_interp.py rename to datumaro/plugins/openvino_plugin/samples/ssd_mobilenet_coco_detection_interp.py diff --git a/datumaro/plugins/openvino/samples/ssd_person_detection_interp.py b/datumaro/plugins/openvino_plugin/samples/ssd_person_detection_interp.py similarity index 100% rename from datumaro/plugins/openvino/samples/ssd_person_detection_interp.py rename to datumaro/plugins/openvino_plugin/samples/ssd_person_detection_interp.py diff --git a/datumaro/plugins/openvino/samples/ssd_person_vehicle_bike_detection_interp.py b/datumaro/plugins/openvino_plugin/samples/ssd_person_vehicle_bike_detection_interp.py similarity index 100% rename from datumaro/plugins/openvino/samples/ssd_person_vehicle_bike_detection_interp.py rename to datumaro/plugins/openvino_plugin/samples/ssd_person_vehicle_bike_detection_interp.py diff --git a/datumaro/plugins/openvino/samples/ssd_vehicle_detection_interp.py b/datumaro/plugins/openvino_plugin/samples/ssd_vehicle_detection_interp.py similarity index 100% rename from datumaro/plugins/openvino/samples/ssd_vehicle_detection_interp.py rename to datumaro/plugins/openvino_plugin/samples/ssd_vehicle_detection_interp.py diff --git a/datumaro/plugins/splitter.py b/datumaro/plugins/splitter.py index e4e3b432f75a..5d414e1333d7 100644 --- a/datumaro/plugins/splitter.py +++ b/datumaro/plugins/splitter.py @@ -1,52 +1,197 @@ -# Copyright (C) 2020 Intel Corporation +# Copyright (C) 2020-2021 Intel Corporation # # SPDX-License-Identifier: MIT import logging as log import numpy as np +import copy +from math import gcd +from enum import Enum from datumaro.components.extractor import (Transform, AnnotationType, DEFAULT_SUBSET_NAME) from datumaro.components.cli_plugin import CliPlugin +from datumaro.util import 
cast NEAR_ZERO = 1e-7 +SplitTask = Enum( + "split", ["classification", "detection", "segmentation", "reid"] +) -class _TaskSpecificSplit(Transform, CliPlugin): - _default_split = [('train', 0.5), ('val', 0.2), ('test', 0.3)] + +class Split(Transform, CliPlugin): + """ + - classification split |n + Splits dataset into subsets(train/val/test) in class-wise manner. |n + Splits dataset images in the specified ratio, keeping the initial class + distribution.|n + |n + - detection & segmentation split |n + Each image can have multiple object annotations - + (bbox, mask, polygon). Since an image shouldn't be included + in multiple subsets at the same time, and image annotations + shouldn't be split, in general, dataset annotations are unlikely + to be split exactly in the specified ratio. |n + This split tries to split dataset images as close as possible + to the specified ratio, keeping the initial class distribution.|n + |n + - reidentification split |n + In this task, the test set should consist of images of unseen + people or objects during the training phase. |n + This function splits a dataset in the following way:|n + 1. Splits the dataset into 'train + val' and 'test' sets|n + |s|sbased on person or object ID.|n + 2. Splits 'test' set into 'test-gallery' and 'test-query' sets|n + |s|sin class-wise manner.|n + 3. Splits the 'train + val' set into 'train' and 'val' sets|n + |s|sin the same way.|n + The final subsets would be + 'train', 'val', 'test-gallery' and 'test-query'. |n + |n + Notes:|n + - Each image is expected to have only one Annotation. Unlabeled or + multi-labeled images will be split into subsets randomly. 
|n + - If Labels also have attributes, also splits by attribute values.|n + - If there is not enough images in some class or attributes group, + the split ratio can't be guaranteed.|n + In reidentification task, |n + - Object ID can be described by Label, or by attribute (--attr parameter)|n + - The splits of the test set are controlled by '--query' parameter |n + |s|sGallery ratio would be 1.0 - query.|n + |n + Example:|n + |s|s%(prog)s -t classification --subset train:.5 --subset val:.2 --subset test:.3 |n + |s|s%(prog)s -t detection --subset train:.5 --subset val:.2 --subset test:.3 |n + |s|s%(prog)s -t segmentation --subset train:.5 --subset val:.2 --subset test:.3 |n + |s|s%(prog)s -t reid --subset train:.5 --subset val:.2 --subset test:.3 --query .5 |n + Example: use 'person_id' attribute for splitting|n + |s|s%(prog)s --attr person_id + """ + + _default_split = [("train", 0.5), ("val", 0.2), ("test", 0.3)] + _default_query_ratio = 0.5 @classmethod def build_cmdline_parser(cls, **kwargs): parser = super().build_cmdline_parser(**kwargs) - parser.add_argument('-s', '--subset', action='append', - type=cls._split_arg, dest='splits', + parser.add_argument( + "-t", + "--task", + default=SplitTask.classification.name, + choices=[t.name for t in SplitTask], + help="(one of {}; default: %(default)s)".format( + ", ".join(t.name for t in SplitTask) + ), + ) + parser.add_argument( + "-s", + "--subset", + action="append", + type=cls._split_arg, + dest="splits", help="Subsets in the form: ':' " - "(repeatable, default: %s)" % dict(cls._default_split)) - parser.add_argument('--seed', type=int, help="Random seed") + "(repeatable, default: %s)" % dict(cls._default_split), + ) + parser.add_argument( + "--query", + type=float, + default=None, + help="Query ratio in the test set (default: %.3f)" + % cls._default_query_ratio, + ) + parser.add_argument( + "--attr", + type=str, + dest="attr_for_id", + default=None, + help="Attribute name representing the ID (default: use label)", + 
) + parser.add_argument("--seed", type=int, help="Random seed") return parser @staticmethod def _split_arg(s): - parts = s.split(':') + parts = s.split(":") if len(parts) != 2: import argparse + raise argparse.ArgumentTypeError() return (parts[0], float(parts[1])) - def __init__(self, dataset, splits, seed): + def __init__(self, dataset, task, splits, query=None, attr_for_id=None, seed=None): super().__init__(dataset) if splits is None: splits = self._default_split - snames, sratio = self._validate_splits(splits) + self.task = task + self.splitter = self._get_splitter( + task, dataset, splits, seed, query, attr_for_id + ) + self._initialized = False + self._subsets = self.splitter._subsets + + @staticmethod + def _get_splitter(task, dataset, splits, seed, query, attr_for_id): + if task == SplitTask.classification.name: + splitter = _ClassificationSplit(dataset=dataset, splits=splits, seed=seed) + elif task in {SplitTask.detection.name, SplitTask.segmentation.name}: + splitter = _InstanceSpecificSplit( + dataset=dataset, splits=splits, seed=seed, task=task + ) + elif task == SplitTask.reid.name: + splitter = _ReidentificationSplit( + dataset=dataset, + splits=splits, + seed=seed, + query=query, + attr_for_id=attr_for_id, + ) + else: + raise Exception( + f"Unknown task '{task}', available " + f"splitter format: {[a.name for a in SplitTask]}" + ) + return splitter + + def __iter__(self): + # lazy splitting + if self._initialized is False: + self.splitter._split_dataset() + self._initialized = True + for i, item in enumerate(self._extractor): + yield self.wrap_item(item, subset=self.splitter._find_split(i)) + + def get_subset(self, name): + # lazy splitting + if self._initialized is False: + self.splitter._split_dataset() + self._initialized = True + return super().get_subset(name) + + def subsets(self): + # lazy splitting + if self._initialized is False: + self.splitter._split_dataset() + self._initialized = True + return super().subsets() + + +class 
_TaskSpecificSplit: + def __init__(self, dataset, splits, seed, restrict=False): + self._extractor = dataset + + snames, sratio, subsets = self._validate_splits(splits, restrict) self._snames = snames self._sratio = sratio self._seed = seed - self._subsets = {"train", "val", "test"} # output subset names + # remove subset name restriction + # https://github.com/openvinotoolkit/datumaro/issues/194 + self._subsets = subsets self._parts = [] self._length = "parent" @@ -60,31 +205,46 @@ def _set_parts(self, by_splits): @staticmethod def _get_uniq_annotations(dataset): annotations = [] - for item in dataset: - labels = [a for a in item.annotations - if a.type == AnnotationType.label] - if len(labels) != 1: - raise Exception("Item '%s' contains %s labels, " - "but exactly one is expected" % (item.id, len(labels))) - annotations.append(labels[0]) - return annotations + unlabeled_or_multi = [] + + for idx, item in enumerate(dataset): + labels = [a for a in item.annotations if a.type == AnnotationType.label] + if len(labels) == 1: + annotations.append(labels[0]) + else: + unlabeled_or_multi.append(idx) + + return annotations, unlabeled_or_multi @staticmethod - def _validate_splits(splits, valid=None): + def _validate_splits(splits, restrict=False): snames = [] ratios = [] - if valid is None: - valid = ["train", "val", "test"] + subsets = set() + valid = ["train", "val", "test"] for subset, ratio in splits: - assert subset in valid, \ - "Subset name must be one of %s, but got %s" % (valid, subset) - assert 0.0 <= ratio and ratio <= 1.0, \ - "Ratio is expected to be in the range " \ - "[0, 1], but got %s for %s" % (ratio, subset) + # remove subset name restriction + # https://github.com/openvinotoolkit/datumaro/issues/194 + if restrict: + assert subset in valid, "Subset name must be one of %s, got %s" % ( + valid, + subset, + ) + assert ( + 0.0 <= ratio and ratio <= 1.0 + ), "Ratio is expected to be in the range " "[0, 1], but got %s for %s" % ( + ratio, + subset, + ) # 
ignore near_zero ratio because it may produce partition error. if ratio > NEAR_ZERO: + # handling duplication + if subset in snames: + raise Exception("Subset (%s) is duplicated" % subset) snames.append(subset) ratios.append(float(ratio)) + subsets.add(subset) + ratios = np.array(ratios) total_ratio = np.sum(ratios) @@ -94,15 +254,26 @@ def _validate_splits(splits, valid=None): % (splits, total_ratio) ) - return snames, ratios + return snames, ratios, subsets @staticmethod def _get_required(ratio): - min_value = np.max(ratio) - for i in ratio: - if NEAR_ZERO < i and i < min_value: - min_value = i - required = int(np.around(1.0) / min_value) + if len(ratio) < 2: + return 1 + + for scale in [10, 100]: + farray = np.array(ratio) * scale + iarray = farray.astype(int) + if np.array_equal(iarray, farray): + break + + # find gcd + common_divisor = iarray[0] + for val in iarray[1:]: + common_divisor = gcd(common_divisor, val) + + required = np.sum(np.array(iarray / common_divisor).astype(int)) + return required @staticmethod @@ -119,47 +290,105 @@ def _get_sections(dataset_size, ratio): n_splits[ii] += 1 n_splits[midx] -= 1 sections = np.add.accumulate(n_splits[:-1]) - return sections + return sections, n_splits @staticmethod def _group_by_attr(items): """ Args: - items: list of (idx, ann). ann is the annotation from Label object. + items: list of (idx_img, ann). ann is the annotation from Label object. 
Returns: by_attributes: dict of { combination-of-attrs : list of index } """ + + # float--> numerical, others(int, string, bool) --> categorical + def _is_float(value): + if isinstance(value, str): + casted = cast(value, float) + if casted is not None: + if cast(casted, str) == value: + return True + return False + elif isinstance(value, float): + cast(value, float) + return True + return False + # group by attributes by_attributes = dict() - for idx, ann in items: - attributes = tuple(sorted(ann.attributes.items())) + for idx_img, ann in items: + # ignore numeric attributes + filtered = {} + for attr, value in ann.attributes.items(): + if _is_float(value): + continue + filtered[attr] = value + attributes = tuple(sorted(filtered.items())) if attributes not in by_attributes: by_attributes[attributes] = [] - by_attributes[attributes].append(idx) + by_attributes[attributes].append(idx_img) + return by_attributes - def _split_by_attr(self, datasets, snames, ratio, out_splits, - dataset_key=None): + def _split_by_attr( + self, datasets, snames, ratio, out_splits, merge_small_classes=True + ): + def _split_indice(indice): + sections, _ = self._get_sections(len(indice), ratio) + splits = np.array_split(indice, sections) + for subset, split in zip(snames, splits): + if 0 < len(split): + out_splits[subset].extend(split) + required = self._get_required(ratio) - if dataset_key is None: - dataset_key = "label" - for key, items in datasets.items(): + rest = [] + for _, items in datasets.items(): np.random.shuffle(items) by_attributes = self._group_by_attr(items) - for attributes, indice in by_attributes.items(): - gname = "%s: %s, attrs: %s" % (dataset_key, key, attributes) - splits = self._split_indice(indice, gname, ratio, required) - for subset, split in zip(snames, splits): - if 0 < len(split): - out_splits[subset].extend(split) - - def _split_indice(self, indice, group_name, ratio, required): - filtered_size = len(indice) - if filtered_size < required: - log.warning("Not 
enough samples for a group, '%s'" % group_name) - sections = self._get_sections(filtered_size, ratio) - splits = np.array_split(indice, sections) - return splits + attr_combinations = list(by_attributes.keys()) + np.random.shuffle(attr_combinations) # add randomness + for attr in attr_combinations: + indice = by_attributes[attr] + quo = len(indice) // required + if quo > 0: + filtered_size = quo * required + _split_indice(indice[:filtered_size]) + rest.extend(indice[filtered_size:]) + else: + rest.extend(indice) + + quo = len(rest) // required + if quo > 0: + filtered_size = quo * required + _split_indice(rest[:filtered_size]) + rest = rest[filtered_size:] + + if not merge_small_classes and len(rest) > 0: + _split_indice(rest) + rest = [] + + if len(rest) > 0: + _split_indice(rest) + + def _split_unlabeled(self, unlabeled, by_splits): + """ + split unlabeled data into subsets (detection, classification) + Args: + unlabeled: list of index of unlabeled or multi-labeled data + by_splits: splits up to now + Returns: + by_splits: final splits + """ + dataset_size = len(self._extractor) + _, n_splits = list(self._get_sections(dataset_size, self._sratio)) + counts = [len(by_splits[sname]) for sname in self._snames] + expected = [max(0, v) for v in np.subtract(n_splits, counts)] + sections = np.add.accumulate(expected[:-1]) + np.random.shuffle(unlabeled) + splits = np.array_split(unlabeled, sections) + for subset, split in zip(self._snames, splits): + if 0 < len(split): + by_splits[subset].extend(split) def _find_split(self, index): for subset_indices, subset in self._parts: @@ -170,30 +399,24 @@ def _find_split(self, index): def _split_dataset(self): raise NotImplementedError() - def __iter__(self): - # lazy splitting - if self._initialized is False: - self._split_dataset() - self._initialized = True - for i, item in enumerate(self._extractor): - yield self.wrap_item(item, subset=self._find_split(i)) - -class ClassificationSplit(_TaskSpecificSplit): +class 
_ClassificationSplit(_TaskSpecificSplit): """ - Splits dataset into train/val/test set in class-wise manner. |n + Splits dataset into subsets(train/val/test) in class-wise manner. |n Splits dataset images in the specified ratio, keeping the initial class distribution.|n |n Notes:|n - - Each image is expected to have only one Label|n + - Each image is expected to have only one Label. Unlabeled or + multi-labeled images will be split into subsets randomly. |n - If Labels also have attributes, also splits by attribute values.|n - If there is not enough images in some class or attributes group, the split ratio can't be guaranteed.|n |n Example:|n - |s|s%(prog)s --subset train:.5 --subset val:.2 --subset test:.3 + |s|s%(prog)s -t classification --subset train:.5 --subset val:.2 --subset test:.3 """ + def __init__(self, dataset, splits, seed=None): """ Parameters @@ -201,7 +424,6 @@ def __init__(self, dataset, splits, seed=None): dataset : Dataset splits : list A list of (subset(str), ratio(float)) - Subset is expected to be one of ["train", "val", "test"]. The sum of ratios is expected to be 1. seed : int, optional """ @@ -213,9 +435,10 @@ def _split_dataset(self): # support only single label for a DatasetItem # 1. group by label by_labels = dict() - annotations = self._get_uniq_annotations(self._extractor) + annotations, unlabeled = self._get_uniq_annotations(self._extractor) + for idx, ann in enumerate(annotations): - label = getattr(ann, 'label', None) + label = getattr(ann, "label", None) if label not in by_labels: by_labels[label] = [] by_labels[label].append((idx, ann)) @@ -226,10 +449,16 @@ def _split_dataset(self): # 2. group by attributes self._split_by_attr(by_labels, self._snames, self._sratio, by_splits) + + # 3. split unlabeled data + if len(unlabeled) > 0: + self._split_unlabeled(unlabeled, by_splits) + + # 4. 
set parts self._set_parts(by_splits) -class ReidentificationSplit(_TaskSpecificSplit): +class _ReidentificationSplit(_TaskSpecificSplit): """ Splits a dataset for re-identification task.|n Produces a split with a specified ratio of images, avoiding having same @@ -248,32 +477,22 @@ class ReidentificationSplit(_TaskSpecificSplit): 'train', 'val', 'test-gallery' and 'test-query'. |n |n Notes:|n - - Each image is expected to have a single Label|n + - Each image is expected to have a single Label. Unlabeled or multi-labeled + images will be split into 'not-supported'.|n - Object ID can be described by Label, or by attribute (--attr parameter)|n - The splits of the test set are controlled by '--query' parameter. |n |s|sGallery ratio would be 1.0 - query.|n |n Example: split a dataset in the specified ratio, split the test set|n |s|s|s|sinto gallery and query in 1:1 ratio|n - |s|s%(prog)s --subset train:.5 --subset val:.2 --subset test:.3 --query .5|n + |s|s%(prog)s -t reid --subset train:.5 --subset val:.2 --subset test:.3 --query .5|n Example: use 'person_id' attribute for splitting|n |s|s%(prog)s --attr person_id """ _default_query_ratio = 0.5 - @classmethod - def build_cmdline_parser(cls, **kwargs): - parser = super().build_cmdline_parser(**kwargs) - parser.add_argument('--query', type=float, - help="Query ratio in the test set (default: %.3f)" - % cls._default_query_ratio) - parser.add_argument('--attr', type=str, dest='attr_for_id', - help="Attribute name representing the ID (default: use label)") - return parser - - def __init__(self, dataset, splits, query=None, - attr_for_id=None, seed=None): + def __init__(self, dataset, splits, query=None, attr_for_id=None, seed=None): """ Parameters ---------- @@ -290,17 +509,17 @@ def __init__(self, dataset, splits, query=None, if this is not specified, label would be used. 
seed : int, optional """ - super().__init__(dataset, splits, seed) + super().__init__(dataset, splits, seed, restrict=True) if query is None: query = self._default_query_ratio - assert 0.0 <= query and query <= 1.0, \ - "Query ratio is expected to be in the range " \ - "[0, 1], but got %f" % query - test_splits = [('test-query', query), ('test-gallery', 1.0 - query)] + assert 0.0 <= query and query <= 1.0, ( + "Query ratio is expected to be in the range " "[0, 1], but got %f" % query + ) + test_splits = [("test-query", query), ("test-gallery", 1.0 - query)] - # reset output subset names + # remove subset name restriction self._subsets = {"train", "val", "test-gallery", "test-query"} self._test_splits = test_splits self._attr_for_id = attr_for_id @@ -315,18 +534,19 @@ def _split_dataset(self): # group by ID(attr_for_id) by_id = dict() - annotations = self._get_uniq_annotations(dataset) + annotations, unlabeled = self._get_uniq_annotations(dataset) if attr_for_id is None: # use label for idx, ann in enumerate(annotations): - ID = getattr(ann, 'label', None) + ID = getattr(ann, "label", None) if ID not in by_id: by_id[ID] = [] by_id[ID].append((idx, ann)) else: # use attr_for_id for idx, ann in enumerate(annotations): attributes = dict(ann.attributes.items()) - assert attr_for_id in attributes, \ + assert attr_for_id in attributes, ( "'%s' is expected as an attribute name" % attr_for_id + ) ID = attributes[attr_for_id] if ID not in by_id: by_id[ID] = [] @@ -334,9 +554,9 @@ def _split_dataset(self): required = self._get_required(id_ratio) if len(by_id) < required: - log.warning("There's not enough IDs, which is %s, " - "so train/val/test ratio can't be guaranteed." - % len(by_id) + log.warning( + "There's not enough IDs, which is %s, " + "so train/val/test ratio can't be guaranteed." % len(by_id) ) # 1. 
split dataset into trval and test @@ -346,14 +566,15 @@ def _split_dataset(self): split_ratio = np.array([test, 1.0 - test]) IDs = list(by_id.keys()) np.random.shuffle(IDs) - sections = self._get_sections(len(IDs), split_ratio) + sections, _ = self._get_sections(len(IDs), split_ratio) splits = np.array_split(IDs, sections) testset = {pid: by_id[pid] for pid in splits[0]} trval = {pid: by_id[pid] for pid in splits[1]} - # follow the ratio of datasetitems as possible. # naive heuristic: exchange the best item one by one. - expected_count = int(len(self._extractor) * split_ratio[0]) + expected_count = int( + (len(self._extractor) - len(unlabeled)) * split_ratio[0] + ) testset_total = int(np.sum([len(v) for v in testset.values()])) self._rebalancing(testset, trval, expected_count, testset_total) else: @@ -372,8 +593,9 @@ def _split_dataset(self): test_snames.append(sname) test_ratio.append(float(ratio)) - self._split_by_attr(testset, test_snames, test_ratio, by_splits, - dataset_key=attr_for_id) + self._split_by_attr( + testset, test_snames, test_ratio, by_splits, merge_small_classes=False + ) # 3. split 'trval' into 'train' and 'val' trval_snames = ["train", "val"] @@ -388,14 +610,20 @@ def _split_dataset(self): total_ratio = np.sum(trval_ratio) if total_ratio < NEAR_ZERO: trval_splits = list(zip(["train", "val"], trval_ratio)) - log.warning("Sum of ratios is expected to be positive, " - "got %s, which is %s" - % (trval_splits, total_ratio) + log.warning( + "Sum of ratios is expected to be positive, " + "got %s, which is %s" % (trval_splits, total_ratio) ) else: trval_ratio /= total_ratio # normalize - self._split_by_attr(trval, trval_snames, trval_ratio, by_splits, - dataset_key=attr_for_id) + self._split_by_attr( + trval, trval_snames, trval_ratio, by_splits, merge_small_classes=False + ) + + # split unlabeled data into 'not-supported'. 
+ if len(unlabeled) > 0: + self._subsets.add("not-supported") + by_splits["not-supported"] = unlabeled self._set_parts(by_splits) @@ -446,57 +674,66 @@ def _rebalancing(test, trval, expected_count, testset_total): trval[id_test] = test.pop(id_test) -class DetectionSplit(_TaskSpecificSplit): +class _InstanceSpecificSplit(_TaskSpecificSplit): """ - Splits a dataset into train/val/test subsets for detection task, + Splits a dataset into subsets(train/val/test), using object annotations as a basis for splitting.|n Tries to produce an image split with the specified ratio, keeping the initial distribution of class objects.|n |n - In a detection dataset, each image can have multiple object annotations - - instance bounding boxes. Since an image shouldn't be included + each image can have multiple object annotations - + (instance bounding boxes, masks, polygons). Since an image shouldn't be included in multiple subsets at the same time, and image annotations - shoudln't be split, in general, dataset annotations are unlikely to be split + shouldn't be split, in general, dataset annotations are unlikely to be split exactly in the specified ratio. 
|n This split tries to split dataset images as close as possible to the specified ratio, keeping the initial class distribution.|n |n Notes:|n - - Each image is expected to have one or more Bbox annotations.|n - - Only Bbox annotations are considered.|n + - Each image is expected to have one or more annotations.|n + - Only bbox annotations are considered in detection task.|n + - Mask or Polygon annotations are considered in segmentation task.|n |n Example: split dataset so that each object class annotations were split|n |s|s|s|sin the specified ratio between subsets|n - |s|s%(prog)s --subset train:.5 --subset val:.2 --subset test:.3 + |s|s%(prog)s -t detection --subset train:.5 --subset val:.2 --subset test:.3 |n + |s|s%(prog)s -t segmentation --subset train:.5 --subset val:.2 --subset test:.3 """ - def __init__(self, dataset, splits, seed=None): + + def __init__(self, dataset, splits, task, seed=None): """ Parameters ---------- dataset : Dataset splits : list A list of (subset(str), ratio(float)) - Subset is expected to be one of ["train", "val", "test"]. The sum of ratios is expected to be 1. 
seed : int, optional """ super().__init__(dataset, splits, seed) - @staticmethod - def _group_by_bbox_labels(dataset): + if task == SplitTask.detection.name: + self.annotation_type = [AnnotationType.bbox] + elif task == SplitTask.segmentation.name: + self.annotation_type = [AnnotationType.mask, AnnotationType.polygon] + + def _group_by_labels(self, dataset): by_labels = dict() + unlabeled = [] + for idx, item in enumerate(dataset): - bbox_anns = [a for a in item.annotations - if a.type == AnnotationType.bbox] - assert 0 < len(bbox_anns), \ - "Expected more than one bbox annotation in the dataset" - for ann in bbox_anns: - label = getattr(ann, 'label', None) + instance_anns = [a for a in item.annotations if a.type in self.annotation_type] + if len(instance_anns) == 0: + unlabeled.append(idx) + continue + for instance_ann in instance_anns: + label = getattr(instance_ann, "label", None) if label not in by_labels: - by_labels[label] = [(idx, ann)] + by_labels[label] = [(idx, instance_ann)] else: - by_labels[label].append((idx, ann)) - return by_labels + by_labels[label].append((idx, instance_ann)) + + return by_labels, unlabeled def _split_dataset(self): np.random.seed(self._seed) @@ -504,82 +741,117 @@ def _split_dataset(self): subsets, sratio = self._snames, self._sratio # 1. group by bbox label - by_labels = self._group_by_bbox_labels(self._extractor) + by_labels, unlabeled = self._group_by_labels(self._extractor) # 2. group by attributes - by_combinations = dict() - for label, items in by_labels.items(): + required = self._get_required(sratio) + by_combinations = list() + for _, items in by_labels.items(): by_attributes = self._group_by_attr(items) - for attributes, indice in by_attributes.items(): - gname = "label: %s, attributes: %s" % (label, attributes) - by_combinations[gname] = indice + # merge groups which have too small samples. 
+ attr_combinations = list(by_attributes.keys()) + np.random.shuffle(attr_combinations) # add randomness + cluster = [] + min_cluster = max(required, len(items) * 0.01) # temp solution + for attr in attr_combinations: + indice = by_attributes[attr] + if len(indice) >= min_cluster: + by_combinations.append(indice) + else: + cluster.extend(indice) + if len(cluster) >= min_cluster: + by_combinations.append(cluster) + cluster = [] + + if len(cluster) > 0: + by_combinations.append(cluster) + cluster = [] + total = len(self._extractor) # total number of GT samples per label-attr combinations - n_combs = {k: len(v) for k, v in by_combinations.items()} + n_combs = [len(v) for v in by_combinations] # 3-1. initially count per-image GT samples - scores_all = {} - init_scores = {} - for idx, _ in enumerate(self._extractor): - counts = {k: v.count(idx) for k, v in by_combinations.items()} - scores_all[idx] = counts - init_scores[idx] = np.sum( - [v / n_combs[k] for k, v in counts.items()] - ) + counts_all = {} + for idx_img in range(total): + if idx_img not in unlabeled: + counts_all[idx_img] = dict() + + for idx_comb, indice in enumerate(by_combinations): + for idx_img in indice: + if idx_comb not in counts_all[idx_img]: + counts_all[idx_img][idx_comb] = 1 + else: + counts_all[idx_img][idx_comb] += 1 by_splits = dict() for sname in self._subsets: by_splits[sname] = [] - total = len(self._extractor) - target_size = dict() - expected = [] # expected numbers of per split GT samples + target_ins = [] # target instance numbers to be split for sname, ratio in zip(subsets, sratio): - target_size[sname] = total * ratio - expected.append( - (sname, {k: v * ratio for k, v in n_combs.items()}) - ) + target_ins.append([sname, np.array(n_combs) * ratio]) - # functions for keep the # of annotations not exceed the expected num + init_scores = {} + for idx_img, distributions in counts_all.items(): + norm_sum = 0.0 + for idx_comb, dis in distributions.items(): + norm_sum += dis / 
n_combs[idx_comb] + init_scores[idx_img] = norm_sum + + by_scores = dict() + for idx_img, score in init_scores.items(): + if score not in by_scores: + by_scores[score] = [idx_img] + else: + by_scores[score].append(idx_img) + + # functions for keep the # of annotations not exceed the target_ins num def compute_penalty(counts, n_combs): p = 0 - for k, v in counts.items(): - p += max(0, (v / n_combs[k]) - 1.0) + for idx_comb, v in counts.items(): + if n_combs[idx_comb] <= 0: + p += 1 + else: + p += max(0, (v / n_combs[idx_comb]) - 1.0) + return p def update_nc(counts, n_combs): - for k, v in counts.items(): - n_combs[k] = max(0, n_combs[k] - v) - if n_combs[k] == 0: - n_combs[k] = -1 - return n_combs + for idx_comb, v in counts.items(): + n_combs[idx_comb] = n_combs[idx_comb] - v # 3-2. assign each DatasetItem to a split, one by one - for idx, _ in sorted( - init_scores.items(), key=lambda item: item[1], reverse=True - ): - counts = scores_all[idx] - - # shuffling split order to add randomness - # when two or more splits have the same penalty value - np.random.shuffle(expected) - - pp = [] - for sname, nc in expected: - if target_size[sname] <= len(by_splits[sname]): - # the split has enough images, - # stop adding more images to this split - pp.append(1e08) - else: - # compute penalty based on the number of GT samples - # added in the split - pp.append(compute_penalty(counts, nc)) - - # we push an image to a split with the minimum penalty - midx = np.argmin(pp) - - sname, nc = expected[midx] - by_splits[sname].append(idx) - update_nc(counts, nc) + actual_ins = copy.deepcopy(target_ins) + for score in sorted(by_scores.keys(), reverse=True): + indice = by_scores[score] + np.random.shuffle(indice) # add randomness for the same score + + for idx in indice: + counts = counts_all[idx] + # shuffling split order to add randomness + # when two or more splits have the same penalty value + np.random.shuffle(actual_ins) + + pp = [] + for sname, nc in actual_ins: + if np.sum(nc) 
<= 0: + # the split has enough instances, + # stop adding more images to this split + pp.append(1e08) + else: + # compute penalty based on the number of GT samples + # added in the split + pp.append(compute_penalty(counts, nc)) + + # we push an image to a split with the minimum penalty + midx = np.argmin(pp) + sname, nc = actual_ins[midx] + by_splits[sname].append(idx) + update_nc(counts, nc) + + # split unlabeled data + if len(unlabeled) > 0: + self._split_unlabeled(unlabeled, by_splits) self._set_parts(by_splits) diff --git a/datumaro/plugins/transforms.py b/datumaro/plugins/transforms.py index e634794ff75c..dfecb25a990c 100644 --- a/datumaro/plugins/transforms.py +++ b/datumaro/plugins/transforms.py @@ -18,6 +18,7 @@ ) from datumaro.components.cli_plugin import CliPlugin import datumaro.util.mask_tools as mask_tools +from datumaro.util import parse_str_enum_value, NOTSET from datumaro.util.annotation_util import find_group_leader, find_instances @@ -433,7 +434,22 @@ def transform_item(self, item): class RemapLabels(Transform, CliPlugin): """ Changes labels in the dataset.|n + |n + A label can be:|n + - renamed (and joined with existing) -|n + |s|swhen specified '--label :'|n + - deleted - when specified '--label :' or default action is 'delete'|n + |s|sand the label is not mentioned in the list. 
When a label|n + |s|sis deleted, all the associated annotations are removed|n + - kept unchanged - when specified '--label :'|n + |s|sor default action is 'keep' and the label is not mentioned in the list|n + Annotations with no label are managed by the default action policy.|n + |n Examples:|n + - Remove the 'person' label (and corresponding annotations):|n + |s|sremap_labels -l person: --default keep|n + - Rename 'person' to 'pedestrian' and 'human' to 'pedestrian', join:|n + |s|sremap_labels -l person:pedestrian -l human:pedestrian --default keep|n - Rename 'person' to 'car' and 'cat' to 'dog', keep 'bus', remove others:|n |s|sremap_labels -l person:car -l bus:bus -l cat:dog --default delete """ @@ -463,9 +479,9 @@ def build_cmdline_parser(cls, **kwargs): def __init__(self, extractor, mapping, default=None): super().__init__(extractor) - assert isinstance(default, (str, self.DefaultAction)) - if isinstance(default, str): - default = self.DefaultAction[default] + default = parse_str_enum_value(default, self.DefaultAction, + self.DefaultAction.keep) + self._default_action = default assert isinstance(mapping, (dict, list)) if isinstance(mapping, list): @@ -503,10 +519,10 @@ def _make_label_id_map(self, src_label_cat, label_mapping, default_action): dst_label_cat = LabelCategories(attributes=src_label_cat.attributes) id_mapping = {} for src_index, src_label in enumerate(src_label_cat.items): - dst_label = label_mapping.get(src_label.name) - if not dst_label and default_action == self.DefaultAction.keep: + dst_label = label_mapping.get(src_label.name, NOTSET) + if dst_label is NOTSET and default_action == self.DefaultAction.keep: dst_label = src_label.name # keep unspecified as is - if not dst_label: + elif not dst_label or dst_label is NOTSET: continue dst_index = dst_label_cat.find(dst_label)[0] @@ -518,7 +534,7 @@ def _make_label_id_map(self, src_label_cat, label_mapping, default_action): if log.getLogger().isEnabledFor(log.DEBUG): log.debug("Label mapping:") for 
src_id, src_label in enumerate(src_label_cat.items): - if id_mapping.get(src_id): + if id_mapping.get(src_id) is not None: log.debug("#%s '%s' -> #%s '%s'", src_id, src_label.name, id_mapping[src_id], dst_label_cat.items[id_mapping[src_id]].name @@ -535,14 +551,11 @@ def categories(self): def transform_item(self, item): annotations = [] for ann in item.annotations: - if ann.type in { AnnotationType.label, AnnotationType.mask, - AnnotationType.points, AnnotationType.polygon, - AnnotationType.polyline, AnnotationType.bbox - } and ann.label is not None: + if getattr(ann, 'label') is not None: conv_label = self._map_id(ann.label) if conv_label is not None: annotations.append(ann.wrap(label=conv_label)) - else: + elif self._default_action is self.DefaultAction.keep: annotations.append(ann.wrap()) return item.wrap(annotations=annotations) diff --git a/datumaro/plugins/voc_format/converter.py b/datumaro/plugins/voc_format/converter.py index abb109ff9a36..54be318b0eca 100644 --- a/datumaro/plugins/voc_format/converter.py +++ b/datumaro/plugins/voc_format/converter.py @@ -201,7 +201,7 @@ def save_subsets(self): ET.SubElement(source_elem, 'annotation').text = 'Unknown' ET.SubElement(source_elem, 'image').text = 'Unknown' - if item.has_image: + if item.has_image and item.image.has_size: h, w = item.image.size size_elem = ET.SubElement(root_elem, 'size') ET.SubElement(size_elem, 'width').text = str(w) @@ -236,20 +236,12 @@ def save_subsets(self): ET.SubElement(obj_elem, 'pose').text = \ str(attr['pose']) - if 'truncated' in attr: - truncated = _convert_attr('truncated', attr, int, 0) - ET.SubElement(obj_elem, 'truncated').text = \ - '%d' % truncated - - if 'difficult' in attr: - difficult = _convert_attr('difficult', attr, int, 0) - ET.SubElement(obj_elem, 'difficult').text = \ - '%d' % difficult - - if 'occluded' in attr: - occluded = _convert_attr('occluded', attr, int, 0) - ET.SubElement(obj_elem, 'occluded').text = \ - '%d' % occluded + ET.SubElement(obj_elem, 
'truncated').text = \ + '%d' % _convert_attr('truncated', attr, int, 0) + ET.SubElement(obj_elem, 'occluded').text = \ + '%d' % _convert_attr('occluded', attr, int, 0) + ET.SubElement(obj_elem, 'difficult').text = \ + '%d' % _convert_attr('difficult', attr, int, 0) bbox = obj.get_bbox() if bbox is not None: diff --git a/datumaro/plugins/widerface_format.py b/datumaro/plugins/widerface_format.py index f5e0008f6075..a8439dc83ce0 100644 --- a/datumaro/plugins/widerface_format.py +++ b/datumaro/plugins/widerface_format.py @@ -10,6 +10,7 @@ from datumaro.components.converter import Converter from datumaro.components.extractor import (AnnotationType, Bbox, DatasetItem, Importer, Label, LabelCategories, SourceExtractor) +from datumaro.util import str_to_bool class WiderFacePath: @@ -21,6 +22,7 @@ class WiderFacePath: IMAGES_DIR_NO_LABEL = 'no_label' BBOX_ATTRIBUTES = ['blur', 'expression', 'illumination', 'occluded', 'pose', 'invalid'] + DEFAULT_LABEL = 'face' class WiderFaceExtractor(SourceExtractor): def __init__(self, path, subset=None): @@ -40,13 +42,13 @@ def __init__(self, path, subset=None): def _load_categories(self): label_cat = LabelCategories() - path = osp.join(self._dataset_dir, WiderFacePath.LABELS_FILE) if osp.isfile(path): with open(path, encoding='utf-8') as labels_file: for line in labels_file: label_cat.add(line.strip()) else: + label_cat.add(WiderFacePath.DEFAULT_LABEL) subset_path = osp.join(self._dataset_dir, WiderFacePath.SUBSET_DIR + self._subset, WiderFacePath.IMAGES_DIR) @@ -56,12 +58,15 @@ def _load_categories(self): images_dir != WiderFacePath.IMAGES_DIR_NO_LABEL: if '--' in images_dir: images_dir = images_dir.split('--')[1] - label_cat.add(images_dir) - + if images_dir != WiderFacePath.DEFAULT_LABEL: + label_cat.add(images_dir) + if len(label_cat) == 1: + label_cat = LabelCategories() return { AnnotationType.label: label_cat } def _load_items(self, path): items = {} + label_categories = self._categories[AnnotationType.label] with open(path, 
'r', encoding='utf-8') as f: lines = f.readlines() @@ -73,6 +78,7 @@ def _load_items(self, path): for line_idx in line_ids: image_path = lines[line_idx].strip() item_id = osp.splitext(image_path)[0] + item_id = item_id.replace('\\', '/') image_path = osp.join(self._dataset_dir, WiderFacePath.SUBSET_DIR + self._subset, @@ -84,9 +90,9 @@ def _load_items(self, path): if '--' in label_name: label_name = label_name.split('--')[1] if label_name != WiderFacePath.IMAGES_DIR_NO_LABEL: - label = \ - self._categories[AnnotationType.label].find(label_name)[0] - annotations.append(Label(label=label)) + label = label_categories.find(label_name)[0] + if label != None: + annotations.append(Label(label=label)) item_id = item_id[len(item_id.split('/')[0]) + 1:] items[item_id] = DatasetItem(id=item_id, subset=self._subset, @@ -101,21 +107,25 @@ def _load_items(self, path): for bbox in bbox_lines: bbox_list = bbox.split() if 4 <= len(bbox_list): - attributes = {} - label = None + label = label_categories.find(WiderFacePath.DEFAULT_LABEL)[0] if len(bbox_list) == 5 or len(bbox_list) == 11: - if len(bbox_list) == 5: - label_name = bbox_list[4] - else: - label_name = bbox_list[10] - label = \ - self._categories[AnnotationType.label].find(label_name)[0] + label_name = bbox_list[-1] + label = label_categories.find(label_name)[0] + if label == None and len(label_categories) == 0: + label_categories.add(WiderFacePath.DEFAULT_LABEL) + label = label_categories.find(WiderFacePath.DEFAULT_LABEL)[0] + + attributes = {} if 10 <= len(bbox_list): i = 4 for attr in WiderFacePath.BBOX_ATTRIBUTES: if bbox_list[i] != '-': - attributes[attr] = bbox_list[i] + if bbox_list[i] in ['True', 'False']: + attributes[attr] = str_to_bool(bbox_list[i]) + else: + attributes[attr] = bbox_list[i] i += 1 + annotations.append(Bbox( float(bbox_list[0]), float(bbox_list[1]), float(bbox_list[2]), float(bbox_list[3]), @@ -180,7 +190,8 @@ def apply(self): wider_attr += '- ' if 0 < attr_counter: wider_annotation += wider_attr 
- if bbox.label is not None: + if label_categories[bbox.label].name != WiderFacePath.DEFAULT_LABEL and \ + bbox.label is not None: wider_annotation += '%s' % label_categories[bbox.label].name wider_annotation += '\n' diff --git a/datumaro/plugins/yolo_format/converter.py b/datumaro/plugins/yolo_format/converter.py index fb71b8f172ad..71f021f0e695 100644 --- a/datumaro/plugins/yolo_format/converter.py +++ b/datumaro/plugins/yolo_format/converter.py @@ -49,7 +49,7 @@ def apply(self): if not subset_name or subset_name == DEFAULT_SUBSET_NAME: subset_name = YoloPath.DEFAULT_SUBSET_NAME elif subset_name not in YoloPath.SUBSET_NAMES: - log.warn("Skipping subset export '%s'. " + log.warning("Skipping subset export '%s'. " "If specified, the only valid names are %s" % \ (subset_name, ', '.join( "'%s'" % s for s in YoloPath.SUBSET_NAMES))) diff --git a/datumaro/plugins/yolo_format/extractor.py b/datumaro/plugins/yolo_format/extractor.py index 33ab8eb7ff51..77a7d958b2e2 100644 --- a/datumaro/plugins/yolo_format/extractor.py +++ b/datumaro/plugins/yolo_format/extractor.py @@ -92,7 +92,7 @@ def __init__(self, config_path, image_info=None): with open(list_path, 'r', encoding='utf-8') as f: subset.items = OrderedDict( (self.name_from_path(p), self.localize_path(p)) - for p in f + for p in f if p.strip() ) subsets[subset_name] = subset @@ -176,7 +176,9 @@ def _load_categories(names_path): with open(names_path, 'r', encoding='utf-8') as f: for label in f: - label_categories.add(label.strip()) + label = label.strip() + if label: + label_categories.add(label) return label_categories diff --git a/datumaro/util/__init__.py b/datumaro/util/__init__.py index ad16d2347dce..79190a0fbb21 100644 --- a/datumaro/util/__init__.py +++ b/datumaro/util/__init__.py @@ -5,9 +5,10 @@ import attr from contextlib import ExitStack +from distutils.util import strtobool as str_to_bool # pylint: disable=unused-import from functools import partial, wraps from itertools import islice -from distutils.util 
import strtobool as str_to_bool # pylint: disable=unused-import +from typing import Iterable, Tuple NOTSET = object() @@ -85,6 +86,24 @@ def parse_str_enum_value(value, enum_class, default=NOTSET, (enum_class.__name__, type(value).__name__)) return value +def escape(s: str, escapes: Iterable[Tuple[str, str]]) -> str: + """ + 'escapes' is an iterable of (pattern, substitute) pairs + """ + + for pattern, sub in escapes: + s = s.replace(pattern, sub) + return s + +def unescape(s: str, escapes: Iterable[Tuple[str, str]]) -> str: + """ + 'escapes' is an iterable of (pattern, substitute) pairs + """ + + for pattern, sub in escapes: + s = s.replace(sub, pattern) + return s + def optional_arg_decorator(fn): @wraps(fn) def wrapped_decorator(*args, **kwargs): diff --git a/datumaro/util/image.py b/datumaro/util/image.py index 95002322633f..e1acd4792d88 100644 --- a/datumaro/util/image.py +++ b/datumaro/util/image.py @@ -7,19 +7,21 @@ from enum import Enum from io import BytesIO -from typing import Iterator, Iterable, Union +from typing import Any, Callable, Iterator, Iterable, Optional, Tuple, Union import numpy as np import os import os.path as osp _IMAGE_BACKENDS = Enum('_IMAGE_BACKENDS', ['cv2', 'PIL']) _IMAGE_BACKEND = None +_image_loading_errors = (FileNotFoundError, ) try: import cv2 _IMAGE_BACKEND = _IMAGE_BACKENDS.cv2 except ImportError: import PIL _IMAGE_BACKEND = _IMAGE_BACKENDS.PIL + _image_loading_errors = (*_image_loading_errors, PIL.UnidentifiedImageError) from datumaro.util.image_cache import ImageCache as _ImageCache from datumaro.util.os_util import walk @@ -33,6 +35,8 @@ def load_image(path, dtype=np.float32): if _IMAGE_BACKEND == _IMAGE_BACKENDS.cv2: import cv2 image = cv2.imread(path, cv2.IMREAD_UNCHANGED) + if image is None: + raise FileNotFoundError("Can't open image: %s" % path) image = image.astype(dtype) elif _IMAGE_BACKEND == _IMAGE_BACKENDS.PIL: from PIL import Image @@ -43,8 +47,6 @@ def load_image(path, dtype=np.float32): else: raise 
NotImplementedError() - if image is None: - raise ValueError("Can't open image '%s'" % path) assert len(image.shape) in {2, 3} if len(image.shape) == 3: assert image.shape[2] in {3, 4} @@ -63,7 +65,13 @@ def save_image(path, image, create_dir=False, dtype=np.uint8, **kwargs): if not kwargs: kwargs = {} - if _IMAGE_BACKEND == _IMAGE_BACKENDS.cv2: + # NOTE: OpenCV documentation says "If the image format is not supported, + # the image will be converted to 8-bit unsigned and saved that way". + # Conversion from np.int32 to np.uint8 is not working properly + backend = _IMAGE_BACKEND + if dtype == np.int32: + backend = _IMAGE_BACKENDS.PIL + if backend == _IMAGE_BACKENDS.cv2: import cv2 params = [] @@ -76,7 +84,7 @@ def save_image(path, image, create_dir=False, dtype=np.uint8, **kwargs): image = image.astype(dtype) cv2.imwrite(path, image, params=params) - elif _IMAGE_BACKEND == _IMAGE_BACKENDS.PIL: + elif backend == _IMAGE_BACKENDS.PIL: from PIL import Image params = {} @@ -227,15 +235,16 @@ def __hash__(self): return hash((id(self), self.path, self.loader)) class Image: - def __init__(self, data=None, path=None, loader=None, cache=None, - size=None): - assert size is None or len(size) == 2 + def __init__(self, data: Union[None, Callable, np.ndarray] = None, + path: Optional[str] = None, loader: Optional[Callable] = None, + size: Optional[Tuple[int, int]] = None, cache: Any = None): + assert size is None or len(size) == 2, size if size is not None: assert len(size) == 2 and 0 < size[0] and 0 < size[1], size size = tuple(size) self._size = size # (H, W) - assert path is None or isinstance(path, str) + assert path is None or isinstance(path, str), path if path is None: path = '' elif path: @@ -254,15 +263,15 @@ def __init__(self, data=None, path=None, loader=None, cache=None, self._size = data.shape[:2] @property - def path(self): + def path(self) -> str: return self._path @property - def ext(self): + def ext(self) -> str: return osp.splitext(osp.basename(self.path))[1] 
@property - def data(self): + def data(self) -> np.ndarray: if callable(self._data): data = self._data() else: @@ -273,17 +282,20 @@ def data(self): return data @property - def has_data(self): + def has_data(self) -> bool: return self._data is not None @property - def has_size(self): + def has_size(self) -> bool: return self._size is not None or isinstance(self._data, np.ndarray) @property - def size(self): + def size(self) -> Optional[Tuple[int, int]]: if self._size is None: - data = self.data + try: + data = self.data + except _image_loading_errors: + return None if data is not None: self._size = data.shape[:2] return self._size diff --git a/datumaro/util/mask_tools.py b/datumaro/util/mask_tools.py index b6c2bc946205..bd763dffea6c 100644 --- a/datumaro/util/mask_tools.py +++ b/datumaro/util/mask_tools.py @@ -3,6 +3,7 @@ # # SPDX-License-Identifier: MIT +from itertools import chain import numpy as np from datumaro.util.image import lazy_image, load_image @@ -112,6 +113,13 @@ def make_binary_mask(mask): return mask return mask.astype(bool) +def bgr2index(img): + if img.dtype.kind not in {'b', 'i', 'u'}: + img = img.astype(np.uint8) + return (img[..., 0] << 16) + (img[..., 1] << 8) + img[..., 2] + +def index2bgr(id_map): + return np.dstack((id_map >> 16, id_map >> 8, id_map)).astype(np.uint8) def load_mask(path, inverse_colormap=None): mask = load_image(path, dtype=np.uint8) @@ -279,7 +287,7 @@ def find_mask_bbox(mask): y0, y1 = np.where(rows)[0][[0, -1]] return [x0, y0, x1 - x0, y1 - y0] -def merge_masks(masks): +def merge_masks(masks, start=None): """ Merges masks into one, mask order is responsible for z order. 
To avoid memory explosion on mask materialization, consider passing @@ -288,6 +296,9 @@ def merge_masks(masks): Inputs: a sequence of index masks or (binary mask, index) pairs Outputs: an index mask """ + if start is not None: + masks = chain([start], masks) + it = iter(masks) try: diff --git a/datumaro/version.py b/datumaro/version.py index 43ac631f58b7..aae69457aadb 100644 --- a/datumaro/version.py +++ b/datumaro/version.py @@ -1 +1 @@ -VERSION = '0.1.8' \ No newline at end of file +VERSION = '0.1.9' \ No newline at end of file diff --git a/docs/cli_design.mm b/docs/cli_design.mm index 0ff17cb29940..9e2eddacebe5 100644 --- a/docs/cli_design.mm +++ b/docs/cli_design.mm @@ -9,7 +9,7 @@ - + diff --git a/docs/design.md b/docs/design.md index 1e520400c0df..b24a57a5958f 100644 --- a/docs/design.md +++ b/docs/design.md @@ -108,7 +108,7 @@ It should be capable of downloading and processing data from CVAT. - [x] PASCAL VOC - [x] YOLO - [x] TF Detection API - - [ ] Cityscapes + - [x] Cityscapes - [x] ImageNet - Dataset visualization (`show`) diff --git a/docs/developer_guide.md b/docs/developer_guide.md index 2bfab0e5b8ae..e8eff1bcf85f 100644 --- a/docs/developer_guide.md +++ b/docs/developer_guide.md @@ -72,7 +72,7 @@ class colors for masks, class attributes. This information is stored in `dataset.categories`, which is a mapping from `AnnotationType` to a corresponding `...Categories` class. Each annotation type can have its `Categories`. Typically, there will be a `LabelCategories` object. -Annotations and other categories adress dataset labels +Annotations and other categories address dataset labels by their indices in this object. The main operation for a dataset is iteration over its elements. 
diff --git a/docs/formats/cityscapes_user_manual.md b/docs/formats/cityscapes_user_manual.md new file mode 100644 index 000000000000..534f91c726a8 --- /dev/null +++ b/docs/formats/cityscapes_user_manual.md @@ -0,0 +1,176 @@ +# Cityscapes user manual + +## Contents + +- [Format specification](#format-specification) +- [Load Cityscapes dataset](#load-Cityscapes-dataset) +- [Export to other formats](#export-to-other-formats) +- [Export to Cityscapes](#export-to-Cityscapes) +- [Particular use cases](#particular-use-cases) + +## Format specification + +Cityscapes format overview available [here](https://www.cityscapes-dataset.com/dataset-overview/). +Cityscapes format specification available [here](https://github.com/mcordts/cityscapesScripts#the-cityscapes-dataset). + +Cityscapes dataset format supports `Masks` (segmentations tasks) annotations. + +## Load Cityscapes dataset + +The Cityscapes dataset is available for free [download](https://www.cityscapes-dataset.com/downloads/). + +There are two ways to create Datumaro project and add Cityscapes dataset to it: + +``` bash +datum import --format cityscapes --input-path +# or +datum create +datum add path -f cityscapes +``` + +It is possible to specify project name and project directory run +`datum create --help` for more information. + +Cityscapes dataset directory should have the following structure: + + +``` +└─ Dataset/ + ├── imgsFine/ + │ ├── leftImg8bit + │ │ ├── + │ │ | ├── {city1} + │ │ │ | ├── {city1}_{seq:[0...6]}_{frame:[0...6]}_leftImg8bit.png + │ │ │ │ └── ... + │ │ | ├── {city2} + │ │ │ └── ... + │ │ └── ... + ├── gtFine/ + │ ├── + │ │ ├── {city1} + │ │ | ├── {city1}_{seq:[0...6]}_{frame:[0...6]}_gtFine_color.png + │ │ | ├── {city1}_{seq:[0...6]}_{frame:[0...6]}_gtFine_instanceIds.png + │ │ | ├── {city1}_{seq:[0...6]}_{frame:[0...6]}_gtFine_labelIds.png + │ │ │ └── ... + │ │ ├── {city2} + │ │ └── ... + │ └── ... +``` + +Annotated files description: +1. *leftImg8bit.png - left images in 8-bit LDR format +1. 
*color.png - class labels are encoded by its color +1. *instanceIds.png - class and instance labels are encoded by an instance ID. + The pixel values encode class and the individual instance: the integer part + of a division by 1000 of each ID provides class ID, the remainder + is the instance ID. If a certain annotation describes multiple instances, + then the pixels have the regular ID of that class +1. *labelIds.png - class labels are encoded by its ID + +To make sure that the selected dataset has been added to the project, you can run +`datum info`, which will display the project and dataset information. + +## Export to other formats + +Datumaro can convert Cityscapes dataset into any other format [Datumaro supports](../user_manual.md#supported-formats). +To get the expected result, the dataset needs to be converted to formats +that support the segmentation task (e.g. PascalVOC, CamVID, etc.) +There are few ways to convert Cityscapes dataset to other dataset format: + +``` bash +datum project import -f cityscapes -i +datum export -f voc -o +# or +datum convert -if cityscapes -i -f voc -o +``` + +Some formats provide extra options for conversion. +These options are passed after double dash (`--`) in the command line. +To get information about them, run + +`datum export -f -- -h` + +## Export to Cityscapes + +There are few ways to convert dataset to Cityscapes format: + +``` bash +# export dataset into Cityscapes format from existing project +datum export -p -f cityscapes -o \ + -- --save-images +# converting to Cityscapes format from other format +datum convert -if voc -i \ + -f cityscapes -o -- --save-images +``` + +Extra options for export to cityscapes format: +- `--save-images` allow to export dataset with saving images +(by default `False`); +- `--image-ext IMAGE_EXT` allow to specify image extension +for exporting dataset (by default - keep original or use `.png`, if none). 
+- `--apply-colormap APPLY_COLORMAP` allow to use colormap for class masks +(`*color.png` files, by default `True`); +- `--label-map` allow to define a custom colormap. Example + +``` bash +# mycolormap.txt : +# 0 0 255 sky +# 255 0 0 person +#... +datum export -f cityscapes -- --label-map mycolormap.txt + +# or you can use original cityscapes colormap: +datum export -f cityscapes -- --label-map cityscapes +``` + +## Particular use cases + +Datumaro supports filtering, transformation, merging etc. for all formats +and for the Cityscapes format in particular. Follow +[user manual](../user_manual.md) +to get more information about these operations. + +There are few examples of using Datumaro operations to solve +particular problems with Cityscapes dataset: + +### Example 1. How to load an original Cityscapes dataset and convert to Pascal VOC + +```bash +datum create -o project +datum add path -p project -f cityscapes ./Cityscapes/ +datum stats -p project +datum export -p project -o dataset -f voc --overwrite -- --save-images +``` + +### Example 2.
How to create custom Cityscapes-like dataset + +```python +import numpy as np +from datumaro.components.dataset import Dataset +from datumaro.components.extractor import Mask, DatasetItem + +import datumaro.plugins.cityscapes_format as Cityscapes + +label_map = OrderedDict() +label_map['background'] = (0, 0, 0) +label_map['label_1'] = (1, 2, 3) +label_map['label_2'] = (3, 2, 1) +categories = Cityscapes.make_cityscapes_categories(label_map) + +dataset = Dataset.from_iterable([ + DatasetItem(id=1, + image=np.ones((1, 5, 3)), + annotations=[ + Mask(image=np.array([[1, 0, 0, 1, 1]]), label=1, id=1, + attributes={'is_crowd': False}), + Mask(image=np.array([[0, 1, 1, 0, 0]]), label=2, id=2, + attributes={'is_crowd': False}), + ] + ), + ], categories=categories) + +dataset.export('./dataset', format='cityscapes') +``` + +More examples of working with Cityscapes dataset from code can be found in +[tests](../../tests/test_cityscapes_format.py) diff --git a/docs/formats/coco_user_manual.md b/docs/formats/coco_user_manual.md new file mode 100644 index 000000000000..5b2d37901665 --- /dev/null +++ b/docs/formats/coco_user_manual.md @@ -0,0 +1,218 @@ +# COCO user manual + +## Contents + +- [Format specification](#format-specification) +- [Load COCO dataset](#load-COCO-dataset) +- [Export to other formats](#export-to-other-formats) +- [Export to COCO](#export-to-COCO) +- [Particular use cases](#particular-use-cases) + +## Format specification + +COCO format specification available [here](https://cocodataset.org/#format-data). 
+ +COCO dataset format supports `captions`, `image_info`, `instances`, `panoptic`, +`person_keypoints`, `stuff` annotation tasks +and, as Datumaro extension, `label` (like `instances` with only `category_id`) + +## Load COCO dataset + +The COCO dataset is available for free download: + +Images: +- [train images](http://images.cocodataset.org/zips/train2017.zip) +- [val images](http://images.cocodataset.org/zips/val2017.zip) +- [test images](http://images.cocodataset.org/zips/test2017.zip) +- [unlabeled images](http://images.cocodataset.org/zips/unlabeled2017.zip) + +Annotations: +- [captions](http://images.cocodataset.org/annotations/annotations_trainval2017.zip) +- [image_info](http://images.cocodataset.org/annotations/image_info_test2017.zip) +- [instances](http://images.cocodataset.org/annotations/annotations_trainval2017.zip) +- [panoptic](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip) +- [person_keypoints](http://images.cocodataset.org/annotations/annotations_trainval2017.zip) +- [stuff](http://images.cocodataset.org/annotations/stuff_annotations_trainval2017.zip) + +There are two ways to create Datumaro project and add COCO dataset to it: + +``` bash +datum import --format coco --input-path +# or +datum create +datum add path -f coco +``` + +It is possible to specify project name and project directory run +`datum create --help` for more information. + +COCO dataset directory should have the following structure: + + +``` +└─ Dataset/ + ├── images/ + │ ├── train + │ │ ├── + │ │ ├── + │ │ └── ... + │ ├── val + │ │ ├── + │ │ ├── + │ │ └── ... + ├── annotations/ + │ └── _train.json + │ └── _test.json +``` + +For `panoptic` COCO dataset directory should have the following structure: + + +``` +└─ Dataset/ + ├── images/ + │ ├── train + │ │ ├── + │ │ ├── + │ │ └── ... + │ ├── val + │ │ ├── + │ │ ├── + │ │ └── ... + ├── annotations/ + │ ├── panoptic_train + │ │ ├── + │ │ ├── + │ │ └── ... 
+ │ ├── panoptic_train.json + │ ├── panoptic_val + │ │ ├── + │ │ ├── + │ │ └── ... + │ └── panoptic_val.json +``` + +You can import dataset for specific tasks +of COCO dataset instead of the whole dataset, +for example: + +``` bash +datum import --format coco_stuff --input-path +``` + +Datumaro supports the following COCO tasks: +- [Image Captioning](https://cocodataset.org/#captions-2015) (`coco_caption`) +- [Object Detection](https://cocodataset.org/#detection-2020) (`coco_instances`) +- Image classification (our extension) (`coco_labels`) - a format like Object Detection, which uses + only `category_id` and `score` annotation fields +- [Panoptic Segmentation](https://cocodataset.org/#panoptic-2020) (`coco_panoptic`) +- [Keypoint Detection](https://cocodataset.org/#keypoints-2020) (`coco_person_keypoints`) +- [Stuff Segmentation](https://cocodataset.org/#stuff-2019) (`coco_stuff`) + +To make sure that the selected dataset has been added to the project, you can run +`datum info`, which will display the project and dataset information. + +## Export to other formats + +Datumaro can convert COCO dataset into any other format [Datumaro supports](../user_manual.md#supported-formats). +To get the expected result, the dataset needs to be converted to formats +that support the specified task (e.g. for panoptic segmentation - VOC, CamVID) +There are few ways to convert COCO dataset to other dataset format: + +``` bash +datum project import -f coco -i +datum export -f voc -o +# or +datum convert -if coco -i -f voc -o +``` + +Some formats provide extra options for conversion. +These options are passed after double dash (`--`) in the command line. 
+To get information about them, run + +`datum export -f -- -h` + +## Export to COCO + +There are few ways to convert dataset to COCO format: + +``` bash +# export dataset into COCO format from existing project +datum export -p -f coco -o \ + -- --save-images +# converting to COCO format from other format +datum convert -if voc -i \ + -f coco -o -- --save-images +``` + +Extra options for export to COCO format: +- `--save-images` allow to export dataset with saving images +(by default `False`); +- `--image-ext IMAGE_EXT` allow to specify image extension +for exporting dataset (by default - keep original or use `.jpg`, if none); +- `--segmentation-mode MODE` allow to specify save mode for instance segmentation: + - 'guess': guess the mode for each instance (using 'is_crowd' attribute as hint) + - 'polygons': save polygons( merge and convert masks, prefer polygons) + - 'mask': save masks (merge and convert polygons, prefer masks) +(by default `guess`); +- `--crop-covered` allow to crop covered segments so that background objects +segmentation was more accurate (by default `False`); +- `--allow-attributes ALLOW_ATTRIBUTES` allow export of attributes +(by default `True`); +- `--reindex REINDEX` allow to assign new indices to images and annotations, +useful to avoid merge conflicts (by default `False`); +- `--merge-images` allow to save all images into a single directory +(by default `False`); +- `--tasks TASKS` allow to specify tasks for export dataset, +by default Datumaro uses all tasks. Example: + +```bash +datum import -o project -f coco -i +datum export -p project -f coco -- --tasks instances,stuff +``` + +## Particular use cases + +Datumaro supports filtering, transformation, merging etc. for all formats +and for the COCO format in particular. Follow +[user manual](../user_manual.md) +to get more information about these operations. + +There are few examples of using Datumaro operations to solve +particular problems with COCO dataset: + +### Example 1. 
How to load an original panoptic COCO dataset and convert to Pascal VOC + +```bash +datum create -o project +datum add path -p project -f coco_panoptic ./COCO/annotations/panoptic_val2017.json +datum stats -p project +datum export -p final_project -o dataset -f voc --overwrite -- --save-images +``` + +### Example 2. How to create custom COCO-like dataset + +```python +import numpy as np +from datumaro.components.dataset import Dataset +from datumaro.components.extractor import Mask, DatasetItem + +dataset = Dataset.from_iterable([ + DatasetItem(id='000000000001', + image=np.ones((1, 5, 3)), + subset='val', + attributes={'id': 40}, + annotations=[ + Mask(image=np.array([[0, 0, 1, 1, 0]]), label=3, + id=7, group=7, attributes={'is_crowd': False}), + Mask(image=np.array([[0, 1, 0, 0, 1]]), label=1, + id=20, group=20, attributes={'is_crowd': True}), + ] + ), + ], categories=['a', 'b', 'c', 'd']) + +dataset.export('./dataset', format='coco_panoptic') +``` + +More examples of working with COCO dataset from code can be found in +[tests](../../tests/test_coco_format.py) diff --git a/docs/formats/mnist_user_manual.md b/docs/formats/mnist_user_manual.md new file mode 100644 index 000000000000..01645e2827e1 --- /dev/null +++ b/docs/formats/mnist_user_manual.md @@ -0,0 +1,179 @@ +# MNIST user manual + +## Contents + +- [Format specification](#format-specification) +- [Load MNIST dataset](#load-MNIST-dataset) +- [Export to other formats](#export-to-other-formats) +- [Export to MNIST](#export-to-MNIST) +- [Particular use cases](#particular-use-cases) + +## Format specification + +MNIST format specification available [here](http://yann.lecun.com/exdb/mnist/). +Fashion MNIST format specification available [here](https://github.com/zalandoresearch/fashion-mnist). +MNIST in CSV format specification available [here](https://pjreddie.com/projects/mnist-in-csv/). + +MNIST dataset format supports `Labels` annotations. 
+ +## Load MNIST dataset + +The MNIST dataset is available for free download: + +- [train-images-idx3-ubyte.gz](https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz): training set images +- [train-labels-idx1-ubyte.gz](https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz): training set labels +- [t10k-images-idx3-ubyte.gz](https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz): test set images +- [t10k-labels-idx1-ubyte.gz](https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz): test set labels + +The Fashion MNIST dataset is available for free download: + +- [train-images-idx3-ubyte.gz](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz): training set images +- [train-labels-idx1-ubyte.gz](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz): training set labels +- [t10k-images-idx3-ubyte.gz](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz): test set images +- [t10k-labels-idx1-ubyte.gz](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz): test set labels + +The MNIST in CSV dataset is available for free download: + +- [mnist_train.csv](https://pjreddie.com/media/files/mnist_train.csv) +- [mnist_test.csv](https://pjreddie.com/media/files/mnist_test.csv) + +There are two ways to create Datumaro project and add MNIST dataset to it: + +``` bash +datum import --format mnist --input-path +# or +datum create +datum add path -f mnist +``` + +There are two ways to create Datumaro project and add MNIST in CSV dataset to it: + +``` bash +datum import --format mnist_csv --input-path +# or +datum create +datum add path -f mnist_csv +``` + +It is possible to specify project name and project directory run +`datum create --help` for more information. 
+ +MNIST dataset directory should have the following structure: + + +``` +└─ Dataset/ + ├── labels.txt # list of non-digit labels (optional) + ├── t10k-images-idx3-ubyte.gz + ├── t10k-labels-idx1-ubyte.gz + ├── train-images-idx3-ubyte.gz + └── train-labels-idx1-ubyte.gz +``` +MNIST in CSV dataset directory should have the following structure: + + +``` +└─ Dataset/ + ├── labels.txt # list of non-digit labels (optional) + ├── mnist_test.csv + └── mnist_train.csv +``` +If the dataset needs non-digit labels, you need to add the labels.txt +to the dataset folder. +For example, labels.txt for Fashion MNIST labels contains the following: + +``` +T-shirt/top +Trouser +Pullover +Dress +Coat +Sandal +Shirt +Sneaker +Bag +Ankle boot +``` + +MNIST format only supports single channel 28 x 28 images. + +## Export to other formats + +Datumaro can convert MNIST dataset into any other format [Datumaro supports](../user_manual.md#supported-formats). +To get the expected result, the dataset needs to be converted to formats +that support the classification task (e.g. CIFAR-10/100, ImageNet, PascalVOC, etc.) +There are few ways to convert MNIST dataset to other dataset format: + +``` bash +datum project import -f mnist -i +datum export -f imagenet -o +# or +datum convert -if mnist -i -f imagenet -o +``` + +These commands also work for MNIST in CSV if you use `mnist_csv` instead of `mnist`. + +## Export to MNIST + +There are few ways to convert dataset to MNIST format: + +``` bash +# export dataset into MNIST format from existing project +datum export -p -f mnist -o \ + -- --save-images +# converting to MNIST format from other format +datum convert -if imagenet -i \ + -f mnist -o -- --save-images +``` + +Extra options for export to MNIST format: + +- `--save-images` allow to export dataset with saving images +(by default `False`); +- `--image-ext ` allow to specify image extension +for exporting dataset (by default `.png`). 
+ +These commands also work for MNIST in CSV if you use `mnist_csv` instead of `mnist`. + +## Particular use cases + +Datumaro supports filtering, transformation, merging etc. for all formats +and for the MNIST format in particular. Follow [user manual](../user_manual.md) +to get more information about these operations. + +There are few examples of using Datumaro operations to solve +particular problems with MNIST dataset: + +### Example 1. How to create custom MNIST-like dataset + +```python +from datumaro.components.dataset import Dataset +from datumaro.components.extractor import Label, DatasetItem + +dataset = Dataset.from_iterable([ + DatasetItem(id=0, image=np.ones((28, 28)), + annotations=[Label(2)] + ), + DatasetItem(id=1, image=np.ones((28, 28)), + annotations=[Label(7)] + ) +], categories=[str(label) for label in range(10)]) + +dataset.export('./dataset', format='mnist') +``` + +### Example 2. How to filter and convert MNIST dataset to ImageNet + +Convert MNIST dataset to ImageNet format, keep only images with `3` class presented: + +``` bash +# Download MNIST dataset: +# https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz +# https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz +datum convert --input-format mnist --input-path \ + --output-format imagenet \ + --filter '/item[annotation/label="3"]' +``` + +More examples of working with MNIST dataset from code can be found in +[tests_mnist](../../tests/test_mnist_format.py) and [tests_mnist_csv](../../tests/test_mnist_csv_format.py) diff --git a/docs/formats/pascal_voc_user_manual.md b/docs/formats/pascal_voc_user_manual.md new file mode 100644 index 000000000000..0e5db0e9258d --- /dev/null +++ b/docs/formats/pascal_voc_user_manual.md @@ -0,0 +1,317 @@ +# Pascal VOC user manual + +## Contents +- [Format specification](#format-specification) +- [Load Pascal VOC dataset](#load-pascal-voc-dataset) +- [Export to other formats](#export-to-other-formats) +- [Export to 
Pascal VOC](#export-to-pascal-voc) +- [Particular use cases](#particular-use-cases) + +## Format specification + +- Pascal VOC format specification is available +[here](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/devkit_doc.pdf). + +- Original Pascal VOC dataset format supports the following types of annotations: + - `Labels` (for classification tasks); + - `Bounding boxes` (for detection, action detection and person layout tasks); + - `Masks` (for segmentation tasks). + +- Supported attributes: + - `occluded`: indicates that a significant portion of the object within the + bounding box is occluded by another object; + - `truncated`: indicates that the bounding box specified for the object does + not correspond to the full extent of the object; + - `difficult`: indicates that the object is considered difficult to recognize; + - action attributes (`jumping`, `reading`, `phoning` and + [more](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/actionexamples/index.html)). + +## Load Pascal VOC dataset + +The Pascal VOC dataset is available for free download +[here](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html#devkit) + +There are two ways to create Datumaro project and add Pascal VOC dataset to it: + +``` bash +datum import --format voc --input-path +# or +datum create +datum add path -f voc +``` + +It is possible to specify project name and project directory; run +`datum create --help` for more information. +Pascal VOC dataset directory should have the following structure: + + +``` +└─ Dataset/ + ├── label_map.txt # list of non-pascal labels (optional) + ├── Annotations/ + │ ├── ann1.xml # Pascal VOC format annotation file + │ ├── ann2.xml + │ ├── ... + ├── JPEGImages/ + │ ├── img1.jpg + │ ├── img2.jpg + │ ├── ... + ├── SegmentationClass/ # directory with semantic segmentation masks + │ ├── img1.png + │ ├── img2.png + │ ├── ... + ├── SegmentationObject/ # directory with instance segmentation masks + │ ├── img1.png + │ ├── img2.png + │ ├── ...
+ ├── ImageSets/ + │ ├── Main/ # directory with list of images for detection and classification task + │ │ ├── test.txt # list of image names in test subset (without extension) + │ │ ├── train.txt # list of image names in train subset (without extension) + │ │ ├── ... + │ ├── Layout/ # directory with list of images for person layout task + │ │ ├── test.txt + │ │ ├── train.txt + │ │ ├── ... + │ ├── Action/ # directory with list of images for action classification task + │ │ ├── test.txt + │ │ ├── train.txt + │ │ ├── ... + │ ├── Segmentation/ # directory with list of images for segmentation task + │ │ ├── test.txt + │ │ ├── train.txt + │ │ ├── ... +``` + +The `ImageSets` directory should contain at least one of the directories: +`Main`, `Layout`, `Action`, `Segmentation`. +These directories contain `.txt` files +with a list of images in a subset, the subset name is the same as the `.txt` file name. + +In `label_map.txt` you can define custom color map and non-pascal labels, for example: + +``` +# label_map [label : color_rgb : parts : actions] +helicopter::: +elephant:0,124,134:head,ear,foot: +``` +It is also possible to import grayscale (1-channel) PNG masks. +For grayscale masks provide a list of labels with the number of lines +equal to the maximum color index on images. The lines must be in the +right order so that line index is equal to the color index. Lines can +have arbitrary, but different, colors. If there are gaps in the used +color indices in the annotations, they must be filled with arbitrary +dummy labels. Example: + +``` +car:0,128,0:: # color index 0 +aeroplane:10,10,128:: # color index 1 +_dummy2:2,2,2:: # filler for color index 2 +_dummy3:3,3,3:: # filler for color index 3 +boat:108,0,100:: # color index 4 +...
+_dummy198:198,198,198:: # filler for color index 198 +_dummy199:199,199,199:: # filler for color index 199 +the_last_label:12,28,0:: # color index 200 +``` + +You can import dataset for specific tasks +of Pascal VOC dataset instead of the whole dataset, +for example: + +``` bash +datum add path -f voc_detection +``` + +Datumaro supports the following Pascal VOC tasks: +- Image classification (`voc_classification`) +- Object detection (`voc_detection`) +- Action classification (`voc_action`) +- Class and instance segmentation (`voc_segmentation`) +- Person layout detection (`voc_layout`) + +To make sure that the selected dataset has been added to the project, you can run +`datum info`, which will display the project and dataset information. + +## Export to other formats + +Datumaro can convert Pascal VOC dataset into any other format +[Datumaro supports](../user_manual.md#supported-formats). + +Such conversion will only be successful if the output +format can represent the type of dataset you want to convert, +e.g. image classification annotations can be +saved in `ImageNet` format, but not as `COCO keypoints`. + +There are a few ways to convert Pascal VOC dataset to other dataset format: + +``` bash +datum import -f voc -i +datum export -f coco -o +# or +datum convert -if voc -i -f coco -o + +``` + +Some formats provide extra options for conversion. +These options are passed after double dash (`--`) in the command line.
+To get information about them, run + +`datum export -f -- -h` + +## Export to Pascal VOC + +There are a few ways to convert an existing dataset to Pascal VOC format: + +``` bash +# export dataset into Pascal VOC format (classification) from existing project +datum export -p -f voc -o -- --tasks classification + +# converting to Pascal VOC format from other format +datum convert -if imagenet -i \ + -f voc -o \ + -- --label-map voc --save-images +``` + +Extra options for export to Pascal VOC format: + +- `--save-images` allows exporting the dataset with images saved +(by default `False`); + +- `--image-ext IMAGE_EXT` allows specifying the image extension +for the exported dataset (by default use original or `.jpg` if none); + +- `--apply-colormap APPLY_COLORMAP` allows using a colormap for class +and instance masks (by default `True`); + +- `--allow-attributes ALLOW_ATTRIBUTES` allows exporting attributes +(by default `True`); + +- `--tasks TASKS` allows specifying tasks for the exported dataset, +by default Datumaro uses all tasks. Example: + +```bash +datum import -o project -f voc -i ./VOC2012 +datum export -p project -f voc -- --tasks detection,classification +``` + +- `--label-map` allows defining a custom colormap. Example: + +``` bash +# mycolormap.txt [label : color_rgb : parts : actions]: +# cat:0,0,255:: +# person:255,0,0:head: +datum export -f voc_segmentation -- --label-map mycolormap.txt + +# or you can use original voc colormap: +datum export -f voc_segmentation -- --label-map voc +``` + +## Particular use cases + +Datumaro supports filtering, transformation, merging etc. for all formats +and for the Pascal VOC format in particular. Follow +[user manual](../user_manual.md) +to get more information about these operations. + +There are a few examples of using Datumaro operations to solve +particular problems with Pascal VOC dataset: + +### Example 1. How to prepare an original dataset for training.
+In this example, preparing the original dataset to train the semantic segmentation model includes: +loading, +checking duplicate images, +setting the number of images, +splitting into subsets, +exporting the result to Pascal VOC format. + +```bash +datum create -o project +datum add path -p project -f voc_segmentation ./VOC2012/ImageSets/Segmentation/trainval.txt +datum stats -p project # check statistics.json -> repeated images +datum transform -p project -o ndr_project -t ndr -- -w trainval -k 2500 +datum filter -p ndr_project -o trainval2500 -e '/item[subset="trainval"]' +datum transform -p trainval2500 -o final_project -t random_split -- -s train:.8 -s val:.2 +datum export -p final_project -o dataset -f voc -- --label-map voc --save-images +``` + +### Example 2. How to create custom dataset + +```python +from datumaro.components.dataset import Dataset +from datumaro.util.image import Image +from datumaro.components.extractor import Bbox, Polygon, Label, DatasetItem + +dataset = Dataset.from_iterable([ + DatasetItem(id='image1', image=Image(path='image1.jpg', size=(10, 20)), + annotations=[Label(3), + Bbox(1.0, 1.0, 10.0, 8.0, label=0, attributes={'difficult': True, 'running': True}), + Polygon([1, 2, 3, 2, 4, 4], label=2, attributes={'occluded': True}), + Polygon([6, 7, 8, 8, 9, 7, 9, 6], label=2), + ] + ), +], categories=['person', 'sky', 'water', 'lion']) + +dataset.transform('polygons_to_masks') +dataset.export('./mydataset', format='voc', label_map='my_labelmap.txt') + +""" +my_labelmap.txt: +# label:color_rgb:parts:actions +person:0,0,255:hand,foot:jumping,running +sky:128,0,0:: +water:0,128,0:: +lion:255,128,0:: +""" +``` + +### Example 3.
Load, filter and convert from code +Load Pascal VOC dataset, and export train subset with items +which have `jumping` attribute: + +```python +from datumaro.components.dataset import Dataset + +dataset = Dataset.import_from('./VOC2012', format='voc') + +train_dataset = dataset.get_subset('train').as_dataset() + +def only_jumping(item): + for ann in item.annotations: + if ann.attributes.get('jumping'): + return True + return False + +train_dataset.select(only_jumping) + +train_dataset.export('./jumping_label_me', format='label_me', save_images=True) +``` + +### Example 4. Get information about items in Pascal VOC 2012 dataset for segmentation task: + +```python +from datumaro.components.dataset import Dataset +from datumaro.components.extractor import AnnotationType + +dataset = Dataset.import_from('./VOC2012', format='voc') + +def has_mask(item): + for ann in item.annotations: + if ann.type == AnnotationType.mask: + return True + return False + +dataset.select(has_mask) + +print("Pascal VOC 2012 has %s images for segmentation task:" % len(dataset)) +for subset_name, subset in dataset.subsets().items(): + for item in subset: + print(item.id, subset_name, end=";") +``` + +After executing this code, we can see that there are 5826 images +in Pascal VOC 2012 for the segmentation task, and this result is the same as in the +[official documentation](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/dbstats.html) + +More examples of working with Pascal VOC dataset from code can be found in +[tests](../../tests/test_voc_format.py) diff --git a/docs/formats/yolo_user_manual.md b/docs/formats/yolo_user_manual.md new file mode 100644 index 000000000000..266547945f56 --- /dev/null +++ b/docs/formats/yolo_user_manual.md @@ -0,0 +1,210 @@ +# YOLO user manual + +## Contents +- [Format specification](#format-specification) +- [Load YOLO dataset](#load-yolo-dataset) +- [Export to other formats](#export-to-other-formats) +- [Export to YOLO format](#export-to-yolo-format) +- [Particular use
cases](#particular-use-cases) + +## Format specification + +- The YOLO dataset format is for training and validating object detection models. +Specification for this format available +[here](https://github.com/AlexeyAB/darknet#how-to-train-to-detect-your-custom-objects). +And also you can find some official examples on working with YOLO dataset +[here](https://pjreddie.com/darknet/yolo/); + +- The YOLO dataset format support the following types of annotations: + - `Bounding boxes` + +- YOLO format doesn't support attributes for annotations; + +- The format only supports subsets named `train` or `valid`. + +## Load YOLO dataset + +Few ways to create Datumaro project and add YOLO dataset to it: + +```bash +datum import -o project -f yolo -i + +# another way to do the same: +datum create -o project +datum add path -p project -f yolo -i + +# and you can add another one yolo dataset: +datum add path -p project -f yolo -i +``` + +YOLO dataset directory should have the following structure: + + +``` +└─ yolo_dataset/ + │ + ├── obj.names # file with list of classes + ├── obj.data # file with dataset information + ├── train.txt # list of image paths in train subset + ├── valid.txt # list of image paths in valid subset + │ + ├── obj_train_data/ # directory with annotations and images for train subset + │ ├── image1.txt # list of labeled bounding boxes for image1 + │ ├── image1.jpg + │ ├── image2.txt + │ ├── image2.jpg + │ ├── ... + │ + ├── obj_valid_data/ # directory with annotations and images for valid subset + │ ├── image101.txt + │ ├── image101.jpg + │ ├── image102.txt + │ ├── image102.jpg + │ ├── ... +``` +> YOLO dataset cannot contain a subset with a name other than `train` or `valid`. +If imported dataset contains such subsets, they will be ignored. +If you are exporting a project into yolo format, +all subsets different from `train` and `valid` will be skipped. +If there is no subset separation in a project, the data +will be saved in `train` subset. 
+ +- `obj.data` should have the following content, it is not necessary to have both +subsets, but necessary to have one of them: +``` +classes = 5 # optional +names = +train = +valid = +backup = backup/ # optional +``` +- `obj.names` contains the list of classes. +The line number for the class is the same as its index: +``` +label1 # label1 has index 0 +label2 # label2 has index 1 +label3 # label3 has index 2 +... +``` +- Files `train.txt` and `valid.txt` should have the following structure: +``` + + +... +``` +- Files in directories `obj_train_data/` and `obj_valid_data/` +should contain information about labeled bounding boxes +for images: +``` +# image1.txt: +# +0 0.250000 0.400000 0.300000 0.400000 +3 0.600000 0.400000 0.400000 0.266667 +``` +Here `x`, `y`, `width`, and `height` are relative to the image's width and height. + +## Export to other formats + +Datumaro can convert YOLO dataset into any other format +[Datumaro supports](../user_manual.md#supported-formats). +For successful conversion the output format should support +object detection task (e.g. Pascal VOC, COCO, TF Detection API etc.) + +Examples: +```bash +datum import -o project -f yolo -i +datum export -p project -f voc -o +``` + +```bash +datum convert -if yolo -i \ + -f coco_instances -o +``` + +## Export to YOLO format + +Datumaro can convert an existing dataset to YOLO format, +if the dataset supports object detection task. + +Example: + +``` +datum import -o project -f coco_instances -i +datum export -p project -f yolo -o -- --save-images +``` + +Extra options for export to YOLO format: + +- `--save-images` allows exporting the dataset with images saved +(default: `False`); +- `--image-ext ` allows specifying the image extension +for the exported dataset (default: use original or `.jpg`, if none). + +## Particular use cases + +### How to prepare PASCAL VOC dataset for exporting to YOLO format dataset?
+ +```bash +datum import -o project -f voc -i ./VOC2012 +datum filter -p project -e '/item[subset="train" or subset="val"]' -o trainval_voc +datum transform -p trainval_voc -o trainvalid_voc \ + -t map_subsets -- -s train:train -s val:valid +datum export -p trainvalid_voc -f yolo -o ./yolo_dataset -- --save-images +``` + +### How to remove some class from YOLO dataset? +Delete all items, which contain `cat` objects and remove +`cat` from list of classes: +```bash +datum import -o project -f yolo -i ./yolo_dataset +datum filter -p project -o filtered -m i+a -e '/item/annotation[label!="cat"]' +datum transform -p filtered -o without_cat -t remap_labels -- -l cat: +datum export -p without_cat -f yolo -o ./yolo_without_cats +``` + +### How to create custom dataset in YOLO format? +```python +import numpy as np +from datumaro.components.dataset import Dataset +from datumaro.components.extractor import Bbox, DatasetItem + +dataset = Dataset.from_iterable([ + DatasetItem(id='image_001', subset='train', + image=np.ones((20, 20, 3)), + annotations=[ + Bbox(3.0, 1.0, 8.0, 5.0, label=1), + Bbox(1.0, 1.0, 10.0, 1.0, label=2) + ] + ), + DatasetItem(id='image_002', subset='train', + image=np.ones((15, 10, 3)), + annotations=[ + Bbox(4.0, 4.0, 4.0, 4.0, label=3) + ] + ) +], categories=['house', 'bridge', 'crosswalk', 'traffic_light']) + +dataset.export('../yolo_dataset', format='yolo', save_images=True) +``` + +### How to get information about objects on each images? 
+ +If you only want information about label names for each +images, then you can get it from code: +```python +from datumaro.components.dataset import Dataset +from datumaro.components.extractor import AnnotationType + +dataset = Dataset.import_from('./yolo_dataset', format='yolo') +cats = dataset.categories()[AnnotationType.label] + +for item in dataset: + for ann in item.annotations: + print(item.id, cats[ann.label].name) +``` + +And If you want complete information about each items you can run: +```bash +datum import -o project -f yolo -i ./yolo_dataset +datum filter -p project --dry-run -e '/item' +``` \ No newline at end of file diff --git a/docs/user_manual.md b/docs/user_manual.md index 06585d36fdb6..db7c28f9d590 100644 --- a/docs/user_manual.md +++ b/docs/user_manual.md @@ -20,6 +20,7 @@ - [Compare projects](#compare-projects) - [Obtaining project info](#get-project-info) - [Obtaining project statistics](#get-project-statistics) + - [Validate project annotations](#validate-project-annotations) - [Register model](#register-model) - [Run inference](#run-model) - [Run inference explanation](#explain-inference) @@ -84,16 +85,19 @@ import datumaro ## Supported Formats List of supported formats: -- MS COCO (`image_info`, `instances`, `person_keypoints`, `captions`, `labels`*) +- MS COCO (`image_info`, `instances`, `person_keypoints`, `captions`, `labels`, `panoptic`, `stuff`) - [Format specification](http://cocodataset.org/#format-data) - [Dataset example](../tests/assets/coco_dataset) - `labels` are our extension - like `instances` with only `category_id` + - [Format documentation](./formats/coco_user_manual.md) - PASCAL VOC (`classification`, `detection`, `segmentation` (class, instances), `action_classification`, `person_layout`) - [Format specification](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/htmldoc/index.html) - [Dataset example](../tests/assets/voc_dataset) + - [Format documentation](./formats/pascal_voc_user_manual.md) - YOLO (`bboxes`) - [Format 
specification](https://github.com/AlexeyAB/darknet#how-to-train-pascal-voc-data) - [Dataset example](../tests/assets/yolo_dataset) + - [Format documentation](./formats/yolo_user_manual.md) - TF Detection API (`bboxes`, `masks`) - Format specifications: [bboxes](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/using_your_own_dataset.md), [masks](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/instance_segmentation.md) - [Dataset example](../tests/assets/tf_detection_api_dataset) @@ -113,9 +117,24 @@ List of supported formats: - [Dataset example](../tests/assets/imagenet_dataset) - [Dataset example (txt for classification)](../tests/assets/imagenet_txt_dataset) - Detection format is the same as in PASCAL VOC +- CIFAR-10/100 (`classification` (python version)) + - [Format specification](https://www.cs.toronto.edu/~kriz/cifar.html) + - [Dataset example](../tests/assets/cifar_dataset) +- MNIST (`classification`) + - [Format specification](http://yann.lecun.com/exdb/mnist/) + - [Dataset example](../tests/assets/mnist_dataset) + - [Format documentation](./formats/mnist_user_manual.md) +- MNIST in CSV (`classification`) + - [Format specification](https://pjreddie.com/projects/mnist-in-csv/) + - [Dataset example](../tests/assets/mnist_csv_dataset) + - [Format documentation](./formats/mnist_user_manual.md) - CamVid (`segmentation`) - [Format specification](http://mi.eng.cam.ac.uk/research/projects/VideoRec/CamVid/) - [Dataset example](../tests/assets/camvid_dataset) +- Cityscapes (`segmentation`) + - [Format specification](https://www.cityscapes-dataset.com/dataset-overview/) + - [Dataset example](../tests/assets/cityscapes_dataset) + - [Format documentation](./formats/cityscapes_user_manual.md) - CVAT - [Format specification](https://github.com/opencv/cvat/blob/develop/cvat/apps/documentation/xml_format.md) - [Dataset example](../tests/assets/cvat_dataset) @@ -128,7 +147,7 @@ List of supported formats: - 
Market-1501 (`person re-identification`) - [Format specification](https://www.aitribune.com/dataset/2018051063) - [Dataset example](../tests/assets/market1501_dataset) -- LFW (`person re-identification`, `landmarks`) +- LFW (`classification`, `person re-identification`, `landmarks`) - [Format specification](http://vis-www.cs.umass.edu/lfw/) - [Dataset example](../tests/assets/lfw_dataset) @@ -875,6 +894,180 @@ datum stats -p test_project + +### Validate project annotations + +This command inspects annotations with respect to the task type +and stores the result in JSON file. + +The task types supported are `classification`, `detection`, and `segmentation`. + +The validation result contains +- annotation statistics based on the task type +- validation reports, such as + - items not having annotations + - items having undefined annotations + - imbalanced distribution in class/attributes + - too small or large values +- summary + +Usage: + +``` bash +datum validate --help + +datum validate -p +``` + +Here is the list of validation items(a.k.a. anomaly types). + +| Anomaly Type | Description | Task Type | +| ------------ | ----------- | --------- | +| MissingLabelCategories | Metadata (ex. 
LabelCategories) should be defined | common | +| MissingAnnotation | No annotation found for an Item | common | +| MissingAttribute | An attribute key is missing for an Item | common | +| MultiLabelAnnotations | Item needs a single label | classification | +| UndefinedLabel | A label not defined in the metadata is found for an item | common | +| UndefinedAttribute | An attribute not defined in the metadata is found for an item | common | +| LabelDefinedButNotFound | A label is defined, but not found actually | common | +| AttributeDefinedButNotFound | An attribute is defined, but not found actually | common | +| OnlyOneLabel | The dataset consists of only one label | common | +| OnlyOneAttributeValue | The dataset consists of only one attribute value | common | +| FewSamplesInLabel | The number of samples in a label might be too low | common | +| FewSamplesInAttribute | The number of samples in an attribute might be too low | common | +| ImbalancedLabels | There is an imbalance in the label distribution | common | +| ImbalancedAttribute | There is an imbalance in the attribute distribution | common | +| ImbalancedDistInLabel | Values (ex. bbox width) are not evenly distributed for a label | detection, segmentation | +| ImbalancedDistInAttribute | Values (ex. bbox width) are not evenly distributed for an attribute | detection, segmentation | +| NegativeLength | The width or height of bounding box is negative | detection | +| InvalidValue | There's invalid (ex. inf, nan) value for bounding box info. | detection | +| FarFromLabelMean | An annotation has a value that is too small or too large compared to the average for a label | detection, segmentation | +| FarFromAttrMean | An annotation has a value that is too small or too large compared to the average for an attribute | detection, segmentation | + + +Validation Result Format: + +
+ +``` bash +{ + 'statistics': { + ## common statistics + 'label_distribution': { + 'defined_labels': , # : + 'undefined_labels': + # : { + # 'count': , + # 'items_with_undefined_label': [, ] + # } + }, + 'attribute_distribution': { + 'defined_attributes': , + # : { + # : { + # 'distribution': {: , }, + # 'items_missing_attribute': [, ] + # } + # } + 'undefined_attributes': + # : { + # : { + # 'distribution': {: , }, + # 'items_with_undefined_attr': [, ] + # } + # } + }, + 'total_ann_count': , + 'items_missing_annotation': , # [, ] + + ## statistics for classification task + 'items_with_multiple_labels': , # [, ] + + ## statistics for detection task + 'items_with_invalid_value': , + # '': {: [ , ], } + # - properties: 'x', 'y', 'width', 'height', + # 'area(wxh)', 'ratio(w/h)', 'short', 'long' + # - 'short' is min(w,h) and 'long' is max(w,h). + 'items_with_negative_length': , + # '': { : { <'width'|'height'>: , }, } + 'bbox_distribution_in_label': , # : + 'bbox_distribution_in_attribute': , + # : {: { : , }, } + 'bbox_distribution_in_dataset_item': , + # '': + + ## statistics for segmentation task + 'items_with_invalid_value': , + # '': {: [ , ], } + # - properties: 'area', 'width', 'height' + 'mask_distribution_in_label': , # : + 'mask_distribution_in_attribute': , + # : { + # : { : , } + # } + 'mask_distribution_in_dataset_item': , + # '': + }, + 'validation_reports': , # [ , ] + # validation_error_format = { + # 'anomaly_type': , + # 'description': , + # 'severity': , # 'warning' or 'error' + # 'item_id': , # optional, when it is related to a DatasetItem + # 'subset': , # optional, when it is related to a DatasetItem + # } + 'summary': { + 'errors': , + 'warnings': + } +} + +``` + +`item_key` is defined as, +``` python +item_key = (, ) +``` + +`bbox_template` and `mask_template` are defined as, + +``` python +bbox_template = { + 'width': , + 'height': , + 'area(wxh)': , + 'ratio(w/h)': , + 'short': , # short = min(w, h) + 'long': # long = max(w, h) +} 
+mask_template = { + 'area': , + 'width': , + 'height': +} +``` + +`numerical_stat_template` is defined as, + +``` python +numerical_stat_template = { + 'items_far_from_mean': , + # {'': {: , }, } + 'mean': , + 'stdev': , + 'min': , + 'max': , + 'median': , + 'histogram': { + 'bins': , # [, ] + 'counts': , # [, ] + } +} +``` + +
+ ### Register model Supported models: @@ -900,7 +1093,8 @@ datum model add \ ``` Interpretation script for an OpenVINO detection model (`convert.py`): -You can find OpenVINO™ model interpreter samples in datumaro/plugins/openvino/samples. [Instruction](datumaro/plugins/openvino/README.md) +You can find OpenVINO model interpreter samples in +`datumaro/plugins/openvino/samples` ([instruction](datumaro/plugins/openvino/README.md)). ``` python from datumaro.components.extractor import * @@ -989,6 +1183,25 @@ datum diff inference -o diff ### Explain inference +Runs an explainable AI algorithm for a model. + +This tool is supposed to help an AI developer to debug a model and a dataset. +Basically, it executes inference and tries to find problems in the trained +model - determine decision boundaries and belief intervals for the classifier. + +Currently, the only available algorithm is RISE ([article](https://arxiv.org/pdf/1806.07421.pdf)), +which runs inference and then re-runs a model multiple times on each +image to produce a heatmap of activations for each output of the +first inference. As a result, we obtain a few heatmaps, which +show how image pixels affected the inference result. This algorithm doesn't +require any special information about the model, but it requires the model to +return all the outputs and confidences. The algorithm only supports +classification and detection models. + +The following use cases are available: +- RISE for classification +- RISE for object detection + Usage: ``` bash @@ -1007,11 +1220,70 @@ Example: run inference explanation on a single image with visualization ``` bash datum create <...> datum model add mymodel <...> -datum explain \ - -m mymodel \ - -t 'image.png' \ - rise \ - -s 1000 --progressive +datum explain -t image.png -m mymodel \ + rise --max-samples 1000 --progressive +``` + +> Note: this algorithm requires the model to return +> *all* (or a _reasonable_ amount) the outputs and confidences unfiltered, +> i.e.
all the `Label` annotations for classification models and +> all the `Bbox`es for detection models. +> You can find examples of the expected model outputs in [`tests/test_RISE.py`](../tests/test_RISE.py) + +For OpenVINO models the output processing script would look like this: + +Classification scenario: + +``` python +from datumaro.components.extractor import * +from datumaro.util.annotation_util import softmax + +def process_outputs(inputs, outputs): + # inputs = model input, array or images, shape = (N, C, H, W) + # outputs = model output, logits, shape = (N, n_classes) + # results = conversion result, [ [ Annotation, ... ], ... ] + results = [] + for input, output in zip(inputs, outputs): + input_height, input_width = input.shape[:2] + confs = softmax(output[0]) + for label, conf in enumerate(confs): + results.append(Label(int(label), attributes={'score': float(conf)})) + + return results +``` + + +Object Detection scenario: + +``` python +from datumaro.components.extractor import * + +# return a significant number of output boxes to make multiple runs +# statistically correct and meaningful +max_det = 1000 + +def process_outputs(inputs, outputs): + # inputs = model input, array or images, shape = (N, C, H, W) + # outputs = model output, shape = (N, 1, K, 7) + # results = conversion result, [ [ Annotation, ... ], ...
] + results = [] + for input, output in zip(inputs, outputs): + input_height, input_width = input.shape[:2] + detections = output[0] + image_results = [] + for i, det in enumerate(detections): + label = int(det[1]) + conf = float(det[2]) + x = max(int(det[3] * input_width), 0) + y = max(int(det[4] * input_height), 0) + w = min(int(det[5] * input_width - x), input_width) + h = min(int(det[6] * input_height - y), input_height) + image_results.append(Bbox(x, y, w, h, + label=label, attributes={'score': conf} )) + + results.append(image_results[:max_det]) + + return results ``` ### Transform Project @@ -1034,18 +1306,22 @@ Example: split a dataset randomly to `train` and `test` subsets, ratio is 2:1 datum transform -t random_split -- --subset train:.67 --subset test:.33 ``` -Example: split a dataset in task-specific manner. Supported tasks are -classification, detection, and re-identification. +Example: split a dataset in task-specific manner. The tasks supported are +classification, detection, segmentation and re-identification. 
``` bash -datum transform -t classification_split -- \ - --subset train:.5 --subset val:.2 --subset test:.3 +datum transform -t split -- \ + -t classification --subset train:.5 --subset val:.2 --subset test:.3 + +datum transform -t split -- \ + -t detection --subset train:.5 --subset val:.2 --subset test:.3 -datum transform -t detection_split -- \ - --subset train:.5 --subset val:.2 --subset test:.3 +datum transform -t split -- \ + -t segmentation --subset train:.5 --subset val:.2 --subset test:.3 -datum transform -t reidentification_split -- \ - --subset train:.5 --subset val:.2 --subset test:.3 --query .5 +datum transform -t split -- \ + -t reid --subset train:.5 --subset val:.2 --subset test:.3 \ + --query .5 ``` Example: convert polygons to masks, masks to boxes etc.: @@ -1074,9 +1350,7 @@ datum transform -t rename -- -e '|pattern|replacement|' datum transform -t rename -- -e '|frame_(\d+)|\\1|' ``` -Example: Sampling dataset items, subset `train` is divided into `sampled`(sampled_subset) and `unsampled` -- `train` has 100 data, and 20 samples are selected. There are `sampled`(20 samples) and 80 `unsampled`(80 datas) subsets. -- Remove `train` subset (if sample_name=`train` or unsample_name=`train`, still remain) +Example: sampling dataset items as many as the number of target samples with sampling method entered by the user, divide into `sampled` and `unsampled` subsets - There are five methods of sampling the m option. 
- `topk`: Return the k with high uncertainty data - `lowk`: Return the k with low uncertainty data @@ -1087,14 +1361,14 @@ Example: Sampling dataset items, subset `train` is divided into `sampled`(sample ``` bash datum transform -t sampler -- \ -a entropy \ - -subset_name train \ - -sample_name sampled \ - -unsample_name unsampled \ + -i train \ + -o sampled \ + -u unsampled \ -m topk \ -k 20 ``` -Example : Control number of outputs to 100 after NDR +Example : control number of outputs to 100 after NDR - There are two methods in NDR e option - `random`: sample from removed data randomly - `similarity`: sample from removed data with ascending @@ -1153,7 +1427,7 @@ pip install 'git+https://github.com/openvinotoolkit/open_model_zoo.git#subdirect #### OpenVINO™ This plugin provides support for model inference with [OpenVINO™](https://01.org/openvinotoolkit). -The plugin depends on the OpenVINO™ Tookit, which can be installed by +The plugin depends on the OpenVINO™ Toolkit, which can be installed by following [these instructions](https://docs.openvinotoolkit.org/latest/index.html#packaging_and_deployment) ### Dataset Formats diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 000000000000..976c84825b47 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +python_classes = +python_functions = diff --git a/requirements.txt b/requirements.txt index 5cfc7dd4f248..b5490d77d2d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,8 +6,9 @@ lxml>=4.4.1 matplotlib>=3.3.1 opencv-python-headless>=4.1.0.25 Pillow>=6.1.0 -pycocotools>=2.0.0 +pycocotools>=2.0.0 --no-binary=pycocotools # https://github.com/openvinotoolkit/datumaro/issues/253 PyYAML>=5.3.1 scikit-image>=0.15.0 tensorboardX>=1.8 pandas>=1.1.5 +pytest>=5.3.5 \ No newline at end of file diff --git a/setup.py b/setup.py index d1e5ff0152dc..68591d9242be 100644 --- a/setup.py +++ b/setup.py @@ -42,8 +42,17 @@ def get_requirements(): 'matplotlib', 'numpy>=1.17.3', 'Pillow', - 'pycocotools; 
platform_system != "Windows"', + + # Avoid 2.0.2 Linux binary distribution because of + # a conflict in numpy versions with TensorFlow: + # - TF is compiled with numpy 1.19 ABI + # - pycocotools is compiled with numpy 1.20 ABI + # Using a previous version allows to force package rebuilding. + # + # https://github.com/openvinotoolkit/datumaro/issues/253 + 'pycocotools!=2.0.2; platform_system != "Windows"', 'pycocotools-windows; platform_system == "Windows"', + 'PyYAML', 'scikit-image', 'tensorboardX', diff --git a/tests/assets/cifar_dataset/batches.meta b/tests/assets/cifar_dataset/batches.meta new file mode 100644 index 0000000000000000000000000000000000000000..2021e05c912bc5558443209c27a82265af0fc307 GIT binary patch literal 70 zcmZo*nQF}d0ku04UBG5&!@I literal 0 HcmV?d00001 diff --git a/tests/assets/cifar_dataset/data_batch_1 b/tests/assets/cifar_dataset/data_batch_1 new file mode 100644 index 0000000000000000000000000000000000000000..e4ed1edc0b8f4b129ff28008e584bf6f04e1a70b GIT binary patch literal 3317 zcmeI#zfQw25C(7?S`aiVd&W+YGVlVh5s`r%MVG`)EhSFm+CwCeSol-vwjM9ZOMvS> z2KNmf?9Z0`+wN`g{ga58S6buMM)%6Vpmy|3xY7f3s>-v;bOd& zF@BiltQoD!2H!gk*1}AfR8}>k**WVQM+d@XED~Ot_Eb59XWr>v*7QkFv@T>JA4T#( z+iP65Zl2Wf)|t;~|8**yqkN&VEFXSD;!+4l_5MzTA4m=humB6N01L1H3$OqSu)x0& hpfCyGss`6=t2n2G(ZpHMMw>2;+i9MToAj1^_6MrCYlQ#+ literal 0 HcmV?d00001 diff --git a/tests/assets/cifar_dataset/test_batch b/tests/assets/cifar_dataset/test_batch new file mode 100644 index 0000000000000000000000000000000000000000..e3776023189c3f89998c42f2d7ff9c0aa6e7a34d GIT binary patch literal 9494 zcmeI&Jxjzu5C-7QeJGNY+NV&sDiHM-XlF2my@XA&mkT7DkZg*8f{h;s)7|)cz1$ua z{siHjV&NTTws`7!TYS4$!qzLV=xkk9O3PaH{5(astYx8+>!8&I|E8Ni>DDGiCzVW9 zW!EAnnnyE5i(DGXbfJg39fPdtR8S9aaB%Lf(EI2j=9wa2&@EKU3K!5{A?5owyXjK|ufl z5P$##AOHafKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=|5kvfOCWMn bX|hdXkCm9tjp|LQ3*UDA&~Mg$$V0FP+xLys literal 0 HcmV?d00001 diff --git 
a/tests/assets/cityscapes_dataset/gtFine/test/defaultcity/defaultcity_000001_000031_gtFine_instanceIds.png b/tests/assets/cityscapes_dataset/gtFine/test/defaultcity/defaultcity_000001_000031_gtFine_instanceIds.png new file mode 100644 index 0000000000000000000000000000000000000000..9a2cb23ffa86206a2ab160a7130f677f754e7ec7 GIT binary patch literal 76 zcmeAS@N?(olHy`uVBq!ia0vp^tU$~t03;ZaS)bFS$tekG2`6-t6H*cw Y7+8KVJQc0h?*u9HboFyt=akR{04LoMdjJ3c literal 0 HcmV?d00001 diff --git a/tests/assets/cityscapes_dataset/gtFine/test/defaultcity/defaultcity_000001_000032_gtFine_instanceIds.png b/tests/assets/cityscapes_dataset/gtFine/test/defaultcity/defaultcity_000001_000032_gtFine_instanceIds.png new file mode 100644 index 0000000000000000000000000000000000000000..56c008eac13067d18945da44505d2462602153bc GIT binary patch literal 76 zcmeAS@N?(olHy`uVBq!ia0vp^tU$~t03;ZaS)bFS$!n&hBxL+Q^I>iy YBZELaGi#|D%K?xwPgg&ebxsLQ08Wk)i2wiq literal 0 HcmV?d00001 diff --git a/tests/assets/cityscapes_dataset/gtFine/train/defaultcity/defaultcity_000002_000045_gtFine_instanceIds.png b/tests/assets/cityscapes_dataset/gtFine/train/defaultcity/defaultcity_000002_000045_gtFine_instanceIds.png new file mode 100644 index 0000000000000000000000000000000000000000..e658ec33cda661d28014cdf6891a175b988e7cff GIT binary patch literal 76 zcmeAS@N?(olHy`uVBq!ia0vp^tU$~t03;ZaS)bFS$q5N*34OLF4xBl_ Y!@y?6sC^-1(E^Y%Pgg&ebxsLQ06=^akN^Mx literal 0 HcmV?d00001 diff --git a/tests/assets/cityscapes_dataset/gtFine/val/defaultcity/defaultcity_000001_000019_gtFine_instanceIds.png b/tests/assets/cityscapes_dataset/gtFine/val/defaultcity/defaultcity_000001_000019_gtFine_instanceIds.png new file mode 100644 index 0000000000000000000000000000000000000000..d2ccdd1f1a7ced01bd18144787ccba00b5db98a2 GIT binary patch literal 76 zcmeAS@N?(olHy`uVBq!ia0vp^tU$~t03;ZaS)bFS$q5N*i3y27GJ%bO WA&WsMXzu$7AZ4DeelF{r5}E)R;}Hu0 literal 0 HcmV?d00001 diff --git 
a/tests/assets/cityscapes_dataset/imgsFine/leftImg8bit/test/defaultcity/defaultcity_000001_000031_leftImg8bit.png b/tests/assets/cityscapes_dataset/imgsFine/leftImg8bit/test/defaultcity/defaultcity_000001_000031_leftImg8bit.png new file mode 100644 index 0000000000000000000000000000000000000000..528f10546704be6b339cfe1f577ca4b10ef4f472 GIT binary patch literal 70 zcmeAS@N?(olHy`uVBq!ia0vp^tU%1j!2~2{&iT9qEaBo9!XcZ?!o;QmFVdQ&MBb@0GX=|x&QzG literal 0 HcmV?d00001 diff --git a/tests/assets/cityscapes_dataset/imgsFine/leftImg8bit/test/defaultcity/defaultcity_000001_000032_leftImg8bit.png b/tests/assets/cityscapes_dataset/imgsFine/leftImg8bit/test/defaultcity/defaultcity_000001_000032_leftImg8bit.png new file mode 100644 index 0000000000000000000000000000000000000000..528f10546704be6b339cfe1f577ca4b10ef4f472 GIT binary patch literal 70 zcmeAS@N?(olHy`uVBq!ia0vp^tU%1j!2~2{&iT9qEaBo9!XcZ?!o;QmFVdQ&MBb@0GX=|x&QzG literal 0 HcmV?d00001 diff --git a/tests/assets/cityscapes_dataset/imgsFine/leftImg8bit/train/defaultcity/defaultcity_000002_000045_leftImg8bit.png b/tests/assets/cityscapes_dataset/imgsFine/leftImg8bit/train/defaultcity/defaultcity_000002_000045_leftImg8bit.png new file mode 100644 index 0000000000000000000000000000000000000000..528f10546704be6b339cfe1f577ca4b10ef4f472 GIT binary patch literal 70 zcmeAS@N?(olHy`uVBq!ia0vp^tU%1j!2~2{&iT9qEaBo9!XcZ?!o;QmFVdQ&MBb@0GX=|x&QzG literal 0 HcmV?d00001 diff --git a/tests/assets/cityscapes_dataset/imgsFine/leftImg8bit/val/defaultcity/defaultcity_000001_000019_leftImg8bit.png b/tests/assets/cityscapes_dataset/imgsFine/leftImg8bit/val/defaultcity/defaultcity_000001_000019_leftImg8bit.png new file mode 100644 index 0000000000000000000000000000000000000000..528f10546704be6b339cfe1f577ca4b10ef4f472 GIT binary patch literal 70 zcmeAS@N?(olHy`uVBq!ia0vp^tU%1j!2~2{&iT9qEaBo9!XcZ?!o;QmFVdQ&MBb@0GX=|x&QzG literal 0 HcmV?d00001 diff --git a/tests/assets/coco_dataset/coco_panoptic/annotations/panoptic_val.json 
b/tests/assets/coco_dataset/coco_panoptic/annotations/panoptic_val.json new file mode 100644 index 000000000000..c945de7ca5ff --- /dev/null +++ b/tests/assets/coco_dataset/coco_panoptic/annotations/panoptic_val.json @@ -0,0 +1,75 @@ +{ + "licenses": [{ + "name": "", + "id": 0, + "url": "" + }], + "info": { + "contributor": "", + "date_created": "", + "description": "", + "url": "", + "version": "", + "year": "" + }, + "categories": [ + { + "id": 1, + "name": "a", + "supercategory": "", + "isthing": 1 + }, + { + "id": 2, + "name": "b", + "supercategory": "", + "isthing": 1 + }, + { + "id": 3, + "name": "c", + "supercategory": "", + "isthing": 1 + }, + { + "id": 4, + "name": "d", + "supercategory": "", + "isthing": 1 + } + ], + "images": [ + { + "id": 40, + "width": 5, + "height": 1, + "file_name": "000000000001.jpg", + "license": 0, + "flickr_url": "", + "coco_url": "", + "date_captured": 0 + } + ], + "annotations": [ + { + "image_id": 40, + "file_name": "000000000001.png", + "segments_info": [ + { + "id": 7, + "category_id": 4, + "area": 2.0, + "bbox": [2.0, 0.0, 1.0, 0.0], + "iscrowd": 0 + }, + { + "id": 20, + "category_id": 2, + "area": 2.0, + "bbox": [1.0, 0.0, 3.0, 0.0], + "iscrowd": 1 + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/assets/coco_dataset/coco_panoptic/annotations/panoptic_val/000000000001.png b/tests/assets/coco_dataset/coco_panoptic/annotations/panoptic_val/000000000001.png new file mode 100644 index 0000000000000000000000000000000000000000..e471bfed416252e6619cfb903be67ce3e1104417 GIT binary patch literal 78 zcmeAS@N?(olHy`uVBq!ia0vp^tU%1j!2~2{&iT9qNQrv7IEHY@CZ{AM2qY%_IK#lS bhlhuO^AMw0yP>lwPyvIdtDnm{r-UW|NK_FF literal 0 HcmV?d00001 diff --git a/tests/assets/coco_dataset/coco_panoptic/images/val/000000000001.jpg b/tests/assets/coco_dataset/coco_panoptic/images/val/000000000001.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a082a80324c398d11403c8aba2946f58746be4ea GIT binary patch literal 631 
zcmex=^(PF6}rMnOeST|r4lSw=>~TvNxu(8R<5A1R0qH8UG()kO#Vxl@SaWpn#EynT3^&or9B$8>nEb z00R>vGcywlGb<|#3s7|}P@aKBkX1<0(2-3zFp*uUP{gQl;zAB(r;P_igD!qhF-|IK z;^Yz&myncFRa4i{)G{$OGqmaka3YSZQ|TeofBv2)j3M&~ka)>xhT)6Qdr?PR-2hpUWi(FzVCJ$9Vg1iRy l8F3zKBFkrRk0JbZi-Cuk5g2*Qf(-TyAGkCYHQ4{Z2>|I(&5!^9 literal 0 HcmV?d00001 diff --git a/tests/assets/coco_dataset/coco_stuff/annotations/stuff_val.json b/tests/assets/coco_dataset/coco_stuff/annotations/stuff_val.json new file mode 100644 index 000000000000..51a654f9d11c --- /dev/null +++ b/tests/assets/coco_dataset/coco_stuff/annotations/stuff_val.json @@ -0,0 +1,50 @@ +{ + "licenses": [ + { + "name": "", + "id": 0, + "url": "" + } + ], + "info": { + "contributor": "", + "date_created": "", + "description": "", + "url": "", + "version": "", + "year": "" + }, + "categories": [ + { + "id": 1, + "name": "TEST", + "supercategory": "" + } + ], + "images": [ + { + "id": 1, + "width": 5, + "height": 10, + "file_name": "000000000001.jpg", + "license": 0, + "flickr_url": "", + "coco_url": "", + "date_captured": 0 + } + ], + "annotations": [ + { + "id": 2, + "image_id": 1, + "category_id": 1, + "segmentation": { + "counts": [0, 10, 5, 5, 5, 5, 0, 10, 10, 0], + "size": [10, 5] + }, + "area": 30, + "bbox": [0, 0, 10, 4], + "iscrowd": 0 + } + ] + } diff --git a/tests/assets/coco_dataset/coco_stuff/images/val/000000000001.jpg b/tests/assets/coco_dataset/coco_stuff/images/val/000000000001.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8bce84d3bf50bd756621338e0da944a42428fb06 GIT binary patch literal 631 zcmex=^(PF6}rMnOeST|r4lSw=>~TvNxu(8R<c1}I=;VrF4wW9Q)H;sz?% zD!{d!pzFb!U9xX3zTPI5o8roG<0MW4oqZMDikqloVbuf*=gfJ(V&YTRE(2~ znmD<{#3dx9RMpfqG__1j&CD$#!v`*nMGf}ADfgxLCakm2}0|Nl_D;Zq? 
literal 0 HcmV?d00001 diff --git a/tests/assets/mnist_dataset/t10k-labels-idx1-ubyte.gz b/tests/assets/mnist_dataset/t10k-labels-idx1-ubyte.gz new file mode 100644 index 0000000000000000000000000000000000000000..a06f7a317c867ae3bdd32dcac470417bce34ce4c GIT binary patch literal 54 zcmb2|=HQq%eR2ZR{}MxkY~7s1q|}^Z-OQ8^(PF6}rMnOeST|r4lSw=>~TvNxu(8R<Mn*w~ z|3?_)frhg(f&l{*FfuW-u(GjpaB^`26>Jq?U}9uuW@2GxWo2Ojs;&jfGq4D<3Mm>o zvIz$!vMUve7&T5@$f4}C@t|nX#SbdRNkvVZTw>x9l2WQ_>Kd9_CZ=ZQ7M51dF0O9w z9-dyoA)#U65s^{JDXD4c8JStdC8cHM6_r)ZEv;?s9i3g1CQq3GGAU*RJ2VdF$b$$4{OPfBE|D a`;VW${@-HY00o;p!v`*nMP1teZvp^yfopjH literal 0 HcmV?d00001 diff --git a/tests/assets/voc_dataset/JPEGImages/2007_000002.jpg b/tests/assets/voc_dataset/voc_dataset1/JPEGImages/2007_000002.jpg similarity index 100% rename from tests/assets/voc_dataset/JPEGImages/2007_000002.jpg rename to tests/assets/voc_dataset/voc_dataset1/JPEGImages/2007_000002.jpg diff --git a/tests/assets/voc_dataset/SegmentationClass/2007_000001.png b/tests/assets/voc_dataset/voc_dataset1/SegmentationClass/2007_000001.png similarity index 100% rename from tests/assets/voc_dataset/SegmentationClass/2007_000001.png rename to tests/assets/voc_dataset/voc_dataset1/SegmentationClass/2007_000001.png diff --git a/tests/assets/voc_dataset/SegmentationObject/2007_000001.png b/tests/assets/voc_dataset/voc_dataset1/SegmentationObject/2007_000001.png similarity index 100% rename from tests/assets/voc_dataset/SegmentationObject/2007_000001.png rename to tests/assets/voc_dataset/voc_dataset1/SegmentationObject/2007_000001.png diff --git a/tests/assets/voc_dataset/voc_dataset2/Annotations/a.xml b/tests/assets/voc_dataset/voc_dataset2/Annotations/a.xml new file mode 100644 index 000000000000..d0c631a22280 --- /dev/null +++ b/tests/assets/voc_dataset/voc_dataset2/Annotations/a.xml @@ -0,0 +1,22 @@ + + + a.jpg + + Unknown + Unknown + Unknown + + 0 + + background + 0 + 1 + 0 + + 1.0 + 2.0 + 4.0 + 6.0 + + + diff --git 
a/tests/assets/voc_dataset/voc_dataset2/Annotations/b.xml b/tests/assets/voc_dataset/voc_dataset2/Annotations/b.xml new file mode 100644 index 000000000000..4dbfb5646f12 --- /dev/null +++ b/tests/assets/voc_dataset/voc_dataset2/Annotations/b.xml @@ -0,0 +1,22 @@ + + + b.jpg + + Unknown + Unknown + Unknown + + 0 + + aeroplane + 0 + 1 + 0 + + 2.0 + 2.0 + 7.0 + 6.0 + + + diff --git a/tests/assets/voc_dataset/voc_dataset2/Annotations/c.xml b/tests/assets/voc_dataset/voc_dataset2/Annotations/c.xml new file mode 100644 index 000000000000..72071892ca6f --- /dev/null +++ b/tests/assets/voc_dataset/voc_dataset2/Annotations/c.xml @@ -0,0 +1,22 @@ + + + c.jpg + + Unknown + Unknown + Unknown + + 0 + + bicycle + 0 + 0 + 0 + + 3.0 + 1.0 + 11.0 + 6.0 + + + diff --git a/tests/assets/voc_dataset/voc_dataset2/Annotations/d.xml b/tests/assets/voc_dataset/voc_dataset2/Annotations/d.xml new file mode 100644 index 000000000000..8917c1b7d5d7 --- /dev/null +++ b/tests/assets/voc_dataset/voc_dataset2/Annotations/d.xml @@ -0,0 +1,22 @@ + + + d.jpg + + Unknown + Unknown + Unknown + + 0 + + bird + 0 + 0 + 0 + + 4.0 + 4.0 + 8.0 + 8.0 + + + diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Action/trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Action/trainval.txt new file mode 100644 index 000000000000..d68dd4031d2a --- /dev/null +++ b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Action/trainval.txt @@ -0,0 +1,4 @@ +a +b +c +d diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Layout/trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Layout/trainval.txt new file mode 100644 index 000000000000..d68dd4031d2a --- /dev/null +++ b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Layout/trainval.txt @@ -0,0 +1,4 @@ +a +b +c +d diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/aeroplane_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/aeroplane_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff 
--git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/background_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/background_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/bicycle_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/bicycle_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/bird_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/bird_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/boat_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/boat_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/bottle_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/bottle_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/bus_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/bus_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/car_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/car_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/cat_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/cat_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/chair_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/chair_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/cow_trainval.txt 
b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/cow_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/diningtable_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/diningtable_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/dog_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/dog_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/horse_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/horse_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/ignored_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/ignored_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/motorbike_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/motorbike_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/person_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/person_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/pottedplant_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/pottedplant_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/sheep_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/sheep_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/sofa_trainval.txt 
b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/sofa_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/train_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/train_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/trainval.txt new file mode 100644 index 000000000000..d68dd4031d2a --- /dev/null +++ b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/trainval.txt @@ -0,0 +1,4 @@ +a +b +c +d diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/tvmonitor_trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Main/tvmonitor_trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/ImageSets/Segmentation/trainval.txt b/tests/assets/voc_dataset/voc_dataset2/ImageSets/Segmentation/trainval.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/assets/voc_dataset/voc_dataset2/labelmap.txt b/tests/assets/voc_dataset/voc_dataset2/labelmap.txt new file mode 100644 index 000000000000..28c44bd6a5bc --- /dev/null +++ b/tests/assets/voc_dataset/voc_dataset2/labelmap.txt @@ -0,0 +1,23 @@ +# label:color_rgb:parts:actions +background:0,0,0:: +aeroplane:128,0,0:: +bicycle:0,128,0:: +bird:128,128,0:: +boat:0,0,128:: +bottle:128,0,128:: +bus:0,128,128:: +car:128,128,128:: +cat:64,0,0:: +chair:192,0,0:: +cow:64,128,0:: +diningtable:192,128,0:: +dog:64,0,128:: +horse:192,0,128:: +motorbike:64,128,128:: +person:192,128,128:: +pottedplant:0,64,0:: +sheep:128,64,0:: +sofa:0,192,0:: +train:128,192,0:: +tvmonitor:0,64,128:: +ignored:224,224,192:: diff --git a/tests/cli/test_diff.py b/tests/cli/test_diff.py index 591b117119f8..96bf97fa2587 100644 --- a/tests/cli/test_diff.py +++ b/tests/cli/test_diff.py @@ -15,9 +15,11 @@ ) from 
datumaro.util.image import Image from datumaro.util.test_utils import TestDir +from ..requirements import Requirements, mark_requirement class DiffTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_compare_projects(self): # just a smoke test label_categories1 = LabelCategories.from_iterable(['x', 'a', 'b', 'y']) mask_categories1 = MaskCategories.make_default(len(label_categories1)) diff --git a/tests/cli/test_voc_format.py b/tests/cli/test_voc_format.py new file mode 100644 index 000000000000..a707a4651336 --- /dev/null +++ b/tests/cli/test_voc_format.py @@ -0,0 +1,291 @@ +import os.path as osp +import numpy as np +from collections import OrderedDict + +from unittest import TestCase + +import datumaro.plugins.voc_format.format as VOC +from datumaro.components.dataset import Dataset, DatasetItem +from datumaro.components.extractor import Bbox, Mask, Image, Label +from datumaro.cli.__main__ import main +from datumaro.util.test_utils import TestDir, compare_datasets +from ..requirements import Requirements, mark_requirement + +DUMMY_DATASETS_DIR = osp.join(__file__[:__file__.rfind(osp.join('tests', ''))], + 'tests', 'assets', 'voc_dataset') + +def run(test, *args, expected_code=0): + test.assertEqual(expected_code, main(args), str(args)) + + +class VocIntegrationScenarios(TestCase): + def _test_can_save_and_load(self, project_path, source_path, source_dataset, + dataset_format, result_path=None, label_map=None): + run(self, 'create', '-o', project_path) + run(self, 'add', 'path', '-p', project_path, '-f', dataset_format, source_path) + + result_dir = osp.join(project_path, 'voc_dataset') + run(self, 'export', '-f', dataset_format, '-p', project_path, + '-o', result_dir, '--', '--label-map', label_map) + + result_path = osp.join(result_dir, result_path) if result_path else result_dir + target_dataset = Dataset.import_from(result_path, dataset_format) + compare_datasets(self, source_dataset, target_dataset) + + 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_preparing_dataset_for_train_model(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='c', subset='train', + annotations=[ + Bbox(3.0, 1.0, 8.0, 5.0, + attributes={ + 'truncated': False, + 'occluded': False, + 'difficult': False + }, + id=1, label=2, group=1 + ) + ] + ), + DatasetItem(id='d', subset='test', + annotations=[ + Bbox(4.0, 4.0, 4.0, 4.0, + attributes={ + 'truncated': False, + 'occluded': False, + 'difficult': False + }, + id=1, label=3, group=1 + ) + ] + ) + ], categories=VOC.make_voc_categories()) + + dataset_path = osp.join(DUMMY_DATASETS_DIR, 'voc_dataset2') + + with TestDir() as test_dir: + run(self, 'create', '-o', test_dir) + run(self, 'add', 'path', '-p', test_dir, '-f', 'voc', dataset_path) + + result_path = osp.join(test_dir, 'result') + run(self, 'filter', '-p', test_dir, '-m', 'i+a', + '-e', "/item/annotation[occluded='False']", '-o', result_path) + + split_path = osp.join(test_dir, 'split') + run(self, 'transform', '-p', result_path, '-o', split_path, + '-t', 'random_split', '--', '-s', 'test:.5', + '-s', 'train:.5', '--seed', '1') + + export_path = osp.join(test_dir, 'dataset') + run(self, 'export', '-p', split_path, '-f', 'voc', + '-o', export_path, '--', '--label-map', 'voc') + + parsed_dataset = Dataset.import_from(export_path, format='voc') + compare_datasets(self, source_dataset, parsed_dataset) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_convert_to_voc_format(self): + label_map = OrderedDict(('label_' + str(i), [None, [], []]) for i in range(10)) + label_map['background'] = [None, [], []] + label_map.move_to_end('background', last=False) + + source_dataset = Dataset.from_iterable([ + DatasetItem(id='1', subset='train', + annotations=[ + Bbox(0.0, 2.0, 4.0, 2.0, + attributes={ + 'difficult': False, + 'truncated': False, + 'occluded': False + }, + id=1, label=3, group=1 + ), + Bbox(3.0, 3.0, 2.0, 3.0, + attributes={ + 'difficult': False, + 
'truncated': False, + 'occluded': False + }, + id=2, label=5, group=2 + ) + ] + ) + ], categories=VOC.make_voc_categories(label_map)) + + with TestDir() as test_dir: + yolo_dir = osp.join(__file__[:__file__.rfind(osp.join('tests', ''))], + 'tests', 'assets', 'yolo_dataset') + + run(self, 'create', '-o', test_dir) + run(self, 'add', 'path', '-p', test_dir, '-f', 'yolo', yolo_dir) + + voc_export = osp.join(test_dir, 'voc_export') + run(self, 'export', '-p', test_dir, '-f', 'voc', + '-o', voc_export) + + parsed_dataset = Dataset.import_from(voc_export, format='voc') + compare_datasets(self, source_dataset, parsed_dataset) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_voc_dataset(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='2007_000001', subset='train', + image=Image(path='2007_000001.jpg', size=(10, 20)), + annotations=[Label(i) for i in range(22) if i % 2 == 1] + [ + Bbox(4.0, 5.0, 2.0, 2.0, label=15, + attributes={ + 'difficult': False, + 'truncated': False, + 'occluded': False, + **{ + a.name : a.value % 2 == 1 + for a in VOC.VocAction + } + }, + id=1, group=1 + ), + Bbox(1.0, 2.0, 2.0, 2.0, label=8, + attributes={ + 'difficult': False, + 'truncated': True, + 'occluded': False, + 'pose': 'Unspecified' + }, + id=2, group=2 + ), + Bbox(5.5, 6.0, 2.0, 2.0, label=22, + id=0, group=1 + ), + Mask(image=np.ones([5, 10]), label=2, group=1) + ] + ), + DatasetItem(id='2007_000002', subset='test', + image=np.ones((10, 20, 3)) + ) + ], categories=VOC.make_voc_categories()) + + voc_dir = osp.join(DUMMY_DATASETS_DIR, 'voc_dataset1') + with TestDir() as test_dir: + self._test_can_save_and_load(test_dir, voc_dir, source_dataset, + 'voc', label_map='voc') + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_voc_layout_dataset(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='2007_000001', subset='train', + image=Image(path='2007_000001.jpg', size=(10, 20)), + annotations=[ + 
Bbox(4.0, 5.0, 2.0, 2.0, label=15, + attributes={ + 'difficult': False, + 'truncated': False, + 'occluded': False, + **{ + a.name : a.value % 2 == 1 + for a in VOC.VocAction + } + }, + id=1, group=1 + ), + Bbox(5.5, 6.0, 2.0, 2.0, label=22, + id=0, group=1 + ), + ] + ), + ], categories=VOC.make_voc_categories()) + + voc_layout_path = osp.join(DUMMY_DATASETS_DIR, 'voc_dataset1', + 'ImageSets', 'Layout', 'train.txt') + + with TestDir() as test_dir: + result_voc_path = osp.join('ImageSets', 'Layout', 'train.txt') + self._test_can_save_and_load(test_dir, voc_layout_path, source_dataset, + 'voc_layout', result_path=result_voc_path, label_map='voc') + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_voc_detect_dataset(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='2007_000001', subset='train', + image=Image(path='2007_000001.jpg', size=(10, 20)), + annotations=[ + Bbox(4.0, 5.0, 2.0, 2.0, label=15, + attributes={ + 'difficult': False, + 'truncated': False, + 'occluded': False, + **{ + a.name : a.value % 2 == 1 + for a in VOC.VocAction + } + }, + id=2, group=2 + ), + Bbox(1.0, 2.0, 2.0, 2.0, label=8, + attributes={ + 'difficult': False, + 'truncated': True, + 'occluded': False, + 'pose': 'Unspecified' + }, + id=1, group=1 + ) + ] + ), + ], categories=VOC.make_voc_categories()) + + voc_detection_path = osp.join(DUMMY_DATASETS_DIR, 'voc_dataset1', + 'ImageSets', 'Main', 'train.txt') + + with TestDir() as test_dir: + result_voc_path = osp.join('ImageSets', 'Main', 'train.txt') + self._test_can_save_and_load(test_dir, voc_detection_path, source_dataset, + 'voc_detection', result_path=result_voc_path, label_map='voc') + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_voc_segmentation_dataset(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='2007_000001', subset='train', + image=Image(path='2007_000001.jpg', size=(10, 20)), + annotations=[ + Mask(image=np.ones([5, 10]), 
label=2, group=1) + ] + ) + ], categories=VOC.make_voc_categories()) + + voc_segm_path = osp.join(DUMMY_DATASETS_DIR, 'voc_dataset1', + 'ImageSets', 'Segmentation', 'train.txt') + + with TestDir() as test_dir: + result_voc_path = osp.join('ImageSets', 'Segmentation', 'train.txt') + self._test_can_save_and_load(test_dir, voc_segm_path, source_dataset, + 'voc_segmentation', result_path=result_voc_path, label_map='voc') + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_voc_action_dataset(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='2007_000001', subset='train', + image=Image(path='2007_000001.jpg', size=(10, 20)), + annotations=[ + Bbox(4.0, 5.0, 2.0, 2.0, label=15, + attributes={ + 'difficult': False, + 'truncated': False, + 'occluded': False, + **{ + a.name : a.value % 2 == 1 + for a in VOC.VocAction + } + }, + id=1, group=1 + ) + ] + ) + ], categories=VOC.make_voc_categories()) + + voc_act_path = osp.join(DUMMY_DATASETS_DIR, 'voc_dataset1', + 'ImageSets', 'Action', 'train.txt') + + with TestDir() as test_dir: + result_voc_path = osp.join('ImageSets', 'Action', 'train.txt') + self._test_can_save_and_load(test_dir, voc_act_path, source_dataset, + 'voc_action', result_path=result_voc_path, label_map='voc') diff --git a/tests/cli/test_yolo_format.py b/tests/cli/test_yolo_format.py new file mode 100644 index 000000000000..2ff047a01066 --- /dev/null +++ b/tests/cli/test_yolo_format.py @@ -0,0 +1,163 @@ +import numpy as np +import os.path as osp + +from unittest import TestCase + +from datumaro.cli.__main__ import main +from datumaro.components.dataset import Dataset +from datumaro.components.extractor import (DatasetItem, + AnnotationType, Bbox) +from datumaro.util.test_utils import TestDir, compare_datasets +import datumaro.plugins.voc_format.format as VOC +from ..requirements import Requirements, mark_requirement + +def run(test, *args, expected_code=0): + test.assertEqual(expected_code, main(args), str(args)) + 
+class YoloIntegrationScenarios(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_yolo_dataset(self): + target_dataset = Dataset.from_iterable([ + DatasetItem(id='1', subset='train', + image=np.ones((10, 15, 3)), + annotations=[ + Bbox(3.0, 3.0, 2.0, 3.0, label=4), + Bbox(0.0, 2.0, 4.0, 2.0, label=2) + ] + ) + ], categories=['label_' + str(i) for i in range(10)]) + + with TestDir() as test_dir: + yolo_dir = osp.join(__file__[:__file__.rfind(osp.join('tests', ''))], + 'tests', 'assets', 'yolo_dataset') + + run(self, 'import', '-o', test_dir, '-f', 'yolo', '-i', yolo_dir) + + export_dir = osp.join(test_dir, 'export_dir') + run(self, 'export', '-p', test_dir, '-o', export_dir, + '-f', 'yolo', '--', '--save-images') + + parsed_dataset = Dataset.import_from(export_dir, format='yolo') + compare_datasets(self, target_dataset, parsed_dataset) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_export_mot_as_yolo(self): + target_dataset = Dataset.from_iterable([ + DatasetItem(id='1', subset='train', + annotations=[ + Bbox(0.0, 4.0, 4.0, 8.0, label=2) + ] + ) + ], categories=['label_' + str(i) for i in range(10)]) + + with TestDir() as test_dir: + mot_dir = osp.join(__file__[:__file__.rfind(osp.join('tests', ''))], + 'tests', 'assets', 'mot_dataset') + + run(self, 'create', '-o', test_dir) + run(self, 'add', 'path', '-p', test_dir, '-f', 'mot_seq', mot_dir) + + yolo_dir = osp.join(test_dir, 'yolo_dir') + run(self, 'export', '-p', test_dir, '-o', yolo_dir, + '-f', 'yolo', '--', '--save-images') + + parsed_dataset = Dataset.import_from(yolo_dir, format='yolo') + compare_datasets(self, target_dataset, parsed_dataset) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_convert_voc_to_yolo(self): + target_dataset = Dataset.from_iterable([ + DatasetItem(id='2007_000001', subset='train', + annotations=[ + Bbox(8.0, 2.5, 4.0, 1.0, label=15), + Bbox(2.0, 1.0, 4.0, 1.0, label=8), + Bbox(11.0, 3.0, 4.0, 1.0, 
label=22) + ] + ) + ], categories=[label.name for label in + VOC.make_voc_categories()[AnnotationType.label]]) + + with TestDir() as test_dir: + voc_dir = osp.join(__file__[:__file__.rfind(osp.join('tests', ''))], + 'tests', 'assets', 'voc_dataset', 'voc_dataset1') + yolo_dir = osp.join(test_dir, 'yolo_dir') + + run(self, 'convert', '-if', 'voc', '-i', voc_dir, + '-f', 'yolo', '-o', yolo_dir, '--', '--save-images') + + parsed_dataset = Dataset.import_from(yolo_dir, format='yolo') + compare_datasets(self, target_dataset, parsed_dataset) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_ignore_non_supported_subsets(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='img1', subset='test', + image=np.ones((10, 20, 3)), + annotations=[ + Bbox(1.0, 2.0, 1.0, 1.0, label=0) + ] + ), + DatasetItem(id='img2', subset='train', + image=np.ones((10, 5, 3)), + annotations=[ + Bbox(3.0, 1.0, 2.0, 1.0, label=1) + ] + ) + ], categories=[str(i) for i in range(4)]) + + target_dataset = Dataset.from_iterable([ + DatasetItem(id='img2', subset='train', + image=np.ones((10, 5, 3)), + annotations=[ + Bbox(3.0, 1.0, 2.0, 1.0, label=1) + ] + ) + ], categories=[str(i) for i in range(4)]) + + with TestDir() as test_dir: + dataset_dir = osp.join(test_dir, 'dataset_dir') + source_dataset.save(dataset_dir, save_images=True) + + run(self, 'create', '-o', test_dir) + run(self, 'add', 'path', '-p', test_dir, '-f', 'datumaro', dataset_dir) + + yolo_dir = osp.join(test_dir, 'yolo_dir') + run(self, 'export', '-p', test_dir, '-o', yolo_dir, + '-f', 'yolo', '--', '--save-images') + + parsed_dataset = Dataset.import_from(yolo_dir, format='yolo') + compare_datasets(self, target_dataset, parsed_dataset) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_delete_labels_from_yolo_dataset(self): + target_dataset = Dataset.from_iterable([ + DatasetItem(id='1', subset='train', + image=np.ones((10, 15, 3)), + annotations=[ + Bbox(0.0, 2.0, 4.0, 2.0, label=0) + ] 
+ ) + ], categories=['label_2']) + + with TestDir() as test_dir: + yolo_dir = osp.join(__file__[:__file__.rfind(osp.join('tests', ''))], + 'tests', 'assets', 'yolo_dataset') + + run(self, 'create', '-o', test_dir) + run(self, 'add', 'path', '-p', test_dir, '-f', 'yolo', yolo_dir) + + filtered_path = osp.join(test_dir, 'filtered') + run(self, 'filter', '-p', test_dir, '-o', filtered_path, + '-m', 'i+a', '-e', "/item/annotation[label='label_2']") + + result_path = osp.join(test_dir, 'result') + run(self, 'transform', '-p', filtered_path, '-o', result_path, + '-t', 'remap_labels', '--', '-l', 'label_2:label_2', + '--default', 'delete') + + export_dir = osp.join(test_dir, 'export') + run(self, 'export', '-p', result_path, '-o', export_dir, + '-f', 'yolo', '--', '--save-image') + + parsed_dataset = Dataset.import_from(export_dir, format='yolo') + compare_datasets(self, target_dataset, parsed_dataset) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000000..75e399ee31ac --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,17 @@ +# Copyright (C) 2021 Intel Corporation +# +# SPDX-License-Identifier: MIT + +def pytest_configure(config): + # register additional markers + config.addinivalue_line("markers", "unit: mark a test as unit test") + config.addinivalue_line("markers", "component: mark a test a component test") + config.addinivalue_line("markers", "cli: mark a test a CLI test") + + config.addinivalue_line("markers", "priority_low: mark a test as low priority") + config.addinivalue_line("markers", "priority_medium: mark a test as medium priority") + config.addinivalue_line("markers", "priority_high: mark a test as high priority") + + config.addinivalue_line("markers", "components(ids): link a test with a component") + config.addinivalue_line("markers", "reqids(ids): link a test with a requirement") + config.addinivalue_line("markers", "bugs(ids): link a test with a bug") diff --git a/tests/requirements.py b/tests/requirements.py new file 
mode 100644 index 000000000000..49ab421d62eb --- /dev/null +++ b/tests/requirements.py @@ -0,0 +1,49 @@ +# Copyright (C) 2021 Intel Corporation +# +# SPDX-License-Identifier: MIT + +import pytest + + +def mark_requirement(requirement): + def wrapper(test_func): + @pytest.mark.components(DatumaroComponent.Datumaro) + @pytest.mark.component + @pytest.mark.priority_medium + @pytest.mark.reqids(requirement) + def test_wrapper(*args, **kwargs): + return test_func(*args, **kwargs) + return test_wrapper + return wrapper + +def mark_bug(bugs): + def wrapper(test_func): + @pytest.mark.components(DatumaroComponent.Datumaro) + @pytest.mark.component + @pytest.mark.priority_medium + @pytest.mark.bugs(bugs) + def test_wrapper(*args, **kwargs): + return test_func(*args, **kwargs) + return test_wrapper + return wrapper + + +class DatumaroComponent: + Datumaro = "datumaro" + + +class Requirements: + # Exact requirements + DATUM_GENERAL_REQ = "Datumaro general requirement" + + # GitHub issues (not bugs) + # https://github.com/openvinotoolkit/datumaro/issues + DATUM_244 = "Add Snyk integration" + + # GitHub issues (bugs) + # https://github.com/openvinotoolkit/datumaro/issues + DATUM_BUG_219 = "Return format is not uniform" + + +class SkipMessages: + NOT_IMPLEMENTED = "NOT IMPLEMENTED" diff --git a/tests/test_RISE.py b/tests/test_RISE.py index 04772287f43a..b32cc0b9459e 100644 --- a/tests/test_RISE.py +++ b/tests/test_RISE.py @@ -6,9 +6,11 @@ from datumaro.components.extractor import Label, Bbox from datumaro.components.launcher import Launcher from datumaro.components.algorithms.rise import RISE +from .requirements import Requirements, mark_requirement class RiseTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_rise_can_be_applied_to_classification_model(self): class TestLauncher(Launcher): def __init__(self, class_count, roi, **kwargs): @@ -57,6 +59,7 @@ def _process(self, image): hrest_den = (h_sum - roi_sum) / (h_area - roi_area) 
self.assertLess(hrest_den, roi_den) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_rise_can_be_applied_to_detection_model(self): ROI = namedtuple('ROI', ['threshold', 'x', 'y', 'w', 'h', 'label']) diff --git a/tests/test_camvid_format.py b/tests/test_camvid_format.py index 9bf3b1b2340c..73df7b2b592f 100644 --- a/tests/test_camvid_format.py +++ b/tests/test_camvid_format.py @@ -12,9 +12,11 @@ from datumaro.util.image import Image from datumaro.util.test_utils import (TestDir, compare_datasets, test_save_and_load) +from .requirements import Requirements, mark_requirement class CamvidFormatTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_write_and_parse_labelmap(self): src_label_map = Camvid.CamvidLabelMap @@ -35,6 +37,7 @@ def categories(self): return Camvid.make_camvid_categories() class CamvidImportTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='0001TP_008550', subset='test', @@ -73,16 +76,19 @@ def test_can_import(self): compare_datasets(self, source_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect_camvid(self): self.assertTrue(CamvidImporter.detect(DUMMY_DATASET_DIR)) class CamvidConverterTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def _test_save_and_load(self, source_dataset, converter, test_dir, target_dataset=None, importer_args=None, **kwargs): return test_save_and_load(self, source_dataset, converter, test_dir, importer='camvid', target_dataset=target_dataset, importer_args=importer_args, **kwargs) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_camvid_segm(self): class TestExtractor(TestExtractorBase): def __iter__(self): @@ -100,6 +106,7 @@ def __iter__(self): partial(CamvidConverter.convert, label_map='camvid'), test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def 
test_can_save_camvid_segm_unpainted(self): class TestExtractor(TestExtractorBase): def __iter__(self): @@ -127,6 +134,7 @@ def __iter__(self): label_map='camvid', apply_colormap=False), test_dir, target_dataset=DstExtractor()) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_no_subsets(self): class TestExtractor(TestExtractorBase): def __iter__(self): @@ -146,6 +154,7 @@ def __iter__(self): self._test_save_and_load(TestExtractor(), partial(CamvidConverter.convert, label_map='camvid'), test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): class TestExtractor(TestExtractorBase): def __iter__(self): @@ -162,6 +171,7 @@ def __iter__(self): self._test_save_and_load(TestExtractor(), partial(CamvidConverter.convert, label_map='camvid'), test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_with_no_masks(self): class TestExtractor(TestExtractorBase): def __iter__(self): @@ -176,6 +186,7 @@ def __iter__(self): partial(CamvidConverter.convert, label_map='camvid'), test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_dataset_with_source_labelmap_undefined(self): class SrcExtractor(TestExtractorBase): def __iter__(self): @@ -211,6 +222,7 @@ def categories(self): partial(CamvidConverter.convert, label_map='source'), test_dir, target_dataset=DstExtractor()) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_dataset_with_source_labelmap_defined(self): class SrcExtractor(TestExtractorBase): def __iter__(self): @@ -245,6 +257,7 @@ def categories(self): partial(CamvidConverter.convert, label_map='source'), test_dir, target_dataset=DstExtractor()) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_arbitrary_extension(self): class SrcExtractor(TestExtractorBase): def __iter__(self): diff --git a/tests/test_cifar_format.py b/tests/test_cifar_format.py new file mode 100644 index 
000000000000..480d795954ca --- /dev/null +++ b/tests/test_cifar_format.py @@ -0,0 +1,158 @@ +import os.path as osp +from unittest import TestCase + +import numpy as np +from datumaro.components.dataset import Dataset +from datumaro.components.extractor import (AnnotationType, DatasetItem, Label, + LabelCategories) +from datumaro.plugins.cifar_format import CifarConverter, CifarImporter +from datumaro.util.image import Image +from datumaro.util.test_utils import TestDir, compare_datasets +from .requirements import Requirements, mark_requirement + + +class CifarFormatTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='image_2', subset='test', + image=np.ones((32, 32, 3)), + annotations=[Label(0)] + ), + DatasetItem(id='image_3', subset='test', + image=np.ones((32, 32, 3)) + ), + DatasetItem(id='image_4', subset='test', + image=np.ones((32, 32, 3)), + annotations=[Label(1)] + ) + ], categories=['label_0', 'label_1']) + + with TestDir() as test_dir: + CifarConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'cifar') + + compare_datasets(self, source_dataset, parsed_dataset, + require_images=True) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_without_saving_images(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='a', subset='train_1', + annotations=[Label(0)] + ), + DatasetItem(id='b', subset='train_first', + annotations=[Label(1)] + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + 'label' + str(label) for label in range(2)), + }) + + with TestDir() as test_dir: + CifarConverter.convert(source_dataset, test_dir, save_images=False) + parsed_dataset = Dataset.import_from(test_dir, 'cifar') + + compare_datasets(self, source_dataset, parsed_dataset, + require_images=True) + + 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_with_different_image_size(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='image_1', + image=np.ones((10, 8, 3)), + annotations=[Label(0)] + ), + DatasetItem(id='image_2', + image=np.ones((32, 32, 3)), + annotations=[Label(1)] + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + 'label' + str(label) for label in range(2)), + }) + + with TestDir() as test_dir: + CifarConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'cifar') + + compare_datasets(self, source_dataset, parsed_dataset, + require_images=True) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id="кириллица с пробелом", + image=np.ones((32, 32, 3)), + annotations=[Label(0)] + ), + ], categories=['label_0']) + + with TestDir() as test_dir: + CifarConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'cifar') + + compare_datasets(self, source_dataset, parsed_dataset, + require_images=True) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_image_with_arbitrary_extension(self): + dataset = Dataset.from_iterable([ + DatasetItem(id='q/1', image=Image(path='q/1.JPEG', + data=np.zeros((32, 32, 3)))), + DatasetItem(id='a/b/c/2', image=Image(path='a/b/c/2.bmp', + data=np.zeros((32, 32, 3)))), + ], categories=[]) + + with TestDir() as test_dir: + CifarConverter.convert(dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'cifar') + + compare_datasets(self, dataset, parsed_dataset, + require_images=True) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_empty_image(self): + dataset = Dataset.from_iterable([ + DatasetItem(id='a', 
annotations=[Label(0)]), + DatasetItem(id='b') + ], categories=['label_0']) + + with TestDir() as test_dir: + CifarConverter.convert(dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'cifar') + + compare_datasets(self, dataset, parsed_dataset, + require_images=True) + +DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'cifar_dataset') + +class CifarImporterTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_import(self): + expected_dataset = Dataset.from_iterable([ + DatasetItem(id='image_1', subset='train_1', + image=np.ones((32, 32, 3)), + annotations=[Label(0)] + ), + DatasetItem(id='image_2', subset='test', + image=np.ones((32, 32, 3)), + annotations=[Label(1)] + ), + DatasetItem(id='image_3', subset='test', + image=np.ones((32, 32, 3)), + annotations=[Label(3)] + ), + DatasetItem(id='image_4', subset='test', + image=np.ones((32, 32, 3)), + annotations=[Label(2)] + ) + ], categories=['airplane', 'automobile', 'bird', 'cat']) + + dataset = Dataset.import_from(DUMMY_DATASET_DIR, 'cifar') + + compare_datasets(self, expected_dataset, dataset) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_detect(self): + self.assertTrue(CifarImporter.detect(DUMMY_DATASET_DIR)) diff --git a/tests/test_cityscapes_format.py b/tests/test_cityscapes_format.py new file mode 100644 index 000000000000..fd23de9d76b5 --- /dev/null +++ b/tests/test_cityscapes_format.py @@ -0,0 +1,350 @@ +import os.path as osp +from collections import OrderedDict +from functools import partial +from unittest import TestCase + +import datumaro.plugins.cityscapes_format as Cityscapes +import numpy as np +from datumaro.components.extractor import (AnnotationType, DatasetItem, + Extractor, LabelCategories, Mask) +from datumaro.components.dataset import Dataset +from datumaro.plugins.cityscapes_format import (CityscapesImporter, + CityscapesConverter) +from datumaro.util.image import Image +from 
datumaro.util.test_utils import (TestDir, compare_datasets, + test_save_and_load) + +DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', + 'cityscapes_dataset') + +class CityscapesFormatTest(TestCase): + def test_can_write_and_parse_labelmap(self): + src_label_map = Cityscapes.CityscapesLabelMap + + with TestDir() as test_dir: + file_path = osp.join(test_dir, 'label_colors.txt') + + Cityscapes.write_label_map(file_path, src_label_map) + dst_label_map = Cityscapes.parse_label_map(file_path) + + self.assertEqual(src_label_map, dst_label_map) + +class CityscapesImportTest(TestCase): + def test_can_import(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='defaultcity/defaultcity_000001_000031', + subset='test', + image=np.ones((1, 5, 3)), + annotations=[ + Mask(image=np.array([[1, 1, 0, 0, 0]]), id=3, label=3, + attributes={'is_crowd': True}), + Mask(image=np.array([[0, 0, 1, 0, 0]]), id=1, label=27, + attributes={'is_crowd': False}), + Mask(image=np.array([[0, 0, 0, 1, 1]]), id=2, label=27, + attributes={'is_crowd': False}), + ] + ), + DatasetItem(id='defaultcity/defaultcity_000001_000032', + subset='test', + image=np.ones((1, 5, 3)), + annotations=[ + Mask(image=np.array([[1, 1, 0, 0, 0]]), id=1, label=31, + attributes={'is_crowd': False}), + Mask(image=np.array([[0, 0, 1, 0, 0]]), id=12, label=12, + attributes={'is_crowd': True}), + Mask(image=np.array([[0, 0, 0, 1, 1]]), id=3, label=3, + attributes={'is_crowd': True}), + ] + ), + DatasetItem(id='defaultcity/defaultcity_000002_000045', + subset='train', + image=np.ones((1, 5, 3)), + annotations=[ + Mask(image=np.array([[1, 1, 0, 1, 1]]), id=3, label=3, + attributes={'is_crowd': True}), + Mask(image=np.array([[0, 0, 1, 0, 0]]), id=1, label=24, + attributes={'is_crowd': False}), + ] + ), + DatasetItem(id='defaultcity/defaultcity_000001_000019', + subset = 'val', + image=np.ones((1, 5, 3)), + annotations=[ + Mask(image=np.array([[1, 0, 0, 1, 1]]), id=3, label=3, + attributes={'is_crowd': 
True}), + Mask(image=np.array([[0, 1, 1, 0, 0]]), id=24, label=1, + attributes={'is_crowd': False}), + ] + ), + ], categories=Cityscapes.make_cityscapes_categories()) + + parsed_dataset = Dataset.import_from(DUMMY_DATASET_DIR, 'cityscapes') + + compare_datasets(self, source_dataset, parsed_dataset) + + def test_can_detect_cityscapes(self): + self.assertTrue(CityscapesImporter.detect(DUMMY_DATASET_DIR)) + + +class TestExtractorBase(Extractor): + def _label(self, cityscapes_label): + return self.categories()[AnnotationType.label].find(cityscapes_label)[0] + + def categories(self): + return Cityscapes.make_cityscapes_categories() + +class CityscapesConverterTest(TestCase): + def _test_save_and_load(self, source_dataset, converter, test_dir, + target_dataset=None, importer_args=None, **kwargs): + return test_save_and_load(self, source_dataset, converter, test_dir, + importer='cityscapes', + target_dataset=target_dataset, importer_args=importer_args, **kwargs) + + def test_can_save_cityscapes_segm(self): + class TestExtractor(TestExtractorBase): + def __iter__(self): + return iter([ + DatasetItem(id='defaultcity_1_2', subset='test', + image=np.ones((1, 5, 3)), annotations=[ + Mask(image=np.array([[0, 0, 0, 1, 0]]), label=3, id=3, + attributes={'is_crowd': True}), + Mask(image=np.array([[0, 1, 1, 0, 0]]), label=24, id=1, + attributes={'is_crowd': False}), + Mask(image=np.array([[1, 0, 0, 0, 1]]), label=15, id=15, + attributes={'is_crowd': True}), + ]), + DatasetItem(id='defaultcity_3', subset='val', + image=np.ones((1, 5, 3)), annotations=[ + Mask(image=np.array([[1, 1, 0, 1, 1]]), label=3, id=3, + attributes={'is_crowd': True}), + Mask(image=np.array([[0, 0, 1, 0, 0]]), label=5, id=5, + attributes={'is_crowd': True}), + ]), + ]) + with TestDir() as test_dir: + self._test_save_and_load(TestExtractor(), + partial(CityscapesConverter.convert, label_map='cityscapes', + save_images=True), test_dir) + + def test_can_save_cityscapes_segm_unpainted(self): + class 
TestExtractor(TestExtractorBase): + def __iter__(self): + return iter([ + DatasetItem(id='defaultcity_1_2', subset='test', + image=np.ones((1, 5, 3)), annotations=[ + Mask(image=np.array([[0, 0, 0, 1, 0]]), label=3, id=3, + attributes={'is_crowd': True}), + Mask(image=np.array([[0, 1, 1, 0, 0]]), label=24, id=1, + attributes={'is_crowd': False}), + Mask(image=np.array([[1, 0, 0, 0, 1]]), label=15, id=15, + attributes={'is_crowd': True}), + ]), + ]) + + with TestDir() as test_dir: + self._test_save_and_load(TestExtractor(), + partial(CityscapesConverter.convert, label_map='cityscapes', + save_images=True, apply_colormap=False), test_dir) + + def test_can_save_cityscapes_dataset_with_no_subsets(self): + class TestExtractor(TestExtractorBase): + def __iter__(self): + return iter([ + DatasetItem(id='defaultcity_1_2', + image=np.ones((1, 5, 3)), annotations=[ + Mask(image=np.array([[1, 0, 0, 1, 0]]), label=0, id=0, + attributes={'is_crowd': True}), + Mask(image=np.array([[0, 1, 1, 0, 1]]), label=3, id=3, + attributes={'is_crowd': True}), + ]), + + DatasetItem(id='defaultcity_1_3', + image=np.ones((1, 5, 3)), annotations=[ + Mask(image=np.array([[1, 1, 0, 1, 0]]), label=1, id=1, + attributes={'is_crowd': True}), + Mask(image=np.array([[0, 0, 1, 0, 1]]), label=2, id=2, + attributes={'is_crowd': True}), + ]), + ]) + + with TestDir() as test_dir: + self._test_save_and_load(TestExtractor(), + partial(CityscapesConverter.convert, label_map='cityscapes', + save_images=True), test_dir) + + def test_can_save_cityscapes_dataset_without_frame_and_sequence(self): + class TestExtractor(TestExtractorBase): + def __iter__(self): + return iter([ + DatasetItem(id='justcity', subset='test', + image=np.ones((1, 5, 3)), annotations=[ + Mask(image=np.array([[1, 0, 0, 1, 1]]), label=3, id=3, + attributes={'is_crowd': True}), + Mask(image=np.array([[0, 1, 1, 0, 0]]), label=24, id=1, + attributes={'is_crowd': False}), + ]), + ]) + with TestDir() as test_dir: + 
self._test_save_and_load(TestExtractor(), + partial(CityscapesConverter.convert, label_map='cityscapes', + save_images=True), test_dir) + + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + class TestExtractor(TestExtractorBase): + def __iter__(self): + return iter([ + DatasetItem(id='кириллица с пробелом', + image=np.ones((1, 5, 3)), annotations=[ + Mask(image=np.array([[1, 0, 0, 1, 1]]), label=3, id=3, + attributes={'is_crowd': True}), + Mask(image=np.array([[0, 1, 1, 0, 0]]), label=24, id=1, + attributes={'is_crowd': False}), + ]), + ]) + + with TestDir() as test_dir: + self._test_save_and_load(TestExtractor(), + partial(CityscapesConverter.convert, label_map='cityscapes', + save_images=True), test_dir) + + def test_can_save_cityscapes_dataset_with_strange_id(self): + class TestExtractor(TestExtractorBase): + def __iter__(self): + return iter([ + DatasetItem(id='a/b/1', subset='test', + image=np.ones((1, 5, 3)), annotations=[ + Mask(image=np.array([[1, 0, 0, 1, 1]]), label=3, id=3, + attributes={'is_crowd': True}), + Mask(image=np.array([[0, 1, 1, 0, 0]]), label=24, id=1, + attributes={'is_crowd': False}), + ]), + ]) + + with TestDir() as test_dir: + self._test_save_and_load(TestExtractor(), + partial(CityscapesConverter.convert, label_map='cityscapes', + save_images=True), test_dir) + + def test_can_save_with_no_masks(self): + class TestExtractor(TestExtractorBase): + def __iter__(self): + return iter([ + DatasetItem(id='city_1_2', subset='test', + image=np.ones((2, 5, 3)), + ), + ]) + + with TestDir() as test_dir: + self._test_save_and_load(TestExtractor(), + partial(CityscapesConverter.convert, label_map='cityscapes', + save_images=True), test_dir) + + def test_dataset_with_source_labelmap_undefined(self): + class SrcExtractor(TestExtractorBase): + def __iter__(self): + yield DatasetItem(id=1, image=np.ones((1, 5, 3)), annotations=[ + Mask(image=np.array([[1, 0, 0, 1, 1]]), label=1, id=1, + attributes={'is_crowd': False}), + 
Mask(image=np.array([[0, 1, 1, 0, 0]]), label=2, id=2, + attributes={'is_crowd': False}), + ]) + + def categories(self): + label_cat = LabelCategories() + label_cat.add('background') + label_cat.add('Label_1') + label_cat.add('label_2') + return { + AnnotationType.label: label_cat, + } + + class DstExtractor(TestExtractorBase): + def __iter__(self): + yield DatasetItem(id=1, image=np.ones((1, 5, 3)), annotations=[ + Mask(image=np.array([[1, 0, 0, 1, 1]]), + attributes={'is_crowd': False}, id=1, + label=self._label('Label_1')), + Mask(image=np.array([[0, 1, 1, 0, 0]]), + attributes={'is_crowd': False}, id=2, + label=self._label('label_2')), + ]) + + def categories(self): + label_map = OrderedDict() + label_map['background'] = None + label_map['Label_1'] = None + label_map['label_2'] = None + return Cityscapes.make_cityscapes_categories(label_map) + + with TestDir() as test_dir: + self._test_save_and_load(SrcExtractor(), + partial(CityscapesConverter.convert, label_map='source', + save_images=True), test_dir, target_dataset=DstExtractor()) + + def test_dataset_with_source_labelmap_defined(self): + class SrcExtractor(TestExtractorBase): + def __iter__(self): + yield DatasetItem(id=1, image=np.ones((1, 5, 3)), annotations=[ + Mask(image=np.array([[1, 0, 0, 1, 1]]), label=1, id=1, + attributes={'is_crowd': False}), + Mask(image=np.array([[0, 1, 1, 0, 0]]), label=2, id=2, + attributes={'is_crowd': False}), + ]) + + def categories(self): + label_map = OrderedDict() + label_map['background'] = (0, 0, 0) + label_map['label_1'] = (1, 2, 3) + label_map['label_2'] = (3, 2, 1) + return Cityscapes.make_cityscapes_categories(label_map) + + class DstExtractor(TestExtractorBase): + def __iter__(self): + yield DatasetItem(id=1, image=np.ones((1, 5, 3)), annotations=[ + Mask(image=np.array([[1, 0, 0, 1, 1]]), + attributes={'is_crowd': False}, id=1, + label=self._label('label_1')), + Mask(image=np.array([[0, 1, 1, 0, 0]]), + attributes={'is_crowd': False}, id=2, + 
label=self._label('label_2')), + ]) + + def categories(self): + label_map = OrderedDict() + label_map['background'] = (0, 0, 0) + label_map['label_1'] = (1, 2, 3) + label_map['label_2'] = (3, 2, 1) + return Cityscapes.make_cityscapes_categories(label_map) + + with TestDir() as test_dir: + self._test_save_and_load(SrcExtractor(), + partial(CityscapesConverter.convert, label_map='source', + save_images=True), test_dir, target_dataset=DstExtractor()) + + def test_can_save_and_load_image_with_arbitrary_extension(self): + class TestExtractor(TestExtractorBase): + def __iter__(self): + return iter([ + DatasetItem(id='q/1', image=Image(path='q/1.JPEG', + data=np.zeros((4, 3, 3)))), + + DatasetItem(id='a/b/c/2', image=Image( + path='a/b/c/2.bmp', data=np.ones((1, 5, 3)) + ), annotations=[ + Mask(image=np.array([[1, 0, 0, 1, 0]]), label=0, id=0, + attributes={'is_crowd': True}), + Mask(image=np.array([[0, 1, 1, 0, 1]]), label=1, id=1, + attributes={'is_crowd': True}), + ]), + ]) + + def categories(self): + label_map = OrderedDict() + label_map['a'] = None + label_map['b'] = None + return Cityscapes.make_cityscapes_categories(label_map) + + with TestDir() as test_dir: + self._test_save_and_load(TestExtractor(), + partial(CityscapesConverter.convert, save_images=True), + test_dir, require_images=True) diff --git a/tests/test_coco_format.py b/tests/test_coco_format.py index ae24b4d88e3e..c1b033d4f425 100644 --- a/tests/test_coco_format.py +++ b/tests/test_coco_format.py @@ -17,16 +17,20 @@ CocoInstancesConverter, CocoPersonKeypointsConverter, CocoLabelsConverter, + CocoPanopticConverter, + CocoStuffConverter, ) from datumaro.plugins.coco_format.importer import CocoImporter from datumaro.util.image import Image from datumaro.util.test_utils import (TestDir, compare_datasets, test_save_and_load) - +from .requirements import Requirements, mark_requirement DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'coco_dataset') + class CocoImporterTest(TestCase): + 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import_instances(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id='000000000001', image=np.ones((10, 5, 3)), @@ -49,6 +53,7 @@ def test_can_import_instances(self): compare_datasets(self, expected_dataset, dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import_captions(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train', @@ -72,6 +77,7 @@ def test_can_import_captions(self): compare_datasets(self, expected_dataset, dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import_labels(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train', @@ -86,6 +92,7 @@ def test_can_import_labels(self): compare_datasets(self, expected_dataset, dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import_points(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train', @@ -131,6 +138,7 @@ def test_can_import_points(self): compare_datasets(self, expected_dataset, dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import_image_info(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id=1, image=Image(path='1.jpg', size=(10, 15)), @@ -142,6 +150,48 @@ def test_can_import_image_info(self): compare_datasets(self, expected_dataset, dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_import_panoptic(self): + expected_dataset = Dataset.from_iterable([ + DatasetItem(id='000000000001', + image=np.ones((1, 5, 3)), + subset='val', + attributes={'id': 40}, + annotations=[ + Mask(image=np.array([[0, 0, 1, 1, 0]]), label=3, + id=7, group=7, attributes={'is_crowd': False}), + Mask(image=np.array([[0, 1, 0, 0, 1]]), label=1, + id=20, group=20, attributes={'is_crowd': True}), + ] + ), + ], categories=['a', 'b', 'c', 'd']) + + dataset = Dataset.import_from( + osp.join(DUMMY_DATASET_DIR, 'coco_panoptic'), 'coco') + + 
compare_datasets(self, expected_dataset, dataset, require_images=True) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_import_stuff(self): + expected_dataset = Dataset.from_iterable([ + DatasetItem(id='000000000001', image=np.ones((10, 5, 3)), + subset='val', attributes={'id': 1}, + annotations=[ + Mask(np.array( + [[1, 0, 0, 1, 0]] * 5 + + [[1, 1, 1, 1, 0]] * 5 + ), label=0, + id=2, group=2, attributes={'is_crowd': False}), + ] + ), + ], categories=['TEST',]) + + dataset = Dataset.import_from( + osp.join(DUMMY_DATASET_DIR, 'coco_stuff'), 'coco') + + compare_datasets(self, expected_dataset, dataset) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect(self): self.assertTrue(CocoImporter.detect( osp.join(DUMMY_DATASET_DIR, 'coco_instances'))) @@ -153,6 +203,7 @@ def _test_save_and_load(self, source_dataset, converter, test_dir, importer='coco', target_dataset=target_dataset, importer_args=importer_args, **kwargs) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_captions(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train', @@ -175,6 +226,7 @@ def test_can_save_and_load_captions(self): self._test_save_and_load(expected_dataset, CocoCaptionsConverter.convert, test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_instances(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train', image=np.ones((4, 4, 3)), @@ -255,6 +307,82 @@ def test_can_save_and_load_instances(self): CocoInstancesConverter.convert, test_dir, target_dataset=target_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_panoptic(self): + dataset = Dataset.from_iterable([ + DatasetItem(id=1, subset='train', image=np.ones((4, 4, 3)), + annotations=[ + Mask(image=np.array([ + [0, 1, 0, 0], + [0, 1, 0, 0], + [0, 1, 1, 1], + [0, 0, 0, 0] + ]), + attributes={ 'is_crowd': False }, + label=4, group=3, id=3), + ], 
attributes={'id': 1}), + + DatasetItem(id=2, subset='val', image=np.ones((5, 5, 3)), + annotations=[ + Mask(image=np.array([ + [0, 0, 0, 0, 0], + [1, 1, 1, 0, 0], + [1, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0] + ]), + attributes={ 'is_crowd': False }, + label=4, group=3, id=3), + Mask(image=np.array([ + [0, 0, 0, 0, 1], + [0, 0, 0, 0, 1], + [0, 0, 0, 0, 1], + [0, 0, 0, 0, 1], + [0, 0, 0, 0, 1] + ]), + attributes={ 'is_crowd': False }, + label=2, group=2, id=2), + ], attributes={'id': 2}), + ], categories=[str(i) for i in range(10)]) + + with TestDir() as test_dir: + self._test_save_and_load(dataset, + partial(CocoPanopticConverter.convert, save_images=True), + test_dir, require_images=True) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_stuff(self): + dataset = Dataset.from_iterable([ + DatasetItem(id=1, subset='train', image=np.ones((4, 4, 3)), + annotations=[ + Mask(np.array([ + [0, 1, 0, 0], + [0, 1, 0, 0], + [0, 1, 1, 1], + [0, 0, 0, 0]], + ), + attributes={ 'is_crowd': False }, + label=4, group=3, id=3), + ], attributes={'id': 2}), + + DatasetItem(id=2, subset='val', image=np.ones((4, 4, 3)), + annotations=[ + Mask(np.array([ + [0, 0, 0, 0], + [1, 1, 1, 0], + [1, 1, 0, 0], + [0, 0, 0, 0]], + ), + attributes={ 'is_crowd': False }, + label=4, group=3, id=3), + ], attributes={'id': 1}), + ], categories=[str(i) for i in range(10)]) + + with TestDir() as test_dir: + self._test_save_and_load(dataset, + CocoStuffConverter.convert, test_dir) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_merge_polygons_on_loading(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=1, image=np.zeros((6, 10, 3)), @@ -292,6 +420,7 @@ def test_can_merge_polygons_on_loading(self): importer_args={'merge_instance_polygons': True}, target_dataset=target_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_crop_covered_segments(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=1, 
image=np.zeros((5, 5, 3)), @@ -335,7 +464,24 @@ def test_can_crop_covered_segments(self): partial(CocoInstancesConverter.convert, crop_covered=True), test_dir, target_dataset=target_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_convert_polygons_to_mask(self): + """ + Description: + Ensure that the dataset polygon annotation can be properly converted into dataset segmentation mask. + + Expected results: + Dataset segmentation mask converted from dataset polygon annotation is equal to expected mask. + + Steps: + 1. Prepare dataset with polygon annotation (source dataset) + 2. Prepare dataset with expected mask segmentation mode (target dataset) + 3. Convert source dataset to target, with conversion of annotation from polygon to mask. Verify that result + segmentation mask is equal to expected mask. + + """ + + # 1. Prepare dataset with polygon annotation (source dataset) source_dataset = Dataset.from_iterable([ DatasetItem(id=1, image=np.zeros((6, 10, 3)), annotations=[ @@ -347,6 +493,7 @@ def test_can_convert_polygons_to_mask(self): ), ], categories=[str(i) for i in range(10)]) + # 2. Prepare dataset with expected mask segmentation mode (target dataset) target_dataset = Dataset.from_iterable([ DatasetItem(id=1, image=np.zeros((6, 10, 3)), annotations=[ @@ -366,11 +513,14 @@ def test_can_convert_polygons_to_mask(self): ), ], categories=[str(i) for i in range(10)]) + # 3. Convert source dataset to target, with conversion of annotation from polygon to mask. Verify that result + # segmentation mask is equal to expected mask. 
with TestDir() as test_dir: self._test_save_and_load(source_dataset, partial(CocoInstancesConverter.convert, segmentation_mode='mask'), test_dir, target_dataset=target_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_convert_masks_to_polygons(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=1, image=np.zeros((5, 10, 3)), @@ -408,6 +558,7 @@ def test_can_convert_masks_to_polygons(self): test_dir, target_dataset=target_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_images(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train', attributes={'id': 1}), @@ -424,6 +575,7 @@ def test_can_save_and_load_images(self): self._test_save_and_load(expected_dataset, CocoImageInfoConverter.convert, test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id='кириллица с пробелом', subset='train', @@ -434,6 +586,7 @@ def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): self._test_save_and_load(expected_dataset, CocoImageInfoConverter.convert, test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_labels(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train', @@ -447,6 +600,7 @@ def test_can_save_and_load_labels(self): self._test_save_and_load(expected_dataset, CocoLabelsConverter.convert, test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_keypoints(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train', image=np.zeros((5, 5, 3)), @@ -522,6 +676,7 @@ def test_can_save_and_load_keypoints(self): CocoPersonKeypointsConverter.convert, test_dir, target_dataset=target_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_no_subsets(self): test_dataset = Dataset.from_iterable([ 
DatasetItem(id=1, attributes={'id': 1}), @@ -532,6 +687,7 @@ def test_can_save_dataset_with_no_subsets(self): self._test_save_and_load(test_dataset, CocoConverter.convert, test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_image_info(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id=1, image=Image(path='1.jpg', size=(10, 15)), @@ -542,6 +698,7 @@ def test_can_save_dataset_with_image_info(self): self._test_save_and_load(expected_dataset, CocoImageInfoConverter.convert, test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_relative_paths(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id='1', image=np.ones((4, 2, 3)), @@ -557,6 +714,7 @@ def test_relative_paths(self): partial(CocoImageInfoConverter.convert, save_images=True), test_dir, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_arbitrary_extension(self): expected = Dataset.from_iterable([ DatasetItem(id='q/1', image=Image(path='q/1.JPEG', @@ -567,9 +725,10 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): with TestDir() as test_dir: self._test_save_and_load(expected, - partial(CocoConverter.convert, save_images=True), + partial(CocoImageInfoConverter.convert, save_images=True), test_dir, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_preserve_coco_ids(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id='some/name1', image=np.ones((4, 2, 3)), @@ -581,6 +740,7 @@ def test_preserve_coco_ids(self): partial(CocoImageInfoConverter.convert, save_images=True), test_dir, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_annotation_attributes(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id=1, image=np.ones((4, 2, 3)), annotations=[ @@ -593,6 +753,7 @@ def test_annotation_attributes(self): self._test_save_and_load(expected_dataset, CocoConverter.convert, 
test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_auto_annotation_ids(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=2, image=np.ones((4, 2, 3)), annotations=[ @@ -611,6 +772,7 @@ def test_auto_annotation_ids(self): self._test_save_and_load(source_dataset, CocoConverter.convert, test_dir, target_dataset=target_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_reindex(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=2, image=np.ones((4, 2, 3)), annotations=[ @@ -630,6 +792,7 @@ def test_reindex(self): partial(CocoConverter.convert, reindex=True), test_dir, target_dataset=target_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_images_in_single_dir(self): dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train', image=np.ones((2, 4, 3)), @@ -643,6 +806,7 @@ def test_can_save_images_in_single_dir(self): test_dir, require_images=True) self.assertTrue(osp.isfile(osp.join(test_dir, 'images', '1.jpg'))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_images_in_separate_dirs(self): dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train', image=np.ones((2, 4, 3)), @@ -657,6 +821,7 @@ def test_can_save_images_in_separate_dirs(self): self.assertTrue(osp.isfile(osp.join( test_dir, 'images', 'train', '1.jpg'))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_inplace_save_writes_only_updated_data(self): with TestDir() as path: # generate initial dataset @@ -683,4 +848,4 @@ def test_inplace_save_writes_only_updated_data(self): self.assertFalse(osp.isfile(osp.join( path, 'annotations', 'image_info_c.json'))) self.assertTrue(osp.isfile(osp.join(path, 'images', 'a', '2.jpg'))) - self.assertFalse(osp.isfile(osp.join(path, 'images', 'c', '3.jpg'))) \ No newline at end of file + self.assertFalse(osp.isfile(osp.join(path, 'images', 'c', '3.jpg'))) diff --git a/tests/test_command_targets.py b/tests/test_command_targets.py index 
5b8a69f31829..f5241f1889a3 100644 --- a/tests/test_command_targets.py +++ b/tests/test_command_targets.py @@ -8,9 +8,11 @@ ImageTarget, SourceTarget from datumaro.util.image import save_image from datumaro.util.test_utils import TestDir +from .requirements import Requirements, mark_requirement class CommandTargetsTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_image_false_when_no_file(self): target = ImageTarget() @@ -18,6 +20,7 @@ def test_image_false_when_no_file(self): self.assertFalse(status) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_image_false_when_false(self): with TestDir() as test_dir: path = osp.join(test_dir, 'test.jpg') @@ -30,6 +33,7 @@ def test_image_false_when_false(self): self.assertFalse(status) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_image_true_when_true(self): with TestDir() as test_dir: path = osp.join(test_dir, 'test.jpg') @@ -41,6 +45,7 @@ def test_image_true_when_true(self): self.assertTrue(status) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_project_false_when_no_file(self): target = ProjectTarget() @@ -48,6 +53,7 @@ def test_project_false_when_no_file(self): self.assertFalse(status) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_project_false_when_no_name(self): target = ProjectTarget(project=Project()) @@ -55,6 +61,7 @@ def test_project_false_when_no_name(self): self.assertFalse(status) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_project_true_when_project_file(self): with TestDir() as test_dir: path = osp.join(test_dir, 'test.jpg') @@ -66,6 +73,7 @@ def test_project_true_when_project_file(self): self.assertTrue(status) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_project_true_when_project_name(self): project_name = 'qwerty' project = Project({ @@ -77,6 +85,7 @@ def test_project_true_when_project_name(self): self.assertTrue(status) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def 
test_project_false_when_not_project_name(self): project_name = 'qwerty' project = Project({ @@ -88,6 +97,7 @@ def test_project_false_when_not_project_name(self): self.assertFalse(status) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_project_false_when_not_project_file(self): with TestDir() as test_dir: path = osp.join(test_dir, 'test.jpg') @@ -100,6 +110,7 @@ def test_project_false_when_not_project_file(self): self.assertFalse(status) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_source_false_when_no_project(self): target = SourceTarget() @@ -107,6 +118,7 @@ def test_source_false_when_no_project(self): self.assertFalse(status) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_source_true_when_source_exists(self): source_name = 'qwerty' project = Project() @@ -117,6 +129,7 @@ def test_source_true_when_source_exists(self): self.assertTrue(status) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_source_false_when_source_doesnt_exist(self): source_name = 'qwerty' project = Project() diff --git a/tests/test_config.py b/tests/test_config.py index 32332b3545d0..2fee6b237961 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,9 +1,11 @@ from unittest import TestCase from datumaro.components.config import Config, DictConfig, SchemaBuilder +from .requirements import Requirements, mark_requirement class ConfigTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_produce_multilayer_config_from_dict(self): schema_low = SchemaBuilder() \ .add('options', dict) \ diff --git a/tests/test_cvat_format.py b/tests/test_cvat_format.py index 5b2c60e130bc..d95ed26304c2 100644 --- a/tests/test_cvat_format.py +++ b/tests/test_cvat_format.py @@ -14,7 +14,7 @@ from datumaro.util.image import Image from datumaro.util.test_utils import (TestDir, compare_datasets, test_save_and_load) - +from .requirements import Requirements, mark_requirement DUMMY_IMAGE_DATASET_DIR = osp.join(osp.dirname(__file__), 
'assets', 'cvat_dataset', 'for_images') @@ -23,12 +23,15 @@ 'assets', 'cvat_dataset', 'for_video') class CvatImporterTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect_image(self): self.assertTrue(CvatImporter.detect(DUMMY_IMAGE_DATASET_DIR)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect_video(self): self.assertTrue(CvatImporter.detect(DUMMY_VIDEO_DATASET_DIR)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_load_image(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id='img0', subset='train', @@ -61,6 +64,7 @@ def test_can_load_image(self): compare_datasets(self, expected_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_load_video(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id='frame_000010', subset='annotations', @@ -147,6 +151,7 @@ def _test_save_and_load(self, source_dataset, converter, test_dir, importer='cvat', target_dataset=target_dataset, importer_args=importer_args, **kwargs) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load(self): src_label_cat = LabelCategories(attributes={'occluded', 'common'}) for i in range(10): @@ -237,6 +242,7 @@ def test_can_save_and_load(self): partial(CvatConverter.convert, save_images=True), test_dir, target_dataset=target_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_allow_undeclared_attrs(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=0, annotations=[ @@ -260,6 +266,7 @@ def test_can_allow_undeclared_attrs(self): partial(CvatConverter.convert, allow_undeclared_attrs=True), test_dir, target_dataset=target_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_relative_paths(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='1', image=np.ones((4, 2, 3))), @@ -281,6 +288,7 @@ def test_relative_paths(self): partial(CvatConverter.convert, save_images=True), test_dir, 
target_dataset=target_dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): label_categories = LabelCategories(attributes={'occluded'}) for i in range(10): @@ -314,6 +322,7 @@ def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): partial(CvatConverter.convert, save_images=True), test_dir, target_dataset=target_dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_arbitrary_extension(self): expected = Dataset.from_iterable([ DatasetItem('q/1', image=Image(path='q/1.JPEG', @@ -326,7 +335,12 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): self._test_save_and_load(expected, partial(CvatConverter.convert, save_images=True), test_dir, require_images=True) + self.assertTrue(osp.isfile( + osp.join(test_dir, 'images', 'q', '1.JPEG'))) + self.assertTrue(osp.isfile( + osp.join(test_dir, 'images', 'a', 'b', 'c', '2.bmp'))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_preserve_frame_ids(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id='some/name1', image=np.ones((4, 2, 3)), @@ -337,6 +351,7 @@ def test_preserve_frame_ids(self): self._test_save_and_load(expected_dataset, CvatConverter.convert, test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_reindex(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='some/name1', image=np.ones((4, 2, 3)), @@ -353,6 +368,7 @@ def test_reindex(self): partial(CvatConverter.convert, reindex=True), test_dir, target_dataset=expected_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_inplace_save_writes_only_updated_data(self): with TestDir() as path: # generate initial dataset diff --git a/tests/test_dataset.py b/tests/test_dataset.py index ac5e6b65a6f8..f8f7f0a0852c 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -15,9 +15,11 @@ LabelCategories, 
AnnotationType, Transform) from datumaro.util.image import Image from datumaro.util.test_utils import TestDir, compare_datasets +from .requirements import Requirements, mark_requirement class DatasetTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_create_from_extractors(self): class SrcExtractor1(Extractor): def __iter__(self): @@ -56,6 +58,7 @@ def __iter__(self): compare_datasets(self, DstExtractor(), dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_create_from_iterable(self): class TestExtractor(Extractor): def __iter__(self): @@ -86,6 +89,33 @@ def categories(self): compare_datasets(self, TestExtractor(), actual) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_join_datasets_with_empty_categories(self): + expected = Dataset.from_iterable([ + DatasetItem(1, annotations=[ + Label(0), + Bbox(1, 2, 3, 4), + Caption('hello world'), + ]) + ], categories=['a']) + + src1 = Dataset.from_iterable([ + DatasetItem(1, annotations=[ Bbox(1, 2, 3, 4, label=None) ]) + ], categories=[]) + + src2 = Dataset.from_iterable([ + DatasetItem(1, annotations=[ Label(0) ]) + ], categories=['a']) + + src3 = Dataset.from_iterable([ + DatasetItem(1, annotations=[ Caption('hello world') ]) + ]) + + actual = Dataset.from_extractors(src1, src2, src3) + + compare_datasets(self, expected, actual) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=1, annotations=[ Label(2) ]), @@ -98,6 +128,7 @@ def test_can_save_and_load(self): compare_datasets(self, source_dataset, loaded_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect(self): env = Environment() env.importers.items = {DEFAULT_FORMAT: env.importers[DEFAULT_FORMAT]} @@ -114,6 +145,7 @@ def test_can_detect(self): self.assertEqual(DEFAULT_FORMAT, detected_format) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect_and_import(self): env 
= Environment() env.importers.items = {DEFAULT_FORMAT: env.importers[DEFAULT_FORMAT]} @@ -132,6 +164,7 @@ def test_can_detect_and_import(self): self.assertEqual(imported_dataset.format, DEFAULT_FORMAT) compare_datasets(self, source_dataset, imported_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_export_by_string_format_name(self): env = Environment() env.converters.items = {'qq': env.converters[DEFAULT_FORMAT]} @@ -143,6 +176,7 @@ def test_can_export_by_string_format_name(self): with TestDir() as test_dir: dataset.export(format='qq', save_dir=test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_transform_by_string_name(self): expected = Dataset.from_iterable([ DatasetItem(id=1, annotations=[ Label(2) ], attributes={'qq': 1}), @@ -163,6 +197,7 @@ def transform_item(self, item): compare_datasets(self, expected, actual) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_join_annotations(self): a = Dataset.from_iterable([ DatasetItem(id=1, subset='train', annotations=[ @@ -190,6 +225,7 @@ def test_can_join_annotations(self): compare_datasets(self, expected, merged) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_cant_join_different_categories(self): s1 = Dataset.from_iterable([], categories=['a', 'b']) s2 = Dataset.from_iterable([], categories=['b', 'a']) @@ -197,6 +233,7 @@ def test_cant_join_different_categories(self): with self.assertRaisesRegex(DatumaroError, "different categories"): Dataset.from_extractors(s1, s2) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_join_datasets(self): s1 = Dataset.from_iterable([ DatasetItem(0), DatasetItem(1) ]) s2 = Dataset.from_iterable([ DatasetItem(1), DatasetItem(2) ]) @@ -208,6 +245,7 @@ def test_can_join_datasets(self): compare_datasets(self, expected, actual) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_inplace_save_writes_only_updated_data(self): with TestDir() as path: # generate initial dataset @@ -232,6 +270,7 @@ 
def test_inplace_save_writes_only_updated_data(self): self.assertFalse(osp.isfile(osp.join(path, 'annotations', 'b.json'))) self.assertTrue(osp.isfile(osp.join(path, 'annotations', 'c.json'))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_track_modifications_on_addition(self): dataset = Dataset.from_iterable([ DatasetItem(1), @@ -244,6 +283,7 @@ def test_can_track_modifications_on_addition(self): self.assertTrue(dataset.is_modified) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_track_modifications_on_removal(self): dataset = Dataset.from_iterable([ DatasetItem(1), @@ -256,6 +296,7 @@ def test_can_track_modifications_on_removal(self): self.assertTrue(dataset.is_modified) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_create_patch(self): expected = Dataset.from_iterable([ DatasetItem(2), @@ -290,6 +331,7 @@ def test_can_create_patch(self): compare_datasets(self, expected, dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_create_more_precise_patch_when_cached(self): expected = Dataset.from_iterable([ DatasetItem(2), @@ -325,6 +367,7 @@ def test_can_create_more_precise_patch_when_cached(self): compare_datasets(self, expected, dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_do_lazy_put_and_remove(self): iter_called = False class TestExtractor(Extractor): @@ -350,6 +393,7 @@ def __iter__(self): self.assertTrue(dataset.is_cache_initialized) self.assertTrue(iter_called) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_put(self): dataset = Dataset() @@ -357,6 +401,7 @@ def test_can_put(self): self.assertTrue((1, '') in dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_do_lazy_get_on_updated_item(self): iter_called = False class TestExtractor(Extractor): @@ -374,6 +419,7 @@ def __iter__(self): self.assertTrue((2, '') in dataset) self.assertFalse(iter_called) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def 
test_can_switch_eager_and_lazy_with_cm_global(self): iter_called = False class TestExtractor(Extractor): @@ -390,6 +436,7 @@ def __iter__(self): self.assertTrue(iter_called) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_switch_eager_and_lazy_with_cm_local(self): iter_called = False class TestExtractor(Extractor): @@ -410,6 +457,7 @@ def __iter__(self): self.assertTrue(iter_called) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_do_lazy_select(self): iter_called = False class TestExtractor(Extractor): @@ -433,7 +481,8 @@ def __iter__(self): self.assertTrue(iter_called) - def test_can_chain_lazy_tranforms(self): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_chain_lazy_transforms(self): iter_called = False class TestExtractor(Extractor): def __iter__(self): @@ -461,12 +510,14 @@ def transform_item(self, item): self.assertTrue(iter_called) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_raises_when_repeated_items_in_source(self): dataset = Dataset.from_iterable([DatasetItem(0), DatasetItem(0)]) with self.assertRaises(RepeatedItemError): dataset.init_cache() + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_check_item_existence(self): dataset = Dataset.from_iterable([ DatasetItem(0, subset='a'), DatasetItem(1) @@ -479,6 +530,7 @@ def test_can_check_item_existence(self): self.assertTrue(1 in dataset) self.assertFalse(0 in dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_put_with_id_override(self): dataset = Dataset.from_iterable([]) @@ -486,6 +538,7 @@ def test_can_put_with_id_override(self): self.assertTrue((2, 'b') in dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_compute_cache_with_empty_source(self): dataset = Dataset.from_iterable([]) dataset.put(DatasetItem(2)) @@ -494,6 +547,7 @@ def test_can_compute_cache_with_empty_source(self): self.assertTrue(2 in dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def 
test_cant_do_partial_caching_in_get_when_default(self): iter_called = 0 class TestExtractor(Extractor): @@ -514,6 +568,7 @@ def __iter__(self): self.assertEqual(1, iter_called) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_do_partial_caching_in_get_when_redefined(self): iter_called = 0 get_called = 0 @@ -541,6 +596,7 @@ def get(self, id, subset=None): #pylint: disable=redefined-builtin self.assertEqual(0, iter_called) self.assertEqual(2, get_called) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_binds_on_save(self): dataset = Dataset.from_iterable([DatasetItem(1)]) @@ -553,6 +609,7 @@ def test_binds_on_save(self): self.assertEqual(dataset.data_path, test_dir) self.assertEqual(dataset.format, DEFAULT_FORMAT) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_flushes_changes_on_save(self): dataset = Dataset.from_iterable([]) dataset.put(DatasetItem(1)) @@ -564,6 +621,7 @@ def test_flushes_changes_on_save(self): self.assertFalse(dataset.is_modified) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_does_not_load_images_on_saving(self): # Issue https://github.com/openvinotoolkit/datumaro/issues/177 # Missing image metadata (size etc.) 
can lead to image loading on @@ -585,6 +643,7 @@ def test_loader(): class DatasetItemTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_ctor_requires_id(self): with self.assertRaises(Exception): # pylint: disable=no-value-for-parameter @@ -592,6 +651,7 @@ def test_ctor_requires_id(self): # pylint: enable=no-value-for-parameter @staticmethod + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_ctors_with_image(): for args in [ { 'id': 0, 'image': None }, @@ -605,6 +665,7 @@ def test_ctors_with_image(): class DatasetFilterTest(TestCase): @staticmethod + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_item_representations(): item = DatasetItem(id=1, subset='subset', path=['a', 'b'], image=np.ones((5, 4, 3)), @@ -626,6 +687,7 @@ def test_item_representations(): encoded = DatasetItemEncoder.encode(item) DatasetItemEncoder.to_string(encoded) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_item_filter_can_be_applied(self): class TestExtractor(Extractor): def __iter__(self): @@ -638,6 +700,7 @@ def __iter__(self): self.assertEqual(2, len(filtered)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_annotations_filter_can_be_applied(self): class SrcExtractor(Extractor): def __iter__(self): @@ -672,6 +735,7 @@ def __iter__(self): self.assertListEqual(list(filtered), list(DstExtractor())) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_annotations_filter_can_remove_empty_items(self): source = Dataset.from_iterable([ DatasetItem(id=0), diff --git a/tests/test_datumaro_format.py b/tests/test_datumaro_format.py index b3d3a950b2a9..ae5a88781fad 100644 --- a/tests/test_datumaro_format.py +++ b/tests/test_datumaro_format.py @@ -16,6 +16,7 @@ from datumaro.util.image import Image from datumaro.util.test_utils import (TestDir, compare_datasets_strict, test_save_and_load) +from .requirements import Requirements, mark_requirement class DatumaroConverterTest(TestCase): @@ -27,6 +28,7 @@ def 
_test_save_and_load(self, source_dataset, converter, test_dir, compare=compare_datasets_strict) @property + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_dataset(self): label_categories = LabelCategories(attributes={'a', 'b', 'score'}) for i in range(5): @@ -84,17 +86,20 @@ def test_dataset(self): AnnotationType.points: points_categories, }) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load(self): with TestDir() as test_dir: self._test_save_and_load(self.test_dataset, partial(DatumaroConverter.convert, save_images=True), test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect(self): with TestDir() as test_dir: DatumaroConverter.convert(self.test_dataset, save_dir=test_dir) self.assertTrue(DatumaroImporter.detect(test_dir)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_relative_paths(self): test_dataset = Dataset.from_iterable([ DatasetItem(id='1', image=np.ones((4, 2, 3))), @@ -106,6 +111,7 @@ def test_relative_paths(self): self._test_save_and_load(test_dataset, partial(DatumaroConverter.convert, save_images=True), test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): test_dataset = Dataset.from_iterable([ DatasetItem(id='кириллица с пробелом', image=np.ones((4, 2, 3))), @@ -116,6 +122,7 @@ def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): partial(DatumaroConverter.convert, save_images=True), test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_arbitrary_extension(self): expected = Dataset.from_iterable([ DatasetItem(id='q/1', image=Image(path='q/1.JPEG', @@ -129,6 +136,7 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): partial(DatumaroConverter.convert, save_images=True), test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_inplace_save_writes_only_updated_data(self): with TestDir() as path: # generate 
initial dataset diff --git a/tests/test_diff.py b/tests/test_diff.py index bb25991ed270..83dfcde1b526 100644 --- a/tests/test_diff.py +++ b/tests/test_diff.py @@ -6,9 +6,11 @@ from datumaro.components.operations import DistanceComparator, ExactComparator from unittest import TestCase +from .requirements import Requirements, mark_requirement class DistanceComparatorTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_no_bbox_diff_with_same_item(self): detections = 3 anns = [ @@ -31,6 +33,7 @@ def test_no_bbox_diff_with_same_item(self): self.assertLess(iou_thresh, a_bbox.iou(b_bbox)) self.assertEqual(a_bbox.label, b_bbox.label) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_find_bbox_with_wrong_label(self): detections = 3 class_count = 2 @@ -57,6 +60,7 @@ def test_can_find_bbox_with_wrong_label(self): self.assertLess(iou_thresh, a_bbox.iou(b_bbox)) self.assertEqual((a_bbox.label + 1) % class_count, b_bbox.label) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_find_missing_boxes(self): detections = 3 class_count = 2 @@ -80,6 +84,7 @@ def test_can_find_missing_boxes(self): self.assertEqual(len(item2.annotations), len(b_greater)) self.assertEqual(0, len(matches)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_no_label_diff_with_same_item(self): detections = 3 anns = [ Label(i) for i in range(detections) ] @@ -92,6 +97,7 @@ def test_no_label_diff_with_same_item(self): self.assertEqual(0, len(b_greater)) self.assertEqual(len(item.annotations), len(matches)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_find_wrong_label(self): item1 = DatasetItem(id=1, annotations=[ Label(0), @@ -111,6 +117,7 @@ def test_can_find_wrong_label(self): self.assertEqual(2, len(b_greater)) self.assertEqual(1, len(matches)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_match_points(self): item1 = DatasetItem(id=1, annotations=[ Points([1, 2, 2, 0, 1, 1], label=0), @@ -132,6 +139,7 @@ def 
test_can_match_points(self): self.assertEqual(0, len(mismatches)) class ExactComparatorTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_class_comparison(self): a = Dataset.from_iterable([], categories=['a', 'b', 'c']) b = Dataset.from_iterable([], categories=['b', 'c']) @@ -141,6 +149,7 @@ def test_class_comparison(self): self.assertEqual(1, len(errors), errors) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_item_comparison(self): a = Dataset.from_iterable([ DatasetItem(id=1, subset='train'), @@ -159,6 +168,7 @@ def test_item_comparison(self): self.assertEqual({('3', DEFAULT_SUBSET_NAME)}, b_extra_items) self.assertEqual(1, len(errors), errors) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_annotation_comparison(self): a = Dataset.from_iterable([ DatasetItem(id=1, annotations=[ @@ -195,6 +205,7 @@ def test_annotation_comparison(self): self.assertEqual(2, len(unmatched), unmatched) self.assertEqual(0, len(errors), errors) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_image_comparison(self): a = Dataset.from_iterable([ DatasetItem(id=11, image=np.ones((5, 4, 3)), annotations=[ diff --git a/tests/test_icdar_format.py b/tests/test_icdar_format.py index 7559d6fe379f..21bd8f0e0fd2 100644 --- a/tests/test_icdar_format.py +++ b/tests/test_icdar_format.py @@ -16,23 +16,27 @@ from datumaro.util.image import Image from datumaro.util.test_utils import (TestDir, compare_datasets, test_save_and_load) - +from .requirements import Requirements, mark_requirement DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'icdar_dataset') class IcdarImporterTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect_word_recognition(self): self.assertTrue(IcdarWordRecognitionImporter.detect( osp.join(DUMMY_DATASET_DIR, 'word_recognition'))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect_text_localization(self): 
self.assertTrue(IcdarTextLocalizationImporter.detect( osp.join(DUMMY_DATASET_DIR, 'text_localization'))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect_text_segmentation(self): self.assertTrue(IcdarTextSegmentationImporter.detect( osp.join(DUMMY_DATASET_DIR, 'text_segmentation'))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import_captions(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id='word_1', subset='train', @@ -55,6 +59,7 @@ def test_can_import_captions(self): compare_datasets(self, expected_dataset, dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import_bboxes(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id='img_1', subset='train', @@ -79,6 +84,7 @@ def test_can_import_bboxes(self): compare_datasets(self, expected_dataset, dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import_masks(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id='1', subset='train', @@ -116,6 +122,7 @@ def _test_save_and_load(self, source_dataset, converter, test_dir, importer, importer, target_dataset=target_dataset, importer_args=importer_args, **kwargs) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_captions(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id='a/b/1', subset='train', @@ -133,6 +140,7 @@ def test_can_save_and_load_captions(self): partial(IcdarWordRecognitionConverter.convert, save_images=True), test_dir, 'icdar_word_recognition') + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_bboxes(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id='a/b/1', subset='train', @@ -159,6 +167,7 @@ def test_can_save_and_load_bboxes(self): partial(IcdarTextLocalizationConverter.convert, save_images=True), test_dir, 'icdar_text_localization') + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_masks(self): expected_dataset = 
Dataset.from_iterable([ DatasetItem(id='a/b/1', subset='train', @@ -192,6 +201,7 @@ def test_can_save_and_load_masks(self): partial(IcdarTextSegmentationConverter.convert, save_images=True), test_dir, 'icdar_text_segmentation') + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_with_no_subsets(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id=1, image=np.ones((8, 8, 3)), @@ -205,6 +215,7 @@ def test_can_save_and_load_with_no_subsets(self): IcdarTextLocalizationConverter.convert, test_dir, 'icdar_text_localization') + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id='кириллица с пробелом', @@ -221,6 +232,7 @@ def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): partial(converter.convert, save_images=True), test_dir, importer, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_arbitrary_extension(self): expected = Dataset.from_iterable([ DatasetItem(id='q/1', image=Image(path='q/1.JPEG', diff --git a/tests/test_image.py b/tests/test_image.py index 5f4ef81c4f95..1983b711d95c 100644 --- a/tests/test_image.py +++ b/tests/test_image.py @@ -6,6 +6,7 @@ import datumaro.util.image as image_module from datumaro.util.test_utils import TestDir +from .requirements import Requirements, mark_requirement class ImageOperationsTest(TestCase): @@ -15,6 +16,7 @@ def setUp(self): def tearDown(self): image_module._IMAGE_BACKEND = self.default_backend + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_save_and_load_backends(self): backends = image_module._IMAGE_BACKENDS for save_backend, load_backend, c in product(backends, backends, [1, 3]): @@ -34,6 +36,7 @@ def test_save_and_load_backends(self): self.assertTrue(np.array_equal(src_image, dst_image), 'save: %s, load: %s' % (save_backend, load_backend)) + 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_encode_and_decode_backends(self): backends = image_module._IMAGE_BACKENDS for save_backend, load_backend, c in product(backends, backends, [1, 3]): @@ -52,11 +55,13 @@ def test_encode_and_decode_backends(self): self.assertTrue(np.array_equal(src_image, dst_image), 'save: %s, load: %s' % (save_backend, load_backend)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_save_image_to_inexistent_dir_raises_error(self): with self.assertRaises(FileNotFoundError): image_module.save_image('some/path.jpg', np.ones((5, 4, 3)), create_dir=False) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_save_image_can_create_dir(self): with TestDir() as test_dir: path = osp.join(test_dir, 'some', 'path.jpg') diff --git a/tests/test_image_dir_format.py b/tests/test_image_dir_format.py index 1b056e8f3e5d..4cbea707ab12 100644 --- a/tests/test_image_dir_format.py +++ b/tests/test_image_dir_format.py @@ -9,9 +9,11 @@ from datumaro.plugins.image_dir_format import ImageDirConverter from datumaro.util.image import Image, save_image from datumaro.util.test_utils import TestDir, compare_datasets, test_save_and_load +from .requirements import Requirements, mark_requirement class ImageDirFormatTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_load(self): dataset = Dataset.from_iterable([ DatasetItem(id=1, image=np.ones((10, 6, 3))), @@ -22,6 +24,7 @@ def test_can_load(self): test_save_and_load(self, dataset, ImageDirConverter.convert, test_dir, importer='image_dir', require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_relative_paths(self): dataset = Dataset.from_iterable([ DatasetItem(id='1', image=np.ones((4, 2, 3))), @@ -33,6 +36,7 @@ def test_relative_paths(self): test_save_and_load(self, dataset, ImageDirConverter.convert, test_dir, importer='image_dir') + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def 
test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): dataset = Dataset.from_iterable([ DatasetItem(id='кириллица с пробелом', image=np.ones((4, 2, 3))), @@ -42,6 +46,7 @@ def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): test_save_and_load(self, dataset, ImageDirConverter.convert, test_dir, importer='image_dir') + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_arbitrary_extension(self): dataset = Dataset.from_iterable([ DatasetItem(id='q/1', image=Image(path='q/1.JPEG', @@ -54,6 +59,7 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): test_save_and_load(self, dataset, ImageDirConverter.convert, test_dir, importer='image_dir', require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_custom_extension(self): expected = Dataset.from_iterable([ DatasetItem(id='a/3', image=Image(path='a/3.qq', diff --git a/tests/test_imagenet_format.py b/tests/test_imagenet_format.py index 9a4da64a3fe6..6735f1812362 100644 --- a/tests/test_imagenet_format.py +++ b/tests/test_imagenet_format.py @@ -10,8 +10,11 @@ from datumaro.plugins.imagenet_format import ImagenetConverter, ImagenetImporter from datumaro.util.image import Image from datumaro.util.test_utils import TestDir, compare_datasets +from .requirements import Requirements, mark_requirement + class ImagenetFormatTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='1', @@ -35,6 +38,7 @@ def test_can_save_and_load(self): compare_datasets(self, source_dataset, parsed_dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_with_multiple_labels(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='1', @@ -57,6 +61,7 @@ def test_can_save_and_load_with_multiple_labels(self): compare_datasets(self, source_dataset, parsed_dataset, 
require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): source_dataset = Dataset.from_iterable([ DatasetItem(id="кириллица с пробелом", @@ -76,6 +81,7 @@ def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): compare_datasets(self, source_dataset, parsed_dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_arbitrary_extension(self): dataset = Dataset.from_iterable([ DatasetItem(id='a', image=Image(path='a.JPEG', @@ -95,6 +101,7 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'imagenet_dataset') class ImagenetImporterTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id='1', @@ -114,5 +121,6 @@ def test_can_import(self): compare_datasets(self, expected_dataset, dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect_imagenet(self): self.assertTrue(ImagenetImporter.detect(DUMMY_DATASET_DIR)) diff --git a/tests/test_imagenet_txt_format.py b/tests/test_imagenet_txt_format.py index 2c4d231f5898..15bfc2b81cd9 100644 --- a/tests/test_imagenet_txt_format.py +++ b/tests/test_imagenet_txt_format.py @@ -11,9 +11,11 @@ ImagenetTxtConverter, ImagenetTxtImporter from datumaro.util.image import Image from datumaro.util.test_utils import TestDir, compare_datasets +from .requirements import Requirements, mark_requirement class ImagenetTxtFormatTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='1', subset='train', @@ -36,6 +38,7 @@ def test_can_save_and_load(self): compare_datasets(self, source_dataset, parsed_dataset, require_images=True) + 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_with_multiple_labels(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='1', subset='train', @@ -57,6 +60,7 @@ def test_can_save_and_load_with_multiple_labels(self): compare_datasets(self, source_dataset, parsed_dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_no_subsets(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='a/b/c', image=np.zeros((8, 4, 3)), @@ -76,6 +80,7 @@ def test_can_save_dataset_with_no_subsets(self): compare_datasets(self, source_dataset, parsed_dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): dataset = Dataset.from_iterable([ DatasetItem(id="кириллица с пробелом", @@ -95,6 +100,7 @@ def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): compare_datasets(self, dataset, parsed_dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_arbitrary_extension(self): dataset = Dataset.from_iterable([ DatasetItem(id='a/1', image=Image(path='a/1.JPEG', @@ -114,6 +120,7 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'imagenet_txt_dataset') class ImagenetTxtImporterTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id='1', subset='train', image=np.zeros((8, 6, 3)), @@ -137,5 +144,6 @@ def test_can_import(self): compare_datasets(self, expected_dataset, dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect_imagenet(self): self.assertTrue(ImagenetTxtImporter.detect(DUMMY_DATASET_DIR)) diff --git a/tests/test_images.py b/tests/test_images.py index a003b8d426e2..a0c22d607c7c 100644 --- 
a/tests/test_images.py +++ b/tests/test_images.py @@ -7,9 +7,11 @@ from datumaro.util.image import (lazy_image, load_image, save_image, \ Image, ByteImage, encode_image) from datumaro.util.image_cache import ImageCache +from .requirements import Requirements, mark_requirement class LazyImageTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_cache_works(self): with TestDir() as test_dir: image = np.ones((100, 100, 3), dtype=np.uint8) @@ -23,6 +25,7 @@ def test_cache_works(self): self.assertFalse(non_caching_loader() is non_caching_loader()) class ImageCacheTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_cache_fifo_displacement(self): capacity = 2 cache = ImageCache(capacity) @@ -39,6 +42,7 @@ def test_cache_fifo_displacement(self): matches = sum([a is b for a, b in zip(first_request, second_request)]) self.assertEqual(matches, len(first_request) - 1) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_global_cache_is_accessible(self): loader = lazy_image(None, loader=lambda p: object()) @@ -47,6 +51,7 @@ def test_global_cache_is_accessible(self): self.assertEqual(ImageCache.get_instance().size(), 1) class ImageTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_lazy_image_shape(self): data = np.ones((5, 6, 3)) @@ -56,6 +61,7 @@ def test_lazy_image_shape(self): self.assertEqual((2, 4), image_lazy.size) self.assertEqual((5, 6), image_eager.size) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_ctors(self): with TestDir() as test_dir: path = osp.join(test_dir, 'path.png') @@ -82,6 +88,7 @@ def test_ctors(self): # pylint: enable=pointless-statement class BytesImageTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_lazy_image_shape(self): data = encode_image(np.ones((5, 6, 3)), 'png') @@ -91,6 +98,7 @@ def test_lazy_image_shape(self): self.assertEqual((2, 4), image_lazy.size) self.assertEqual((5, 6), image_eager.size) + 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_ctors(self): with TestDir() as test_dir: path = osp.join(test_dir, 'path.png') diff --git a/tests/test_labelme_format.py b/tests/test_labelme_format.py index 3a514d3d9f86..ad80a9ecf5a8 100644 --- a/tests/test_labelme_format.py +++ b/tests/test_labelme_format.py @@ -1,16 +1,16 @@ from functools import partial import numpy as np +import os import os.path as osp from unittest import TestCase from datumaro.components.dataset import Dataset -from datumaro.components.extractor import (DatasetItem, - AnnotationType, Bbox, Mask, Polygon, LabelCategories -) +from datumaro.components.extractor import (DatasetItem, Bbox, Mask, Polygon) from datumaro.plugins.labelme_format import LabelMeImporter, LabelMeConverter from datumaro.util.image import Image from datumaro.util.test_utils import (TestDir, compare_datasets, test_save_and_load) +from .requirements import Requirements, mark_requirement class LabelMeConverterTest(TestCase): @@ -20,6 +20,7 @@ def _test_save_and_load(self, source_dataset, converter, test_dir, importer='label_me', target_dataset=target_dataset, importer_args=importer_args, **kwargs) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='dir1/1', subset='train', @@ -31,6 +32,8 @@ def test_can_save_and_load(self): 'a1': 'qwe', 'a2': True, 'a3': 123, + 'a4': '42', # must be escaped and recognized as string + 'escaped': 'a,b. 
= \\= \\\\ " \\" \\, \\', }), Mask(np.array([[0, 1], [1, 0], [1, 1]]), group=2, attributes={ 'username': 'test' }), @@ -40,10 +43,7 @@ def test_can_save_and_load(self): ), ] ), - ], categories={ - AnnotationType.label: LabelCategories.from_iterable( - 'label_' + str(label) for label in range(10)), - }) + ], categories=['label_' + str(label) for label in range(10)]) target_dataset = Dataset.from_iterable([ DatasetItem(id='dir1/1', subset='train', @@ -60,6 +60,8 @@ def test_can_save_and_load(self): 'a1': 'qwe', 'a2': True, 'a3': 123, + 'a4': '42', + 'escaped': 'a,b. = \\= \\\\ " \\" \\, \\', } ), Mask(np.array([[0, 1], [1, 0], [1, 1]]), group=2, @@ -77,10 +79,7 @@ def test_can_save_and_load(self): ), ] ), - ], categories={ - AnnotationType.label: LabelCategories.from_iterable([ - 'label_2', 'label_3']), - }) + ], categories=['label_2', 'label_3']) with TestDir() as test_dir: self._test_save_and_load( @@ -88,6 +87,7 @@ def test_can_save_and_load(self): partial(LabelMeConverter.convert, save_images=True), test_dir, target_dataset=target_dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_arbitrary_extension(self): dataset = Dataset.from_iterable([ DatasetItem(id='a/1', image=Image(path='a/1.JPEG', @@ -101,42 +101,25 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): partial(LabelMeConverter.convert, save_images=True), test_dir, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='кириллица с пробелом', subset='train', image=np.ones((16, 16, 3)), - annotations=[ - Polygon([0, 4, 4, 4, 5, 6], label=3, attributes={ - 'occluded': True, - 'a1': 'qwe', - 'a2': True, - 'a3': 123, - }), - ] + annotations=[ Polygon([0, 4, 4, 4, 5, 6], label=3) ] ), - ], categories={ - AnnotationType.label: LabelCategories.from_iterable( - 'label_' + str(label) for 
label in range(10)), - }) + ], categories=['label_' + str(label) for label in range(10)]) target_dataset = Dataset.from_iterable([ DatasetItem(id='кириллица с пробелом', subset='train', image=np.ones((16, 16, 3)), annotations=[ Polygon([0, 4, 4, 4, 5, 6], label=0, id=0, - attributes={ - 'occluded': True, 'username': '', - 'a1': 'qwe', - 'a2': True, - 'a3': 123, - } + attributes={ 'occluded': False, 'username': '' } ), ] ), - ], categories={ - AnnotationType.label: LabelCategories.from_iterable([ - 'label_3']), - }) + ], categories=['label_3']) with TestDir() as test_dir: self._test_save_and_load( @@ -144,12 +127,62 @@ def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): partial(LabelMeConverter.convert, save_images=True), test_dir, target_dataset=target_dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_relative_paths(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='1', image=np.ones((4, 2, 3))), + DatasetItem(id='subdir1/1', image=np.ones((2, 6, 3))), + DatasetItem(id='subdir2/1', image=np.ones((5, 4, 3))), + + DatasetItem(id='sub/dir3/1', image=np.ones((3, 4, 3)), annotations=[ + Mask(np.array([ + [0, 1, 1, 0], + [0, 1, 1, 0], + [0, 0, 0, 0], + ]), label=1, attributes={ + 'occluded': False, 'username': 'user' + } + ) + ]), + + DatasetItem(id='subdir3/1', subset='a', image=np.ones((5, 4, 3)), + annotations=[ + Bbox(1, 2, 3, 4, label=0, attributes={ + 'occluded': False, 'username': 'user' + }) + ]), + DatasetItem(id='subdir3/1', subset='b', image=np.ones((4, 4, 3))), + ], categories=['label1', 'label2']) + + with TestDir() as test_dir: + self._test_save_and_load(source_dataset, + partial(LabelMeConverter.convert, save_images=True), + test_dir, require_images=True) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_dataset_to_correct_dir_with_correct_filename(self): + dataset = Dataset.from_iterable([ + DatasetItem(id='dir/a', image=Image(path='dir/a.JPEG', + 
data=np.zeros((4, 3, 3)))), + ], categories=[]) + + with TestDir() as test_dir: + self._test_save_and_load(dataset, + partial(LabelMeConverter.convert, save_images=True), + test_dir, require_images=True) + + xml_dirpath = osp.join(test_dir, 'default/dir') + self.assertEqual(os.listdir(osp.join(test_dir, 'default')), ['dir']) + self.assertEqual(set(os.listdir(xml_dirpath)), {'a.xml', 'a.JPEG'}) + DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'labelme_dataset') class LabelMeImporterTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect(self): self.assertTrue(LabelMeImporter.detect(DUMMY_DATASET_DIR)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import(self): img1 = np.ones((77, 102, 3)) * 255 img1[6:32, 7:41] = 0 @@ -229,12 +262,9 @@ def test_can_import(self): ), ] ), - ], categories={ - AnnotationType.label: LabelCategories.from_iterable([ - 'window', 'license plate', 'o1', - 'q1', 'b1', 'm1', 'hg', - ]), - }) + ], categories=[ + 'window', 'license plate', 'o1', 'q1', 'b1', 'm1', 'hg', + ]) parsed = Dataset.import_from(DUMMY_DATASET_DIR, 'label_me') compare_datasets(self, expected=target_dataset, actual=parsed) \ No newline at end of file diff --git a/tests/test_lfw_format.py b/tests/test_lfw_format.py index 3aa64365d190..64a37731a723 100644 --- a/tests/test_lfw_format.py +++ b/tests/test_lfw_format.py @@ -3,44 +3,44 @@ import numpy as np from datumaro.components.dataset import Dataset -from datumaro.components.extractor import DatasetItem, Points +from datumaro.components.extractor import DatasetItem, Label, Points from datumaro.plugins.lfw_format import LfwConverter, LfwImporter from datumaro.util.image import Image from datumaro.util.test_utils import TestDir, compare_datasets +from .requirements import Requirements, mark_requirement class LfwFormatTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load(self): source_dataset = Dataset.from_iterable([ - 
DatasetItem(id='name0/name0_0001', - subset='test', image=np.ones((2, 5, 3)), - attributes={ - 'positive_pairs': ['name0/name0_0002'], - 'negative_pairs': [] - } + DatasetItem(id='name0_0001', subset='test', + image=np.ones((2, 5, 3)), + annotations=[Label(0, attributes={ + 'positive_pairs': ['name0/name0_0002'] + })] ), - DatasetItem(id='name0/name0_0002', - subset='test', image=np.ones((2, 5, 3)), - attributes={ + DatasetItem(id='name0_0002', subset='test', + image=np.ones((2, 5, 3)), + annotations=[Label(0, attributes={ 'positive_pairs': ['name0/name0_0001'], 'negative_pairs': ['name1/name1_0001'] - } + })] ), - DatasetItem(id='name1/name1_0001', - subset='test', image=np.ones((2, 5, 3)), - attributes={ - 'positive_pairs': ['name1/name1_0002'], - 'negative_pairs': [] - } + DatasetItem(id='name1_0001', subset='test', + image=np.ones((2, 5, 3)), + annotations=[Label(1, attributes={ + 'positive_pairs': ['name1/name1_0002'] + })] ), - DatasetItem(id='name1/name1_0002', - subset='test', image=np.ones((2, 5, 3)), - attributes={ + DatasetItem(id='name1_0002', subset='test', + image=np.ones((2, 5, 3)), + annotations=[Label(1, attributes={ 'positive_pairs': ['name1/name1_0002'], 'negative_pairs': ['name0/name0_0001'] - } + })] ), - ]) + ], categories=['name0', 'name1']) with TestDir() as test_dir: LfwConverter.convert(source_dataset, test_dir, save_images=True) @@ -48,29 +48,26 @@ def test_can_save_and_load(self): compare_datasets(self, source_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_with_landmarks(self): source_dataset = Dataset.from_iterable([ - DatasetItem(id='name0/name0_0001', + DatasetItem(id='name0_0001', subset='test', image=np.ones((2, 5, 3)), - attributes={ - 'positive_pairs': ['name0/name0_0002'], - 'negative_pairs': [] - }, annotations=[ + Label(0, attributes={ + 'positive_pairs': ['name0/name0_0002'] + }), Points([0, 4, 3, 3, 2, 2, 1, 0, 3, 0]), ] ), - DatasetItem(id='name0/name0_0002', + 
DatasetItem(id='name0_0002', subset='test', image=np.ones((2, 5, 3)), - attributes={ - 'positive_pairs': [], - 'negative_pairs': [] - }, annotations=[ + Label(0), Points([0, 5, 3, 5, 2, 2, 1, 0, 3, 0]), ] ), - ]) + ], categories=['name0']) with TestDir() as test_dir: LfwConverter.convert(source_dataset, test_dir, save_images=True) @@ -78,23 +75,20 @@ def test_can_save_and_load_with_landmarks(self): compare_datasets(self, source_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_with_no_subsets(self): source_dataset = Dataset.from_iterable([ - DatasetItem(id='name0/name0_0001', + DatasetItem(id='name0_0001', image=np.ones((2, 5, 3)), - attributes={ - 'positive_pairs': ['name0/name0_0002'], - 'negative_pairs': [] - }, + annotations=[Label(0, attributes={ + 'positive_pairs': ['name0/name0_0002'] + })], ), - DatasetItem(id='name0/name0_0002', + DatasetItem(id='name0_0002', image=np.ones((2, 5, 3)), - attributes={ - 'positive_pairs': [], - 'negative_pairs': [] - }, + annotations=[Label(0)] ), - ]) + ], categories=['name0']) with TestDir() as test_dir: LfwConverter.convert(source_dataset, test_dir, save_images=True) @@ -102,23 +96,48 @@ def test_can_save_and_load_with_no_subsets(self): compare_datasets(self, source_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_with_no_format_names(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='a/1', + image=np.ones((2, 5, 3)), + annotations=[Label(0, attributes={ + 'positive_pairs': ['name0/b/2'], + 'negative_pairs': ['d/4'] + })], + ), + DatasetItem(id='b/2', + image=np.ones((2, 5, 3)), + annotations=[Label(0)] + ), + DatasetItem(id='c/3', + image=np.ones((2, 5, 3)), + annotations=[Label(1)] + ), + DatasetItem(id='d/4', + image=np.ones((2, 5, 3)), + ), + ], categories=['name0', 'name1']) + + with TestDir() as test_dir: + LfwConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = 
Dataset.import_from(test_dir, 'lfw') + + compare_datasets(self, source_dataset, parsed_dataset) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): dataset = Dataset.from_iterable([ DatasetItem(id='кириллица с пробелом', - image=np.ones((2, 5, 3)), - attributes = { - 'positive_pairs': [], - 'negative_pairs': [] - }, + image=np.ones((2, 5, 3)) ), - DatasetItem(id='name0/name0_0002', + DatasetItem(id='name0_0002', image=np.ones((2, 5, 3)), - attributes = { - 'positive_pairs': [], + annotations=[Label(0, attributes={ 'negative_pairs': ['кириллица с пробелом'] - }, + })] ), - ]) + ], categories=['name0']) with TestDir() as test_dir: LfwConverter.convert(dataset, test_dir, save_images=True) @@ -126,23 +145,16 @@ def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): compare_datasets(self, dataset, parsed_dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_arbitrary_extension(self): dataset = Dataset.from_iterable([ - DatasetItem(id='name0/name0_0001', image=Image( - path='name0/name0_0001.JPEG', data=np.zeros((4, 3, 3))), - attributes={ - 'positive_pairs': [], - 'negative_pairs': [] - }, - ), - DatasetItem(id='name0/name0_0002', image=Image( - path='name0/name0_0002.bmp', data=np.zeros((3, 4, 3))), - attributes={ - 'positive_pairs': ['name0/name0_0001'], - 'negative_pairs': [] - }, + DatasetItem(id='a/1', image=Image( + path='a/1.JPEG', data=np.zeros((4, 3, 3))), ), - ]) + DatasetItem(id='b/c/d/2', image=Image( + path='b/c/d/2.bmp', data=np.zeros((3, 4, 3))), + ), + ], categories=[]) with TestDir() as test_dir: LfwConverter.convert(dataset, test_dir, save_images=True) @@ -153,43 +165,40 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'lfw_dataset') class LfwImporterTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def 
test_can_detect(self): self.assertTrue(LfwImporter.detect(DUMMY_DATASET_DIR)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import(self): expected_dataset = Dataset.from_iterable([ - DatasetItem(id='name0/name0_0001', - subset='test', image=np.ones((2, 5, 3)), - attributes={ - 'positive_pairs': [], - 'negative_pairs': ['name1/name1_0001', - 'name1/name1_0002'] - }, + DatasetItem(id='name0_0001', subset='test', + image=np.ones((2, 5, 3)), annotations=[ + Label(0, attributes={ + 'negative_pairs': ['name1/name1_0001', + 'name1/name1_0002'] + }), Points([0, 4, 3, 3, 2, 2, 1, 0, 3, 0]), ] ), - DatasetItem(id='name1/name1_0001', - subset='test', image=np.ones((2, 5, 3)), - attributes={ - 'positive_pairs': ['name1/name1_0002'], - 'negative_pairs': [] - }, + DatasetItem(id='name1_0001', subset='test', + image=np.ones((2, 5, 3)), annotations=[ + Label(1, attributes={ + 'positive_pairs': ['name1/name1_0002'], + }), Points([1, 6, 4, 6, 3, 3, 2, 1, 4, 1]), ] ), - DatasetItem(id='name1/name1_0002', - subset='test', image=np.ones((2, 5, 3)), - attributes={ - 'positive_pairs': [], - 'negative_pairs': [] - }, + DatasetItem(id='name1_0002', subset='test', + image=np.ones((2, 5, 3)), annotations=[ + Label(1), Points([0, 5, 3, 5, 2, 2, 1, 0, 3, 0]), ] ), - ]) + ], categories=['name0', 'name1']) dataset = Dataset.import_from(DUMMY_DATASET_DIR, 'lfw') diff --git a/tests/test_market1501_format.py b/tests/test_market1501_format.py index 9eaaa30fcfea..17fc3afeacd5 100644 --- a/tests/test_market1501_format.py +++ b/tests/test_market1501_format.py @@ -8,9 +8,11 @@ Market1501Importer) from datumaro.util.image import Image from datumaro.util.test_utils import TestDir, compare_datasets +from .requirements import Requirements, mark_requirement class Market1501FormatTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='0001_c2s3_000001_00', @@ -45,6 +47,7 @@ def 
test_can_save_and_load(self): compare_datasets(self, source_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_no_subsets(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='0001_c2s3_000001_00', @@ -63,6 +66,7 @@ def test_can_save_dataset_with_no_subsets(self): compare_datasets(self, source_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='кириллица с пробелом', @@ -82,6 +86,7 @@ def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): compare_datasets(self, source_dataset, parsed_dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_no_save_images(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='0001_c2s3_000001_00', @@ -108,6 +113,7 @@ def test_can_save_dataset_with_no_save_images(self): compare_datasets(self, source_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_arbitrary_extension(self): expected = Dataset.from_iterable([ DatasetItem(id='q/1', image=Image( @@ -133,6 +139,7 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): compare_datasets(self, expected, parsed_dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_no_attributes(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='test1', @@ -157,9 +164,11 @@ def test_can_save_dataset_with_no_attributes(self): DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'market1501_dataset') class Market1501ImporterTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect(self): self.assertTrue(Market1501Importer.detect(DUMMY_DATASET_DIR)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import(self): 
expected_dataset = Dataset.from_iterable([ DatasetItem(id='0001_c2s3_000111_00', diff --git a/tests/test_masks.py b/tests/test_masks.py index 4396966089fb..1025927a4f49 100644 --- a/tests/test_masks.py +++ b/tests/test_masks.py @@ -4,9 +4,11 @@ import datumaro.util.mask_tools as mask_tools from datumaro.components.extractor import CompiledMask +from .requirements import Requirements, mark_requirement class PolygonConversionsTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_mask_can_be_converted_to_polygon(self): mask = np.array([ [0, 1, 1, 1, 0, 1, 1, 1, 1, 0], @@ -24,6 +26,7 @@ def test_mask_can_be_converted_to_polygon(self): self.assertEqual(len(expected), len(computed)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_crop_covered_segments(self): image_size = [7, 7] initial = [ @@ -80,6 +83,7 @@ def _test_mask_to_rle(self, source_mask): self.assertTrue(np.array_equal(source_mask, resulting_mask), '%s\n%s\n' % (source_mask, resulting_mask)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_mask_to_rle_multi(self): cases = [ np.array([ @@ -118,6 +122,7 @@ def test_mask_to_rle_multi(self): self._test_mask_to_rle(case) class ColormapOperationsTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_paint_mask(self): mask = np.zeros((1, 3), dtype=np.uint8) mask[:, 0] = 0 @@ -136,6 +141,7 @@ def test_can_paint_mask(self): self.assertTrue(np.array_equal(expected, actual), '%s\nvs.\n%s' % (expected, actual)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_unpaint_mask(self): colormap = mask_tools.generate_colormap(3) inverse_colormap = mask_tools.invert_colormap(colormap) @@ -155,6 +161,7 @@ def test_can_unpaint_mask(self): self.assertTrue(np.array_equal(expected, actual), '%s\nvs.\n%s' % (expected, actual)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_remap_mask(self): class_count = 10 remap_fn = lambda c: class_count - c @@ -172,6 +179,7 @@ def 
test_can_remap_mask(self): self.assertTrue(np.array_equal(expected, actual), '%s\nvs.\n%s' % (expected, actual)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_merge_masks(self): masks = [ np.array([0, 2, 4, 0, 0, 1]), @@ -186,6 +194,7 @@ def test_can_merge_masks(self): self.assertTrue(np.array_equal(expected, actual), '%s\nvs.\n%s' % (expected, actual)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_decode_compiled_mask(self): class_idx = 1000 instance_idx = 10000 diff --git a/tests/test_mnist_csv_format.py b/tests/test_mnist_csv_format.py new file mode 100644 index 000000000000..8fd258a8e5bd --- /dev/null +++ b/tests/test_mnist_csv_format.py @@ -0,0 +1,195 @@ +import os.path as osp +from unittest import TestCase + +import numpy as np +from datumaro.components.dataset import Dataset +from datumaro.components.extractor import (AnnotationType, DatasetItem, Label, + LabelCategories) +from datumaro.plugins.mnist_csv_format import (MnistCsvConverter, + MnistCsvImporter) +from datumaro.util.image import Image +from datumaro.util.test_utils import TestDir, compare_datasets +from .requirements import Requirements, mark_requirement + + +class MnistCsvFormatTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id=0, subset='test', + image=np.ones((28, 28)), + annotations=[Label(0)] + ), + DatasetItem(id=1, subset='test', + image=np.ones((28, 28)) + ), + DatasetItem(id=2, subset='test', + image=np.ones((28, 28)), + annotations=[Label(1)] + ) + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + str(label) for label in range(10)), + }) + + with TestDir() as test_dir: + MnistCsvConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'mnist_csv') + + compare_datasets(self, source_dataset, parsed_dataset, + require_images=True) + + 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_without_saving_images(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id=0, subset='train', + annotations=[Label(0)] + ), + DatasetItem(id=1, subset='train', + annotations=[Label(1)] + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + str(label) for label in range(10)), + }) + + with TestDir() as test_dir: + MnistCsvConverter.convert(source_dataset, test_dir, save_images=False) + parsed_dataset = Dataset.import_from(test_dir, 'mnist_csv') + + compare_datasets(self, source_dataset, parsed_dataset, + require_images=True) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_with_different_image_size(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id=0, image=np.ones((10, 8)), + annotations=[Label(0)] + ), + DatasetItem(id=1, image=np.ones((4, 3)), + annotations=[Label(1)] + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + str(label) for label in range(10)), + }) + + with TestDir() as test_dir: + MnistCsvConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'mnist_csv') + + compare_datasets(self, source_dataset, parsed_dataset, + require_images=True) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id="кириллица с пробелом", + image=np.ones((28, 28)), + annotations=[Label(0)] + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + str(label) for label in range(10)), + }) + + with TestDir() as test_dir: + MnistCsvConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'mnist_csv') + + compare_datasets(self, source_dataset, parsed_dataset, + require_images=True) + + 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_image_with_arbitrary_extension(self): + dataset = Dataset.from_iterable([ + DatasetItem(id='q/1', image=Image(path='q/1.JPEG', + data=np.zeros((28, 28)))), + DatasetItem(id='a/b/c/2', image=Image(path='a/b/c/2.bmp', + data=np.zeros((28, 28)))), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + str(label) for label in range(10)), + }) + + with TestDir() as test_dir: + MnistCsvConverter.convert(dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'mnist_csv') + + compare_datasets(self, dataset, parsed_dataset, + require_images=True) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_empty_image(self): + dataset = Dataset.from_iterable([ + DatasetItem(id=0, annotations=[Label(0)]), + DatasetItem(id=1) + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + str(label) for label in range(10)), + }) + + with TestDir() as test_dir: + MnistCsvConverter.convert(dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'mnist_csv') + + compare_datasets(self, dataset, parsed_dataset, + require_images=True) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_with_other_labels(self): + dataset = Dataset.from_iterable([ + DatasetItem(id=0, image=np.ones((28, 28)), + annotations=[Label(0)]), + DatasetItem(id=1, image=np.ones((28, 28)), + annotations=[Label(1)]) + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + 'label_%s' % label for label in range(2)), + }) + + with TestDir() as test_dir: + MnistCsvConverter.convert(dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'mnist_csv') + + compare_datasets(self, dataset, parsed_dataset, + require_images=True) + +DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'mnist_csv_dataset') + +class MnistCsvImporterTest(TestCase): + 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_import(self): + expected_dataset = Dataset.from_iterable([ + DatasetItem(id=0, subset='test', + image=np.ones((28, 28)), + annotations=[Label(0)] + ), + DatasetItem(id=1, subset='test', + image=np.ones((28, 28)), + annotations=[Label(2)] + ), + DatasetItem(id=2, subset='test', + image=np.ones((28, 28)), + annotations=[Label(1)] + ), + DatasetItem(id=0, subset='train', + image=np.ones((28, 28)), + annotations=[Label(5)] + ), + DatasetItem(id=1, subset='train', + image=np.ones((28, 28)), + annotations=[Label(7)] + ) + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + str(label) for label in range(10)), + }) + + dataset = Dataset.import_from(DUMMY_DATASET_DIR, 'mnist_csv') + + compare_datasets(self, expected_dataset, dataset) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_detect(self): + self.assertTrue(MnistCsvImporter.detect(DUMMY_DATASET_DIR)) diff --git a/tests/test_mnist_format.py b/tests/test_mnist_format.py new file mode 100644 index 000000000000..41fe67286f6e --- /dev/null +++ b/tests/test_mnist_format.py @@ -0,0 +1,194 @@ +import os.path as osp +from unittest import TestCase + +import numpy as np +from datumaro.components.dataset import Dataset +from datumaro.components.extractor import (AnnotationType, DatasetItem, Label, + LabelCategories) +from datumaro.plugins.mnist_format import MnistConverter, MnistImporter +from datumaro.util.image import Image +from datumaro.util.test_utils import TestDir, compare_datasets +from .requirements import Requirements, mark_requirement + + +class MnistFormatTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id=0, subset='test', + image=np.ones((28, 28)), + annotations=[Label(0)] + ), + DatasetItem(id=1, subset='test', + image=np.ones((28, 28)) + ), + DatasetItem(id=2, subset='test', + image=np.ones((28, 
28)), + annotations=[Label(1)] + ) + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + str(label) for label in range(10)), + }) + + with TestDir() as test_dir: + MnistConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'mnist') + + compare_datasets(self, source_dataset, parsed_dataset, + require_images=True) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_without_saving_images(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id=0, subset='train', + annotations=[Label(0)] + ), + DatasetItem(id=1, subset='train', + annotations=[Label(1)] + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + str(label) for label in range(10)), + }) + + with TestDir() as test_dir: + MnistConverter.convert(source_dataset, test_dir, save_images=False) + parsed_dataset = Dataset.import_from(test_dir, 'mnist') + + compare_datasets(self, source_dataset, parsed_dataset, + require_images=True) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_with_different_image_size(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id=0, image=np.ones((3, 4)), + annotations=[Label(0)] + ), + DatasetItem(id=1, image=np.ones((2, 2)), + annotations=[Label(1)] + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + str(label) for label in range(10)), + }) + + with TestDir() as test_dir: + MnistConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'mnist') + + compare_datasets(self, source_dataset, parsed_dataset, + require_images=True) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id="кириллица с пробелом", + image=np.ones((28, 28)), + annotations=[Label(0)] + ), + ], categories={ + 
AnnotationType.label: LabelCategories.from_iterable( + str(label) for label in range(10)), + }) + + with TestDir() as test_dir: + MnistConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'mnist') + + compare_datasets(self, source_dataset, parsed_dataset, + require_images=True) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_image_with_arbitrary_extension(self): + dataset = Dataset.from_iterable([ + DatasetItem(id='q/1', image=Image(path='q/1.JPEG', + data=np.zeros((28, 28)))), + DatasetItem(id='a/b/c/2', image=Image(path='a/b/c/2.bmp', + data=np.zeros((28, 28)))), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + str(label) for label in range(10)), + }) + + with TestDir() as test_dir: + MnistConverter.convert(dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'mnist') + + compare_datasets(self, dataset, parsed_dataset, + require_images=True) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_empty_image(self): + dataset = Dataset.from_iterable([ + DatasetItem(id=0, annotations=[Label(0)]), + DatasetItem(id=1) + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + str(label) for label in range(10)), + }) + + with TestDir() as test_dir: + MnistConverter.convert(dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'mnist') + + compare_datasets(self, dataset, parsed_dataset, + require_images=True) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_with_other_labels(self): + dataset = Dataset.from_iterable([ + DatasetItem(id=0, image=np.ones((28, 28)), + annotations=[Label(0)]), + DatasetItem(id=1, image=np.ones((28, 28)), + annotations=[Label(1)]) + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + 'label_%s' % label for label in range(2)), + }) + + with TestDir() as test_dir: + 
MnistConverter.convert(dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'mnist') + + compare_datasets(self, dataset, parsed_dataset, + require_images=True) + +DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'mnist_dataset') + +class MnistImporterTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_import(self): + expected_dataset = Dataset.from_iterable([ + DatasetItem(id=0, subset='test', + image=np.ones((28, 28)), + annotations=[Label(0)] + ), + DatasetItem(id=1, subset='test', + image=np.ones((28, 28)), + annotations=[Label(2)] + ), + DatasetItem(id=2, subset='test', + image=np.ones((28, 28)), + annotations=[Label(1)] + ), + DatasetItem(id=0, subset='train', + image=np.ones((28, 28)), + annotations=[Label(5)] + ), + DatasetItem(id=1, subset='train', + image=np.ones((28, 28)), + annotations=[Label(7)] + ) + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + str(label) for label in range(10)), + }) + + dataset = Dataset.import_from(DUMMY_DATASET_DIR, 'mnist') + + compare_datasets(self, expected_dataset, dataset) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_detect(self): + self.assertTrue(MnistImporter.detect(DUMMY_DATASET_DIR)) diff --git a/tests/test_mot_format.py b/tests/test_mot_format.py index e5757d3cd7b9..6b428b40f88c 100644 --- a/tests/test_mot_format.py +++ b/tests/test_mot_format.py @@ -11,6 +11,7 @@ from datumaro.util.image import Image from datumaro.util.test_utils import (TestDir, compare_datasets, test_save_and_load) +from .requirements import Requirements, mark_requirement class MotConverterTest(TestCase): @@ -20,6 +21,7 @@ def _test_save_and_load(self, source_dataset, converter, test_dir, importer='mot_seq', target_dataset=target_dataset, importer_args=importer_args, **kwargs) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_bboxes(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=1, 
subset='train', @@ -98,6 +100,7 @@ def test_can_save_bboxes(self): partial(MotSeqGtConverter.convert, save_images=True), test_dir, target_dataset=target_dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_arbitrary_extension(self): expected = Dataset.from_iterable([ DatasetItem('1', image=Image( @@ -123,9 +126,11 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'mot_dataset') class MotImporterTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect(self): self.assertTrue(MotSeqImporter.detect(DUMMY_DATASET_DIR)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id=1, diff --git a/tests/test_mots_format.py b/tests/test_mots_format.py index 534c01b01657..51691a68c490 100644 --- a/tests/test_mots_format.py +++ b/tests/test_mots_format.py @@ -10,6 +10,7 @@ from datumaro.util.image import Image from datumaro.util.test_utils import (TestDir, compare_datasets, test_save_and_load) +from .requirements import Requirements, mark_requirement DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'mots_dataset') @@ -21,6 +22,7 @@ def _test_save_and_load(self, source_dataset, converter, test_dir, importer='mots', target_dataset=target_dataset, importer_args=importer_args, **kwargs) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_masks(self): source = Dataset.from_iterable([ DatasetItem(id=1, subset='a', image=np.ones((5, 1)), annotations=[ @@ -67,6 +69,7 @@ def test_can_save_masks(self): partial(MotsPngConverter.convert, save_images=True), test_dir, target_dataset=target) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): source = Dataset.from_iterable([ DatasetItem(id='кириллица с пробелом', subset='a', @@ -81,6 +84,7 @@ 
def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): partial(MotsPngConverter.convert, save_images=True), test_dir, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_arbitrary_extension(self): expected = Dataset.from_iterable([ DatasetItem('q/1', image=Image( @@ -105,9 +109,11 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): test_dir, require_images=True) class MotsImporterTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect(self): self.assertTrue(MotsImporter.detect(DUMMY_DATASET_DIR)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import(self): target = Dataset.from_iterable([ DatasetItem(id=1, subset='train', image=np.ones((5, 1)), annotations=[ diff --git a/tests/test_ndr.py b/tests/test_ndr.py index cdc32fde73e3..7180d1f5236a 100644 --- a/tests/test_ndr.py +++ b/tests/test_ndr.py @@ -7,6 +7,8 @@ LabelCategories, AnnotationType) import datumaro.plugins.ndr as ndr +from .requirements import Requirements, mark_requirement + class NDRTest(TestCase): def _generate_dataset(self, config, num_duplicate, dataset='classification'): @@ -39,6 +41,7 @@ def _generate_dataset(self, config, num_duplicate, dataset='classification'): dataset = Dataset.from_iterable(iterable, categories) return dataset + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_ndr_with_error(self): config = { "label1": 100, @@ -96,6 +99,7 @@ def test_ndr_with_error(self): result = ndr.NDR(source, working_subset='train') len(result) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_ndr_without_cut(self): config = { "label1": 100, @@ -117,6 +121,7 @@ def test_ndr_without_cut(self): self.assertEqual(300, len(source.get_subset("val"))) self.assertEqual(300, len(source.get_subset("test"))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_ndr_can_use_undersample_uniform(self): config = { "label1": 100, @@ -139,6 +144,7 @@ def 
test_ndr_can_use_undersample_uniform(self): self.assertEqual(300, len(source.get_subset("val"))) self.assertEqual(300, len(source.get_subset("test"))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_ndr_can_use_undersample_inverse(self): config = { "label1": 100, @@ -161,6 +167,7 @@ def test_ndr_can_use_undersample_inverse(self): self.assertEqual(300, len(source.get_subset("val"))) self.assertEqual(300, len(source.get_subset("test"))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_ndr_can_use_oversample_random(self): config = { "label1": 100, @@ -183,6 +190,7 @@ def test_ndr_can_use_oversample_random(self): self.assertEqual(300, len(source.get_subset("val"))) self.assertEqual(300, len(source.get_subset("test"))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_ndr_can_use_oversample_similarity(self): config = { "label1": 100, @@ -205,6 +213,7 @@ def test_ndr_can_use_oversample_similarity(self): self.assertEqual(300, len(source.get_subset("val"))) self.assertEqual(300, len(source.get_subset("test"))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_ndr_gradient_fails_on_invalid_parameters(self): source = self._generate_dataset({ 'label1': 5 }, 10) @@ -235,6 +244,7 @@ def test_ndr_gradient_fails_on_invalid_parameters(self): hash_dim=-5, block_shape=(8, 8), algorithm='gradient') len(result) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_ndr_gradient_can_use_block(self): config = { "label1": 100, @@ -256,6 +266,7 @@ def test_ndr_gradient_can_use_block(self): self.assertEqual(300, len(source.get_subset("val"))) self.assertEqual(300, len(source.get_subset("test"))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_ndr_gradient_can_use_hash_dim(self): config = { "label1": 100, @@ -278,6 +289,7 @@ def test_ndr_gradient_can_use_hash_dim(self): self.assertEqual(300, len(source.get_subset("val"))) self.assertEqual(300, len(source.get_subset("test"))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) 
def test_ndr_gradient_can_use_sim_thresh(self): config = { "label1": 100, @@ -300,6 +312,7 @@ def test_ndr_gradient_can_use_sim_thresh(self): self.assertEqual(300, len(source.get_subset("val"))) self.assertEqual(300, len(source.get_subset("test"))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_ndr_seed(self): config = { "label1": 100, diff --git a/tests/test_ops.py b/tests/test_ops.py index 7294614a20e4..8658015dc8a6 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -10,9 +10,11 @@ compute_ann_statistics, mean_std, find_unique_images) from datumaro.components.dataset import Dataset from datumaro.util.test_utils import compare_datasets +from .requirements import Requirements, mark_requirement class TestOperations(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_mean_std(self): expected_mean = [100, 50, 150] expected_std = [20, 50, 10] @@ -33,6 +35,7 @@ def test_mean_std(self): for estd, astd in zip(expected_std, actual_std): self.assertAlmostEqual(estd, astd, places=0) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_stats(self): dataset = Dataset.from_iterable([ DatasetItem(id=1, image=np.ones((5, 5, 3)), annotations=[ @@ -138,6 +141,7 @@ def test_stats(self): self.assertEqual(expected, actual) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_stats_with_empty_dataset(self): dataset = Dataset.from_iterable([ DatasetItem(id=1), @@ -189,6 +193,7 @@ def test_stats_with_empty_dataset(self): self.assertEqual(expected, actual) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_unique_image_count(self): expected = { frozenset([('1', 'a'), ('1', 'b')]), @@ -215,6 +220,7 @@ def test_unique_image_count(self): class TestMultimerge(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_match_items(self): # items 1 and 3 are unique, item 2 is common and should be merged @@ -271,6 +277,7 @@ def test_can_match_items(self): key=lambda e: e.item_id) ) + 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_match_shapes(self): source0 = Dataset.from_iterable([ DatasetItem(1, annotations=[ @@ -374,6 +381,7 @@ def test_can_match_shapes(self): key=lambda e: len(e.sources)) ) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_attributes(self): source0 = Dataset.from_iterable([ DatasetItem(1, annotations=[ @@ -420,6 +428,7 @@ def test_attributes(self): if isinstance(e, FailedAttrVotingError)]) ) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_group_checks(self): dataset = Dataset.from_iterable([ DatasetItem(1, annotations=[ @@ -446,6 +455,7 @@ def test_group_checks(self): if isinstance(e, WrongGroupError)]), merger.errors ) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_merge_classes(self): source0 = Dataset.from_iterable([ DatasetItem(1, annotations=[ @@ -479,6 +489,7 @@ def test_can_merge_classes(self): compare_datasets(self, expected, merged, ignored_attrs={'score'}) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_merge_categories(self): source0 = Dataset.from_iterable([ DatasetItem(1, annotations=[ Label(0), ]), diff --git a/tests/test_project.py b/tests/test_project.py index b4ab7bbf58e2..8a8ddbaccf51 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -12,9 +12,11 @@ from datumaro.components.config import Config from datumaro.components.dataset import Dataset, DEFAULT_FORMAT from datumaro.util.test_utils import TestDir, compare_datasets +from .requirements import Requirements, mark_requirement class ProjectTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_project_generate(self): src_config = Config({ 'project_name': 'test_project', @@ -34,13 +36,16 @@ def test_project_generate(self): src_config.format_version, result_config.format_version) @staticmethod + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_default_ctor_is_ok(): Project() @staticmethod + 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_empty_config_is_ok(): Project(Config()) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_add_source(self): source_name = 'source' origin = Source({ @@ -55,6 +60,7 @@ def test_add_source(self): self.assertIsNotNone(added) self.assertEqual(added, origin) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_added_source_can_be_saved(self): source_name = 'source' origin = Source({ @@ -67,6 +73,7 @@ def test_added_source_can_be_saved(self): self.assertEqual(origin, saved.sources[source_name]) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_added_source_can_be_dumped(self): source_name = 'source' origin = Source({ @@ -82,6 +89,7 @@ def test_added_source_can_be_dumped(self): loaded = loaded.get_source(source_name) self.assertEqual(origin, loaded) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import_with_custom_importer(self): class TestImporter: def __call__(self, path, subset=None): @@ -102,6 +110,7 @@ def __call__(self, path, subset=None): self.assertEqual(path, project.config.project_filename) self.assertListEqual(['train'], project.config.subsets) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_dump_added_model(self): model_name = 'model' @@ -116,6 +125,7 @@ def test_can_dump_added_model(self): loaded = loaded.get_model(model_name) self.assertEqual(saved, loaded) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_have_project_source(self): with TestDir() as test_dir: Project.generate(test_dir) @@ -128,6 +138,7 @@ def test_can_have_project_source(self): self.assertTrue('project1' in dataset.sources) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_batch_launch_custom_model(self): dataset = Dataset.from_iterable([ DatasetItem(id=i, subset='train', image=np.array([i])) @@ -157,6 +168,7 @@ def launch(self, inputs): self.assertEqual(int(item.id), item.annotations[0].attributes['data']) + 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_do_transform_with_custom_model(self): class TestExtractorSrc(Extractor): def __iter__(self): @@ -209,6 +221,7 @@ def __iter__(self): self.assertEqual(0, item1.annotations[0].label) self.assertEqual(1, item2.annotations[0].label) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_source_datasets_can_be_merged(self): class TestExtractor(Extractor): def __init__(self, url, n=0, s=0): @@ -235,6 +248,7 @@ def __iter__(self): self.assertEqual(n1 + n2, len(dataset)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_cant_merge_different_categories(self): class TestExtractor1(Extractor): def __iter__(self): @@ -264,6 +278,7 @@ def categories(self): with self.assertRaisesRegex(Exception, "different categories"): project.make_dataset() + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_project_filter_can_be_applied(self): class TestExtractor(Extractor): def __iter__(self): @@ -279,6 +294,7 @@ def __iter__(self): self.assertEqual(5, len(dataset)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_own_dataset(self): with TestDir() as test_dir: src_project = Project() @@ -292,6 +308,7 @@ def test_can_save_and_load_own_dataset(self): self.assertEqual(list(src_dataset), list(loaded_dataset)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_project_own_dataset_can_be_modified(self): project = Project() dataset = project.make_dataset() @@ -301,6 +318,7 @@ def test_project_own_dataset_can_be_modified(self): self.assertEqual(item, next(iter(dataset))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_project_compound_child_can_be_modified_recursively(self): with TestDir() as test_dir: child1 = Project({ @@ -331,6 +349,7 @@ def test_project_compound_child_can_be_modified_recursively(self): self.assertEqual(1, len(dataset.sources['child1'])) self.assertEqual(1, len(dataset.sources['child2'])) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) 
def test_project_can_merge_item_annotations(self): class TestExtractor1(Extractor): def __iter__(self): @@ -359,6 +378,7 @@ def __iter__(self): item = next(iter(merged)) self.assertEqual(3, len(item.annotations)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect_and_import(self): env = Environment() env.importers.items = {DEFAULT_FORMAT: env.importers[DEFAULT_FORMAT]} @@ -378,6 +398,7 @@ def test_can_detect_and_import(self): DEFAULT_FORMAT) compare_datasets(self, source_dataset, imported_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_custom_extractor_can_be_created(self): class CustomExtractor(Extractor): def __iter__(self): diff --git a/tests/test_sampler.py b/tests/test_sampler.py index 3f3d316d28fa..4d4c10a9c0ad 100644 --- a/tests/test_sampler.py +++ b/tests/test_sampler.py @@ -19,6 +19,7 @@ has_libs = True except ImportError: has_libs = False +from .requirements import Requirements, mark_requirement @skipIf(not has_libs, "pandas library is not available") @@ -83,6 +84,7 @@ def _generate_classification_dataset(self, config, subset=None, dataset = Dataset.from_iterable(iterable, categories) return dataset + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_sampler_get_sample_classification(self): config = { "label1": 10, @@ -209,6 +211,7 @@ def test_sampler_get_sample_classification(self): num_pre_train_subset - len(result.get_subset("sample")), ) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_sampler_gives_error(self): config = { "label1": 10, @@ -378,6 +381,7 @@ def test_sampler_gives_error(self): entropy(data_df, infer_df) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_sampler_get_invalid_data(self): with self.subTest("empty dataset"): config = { @@ -491,6 +495,7 @@ def test_sampler_get_invalid_data(self): result = iter(result) next(result) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_sampler_number_of_samples(self): config = { "label1": 10, @@ -675,6 +680,7 @@ 
def test_sampler_number_of_samples(self): ) self.assertEqual(len(result.get_subset("sample")), 9) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_sampler_accumulated_sampling(self): config = { "label1": 10, @@ -882,6 +888,7 @@ def test_sampler_accumulated_sampling(self): self.assertEqual(len(result.get_subset("sample")), 9) self.assertEqual(len(result.get_subset("test")), num_pre_test_subset - 4) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_sampler_unaccumulated_sampling(self): config = { "label1": 10, @@ -1095,6 +1102,7 @@ def test_sampler_unaccumulated_sampling(self): self.assertEqual(len(result.get_subset("sample3")), 4) self.assertEqual(len(result.get_subset("test")), num_pre_test_subset - 4) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_sampler_parser(self): from argparse import ArgumentParser diff --git a/tests/test_splitter.py b/tests/test_splitter.py index 838694a62b5f..21aa6ceb4df8 100644 --- a/tests/test_splitter.py +++ b/tests/test_splitter.py @@ -3,11 +3,19 @@ from unittest import TestCase from datumaro.components.project import Dataset -from datumaro.components.extractor import (DatasetItem, Label, Bbox, - LabelCategories, AnnotationType) +from datumaro.components.extractor import ( + DatasetItem, + Label, + Bbox, + Mask, + Polygon, + LabelCategories, + AnnotationType, +) import datumaro.plugins.splitter as splitter from datumaro.components.operations import compute_ann_statistics +from .requirements import Requirements, mark_requirement class SplitterTest(TestCase): @@ -40,23 +48,29 @@ def _generate_dataset(self, config): for _ in range(count): idx += 1 iterable.append( - DatasetItem(idx, subset=self._get_subset(idx), - annotations=[ - Label(label_id, attributes=attributes) - ], + DatasetItem( + idx, + subset=self._get_subset(idx), + annotations=[Label(label_id, attributes=attributes)], + image=np.ones((1, 1, 3)), ) ) else: for _ in range(counts): idx += 1 iterable.append( - DatasetItem(idx, 
subset=self._get_subset(idx), - annotations=[Label(label_id)]) + DatasetItem( + idx, + subset=self._get_subset(idx), + annotations=[Label(label_id)], + image=np.ones((1, 1, 3)), + ) ) categories = {AnnotationType.label: label_cat} dataset = Dataset.from_iterable(iterable, categories) return dataset + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_split_for_classification_multi_class_no_attr(self): config = { "label1": {"attrs": None, "counts": 10}, @@ -64,9 +78,10 @@ def test_split_for_classification_multi_class_no_attr(self): "label3": {"attrs": None, "counts": 30}, } source = self._generate_dataset(config) + task = splitter.SplitTask.classification.name splits = [("train", 0.7), ("test", 0.3)] - actual = splitter.ClassificationSplit(source, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(42, len(actual.get_subset("train"))) self.assertEqual(18, len(actual.get_subset("test"))) @@ -85,13 +100,15 @@ def test_split_for_classification_multi_class_no_attr(self): self.assertEqual(6, dist_test["label2"][0]) self.assertEqual(9, dist_test["label3"][0]) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_split_for_classification_single_class_single_attr(self): counts = {0: 10, 1: 20, 2: 30} config = {"label": {"attrs": ["attr"], "counts": counts}} source = self._generate_dataset(config) + task = splitter.SplitTask.classification.name splits = [("train", 0.7), ("test", 0.3)] - actual = splitter.ClassificationSplit(source, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(42, len(actual.get_subset("train"))) self.assertEqual(18, len(actual.get_subset("test"))) @@ -110,6 +127,7 @@ def test_split_for_classification_single_class_single_attr(self): self.assertEqual(6, attr_test["attr"]["distribution"]["1"][0]) self.assertEqual(9, attr_test["attr"]["distribution"]["2"][0]) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_split_for_classification_single_class_multi_attr(self): 
counts = { (0, 0): 20, @@ -122,31 +140,41 @@ def test_split_for_classification_single_class_multi_attr(self): attrs = ["attr1", "attr2"] config = {"label": {"attrs": attrs, "counts": counts}} source = self._generate_dataset(config) - - splits = [("train", 0.7), ("test", 0.3)] - actual = splitter.ClassificationSplit(source, splits) - - self.assertEqual(84, len(actual.get_subset("train"))) - self.assertEqual(36, len(actual.get_subset("test"))) - - # check stats for train - stat_train = compute_ann_statistics(actual.get_subset("train")) - attr_train = stat_train["annotations"]["labels"]["attributes"] - self.assertEqual(49, attr_train["attr1"]["distribution"]["0"][0]) - self.assertEqual(35, attr_train["attr1"]["distribution"]["1"][0]) - self.assertEqual(28, attr_train["attr2"]["distribution"]["0"][0]) - self.assertEqual(21, attr_train["attr2"]["distribution"]["1"][0]) - self.assertEqual(35, attr_train["attr2"]["distribution"]["2"][0]) - - # check stats for test - stat_test = compute_ann_statistics(actual.get_subset("test")) - attr_test = stat_test["annotations"]["labels"]["attributes"] - self.assertEqual(21, attr_test["attr1"]["distribution"]["0"][0]) - self.assertEqual(15, attr_test["attr1"]["distribution"]["1"][0]) - self.assertEqual(12, attr_test["attr2"]["distribution"]["0"][0]) - self.assertEqual(9, attr_test["attr2"]["distribution"]["1"][0]) - self.assertEqual(15, attr_test["attr2"]["distribution"]["2"][0]) - + task = splitter.SplitTask.classification.name + + with self.subTest("zero remainder"): + splits = [("train", 0.7), ("test", 0.3)] + actual = splitter.Split(source, task, splits, seed=100) + + self.assertEqual(84, len(actual.get_subset("train"))) + self.assertEqual(36, len(actual.get_subset("test"))) + + # check stats for train + stat_train = compute_ann_statistics(actual.get_subset("train")) + attr_train = stat_train["annotations"]["labels"]["attributes"] + self.assertEqual(49, attr_train["attr1"]["distribution"]["0"][0]) + self.assertEqual(35, 
attr_train["attr1"]["distribution"]["1"][0]) + self.assertEqual(28, attr_train["attr2"]["distribution"]["0"][0]) + self.assertEqual(21, attr_train["attr2"]["distribution"]["1"][0]) + self.assertEqual(35, attr_train["attr2"]["distribution"]["2"][0]) + + # check stats for test + stat_test = compute_ann_statistics(actual.get_subset("test")) + attr_test = stat_test["annotations"]["labels"]["attributes"] + self.assertEqual(21, attr_test["attr1"]["distribution"]["0"][0]) + self.assertEqual(15, attr_test["attr1"]["distribution"]["1"][0]) + self.assertEqual(12, attr_test["attr2"]["distribution"]["0"][0]) + self.assertEqual(9, attr_test["attr2"]["distribution"]["1"][0]) + self.assertEqual(15, attr_test["attr2"]["distribution"]["2"][0]) + + with self.subTest("non-zero remainder"): + splits = [("train", 0.95), ("test", 0.05)] + actual = splitter.Split(source, task, splits, seed=100) + + self.assertEqual(114, len(actual.get_subset("train"))) + self.assertEqual(6, len(actual.get_subset("test"))) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_split_for_classification_multi_label_with_attr(self): counts = { (0, 0): 20, @@ -163,9 +191,10 @@ def test_split_for_classification_multi_label_with_attr(self): "label2": {"attrs": attr2, "counts": counts}, } source = self._generate_dataset(config) + task = splitter.SplitTask.classification.name splits = [("train", 0.7), ("test", 0.3)] - actual = splitter.ClassificationSplit(source, splits) + actual = splitter.Split(source, task, splits, seed=100) train = actual.get_subset("train") test = actual.get_subset("test") @@ -203,75 +232,83 @@ def test_split_for_classification_multi_label_with_attr(self): self.assertEqual(15, attr_test["attr3"]["distribution"]["2"][0]) with self.subTest("random seed test"): - r1 = splitter.ClassificationSplit(source, splits, seed=1234) - r2 = splitter.ClassificationSplit(source, splits, seed=1234) - r3 = splitter.ClassificationSplit(source, splits, seed=4321) - self.assertEqual( - 
list(r1.get_subset("test")), list(r2.get_subset("test")) - ) + r1 = splitter.Split(source, task, splits, seed=1234) + r2 = splitter.Split(source, task, splits, seed=1234) + r3 = splitter.Split(source, task, splits, seed=4321) + self.assertEqual(list(r1.get_subset("test")), list(r2.get_subset("test"))) self.assertNotEqual( list(r1.get_subset("test")), list(r3.get_subset("test")) ) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_split_for_classification_zero_ratio(self): config = { "label1": {"attrs": None, "counts": 5}, } source = self._generate_dataset(config) splits = [("train", 0.1), ("val", 0.9), ("test", 0.0)] + task = splitter.SplitTask.classification.name + + actual = splitter.Split(source, task, splits, seed=100) - actual = splitter.ClassificationSplit(source, splits) - self.assertEqual(1, len(actual.get_subset("train"))) self.assertEqual(4, len(actual.get_subset("val"))) self.assertEqual(0, len(actual.get_subset("test"))) - def test_split_for_classification_gives_error(self): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_split_for_classification_unlabeled(self): with self.subTest("no label"): - source = Dataset.from_iterable([ - DatasetItem(1, annotations=[]), - DatasetItem(2, annotations=[]), - ], categories=["a", "b", "c"]) + iterable = [DatasetItem(i, annotations=[]) for i in range(10)] + source = Dataset.from_iterable(iterable, categories=["a", "b"]) + splits = [("train", 0.7), ("test", 0.3)] + task = splitter.SplitTask.classification.name + actual = splitter.Split(source, task, splits, seed=100) - with self.assertRaisesRegex(Exception, "exactly one is expected"): - splits = [("train", 0.7), ("test", 0.3)] - actual = splitter.ClassificationSplit(source, splits) - len(actual.get_subset("train")) + self.assertEqual(7, len(actual.get_subset("train"))) + self.assertEqual(3, len(actual.get_subset("test"))) with self.subTest("multi label"): - source = Dataset.from_iterable([ - DatasetItem(1, annotations=[Label(0), Label(1)]), - 
DatasetItem(2, annotations=[Label(0), Label(2)]), - ], categories=["a", "b", "c"]) + anns = [Label(0), Label(1)] + iterable = [DatasetItem(i, annotations=anns) for i in range(10)] + source = Dataset.from_iterable(iterable, categories=["a", "b"]) + splits = [("train", 0.7), ("test", 0.3)] + task = splitter.SplitTask.classification.name + actual = splitter.Split(source, task, splits, seed=100) - with self.assertRaisesRegex(Exception, "exactly one is expected"): - splits = [("train", 0.7), ("test", 0.3)] - splitter.ClassificationSplit(source, splits) - len(actual.get_subset("train")) + self.assertEqual(7, len(actual.get_subset("train"))) + self.assertEqual(3, len(actual.get_subset("test"))) - source = Dataset.from_iterable([ - DatasetItem(1, annotations=[Label(0)]), - DatasetItem(2, annotations=[Label(1)]), - ], categories=["a", "b", "c"]) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_split_for_classification_gives_error(self): + source = Dataset.from_iterable( + [ + DatasetItem(1, annotations=[Label(0)]), + DatasetItem(2, annotations=[Label(1)]), + ], + categories=["a", "b", "c"], + ) + task = splitter.SplitTask.classification.name with self.subTest("wrong ratio"): with self.assertRaisesRegex(Exception, "in the range"): splits = [("train", -0.5), ("test", 1.5)] - splitter.ClassificationSplit(source, splits) + splitter.Split(source, task, splits) with self.assertRaisesRegex(Exception, "Sum of ratios"): splits = [("train", 0.5), ("test", 0.5), ("val", 0.5)] - splitter.ClassificationSplit(source, splits) + splitter.Split(source, task, splits) - with self.subTest("wrong subset name"): - with self.assertRaisesRegex(Exception, "Subset name"): - splits = [("train_", 0.5), ("val", 0.2), ("test", 0.3)] - splitter.ClassificationSplit(source, splits) + with self.subTest("duplicated subset name"): + with self.assertRaisesRegex(Exception, "duplicated"): + splits = [("train", 0.5), ("train", 0.2), ("test", 0.3)] + splitter.Split(source, task, splits) + 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_split_for_reidentification(self): - ''' + """ Test ReidentificationSplit using Dataset with label (ImageNet style) - ''' + """ + def _get_present(stat): values_present = [] for label, dist in stat["distribution"].items(): @@ -295,9 +332,9 @@ def _get_present(stat): attr_for_id = None source = self._generate_dataset(config) splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] + task = splitter.SplitTask.reid.name query = 0.4 / 0.7 - actual = splitter.ReidentificationSplit(source, - splits, query, attr_for_id) + actual = splitter.Split(source, task, splits, query, attr_for_id) stats = dict() for sname in ["train", "val", "test-query", "test-gallery"]: @@ -344,10 +381,11 @@ def _get_present(stat): self.assertEqual(int(total * 0.3 / 0.7), dist_gallery[pid][0]) self.assertEqual(int(total * 0.4 / 0.7), dist_query[pid][0]) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_split_for_reidentification_randomseed(self): - ''' + """ Test randomseed for reidentification - ''' + """ counts = {} config = dict() for i in range(10): @@ -356,60 +394,60 @@ def test_split_for_reidentification_randomseed(self): counts[label] = count config[label] = {"attrs": None, "counts": count} source = self._generate_dataset(config) + task = splitter.SplitTask.reid.name splits = [("train", 0.5), ("test", 0.5)] query = 0.4 / 0.7 - r1 = splitter.ReidentificationSplit(source, splits, query, seed=1234) - r2 = splitter.ReidentificationSplit(source, splits, query, seed=1234) - r3 = splitter.ReidentificationSplit(source, splits, query, seed=4321) - self.assertEqual( - list(r1.get_subset("train")), list(r2.get_subset("train")) - ) - self.assertNotEqual( - list(r1.get_subset("train")), list(r3.get_subset("train")) - ) + r1 = splitter.Split(source, task, splits, query, seed=1234) + r2 = splitter.Split(source, task, splits, query, seed=1234) + r3 = splitter.Split(source, task, splits, query, seed=4321) + 
self.assertEqual(list(r1.get_subset("train")), list(r2.get_subset("train"))) + self.assertNotEqual(list(r1.get_subset("train")), list(r3.get_subset("train"))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_split_for_reidentification_rebalance(self): - ''' + """ rebalance function shouldn't gives error when there's no exchange - ''' + """ config = dict() for i in range(100): label = "label%03d" % i config[label] = {"attrs": None, "counts": 7} source = self._generate_dataset(config) + task = splitter.SplitTask.reid.name splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] query = 0.4 / 0.7 - actual = splitter.ReidentificationSplit(source, splits, query) + actual = splitter.Split(source, task, splits, query, seed=100) self.assertEqual(350, len(actual.get_subset("train"))) self.assertEqual(140, len(actual.get_subset("val"))) self.assertEqual(90, len(actual.get_subset("test-gallery"))) self.assertEqual(120, len(actual.get_subset("test-query"))) - def test_split_for_reidentification_gives_error(self): - query = 0.4 / 0.7 # valid query ratio + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_split_for_reidentification_unlabeled(self): + query = 0.5 + task = splitter.SplitTask.reid.name with self.subTest("no label"): - source = Dataset.from_iterable([ - DatasetItem(1, annotations=[]), - DatasetItem(2, annotations=[]), - ], categories=["a", "b", "c"]) + iterable = [DatasetItem(i, annotations=[]) for i in range(10)] + source = Dataset.from_iterable(iterable, categories=["a", "b"]) + splits = [("train", 0.6), ("test", 0.4)] + actual = splitter.Split(source, task, splits, query, seed=100) + self.assertEqual(10, len(actual.get_subset("not-supported"))) - with self.assertRaisesRegex(Exception, "exactly one is expected"): - splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] - actual = splitter.ReidentificationSplit(source, splits, query) - len(actual.get_subset("train")) + with self.subTest("multi label"): + anns = [Label(0), Label(1)] + iterable = 
[DatasetItem(i, annotations=anns) for i in range(10)] + source = Dataset.from_iterable(iterable, categories=["a", "b"]) + splits = [("train", 0.6), ("test", 0.4)] + actual = splitter.Split(source, task, splits, query, seed=100) - with self.subTest(msg="multi label"): - source = Dataset.from_iterable([ - DatasetItem(1, annotations=[Label(0), Label(1)]), - DatasetItem(2, annotations=[Label(0), Label(2)]), - ], categories=["a", "b", "c"]) + self.assertEqual(10, len(actual.get_subset("not-supported"))) - with self.assertRaisesRegex(Exception, "exactly one is expected"): - splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] - actual = splitter.ReidentificationSplit(source, splits, query) - len(actual.get_subset("train")) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_split_for_reidentification_gives_error(self): + query = 0.4 / 0.7 # valid query ratio + task = splitter.SplitTask.reid.name counts = {i: (i % 3 + 1) * 7 for i in range(10)} config = {"person": {"attrs": ["PID"], "counts": counts}} @@ -417,30 +455,35 @@ def test_split_for_reidentification_gives_error(self): with self.subTest("wrong ratio"): with self.assertRaisesRegex(Exception, "in the range"): splits = [("train", -0.5), ("val", 0.2), ("test", 0.3)] - splitter.ReidentificationSplit(source, splits, query) + splitter.Split(source, task, splits, query) with self.assertRaisesRegex(Exception, "Sum of ratios"): splits = [("train", 0.6), ("val", 0.2), ("test", 0.3)] - splitter.ReidentificationSplit(source, splits, query) + splitter.Split(source, task, splits, query) with self.assertRaisesRegex(Exception, "in the range"): splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] - actual = splitter.ReidentificationSplit(source, splits, -query) + actual = splitter.Split(source, task, splits, -query) + + with self.subTest("duplicated subset name"): + with self.assertRaisesRegex(Exception, "duplicated"): + splits = [("train", 0.5), ("train", 0.2), ("test", 0.3)] + splitter.Split(source, task, splits, 
query) with self.subTest("wrong subset name"): with self.assertRaisesRegex(Exception, "Subset name"): splits = [("_train", 0.5), ("val", 0.2), ("test", 0.3)] - splitter.ReidentificationSplit(source, splits, query) + splitter.Split(source, task, splits, query) with self.subTest("wrong attribute name for person id"): splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] - actual = splitter.ReidentificationSplit(source, splits, query) + actual = splitter.Split(source, task, splits, query) with self.assertRaisesRegex(Exception, "Unknown subset"): actual.get_subset("test") - def _generate_detection_dataset(self, **kwargs): - append_bbox = kwargs.get("append_bbox") + def _generate_detection_segmentation_dataset(self, **kwargs): + annotation_type = kwargs.get("annotation_type") with_attr = kwargs.get("with_attr", False) nimages = kwargs.get("nimages", 10) @@ -469,10 +512,18 @@ def _generate_detection_dataset(self, **kwargs): attributes["attr0"] = attr_val % 3 attributes["attr%d" % (label_id + 1)] = attr_val % 2 for ann_id in range(count): - append_bbox(annotations, label_id=label_id, ann_id=ann_id, - attributes=attributes) - item = DatasetItem(img_id, subset=self._get_subset(img_id), - annotations=annotations, attributes={"id": img_id}) + annotation_type( + annotations, + label_id=label_id, + ann_id=ann_id, + attributes=attributes, + ) + item = DatasetItem( + img_id, + subset=self._get_subset(img_id), + annotations=annotations, + attributes={"id": img_id}, + ) iterable.append(item) dataset = Dataset.from_iterable(iterable, categories) @@ -482,7 +533,12 @@ def _generate_detection_dataset(self, **kwargs): def _get_append_bbox(dataset_type): def append_bbox_coco(annotations, **kwargs): annotations.append( - Bbox(1, 1, 2, 2, label=kwargs["label_id"], + Bbox( + 1, + 1, + 2, + 2, + label=kwargs["label_id"], id=kwargs["ann_id"], attributes=kwargs["attributes"], group=kwargs["ann_id"], @@ -494,7 +550,12 @@ def append_bbox_coco(annotations, **kwargs): def 
append_bbox_voc(annotations, **kwargs): annotations.append( - Bbox(1, 1, 2, 2, label=kwargs["label_id"], + Bbox( + 1, + 1, + 2, + 2, + label=kwargs["label_id"], id=kwargs["ann_id"] + 1, attributes=kwargs["attributes"], group=kwargs["ann_id"], @@ -504,7 +565,12 @@ def append_bbox_voc(annotations, **kwargs): Label(kwargs["label_id"], attributes=kwargs["attributes"]) ) annotations.append( - Bbox(1, 1, 2, 2, label=kwargs["label_id"] + 3, + Bbox( + 1, + 1, + 2, + 2, + label=kwargs["label_id"] + 3, group=kwargs["ann_id"], ) ) # part @@ -520,7 +586,12 @@ def append_bbox_yolo(annotations, **kwargs): def append_bbox_cvat(annotations, **kwargs): annotations.append( - Bbox(1, 1, 2, 2, label=kwargs["label_id"], + Bbox( + 1, + 1, + 2, + 2, + label=kwargs["label_id"], id=kwargs["ann_id"], attributes=kwargs["attributes"], group=kwargs["ann_id"], @@ -533,7 +604,12 @@ def append_bbox_cvat(annotations, **kwargs): def append_bbox_labelme(annotations, **kwargs): annotations.append( - Bbox(1, 1, 2, 2, label=kwargs["label_id"], + Bbox( + 1, + 1, + 2, + 2, + label=kwargs["label_id"], id=kwargs["ann_id"], attributes=kwargs["attributes"], ) @@ -544,7 +620,12 @@ def append_bbox_labelme(annotations, **kwargs): def append_bbox_mot(annotations, **kwargs): annotations.append( - Bbox(1, 1, 2, 2, label=kwargs["label_id"], + Bbox( + 1, + 1, + 2, + 2, + label=kwargs["label_id"], attributes=kwargs["attributes"], ) ) @@ -553,9 +634,7 @@ def append_bbox_mot(annotations, **kwargs): ) def append_bbox_widerface(annotations, **kwargs): - annotations.append( - Bbox(1, 1, 2, 2, attributes=kwargs["attributes"]) - ) + annotations.append(Bbox(1, 1, 2, 2, attributes=kwargs["attributes"])) annotations.append(Label(0, attributes=kwargs["attributes"])) functions = { @@ -571,8 +650,170 @@ def append_bbox_widerface(annotations, **kwargs): func = functions.get(dataset_type, append_bbox_cvat) return func + @staticmethod + def _get_append_mask(dataset_type): + def append_mask_coco(annotations, **kwargs): + 
annotations.append( + Mask( + np.array([[0, 0, 0, 1, 0]]), + label=kwargs["label_id"], + id=kwargs["ann_id"], + attributes=kwargs["attributes"], + group=kwargs["ann_id"], + ) + ) + annotations.append( + Label(kwargs["label_id"], attributes=kwargs["attributes"]) + ) + + def append_mask_voc(annotations, **kwargs): + annotations.append( + Mask( + np.array([[0, 0, 0, 1, 0]]), + label=kwargs["label_id"], + id=kwargs["ann_id"] + 1, + attributes=kwargs["attributes"], + group=kwargs["ann_id"], + ) + ) # obj + annotations.append( + Label(kwargs["label_id"], attributes=kwargs["attributes"]) + ) + annotations.append( + Mask( + np.array([[0, 0, 0, 1, 0]]), + label=kwargs["label_id"] + 3, + group=kwargs["ann_id"], + ) + ) # part + annotations.append( + Label(kwargs["label_id"] + 3, attributes=kwargs["attributes"]) + ) + + def append_mask_labelme(annotations, **kwargs): + annotations.append( + Mask( + np.array([[0, 0, 0, 1, 0]]), + label=kwargs["label_id"], + id=kwargs["ann_id"], + attributes=kwargs["attributes"], + ) + ) + annotations.append( + Label(kwargs["label_id"], attributes=kwargs["attributes"]) + ) + + def append_mask_mot(annotations, **kwargs): + annotations.append( + Mask( + np.array([[0, 0, 0, 1, 0]]), + label=kwargs["label_id"], + attributes=kwargs["attributes"], + ) + ) + annotations.append( + Label(kwargs["label_id"], attributes=kwargs["attributes"]) + ) + + functions = { + "coco": append_mask_coco, + "voc": append_mask_voc, + "labelme": append_mask_labelme, + "mot": append_mask_mot, + } + + func = functions.get(dataset_type, append_mask_coco) + return func + + @staticmethod + def _get_append_polygon(dataset_type): + def append_polygon_coco(annotations, **kwargs): + annotations.append( + Polygon( + [0, 0, 1, 0, 1, 2, 0, 2], + label=kwargs["label_id"], + id=kwargs["ann_id"], + attributes=kwargs["attributes"], + group=kwargs["ann_id"], + ) + ) + annotations.append( + Label(kwargs["label_id"], attributes=kwargs["attributes"]) + ) + + def 
append_polygon_voc(annotations, **kwargs): + annotations.append( + Polygon( + [0, 0, 1, 0, 1, 2, 0, 2], + label=kwargs["label_id"], + id=kwargs["ann_id"] + 1, + attributes=kwargs["attributes"], + group=kwargs["ann_id"], + ) + ) # obj + annotations.append( + Label(kwargs["label_id"], attributes=kwargs["attributes"]) + ) + annotations.append( + Polygon( + [0, 0, 1, 0, 1, 2, 0, 2], + label=kwargs["label_id"] + 3, + group=kwargs["ann_id"], + ) + ) # part + annotations.append( + Label(kwargs["label_id"] + 3, attributes=kwargs["attributes"]) + ) + + def append_polygon_yolo(annotations, **kwargs): + annotations.append(Bbox(1, 1, 2, 2, label=kwargs["label_id"])) + annotations.append( + Label(kwargs["label_id"], attributes=kwargs["attributes"]) + ) + + def append_polygon_cvat(annotations, **kwargs): + annotations.append( + Polygon( + [0, 0, 1, 0, 1, 2, 0, 2], + label=kwargs["label_id"], + id=kwargs["ann_id"], + attributes=kwargs["attributes"], + group=kwargs["ann_id"], + z_order=kwargs["ann_id"], + ) + ) + annotations.append( + Label(kwargs["label_id"], attributes=kwargs["attributes"]) + ) + + def append_polygon_labelme(annotations, **kwargs): + annotations.append( + Polygon( + [0, 0, 1, 0, 1, 2, 0, 2], + label=kwargs["label_id"], + id=kwargs["ann_id"], + attributes=kwargs["attributes"], + ) + ) + annotations.append( + Label(kwargs["label_id"], attributes=kwargs["attributes"]) + ) + + functions = { + "coco": append_polygon_coco, + "voc": append_polygon_voc, + "yolo": append_polygon_yolo, + "cvat": append_polygon_cvat, + "labelme": append_polygon_labelme, + } + + func = functions.get(dataset_type, append_polygon_coco) + return func + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_split_for_detection(self): dtypes = ["coco", "voc", "yolo", "cvat", "labelme", "mot", "widerface"] + task = splitter.SplitTask.detection.name params = [] for dtype in dtypes: for with_attr in [False, True]: @@ -580,8 +821,8 @@ def test_split_for_detection(self): params.append((dtype, 
with_attr, 10, 7, 0, 3)) for dtype, with_attr, nimages, train, val, test in params: - source, _ = self._generate_detection_dataset( - append_bbox=self._get_append_bbox(dtype), + source, _ = self._generate_detection_segmentation_dataset( + annotation_type=self._get_append_bbox(dtype), with_attr=with_attr, nimages=nimages, ) @@ -598,59 +839,317 @@ def test_split_for_detection(self): train=train, val=val, test=test, + task=task, ): - actual = splitter.DetectionSplit(source, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(train, len(actual.get_subset("train"))) self.assertEqual(val, len(actual.get_subset("val"))) self.assertEqual(test, len(actual.get_subset("test"))) # random seed test - source, _ = self._generate_detection_dataset( - append_bbox=self._get_append_bbox("cvat"), + source, _ = self._generate_detection_segmentation_dataset( + annotation_type=self._get_append_bbox("cvat"), with_attr=True, nimages=10, ) splits = [("train", 0.5), ("test", 0.5)] - r1 = splitter.DetectionSplit(source, splits, seed=1234) - r2 = splitter.DetectionSplit(source, splits, seed=1234) - r3 = splitter.DetectionSplit(source, splits, seed=4321) - self.assertEqual( - list(r1.get_subset("test")), list(r2.get_subset("test")) - ) - self.assertNotEqual( - list(r1.get_subset("test")), list(r3.get_subset("test")) + r1 = splitter.Split(source, task, splits, seed=1234) + r2 = splitter.Split(source, task, splits, seed=1234) + r3 = splitter.Split(source, task, splits, seed=4321) + self.assertEqual(list(r1.get_subset("test")), list(r2.get_subset("test"))) + self.assertNotEqual(list(r1.get_subset("test")), list(r3.get_subset("test"))) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_split_for_detection_with_unlabeled(self): + source, _ = self._generate_detection_segmentation_dataset( + annotation_type=self._get_append_bbox("cvat"), + with_attr=True, + nimages=10, ) + for i in range(10): + source.put(DatasetItem(i + 10, annotations={})) - def 
test_split_for_detection_gives_error(self): - with self.subTest(msg="bbox annotation"): - source = Dataset.from_iterable([ - DatasetItem(1, annotations=[Label(0), Label(1)]), - DatasetItem(2, annotations=[Label(0), Label(2)]), - ], categories=["a", "b", "c"]) - - with self.assertRaisesRegex(Exception, "more than one bbox"): - splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] - actual = splitter.DetectionSplit(source, splits) - len(actual.get_subset("train")) + splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] + task = splitter.SplitTask.detection.name + actual = splitter.Split(source, task, splits, seed=100) + self.assertEqual(10, len(actual.get_subset("train"))) + self.assertEqual(4, len(actual.get_subset("val"))) + self.assertEqual(6, len(actual.get_subset("test"))) - source, _ = self._generate_detection_dataset( - append_bbox=self._get_append_bbox("cvat"), + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_split_for_detection_gives_error(self): + source, _ = self._generate_detection_segmentation_dataset( + annotation_type=self._get_append_bbox("cvat"), with_attr=True, nimages=5, ) + task = splitter.SplitTask.detection.name with self.subTest("wrong ratio"): with self.assertRaisesRegex(Exception, "in the range"): splits = [("train", -0.5), ("test", 1.5)] - splitter.DetectionSplit(source, splits) + splitter.Split(source, task, splits) with self.assertRaisesRegex(Exception, "Sum of ratios"): splits = [("train", 0.5), ("test", 0.5), ("val", 0.5)] - splitter.DetectionSplit(source, splits) + splitter.Split(source, task, splits) + + with self.subTest("duplicated subset name"): + with self.assertRaisesRegex(Exception, "duplicated"): + splits = [("train", 0.5), ("train", 0.2), ("test", 0.3)] + splitter.Split(source, task, splits) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_no_subset_name_and_count_restriction(self): + splits = [ + ("_train", 0.5), + ("valid", 0.1), + ("valid2", 0.1), + ("test*", 0.2), + ("test2", 0.1), + ] + + 
with self.subTest("classification"): + config = {"label1": {"attrs": None, "counts": 10}} + task = splitter.SplitTask.classification.name + source = self._generate_dataset(config) + actual = splitter.Split(source, task, splits, seed=100) + self.assertEqual(5, len(actual.get_subset("_train"))) + self.assertEqual(1, len(actual.get_subset("valid"))) + self.assertEqual(1, len(actual.get_subset("valid2"))) + self.assertEqual(2, len(actual.get_subset("test*"))) + self.assertEqual(1, len(actual.get_subset("test2"))) + + with self.subTest("detection"): + source, _ = self._generate_detection_segmentation_dataset( + annotation_type=self._get_append_bbox("cvat"), + with_attr=True, + nimages=10, + ) + task = splitter.SplitTask.detection.name + actual = splitter.Split(source, task, splits, seed=21) + self.assertEqual(4, len(actual.get_subset("_train"))) + self.assertEqual(1, len(actual.get_subset("valid"))) + self.assertEqual(2, len(actual.get_subset("valid2"))) + self.assertEqual(2, len(actual.get_subset("test*"))) + self.assertEqual(1, len(actual.get_subset("test2"))) + + with self.subTest("segmentation"): + source, _ = self._generate_detection_segmentation_dataset( + annotation_type=self._get_append_mask("coco"), + with_attr=True, + nimages=10, + ) + task = splitter.SplitTask.detection.name + actual = splitter.Split(source, task, splits, seed=100) + self.assertEqual(5, len(actual.get_subset("_train"))) + self.assertEqual(1, len(actual.get_subset("valid"))) + self.assertEqual(1, len(actual.get_subset("valid2"))) + self.assertEqual(2, len(actual.get_subset("test*"))) + self.assertEqual(1, len(actual.get_subset("test2"))) + + source, _ = self._generate_detection_segmentation_dataset( + annotation_type=self._get_append_polygon("coco"), + with_attr=True, + nimages=10, + ) + actual = splitter.Split(source, task, splits, seed=100) + self.assertEqual(5, len(actual.get_subset("_train"))) + self.assertEqual(1, len(actual.get_subset("valid"))) + self.assertEqual(1, 
len(actual.get_subset("valid2"))) + self.assertEqual(2, len(actual.get_subset("test*"))) + self.assertEqual(1, len(actual.get_subset("test2"))) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_split_for_segmentation(self): + + with self.subTest("mask annotation"): + dtypes = ["coco", "voc", "labelme", "mot"] + task = splitter.SplitTask.segmentation.name + params = [] + for dtype in dtypes: + for with_attr in [False, True]: + params.append((dtype, with_attr, 10, 5, 3, 2)) + params.append((dtype, with_attr, 10, 7, 0, 3)) + + for dtype, with_attr, nimages, train, val, test in params: + source, _ = self._generate_detection_segmentation_dataset( + annotation_type=self._get_append_mask(dtype), + with_attr=with_attr, + nimages=nimages, + ) + total = np.sum([train, val, test]) + splits = [ + ("train", train / total), + ("val", val / total), + ("test", test / total), + ] + with self.subTest( + dtype=dtype, + with_attr=with_attr, + nimage=nimages, + train=train, + val=val, + test=test, + task=task, + ): + actual = splitter.Split(source, task, splits, seed=100) + + self.assertEqual(train, len(actual.get_subset("train"))) + self.assertEqual(val, len(actual.get_subset("val"))) + self.assertEqual(test, len(actual.get_subset("test"))) + + # random seed test + source, _ = self._generate_detection_segmentation_dataset( + annotation_type=self._get_append_mask("coco"), + with_attr=True, + nimages=10, + ) - with self.subTest("wrong subset name"): - with self.assertRaisesRegex(Exception, "Subset name"): - splits = [("train_", 0.5), ("val", 0.2), ("test", 0.3)] - splitter.DetectionSplit(source, splits) + splits = [("train", 0.5), ("test", 0.5)] + r1 = splitter.Split(source, task, splits, seed=1234) + r2 = splitter.Split(source, task, splits, seed=1234) + r3 = splitter.Split(source, task, splits, seed=4321) + self.assertEqual(list(r1.get_subset("test")), list(r2.get_subset("test"))) + self.assertNotEqual( + list(r1.get_subset("test")), list(r3.get_subset("test")) + ) + + 
with self.subTest("polygon annotation"): + dtypes = ["coco", "voc", "labelme", "yolo", "cvat"] + task = splitter.SplitTask.segmentation.name + params = [] + for dtype in dtypes: + for with_attr in [False, True]: + params.append((dtype, with_attr, 10, 5, 3, 2)) + params.append((dtype, with_attr, 10, 7, 0, 3)) + + expected = [] + for dtype, with_attr, nimages, train, val, test in params: + source, _ = self._generate_detection_segmentation_dataset( + annotation_type=self._get_append_polygon(dtype), + with_attr=with_attr, + nimages=nimages, + ) + total = np.sum([train, val, test]) + splits = [ + ("train", train / total), + ("val", val / total), + ("test", test / total), + ] + with self.subTest( + dtype=dtype, + with_attr=with_attr, + nimage=nimages, + train=train, + val=val, + test=test, + task=task, + ): + actual = splitter.Split(source, task, splits, seed=21) + + expected.append([dtype, with_attr, len(actual.get_subset("train")), len(actual.get_subset("val")), len(actual.get_subset("test"))]) + + self.assertEqual(train, len(actual.get_subset("train"))) + self.assertEqual(val, len(actual.get_subset("val"))) + self.assertEqual(test, len(actual.get_subset("test"))) + + # random seed test + source, _ = self._generate_detection_segmentation_dataset( + annotation_type=self._get_append_polygon("coco"), + with_attr=True, + nimages=10, + ) + + splits = [("train", 0.5), ("test", 0.5)] + r1 = splitter.Split(source, task, splits, seed=1234) + r2 = splitter.Split(source, task, splits, seed=1234) + r3 = splitter.Split(source, task, splits, seed=4321) + self.assertEqual(list(r1.get_subset("test")), list(r2.get_subset("test"))) + self.assertNotEqual( + list(r1.get_subset("test")), list(r3.get_subset("test")) + ) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_split_for_segmentation_with_unlabeled(self): + + with self.subTest("mask annotation"): + source, _ = self._generate_detection_segmentation_dataset( + annotation_type=self._get_append_mask("coco"), + 
with_attr=True, + nimages=10, + ) + for i in range(10): + source.put(DatasetItem(i + 10, annotations={})) + + splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] + task = splitter.SplitTask.segmentation.name + actual = splitter.Split(source, task, splits, seed=100) + self.assertEqual(10, len(actual.get_subset("train"))) + self.assertEqual(4, len(actual.get_subset("val"))) + self.assertEqual(6, len(actual.get_subset("test"))) + + with self.subTest("polygon annotation"): + source, _ = self._generate_detection_segmentation_dataset( + annotation_type=self._get_append_polygon("coco"), + with_attr=True, + nimages=10, + ) + for i in range(10): + source.put(DatasetItem(i + 10, annotations={})) + + splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] + task = splitter.SplitTask.segmentation.name + actual = splitter.Split(source, task, splits, seed=100) + self.assertEqual(10, len(actual.get_subset("train"))) + self.assertEqual(4, len(actual.get_subset("val"))) + self.assertEqual(6, len(actual.get_subset("test"))) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_split_for_segmentation_gives_error(self): + + with self.subTest("mask annotation"): + source, _ = self._generate_detection_segmentation_dataset( + annotation_type=self._get_append_mask("coco"), + with_attr=True, + nimages=5, + ) + task = splitter.SplitTask.segmentation.name + + with self.subTest("wrong ratio"): + with self.assertRaisesRegex(Exception, "in the range"): + splits = [("train", -0.5), ("test", 1.5)] + splitter.Split(source, task, splits) + + with self.assertRaisesRegex(Exception, "Sum of ratios"): + splits = [("train", 0.5), ("test", 0.5), ("val", 0.5)] + splitter.Split(source, task, splits) + + with self.subTest("duplicated subset name"): + with self.assertRaisesRegex(Exception, "duplicated"): + splits = [("train", 0.5), ("train", 0.2), ("test", 0.3)] + splitter.Split(source, task, splits) + + with self.subTest("polygon annotation"): + source, _ = 
self._generate_detection_segmentation_dataset( + annotation_type=self._get_append_polygon("coco"), + with_attr=True, + nimages=5, + ) + task = splitter.SplitTask.segmentation.name + + with self.subTest("wrong ratio"): + with self.assertRaisesRegex(Exception, "in the range"): + splits = [("train", -0.5), ("test", 1.5)] + splitter.Split(source, task, splits) + + with self.assertRaisesRegex(Exception, "Sum of ratios"): + splits = [("train", 0.5), ("test", 0.5), ("val", 0.5)] + splitter.Split(source, task, splits) + + with self.subTest("duplicated subset name"): + with self.assertRaisesRegex(Exception, "duplicated"): + splits = [("train", 0.5), ("train", 0.2), ("test", 0.3)] + splitter.Split(source, task, splits) diff --git a/tests/test_tfrecord_format.py b/tests/test_tfrecord_format.py index 6db7c07eb3b5..1b147fa3e804 100644 --- a/tests/test_tfrecord_format.py +++ b/tests/test_tfrecord_format.py @@ -13,6 +13,7 @@ from datumaro.util.test_utils import (TestDir, compare_datasets, test_save_and_load) from datumaro.util.tf_util import check_import +from .requirements import Requirements, mark_requirement try: from datumaro.plugins.tf_detection_api_format.extractor import \ @@ -28,6 +29,7 @@ @skipIf(not module_found, "Tensorflow package is not found") class TfImportTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_raises_when_crashes_on_import(self): # Should fire if import can't be done for any reason except # module unavailability and import crash @@ -42,6 +44,7 @@ def _test_save_and_load(self, source_dataset, converter, test_dir, importer='tf_detection_api', target_dataset=target_dataset, importer_args=importer_args, **kwargs) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_bboxes(self): test_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train', @@ -63,6 +66,7 @@ def test_can_save_bboxes(self): partial(TfDetectionApiConverter.convert, save_images=True), test_dir) + 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_masks(self): test_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train', image=np.ones((4, 5, 3)), @@ -87,6 +91,7 @@ def test_can_save_masks(self): partial(TfDetectionApiConverter.convert, save_masks=True), test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_no_subsets(self): test_dataset = Dataset.from_iterable([ DatasetItem(id=1, @@ -121,6 +126,7 @@ def test_can_save_dataset_with_no_subsets(self): partial(TfDetectionApiConverter.convert, save_images=True), test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): test_dataset = Dataset.from_iterable([ DatasetItem(id='кириллица с пробелом', @@ -142,6 +148,7 @@ def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): partial(TfDetectionApiConverter.convert, save_images=True), test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_image_info(self): test_dataset = Dataset.from_iterable([ DatasetItem(id='1/q.e', @@ -154,6 +161,7 @@ def test_can_save_dataset_with_image_info(self): self._test_save_and_load(test_dataset, TfDetectionApiConverter.convert, test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_unknown_image_formats(self): test_dataset = Dataset.from_iterable([ DatasetItem(id=1, @@ -173,6 +181,7 @@ def test_can_save_dataset_with_unknown_image_formats(self): partial(TfDetectionApiConverter.convert, save_images=True), test_dir, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_arbitrary_extension(self): dataset = Dataset.from_iterable([ DatasetItem('q/1', subset='train', @@ -188,6 +197,7 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): partial(TfDetectionApiConverter.convert, save_images=True), test_dir, require_images=True) + 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_inplace_save_writes_only_updated_data(self): with TestDir() as path: # generate initial dataset @@ -209,6 +219,7 @@ def test_inplace_save_writes_only_updated_data(self): self.assertFalse(osp.isfile(osp.join(path, 'b.tfrecord'))) self.assertTrue(osp.isfile(osp.join(path, 'c.tfrecord'))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_labelmap_parsing(self): text = """ { @@ -241,9 +252,11 @@ def test_labelmap_parsing(self): @skipIf(import_failed, "Failed to import tensorflow") class TfrecordImporterTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect(self): self.assertTrue(TfDetectionApiImporter.detect(DUMMY_DATASET_DIR)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import(self): target_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train', diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 5098d03634de..9ccd45023e3e 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -10,9 +10,11 @@ import datumaro.util.mask_tools as mask_tools import datumaro.plugins.transforms as transforms from datumaro.util.test_utils import compare_datasets +from .requirements import Requirements, mark_requirement class TransformsTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_reindex(self): source = Dataset.from_iterable([ DatasetItem(id=10), @@ -29,6 +31,7 @@ def test_reindex(self): actual = transforms.Reindex(source, start=5) compare_datasets(self, expected, actual) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_mask_to_polygons(self): source = Dataset.from_iterable([ DatasetItem(id=1, image=np.zeros((5, 10, 3)), annotations=[ @@ -53,6 +56,7 @@ def test_mask_to_polygons(self): actual = transforms.MasksToPolygons(source) compare_datasets(self, expected, actual) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_mask_to_polygons_small_polygons_message(self): 
source_dataset = Dataset.from_iterable([ DatasetItem(id=1, image=np.zeros((5, 10, 3)), annotations=[ @@ -74,6 +78,7 @@ def test_mask_to_polygons_small_polygons_message(self): compare_datasets(self, target_dataset, actual) self.assertRegex('\n'.join(logs.output), 'too small polygons') + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_polygons_to_masks(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=1, image=np.zeros((5, 10, 3)), annotations=[ @@ -106,6 +111,7 @@ def test_polygons_to_masks(self): actual = transforms.PolygonsToMasks(source_dataset) compare_datasets(self, target_dataset, actual) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_crop_covered_segments(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=1, image=np.zeros((5, 5, 3)), annotations=[ @@ -137,6 +143,7 @@ def test_crop_covered_segments(self): actual = transforms.CropCoveredSegments(source_dataset) compare_datasets(self, target_dataset, actual) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_merge_instance_segments(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=1, image=np.zeros((5, 5, 3)), @@ -184,6 +191,7 @@ def test_merge_instance_segments(self): include_polygons=True) compare_datasets(self, target_dataset, actual) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_map_subsets(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='a'), @@ -201,6 +209,7 @@ def test_map_subsets(self): { 'a': '', 'b': 'a' }) compare_datasets(self, target_dataset, actual) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_shapes_to_boxes(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=1, image=np.zeros((5, 5, 3)), @@ -233,6 +242,7 @@ def test_shapes_to_boxes(self): actual = transforms.ShapesToBoxes(source_dataset) compare_datasets(self, target_dataset, actual) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_id_from_image(self): source_dataset = Dataset.from_iterable([ 
DatasetItem(id=1, image='path.jpg'), @@ -246,6 +256,7 @@ def test_id_from_image(self): actual = transforms.IdFromImageName(source_dataset) compare_datasets(self, target_dataset, actual) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_boxes_to_masks(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=1, image=np.zeros((5, 5, 3)), @@ -291,6 +302,7 @@ def test_boxes_to_masks(self): actual = transforms.BoxesToMasks(source_dataset) compare_datasets(self, target_dataset, actual) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_random_split(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset="a"), @@ -310,6 +322,7 @@ def test_random_split(self): self.assertEqual(4, len(actual.get_subset('train'))) self.assertEqual(3, len(actual.get_subset('test'))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_random_split_gives_error_on_wrong_ratios(self): source_dataset = Dataset.from_iterable([DatasetItem(id=1)]) @@ -328,6 +341,7 @@ def test_random_split_gives_error_on_wrong_ratios(self): ('test', 1.5), ]) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_remap_labels(self): src_dataset = Dataset.from_iterable([ DatasetItem(id=1, annotations=[ @@ -336,15 +350,18 @@ def test_remap_labels(self): Bbox(1, 2, 3, 4, label=2), Mask(image=np.array([1]), label=3), - # Should be kept + # Should be deleted Polygon([1, 1, 2, 2, 3, 4], label=4), - PolyLine([1, 3, 4, 2, 5, 6]) + + # Should be kept + PolyLine([1, 3, 4, 2, 5, 6]), + Bbox(4, 3, 2, 1, label=5), ]) ], categories={ AnnotationType.label: LabelCategories.from_iterable( - 'label%s' % i for i in range(5)), + 'label%s' % i for i in range(6)), AnnotationType.mask: MaskCategories( - colormap=mask_tools.generate_colormap(5)), + colormap=mask_tools.generate_colormap(6)), }) dst_dataset = Dataset.from_iterable([ @@ -353,40 +370,50 @@ def test_remap_labels(self): Bbox(1, 2, 3, 4, label=0), Mask(image=np.array([1]), label=1), - Polygon([1, 1, 2, 2, 3, 4], label=2), - 
PolyLine([1, 3, 4, 2, 5, 6], label=None) + PolyLine([1, 3, 4, 2, 5, 6], label=None), + Bbox(4, 3, 2, 1, label=2), ]), ], categories={ AnnotationType.label: LabelCategories.from_iterable( - ['label0', 'label9', 'label4']), + ['label0', 'label9', 'label5']), AnnotationType.mask: MaskCategories(colormap={ - k: v for k, v in mask_tools.generate_colormap(5).items() - if k in { 0, 1, 3, 4 } + k: v for k, v in mask_tools.generate_colormap(6).items() + if k in { 0, 1, 3, 5 } }) }) actual = transforms.RemapLabels(src_dataset, mapping={ - 'label1': 'label9', - 'label2': 'label0', - 'label3': 'label9', + 'label1': 'label9', # rename & join with new label9 (from label3) + 'label2': 'label0', # rename & join with existing label0 + 'label3': 'label9', # rename & join with new label9 (from label1) + 'label4': '', # delete the label and associated annotations + # 'label5' - unchanged }, default='keep') compare_datasets(self, dst_dataset, actual) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_remap_labels_delete_unspecified(self): source_dataset = Dataset.from_iterable([ - DatasetItem(id=1, annotations=[ Label(0) ]) - ], categories=['label0']) + DatasetItem(id=1, annotations=[ + Label(0, id=0), # will be removed + Label(1, id=1), + Bbox(1, 2, 3, 4, label=None), + ]) + ], categories=['label0', 'label1']) target_dataset = Dataset.from_iterable([ - DatasetItem(id=1), - ], categories=[]) + DatasetItem(id=1, annotations=[ + Label(0, id=1), + ]), + ], categories=['label1']) actual = transforms.RemapLabels(source_dataset, - mapping={}, default='delete') + mapping={ 'label1': 'label1' }, default='delete') compare_datasets(self, target_dataset, actual) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_transform_labels(self): src_dataset = Dataset.from_iterable([ DatasetItem(id=1, annotations=[ diff --git a/tests/test_util.py b/tests/test_util.py index f19e5d4f95de..88f850eb3a38 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -6,9 +6,11 @@ from
datumaro.util import Rollback, error_rollback from datumaro.util.test_utils import TestDir from datumaro.util.os_util import walk +from .requirements import Requirements, mark_requirement class TestRollback(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_does_not_call_on_no_error(self): success = True def cb(): @@ -20,6 +22,7 @@ def cb(): self.assertTrue(success) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_calls_on_error(self): success = False def cb(): @@ -35,6 +38,7 @@ def cb(): finally: self.assertTrue(success) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_decorator_calls_on_error(self): success = False def cb(): @@ -53,6 +57,7 @@ def foo(on_error=None): finally: self.assertTrue(success) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_decorator_does_not_call_on_no_error(self): success = True def cb(): @@ -67,6 +72,7 @@ def foo(on_error=None): self.assertTrue(success) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_decorator_supports_implicit_arg(self): success = False def cb(): @@ -85,6 +91,7 @@ def foo(): finally: self.assertTrue(success) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_fowrard_args(self): success1 = False def cb1(a1, a2=None, ignore_errors=None): @@ -111,6 +118,7 @@ def cb2(a1, a2=None, ignore_errors=None): self.assertTrue(success2) class TestOsUtils(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_walk_with_maxdepth(self): with TestDir() as rootdir: os.makedirs(osp.join(rootdir, '1', '2', '3', '4')) diff --git a/tests/test_validator.py b/tests/test_validator.py index d6c8700d8025..2d0bd47e7877 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -8,16 +8,18 @@ from datumaro.components.dataset import Dataset, DatasetItem from datumaro.components.errors import (MissingLabelCategories, - MissingLabelAnnotation, MultiLabelAnnotations, MissingAttribute, + MissingAnnotation, MultiLabelAnnotations, 
MissingAttribute, UndefinedLabel, UndefinedAttribute, LabelDefinedButNotFound, AttributeDefinedButNotFound, OnlyOneLabel, FewSamplesInLabel, FewSamplesInAttribute, ImbalancedLabels, ImbalancedAttribute, - ImbalancedBboxDistInLabel, ImbalancedBboxDistInAttribute, - MissingBboxAnnotation, NegativeLength, InvalidValue, FarFromLabelMean, + ImbalancedDistInLabel, ImbalancedDistInAttribute, + NegativeLength, InvalidValue, FarFromLabelMean, FarFromAttrMean, OnlyOneAttributeValue) -from datumaro.components.extractor import Bbox, Label +from datumaro.components.extractor import Bbox, Label, Mask, Polygon from datumaro.components.validator import (ClassificationValidator, - DetectionValidator, TaskType, validate_annotations, _Validator) + DetectionValidator, TaskType, validate_annotations, _Validator, + SegmentationValidator) +from .requirements import Requirements, mark_requirement class TestValidatorTemplate(TestCase): @@ -29,12 +31,22 @@ def setUpClass(cls): Bbox(1, 2, 3, 4, id=1, label=0, attributes={ 'a': 1, 'b': 2, }), + Mask(id=2, label=0, attributes={'a': 1, 'b': 2}, + image=np.array([[0, 0, 0, 0, 0], + [0, 0, 1, 1, 1], + [0, 0, 1, 1, 1], + [0, 0, 1, 1, 1], + [0, 0, 1, 1, 1], + ])), ]), DatasetItem(id=2, image=np.ones((2, 4, 3)), annotations=[ Label(2, id=0, attributes={'a': 2, 'b': 2, }), Bbox(2, 3, 1, 4, id=1, label=0, attributes={ 'a': 1, 'b': 1, }), + Mask(id=2, label=0, attributes={'a': 1, 'b': 1}, + image=np.array([[1, 1, 1, 1], [0, 0, 0, 0]]) + ), ]), DatasetItem(id=3), DatasetItem(id=4, image=np.ones((2, 4, 3)), annotations=[ @@ -46,30 +58,54 @@ def setUpClass(cls): Bbox(3, 1, 4, 2, id=3, label=0, attributes={ 'a': 2, 'b': 2, }), + Polygon([1, 3, 1, 5, 5, 5, 5, 3], label=0, id=4, + attributes={'a': 2, 'b': 2, + }), + Polygon([3, 1, 3, 5, 5, 5, 5, 1], label=1, id=5, + attributes={'a': 2, 'b': 1, + }), ]), DatasetItem(id=5, image=np.ones((2, 4, 3)), annotations=[ Label(0, id=0, attributes={'a': 20, 'b': 10, }), Bbox(1, 2, 3, 4, id=1, label=1, attributes={ 'a': 
1, 'b': 1, }), + Polygon([1, 2, 1, 5, 5, 5, 5, 2], label=1, id=2, + attributes={'a': 1, 'b': 1, + }), ]), DatasetItem(id=6, image=np.ones((2, 4, 3)), annotations=[ Label(1, id=0, attributes={'a': 11, 'b': 2, 'c': 3, }), Bbox(2, 3, 4, 1, id=1, label=1, attributes={ 'a': 2, 'b': 2, }), + Mask(id=2, label=1, attributes={'a': 2, 'b': 2}, + image=np.array([[1, 0, 0], + [1, 0, 0], + [1, 0, 0], + [1, 0, 0], + ])), ]), DatasetItem(id=7, image=np.ones((2, 4, 3)), annotations=[ Label(1, id=0, attributes={'a': 1, 'b': 2, 'c': 5, }), Bbox(1, 2, 3, 4, id=1, label=2, attributes={ 'a': 1, 'b': 2, }), + Polygon([1, 2, 1, 5, 5, 5, 5, 2], label=2, id=2, + attributes={'a': 1, 'b': 2, + }), ]), DatasetItem(id=8, image=np.ones((2, 4, 3)), annotations=[ Label(2, id=0, attributes={'a': 7, 'b': 9, 'c': 5, }), Bbox(2, 1, 3, 4, id=1, label=2, attributes={ 'a': 2, 'b': 1, }), + Mask(id=2, label=2, attributes={'a': 2, 'b': 1}, + image=np.array([[1, 1, 1], + [1, 1, 1], + [1, 1, 1], + [1, 1, 1], + ])), ]), ], categories=[[f'label_{i}', None, {'a', 'b', }] for i in range(2)]) @@ -78,12 +114,16 @@ def setUpClass(cls): class TestBaseValidator(TestValidatorTemplate): @classmethod def setUpClass(cls): - cls.validator = _Validator(TaskType.classification) + cls.validator = _Validator(task_type=TaskType.classification, + few_samples_thr=1, imbalance_ratio_thr=50, far_from_mean_thr=5.0, + dominance_ratio_thr=0.8, topk_bins=0.1) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_generate_reports(self): with self.assertRaises(NotImplementedError): self.validator.generate_reports({}) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_check_missing_label_categories(self): stats = { 'label_distribution': { @@ -96,6 +136,7 @@ def test_check_missing_label_categories(self): self.assertTrue(len(actual_reports) == 1) self.assertIsInstance(actual_reports[0], MissingLabelCategories) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_check_missing_attribute(self): label_name = 'unit' 
attr_name = 'test' @@ -109,6 +150,7 @@ def test_check_missing_attribute(self): self.assertTrue(len(actual_reports) == 1) self.assertIsInstance(actual_reports[0], MissingAttribute) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_check_undefined_label(self): label_name = 'unittest' label_stats = { @@ -121,6 +163,7 @@ def test_check_undefined_label(self): self.assertTrue(len(actual_reports) == 1) self.assertIsInstance(actual_reports[0], UndefinedLabel) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_check_undefined_attribute(self): label_name = 'unit' attr_name = 'test' @@ -134,6 +177,7 @@ def test_check_undefined_attribute(self): self.assertTrue(len(actual_reports) == 1) self.assertIsInstance(actual_reports[0], UndefinedAttribute) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_check_label_defined_but_not_found(self): stats = { 'label_distribution': { @@ -149,6 +193,7 @@ def test_check_label_defined_but_not_found(self): self.assertTrue(len(actual_reports) == 1) self.assertIsInstance(actual_reports[0], LabelDefinedButNotFound) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_check_attribute_defined_but_not_found(self): label_name = 'unit' attr_stats = { @@ -163,6 +208,7 @@ def test_check_attribute_defined_but_not_found(self): self.assertTrue(len(actual_reports) == 1) self.assertIsInstance(actual_reports[0], AttributeDefinedButNotFound) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_check_only_one_label(self): stats = { 'label_distribution': { @@ -178,6 +224,7 @@ def test_check_only_one_label(self): self.assertTrue(len(actual_reports) == 1) self.assertIsInstance(actual_reports[0], OnlyOneLabel) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_check_only_one_attribute_value(self): label_name = 'unit' attr_name = 'test' @@ -193,12 +240,13 @@ def test_check_only_one_attribute_value(self): self.assertTrue(len(actual_reports) == 1) self.assertIsInstance(actual_reports[0], OnlyOneAttributeValue) + 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_check_few_samples_in_label(self): with self.subTest('Few Samples'): stats = { 'label_distribution': { 'defined_labels': { - 'unit': self.validator.DEFAULT_FEW_SAMPLES + 'unit': self.validator.few_samples_thr } } } @@ -212,7 +260,7 @@ def test_check_few_samples_in_label(self): stats = { 'label_distribution': { 'defined_labels': { - 'unit': self.validator.DEFAULT_FEW_SAMPLES + 1 + 'unit': self.validator.few_samples_thr + 1 } } } @@ -221,6 +269,7 @@ def test_check_few_samples_in_label(self): self.assertTrue(len(actual_reports) == 0) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_check_few_samples_in_attribute(self): label_name = 'unit' attr_name = 'test' @@ -228,7 +277,7 @@ def test_check_few_samples_in_attribute(self): with self.subTest('Few Samples'): attr_dets = { 'distribution': { - 'mock': self.validator.DEFAULT_FEW_SAMPLES + 'mock': self.validator.few_samples_thr } } @@ -241,7 +290,7 @@ def test_check_few_samples_in_attribute(self): with self.subTest('No Few Samples Warning'): attr_dets = { 'distribution': { - 'mock': self.validator.DEFAULT_FEW_SAMPLES + 1 + 'mock': self.validator.few_samples_thr + 1 } } @@ -250,12 +299,13 @@ def test_check_few_samples_in_attribute(self): self.assertTrue(len(actual_reports) == 0) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_check_imbalanced_labels(self): with self.subTest('Imbalance'): stats = { 'label_distribution': { 'defined_labels': { - 'unit': self.validator.DEFAULT_IMBALANCE_RATIO, + 'unit': self.validator.imbalance_ratio_thr, 'test': 1 } } @@ -270,7 +320,7 @@ def test_check_imbalanced_labels(self): stats = { 'label_distribution': { 'defined_labels': { - 'unit': self.validator.DEFAULT_IMBALANCE_RATIO - 1, + 'unit': self.validator.imbalance_ratio_thr - 1, 'test': 1 } } @@ -280,6 +330,7 @@ def test_check_imbalanced_labels(self): self.assertTrue(len(actual_reports) == 0) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def 
test_check_imbalanced_attribute(self): label_name = 'unit' attr_name = 'test' @@ -287,7 +338,7 @@ def test_check_imbalanced_attribute(self): with self.subTest('Imbalance'): attr_dets = { 'distribution': { - 'mock': self.validator.DEFAULT_IMBALANCE_RATIO, + 'mock': self.validator.imbalance_ratio_thr, 'mock_1': 1 } } @@ -301,7 +352,7 @@ def test_check_imbalanced_attribute(self): with self.subTest('No Imbalance Warning'): attr_dets = { 'distribution': { - 'mock': self.validator.DEFAULT_IMBALANCE_RATIO - 1, + 'mock': self.validator.imbalance_ratio_thr - 1, 'mock_1': 1 } } @@ -315,18 +366,22 @@ def test_check_imbalanced_attribute(self): class TestClassificationValidator(TestValidatorTemplate): @classmethod def setUpClass(cls): - cls.validator = ClassificationValidator() + cls.validator = ClassificationValidator(few_samples_thr=1, + imbalance_ratio_thr=50, far_from_mean_thr=5.0, + dominance_ratio_thr=0.8, topk_bins=0.1) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_check_missing_label_annotation(self): stats = { - 'items_missing_label': [(1, 'unittest')] + 'items_missing_annotation': [(1, 'unittest')] } - actual_reports = self.validator._check_missing_label_annotation(stats) + actual_reports = self.validator._check_missing_annotation(stats) self.assertTrue(len(actual_reports) == 1) - self.assertIsInstance(actual_reports[0], MissingLabelAnnotation) + self.assertIsInstance(actual_reports[0], MissingAnnotation) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_check_multi_label_annotations(self): stats = { 'items_with_multiple_labels': [(1, 'unittest')] @@ -341,11 +396,14 @@ def test_check_multi_label_annotations(self): class TestDetectionValidator(TestValidatorTemplate): @classmethod def setUpClass(cls): - cls.validator = DetectionValidator() + cls.validator = DetectionValidator(few_samples_thr=1, + imbalance_ratio_thr=50, far_from_mean_thr=5.0, + dominance_ratio_thr=0.8, topk_bins=0.1) - def test_check_imbalanced_bbox_dist_in_label(self): + 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_check_imbalanced_dist_in_label(self): label_name = 'unittest' - most = int(self.validator.DEFAULT_BBOX_IMBALANCE * 100) + most = int(self.validator.dominance_thr * 100) rest = 100 - most with self.subTest('Imbalanced'): @@ -356,11 +414,11 @@ def test_check_imbalanced_bbox_dist_in_label(self): } } } - reports = self.validator._check_imbalanced_bbox_dist_in_label( + reports = self.validator._check_imbalanced_dist_in_label( label_name, bbox_label_stats) self.assertTrue(len(reports) == 1) - self.assertIsInstance(reports[0], ImbalancedBboxDistInLabel) + self.assertIsInstance(reports[0], ImbalancedDistInLabel) with self.subTest('No Imbalanced Warning'): bbox_label_stats = { @@ -370,15 +428,16 @@ def test_check_imbalanced_bbox_dist_in_label(self): } } } - reports = self.validator._check_imbalanced_bbox_dist_in_label( + reports = self.validator._check_imbalanced_dist_in_label( label_name, bbox_label_stats) self.assertTrue(len(reports) == 0) - def test_check_imbalanced_bbox_dist_in_attr(self): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_check_imbalanced_dist_in_attr(self): label_name = 'unit' attr_name = 'test' - most = int(self.validator.DEFAULT_BBOX_IMBALANCE * 100) + most = int(self.validator.dominance_thr * 100) rest = 100 - most with self.subTest('Imbalanced'): @@ -392,11 +451,11 @@ def test_check_imbalanced_bbox_dist_in_attr(self): } } - reports = self.validator._check_imbalanced_bbox_dist_in_attr( + reports = self.validator._check_imbalanced_dist_in_attr( label_name, attr_name, bbox_attr_stats) self.assertTrue(len(reports) == 1) - self.assertIsInstance(reports[0], ImbalancedBboxDistInAttribute) + self.assertIsInstance(reports[0], ImbalancedDistInAttribute) with self.subTest('No Imbalanced Warning'): bbox_attr_stats = { @@ -409,21 +468,23 @@ def test_check_imbalanced_bbox_dist_in_attr(self): } } - reports = self.validator._check_imbalanced_bbox_dist_in_attr( + reports = 
self.validator._check_imbalanced_dist_in_attr( label_name, attr_name, bbox_attr_stats) self.assertTrue(len(reports) == 0) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_check_missing_bbox_annotation(self): stats = { - 'items_missing_bbox': [(1, 'unittest')] + 'items_missing_annotation': [(1, 'unittest')] } - actual_reports = self.validator._check_missing_bbox_annotation(stats) + actual_reports = self.validator._check_missing_annotation(stats) self.assertTrue(len(actual_reports) == 1) - self.assertIsInstance(actual_reports[0], MissingBboxAnnotation) + self.assertIsInstance(actual_reports[0], MissingAnnotation) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_check_negative_length(self): stats = { 'items_with_negative_length': { @@ -440,6 +501,7 @@ def test_check_negative_length(self): self.assertTrue(len(actual_reports) == 1) self.assertIsInstance(actual_reports[0], NegativeLength) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_check_invalid_value(self): stats = { 'items_with_invalid_value': { @@ -454,6 +516,7 @@ def test_check_invalid_value(self): self.assertTrue(len(actual_reports) == 1) self.assertIsInstance(actual_reports[0], InvalidValue) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_check_far_from_label_mean(self): label_name = 'unittest' bbox_label_stats = { @@ -473,6 +536,7 @@ def test_check_far_from_label_mean(self): self.assertTrue(len(actual_reports) == 1) self.assertIsInstance(actual_reports[0], FarFromLabelMean) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_check_far_from_attr_mean(self): label_name = 'unit' attr_name = 'test' @@ -496,14 +560,174 @@ def test_check_far_from_attr_mean(self): self.assertIsInstance(actual_reports[0], FarFromAttrMean) +class TestSegmentationValidator(TestValidatorTemplate): + @classmethod + def setUpClass(cls): + cls.validator = SegmentationValidator(few_samples_thr=1, + imbalance_ratio_thr=50, far_from_mean_thr=5.0, + dominance_ratio_thr=0.8, 
topk_bins=0.1) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_check_imbalanced_dist_in_label(self): + label_name = 'unittest' + most = int(self.validator.dominance_thr * 100) + rest = 100 - most + + with self.subTest('Imbalanced'): + mask_label_stats = { + 'area': { + 'histogram': { + 'counts': [most, rest] + } + } + } + reports = self.validator._check_imbalanced_dist_in_label( + label_name, mask_label_stats) + + self.assertTrue(len(reports) == 1) + self.assertIsInstance(reports[0], ImbalancedDistInLabel) + + with self.subTest('No Imbalanced Warning'): + mask_label_stats = { + 'area': { + 'histogram': { + 'counts': [most - 1, rest] + } + } + } + reports = self.validator._check_imbalanced_dist_in_label( + label_name, mask_label_stats) + + self.assertTrue(len(reports) == 0) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_check_imbalanced_dist_in_attr(self): + label_name = 'unit' + attr_name = 'test' + most = int(self.validator.dominance_thr * 100) + rest = 100 - most + + with self.subTest('Imbalanced'): + mask_attr_stats = { + 'mock': { + 'x': { + 'histogram': { + 'counts': [most, rest] + } + } + } + } + + reports = self.validator._check_imbalanced_dist_in_attr( + label_name, attr_name, mask_attr_stats) + + self.assertTrue(len(reports) == 1) + self.assertIsInstance(reports[0], ImbalancedDistInAttribute) + + with self.subTest('No Imbalanced Warning'): + mask_attr_stats = { + 'mock': { + 'x': { + 'histogram': { + 'counts': [most - 1, rest] + } + } + } + } + + reports = self.validator._check_imbalanced_dist_in_attr( + label_name, attr_name, mask_attr_stats) + + self.assertTrue(len(reports) == 0) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_check_missing_mask_annotation(self): + stats = { + 'items_missing_annotation': [(1, 'unittest')] + } + + actual_reports = self.validator._check_missing_annotation(stats) + + self.assertTrue(len(actual_reports) == 1) + self.assertIsInstance(actual_reports[0], MissingAnnotation) + + 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_check_invalid_value(self): + stats = { + 'items_with_invalid_value': { + ('1', 'unittest'): { + 1: ['x'] + } + } + } + + actual_reports = self.validator._check_invalid_value(stats) + + self.assertTrue(len(actual_reports) == 1) + self.assertIsInstance(actual_reports[0], InvalidValue) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_check_far_from_label_mean(self): + label_name = 'unittest' + mask_label_stats = { + 'w': { + 'items_far_from_mean': { + ('1', 'unittest'): { + 1: 100 + } + }, + 'mean': 0, + } + } + + actual_reports = self.validator._check_far_from_label_mean( + label_name, mask_label_stats) + + self.assertTrue(len(actual_reports) == 1) + self.assertIsInstance(actual_reports[0], FarFromLabelMean) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_check_far_from_attr_mean(self): + label_name = 'unit' + attr_name = 'test' + mask_attr_stats = { + 'mock': { + 'w': { + 'items_far_from_mean': { + ('1', 'unittest'): { + 1: 100 + } + }, + 'mean': 0, + } + } + } + + actual_reports = self.validator._check_far_from_attr_mean( + label_name, attr_name, mask_attr_stats) + + self.assertTrue(len(actual_reports) == 1) + self.assertIsInstance(actual_reports[0], FarFromAttrMean) + + class TestValidateAnnotations(TestValidatorTemplate): + + extra_args = { + 'few_samples_thr': 1, + 'imbalance_ratio_thr': 50, + 'far_from_mean_thr': 5.0, + 'dominance_ratio_thr': 0.8, + 'topk_bins': 0.1, + } + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_validate_annotations_classification(self): - actual_results = validate_annotations(self.dataset, 'classification') + actual_results = validate_annotations(self.dataset, 'classification', + **self.extra_args) with self.subTest('Test of statistics', i=0): actual_stats = actual_results['statistics'] - self.assertEqual(actual_stats['total_label_count'], 8) - self.assertEqual(len(actual_stats['items_missing_label']), 1) + 
self.assertEqual(actual_stats['total_ann_count'], 8) + self.assertEqual(len(actual_stats['items_missing_annotation']), 1) self.assertEqual(len(actual_stats['items_with_multiple_labels']), 1) label_dist = actual_stats['label_distribution'] @@ -538,7 +762,7 @@ def test_validate_annotations_classification(self): self.assertEqual(report_count_by_type['UndefinedAttribute'], 7) self.assertEqual(report_count_by_type['FewSamplesInAttribute'], 3) self.assertEqual(report_count_by_type['UndefinedLabel'], 2) - self.assertEqual(report_count_by_type['MissingLabelAnnotation'], 1) + self.assertEqual(report_count_by_type['MissingAnnotation'], 1) self.assertEqual(report_count_by_type['MultiLabelAnnotations'], 1) self.assertEqual(report_count_by_type['OnlyOneAttributeValue'], 1) self.assertEqual(report_count_by_type['MissingAttribute'], 1) @@ -552,13 +776,15 @@ def test_validate_annotations_classification(self): self.assertEqual(actual_summary, expected_summary) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_validate_annotations_detection(self): - actual_results = validate_annotations(self.dataset, 'detection') + actual_results = validate_annotations(self.dataset, 'detection', + **self.extra_args) with self.subTest('Test of statistics', i=0): actual_stats = actual_results['statistics'] - self.assertEqual(actual_stats['total_bbox_count'], 8) - self.assertEqual(len(actual_stats['items_missing_bbox']), 1) + self.assertEqual(actual_stats['total_ann_count'], 8) + self.assertEqual(len(actual_stats['items_missing_annotation']), 1) self.assertEqual(actual_stats['items_with_negative_length'], {}) self.assertEqual(actual_stats['items_with_invalid_value'], {}) @@ -589,12 +815,12 @@ def test_validate_annotations_detection(self): count_by_type = Counter(report_types) self.assertEqual(len(actual_reports), 45) - self.assertEqual(count_by_type['ImbalancedBboxDistInAttribute'], 32) + self.assertEqual(count_by_type['ImbalancedDistInAttribute'], 32) 
self.assertEqual(count_by_type['FewSamplesInAttribute'], 4) self.assertEqual(count_by_type['UndefinedAttribute'], 4) - self.assertEqual(count_by_type['ImbalancedBboxDistInLabel'], 2) + self.assertEqual(count_by_type['ImbalancedDistInLabel'], 2) self.assertEqual(count_by_type['UndefinedLabel'], 2) - self.assertEqual(count_by_type['MissingBboxAnnotation'], 1) + self.assertEqual(count_by_type['MissingAnnotation'], 1) with self.subTest('Test of summary', i=2): actual_summary = actual_results['summary'] @@ -605,10 +831,68 @@ def test_validate_annotations_detection(self): self.assertEqual(actual_summary, expected_summary) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_validate_annotations_segmentation(self): + actual_results = validate_annotations(self.dataset, 'segmentation', + **self.extra_args) + + with self.subTest('Test of statistics', i=0): + actual_stats = actual_results['statistics'] + self.assertEqual(actual_stats['total_ann_count'], 8) + self.assertEqual(len(actual_stats['items_missing_annotation']), 1) + self.assertEqual(actual_stats['items_with_invalid_value'], {}) + + mask_dist_by_label = actual_stats['mask_distribution_in_label'] + label_prop_stats = mask_dist_by_label['label_1']['area'] + self.assertEqual(label_prop_stats['items_far_from_mean'], {}) + areas = [12, 4, 8] + self.assertEqual(label_prop_stats['mean'], np.mean(areas)) + self.assertEqual(label_prop_stats['stdev'], np.std(areas)) + self.assertEqual(label_prop_stats['min'], np.min(areas)) + self.assertEqual(label_prop_stats['max'], np.max(areas)) + self.assertEqual(label_prop_stats['median'], np.median(areas)) + + mask_dist_by_attr = actual_stats['mask_distribution_in_attribute'] + attr_prop_stats = mask_dist_by_attr['label_0']['a']['1']['area'] + areas = [12, 4] + self.assertEqual(attr_prop_stats['items_far_from_mean'], {}) + self.assertEqual(attr_prop_stats['mean'], np.mean(areas)) + self.assertEqual(attr_prop_stats['stdev'], np.std(areas)) + 
self.assertEqual(attr_prop_stats['min'], np.min(areas)) + self.assertEqual(attr_prop_stats['max'], np.max(areas)) + self.assertEqual(attr_prop_stats['median'], np.median(areas)) + + mask_dist_item = actual_stats['mask_distribution_in_dataset_item'] + self.assertEqual(sum(mask_dist_item.values()), 8) + + with self.subTest('Test of validation reports', i=1): + actual_reports = actual_results['validation_reports'] + report_types = [r['anomaly_type'] for r in actual_reports] + count_by_type = Counter(report_types) + + self.assertEqual(len(actual_reports), 24) + self.assertEqual(count_by_type['ImbalancedDistInLabel'], 0) + self.assertEqual(count_by_type['ImbalancedDistInAttribute'], 13) + self.assertEqual(count_by_type['MissingAnnotation'], 1) + self.assertEqual(count_by_type['UndefinedLabel'], 2) + self.assertEqual(count_by_type['FewSamplesInAttribute'], 4) + self.assertEqual(count_by_type['UndefinedAttribute'], 4) + + with self.subTest('Test of summary', i=2): + actual_summary = actual_results['summary'] + expected_summary = { + 'errors': 6, + 'warnings': 18 + } + + self.assertEqual(actual_summary, expected_summary) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_validate_annotations_invalid_task_type(self): with self.assertRaises(ValueError): - validate_annotations(self.dataset, 'INVALID') + validate_annotations(self.dataset, 'INVALID', **self.extra_args) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_validate_annotations_invalid_dataset_type(self): with self.assertRaises(TypeError): - validate_annotations(object(), 'classification') + validate_annotations(object(), 'classification', **self.extra_args) diff --git a/tests/test_vgg_face2_format.py b/tests/test_vgg_face2_format.py index d6d232a9217b..9801f44d17bc 100644 --- a/tests/test_vgg_face2_format.py +++ b/tests/test_vgg_face2_format.py @@ -9,9 +9,11 @@ VggFace2Importer) from datumaro.util.image import Image from datumaro.util.test_utils import TestDir, compare_datasets +from 
.requirements import Requirements, mark_requirement class VggFace2FormatTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='1', subset='train', image=np.ones((8, 8, 3)), @@ -55,6 +57,7 @@ def test_can_save_and_load(self): compare_datasets(self, source_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_no_subsets(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='b/1', image=np.ones((8, 8, 3)), @@ -72,6 +75,7 @@ def test_can_save_dataset_with_no_subsets(self): compare_datasets(self, source_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='кириллица с пробелом', image=np.ones((8, 8, 3)), @@ -89,6 +93,7 @@ def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): compare_datasets(self, source_dataset, parsed_dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_no_save_images(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='1', image=np.ones((8, 8, 3)), @@ -106,6 +111,7 @@ def test_can_save_dataset_with_no_save_images(self): compare_datasets(self, source_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_no_labels(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='1', image=np.ones((8, 8, 3)), @@ -128,6 +134,7 @@ def test_can_save_dataset_with_no_labels(self): compare_datasets(self, source_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_wrong_number_of_points(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='1', image=np.ones((8, 8, 3)), @@ -149,6 +156,7 @@ def test_can_save_dataset_with_wrong_number_of_points(self): 
compare_datasets(self, target_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_arbitrary_extension(self): dataset = Dataset.from_iterable([ DatasetItem('q/1', image=Image(path='q/1.JPEG', @@ -172,9 +180,11 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'vgg_face2_dataset') class VggFace2ImporterTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect(self): self.assertTrue(VggFace2Importer.detect(DUMMY_DATASET_DIR)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id='0001_01', subset='train', diff --git a/tests/test_voc_format.py b/tests/test_voc_format.py index fd03a35eb78a..f8b7df67199d 100644 --- a/tests/test_voc_format.py +++ b/tests/test_voc_format.py @@ -24,9 +24,11 @@ from datumaro.util.mask_tools import load_mask from datumaro.util.test_utils import (TestDir, compare_datasets, test_save_and_load) +from .requirements import Requirements, mark_requirement class VocFormatTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_colormap_generator(self): reference = np.array([ [ 0, 0, 0], @@ -55,6 +57,7 @@ def test_colormap_generator(self): self.assertTrue(np.array_equal(reference, list(VOC.VocColormap.values()))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_write_and_parse_labelmap(self): src_label_map = VOC.make_voc_label_map() src_label_map['qq'] = [None, ['part1', 'part2'], ['act1', 'act2']] @@ -76,9 +79,10 @@ def categories(self): return VOC.make_voc_categories() -DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'voc_dataset') +DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'voc_dataset', 'voc_dataset1') class VocImportTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import(self): class 
DstExtractor(TestExtractorBase): def __iter__(self): @@ -128,6 +132,7 @@ def __iter__(self): compare_datasets(self, DstExtractor(), dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect_voc(self): self.assertTrue(VocImporter.detect(DUMMY_DATASET_DIR)) @@ -138,6 +143,7 @@ def _test_save_and_load(self, source_dataset, converter, test_dir, importer='voc', target_dataset=target_dataset, importer_args=importer_args, **kwargs) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_voc_cls(self): class TestExtractor(TestExtractorBase): def __iter__(self): @@ -158,6 +164,7 @@ def __iter__(self): partial(VocClassificationConverter.convert, label_map='voc'), test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_voc_det(self): class TestExtractor(TestExtractorBase): def __iter__(self): @@ -214,6 +221,7 @@ def __iter__(self): partial(VocDetectionConverter.convert, label_map='voc'), test_dir, target_dataset=DstExtractor()) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_voc_segm(self): class TestExtractor(TestExtractorBase): def __iter__(self): @@ -248,6 +256,7 @@ def __iter__(self): partial(VocSegmentationConverter.convert, label_map='voc'), test_dir, target_dataset=DstExtractor()) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_voc_segm_unpainted(self): class TestExtractor(TestExtractorBase): def __iter__(self): @@ -283,6 +292,7 @@ def __iter__(self): label_map='voc', apply_colormap=False), test_dir, target_dataset=DstExtractor()) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_voc_segm_with_many_instances(self): def bit(x, y, shape): mask = np.zeros(shape) @@ -318,6 +328,7 @@ def __iter__(self): partial(VocSegmentationConverter.convert, label_map='voc'), test_dir, target_dataset=DstExtractor()) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_voc_layout(self): class TestExtractor(TestExtractorBase): def __iter__(self): @@ 
-342,6 +353,7 @@ def __iter__(self): self._test_save_and_load(TestExtractor(), partial(VocLayoutConverter.convert, label_map='voc'), test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_voc_action(self): class TestExtractor(TestExtractorBase): def __iter__(self): @@ -398,6 +410,7 @@ def __iter__(self): label_map='voc', allow_attributes=False), test_dir, target_dataset=DstExtractor()) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_no_subsets(self): class TestExtractor(TestExtractorBase): def __iter__(self): @@ -412,6 +425,7 @@ def __iter__(self): partial(VocConverter.convert, label_map='voc', tasks=task), test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): class TestExtractor(TestExtractorBase): def __iter__(self): @@ -428,6 +442,7 @@ def __iter__(self): save_images=True), test_dir, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_images(self): class TestExtractor(TestExtractorBase): def __iter__(self): @@ -445,6 +460,7 @@ def __iter__(self): save_images=True, tasks=task), test_dir, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_dataset_with_voc_labelmap(self): class SrcExtractor(TestExtractorBase): def __iter__(self): @@ -482,6 +498,7 @@ def categories(self): partial(VocConverter.convert, label_map='voc'), test_dir, target_dataset=DstExtractor()) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_dataset_with_source_labelmap_undefined(self): class SrcExtractor(TestExtractorBase): def __iter__(self): @@ -529,6 +546,7 @@ def categories(self): partial(VocConverter.convert, label_map='source'), test_dir, target_dataset=DstExtractor()) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_dataset_with_source_labelmap_defined(self): class SrcExtractor(TestExtractorBase): def __iter__(self): @@ -575,6 +593,7 @@ def 
categories(self): partial(VocConverter.convert, label_map='source'), test_dir, target_dataset=DstExtractor()) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_dataset_with_fixed_labelmap(self): class SrcExtractor(TestExtractorBase): def __iter__(self): @@ -629,6 +648,7 @@ def categories(self): partial(VocConverter.convert, label_map=label_map), test_dir, target_dataset=DstExtractor()) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_background_masks_dont_introduce_instances_but_cover_others(self): dataset = Dataset.from_iterable([ DatasetItem(1, image=np.zeros((4, 1, 1)), annotations=[ @@ -648,6 +668,7 @@ def test_background_masks_dont_introduce_instances_but_cover_others(self): self.assertTrue(np.array_equal([0, 1], np.unique(cls_mask))) self.assertTrue(np.array_equal([0, 1], np.unique(inst_mask))) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_image_info(self): class TestExtractor(TestExtractorBase): def __iter__(self): @@ -661,6 +682,7 @@ def __iter__(self): partial(VocConverter.convert, label_map='voc', tasks=task), test_dir) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_arbitrary_extension(self): class TestExtractor(TestExtractorBase): def __iter__(self): @@ -678,6 +700,7 @@ def __iter__(self): save_images=True), test_dir, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_relative_paths(self): class TestExtractor(TestExtractorBase): def __iter__(self): @@ -694,6 +717,7 @@ def __iter__(self): label_map='voc', save_images=True, tasks=task), test_dir, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_attributes(self): class TestExtractor(TestExtractorBase): def __iter__(self): @@ -725,6 +749,7 @@ def __iter__(self): partial(VocConverter.convert, label_map='voc'), test_dir, target_dataset=DstExtractor()) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def 
test_inplace_save_writes_only_updated_data(self): with TestDir() as path: # generate initial dataset @@ -763,4 +788,31 @@ def test_inplace_save_writes_only_updated_data(self): self.assertFalse(osp.isfile( osp.join(path, 'SegmentationObject', '3.png'))) self.assertFalse(osp.isfile( - osp.join(path, 'SegmentationClass', '3.png'))) \ No newline at end of file + osp.join(path, 'SegmentationClass', '3.png'))) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_dataset_with_no_data_images(self): + class TestExtractor(TestExtractorBase): + def __iter__(self): + return iter([ + DatasetItem(id='frame1', subset='test', + image=Image(path='frame1.jpg'), + annotations=[ + Bbox(1.0, 2.0, 3.0, 4.0, + attributes={ + 'difficult': False, + 'truncated': False, + 'occluded': False + }, + id=1, label=0, group=1 + ) + ] + ) + ]) + + def categories(self): + return VOC.make_voc_categories() + + with TestDir() as test_dir: + self._test_save_and_load(TestExtractor(), + partial(VocConverter.convert, label_map='voc'), test_dir) diff --git a/tests/test_widerface_format.py b/tests/test_widerface_format.py index 0465f5d3f3f2..a6b4ab3ccf48 100644 --- a/tests/test_widerface_format.py +++ b/tests/test_widerface_format.py @@ -8,54 +8,55 @@ from datumaro.plugins.widerface_format import WiderFaceConverter, WiderFaceImporter from datumaro.util.image import Image from datumaro.util.test_utils import TestDir, compare_datasets +from .requirements import Requirements, mark_requirement class WiderFaceFormatTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='1', subset='train', image=np.ones((8, 8, 3)), annotations=[ - Bbox(0, 2, 4, 2), - Bbox(0, 1, 2, 3, attributes={ + Bbox(0, 2, 4, 2, label=0), + Bbox(0, 1, 2, 3, label=0, attributes={ 'blur': '2', 'expression': '0', 'illumination': '0', 'occluded': '0', 'pose': '2', 'invalid': '0'}), - Label(0), + Label(1), ] ), 
DatasetItem(id='2', subset='train', image=np.ones((10, 10, 3)), annotations=[ - Bbox(0, 2, 4, 2, attributes={ + Bbox(0, 2, 4, 2, label=0, attributes={ 'blur': '2', 'expression': '0', 'illumination': '1', 'occluded': '0', 'pose': '1', 'invalid': '0'}), - Bbox(3, 3, 2, 3, attributes={ + Bbox(3, 3, 2, 3, label=0, attributes={ 'blur': '0', 'expression': '1', 'illumination': '0', 'occluded': '0', 'pose': '2', 'invalid': '0'}), - Bbox(2, 1, 2, 3, attributes={ + Bbox(2, 1, 2, 3, label=0, attributes={ 'blur': '2', 'expression': '0', 'illumination': '0', 'occluded': '0', 'pose': '0', 'invalid': '1'}), - Label(1), + Label(2), ] ), DatasetItem(id='3', subset='val', image=np.ones((8, 8, 3)), annotations=[ - Bbox(0, 1.1, 5.3, 2.1, attributes={ + Bbox(0, 1.1, 5.3, 2.1, label=0, attributes={ 'blur': '2', 'expression': '1', 'illumination': '0', 'occluded': '0', 'pose': '1', 'invalid': '0'}), - Bbox(0, 2, 3, 2, attributes={ - 'occluded': 'False'}), - Bbox(0, 2, 4, 2), - Bbox(0, 7, 3, 2, attributes={ + Bbox(0, 2, 3, 2, label=0, attributes={ + 'occluded': False}), + Bbox(0, 3, 4, 2, label=0, attributes={ + 'occluded': True}), + Bbox(0, 2, 4, 2, label=0), + Bbox(0, 7, 3, 2, label=0, attributes={ 'blur': '2', 'expression': '1', 'illumination': '0', 'occluded': '0', 'pose': '1', 'invalid': '0'}), ] ), DatasetItem(id='4', subset='val', image=np.ones((8, 8, 3))), - ], categories={ - AnnotationType.label: LabelCategories.from_iterable( - 'label_' + str(i) for i in range(3)), - }) + ], categories=['face', 'label_0', 'label_1']) with TestDir() as test_dir: WiderFaceConverter.convert(source_dataset, test_dir, save_images=True) @@ -63,6 +64,7 @@ def test_can_save_and_load(self): compare_datasets(self, source_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_no_subsets(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='a/b/1', image=np.ones((8, 8, 3)), @@ -73,10 +75,7 @@ def test_can_save_dataset_with_no_subsets(self): 
'occluded': '0', 'pose': '2', 'invalid': '0'}), ] ), - ], categories={ - AnnotationType.label: LabelCategories.from_iterable( - 'label_' + str(i) for i in range(3)), - }) + ], categories=['face', 'label_0', 'label_1']) with TestDir() as test_dir: WiderFaceConverter.convert(source_dataset, test_dir, save_images=True) @@ -84,19 +83,17 @@ def test_can_save_dataset_with_no_subsets(self): compare_datasets(self, source_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='кириллица с пробелом', image=np.ones((8, 8, 3)), annotations=[ - Bbox(0, 1, 2, 3, label=1, attributes = { + Bbox(0, 1, 2, 3, label=0, attributes = { 'blur': '2', 'expression': '0', 'illumination': '0', 'occluded': '0', 'pose': '2', 'invalid': '0'}), ] ), - ], categories={ - AnnotationType.label: LabelCategories.from_iterable( - 'label_' + str(i) for i in range(3)), - }) + ], categories=['face']) with TestDir() as test_dir: WiderFaceConverter.convert(source_dataset, test_dir, save_images=True) @@ -105,30 +102,31 @@ def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): compare_datasets(self, source_dataset, parsed_dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_non_widerface_attributes(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='a/b/1', image=np.ones((8, 8, 3)), annotations=[ - Bbox(0, 2, 4, 2), - Bbox(0, 1, 2, 3, attributes={ + Bbox(0, 2, 4, 2, label=0), + Bbox(0, 1, 2, 3, label=0, attributes={ 'non-widerface attribute': '0', 'blur': 1, 'invalid': '1'}), - Bbox(1, 1, 2, 2, attributes={ + Bbox(1, 1, 2, 2, label=0, attributes={ 'non-widerface attribute': '0'}), ] ), - ], categories=[]) + ], categories=['face']) target_dataset = Dataset.from_iterable([ DatasetItem(id='a/b/1', image=np.ones((8, 8, 3)), annotations=[ - Bbox(0, 2, 4, 2), - Bbox(0, 1, 2, 3, 
attributes={ + Bbox(0, 2, 4, 2, label=0), + Bbox(0, 1, 2, 3, label=0, attributes={ 'blur': '1', 'invalid': '1'}), - Bbox(1, 1, 2, 2), + Bbox(1, 1, 2, 2, label=0), ] ), - ], categories=[]) + ], categories=['face']) with TestDir() as test_dir: WiderFaceConverter.convert(source_dataset, test_dir, save_images=True) @@ -136,6 +134,7 @@ def test_can_save_dataset_with_non_widerface_attributes(self): compare_datasets(self, target_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_arbitrary_extension(self): dataset = Dataset.from_iterable([ DatasetItem('q/1', image=Image(path='q/1.JPEG', @@ -153,9 +152,11 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'widerface_dataset') class WiderFaceImporterTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect(self): self.assertTrue(WiderFaceImporter.detect(DUMMY_DATASET_DIR)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id='0_Parade_image_01', subset='train', diff --git a/tests/test_yolo_format.py b/tests/test_yolo_format.py index 5449ba6626de..a537e8d736d0 100644 --- a/tests/test_yolo_format.py +++ b/tests/test_yolo_format.py @@ -12,9 +12,11 @@ from datumaro.plugins.yolo_format.converter import YoloConverter from datumaro.util.image import Image, save_image from datumaro.util.test_utils import TestDir, compare_datasets +from .requirements import Requirements, mark_requirement class YoloFormatTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train', image=np.ones((8, 8, 3)), @@ -47,6 +49,7 @@ def test_can_save_and_load(self): compare_datasets(self, source_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def 
test_can_save_dataset_with_image_info(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train', @@ -69,6 +72,7 @@ def test_can_save_dataset_with_image_info(self): compare_datasets(self, source_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_load_dataset_with_exact_image_info(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train', @@ -90,6 +94,7 @@ def test_can_load_dataset_with_exact_image_info(self): compare_datasets(self, source_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='кириллица с пробелом', subset='train', image=np.ones((8, 8, 3)), @@ -109,6 +114,7 @@ def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): compare_datasets(self, source_dataset, parsed_dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_relative_paths(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='1', subset='train', @@ -128,6 +134,7 @@ def test_relative_paths(self): compare_datasets(self, source_dataset, parsed_dataset) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_save_and_load_image_with_arbitrary_extension(self): dataset = Dataset.from_iterable([ DatasetItem('q/1', subset='train', @@ -142,6 +149,7 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): compare_datasets(self, dataset, parsed_dataset, require_images=True) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_inplace_save_writes_only_updated_data(self): with TestDir() as path: # generate initial dataset @@ -171,9 +179,11 @@ def test_inplace_save_writes_only_updated_data(self): DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'yolo_dataset') class YoloImporterTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_detect(self): 
self.assertTrue(YoloImporter.detect(DUMMY_DATASET_DIR)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_import(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train',