From e0bf86613f8c5f12213df5a02f5e5d8ee1c38a63 Mon Sep 17 00:00:00 2001
From: Ben Hoff
Date: Wed, 10 Jul 2019 20:09:04 -0400
Subject: [PATCH] added in pdf extractor

---
 CHANGELOG.md                         |  1 +
 cvat/apps/engine/media.mimetypes     |  3 ++
 cvat/apps/engine/media_extractors.py | 75 ++++++++++++++++++++++++++++++++++++
 cvat/requirements/base.txt           |  1 +
 4 files changed, 80 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ceb6f440e0ca..d403dab1bae5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Auto annotation using Faster R-CNN with Inception v2 (utils/open_model_zoo)
 - Auto annotation using Pixel Link mobilenet v2 - text detection (utils/open_model_zoo)
 - Ability to create a custom extractors for unsupported media types
+- Added in PDF extractor
 
 ### Changed
 - Outside and keyframe buttons in the side panel for all interpolation shapes (they were only for boxes before)
diff --git a/cvat/apps/engine/media.mimetypes b/cvat/apps/engine/media.mimetypes
index 79ee9c539e6e..48d16b55f7b8 100644
--- a/cvat/apps/engine/media.mimetypes
+++ b/cvat/apps/engine/media.mimetypes
@@ -220,3 +220,6 @@ application/x-tarz tar.z
 application/x-tzo tar.lzo
 application/x-xz-compressed-tar txz
 application/zip zip
+
+# PDF
+application/pdf pdf
diff --git a/cvat/apps/engine/media_extractors.py b/cvat/apps/engine/media_extractors.py
index bc4424ba80ac..68adf0f7c5ad 100644
--- a/cvat/apps/engine/media_extractors.py
+++ b/cvat/apps/engine/media_extractors.py
@@ -72,6 +72,71 @@ def save_image(self, k, dest_path):
         image.close()
         return width, height
 
+class PDFExtractor(MediaExtractor):
+    """Extract the pages of a PDF file as JPEG images.
+
+    Every page is rendered once in the constructor into a temporary
+    directory, which is removed when the extractor is destroyed.
+    """
+
+    def __init__(self, source_path, dest_path, image_quality, step=1, start=0, stop=0):
+        """Render every page of the first PDF in source_path.
+
+        step, start and stop are accepted for interface compatibility
+        but fixed values (1, 0, 0) are passed to the base class.
+        """
+        if not source_path:
+            raise Exception('No PDF found')
+
+        from pdf2image import convert_from_path
+        self._temp_directory = tempfile.mkdtemp(prefix='cvat-')
+        super().__init__(
+            source_path=source_path[0],
+            dest_path=dest_path,
+            image_quality=image_quality,
+            step=1,
+            start=0,
+            stop=0,
+        )
+
+        self._basename = os.path.splitext(os.path.basename(self._source_path))[0]
+        # (width, height) of every rendered page, indexed by page number.
+        self._dimensions = []
+        # NOTE(review): pages are saved with the default JPEG quality;
+        # image_quality is only forwarded to the base class - confirm.
+        for page_num, page in enumerate(convert_from_path(self._source_path)):
+            self._dimensions.append(page.size)
+            page.save(self._get_imagepath(page_num), 'JPEG')
+
+        self._length = len(self._dimensions)
+
+    def _get_imagepath(self, k):
+        """Return the path of the JPEG rendered for page k (0-based)."""
+        return os.path.join(self._temp_directory, f'{self._basename}{k}.jpg')
+
+    def __iter__(self):
+        """Yield the image path of every page in order."""
+        for k in range(self._length):
+            yield self._get_imagepath(k)
+
+    def __del__(self):
+        # __init__ may have raised before the directory was created, so
+        # the attribute is not guaranteed to exist here.
+        temp_directory = getattr(self, '_temp_directory', None)
+        if temp_directory:
+            shutil.rmtree(temp_directory, ignore_errors=True)
+
+    def __getitem__(self, k):
+        return self._get_imagepath(k)
+
+    def __len__(self):
+        return self._length
+
+    def save_image(self, k, dest_path):
+        """Copy page k's JPEG to dest_path and return its (width, height)."""
+        shutil.copyfile(self[k], dest_path)
+        return self._dimensions[k]
+
 #Note step, start, stop have no affect
 class DirectoryExtractor(ImageListExtractor):
     def __init__(self, source_path, dest_path, image_quality, step=1, start=0, stop=0):
@@ -180,6 +245,10 @@ def _is_image(path):
 def _is_dir(path):
     return os.path.isdir(path)
 
+def _is_pdf(path):
+    mime = mimetypes.guess_type(path)
+    return mime[0] == 'application/pdf'
+
 # 'has_mime_type': function receives 1 argument - path to file.
 # Should return True if file has specified media type.
 # 'extractor': class that extracts images from specified media.
@@ -213,4 +282,10 @@ def _is_dir(path):
         'mode': 'annotation',
         'unique': False,
     },
+    'pdf': {
+        'has_mime_type': _is_pdf,
+        'extractor': PDFExtractor,
+        'mode': 'annotation',
+        'unique': True,
+    },
 }
diff --git a/cvat/requirements/base.txt b/cvat/requirements/base.txt
index d20e24221391..9245f8ca61ba 100644
--- a/cvat/requirements/base.txt
+++ b/cvat/requirements/base.txt
@@ -33,3 +33,4 @@ djangorestframework==3.9.1
 Pygments==2.3.1
 drf-yasg==1.15.0
 Shapely==1.6.4.post2
+pdf2image==1.6.0