-
Notifications
You must be signed in to change notification settings - Fork 3.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add PDF extractor #557
Add PDF extractor #557
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -72,6 +72,56 @@ def save_image(self, k, dest_path): | |
image.close() | ||
return width, height | ||
|
||
class PDFExtractor(MediaExtractor):
    """Expose the pages of a PDF document as individual JPEG frames.

    Every page is rendered once, at construction time, into a private
    temporary directory. Indexing, iteration and ``save_image`` then operate
    on those rendered files. The temporary directory is removed when the
    extractor is garbage-collected.
    """

    def __init__(self, source_path, dest_path, image_quality, step=1, start=0, stop=0):
        """Render every page of the PDF into a temporary directory.

        ``source_path`` is indexed with ``[0]``, so it must be a non-empty
        sequence of paths; only the first entry is converted.

        ``step``, ``start`` and ``stop`` are accepted so the signature matches
        the other extractors, but they are not forwarded: the base class is
        always initialised with ``step=1, start=0, stop=0``.

        Raises:
            Exception: if ``source_path`` is empty.
        """
        # Assigned before anything can fail so that __del__ is safe on a
        # partially constructed instance (empty source_path, failed import).
        self._temp_directory = None

        if not source_path:
            raise Exception('No PDF found')

        # Function-scope import: pdf2image is only needed by this extractor.
        from pdf2image import convert_from_path

        self._temp_directory = tempfile.mkdtemp(prefix='cvat-')
        # NOTE(review): per the pull-request discussion, extractor
        # constructors always receive a list of paths, hence the [0] --
        # confirm against the caller.
        super().__init__(
            source_path=source_path[0],
            dest_path=dest_path,
            image_quality=image_quality,
            step=1,
            start=0,
            stop=0,
        )

        self._basename = os.path.splitext(os.path.basename(self._source_path))[0]
        # One (width, height) tuple per rendered page, in page order.
        self._dimensions = []
        for page_num, page in enumerate(convert_from_path(self._source_path)):
            self._dimensions.append(page.size)
            # Reuse _get_imagepath so writing and reading can never disagree
            # on the file naming scheme.
            page.save(self._get_imagepath(page_num), 'JPEG')

        # Count the pages that were actually rendered instead of listing the
        # directory, so __len__ always matches _dimensions.
        self._length = len(self._dimensions)

    def _get_imagepath(self, k):
        """Return the path of the rendered JPEG for zero-based page ``k``."""
        return os.path.join(self._temp_directory, f'{self._basename}{k}.jpg')

    def __iter__(self):
        """Yield the rendered page paths in page order."""
        for k in range(self._length):
            yield self._get_imagepath(k)

    def __del__(self):
        """Remove the temporary directory holding the rendered pages."""
        # getattr: __init__ may have raised before the attribute existed.
        temp_directory = getattr(self, '_temp_directory', None)
        if temp_directory:
            # A finalizer must not raise; a directory that is already gone is
            # not an error here.
            shutil.rmtree(temp_directory, ignore_errors=True)

    def __getitem__(self, k):
        """Return the path of the rendered JPEG for page ``k`` (no bounds check)."""
        return self._get_imagepath(k)

    def __len__(self):
        """Return the number of pages rendered from the PDF."""
        return self._length

    def save_image(self, k, dest_path):
        """Copy rendered page ``k`` to ``dest_path`` and return its ``(width, height)``."""
        shutil.copyfile(self[k], dest_path)
        return self._dimensions[k]
|
||
# Note: step, start, stop have no effect | ||
class DirectoryExtractor(ImageListExtractor): | ||
def __init__(self, source_path, dest_path, image_quality, step=1, start=0, stop=0): | ||
|
@@ -180,6 +230,10 @@ def _is_image(path): | |
def _is_dir(path):
    """Return True when ``path`` refers to an existing directory."""
    is_directory = os.path.isdir(path)
    return is_directory
|
||
def _is_pdf(path):
    """Return True when ``path`` is named like an uncompressed PDF document.

    The decision is based on ``mimetypes.guess_type``, i.e. on the file name
    only; the file contents are never read.

    ``guess_type`` returns a ``(type, encoding)`` pair, and a name such as
    ``doc.pdf.gz`` is guessed as ``('application/pdf', 'gzip')``. Such a file
    is rejected because the bytes on disk are a gzip stream, not a PDF.
    """
    mime_type, encoding = mimetypes.guess_type(path)
    return mime_type == 'application/pdf' and encoding is None
|
||
# 'has_mime_type': function receives 1 argument - path to file. | ||
# Should return True if file has specified media type. | ||
# 'extractor': class that extracts images from specified media. | ||
|
@@ -213,4 +267,10 @@ def _is_dir(path): | |
'mode': 'annotation', | ||
'unique': False, | ||
}, | ||
'pdf': { | ||
'has_mime_type': _is_pdf, | ||
'extractor': PDFExtractor, | ||
'mode': 'annotation', | ||
'unique': True, | ||
}, | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,3 +33,4 @@ djangorestframework==3.9.1 | |
Pygments==2.3.1 | ||
drf-yasg==1.15.0 | ||
Shapely==1.6.4.post2 | ||
pdf2image==1.6.0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would look at the ArchiveExtractor implementation and inherit the class from DirectoryExtractor. Let's implement the _extract method here. What do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was using `VideoExtractor` as a basis here because, in my case, PDFs could have multiple pages. Is there a better way to handle multi-page PDFs?