Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change Text extraction is not allowed error to warning #453

Merged
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

### Changed
- Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))

- Raise a warning instead of an error when extracting text from a non-extractable PDF ([#350](https://github.com/pdfminer/pdfminer.six/issues/350))

## [20200517]

### Added
Expand Down
4 changes: 1 addition & 3 deletions pdfminer/high_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,7 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
page_numbers,
maxpages=maxpages,
password=password,
caching=not disable_caching,
check_extractable=True):
caching=not disable_caching):
page.rotate = (page.rotate + rotation) % 360
interpreter.process_page(page)

Expand Down Expand Up @@ -118,7 +117,6 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
maxpages=maxpages,
password=password,
caching=caching,
check_extractable=True,
):
interpreter.process_page(page)

Expand Down
6 changes: 5 additions & 1 deletion pdfminer/pdfdocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,11 @@ class PDFPasswordIncorrect(PDFEncryptionError):
pass


class PDFTextExtractionNotAllowed(PDFEncryptionError):
class PDFTextExtractionNotAllowedWarning(UserWarning):
pass


class PDFTextExtractionNotAllowedError(PDFEncryptionError):
pass


Expand Down
22 changes: 16 additions & 6 deletions pdfminer/pdfpage.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import warnings
from . import settings
from .psparser import LIT
from .pdftypes import PDFObjectNotFound
Expand All @@ -8,7 +9,8 @@
from .pdftypes import dict_value
from .pdfparser import PDFParser
from .pdfdocument import PDFDocument
from .pdfdocument import PDFTextExtractionNotAllowed
from .pdfdocument import PDFTextExtractionNotAllowedWarning
from .pdfdocument import PDFTextExtractionNotAllowedError


log = logging.getLogger(__name__)
Expand Down Expand Up @@ -120,15 +122,23 @@ def search(obj, parent):
@classmethod
def get_pages(cls, fp,
pagenos=None, maxpages=0, password='',
caching=True, check_extractable=True):
pietermarsman marked this conversation as resolved.
Show resolved Hide resolved
caching=True, check_extractable=False):
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
doc = PDFDocument(parser, password=password, caching=caching)
# Check if the document allows text extraction. If not, abort.
if check_extractable and not doc.is_extractable:
error_msg = 'Text extraction is not allowed: %r' % fp
raise PDFTextExtractionNotAllowed(error_msg)
# Check if the document allows text extraction.
# If not, raise an error when check_extractable is set;
# otherwise warn the user and proceed.
if not doc.is_extractable:
if check_extractable:
error_msg = 'Text extraction is not allowed: %r' % fp
raise PDFTextExtractionNotAllowedError(error_msg)
else:
warning_msg = 'The PDF %r contains a metadata field '\
'indicating that it should not allow ' \
'text extraction. Ignoring this field ' \
'and proceeding.' % fp
warnings.warn(warning_msg, PDFTextExtractionNotAllowedWarning)
# Process each page contained in the document.
for (pageno, page) in enumerate(cls.create_pages(doc)):
if pagenos and (pageno not in pagenos):
Expand Down
Binary file added samples/contrib/issue-00352-asw-oct96-p41.pdf
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/test_tools_pdf2txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ def test_nlp2004slides(self):
def test_contrib_2b(self):
run('contrib/2b.pdf', '-A -t xml')

def test_contrib_issue_350(self):
"""Regression test for https://github.com/pdfminer/pdfminer.six/issues/350"""
run('contrib/issue-00352-asw-oct96-p41.pdf')

def test_scancode_patchelf(self):
"""Regression test for # https://github.com/euske/pdfminer/issues/96"""
run('scancode/patchelf.pdf')
Expand Down