Skip to content

Commit

Permalink
Error handling for PDFminer (#10)
Browse files Browse the repository at this point in the history
* Create separate journey test file

* Remove nbsp and formfeed characters

* Fix signature block tests

* Add Help run config

* Refactor method names and test relationships

* Extract edge cases to a separate file

* Remove redundant sequences from regexes

* Add more journey tests

* Remove test-specific string

* Refactor and add more tests

* Identify edge cases where PDFMiner output breaks paragraph parsing, and flag them with an error message
  • Loading branch information
samayer12 authored Jul 23, 2020
1 parent 8dd0abd commit 7e12e96
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 2 deletions.
8 changes: 6 additions & 2 deletions src/interleave.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,14 @@ def build_paragraphs(input_text):
old_paragraph_number = 0
for i in range(0, len(matches), 3):
paragraph_number = int(matches[i].split('.')[0])
current_line = re.sub(r'[\n\f]', '', ''.join(matches[i:i + 2]).strip())

if paragraph_number != old_paragraph_number + 1:
result[-1] += re.sub(r'[\n\f]', '', ''.join(matches[i:i + 2]).strip())
result[-1] += current_line
else:
result.append(re.sub(r'[\n\f]', '', ''.join(matches[i:i + 2]).strip()))
if current_line[:-1].isdigit():
current_line += ' PARSE ERROR'
result.append(current_line)
old_paragraph_number = paragraph_number
return result[1:]

Expand Down
7 changes: 7 additions & 0 deletions test/test_edge_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ def setUpClass(cls):
with open('text/edge_final_paragraph.txt', 'r') as file:
cls.edge_final_paragraph = file.read()

with open('text/edge_bad_parse.txt', 'r') as file:
cls.edge_bad_parse = file.read()

cls.table_title = 'Table 1: PMNs for which EPA untimely published notice of receipt in the Federal Register'


Expand All @@ -39,6 +42,10 @@ def test_edge_case_ignore_trailing_tables(self):
result = interleave.sanitize_text(self.edge_final_paragraph)[0]
self.assertNotIn(self.table_title, result)

def test_edge_marks_bad_paragraph_parse(self):
    """Check that a badly parsed paragraph is flagged with 'PARSE ERROR'.

    The bad-parse fixture text is prefixed with two newlines, sanitized,
    and split into paragraphs; the test expects exactly three paragraphs,
    one of which is the literal entry '2. PARSE ERROR'.
    """
    sanitized = interleave.sanitize_text('\n\n' + self.edge_bad_parse)
    paragraphs = interleave.build_paragraphs(sanitized)

    self.assertEqual(3, len(paragraphs))
    self.assertIn('2. PARSE ERROR', paragraphs)

if __name__ == '__main__':
unittest.main()
9 changes: 9 additions & 0 deletions test/text/edge_bad_parse.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
1. Paragraph 10 states a conclusion of law and so requires no response.

2.

3.

Paragraph 2 states conclusions of law and so requires no response.

Paragraph 3 states conclusions of law, and so requires no response.

0 comments on commit 7e12e96

Please sign in to comment.