Skip to content

Commit

Permalink
Merge refactoring (#8)
Browse files Browse the repository at this point in the history
* Create separate journey test file

* Remove nbsp and formfeed characters

* Fix signature block tests

* Add Help run config

* Refactor method names and test relationships

* Extract edge cases to a separate file

* Remove redundant sequences from regexes

* Add more journey tests

* Remove test-specific string
  • Loading branch information
samayer12 authored Jul 21, 2020
1 parent 0b1f4d0 commit 027084b
Show file tree
Hide file tree
Showing 8 changed files with 366 additions and 78 deletions.
23 changes: 23 additions & 0 deletions .idea/runConfigurations/Interleave___Help.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

36 changes: 27 additions & 9 deletions src/interleave.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def convert_pdf_to_txt(path):
return text


def build_paragraph(input_text):
def build_paragraphs(input_text):
matches = re.split(r'(\n\n)(\d+\.\s)', input_text)[2:]
result = ['']
old_paragraph_number = 0
Expand All @@ -25,19 +25,36 @@ def build_paragraph(input_text):
return result[1:]


def get_sentences(input_text):
def remove_headers(input_text):
    """Strip page furniture and section headings from extracted text.

    The substitutions run in this order:

    1. page headers (a form feed followed by ``C`` up to a digit that ends
       the line),
    2. page numbers (a line holding only digits and one trailing space),
    3. section titles made solely of capital letters and spaces,
    4. section titles introduced by a Roman numeral and a period,
    5. non-breaking spaces (``chr(160)``), which become newlines.

    Paragraph and text spacing are not normalised here; ``prepare_body_text``
    carries those rules.

    :param input_text: text to clean.
    :return: the text with the elements above removed.
    """
    sentences = re.sub(r'(\fC.*\d\n)', '', input_text)  # Remove Page Headers
    sentences = re.sub(r'(\n\d+ \n)', '', sentences)  # Remove Page Numbers
    sentences = re.sub(r'\n+[A-Z ]+\n+', '\n\n', sentences)  # Remove Section Titles
    # Remove Roman Numeral Section Titles but don't conflict with the spacing rules.
    # NOTE(review): the lookahead only rejects a digit followed by the literal
    # text "A-z"; a character class such as [\dA-Za-z] may have been intended.
    # Left unchanged to preserve current behaviour - confirm before altering.
    sentences = re.sub(r'([IVXCMD]+\.[A-Za-z .\'’\n-]+)\n(?!\dA-z)', ' \n\n', sentences)
    sentences = sentences.replace(chr(160), '\n')  # Remove nbsp Page Breaks
    return sentences


def prepare_body_text(input_text):
    """Normalise spacing so numbered paragraphs can be split reliably.

    Steps, in order: reduce oversized paragraph gaps to one blank line,
    collapse runs of spaces to a single space, join a line that ends without
    a space to the text that follows it, give the first numbered paragraph
    the same blank-line lead-in as the others, and drop everything from a
    blank-line-preceded ``Table 1:`` onward.

    :param input_text: text to normalise.
    :return: the normalised text.
    """
    sentences = re.sub(r'(\n{3,}|\n\n )', '\n\n', input_text)  # Apply Consistent Paragraph Spacing
    # Apply Consistent Text Spacing. The quantifier is open-ended because
    # replacing only pairs leaves doubled spaces behind for runs of 3 or more.
    sentences = re.sub(r' {2,}', ' ', sentences)
    sentences = re.sub(r'(\S)(\n\n)([A-Za-z])', r'\1 \3', sentences)  # Handle EOL without a space
    # Make first paragraph match others (only the first occurrence is rewritten).
    sentences = re.sub(r'(\n\) )(1\.\s)', r'\n\n\2', sentences, count=1)
    # Remove tables that follow document body.
    # NOTE(review): remove_trailing_content performs the same split - confirm
    # whether it is still required here.
    sentences = re.split(r'\n\nTable 1:', sentences)[0]
    return sentences


def remove_trailing_content(input_text):
    """Cut off material that follows the document body.

    Everything from the first of each marker onward is discarded, applied in
    this order: a blank-line-preceded ``Table 1:``, a whitespace-preceded
    ``/s/`` signature mark, and the phrase ``Respectfully submitted,``.

    :param input_text: text to trim.
    :return: the text that precedes the earliest marker found, as a string
        (paragraph building is left to the caller).
    """
    # Only the text before the first match is kept, so one split is enough.
    sentences = re.split(r'\n\nTable 1:', input_text, maxsplit=1)[0]  # Remove tables that follow document body
    sentences = re.split(r'\s/s/', sentences, maxsplit=1)[0]  # Remove EPA-style "/s/" signature blocks
    # Remove EPA-style "Respectfully submitted," signature blocks
    sentences = re.split(r'Respectfully submitted,', sentences, maxsplit=1)[0]
    return sentences


def sanitize_text(input_text):
    """Run the complete clean-up pipeline over extracted text.

    The text is passed through ``remove_headers``, ``prepare_body_text`` and
    ``remove_trailing_content`` in that order; each stage receives the output
    of the previous one.

    :param input_text: text to clean.
    :return: the output of the final stage.
    """
    cleaned = input_text
    for stage in (remove_headers, prepare_body_text, remove_trailing_content):
        cleaned = stage(cleaned)
    return cleaned


def zip_sentences(list1, list2):
Expand Down Expand Up @@ -77,8 +94,9 @@ def main(argv):

text_first_file = convert_pdf_to_txt(args.input[0])
text_second_file = convert_pdf_to_txt(args.input[1])

filtered_text = zip_sentences(get_sentences(text_first_file), get_sentences(text_second_file))
# \n\n to match test files with real datasets
filtered_text = zip_sentences(build_paragraphs(sanitize_text(text_first_file)),
build_paragraphs(sanitize_text(text_second_file)))
print(create_csv(filtered_text, args.output[0], (args.input[0], args.input[1])))


Expand Down
61 changes: 61 additions & 0 deletions test/test_edge_cases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import unittest
from src import interleave


class PDFEdgeCases(unittest.TestCase):
    """Edge-case tests for text sanitising and paragraph building."""

    @classmethod
    def setUpClass(cls):
        """Read every edge-case fixture once and define the expected markers."""
        with open('text/edge_line_start_numbers.txt', 'r') as file:
            cls.edge_numbers = file.read()

        with open('text/edge_missing_paragraphs.txt', 'r') as file:
            cls.edge_missing_paragraphs = file.read()

        with open('text/edge_first_paragraph.txt', 'r') as file:
            cls.edge_first_paragraph = file.read()

        # This fixture used to be read twice in a row; one read is sufficient.
        with open('text/edge_final_paragraph.txt', 'r') as file:
            cls.edge_final_paragraph = file.read()

        with open('text/edge_EPA_signature_block_1.txt', 'r') as file:
            cls.edge_EPA_sigblock_1 = file.read()

        with open('text/edge_EPA_signature_block_2.txt', 'r') as file:
            cls.edge_EPA_sigblock_2 = file.read()

        cls.table_title = 'Table 1: PMNs for which EPA untimely published notice of receipt in the Federal Register'

        cls.EPA_signature_1 = '/s/'

        cls.EPA_signature_2 = 'Respectfully submitted,'

    def test_edge_case_line_starts_with_numeric_sentence_end(self):
        result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.edge_numbers))
        self.assertEqual(7, len(result))  # Expect seven paragraphs

    def test_edge_case_missing_paragraphs(self):
        result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.edge_missing_paragraphs))
        self.assertEqual(7, len(result))  # Expect seven paragraphs

    def test_edge_case_isolate_first_paragraph(self):
        result = interleave.build_paragraphs(interleave.sanitize_text(self.edge_first_paragraph))
        self.assertEqual(1, len(result))

    def test_edge_case_ignore_trailing_tables(self):
        # sanitize_text returns a string; indexing it with [0] left a single
        # character, so the assertion could never fail. Check the whole text.
        result = interleave.sanitize_text(self.edge_final_paragraph)
        self.assertNotIn(self.table_title, result)

    def test_edge_case_strip_EPA_sigblock_1(self):
        # Joining a string with '\n' puts a newline between every character,
        # which hides any multi-character marker. Check the text directly.
        result = interleave.sanitize_text(self.edge_EPA_sigblock_1)
        self.assertNotIn(self.EPA_signature_1, result)

    def test_edge_case_strip_EPA_sigblock_2(self):
        # Same reasoning as above: assert against the sanitised text itself.
        result = interleave.sanitize_text(self.edge_EPA_sigblock_2)
        self.assertNotIn(self.EPA_signature_2, result)


if __name__ == '__main__':
unittest.main()
39 changes: 39 additions & 0 deletions test/test_journey.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import unittest
from src import interleave
import textract


class PDFJourneyTests(unittest.TestCase):
    """Journey tests: extract sample PDFs, clean them, and count zipped paragraphs."""

    @classmethod
    def setUpClass(cls):
        """Extract the text of each sample PDF once and store it on the class."""
        sources = (
            ('complex_PDF_1', 'PDFs/Complex_1.pdf'),
            ('complex_PDF_2', 'PDFs/Complex_2.pdf'),
            ('complex_PDF_3', 'PDFs/Complex_3.pdf'),
            ('complex_PDF_4', 'PDFs/Complex_4.pdf'),
            ('complex_PDF_5', 'PDFs/Complex_5.pdf'),
            ('complex_PDF_6', 'PDFs/Complex_6.pdf'),
        )
        for attribute, pdf_path in sources:
            extracted = textract.process(pdf_path, method='pdfminer').decode()
            setattr(cls, attribute, extracted)

    def test_counts_correct_amount_of_paragraphs_for_complex_12(self):
        first = interleave.build_paragraphs(interleave.sanitize_text(self.complex_PDF_1))
        second = interleave.build_paragraphs(interleave.sanitize_text(self.complex_PDF_2))
        zipped = interleave.zip_sentences(first, second)
        self.assertEqual(166, len(zipped))

    def test_counts_correct_amount_of_paragraphs_for_complex_34(self):
        first = interleave.build_paragraphs(interleave.sanitize_text(self.complex_PDF_3))
        second = interleave.build_paragraphs(interleave.sanitize_text(self.complex_PDF_4))
        zipped = interleave.zip_sentences(first, second)
        self.assertEqual(64, len(zipped))

    def test_counts_correct_amount_of_paragraphs_for_complex_56(self):
        first = interleave.build_paragraphs(interleave.sanitize_text(self.complex_PDF_5))
        second = interleave.build_paragraphs(interleave.sanitize_text(self.complex_PDF_6))
        zipped = interleave.zip_sentences(first, second)
        self.assertEqual(69, len(zipped))


if __name__ == '__main__':
unittest.main()
93 changes: 25 additions & 68 deletions test/test_interleave.py → test/test_unit.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import unittest
from src import interleave
from unittest.mock import patch
import textract


class PDFTests(unittest.TestCase):
class PDFUnitTests(unittest.TestCase):

@classmethod
def setUpClass(cls):
Expand All @@ -18,35 +17,16 @@ def setUpClass(cls):
cls.multipage_text = file.read()

with open('text/1000_paragraphs.txt', 'r') as file:
cls.thousand_paragraphs = file.read()
cls.thousand_paragraphs = '\n\n' + file.read()

with open('text/pagenumbers.txt', 'r') as file:
cls.pagenumbers = file.read()
cls.pagenumbers = '\n\n' + file.read()

with open('text/headers.txt', 'r') as file:
cls.headers = file.read()
cls.headers = '\n\n' + file.read()

with open('text/edge_line_start_numbers.txt', 'r') as file:
cls.edge_numbers = file.read()

with open('text/edge_missing_paragraphs.txt', 'r') as file:
cls.edge_missing_paragraphs = file.read()

with open('text/edge_first_paragraph.txt', 'r') as file:
cls.edge_first_paragraph = file.read()

with open('text/edge_final_paragraph.txt', 'r') as file:
cls.edge_final_paragraph = file.read()

with open('text/edge_final_paragraph.txt', 'r') as file:
cls.edge_final_paragraph = file.read()

with open('text/edge_EPA_signature_block.txt', 'r') as file:
cls.edge_EPA_sigblock = file.read()

cls.complex_PDF_1 = textract.process('PDFs/Complex_1.pdf', method='pdfminer').decode()

cls.complex_PDF_2 = textract.process('PDFs/Complex_2.pdf', method='pdfminer').decode()
with open('text/nbsp_formfeed.txt', 'r') as file:
cls.nbsp_formfeed = '\n\n' + file.read()

cls.split_simple_text = ['1. First Entry.', '2. Second Entry.', '3. Third Entry.']

Expand All @@ -70,10 +50,6 @@ def setUpClass(cls):
('2. Second Entry.', '2. Second Entry.'),
('3. Third Entry.', '3. Third Entry.')]

cls.table_title = 'Table 1: PMNs for which EPA untimely published notice of receipt in the Federal Register'

cls.EPA_signature = '/s/'

def test_opens_PDF(self):
self.assertEqual(self.short_text, interleave.convert_pdf_to_txt('PDFs/Simple.pdf'))

Expand All @@ -84,63 +60,44 @@ def test_opens_multipage_PDF(self):
self.assertEqual(self.multipage_text, interleave.convert_pdf_to_txt('PDFs/Multipage.pdf'))

def test_counts_up_to_1000_paragraphs(self):
self.assertEqual(1000, len(interleave.get_sentences(self.thousand_paragraphs)))
result = interleave.build_paragraphs(interleave.sanitize_text(self.thousand_paragraphs))
self.assertEqual(1000, len(result))

def test_builds_paragraphs_correctly(self):
self.assertEqual(self.split_simple_text, interleave.build_paragraph('\n\n' + self.short_text))

def test_counts_correct_amount_of_paragraphs_for_complex_documents(self):
result = interleave.zip_sentences(interleave.get_sentences(self.complex_PDF_1),
interleave.get_sentences(self.complex_PDF_2))
self.assertEqual(166, len(result))
self.assertEqual(self.split_simple_text, interleave.build_paragraphs('\n\n' + self.short_text))

def test_splits_short_sentences(self):
self.assertEqual(self.split_simple_text,
interleave.get_sentences(self.short_text))
result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.short_text))
self.assertEqual(self.split_simple_text, result)

def test_splits_multiline_sentences(self):
self.assertEqual(self.split_multiline_text,
interleave.get_sentences(self.multiline_text))
result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.multiline_text))
self.assertEqual(self.split_multiline_text, result)

def test_splits_multipage_paragraphs(self):
self.assertEqual(self.split_multipage_text,
interleave.get_sentences(self.multipage_text))
result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.multipage_text))
self.assertEqual(self.split_multipage_text, result)

def test_removes_page_numbers(self):
self.assertNotRegex('\n'.join(interleave.get_sentences(self.pagenumbers)),
self.assertNotRegex('\n'.join(interleave.sanitize_text(self.pagenumbers)),
r'\n\d+ \n')

def test_removes_page_headers(self):
self.assertNotRegex('\n'.join(interleave.get_sentences(self.headers)),
self.assertNotRegex('\n'.join(interleave.sanitize_text(self.headers)),
r'(\fC.*\d\n)')

def test_edge_case_line_starts_with_numeric_sentence_end(self):
result = interleave.get_sentences(self.edge_numbers)
self.assertEqual(7, len(result)) # Expect seven paragraphs

def test_edge_case_missing_paragraphs(self):
result = interleave.get_sentences(self.edge_missing_paragraphs)
self.assertEqual(7, len(result)) # Expect seven paragraphs

def test_edge_case_isolate_first_paragraph(self):
result = interleave.get_sentences(self.edge_first_paragraph)
self.assertEqual(1, len(result))

def test_edge_case_ignore_trailing_tables(self):
result = interleave.get_sentences(self.edge_final_paragraph)[0]
self.assertNotIn(self.table_title, result)

def test_edge_case_strip_EPA_style_sigblock(self):
result = interleave.get_sentences(self.edge_EPA_sigblock)
self.assertNotIn(self.EPA_signature, result)
def test_removes_nbsp_formfeed_page_breaks(self):
result = interleave.sanitize_text(self.nbsp_formfeed)
self.assertNotIn(chr(160), result)
self.assertNotIn(chr(12), result)

def test_zip_sentences_to_tuple(self):
list1 = self.short_text
list2 = self.short_text
list1 = '\n\n' + self.short_text
list2 = '\n\n' + self.short_text

self.assertEqual(self.processed_text,
interleave.zip_sentences(interleave.get_sentences(list1),
interleave.get_sentences(list2)))
interleave.zip_sentences(interleave.build_paragraphs(interleave.sanitize_text(list1)),
interleave.build_paragraphs(interleave.sanitize_text(list2))))

@patch('builtins.open')
@patch('src.interleave.writer', autospec=True)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
165. The allegations in paragraph 165 state legal conclusions that require no response.
1. The allegations in paragraph 165 state legal conclusions that require no response.

The remainder of the complaint consists of plaintiffs’ request for relief, to which no response is

Expand Down
Loading

0 comments on commit 027084b

Please sign in to comment.