Skip to content

Commit

Permalink
Test refactors (#9)
Browse files Browse the repository at this point in the history
* Create separate journey test file

* Remove nbsp and formfeed characters

* Fix signature block tests

* Add Help run config

* Refactor method names and test relationships

* Extract edge cases to a separate file

* Remove redundant sequences from regexes

* Add more journey tests

* Remove test-specific string

* Refactor and add more tests
  • Loading branch information
samayer12 authored Jul 21, 2020
1 parent 027084b commit 8dd0abd
Show file tree
Hide file tree
Showing 7 changed files with 4,278 additions and 22 deletions.
7 changes: 3 additions & 4 deletions src/interleave.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,15 @@ def remove_headers(input_text):
sentences = re.sub(r'(\fC.*\d\n)', '', input_text) # Remove Page Headers
sentences = re.sub(r'(\n\d+ \n)', '', sentences) # Remove Page Numbers
sentences = re.sub(r'\n+[A-Z ]+\n+', '\n\n', sentences) # Remove Section Titles
sentences = re.sub(r'([IVXCMD]+\.[A-Za-z .\'’\n-]+)\n(?!\dA-z)', ' \n\n',
sentences) # Remove Roman Numeral Section Titles but don't conflict with next rule
sentences = re.sub(r'([IVXCMD]+\.[A-Za-z .\'’\n-]+)\n(?!\dA-z)', '',
sentences) # Remove Roman Numeral Section Titles
sentences = sentences.replace(chr(160), '\n') # Remove nbsp Page Breaks
return sentences


def prepare_body_text(input_text):
    """Normalise paragraph and word spacing in the document body.

    Four regex passes are applied in order:

    1. Three or more consecutive newlines, or a blank line followed by a
       space, become exactly one blank line.
    2. Any run of two or more spaces becomes a single space.
    3. A blank line sitting between a non-whitespace character and a letter
       is replaced by a single space.
    4. The first occurrence of newline + ') ' directly before '1.' and a
       whitespace character is replaced by a blank line.

    :param input_text: text to normalise
    :return: the normalised text
    """
    sentences = re.sub(r'(\n{3,}|\n\n )', '\n\n', input_text)  # Apply Consistent Paragraph Spacing
    # A single {2,} pass collapses runs of any length; a separate pass that
    # matches exactly two spaces adds nothing to the final result.
    sentences = re.sub(r' {2,}', ' ', sentences)  # Apply Consistent Text Spacing
    sentences = re.sub(r'(\S)(\n\n)([A-Za-z])', r'\1 \3', sentences)  # Handle EOL without a space
    # count is passed by keyword: positional count is deprecated since Python 3.13.
    sentences = re.sub(r'(\n\) )(1\.\s)', r'\n\n\2', sentences, count=1)  # Make first paragraph match others
    return sentences
Expand Down Expand Up @@ -94,7 +94,6 @@ def main(argv):

text_first_file = convert_pdf_to_txt(args.input[0])
text_second_file = convert_pdf_to_txt(args.input[1])
# \n\n to match test files with real datasets
filtered_text = zip_sentences(build_paragraphs(sanitize_text(text_first_file)),
build_paragraphs(sanitize_text(text_second_file)))
print(create_csv(filtered_text, args.output[0], (args.input[0], args.input[1])))
Expand Down
19 changes: 1 addition & 18 deletions test/test_edge_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,9 @@ def setUpClass(cls):
with open('text/edge_final_paragraph.txt', 'r') as file:
cls.edge_final_paragraph = file.read()

with open('text/edge_EPA_signature_block_1.txt', 'r') as file:
cls.edge_EPA_sigblock_1 = file.read()

with open('text/edge_EPA_signature_block_2.txt', 'r') as file:
cls.edge_EPA_sigblock_2 = file.read()

cls.table_title = 'Table 1: PMNs for which EPA untimely published notice of receipt in the Federal Register'

cls.EPA_signature_1 = '/s/'

cls.EPA_signature_2 = 'Respectfully submitted,'


def test_edge_case_line_starts_with_numeric_sentence_end(self):
result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.edge_numbers))
self.assertEqual(7, len(result)) # Expect seven paragraphs
Expand All @@ -48,14 +39,6 @@ def test_edge_case_ignore_trailing_tables(self):
result = interleave.sanitize_text(self.edge_final_paragraph)[0]
self.assertNotIn(self.table_title, result)

def test_edge_case_strip_EPA_sigblock_1(self):
    """The first EPA signature marker must be absent from the sanitized text."""
    sanitized_parts = interleave.sanitize_text(self.edge_EPA_sigblock_1)
    joined = '\n'.join(sanitized_parts)
    self.assertNotIn(self.EPA_signature_1, joined)

def test_edge_case_strip_EPA_sigblock_2(self):
    """The second EPA signature marker must be absent from the sanitized text."""
    sanitized_parts = interleave.sanitize_text(self.edge_EPA_sigblock_2)
    joined = '\n'.join(sanitized_parts)
    self.assertNotIn(self.EPA_signature_2, joined)


if __name__ == '__main__':
unittest.main()
56 changes: 56 additions & 0 deletions test/test_unit.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import unittest
from src import interleave
from unittest.mock import patch
from random import randint



class PDFUnitTests(unittest.TestCase):
Expand Down Expand Up @@ -28,6 +30,24 @@ def setUpClass(cls):
with open('text/nbsp_formfeed.txt', 'r') as file:
cls.nbsp_formfeed = '\n\n' + file.read()

with open('text/edge_EPA_signature_block_1.txt', 'r') as file:
cls.edge_EPA_sigblock_1 = file.read()

with open('text/edge_EPA_signature_block_2.txt', 'r') as file:
cls.edge_EPA_sigblock_2 = file.read()

with open('text/paragraph_variable_spacing.txt', 'r') as file:
cls.paragraph_variable_spaced = file.read()

with open('text/roman_numerals.txt', 'r') as file:
cls.roman_numerals = file.read()

with open('text/section_titles.txt', 'r') as file:
cls.section_titles = file.read()

with open('text/trailing_table.txt', 'r') as file:
cls.trailing_table = file.read()

cls.split_simple_text = ['1. First Entry.', '2. Second Entry.', '3. Third Entry.']

cls.split_multiline_text = ['1. First Entry. This is a really long entry. It spans multiple lines. Very long. '
Expand All @@ -50,6 +70,10 @@ def setUpClass(cls):
('2. Second Entry.', '2. Second Entry.'),
('3. Third Entry.', '3. Third Entry.')]

cls.EPA_signature_1 = '/s/'

cls.EPA_signature_2 = 'Respectfully submitted,'

def test_opens_PDF(self):
self.assertEqual(self.short_text, interleave.convert_pdf_to_txt('PDFs/Simple.pdf'))

Expand Down Expand Up @@ -86,11 +110,43 @@ def test_removes_page_headers(self):
self.assertNotRegex('\n'.join(interleave.sanitize_text(self.headers)),
r'(\fC.*\d\n)')

def test_removes_section_titles(self):
    """remove_headers should leave 18 blank-line-separated chunks for the section-title fixture."""
    stripped = interleave.remove_headers(self.section_titles)
    chunks = stripped.split('\n\n')
    self.assertEqual(18, len(chunks))
    # chunks is a list, so this only rules out a chunk that is exactly '\n'.
    self.assertNotIn('\n', chunks)

def test_removes_roman_numerals(self):
    """No run of Roman-numeral letters followed by a period may survive remove_headers."""
    roman_heading = r'[IVXCMD]+\.'
    cleaned = interleave.remove_headers(self.roman_numerals)
    self.assertNotRegex(cleaned, roman_heading)

def test_removes_nbsp_formfeed_page_breaks(self):
    """sanitize_text output must contain neither chr(160) (nbsp) nor chr(12) (form feed)."""
    sanitized = interleave.sanitize_text(self.nbsp_formfeed)
    # NOTE(review): other tests join sanitize_text's result with '\n' before
    # asserting; if it returns a list, these are element-membership checks
    # rather than substring checks - confirm against sanitize_text.
    for page_break_char in (chr(160), chr(12)):
        self.assertNotIn(page_break_char, sanitized)

def test_applies_consistent_paragraph_spacing(self):
    """prepare_body_text should yield five blank-line-separated paragraphs for the fixture."""
    normalised = interleave.prepare_body_text(self.paragraph_variable_spaced)
    paragraphs = normalised.split('\n\n')
    self.assertEqual(5, len(paragraphs))
    # paragraphs is a list, so this only rules out a paragraph that is exactly '\n'.
    self.assertNotIn('\n', paragraphs)

def test_applies_consistent_text_spacing(self):
    """prepare_body_text must collapse a run of spaces of any length."""
    # Fixed widths keep a failure reproducible (a random width would not be).
    # Widths above two are the ones a pattern matching exactly two spaces
    # would get wrong, so several are covered alongside the boundary values.
    for width in (2, 3, 4, 5, 64, 1023, 1024):
        with self.subTest(width=width):
            result = interleave.prepare_body_text('first second' + (' ' * width)
                                                  + 'third fourth')
            self.assertNotRegex(result, r' {2,}')

def test_removes_tables_after_doc_body(self):
    """remove_trailing_content should drop 'Table 1:' and leave two blank-line-separated chunks."""
    body = interleave.remove_trailing_content(self.trailing_table)
    self.assertNotIn('Table 1:', body)
    remaining_chunks = body.split('\n\n')
    self.assertEqual(2, len(remaining_chunks))

def test_edge_case_strip_EPA_sigblock_1(self):
    """The first EPA signature marker must be absent from the sanitized text."""
    sanitized_parts = interleave.sanitize_text(self.edge_EPA_sigblock_1)
    joined = '\n'.join(sanitized_parts)
    self.assertNotIn(self.EPA_signature_1, joined)

def test_edge_case_strip_EPA_sigblock_2(self):
    """The second EPA signature marker must be absent from the sanitized text."""
    sanitized_parts = interleave.sanitize_text(self.edge_EPA_sigblock_2)
    joined = '\n'.join(sanitized_parts)
    self.assertNotIn(self.EPA_signature_2, joined)

def test_zip_sentences_to_tuple(self):
list1 = '\n\n' + self.short_text
list2 = '\n\n' + self.short_text
Expand Down
12 changes: 12 additions & 0 deletions test/text/paragraph_variable_spacing.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
1. Word

2. Word


3. Word

4. Word



5. Word
23 changes: 23 additions & 0 deletions test/text/roman_numerals.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
I. One

1. I should still be here

II. Two

III. Three

IV. Four

VI. Six

X. Ten

2. I should still be here

C. One-hundred

D. Five-hundred

M. One-thousand

3. I should still be here
49 changes: 49 additions & 0 deletions test/text/section_titles.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
1.

In the Toxic Substances Control Act (“TSCA”), Congress required the United

INTRODUCTION



States Environmental Protection Agency (“EPA”) to approve any new chemical before it can

first be manufactured.

2. Action by this Court is needed to

ensure Plaintiffs and their members have timely access to information and are able to provide

input on the potential risks of new chemicals and the need for protections from those risks prior

to completion of EPA’s reviews.

JURISDICTION AND VENUE

3.

This action arises under the Toxic Substances Control Act, 15 U.S.C. § 2619.

PARTIES

4.

Environmental Defense Fund (“EDF”) is a membership organization incorporated

under the laws of the State of New York.

LEGAL FRAMEWORK

5. Here's a paragraph.

FACTUAL BACKGROUND

6. Another one.

CLAIMS FOR RELIEF

7. Yup, you guessed it.

PRAYER FOR RELIEF

8. Words.
Loading

0 comments on commit 8dd0abd

Please sign in to comment.