Test refactors (#9)

* Create separate journey test file
* Remove nbsp and formfeed characters
* Fix signature block tests
* Add Help run config
* Refactor method names and test relationships
* Extract edge cases to a separate file
* Remove redundant sequences from regexes
* Add more journey tests
* Remove test-specific string
* Refactor and add more tests
samayer12 · Jul 21, 2020 · 8dd0abd · 8dd0abd
1 parent 027084b
commit 8dd0abd
Show file tree

Hide file tree

Showing 7 changed files with 4,278 additions and 22 deletions.
diff --git a/src/interleave.py b/src/interleave.py
@@ -29,15 +29,15 @@ def remove_headers(input_text):
     sentences = re.sub(r'(\fC.*\d\n)', '', input_text)  # Remove Page Headers
     sentences = re.sub(r'(\n\d+ \n)', '', sentences)  # Remove Page Numbers
     sentences = re.sub(r'\n+[A-Z ]+\n+', '\n\n', sentences)  # Remove Section Titles
-    sentences = re.sub(r'([IVXCMD]+\.[A-Za-z .\'’\n-]+)\n(?!\dA-z)', ' \n\n',
-                       sentences)  # Remove Roman Numeral Section Titles but don't conflict with next rule
+    sentences = re.sub(r'([IVXCMD]+\.[A-Za-z .\'’\n-]+)\n(?!\dA-z)', '',
+                       sentences)  # Remove Roman Numeral Section Titles
     sentences = sentences.replace(chr(160), '\n')  # Remove nbsp Page Breaks
     return sentences
 
 
 def prepare_body_text(input_text):
     sentences = re.sub(r'(\n{3,}|\n\n )', '\n\n', input_text)  # Apply Consistent Paragraph Spacing
-    sentences = re.sub(r' {2}', ' ', sentences)  # Apply Consistent Text Spacing
+    sentences = re.sub(r' {2,}', ' ', sentences)  # Apply Consistent Text Spacing
     sentences = re.sub(r'(\S)(\n\n)([A-Za-z])', r'\1 \3', sentences)  # Handle EOL without a space
     sentences = re.sub(r'(\n\) )(1\.\s)', r'\n\n\2', sentences, 1)  # Make first paragraph match others
     return sentences
@@ -94,7 +94,6 @@ def main(argv):
 
         text_first_file = convert_pdf_to_txt(args.input[0])
         text_second_file = convert_pdf_to_txt(args.input[1])
-        # \n\n to match test files with real datasets
         filtered_text = zip_sentences(build_paragraphs(sanitize_text(text_first_file)),
                                       build_paragraphs(sanitize_text(text_second_file)))
         print(create_csv(filtered_text, args.output[0], (args.input[0], args.input[1])))

diff --git a/test/test_edge_cases.py b/test/test_edge_cases.py
@@ -20,18 +20,9 @@ def setUpClass(cls):
         with open('text/edge_final_paragraph.txt', 'r') as file:
             cls.edge_final_paragraph = file.read()
 
-        with open('text/edge_EPA_signature_block_1.txt', 'r') as file:
-            cls.edge_EPA_sigblock_1 = file.read()
-
-        with open('text/edge_EPA_signature_block_2.txt', 'r') as file:
-            cls.edge_EPA_sigblock_2 = file.read()
-
         cls.table_title = 'Table 1: PMNs for which EPA untimely published notice of receipt in the Federal Register'
 
-        cls.EPA_signature_1 = '/s/'
-
-        cls.EPA_signature_2 = 'Respectfully submitted,'
-
+
     def test_edge_case_line_starts_with_numeric_sentence_end(self):
         result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.edge_numbers))
         self.assertEqual(7, len(result))  # Expect seven paragraphs
@@ -48,14 +39,6 @@ def test_edge_case_ignore_trailing_tables(self):
         result = interleave.sanitize_text(self.edge_final_paragraph)[0]
         self.assertNotIn(self.table_title, result)
 
-    def test_edge_case_strip_EPA_sigblock_1(self):
-        result = '\n'.join(interleave.sanitize_text(self.edge_EPA_sigblock_1))
-        self.assertNotIn(self.EPA_signature_1, result)
-
-    def test_edge_case_strip_EPA_sigblock_2(self):
-        result = '\n'.join(interleave.sanitize_text(self.edge_EPA_sigblock_2))
-        self.assertNotIn(self.EPA_signature_2, result)
-
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/test_unit.py b/test/test_unit.py
@@ -1,6 +1,8 @@
 import unittest
 from src import interleave
 from unittest.mock import patch
+from random import randint
+
 
 
 class PDFUnitTests(unittest.TestCase):
@@ -28,6 +30,24 @@ def setUpClass(cls):
         with open('text/nbsp_formfeed.txt', 'r') as file:
             cls.nbsp_formfeed = '\n\n' + file.read()
 
+        with open('text/edge_EPA_signature_block_1.txt', 'r') as file:
+            cls.edge_EPA_sigblock_1 = file.read()
+
+        with open('text/edge_EPA_signature_block_2.txt', 'r') as file:
+            cls.edge_EPA_sigblock_2 = file.read()
+
+        with open('text/paragraph_variable_spacing.txt', 'r') as file:
+            cls.paragraph_variable_spaced = file.read()
+
+        with open('text/roman_numerals.txt', 'r') as file:
+            cls.roman_numerals = file.read()
+
+        with open('text/section_titles.txt', 'r') as file:
+            cls.section_titles = file.read()
+
+        with open('text/trailing_table.txt', 'r') as file:
+            cls.trailing_table = file.read()
+
         cls.split_simple_text = ['1. First Entry.', '2. Second Entry.', '3. Third Entry.']
 
         cls.split_multiline_text = ['1. First Entry. This is a really long entry. It spans multiple lines. Very long. '
@@ -50,6 +70,10 @@ def setUpClass(cls):
                               ('2. Second Entry.', '2. Second Entry.'),
                               ('3. Third Entry.', '3. Third Entry.')]
 
+        cls.EPA_signature_1 = '/s/'
+
+        cls.EPA_signature_2 = 'Respectfully submitted,'
+
     def test_opens_PDF(self):
         self.assertEqual(self.short_text, interleave.convert_pdf_to_txt('PDFs/Simple.pdf'))
 
@@ -86,11 +110,43 @@ def test_removes_page_headers(self):
         self.assertNotRegex('\n'.join(interleave.sanitize_text(self.headers)),
                             r'(\fC.*\d\n)')
 
+    def test_removes_section_titles(self):
+        result = (interleave.remove_headers(self.section_titles)).split('\n\n')
+        self.assertEqual(18, len(result))
+        self.assertNotIn('\n', result)
+
+    def test_removes_roman_numerals(self):
+        result = interleave.remove_headers(self.roman_numerals)
+        self.assertNotRegex(result, r'[IVXCMD]+\.')
+
     def test_removes_nbsp_formfeed_page_breaks(self):
         result = interleave.sanitize_text(self.nbsp_formfeed)
         self.assertNotIn(chr(160), result)
         self.assertNotIn(chr(12), result)
 
+    def test_applies_consistent_paragraph_spacing(self):
+        result = (interleave.prepare_body_text(self.paragraph_variable_spaced)).split('\n\n')
+        self.assertEqual(5, len(result))
+        self.assertNotIn('\n', result)
+
+    def test_applies_consistent_text_spacing(self):
+        result = interleave.prepare_body_text('first second' + (' ' * randint(2, 1024))
+                                              + 'third  fourth')
+        self.assertNotRegex(result, r' {2,}')
+
+    def test_removes_tables_after_doc_body(self):
+        result = interleave.remove_trailing_content(self.trailing_table)
+        self.assertNotIn('Table 1:', result)
+        self.assertEqual(2, len(result.split('\n\n')))
+
+    def test_edge_case_strip_EPA_sigblock_1(self):
+        result = '\n'.join(interleave.sanitize_text(self.edge_EPA_sigblock_1))
+        self.assertNotIn(self.EPA_signature_1, result)
+
+    def test_edge_case_strip_EPA_sigblock_2(self):
+        result = '\n'.join(interleave.sanitize_text(self.edge_EPA_sigblock_2))
+        self.assertNotIn(self.EPA_signature_2, result)
+
     def test_zip_sentences_to_tuple(self):
         list1 = '\n\n' + self.short_text
         list2 = '\n\n' + self.short_text

diff --git a/test/text/paragraph_variable_spacing.txt b/test/text/paragraph_variable_spacing.txt
@@ -0,0 +1,12 @@
+1. Word
+
+2. Word
+
+
+3. Word
+
+ 4. Word
+
+
+
+5. Word
diff --git a/test/text/roman_numerals.txt b/test/text/roman_numerals.txt
@@ -0,0 +1,23 @@
+I. One
+
+1. I should still be here
+
+II. Two
+
+III. Three
+
+IV. Four
+
+VI. Six
+
+X. Ten
+
+2. I should still be here
+
+C. One-hundred
+
+D. Five-hundred
+
+M. One-thousand
+
+3. I should still be here
diff --git a/test/text/section_titles.txt b/test/text/section_titles.txt
@@ -0,0 +1,49 @@
+1. 
+
+In the Toxic Substances Control Act (“TSCA”), Congress required the United 
+
+INTRODUCTION 
+
+
+
+States Environmental Protection Agency (“EPA”) to approve any new chemical before it can 
+
+first be manufactured.
+
+2. Action by this Court is needed to 
+
+ensure Plaintiffs and their members have timely access to information and are able to provide 
+
+input on the potential risks of new chemicals and the need for protections from those risks prior 
+
+to completion of EPA’s reviews. 
+
+JURISDICTION AND VENUE 
+
+3. 
+
+This action arises under the Toxic Substances Control Act, 15 U.S.C. § 2619.
+
+PARTIES 
+
+4. 
+
+Environmental Defense Fund (“EDF”) is a membership organization incorporated 
+
+under the laws of the State of New York.
+
+LEGAL FRAMEWORK
+
+5. Here's a paragraph.
+
+FACTUAL BACKGROUND
+
+6. Another one.
+
+CLAIMS FOR RELIEF
+
+7. Yup, you guessed it.
+
+PRAYER FOR RELIEF
+
+8. Words.
-Original file line number
+Diff line change
@@ -0,0 +1,12 @@
+. Word
+. Word
+. Word
+. Word
+. Word