From 027084b60c181a8973f07f8dfff941a86e2a8369 Mon Sep 17 00:00:00 2001
From: Sam Mayer <samayer12@gmail.com>
Date: Tue, 21 Jul 2020 12:18:56 -0500
Subject: [PATCH] Merge refactoring (#8)

* Create separate journey test file

* Remove nbsp and formfeed characters

* Fix signature block tests

* Add Help run config

* Refactor method names and test relationships

* Extract edge cases to a separate file

* Remove redundant sequences from regexes

* Add more journey tests

* Remove test-specific string
---
 .idea/runConfigurations/Interleave___Help.xml |  23 +++
 src/interleave.py                             |  36 +++-
 test/test_edge_cases.py                       |  61 +++++++
 test/test_journey.py                          |  39 ++++
 test/{test_interleave.py => test_unit.py}     |  93 +++-------
 ...ock.txt => edge_EPA_signature_block_1.txt} |   2 +-
 test/text/edge_EPA_signature_block_2.txt      | 171 ++++++++++++++++++
 test/text/nbsp_formfeed.txt                   |  19 ++
 8 files changed, 366 insertions(+), 78 deletions(-)
 create mode 100644 .idea/runConfigurations/Interleave___Help.xml
 create mode 100644 test/test_edge_cases.py
 create mode 100644 test/test_journey.py
 rename test/{test_interleave.py => test_unit.py} (53%)
 rename test/text/{edge_EPA_signature_block.txt => edge_EPA_signature_block_1.txt} (92%)
 create mode 100644 test/text/edge_EPA_signature_block_2.txt
 create mode 100644 test/text/nbsp_formfeed.txt
diff --git a/.idea/runConfigurations/Interleave___Help.xml b/.idea/runConfigurations/Interleave___Help.xml
new file mode 100644
index 0000000..98b3edd
--- /dev/null
+++ b/.idea/runConfigurations/Interleave___Help.xml
@@ -0,0 +1,23 @@
+<component name="ProjectRunConfigurationManager">
+  <configuration default="false" name="Interleave | Help" type="PythonConfigurationType" factoryName="Python">
+    <module name="Interleave" />
+    <option name="INTERPRETER_OPTIONS" value="" />
+    <option name="PARENT_ENVS" value="true" />
+    <envs>
+      <env name="PYTHONUNBUFFERED" value="1" />
+    </envs>
+    <option name="SDK_HOME" value="/usr/bin/python3.8" />
+    <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/src" />
+    <option name="IS_MODULE_SDK" value="false" />
+    <option name="ADD_CONTENT_ROOTS" value="true" />
+    <option name="ADD_SOURCE_ROOTS" value="true" />
+    <option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/interleave.py" />
+    <option name="PARAMETERS" value="-h" />
+    <option name="SHOW_COMMAND_LINE" value="false" />
+    <option name="EMULATE_TERMINAL" value="false" />
+    <option name="MODULE_MODE" value="false" />
+    <option name="REDIRECT_INPUT" value="false" />
+    <option name="INPUT_FILE" value="" />
+    <method v="2" />
+  </configuration>
+</component>
\ No newline at end of file
diff --git a/src/interleave.py b/src/interleave.py
index 7952e29..a9abcca 100644
--- a/src/interleave.py
+++ b/src/interleave.py
@@ -11,7 +11,7 @@ def convert_pdf_to_txt(path):
     return text
 
 
-def build_paragraph(input_text):
+def build_paragraphs(input_text):
     matches = re.split(r'(\n\n)(\d+\.\s)', input_text)[2:]
     result = ['']
     old_paragraph_number = 0
@@ -25,19 +25,36 @@ def build_paragraph(input_text):
     return result[1:]
 
 
-def get_sentences(input_text):
+def remove_headers(input_text):
     sentences = re.sub(r'(\fC.*\d\n)', '', input_text)  # Remove Page Headers
     sentences = re.sub(r'(\n\d+ \n)', '', sentences)  # Remove Page Numbers
     sentences = re.sub(r'\n+[A-Z ]+\n+', '\n\n', sentences)  # Remove Section Titles
-    sentences = re.sub(r'([IVXCMD]+\.[A-Za-z \.\'\’\n-]+)\n(?!\dA-z)', ' \n\n',
+    sentences = re.sub(r'([IVXCMD]+\.[A-Za-z .\'’\n-]+)\n(?!\dA-z)', ' \n\n',
                        sentences)  # Remove Roman Numeral Section Titles but don't conflict with next rule
-    sentences = re.sub(r'(\n{3,}|\n\n )', '', sentences)  # Apply Consistent Paragraph Spacing
-    sentences = re.sub(r'  ', ' ', sentences)  # Apply Consistent Text Spacing
+    sentences = sentences.replace(chr(160), '\n')  # Remove nbsp Page Breaks
+    return sentences
+
+
+def prepare_body_text(input_text):
+    sentences = re.sub(r'(\n{3,}|\n\n )', '\n\n', input_text)  # Apply Consistent Paragraph Spacing
+    sentences = re.sub(r' {2}', ' ', sentences)  # Apply Consistent Text Spacing
     sentences = re.sub(r'(\S)(\n\n)([A-Za-z])', r'\1 \3', sentences)  # Handle EOL without a space
     sentences = re.sub(r'(\n\) )(1\.\s)', r'\n\n\2', sentences, 1)  # Make first paragraph match others
-    sentences = re.split(r'\n\nTable 1:', sentences)[0]  # Remove tables that follow document body
+    return sentences
+
+
+def remove_trailing_content(input_text):
+    sentences = re.split(r'\n\nTable 1:', input_text)[0]  # Remove tables that follow document body
     sentences = re.split(r'\s/s/', sentences)[0]  # Remove EPA-style signature blocks
-    return build_paragraph('\n\n' + sentences)  # \n\n to match test files with real datasets
+    sentences = re.split(r'Respectfully submitted,', sentences)[0]  # Cut text from the closing salutation onward
+    return sentences
+
+
+def sanitize_text(input_text):
+    sentences = remove_headers(input_text)
+    sentences = prepare_body_text(sentences)
+    sentences = remove_trailing_content(sentences)
+    return sentences
 
 
 def zip_sentences(list1, list2):
@@ -77,8 +94,9 @@ def main(argv):
 
         text_first_file = convert_pdf_to_txt(args.input[0])
         text_second_file = convert_pdf_to_txt(args.input[1])
-
-        filtered_text = zip_sentences(get_sentences(text_first_file), get_sentences(text_second_file))
+        # Sanitize each document and split it into numbered paragraphs before pairing them
+        filtered_text = zip_sentences(build_paragraphs(sanitize_text(text_first_file)),
+                                      build_paragraphs(sanitize_text(text_second_file)))
         print(create_csv(filtered_text, args.output[0], (args.input[0], args.input[1])))
 
 
diff --git a/test/test_edge_cases.py b/test/test_edge_cases.py
new file mode 100644
index 0000000..f7a8770
--- /dev/null
+++ b/test/test_edge_cases.py
@@ -0,0 +1,61 @@
+import unittest
+from src import interleave
+
+
+class PDFEdgeCases(unittest.TestCase):
+    """Edge cases for sanitize_text and build_paragraphs.
+
+    sanitize_text returns one str, so assertions inspect it directly."""
+    @classmethod
+    def setUpClass(cls):
+        with open('text/edge_line_start_numbers.txt', 'r') as file:
+            cls.edge_numbers = file.read()
+
+        with open('text/edge_missing_paragraphs.txt', 'r') as file:
+            cls.edge_missing_paragraphs = file.read()
+
+        with open('text/edge_first_paragraph.txt', 'r') as file:
+            cls.edge_first_paragraph = file.read()
+
+        with open('text/edge_final_paragraph.txt', 'r') as file:
+            cls.edge_final_paragraph = file.read()
+
+        with open('text/edge_EPA_signature_block_1.txt', 'r') as file:
+            cls.edge_EPA_sigblock_1 = file.read()
+
+        with open('text/edge_EPA_signature_block_2.txt', 'r') as file:
+            cls.edge_EPA_sigblock_2 = file.read()
+
+        cls.table_title = 'Table 1: PMNs for which EPA untimely published notice of receipt in the Federal Register'
+
+        cls.EPA_signature_1 = '/s/'
+
+        cls.EPA_signature_2 = 'Respectfully submitted,'
+
+    def test_edge_case_line_starts_with_numeric_sentence_end(self):
+        result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.edge_numbers))
+        self.assertEqual(7, len(result))  # Expect seven paragraphs
+
+    def test_edge_case_missing_paragraphs(self):
+        result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.edge_missing_paragraphs))
+        self.assertEqual(7, len(result))  # Expect seven paragraphs
+
+    def test_edge_case_isolate_first_paragraph(self):
+        result = interleave.build_paragraphs(interleave.sanitize_text(self.edge_first_paragraph))
+        self.assertEqual(1, len(result))
+
+    def test_edge_case_ignore_trailing_tables(self):
+        result = interleave.sanitize_text(self.edge_final_paragraph)
+        self.assertNotIn(self.table_title, result)
+
+    def test_edge_case_strip_EPA_sigblock_1(self):
+        result = interleave.sanitize_text(self.edge_EPA_sigblock_1)
+        self.assertNotIn(self.EPA_signature_1, result)
+
+    def test_edge_case_strip_EPA_sigblock_2(self):
+        result = interleave.sanitize_text(self.edge_EPA_sigblock_2)
+        self.assertNotIn(self.EPA_signature_2, result)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/test_journey.py b/test/test_journey.py
new file mode 100644
index 0000000..b9f3ab8
--- /dev/null
+++ b/test/test_journey.py
@@ -0,0 +1,39 @@
+import unittest
+from src import interleave
+import textract
+
+
+class PDFJourneyTests(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.complex_PDF_1 = textract.process('PDFs/Complex_1.pdf', method='pdfminer').decode()
+
+        cls.complex_PDF_2 = textract.process('PDFs/Complex_2.pdf', method='pdfminer').decode()
+
+        cls.complex_PDF_3 = textract.process('PDFs/Complex_3.pdf', method='pdfminer').decode()
+
+        cls.complex_PDF_4 = textract.process('PDFs/Complex_4.pdf', method='pdfminer').decode()
+
+        cls.complex_PDF_5 = textract.process('PDFs/Complex_5.pdf', method='pdfminer').decode()
+
+        cls.complex_PDF_6 = textract.process('PDFs/Complex_6.pdf', method='pdfminer').decode()
+
+    def test_counts_correct_amount_of_paragraphs_for_complex_12(self):
+        result = interleave.zip_sentences(interleave.build_paragraphs(interleave.sanitize_text(self.complex_PDF_1)),
+                                          interleave.build_paragraphs(interleave.sanitize_text(self.complex_PDF_2)))
+        self.assertEqual(166, len(result))
+
+    def test_counts_correct_amount_of_paragraphs_for_complex_34(self):
+        result = interleave.zip_sentences(interleave.build_paragraphs(interleave.sanitize_text(self.complex_PDF_3)),
+                                          interleave.build_paragraphs(interleave.sanitize_text(self.complex_PDF_4)))
+        self.assertEqual(64, len(result))
+
+    def test_counts_correct_amount_of_paragraphs_for_complex_56(self):
+        result = interleave.zip_sentences(interleave.build_paragraphs(interleave.sanitize_text(self.complex_PDF_5)),
+                                          interleave.build_paragraphs(interleave.sanitize_text(self.complex_PDF_6)))
+        self.assertEqual(69, len(result))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/test_interleave.py b/test/test_unit.py
similarity index 53%
rename from test/test_interleave.py
rename to test/test_unit.py
index efd93bb..18505a1 100644
--- a/test/test_interleave.py
+++ b/test/test_unit.py
@@ -1,10 +1,9 @@
 import unittest
 from src import interleave
 from unittest.mock import patch
-import textract
 
 
-class PDFTests(unittest.TestCase):
+class PDFUnitTests(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
@@ -18,35 +17,16 @@ def setUpClass(cls):
             cls.multipage_text = file.read()
 
         with open('text/1000_paragraphs.txt', 'r') as file:
-            cls.thousand_paragraphs = file.read()
+            cls.thousand_paragraphs = '\n\n' + file.read()
 
         with open('text/pagenumbers.txt', 'r') as file:
-            cls.pagenumbers = file.read()
+            cls.pagenumbers = '\n\n' + file.read()
 
         with open('text/headers.txt', 'r') as file:
-            cls.headers = file.read()
+            cls.headers = '\n\n' + file.read()
 
-        with open('text/edge_line_start_numbers.txt', 'r') as file:
-            cls.edge_numbers = file.read()
-
-        with open('text/edge_missing_paragraphs.txt', 'r') as file:
-            cls.edge_missing_paragraphs = file.read()
-
-        with open('text/edge_first_paragraph.txt', 'r') as file:
-            cls.edge_first_paragraph = file.read()
-
-        with open('text/edge_final_paragraph.txt', 'r') as file:
-            cls.edge_final_paragraph = file.read()
-
-        with open('text/edge_final_paragraph.txt', 'r') as file:
-            cls.edge_final_paragraph = file.read()
-
-        with open('text/edge_EPA_signature_block.txt', 'r') as file:
-            cls.edge_EPA_sigblock = file.read()
-
-        cls.complex_PDF_1 = textract.process('PDFs/Complex_1.pdf', method='pdfminer').decode()
-
-        cls.complex_PDF_2 = textract.process('PDFs/Complex_2.pdf', method='pdfminer').decode()
+        with open('text/nbsp_formfeed.txt', 'r') as file:
+            cls.nbsp_formfeed = '\n\n' + file.read()
 
         cls.split_simple_text = ['1. First Entry.', '2. Second Entry.', '3. Third Entry.']
 
@@ -70,10 +50,6 @@ def setUpClass(cls):
                               ('2. Second Entry.', '2. Second Entry.'),
                               ('3. Third Entry.', '3. Third Entry.')]
 
-        cls.table_title = 'Table 1: PMNs for which EPA untimely published notice of receipt in the Federal Register'
-
-        cls.EPA_signature = '/s/'
-
     def test_opens_PDF(self):
         self.assertEqual(self.short_text, interleave.convert_pdf_to_txt('PDFs/Simple.pdf'))
 
@@ -84,63 +60,44 @@ def test_opens_multipage_PDF(self):
         self.assertEqual(self.multipage_text, interleave.convert_pdf_to_txt('PDFs/Multipage.pdf'))
 
     def test_counts_up_to_1000_paragraphs(self):
-        self.assertEqual(1000, len(interleave.get_sentences(self.thousand_paragraphs)))
+        result = interleave.build_paragraphs(interleave.sanitize_text(self.thousand_paragraphs))
+        self.assertEqual(1000, len(result))
 
     def test_builds_paragraphs_correctly(self):
-        self.assertEqual(self.split_simple_text, interleave.build_paragraph('\n\n' + self.short_text))
-
-    def test_counts_correct_amount_of_paragraphs_for_complex_documents(self):
-        result = interleave.zip_sentences(interleave.get_sentences(self.complex_PDF_1),
-                                          interleave.get_sentences(self.complex_PDF_2))
-        self.assertEqual(166, len(result))
+        self.assertEqual(self.split_simple_text, interleave.build_paragraphs('\n\n' + self.short_text))
 
     def test_splits_short_sentences(self):
-        self.assertEqual(self.split_simple_text,
-                         interleave.get_sentences(self.short_text))
+        result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.short_text))
+        self.assertEqual(self.split_simple_text, result)
 
     def test_splits_multiline_sentences(self):
-        self.assertEqual(self.split_multiline_text,
-                         interleave.get_sentences(self.multiline_text))
+        result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.multiline_text))
+        self.assertEqual(self.split_multiline_text, result)
 
     def test_splits_multipage_paragraphs(self):
-        self.assertEqual(self.split_multipage_text,
-                         interleave.get_sentences(self.multipage_text))
+        result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.multipage_text))
+        self.assertEqual(self.split_multipage_text, result)
 
     def test_removes_page_numbers(self):
-        self.assertNotRegex('\n'.join(interleave.get_sentences(self.pagenumbers)),
+        self.assertNotRegex(interleave.sanitize_text(self.pagenumbers),
                             r'\n\d+ \n')
 
     def test_removes_page_headers(self):
-        self.assertNotRegex('\n'.join(interleave.get_sentences(self.headers)),
+        self.assertNotRegex(interleave.sanitize_text(self.headers),
                             r'(\fC.*\d\n)')
 
-    def test_edge_case_line_starts_with_numeric_sentence_end(self):
-        result = interleave.get_sentences(self.edge_numbers)
-        self.assertEqual(7, len(result))  # Expect seven paragraphs
-
-    def test_edge_case_missing_paragraphs(self):
-        result = interleave.get_sentences(self.edge_missing_paragraphs)
-        self.assertEqual(7, len(result))  # Expect seven paragraphs
-
-    def test_edge_case_isolate_first_paragraph(self):
-        result = interleave.get_sentences(self.edge_first_paragraph)
-        self.assertEqual(1, len(result))
-
-    def test_edge_case_ignore_trailing_tables(self):
-        result = interleave.get_sentences(self.edge_final_paragraph)[0]
-        self.assertNotIn(self.table_title, result)
-
-    def test_edge_case_strip_EPA_style_sigblock(self):
-        result = interleave.get_sentences(self.edge_EPA_sigblock)
-        self.assertNotIn(self.EPA_signature, result)
+    def test_removes_nbsp_formfeed_page_breaks(self):
+        result = interleave.sanitize_text(self.nbsp_formfeed)
+        self.assertNotIn(chr(160), result)
+        self.assertNotIn(chr(12), result)
 
     def test_zip_sentences_to_tuple(self):
-        list1 = self.short_text
-        list2 = self.short_text
+        list1 = '\n\n' + self.short_text
+        list2 = '\n\n' + self.short_text
 
         self.assertEqual(self.processed_text,
-                         interleave.zip_sentences(interleave.get_sentences(list1),
-                                                  interleave.get_sentences(list2)))
+                         interleave.zip_sentences(interleave.build_paragraphs(interleave.sanitize_text(list1)),
+                                                  interleave.build_paragraphs(interleave.sanitize_text(list2))))
 
     @patch('builtins.open')
     @patch('src.interleave.writer', autospec=True)
diff --git a/test/text/edge_EPA_signature_block.txt b/test/text/edge_EPA_signature_block_1.txt
similarity index 92%
rename from test/text/edge_EPA_signature_block.txt
rename to test/text/edge_EPA_signature_block_1.txt
index b2ac5ec..2a239c8 100644
--- a/test/text/edge_EPA_signature_block.txt
+++ b/test/text/edge_EPA_signature_block_1.txt
@@ -1,4 +1,4 @@
-165. The allegations in paragraph 165 state legal conclusions that require no response. 
+1. The allegations in paragraph 165 state legal conclusions that require no response. 
 
 The remainder of the complaint consists of plaintiffs’ request for relief, to which no response is 
 
diff --git a/test/text/edge_EPA_signature_block_2.txt b/test/text/edge_EPA_signature_block_2.txt
new file mode 100644
index 0000000..b18878b
--- /dev/null
+++ b/test/text/edge_EPA_signature_block_2.txt
@@ -0,0 +1,171 @@
+1. The allegations in Paragraph 59 state a legal conclusion to which no response is required. 
+
+REQUEST FOR RELIEF 
+
+ 
+
+Paragraphs 60-64 consist of Plaintiffs’ request for relief, to which no response is 
+
+required. To the extent a response is required, Defendants deny that Plaintiffs are entitled to any 
+
+relief. 
+
+GENERAL DENIAL 
+
+To the extent that any allegation is not specifically addressed in the preceding paragraphs, 
+
+Defendants deny that allegation.  
+
+ 
+
+10 
+
+Case 1:17-cv-01023-ESH   Document 13   Filed 07/28/17   Page 11 of 12
+
+AFFIRMATIVE AND OTHER DEFENSES 
+
+FIRST DEFENSE 
+
+The Court lacks subject matter jurisdiction over one or more of Plaintiffs’ claims.  
+
+SECOND DEFENSE 
+
+One or more of Plaintiffs’ claims fails to state a claim upon which relief can be granted.  
+
+ 
+
+ 
+
+Defendants may have additional defenses which are not known at this time but which 
+
+may become known as Plaintiffs clarify their claims. Accordingly, Defendants reserve the right 
+
+to assert each and every affirmative or other defense that may be available, including any 
+
+defenses available under Federal Rules of Civil Procedure 8 or 12, once the precise nature of the 
+
+claims or events is ascertained in the future. 
+
+ 
+Dated: July 28, 2017   
+ 
+ 
+ 
+ 
+ 
+ 
+
+ 
+ 
+ 
+
+ 
+ 
+ 
+
+ 
+ 
+ 
+
+ 
+
+ 
+
+ 
+
+ 
+
+ 
+
+ 
+ 
+
+ 
+
+ 
+
+ 
+ 
+ 
+
+ 
+
+ 
+
+ 
+ 
+ 
+
+ 
+
+Respectfully submitted, 
+
+JEFFREY H. WOOD 
+Acting Assistant Attorney General 
+Environment and Natural Resources Division 
+
+/s/ Meghan E. Greenfield . 
+Meghan E. Greenfield 
+U.S. Department of Justice 
+Environment & Natural Resources Division 
+Environmental Defense Section 
+P.O. Box 7611 
+Washington, D.C. 20044 
+Telephone: (202) 514-2795 
+Facsimile: (202) 514-8865 
+Meghan.Greenfield@usdoj.gov 
+ 
+Counsel for Defendants 
+
+11 
+
+Case 1:17-cv-01023-ESH   Document 13   Filed 07/28/17   Page 12 of 12
+
+ 
+
+CERTIFICATE OF SERVICE 
+
+ 
+On July 28, 2017, I electronically submitted the foregoing document with the Clerk of the 
+
+Court, using the electronic case filing system of the Court. I hereby certify that I have served all 
+
+counsel of record electronically. 
+
+ 
+ 
+ 
+
+ 
+
+ 
+ 
+ 
+
+ 
+
+ 
+ 
+
+ 
+ 
+
+ 
+  
+
+ 
+ 
+
+ 
+ 
+
+/s/ Meghan E. Greenfield . 
+Meghan E. Greenfield  
+
+ 
+
+ 
+
+12 
+
+
\ No newline at end of file
diff --git a/test/text/nbsp_formfeed.txt b/test/text/nbsp_formfeed.txt
new file mode 100644
index 0000000..8672e43
--- /dev/null
+++ b/test/text/nbsp_formfeed.txt
@@ -0,0 +1,19 @@
+1.  The allegations in Paragraph 4 state legal conclusions to which no response is required. 
+
+2 
+
+ 
+
+ 
+
+Case 1:17-cv-01023-ESH   Document 13   Filed 07/28/17   Page 3 of 12
+
+2.  The allegations in the first sentence of Paragraph 5 characterize the Clean Water Act and 
+
+EPA regulations, which speak for themselves and are the best evidence of their contents. 
+
+To the extent those allegations are inconsistent with the Clean Water Act or the cited 
+
+regulations, Defendants deny them. The remaining allegations in the first sentence of 
+
+Paragraph 5, and the allegations in the second sentence of Paragraph 5, are denied. 
\ No newline at end of file