From 027084b60c181a8973f07f8dfff941a86e2a8369 Mon Sep 17 00:00:00 2001 From: Sam Mayer Date: Tue, 21 Jul 2020 12:18:56 -0500 Subject: [PATCH] Merge refactoring (#8) * Create separate journey test file * Remove nbsp and formfeed characters * Fix signature block tests * Add Help run config * Refactor method names and test relationships * Extract edge cases to a separate file * Remove redundant sequences from regexes * Add more journey tests * Remove test-specific string --- .idea/runConfigurations/Interleave___Help.xml | 23 +++ src/interleave.py | 36 +++- test/test_edge_cases.py | 61 +++++++ test/test_journey.py | 39 ++++ test/{test_interleave.py => test_unit.py} | 93 +++------- ...ock.txt => edge_EPA_signature_block_1.txt} | 2 +- test/text/edge_EPA_signature_block_2.txt | 171 ++++++++++++++++++ test/text/nbsp_formfeed.txt | 19 ++ 8 files changed, 366 insertions(+), 78 deletions(-) create mode 100644 .idea/runConfigurations/Interleave___Help.xml create mode 100644 test/test_edge_cases.py create mode 100644 test/test_journey.py rename test/{test_interleave.py => test_unit.py} (53%) rename test/text/{edge_EPA_signature_block.txt => edge_EPA_signature_block_1.txt} (92%) create mode 100644 test/text/edge_EPA_signature_block_2.txt create mode 100644 test/text/nbsp_formfeed.txt diff --git a/.idea/runConfigurations/Interleave___Help.xml b/.idea/runConfigurations/Interleave___Help.xml new file mode 100644 index 0000000..98b3edd --- /dev/null +++ b/.idea/runConfigurations/Interleave___Help.xml @@ -0,0 +1,23 @@ + + + + + \ No newline at end of file diff --git a/src/interleave.py b/src/interleave.py index 7952e29..a9abcca 100644 --- a/src/interleave.py +++ b/src/interleave.py @@ -11,7 +11,7 @@ def convert_pdf_to_txt(path): return text -def build_paragraph(input_text): +def build_paragraphs(input_text): matches = re.split(r'(\n\n)(\d+\.\s)', input_text)[2:] result = [''] old_paragraph_number = 0 @@ -25,19 +25,36 @@ def build_paragraph(input_text): return result[1:] -def get_sentences(input_text): +def remove_headers(input_text): sentences = re.sub(r'(\fC.*\d\n)', '', input_text) # Remove Page Headers sentences = re.sub(r'(\n\d+ \n)', '', sentences) # Remove Page Numbers sentences = re.sub(r'\n+[A-Z ]+\n+', '\n\n', sentences) # Remove Section Titles - sentences = re.sub(r'([IVXCMD]+\.[A-Za-z \.\'\’\n-]+)\n(?!\dA-z)', ' \n\n', + sentences = re.sub(r'([IVXCMD]+\.[A-Za-z .\'’\n-]+)\n(?!\dA-z)', ' \n\n', sentences) # Remove Roman Numeral Section Titles but don't conflict with next rule - sentences = re.sub(r'(\n{3,}|\n\n )', '', sentences) # Apply Consistent Paragraph Spacing - sentences = re.sub(r' ', ' ', sentences) # Apply Consistent Text Spacing + sentences = sentences.replace(chr(160), '\n') # Remove nbsp Page Breaks + return sentences + + +def prepare_body_text(input_text): + sentences = re.sub(r'(\n{3,}|\n\n )', '\n\n', input_text) # Apply Consistent Paragraph Spacing + sentences = re.sub(r' {2}', ' ', sentences) # Apply Consistent Text Spacing sentences = re.sub(r'(\S)(\n\n)([A-Za-z])', r'\1 \3', sentences) # Handle EOL without a space sentences = re.sub(r'(\n\) )(1\.\s)', r'\n\n\2', sentences, 1) # Make first paragraph match others - sentences = re.split(r'\n\nTable 1:', sentences)[0] # Remove tables that follow document body + return sentences + + +def remove_trailing_content(input_text): + sentences = re.split(r'\n\nTable 1:', input_text)[0] # Remove tables that follow document body sentences = re.split(r'\s/s/', sentences)[0] # Remove EPA-style signature blocks - return build_paragraph('\n\n' + sentences) # \n\n to match test files with real datasets + sentences = re.split(r'Respectfully submitted,', sentences)[0] # Remove EPA-style signature blocks + return sentences + + +def sanitize_text(input_text): + sentences = remove_headers(input_text) + sentences = prepare_body_text(sentences) + sentences = remove_trailing_content(sentences) + return sentences def zip_sentences(list1, list2): @@ -77,8 +94,9 @@ def main(argv): text_first_file = convert_pdf_to_txt(args.input[0]) text_second_file = convert_pdf_to_txt(args.input[1]) - - filtered_text = zip_sentences(get_sentences(text_first_file), get_sentences(text_second_file)) + # \n\n to match test files with real datasets + filtered_text = zip_sentences(build_paragraphs(sanitize_text(text_first_file)), + build_paragraphs(sanitize_text(text_second_file))) print(create_csv(filtered_text, args.output[0], (args.input[0], args.input[1]))) diff --git a/test/test_edge_cases.py b/test/test_edge_cases.py new file mode 100644 index 0000000..f7a8770 --- /dev/null +++ b/test/test_edge_cases.py @@ -0,0 +1,61 @@ +import unittest +from src import interleave + + +class PDFEdgeCases(unittest.TestCase): + @classmethod + def setUpClass(cls): + with open('text/edge_line_start_numbers.txt', 'r') as file: + cls.edge_numbers = file.read() + + with open('text/edge_missing_paragraphs.txt', 'r') as file: + cls.edge_missing_paragraphs = file.read() + + with open('text/edge_first_paragraph.txt', 'r') as file: + cls.edge_first_paragraph = file.read() + + with open('text/edge_final_paragraph.txt', 'r') as file: + cls.edge_final_paragraph = file.read() + + with open('text/edge_final_paragraph.txt', 'r') as file: + cls.edge_final_paragraph = file.read() + + with open('text/edge_EPA_signature_block_1.txt', 'r') as file: + cls.edge_EPA_sigblock_1 = file.read() + + with open('text/edge_EPA_signature_block_2.txt', 'r') as file: + cls.edge_EPA_sigblock_2 = file.read() + + cls.table_title = 'Table 1: PMNs for which EPA untimely published notice of receipt in the Federal Register' + + cls.EPA_signature_1 = '/s/' + + cls.EPA_signature_2 = 'Respectfully submitted,' + + def test_edge_case_line_starts_with_numeric_sentence_end(self): + result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.edge_numbers)) + self.assertEqual(7, len(result)) # Expect seven paragraphs + + def test_edge_case_missing_paragraphs(self): + result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.edge_missing_paragraphs)) + self.assertEqual(7, len(result)) # Expect seven paragraphs + + def test_edge_case_isolate_first_paragraph(self): + result = interleave.build_paragraphs(interleave.sanitize_text(self.edge_first_paragraph)) + self.assertEqual(1, len(result)) + + def test_edge_case_ignore_trailing_tables(self): + result = interleave.sanitize_text(self.edge_final_paragraph)[0] + self.assertNotIn(self.table_title, result) + + def test_edge_case_strip_EPA_sigblock_1(self): + result = '\n'.join(interleave.sanitize_text(self.edge_EPA_sigblock_1)) + self.assertNotIn(self.EPA_signature_1, result) + + def test_edge_case_strip_EPA_sigblock_2(self): + result = '\n'.join(interleave.sanitize_text(self.edge_EPA_sigblock_2)) + self.assertNotIn(self.EPA_signature_2, result) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_journey.py b/test/test_journey.py new file mode 100644 index 0000000..b9f3ab8 --- /dev/null +++ b/test/test_journey.py @@ -0,0 +1,39 @@ +import unittest +from src import interleave +import textract + + +class PDFJourneyTests(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.complex_PDF_1 = textract.process('PDFs/Complex_1.pdf', method='pdfminer').decode() + + cls.complex_PDF_2 = textract.process('PDFs/Complex_2.pdf', method='pdfminer').decode() + + cls.complex_PDF_3 = textract.process('PDFs/Complex_3.pdf', method='pdfminer').decode() + + cls.complex_PDF_4 = textract.process('PDFs/Complex_4.pdf', method='pdfminer').decode() + + cls.complex_PDF_5 = textract.process('PDFs/Complex_5.pdf', method='pdfminer').decode() + + cls.complex_PDF_6 = textract.process('PDFs/Complex_6.pdf', method='pdfminer').decode() + + def test_counts_correct_amount_of_paragraphs_for_complex_12(self): + result = interleave.zip_sentences(interleave.build_paragraphs(interleave.sanitize_text(self.complex_PDF_1)), + interleave.build_paragraphs(interleave.sanitize_text(self.complex_PDF_2))) + self.assertEqual(166, len(result)) + + def test_counts_correct_amount_of_paragraphs_for_complex_34(self): + result = interleave.zip_sentences(interleave.build_paragraphs(interleave.sanitize_text(self.complex_PDF_3)), + interleave.build_paragraphs(interleave.sanitize_text(self.complex_PDF_4))) + self.assertEqual(64, len(result)) + + def test_counts_correct_amount_of_paragraphs_for_complex_56(self): + result = interleave.zip_sentences(interleave.build_paragraphs(interleave.sanitize_text(self.complex_PDF_5)), + interleave.build_paragraphs(interleave.sanitize_text(self.complex_PDF_6))) + self.assertEqual(69, len(result)) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_interleave.py b/test/test_unit.py similarity index 53% rename from test/test_interleave.py rename to test/test_unit.py index efd93bb..18505a1 100644 --- a/test/test_interleave.py +++ b/test/test_unit.py @@ -1,10 +1,9 @@ import unittest from src import interleave from unittest.mock import patch -import textract -class PDFTests(unittest.TestCase): +class PDFUnitTests(unittest.TestCase): @classmethod def setUpClass(cls): @@ -18,35 +17,16 @@ def setUpClass(cls): cls.multipage_text = file.read() with open('text/1000_paragraphs.txt', 'r') as file: - cls.thousand_paragraphs = file.read() + cls.thousand_paragraphs = '\n\n' + file.read() with open('text/pagenumbers.txt', 'r') as file: - cls.pagenumbers = file.read() + cls.pagenumbers = '\n\n' + file.read() with open('text/headers.txt', 'r') as file: - cls.headers = file.read() + cls.headers = '\n\n' + file.read() - with open('text/edge_line_start_numbers.txt', 'r') as file: - cls.edge_numbers = file.read() - - with open('text/edge_missing_paragraphs.txt', 'r') as file: - cls.edge_missing_paragraphs = file.read() - - with open('text/edge_first_paragraph.txt', 'r') as file: - cls.edge_first_paragraph = file.read() - - with open('text/edge_final_paragraph.txt', 'r') as file: - cls.edge_final_paragraph = file.read() - - with open('text/edge_final_paragraph.txt', 'r') as file: - cls.edge_final_paragraph = file.read() - - with open('text/edge_EPA_signature_block.txt', 'r') as file: - cls.edge_EPA_sigblock = file.read() - - cls.complex_PDF_1 = textract.process('PDFs/Complex_1.pdf', method='pdfminer').decode() - - cls.complex_PDF_2 = textract.process('PDFs/Complex_2.pdf', method='pdfminer').decode() + with open('text/nbsp_formfeed.txt', 'r') as file: + cls.nbsp_formfeed = '\n\n' + file.read() cls.split_simple_text = ['1. First Entry.', '2. Second Entry.', '3. Third Entry.'] @@ -70,10 +50,6 @@ def setUpClass(cls): ('2. Second Entry.', '2. Second Entry.'), ('3. Third Entry.', '3. Third Entry.')] - cls.table_title = 'Table 1: PMNs for which EPA untimely published notice of receipt in the Federal Register' - - cls.EPA_signature = '/s/' - def test_opens_PDF(self): self.assertEqual(self.short_text, interleave.convert_pdf_to_txt('PDFs/Simple.pdf')) @@ -84,63 +60,44 @@ def test_opens_multipage_PDF(self): self.assertEqual(self.multipage_text, interleave.convert_pdf_to_txt('PDFs/Multipage.pdf')) def test_counts_up_to_1000_paragraphs(self): - self.assertEqual(1000, len(interleave.get_sentences(self.thousand_paragraphs))) + result = interleave.build_paragraphs(interleave.sanitize_text(self.thousand_paragraphs)) + self.assertEqual(1000, len(result)) def test_builds_paragraphs_correctly(self): - self.assertEqual(self.split_simple_text, interleave.build_paragraph('\n\n' + self.short_text)) - - def test_counts_correct_amount_of_paragraphs_for_complex_documents(self): - result = interleave.zip_sentences(interleave.get_sentences(self.complex_PDF_1), - interleave.get_sentences(self.complex_PDF_2)) - self.assertEqual(166, len(result)) + self.assertEqual(self.split_simple_text, interleave.build_paragraphs('\n\n' + self.short_text)) def test_splits_short_sentences(self): - self.assertEqual(self.split_simple_text, - interleave.get_sentences(self.short_text)) + result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.short_text)) + self.assertEqual(self.split_simple_text, result) def test_splits_multiline_sentences(self): - self.assertEqual(self.split_multiline_text, - interleave.get_sentences(self.multiline_text)) + result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.multiline_text)) + self.assertEqual(self.split_multiline_text, result) def test_splits_multipage_paragraphs(self): - self.assertEqual(self.split_multipage_text, - interleave.get_sentences(self.multipage_text)) + result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.multipage_text)) + self.assertEqual(self.split_multipage_text, result) def test_removes_page_numbers(self): - self.assertNotRegex('\n'.join(interleave.get_sentences(self.pagenumbers)), + self.assertNotRegex('\n'.join(interleave.sanitize_text(self.pagenumbers)), r'\n\d+ \n') def test_removes_page_headers(self): - self.assertNotRegex('\n'.join(interleave.get_sentences(self.headers)), + self.assertNotRegex('\n'.join(interleave.sanitize_text(self.headers)), r'(\fC.*\d\n)') - def test_edge_case_line_starts_with_numeric_sentence_end(self): - result = interleave.get_sentences(self.edge_numbers) - self.assertEqual(7, len(result)) # Expect seven paragraphs - - def test_edge_case_missing_paragraphs(self): - result = interleave.get_sentences(self.edge_missing_paragraphs) - self.assertEqual(7, len(result)) # Expect seven paragraphs - - def test_edge_case_isolate_first_paragraph(self): - result = interleave.get_sentences(self.edge_first_paragraph) - self.assertEqual(1, len(result)) - - def test_edge_case_ignore_trailing_tables(self): - result = interleave.get_sentences(self.edge_final_paragraph)[0] - self.assertNotIn(self.table_title, result) - - def test_edge_case_strip_EPA_style_sigblock(self): - result = interleave.get_sentences(self.edge_EPA_sigblock) - self.assertNotIn(self.EPA_signature, result) + def test_removes_nbsp_formfeed_page_breaks(self): + result = interleave.sanitize_text(self.nbsp_formfeed) + self.assertNotIn(chr(160), result) + self.assertNotIn(chr(12), result) def test_zip_sentences_to_tuple(self): - list1 = self.short_text - list2 = self.short_text + list1 = '\n\n' + self.short_text + list2 = '\n\n' + self.short_text self.assertEqual(self.processed_text, - interleave.zip_sentences(interleave.get_sentences(list1), - interleave.get_sentences(list2))) + interleave.zip_sentences(interleave.build_paragraphs(interleave.sanitize_text(list1)), + interleave.build_paragraphs(interleave.sanitize_text(list2)))) @patch('builtins.open') @patch('src.interleave.writer', autospec=True) diff --git a/test/text/edge_EPA_signature_block.txt b/test/text/edge_EPA_signature_block_1.txt similarity index 92% rename from test/text/edge_EPA_signature_block.txt rename to test/text/edge_EPA_signature_block_1.txt index b2ac5ec..2a239c8 100644 --- a/test/text/edge_EPA_signature_block.txt +++ b/test/text/edge_EPA_signature_block_1.txt @@ -1,4 +1,4 @@ -165. The allegations in paragraph 165 state legal conclusions that require no response. +1. The allegations in paragraph 165 state legal conclusions that require no response. The remainder of the complaint consists of plaintiffs’ request for relief, to which no response is diff --git a/test/text/edge_EPA_signature_block_2.txt b/test/text/edge_EPA_signature_block_2.txt new file mode 100644 index 0000000..b18878b --- /dev/null +++ b/test/text/edge_EPA_signature_block_2.txt @@ -0,0 +1,171 @@ +1. The allegations in Paragraph 59 state a legal conclusion to which no response is required. + +REQUEST FOR RELIEF + + + +Paragraphs 60-64 consist of Plaintiffs’ request for relief, to which no response is + +required. To the extent a response is required, Defendants deny that Plaintiffs are entitled to any + +relief. + +GENERAL DENIAL + +To the extent that any allegation is not specifically addressed in the preceding paragraphs, + +Defendants deny that allegation. + +  + +10 + + Case 1:17-cv-01023-ESH Document 13 Filed 07/28/17 Page 11 of 12 + +AFFIRMATIVE AND OTHER DEFENSES + +FIRST DEFENSE + +The Court lacks subject matter jurisdiction over one or more of Plaintiffs’ claims. + +SECOND DEFENSE + +One or more of Plaintiffs’ claims fails to state a claim upon which relief can be granted. + +  + + + +Defendants may have additional defenses which are not known at this time but which + +may become known as Plaintiffs clarify their claims. Accordingly, Defendants reserve the right + +to assert each and every affirmative or other defense that may be available, including any + +defenses available under Federal Rules of Civil Procedure 8 or 12, once the precise nature of the + +claims or events is ascertained in the future. + + +Dated: July 28, 2017 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +  + + + + + + + + + + + + + + + + + +Respectfully submitted, + +JEFFREY H. WOOD +Acting Assistant Attorney General +Environment and Natural Resources Division + +/s/ Meghan E. Greenfield . +Meghan E. Greenfield +U.S. Department of Justice +Environment & Natural Resources Division +Environmental Defense Section +P.O. Box 7611 +Washington, D.C. 20044 +Telephone: (202) 514-2795 +Facsimile: (202) 514-8865 +Meghan.Greenfield@usdoj.gov + +Counsel for Defendants + +11 + + Case 1:17-cv-01023-ESH Document 13 Filed 07/28/17 Page 12 of 12 + +  + +CERTIFICATE OF SERVICE + + +On July 28, 2017, I electronically submitted the foregoing document with the Clerk of the + +Court, using the electronic case filing system of the Court. I hereby certify that I have served all + +counsel of record electronically. + + + + + + + + + + + +  + + + + + + + + + + + + + + + + +/s/ Meghan E. Greenfield . +Meghan E. Greenfield + + + + + +12 + + \ No newline at end of file diff --git a/test/text/nbsp_formfeed.txt b/test/text/nbsp_formfeed.txt new file mode 100644 index 0000000..8672e43 --- /dev/null +++ b/test/text/nbsp_formfeed.txt @@ -0,0 +1,19 @@ +1. The allegations in Paragraph 4 state legal conclusions to which no response is required. + +2 + +  + +  + + Case 1:17-cv-01023-ESH Document 13 Filed 07/28/17 Page 3 of 12 + +2. The allegations in the first sentence of Paragraph 5 characterize the Clean Water Act and + +EPA regulations, which speak for themselves and are the best evidence of their contents. + +To the extent those allegations are inconsistent with the Clean Water Act or the cited + +regulations, Defendants deny them. The remaining allegations in the first sentence of + +Paragraph 5, and the allegations in the second sentence of Paragraph 5, are denied. \ No newline at end of file