-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Create separate journey test file
* Remove nbsp and formfeed characters
* Fix signature block tests
* Add Help run config
* Refactor method names and test relationships
* Extract edge cases to a separate file
* Remove redundant sequences from regexes
* Add more journey tests
* Remove test-specific string
- Loading branch information
Showing
8 changed files
with
366 additions
and
78 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import unittest | ||
from src import interleave | ||
|
||
|
||
class PDFEdgeCases(unittest.TestCase): | ||
@classmethod | ||
def setUpClass(cls): | ||
with open('text/edge_line_start_numbers.txt', 'r') as file: | ||
cls.edge_numbers = file.read() | ||
|
||
with open('text/edge_missing_paragraphs.txt', 'r') as file: | ||
cls.edge_missing_paragraphs = file.read() | ||
|
||
with open('text/edge_first_paragraph.txt', 'r') as file: | ||
cls.edge_first_paragraph = file.read() | ||
|
||
with open('text/edge_final_paragraph.txt', 'r') as file: | ||
cls.edge_final_paragraph = file.read() | ||
|
||
with open('text/edge_final_paragraph.txt', 'r') as file: | ||
cls.edge_final_paragraph = file.read() | ||
|
||
with open('text/edge_EPA_signature_block_1.txt', 'r') as file: | ||
cls.edge_EPA_sigblock_1 = file.read() | ||
|
||
with open('text/edge_EPA_signature_block_2.txt', 'r') as file: | ||
cls.edge_EPA_sigblock_2 = file.read() | ||
|
||
cls.table_title = 'Table 1: PMNs for which EPA untimely published notice of receipt in the Federal Register' | ||
|
||
cls.EPA_signature_1 = '/s/' | ||
|
||
cls.EPA_signature_2 = 'Respectfully submitted,' | ||
|
||
def test_edge_case_line_starts_with_numeric_sentence_end(self): | ||
result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.edge_numbers)) | ||
self.assertEqual(7, len(result)) # Expect seven paragraphs | ||
|
||
def test_edge_case_missing_paragraphs(self): | ||
result = interleave.build_paragraphs(interleave.sanitize_text('\n\n' + self.edge_missing_paragraphs)) | ||
self.assertEqual(7, len(result)) # Expect seven paragraphs | ||
|
||
def test_edge_case_isolate_first_paragraph(self): | ||
result = interleave.build_paragraphs(interleave.sanitize_text(self.edge_first_paragraph)) | ||
self.assertEqual(1, len(result)) | ||
|
||
def test_edge_case_ignore_trailing_tables(self): | ||
result = interleave.sanitize_text(self.edge_final_paragraph)[0] | ||
self.assertNotIn(self.table_title, result) | ||
|
||
def test_edge_case_strip_EPA_sigblock_1(self): | ||
result = '\n'.join(interleave.sanitize_text(self.edge_EPA_sigblock_1)) | ||
self.assertNotIn(self.EPA_signature_1, result) | ||
|
||
def test_edge_case_strip_EPA_sigblock_2(self): | ||
result = '\n'.join(interleave.sanitize_text(self.edge_EPA_sigblock_2)) | ||
self.assertNotIn(self.EPA_signature_2, result) | ||
|
||
|
||
if __name__ == '__main__': | ||
unittest.main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import unittest | ||
from src import interleave | ||
import textract | ||
|
||
|
||
class PDFJourneyTests(unittest.TestCase):
    """Journey tests that run pairs of sample PDFs through the interleave pipeline.

    Each test sanitizes two extracted documents, builds their paragraphs,
    zips them with ``interleave.zip_sentences`` and checks the length of the
    zipped result.
    """

    @classmethod
    def setUpClass(cls):
        """Extract the text of every sample PDF once and store it on the class."""
        sources = (
            ('complex_PDF_1', 'PDFs/Complex_1.pdf'),
            ('complex_PDF_2', 'PDFs/Complex_2.pdf'),
            ('complex_PDF_3', 'PDFs/Complex_3.pdf'),
            ('complex_PDF_4', 'PDFs/Complex_4.pdf'),
            ('complex_PDF_5', 'PDFs/Complex_5.pdf'),
            ('complex_PDF_6', 'PDFs/Complex_6.pdf'),
        )
        for attribute, pdf_path in sources:
            extracted = textract.process(pdf_path, method='pdfminer')
            setattr(cls, attribute, extracted.decode())

    @staticmethod
    def _zip_documents(first_text, second_text):
        """Sanitize both texts, build their paragraphs, and zip the two results."""
        first_paragraphs = interleave.build_paragraphs(interleave.sanitize_text(first_text))
        second_paragraphs = interleave.build_paragraphs(interleave.sanitize_text(second_text))
        return interleave.zip_sentences(first_paragraphs, second_paragraphs)

    def test_counts_correct_amount_of_paragraphs_for_complex_12(self):
        """Zipping documents 1 and 2 produces 166 entries."""
        zipped = self._zip_documents(self.complex_PDF_1, self.complex_PDF_2)
        self.assertEqual(166, len(zipped))

    def test_counts_correct_amount_of_paragraphs_for_complex_34(self):
        """Zipping documents 3 and 4 produces 64 entries."""
        zipped = self._zip_documents(self.complex_PDF_3, self.complex_PDF_4)
        self.assertEqual(64, len(zipped))

    def test_counts_correct_amount_of_paragraphs_for_complex_56(self):
        """Zipping documents 5 and 6 produces 69 entries."""
        zipped = self._zip_documents(self.complex_PDF_5, self.complex_PDF_6)
        self.assertEqual(69, len(zipped))


if __name__ == '__main__':
    unittest.main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
test/text/edge_EPA_signature_block.txt → test/text/edge_EPA_signature_block_1.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.