From b65f5e4f0858b9b785fbdf1a6c8694a970033e65 Mon Sep 17 00:00:00 2001 From: Finn Date: Thu, 16 May 2024 07:31:38 -0700 Subject: [PATCH] Enhance tika document parsing tests (#13618) * Update tika document parsing bwc tests. Signed-off-by: Carroll * Skip sample tika files which do not parse consistently. Signed-off-by: Carroll * Formatting for spotlessJavaCheck. Signed-off-by: Carroll * Use fixed locale for consistent tika parsing. Signed-off-by: Carroll * Move sha1 map to .checksums file. Signed-off-by: Carroll * For locale dependant files do not verify contents with hash. Signed-off-by: Carroll * Remove strict checksum validation for additional locale dependant files. Signed-off-by: Carroll --------- Signed-off-by: Carroll --- .../ingest/attachment/TikaDocTests.java | 65 +++--- .../ingest/attachment/test/.checksums | 209 ++++++++++++++++++ 2 files changed, 248 insertions(+), 26 deletions(-) create mode 100644 plugins/ingest-attachment/src/test/resources/org/opensearch/ingest/attachment/test/.checksums diff --git a/plugins/ingest-attachment/src/test/java/org/opensearch/ingest/attachment/TikaDocTests.java b/plugins/ingest-attachment/src/test/java/org/opensearch/ingest/attachment/TikaDocTests.java index 7f4a9b8ca0ac7..a022b8b9bf8b0 100644 --- a/plugins/ingest-attachment/src/test/java/org/opensearch/ingest/attachment/TikaDocTests.java +++ b/plugins/ingest-attachment/src/test/java/org/opensearch/ingest/attachment/TikaDocTests.java @@ -32,54 +32,67 @@ package org.opensearch.ingest.attachment; +import org.apache.commons.codec.digest.DigestUtils; import org.apache.lucene.tests.util.LuceneTestCase.SuppressFileSystems; import org.apache.lucene.tests.util.TestUtil; import org.apache.tika.metadata.Metadata; import org.opensearch.common.io.PathUtils; +import org.opensearch.common.xcontent.XContentHelper; +import org.opensearch.common.xcontent.json.JsonXContent; import org.opensearch.test.OpenSearchTestCase; import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; +import java.util.Map; /** - * Evil test-coverage cheat, we parse a bunch of docs from tika - * so that we have a nice grab-bag variety, and assert some content - * comes back and no exception. + * Parse sample tika documents and assert the contents has not changed according to previously recorded checksums. + * Uncaught changes to tika parsing could potentially pose bwc issues. + * Note: In some cases tika will access a user's locale to inform the parsing of a file. + * The checksums of these files are left empty, and we only validate that parsed content is not null. */ @SuppressFileSystems("ExtrasFS") // don't try to parse extraN public class TikaDocTests extends OpenSearchTestCase { - /** some test files from tika test suite, zipped up */ + /** some test files from the apache tika unit test suite with accompanying sha1 checksums */ static final String TIKA_FILES = "/org/opensearch/ingest/attachment/test/tika-files/"; + static final String TIKA_CHECKSUMS = "/org/opensearch/ingest/attachment/test/.checksums"; - public void testFiles() throws Exception { - Path tmp = createTempDir(); - logger.debug("unzipping all tika sample files"); - try (DirectoryStream stream = Files.newDirectoryStream(PathUtils.get(getClass().getResource(TIKA_FILES).toURI()))) { - for (Path doc : stream) { - String filename = doc.getFileName().toString(); - TestUtil.unzip(getClass().getResourceAsStream(TIKA_FILES + filename), tmp); - } - } + public void testParseSamples() throws Exception { + String checksumJson = Files.readString(PathUtils.get(getClass().getResource(TIKA_CHECKSUMS).toURI())); + Map checksums = XContentHelper.convertToMap(JsonXContent.jsonXContent, checksumJson, false); + DirectoryStream stream = Files.newDirectoryStream(unzipToTemp(TIKA_FILES)); - try (DirectoryStream stream = Files.newDirectoryStream(tmp)) { - for (Path doc : stream) { - logger.debug("parsing: {}", doc); - assertParseable(doc); + for (Path doc : stream) { + String parsedContent = tryParse(doc); + assertNotNull(parsedContent); + assertFalse(parsedContent.isEmpty()); + + String check = checksums.get(doc.getFileName().toString()).toString(); + if (!check.isEmpty()) { + assertEquals(check, DigestUtils.sha1Hex(parsedContent)); } } + + stream.close(); } - void assertParseable(Path fileName) throws Exception { - try { - byte bytes[] = Files.readAllBytes(fileName); - String parsedContent = TikaImpl.parse(bytes, new Metadata(), -1); - assertNotNull(parsedContent); - assertFalse(parsedContent.isEmpty()); - logger.debug("extracted content: {}", parsedContent); - } catch (Exception e) { - throw new RuntimeException("parsing of filename: " + fileName.getFileName() + " failed", e); + private Path unzipToTemp(String zipDir) throws Exception { + Path tmp = createTempDir(); + DirectoryStream stream = Files.newDirectoryStream(PathUtils.get(getClass().getResource(zipDir).toURI())); + + for (Path doc : stream) { + String filename = doc.getFileName().toString(); + TestUtil.unzip(getClass().getResourceAsStream(zipDir + filename), tmp); } + + stream.close(); + return tmp; + } + + private String tryParse(Path doc) throws Exception { + byte bytes[] = Files.readAllBytes(doc); + return TikaImpl.parse(bytes, new Metadata(), -1); } } diff --git a/plugins/ingest-attachment/src/test/resources/org/opensearch/ingest/attachment/test/.checksums b/plugins/ingest-attachment/src/test/resources/org/opensearch/ingest/attachment/test/.checksums new file mode 100644 index 0000000000000..227d7d833a231 --- /dev/null +++ b/plugins/ingest-attachment/src/test/resources/org/opensearch/ingest/attachment/test/.checksums @@ -0,0 +1,209 @@ +{ + "testWORD_tabular_symbol.doc": "c708d7ef841f7e1748436b8ef5670d0b2de1a227", + "testWORD_1img.docx": "367e2ade13ca3c19bcd8a323e21d51d407e017ac", + "testMasterFooter.odp": "bcc59df70699c739423a50e362c722b81ae76498", + "testTXTNonASCIIUTF8.txt": "1ef514431ca8d838f11e99f8e4a0637730b77aa0", + "EmbeddedOutlook.docx": "c544a6765c19ba11b0bf3edb55c79e1bd8565c6e", + "testWORD_override_list_numbering.docx": "4e892319b921322916225def763f451e4bbb4e16", + "testTextBoxes.key": "b01581d5bd2483ce649a1a1406136359f4b93167", + "testPPT_masterText.pptx": "9fee8337b76dc3e196f4554dcde22b9dd1c3b3e8", + "testComment.docx": "333b9009686f27265b4729e8172b3e62048ec7ec", + "testRTFInvalidUnicode.rtf": "32b3e3d8e5c5a1b66cb15fc964b9341bea7048f4", + "testEXCEL_headers_footers.xlsx": "9e8d2a700fc431fe29030e86e08162fc8ecf2c1a", + "testWORD6.doc": "1479de589755c7212815445799c44dab69d4587c", + "testPagesHeadersFootersFootnotes.pages": "99d434be7de4902dc70700aa9c2a31624583c1f1", + "testPDF_no_extract_yes_accessibility_owner_empty.pdf": "6eb693dac68fece3bf3cd1aa9880ea9b23fc927c", + "testOpenOffice2.odt": "564b3e1999a53073a04142e01b663757a6e7fb08", + "testTables.key": "250cff75db7fc3c8b95b2cbd3f37308826e0c93d", + "testDOCX_Thumbnail.docx": "fce6a43271bc242e2bb8341afa659ed166e08050", + "testWORD_3imgs.docx": "292ca6fa41d32b462e66061e89adb19423721975", + "testPDF_acroform3.pdf": "dcf6588cb5e41701b168606ea6bfbadecdcd3bc9", + "testWORD_missing_ooxml_bean1.docx": "c3058f2513fecc0a6d76d3ecf55676f236b085ff", + "testPDFTwoTextBoxes.pdf": "4adf324ce030076b1755fdb3a6cce676ee325ae4", + "testRTFUnicodeGothic.rtf": "f9932470ff686b0c217ea94ed5d4f2fd85f7998e", + "headers.mbox": "75ec25789fe870b6d25365e4ea73d731fc274847", + "testPPT_embeded.ppt": "", + "testXML3.xml": "804d4812408eb324ae8483d2140b648ec871dd2a", + "testOptionalHyphen.doc": "10f9ca38cc2985e94967aa2c454bfe40aff76976", + "testComment.doc": "66e57653d5d08478556ca640408b172b65855cc7", + "testEXCEL_headers_footers.xls": "18977c66fc8bcb8c44de3063b69b65a3de9c3f25", + "testWORD_embedded_rtf.doc": "cc2d289acfe3d1068a2649b7fa0c06c50bb6ceda", + "testEXCEL_custom_props.xlsx": "6b72ae08362a204b37dbba0a30b4134ae3e7918f", + "testOptionalHyphen.docx": "5b8ffc0df1691a8fed7d63aa9b256e9e02e36d71", + "testPPT_various.pptx": "d149de9af8071141a6ba6e2cd4ef5f6d9431a826", + "testWORD_closingSmartQInHyperLink.doc": "9859f378c603b70bf0d44a281169ae5b16a21878", + "test_embedded_zip.pptx": "d19406edcec09440d066877c451ceba60abc3483", + "testRTFUmlautSpaces.rtf": "155b39879c5b5fbad22fd650be37ae7f91489eb2", + "protectedFile.xlsx": "ee08eeaf05c35c960243f831c3a974d9ee07aa28", + "Doc1_ole.doc": "fb63220506ab666f1fe87b0608e1447fd4fd3489", + "testEXCEL_embeded.xlsx": "", + "EmbeddedDocument.docx": "", + "testODFwithOOo3.odt": "3815d6fb7f5829db882ea8ebd664f252711e6e60", + "testPagesHeadersFootersRomanUpper.pages": "85b3cd545ba6c33e5d44b844a6afea8cb6eaec0b", + "testPPT_comment.ppt": "88fd667fd0292785395a8d0d229304aa91110556", + "testPPT_2imgs.pptx": "66eda11ad472918153100dad8ee5be0f1f8e2e04", + "testPagesHeadersFootersAlphaUpper.pages": "56bef0d1eaedfd7599aae29031d2eeb0e3fe4688", + "testWORD_text_box.docx": "e01f7b05c6aac3449b9a699c3e4d2e62ff3368a3", + "testWORD_missing_text.docx": "3814332884a090b6d1020bff58d0531486710c45", + "testComment.pdf": "60e181061a00454c2e622bd37a9878234c13231d", + "testPDF_no_extract_no_accessibility_owner_empty.pdf": "6eb693dac68fece3bf3cd1aa9880ea9b23fc927c", + "test_embedded_package.rtf": "cd90adb3f777e68aa0288fd23e8f4fbce260a763", + "testPDF_bom.pdf": "6eb693dac68fece3bf3cd1aa9880ea9b23fc927c", + "testOptionalHyphen.ppt": "7e016e42860bd408054bb8653fef39b2756119d9", + "testHTML_utf8.html": "3ba828044754772e4c9df5f9a2213beaa75842ef", + "testPPT_comment.pptx": "25fab588194dabd5902fd2ef880ee9542d036776", + "testRTFWithCurlyBraces.rtf": "019cab63b73ff89d094823cf50c0a721bec08ee2", + "testFooter.ods": "846e1d0415b23fa27631b536b0cf566abbf8fcc1", + "testPPT.ppt": "933ee556884b1d9e28b801daa0d77bbaa4f4be62", + "testEXCEL-formats.xls": "", + "testPPT_masterFooter.pptx": "29bb97006b3608b7db6ff72b94d20157878d94dd", + "testWORD_header_hyperlink.doc": "914bbec0730c54948ad307ea3e375ef0c100abf1", + "testRTFHyperlink.rtf": "2b2ffb1997aa495fbab1af490d134051de168c97", + "testExtraSpaces.pdf": "b5575400309b01c1050a927d8d1ecf8761062abc", + "testRTFWindowsCodepage1250.rtf": "7ba418843f401634f97d21c844c2c4093b7194fb", + "testRTFTableCellSeparation2.rtf": "62782ca40ff0ed6c3ba90f8055ee724b44af203f", + "testPagesHeadersFootersRomanLower.pages": "2410fc803907001eb39c201ad4184b243e271c6d", + "headerPic.docx": "c704bb648feac7975dff1024a5f762325be7cbc2", + "testHTMLNoisyMetaEncoding_4.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a", + "testRTFBoldItalic.rtf": "0475d224078682cf3f9f3f4cbc14a63456c5a0d8", + "test-outlook.msg": "1f202fc11a873e305d5b4d4607409f3f734065ec", + "testRTFVarious.rtf": "bf6ea9cf57886e680c5e6743a66a12b950a09083", + "testXHTML.html": "c6da900f81c1c550518e65d579d3dd62dd7c5c0c", + "EmbeddedPDF.docx": "454476bdf4a968189a6f53e75c146382bf58a434", + "testXML.xml": "e1615e9b31be58f7af9ad963e5a112efa5cdaffa", + "testWORD_no_format.docx": "9a3f5d8a4c8c0f077cc615bcfc554dc87d5926aa", + "testPPT_masterText.ppt": "f5ff5e2d45ccb180cf371ed99b7dfeb2a93539b3", + "testPDF_PDFEncodedStringInXMP.pdf": "78fd59d394f72d28a9908739fa562099978dafa1", + "testPPT_custom_props.pptx": "72152d28afbc23a50cc71fa37d1dce9ef03ca72d", + "testRTFListOverride.rtf": "f8c61d8a66afdaa07f3740e859497818bfc2ca01", + "testEXCEL_1img.xls": "", + "testWORD_1img.doc": "0826d299a7770e93603f5667d89dccb7b74d904c", + "testNPEOpenDocument.odt": "4210b973c80084c58463ec637fa43e911f77d6fe", + "testRTFWord2010CzechCharacters.rtf": "9443011aac32434240ab8dbff360c970fc1c7074", + "testPDF_Version.8.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2", + "testPPT.ppsx": "71333ef84f7825d8ad6aba2ba993d04b4bab41c6", + "testPPT_autodate.pptx": "50467dbb37d1c74b8b37fe93eddf6f9e87d21bf3", + "testWordArt.pptx": "3566bbee790704b3654fe78319957f9e0cddb6d9", + "NullHeader.docx": "18430c968ba29173b52610efdaa723424b3c4d79", + "testRTFWordPadCzechCharacters.rtf": "5dbb58452a3507c384008662f8fce90063f12189", + "resume.html": "fbfb9d8264f6eebd79847fe7a7f1b81edd4a027d", + "testPagesLayout.pages": "5db1ab91c93e6183d0af8513f62c7b87964704af", + "testOptionalHyphen.pptx": "c2977eefe7d2cad8c671f550d7883185ec65591b", + "testWORD_numbered_list.docx": "07194c58165993468e66bc4eba4f5bd89d5bee09", + "testEXCEL_1img.xlsx": "", + "testPDFTripleLangTitle.pdf": "6eb693dac68fece3bf3cd1aa9880ea9b23fc927c", + "protect.xlsx": "ee08eeaf05c35c960243f831c3a974d9ee07aa28", + "testWORD_bold_character_runs2.docx": "f10e562d8825ec2e17e0d9f58646f8084a658cfa", + "testXLSX_Thumbnail.xlsx": "020bf155ae157661c11727c54e6694cf9cd2c0d3", + "testWORD_embedded_pdf.docx": "d8adb797aaaac92afd8dd9b499bd197347f15688", + "testOptionalHyphen.rtf": "2f77b61bab5b4502b4ddd5018b454be157091d07", + "testEXCEL-charts.xls": "", + "testWORD_override_list_numbering.doc": "60e47a3e71ba08af20af96131d61740a1f0bafa3", + "testPDF_twoAuthors.pdf": "c5f0296cc21f9ae99ceb649b561c55f99d7d9452", + "testPDF_Version.10.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2", + "testHTMLNoisyMetaEncoding_2.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a", + "testFooter.odt": "cd5d0fcbcf48d6f005d087c47d00e84f39bcc321", + "testPPT.pptm": "71333ef84f7825d8ad6aba2ba993d04b4bab41c6", + "testPPT_various.ppt": "399e27a9893284f106dc44f15b5e636454db681e", + "testRTFListMicrosoftWord.rtf": "0303eb3e2f30530621a7a407847b759a3b21467e", + "testWORD_bold_character_runs2.doc": "f10e562d8825ec2e17e0d9f58646f8084a658cfa", + "boilerplate-whitespace.html": "a9372bc75d7d84cbcbb0bce68fcaed73ad8ef52c", + "testEXCEL_95.xls": "20d9b9b0f3aecd28607516b4b837c8bab3524b6c", + "testPPT_embedded_two_slides.pptx": "", + "testPDF_bookmarks.pdf": "5fc486c443511452db4f1aa6530714c6aa49c831", + "test_recursive_embedded.docx": "afc32b07ce07ad273e5b3d1a43390a9d2b6dd0a9", + "testEXCEL-formats.xlsx": "", + "testPPT_masterText2.pptx": "2b01eab5d0349e3cfe791b28c70c2dbf4efc884d", + "test.doc": "774be3106edbb6d80be36dbb548d62401dcfa0fe", + "test_recursive_embedded_npe.docx": "afc32b07ce07ad273e5b3d1a43390a9d2b6dd0a9", + "testPPT_embedded2.ppt": "80e106b3fc68107e7f9579cff04e3b15bdfc557a", + "testWORD_custom_props.docx": "e7a737a5237a6aa9c6b3fc677eb8fa65c30d6dfe", + "testPDF_Version.4.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2", + "testBinControlWord.rtf": "ef858fbb7584ea7f92ffed8d0a08c1cc35ffee07", + "testWORD_null_style.docx": "0be9dcfb83423c78a06af514ec21e4e7770ec48e", + "test-outlook2003.msg": "bb3c35eb7e95d657d7977c1d3d52862734f9f329", + "testPDFVarious.pdf": "c66bbbacb10dd27430f7d0bed9518e75793cedae", + "testHTMLNoisyMetaEncoding_3.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a", + "testRTFCorruptListOverride.rtf": "116a782d02a7f25010a15cbbb189bf98e6b89855", + "testEXCEL_custom_props.xls": "b5584d9b13ab1566ce539238dc75e7eb3449ba7f", + "testPDF_Version.7.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2", + "testPDFEmbeddingAndEmbedded.docx": "e7b648adb15cd16cdd84437c2b9524a8eeb213e4", + "testHTMLNoisyMetaEncoding_1.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a", + "testWORD_3imgs.doc": "818aa8c6c44dd78c49100c3c38e95abdf3812981", + "testRTFEmbeddedLink.rtf": "2720ffb5ff3a6bbb2c5c1cb43fb4922362ed788a", + "testKeynote.key": "11387b59fc6339bb73653fcbb26d387521b98ec9", + "testPDF.pdf": "5a377554685367764eaf73d093408ace323fcec7", + "protectedSheets.xlsx": "", + "testWORD.doc": "cdd41377e699287cbbe17fbb1498cfe5814dde23", + "testComment.xlsx": "d4be580bb97c1c90be379281179c7932b37a18c0", + "testPDFPackage.pdf": "75d6fa216b4e2880a65ced55d17ca2b599d2606c", + "testWORD_embeded.doc": "", + "testHTML.html": "6548b16c5ea33e907577615ce60ca4876a3936ef", + "testEXCEL_5.xls": "a174f098333c659d331317641d4d1d9d83055288", + "pictures.ppt": "95bbfdbf2f60f74371285c337d3445d0acd59a9b", + "testPPT_masterText2.ppt": "f5ff5e2d45ccb180cf371ed99b7dfeb2a93539b3", + "testPDF-custommetadata.pdf": "a84b914655db55574e6002b6f37209ecd4c3d462", + "testWORD_embeded.docx": "", + "testStyles.odt": "c25dd05633e3aab7132d2f5608126e2b4b03848f", + "testPDF_multiFormatEmbFiles.pdf": "2103b2c30b44d5bb3aa790ab04a6741a10ea235a", + "testXML2.xml": "a8c85a327716fad93faa4eb0f993057597d6f471", + "testPagesComments.pages": "cbb45131cf45b9c454e754a07af3ae927b1a69cc", + "testEXCEL_4.xls": "8d5e6156222151faaccb079d46ddb5393dd25771", + "testWORD_no_format.doc": "88feaf03fe58ee5cc667916c6a54cbd5d605cc1c", + "testPages.pages": "288e6db2f39604e372a2095257509c78dba22cbb", + "footnotes.docx": "33b01b73a12f9e14efbcc340890b11ee332dca8e", + "testWORD_bold_character_runs.doc": "f10e562d8825ec2e17e0d9f58646f8084a658cfa", + "testWORD_custom_props.doc": "e7a737a5237a6aa9c6b3fc677eb8fa65c30d6dfe", + "testPDF_Version.11.x.PDFA-1b.pdf": "71853c6197a6a7f222db0f1978c7cb232b87c5ee", + "testAnnotations.pdf": "5f599e7916198540e1b52c3e472a525f50fd45f6", + "tika434.html": "7d74122631f52f003a48018cc376026ccd8d984e", + "testPagesHeadersFootersAlphaLower.pages": "fc1d766908134ff4689fa63fa3e91c3e9b08d975", + "testRTFRegularImages.rtf": "756b1db45cb05357ceaf9c8efcf0b76e3913e190", + "testRTFUmlautSpaces2.rtf": "1fcd029357062241d74d789e93477c101ff24e3f", + "testWORD_numbered_list.doc": "e06656dd9b79ac970f3cd065fa8b630a4981556f", + "testPPT_autodate.ppt": "05b93967ea0248ad263b2f24586e125df353fd3d", + "testBulletPoints.key": "92242d67c3dbc1b22aac3f98e47061d09e7719f9", + "testMasterSlideTable.key": "1d61e2fa3c3f3615500c7f72f62971391b9e9a2f", + "testWORD_various.doc": "8cbdf1a4e0d78471eb90403612c4e92866acf0cb", + "testEXCEL_textbox.xlsx": "1e81121e91e58a74d838e414ae0fc0055a4b4100", + "big-preamble.html": "a9d759b46b6c6c1857d0d89c3a75ee2f3ace70c9", + "testWORD.docx": "f72140bef19475e950e56084d1ab1cb926697b19", + "testComment.rtf": "f6351d0f1f20c4ee0fff70adca6abbc6e638610e", + "testRTFUnicodeUCNControlWordCharacterDoubling.rtf": "3e6f2f38682e38ffc96a476ca51bec2291a27fa7", + "testPDF_Version.5.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2", + "testPPTX_Thumbnail.pptx": "6aa019154289317c7b7832fe46556e6d61cd0a9f", + "testRTFTableCellSeparation.rtf": "5647290a3197c1855fad10201dc7be60ea7b0e42", + "testRTFControls.rtf": "aee6afb80e8b09cf49f056020c037f70c2757e49", + "testEXCEL.xls": "", + "testRTFJapanese.rtf": "08976f9a7d6d3a155cad84d7fa23295cb972a17a", + "testPageNumber.pdf": "96b03d2cc6782eba653af28228045964e68422b5", + "testOptionalHyphen.pdf": "12edd450ea76ea4e79f80ebd3442999ec2180dbc", + "testPDFFileEmbInAnnotation.pdf": "97a6e5781bbaa6aea040546d797c4916f9d90c86", + "testFontAfterBufferedText.rtf": "d1c8757b3ed91f2d7795234405c43005868affa3", + "testPPT_masterFooter.ppt": "8c9104385820c2631ddda20814231808fac03d4d", + "testWORD_various.docx": "189df989e80afb09281901aefc458c6630a8530b", + "testComment.ppt": "21842dd9cb8a7d4af0f102543c192861c9789705", + "testPopupAnnotation.pdf": "1717b1d16c0a4b9ff5790cac90fc8e0fba170a35", + "testWORD_bold_character_runs.docx": "f10e562d8825ec2e17e0d9f58646f8084a658cfa", + "testOverlappingText.pdf": "726da7d6c184512ed8d44af2a5085d65523c4572", + "testRTF.rtf": "91e830ceba556741116c9e83b0c69a0d6c5c9304", + "testRTFIgnoredControlWord.rtf": "1eb6a2f2fd32b1bb4227c0c02a35cb6027d9ec8c", + "testComment.xls": "4de962f16452159ce302fc4a412b06a06cf9a0f6", + "testPPT.ppsm": "71333ef84f7825d8ad6aba2ba993d04b4bab41c6", + "boilerplate.html": "b3558f02c3179e4aeeb6057594d87bda79964e7b", + "testEXCEL_embeded.xls": "", + "testEXCEL.xlsx": "", + "testPPT_2imgs.ppt": "9a68072ffcf171389e78cf8bc018c4b568a6202d", + "testComment.pptx": "6ae6052f469b8f901fd4fd8bc70f8e267255a58e", + "testPDF_Version.6.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2", + "testPPT.pptx": "71333ef84f7825d8ad6aba2ba993d04b4bab41c6", + "testPPT_custom_props.ppt": "edf196acc12701accc7be5dfe63e053436db45e6", + "testPPT_embeded.pptx": "", + "testRTFListLibreOffice.rtf": "4c38d9e2f0a8c9a4c2cc8d2a52db9591ab759abe", + "testPDF_Version.9.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2", + "testRTFHexEscapeInsideWord.rtf": "6cffda07e774c55b5465d8134a0bdcb8c30f3386", + "testRTFNewlines.rtf": "2375ca14e2b0d8f7ff6bbda5191544b3ee7c09fb", + "testRTF-ms932.rtf": "5f9db1b83bf8e9c4c6abb065adaeb151307d33f2", + "test_TIKA-1251.doc": "5a9394c34274964055fdd9272b4f7dc314b99ecf", + "test_list_override.rtf": "9fe8b4a36c5222fe7ed2e9b54e2330aec8fa9423" +}