Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport 2.x] Enhance tika document parsing tests #13708

Merged
merged 1 commit into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -32,54 +32,67 @@

package org.opensearch.ingest.attachment;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.lucene.tests.util.LuceneTestCase.SuppressFileSystems;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.tika.metadata.Metadata;
import org.opensearch.common.io.PathUtils;
import org.opensearch.common.xcontent.XContentHelper;
import org.opensearch.common.xcontent.json.JsonXContent;
import org.opensearch.test.OpenSearchTestCase;

import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;

/**
* Evil test-coverage cheat, we parse a bunch of docs from tika
* so that we have a nice grab-bag variety, and assert some content
* comes back and no exception.
* Parse sample tika documents and assert the contents have not changed according to previously recorded checksums.
* Uncaught changes to tika parsing could potentially pose bwc issues.
* Note: In some cases tika will access a user's locale to inform the parsing of a file.
* The checksums of these files are left empty, and we only validate that parsed content is not null.
*/
@SuppressFileSystems("ExtrasFS") // don't try to parse extraN
public class TikaDocTests extends OpenSearchTestCase {

/** some test files from tika test suite, zipped up */
/** some test files from the apache tika unit test suite with accompanying sha1 checksums */
static final String TIKA_FILES = "/org/opensearch/ingest/attachment/test/tika-files/";
static final String TIKA_CHECKSUMS = "/org/opensearch/ingest/attachment/test/.checksums";

public void testFiles() throws Exception {
Path tmp = createTempDir();
logger.debug("unzipping all tika sample files");
try (DirectoryStream<Path> stream = Files.newDirectoryStream(PathUtils.get(getClass().getResource(TIKA_FILES).toURI()))) {
for (Path doc : stream) {
String filename = doc.getFileName().toString();
TestUtil.unzip(getClass().getResourceAsStream(TIKA_FILES + filename), tmp);
}
}
public void testParseSamples() throws Exception {
String checksumJson = Files.readString(PathUtils.get(getClass().getResource(TIKA_CHECKSUMS).toURI()));
Map<String, Object> checksums = XContentHelper.convertToMap(JsonXContent.jsonXContent, checksumJson, false);
DirectoryStream<Path> stream = Files.newDirectoryStream(unzipToTemp(TIKA_FILES));

try (DirectoryStream<Path> stream = Files.newDirectoryStream(tmp)) {
for (Path doc : stream) {
logger.debug("parsing: {}", doc);
assertParseable(doc);
for (Path doc : stream) {
String parsedContent = tryParse(doc);
assertNotNull(parsedContent);
assertFalse(parsedContent.isEmpty());

String check = checksums.get(doc.getFileName().toString()).toString();
if (!check.isEmpty()) {
assertEquals(check, DigestUtils.sha1Hex(parsedContent));
}
}

stream.close();
}

void assertParseable(Path fileName) throws Exception {
try {
byte bytes[] = Files.readAllBytes(fileName);
String parsedContent = TikaImpl.parse(bytes, new Metadata(), -1);
assertNotNull(parsedContent);
assertFalse(parsedContent.isEmpty());
logger.debug("extracted content: {}", parsedContent);
} catch (Exception e) {
throw new RuntimeException("parsing of filename: " + fileName.getFileName() + " failed", e);
/**
 * Lists the classpath directory {@code zipDir} and passes every entry to {@code TestUtil.unzip},
 * extracting into a directory obtained from {@code createTempDir()}.
 *
 * @param zipDir classpath resource path of the directory to list; each file name is appended
 *               to it verbatim, so it must end with a {@code '/'}
 * @return the directory that received the extracted files
 * @throws Exception if the resource directory cannot be resolved or listed, or extraction fails
 */
private Path unzipToTemp(String zipDir) throws Exception {
    Path tmp = createTempDir();

    // try-with-resources: the directory handle must be released even when an unzip fails part-way
    // through the listing; a trailing stream.close() would be skipped on any exception.
    try (DirectoryStream<Path> stream = Files.newDirectoryStream(PathUtils.get(getClass().getResource(zipDir).toURI()))) {
        for (Path doc : stream) {
            String filename = doc.getFileName().toString();
            TestUtil.unzip(getClass().getResourceAsStream(zipDir + filename), tmp);
        }
    }

    return tmp;
}

/**
 * Reads the whole file into memory and hands the bytes to {@code TikaImpl.parse}
 * together with a fresh, empty {@link Metadata} instance.
 *
 * @param doc path of the sample document to parse
 * @return the string returned by {@code TikaImpl.parse} for the file's bytes
 * @throws Exception if the file cannot be read or parsing fails
 */
private String tryParse(Path doc) throws Exception {
    final byte[] raw = Files.readAllBytes(doc);
    final Metadata metadata = new Metadata();
    // NOTE(review): -1 is presumably "no limit on extracted characters" — confirm against TikaImpl.parse
    return TikaImpl.parse(raw, metadata, -1);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
{
"testWORD_tabular_symbol.doc": "c708d7ef841f7e1748436b8ef5670d0b2de1a227",
"testWORD_1img.docx": "367e2ade13ca3c19bcd8a323e21d51d407e017ac",
"testMasterFooter.odp": "bcc59df70699c739423a50e362c722b81ae76498",
"testTXTNonASCIIUTF8.txt": "1ef514431ca8d838f11e99f8e4a0637730b77aa0",
"EmbeddedOutlook.docx": "c544a6765c19ba11b0bf3edb55c79e1bd8565c6e",
"testWORD_override_list_numbering.docx": "4e892319b921322916225def763f451e4bbb4e16",
"testTextBoxes.key": "b01581d5bd2483ce649a1a1406136359f4b93167",
"testPPT_masterText.pptx": "9fee8337b76dc3e196f4554dcde22b9dd1c3b3e8",
"testComment.docx": "333b9009686f27265b4729e8172b3e62048ec7ec",
"testRTFInvalidUnicode.rtf": "32b3e3d8e5c5a1b66cb15fc964b9341bea7048f4",
"testEXCEL_headers_footers.xlsx": "9e8d2a700fc431fe29030e86e08162fc8ecf2c1a",
"testWORD6.doc": "1479de589755c7212815445799c44dab69d4587c",
"testPagesHeadersFootersFootnotes.pages": "99d434be7de4902dc70700aa9c2a31624583c1f1",
"testPDF_no_extract_yes_accessibility_owner_empty.pdf": "6eb693dac68fece3bf3cd1aa9880ea9b23fc927c",
"testOpenOffice2.odt": "564b3e1999a53073a04142e01b663757a6e7fb08",
"testTables.key": "250cff75db7fc3c8b95b2cbd3f37308826e0c93d",
"testDOCX_Thumbnail.docx": "fce6a43271bc242e2bb8341afa659ed166e08050",
"testWORD_3imgs.docx": "292ca6fa41d32b462e66061e89adb19423721975",
"testPDF_acroform3.pdf": "dcf6588cb5e41701b168606ea6bfbadecdcd3bc9",
"testWORD_missing_ooxml_bean1.docx": "c3058f2513fecc0a6d76d3ecf55676f236b085ff",
"testPDFTwoTextBoxes.pdf": "4adf324ce030076b1755fdb3a6cce676ee325ae4",
"testRTFUnicodeGothic.rtf": "f9932470ff686b0c217ea94ed5d4f2fd85f7998e",
"headers.mbox": "75ec25789fe870b6d25365e4ea73d731fc274847",
"testPPT_embeded.ppt": "",
"testXML3.xml": "804d4812408eb324ae8483d2140b648ec871dd2a",
"testOptionalHyphen.doc": "10f9ca38cc2985e94967aa2c454bfe40aff76976",
"testComment.doc": "66e57653d5d08478556ca640408b172b65855cc7",
"testEXCEL_headers_footers.xls": "18977c66fc8bcb8c44de3063b69b65a3de9c3f25",
"testWORD_embedded_rtf.doc": "cc2d289acfe3d1068a2649b7fa0c06c50bb6ceda",
"testEXCEL_custom_props.xlsx": "6b72ae08362a204b37dbba0a30b4134ae3e7918f",
"testOptionalHyphen.docx": "5b8ffc0df1691a8fed7d63aa9b256e9e02e36d71",
"testPPT_various.pptx": "d149de9af8071141a6ba6e2cd4ef5f6d9431a826",
"testWORD_closingSmartQInHyperLink.doc": "9859f378c603b70bf0d44a281169ae5b16a21878",
"test_embedded_zip.pptx": "d19406edcec09440d066877c451ceba60abc3483",
"testRTFUmlautSpaces.rtf": "155b39879c5b5fbad22fd650be37ae7f91489eb2",
"protectedFile.xlsx": "ee08eeaf05c35c960243f831c3a974d9ee07aa28",
"Doc1_ole.doc": "fb63220506ab666f1fe87b0608e1447fd4fd3489",
"testEXCEL_embeded.xlsx": "",
"EmbeddedDocument.docx": "",
"testODFwithOOo3.odt": "3815d6fb7f5829db882ea8ebd664f252711e6e60",
"testPagesHeadersFootersRomanUpper.pages": "85b3cd545ba6c33e5d44b844a6afea8cb6eaec0b",
"testPPT_comment.ppt": "88fd667fd0292785395a8d0d229304aa91110556",
"testPPT_2imgs.pptx": "66eda11ad472918153100dad8ee5be0f1f8e2e04",
"testPagesHeadersFootersAlphaUpper.pages": "56bef0d1eaedfd7599aae29031d2eeb0e3fe4688",
"testWORD_text_box.docx": "e01f7b05c6aac3449b9a699c3e4d2e62ff3368a3",
"testWORD_missing_text.docx": "3814332884a090b6d1020bff58d0531486710c45",
"testComment.pdf": "60e181061a00454c2e622bd37a9878234c13231d",
"testPDF_no_extract_no_accessibility_owner_empty.pdf": "6eb693dac68fece3bf3cd1aa9880ea9b23fc927c",
"test_embedded_package.rtf": "cd90adb3f777e68aa0288fd23e8f4fbce260a763",
"testPDF_bom.pdf": "6eb693dac68fece3bf3cd1aa9880ea9b23fc927c",
"testOptionalHyphen.ppt": "7e016e42860bd408054bb8653fef39b2756119d9",
"testHTML_utf8.html": "3ba828044754772e4c9df5f9a2213beaa75842ef",
"testPPT_comment.pptx": "25fab588194dabd5902fd2ef880ee9542d036776",
"testRTFWithCurlyBraces.rtf": "019cab63b73ff89d094823cf50c0a721bec08ee2",
"testFooter.ods": "846e1d0415b23fa27631b536b0cf566abbf8fcc1",
"testPPT.ppt": "933ee556884b1d9e28b801daa0d77bbaa4f4be62",
"testEXCEL-formats.xls": "",
"testPPT_masterFooter.pptx": "29bb97006b3608b7db6ff72b94d20157878d94dd",
"testWORD_header_hyperlink.doc": "914bbec0730c54948ad307ea3e375ef0c100abf1",
"testRTFHyperlink.rtf": "2b2ffb1997aa495fbab1af490d134051de168c97",
"testExtraSpaces.pdf": "b5575400309b01c1050a927d8d1ecf8761062abc",
"testRTFWindowsCodepage1250.rtf": "7ba418843f401634f97d21c844c2c4093b7194fb",
"testRTFTableCellSeparation2.rtf": "62782ca40ff0ed6c3ba90f8055ee724b44af203f",
"testPagesHeadersFootersRomanLower.pages": "2410fc803907001eb39c201ad4184b243e271c6d",
"headerPic.docx": "c704bb648feac7975dff1024a5f762325be7cbc2",
"testHTMLNoisyMetaEncoding_4.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a",
"testRTFBoldItalic.rtf": "0475d224078682cf3f9f3f4cbc14a63456c5a0d8",
"test-outlook.msg": "1f202fc11a873e305d5b4d4607409f3f734065ec",
"testRTFVarious.rtf": "bf6ea9cf57886e680c5e6743a66a12b950a09083",
"testXHTML.html": "c6da900f81c1c550518e65d579d3dd62dd7c5c0c",
"EmbeddedPDF.docx": "454476bdf4a968189a6f53e75c146382bf58a434",
"testXML.xml": "e1615e9b31be58f7af9ad963e5a112efa5cdaffa",
"testWORD_no_format.docx": "9a3f5d8a4c8c0f077cc615bcfc554dc87d5926aa",
"testPPT_masterText.ppt": "f5ff5e2d45ccb180cf371ed99b7dfeb2a93539b3",
"testPDF_PDFEncodedStringInXMP.pdf": "78fd59d394f72d28a9908739fa562099978dafa1",
"testPPT_custom_props.pptx": "72152d28afbc23a50cc71fa37d1dce9ef03ca72d",
"testRTFListOverride.rtf": "f8c61d8a66afdaa07f3740e859497818bfc2ca01",
"testEXCEL_1img.xls": "",
"testWORD_1img.doc": "0826d299a7770e93603f5667d89dccb7b74d904c",
"testNPEOpenDocument.odt": "4210b973c80084c58463ec637fa43e911f77d6fe",
"testRTFWord2010CzechCharacters.rtf": "9443011aac32434240ab8dbff360c970fc1c7074",
"testPDF_Version.8.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2",
"testPPT.ppsx": "71333ef84f7825d8ad6aba2ba993d04b4bab41c6",
"testPPT_autodate.pptx": "50467dbb37d1c74b8b37fe93eddf6f9e87d21bf3",
"testWordArt.pptx": "3566bbee790704b3654fe78319957f9e0cddb6d9",
"NullHeader.docx": "18430c968ba29173b52610efdaa723424b3c4d79",
"testRTFWordPadCzechCharacters.rtf": "5dbb58452a3507c384008662f8fce90063f12189",
"resume.html": "fbfb9d8264f6eebd79847fe7a7f1b81edd4a027d",
"testPagesLayout.pages": "5db1ab91c93e6183d0af8513f62c7b87964704af",
"testOptionalHyphen.pptx": "c2977eefe7d2cad8c671f550d7883185ec65591b",
"testWORD_numbered_list.docx": "07194c58165993468e66bc4eba4f5bd89d5bee09",
"testEXCEL_1img.xlsx": "",
"testPDFTripleLangTitle.pdf": "6eb693dac68fece3bf3cd1aa9880ea9b23fc927c",
"protect.xlsx": "ee08eeaf05c35c960243f831c3a974d9ee07aa28",
"testWORD_bold_character_runs2.docx": "f10e562d8825ec2e17e0d9f58646f8084a658cfa",
"testXLSX_Thumbnail.xlsx": "020bf155ae157661c11727c54e6694cf9cd2c0d3",
"testWORD_embedded_pdf.docx": "d8adb797aaaac92afd8dd9b499bd197347f15688",
"testOptionalHyphen.rtf": "2f77b61bab5b4502b4ddd5018b454be157091d07",
"testEXCEL-charts.xls": "",
"testWORD_override_list_numbering.doc": "60e47a3e71ba08af20af96131d61740a1f0bafa3",
"testPDF_twoAuthors.pdf": "c5f0296cc21f9ae99ceb649b561c55f99d7d9452",
"testPDF_Version.10.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2",
"testHTMLNoisyMetaEncoding_2.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a",
"testFooter.odt": "cd5d0fcbcf48d6f005d087c47d00e84f39bcc321",
"testPPT.pptm": "71333ef84f7825d8ad6aba2ba993d04b4bab41c6",
"testPPT_various.ppt": "399e27a9893284f106dc44f15b5e636454db681e",
"testRTFListMicrosoftWord.rtf": "0303eb3e2f30530621a7a407847b759a3b21467e",
"testWORD_bold_character_runs2.doc": "f10e562d8825ec2e17e0d9f58646f8084a658cfa",
"boilerplate-whitespace.html": "a9372bc75d7d84cbcbb0bce68fcaed73ad8ef52c",
"testEXCEL_95.xls": "20d9b9b0f3aecd28607516b4b837c8bab3524b6c",
"testPPT_embedded_two_slides.pptx": "",
"testPDF_bookmarks.pdf": "5fc486c443511452db4f1aa6530714c6aa49c831",
"test_recursive_embedded.docx": "afc32b07ce07ad273e5b3d1a43390a9d2b6dd0a9",
"testEXCEL-formats.xlsx": "",
"testPPT_masterText2.pptx": "2b01eab5d0349e3cfe791b28c70c2dbf4efc884d",
"test.doc": "774be3106edbb6d80be36dbb548d62401dcfa0fe",
"test_recursive_embedded_npe.docx": "afc32b07ce07ad273e5b3d1a43390a9d2b6dd0a9",
"testPPT_embedded2.ppt": "80e106b3fc68107e7f9579cff04e3b15bdfc557a",
"testWORD_custom_props.docx": "e7a737a5237a6aa9c6b3fc677eb8fa65c30d6dfe",
"testPDF_Version.4.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2",
"testBinControlWord.rtf": "ef858fbb7584ea7f92ffed8d0a08c1cc35ffee07",
"testWORD_null_style.docx": "0be9dcfb83423c78a06af514ec21e4e7770ec48e",
"test-outlook2003.msg": "bb3c35eb7e95d657d7977c1d3d52862734f9f329",
"testPDFVarious.pdf": "c66bbbacb10dd27430f7d0bed9518e75793cedae",
"testHTMLNoisyMetaEncoding_3.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a",
"testRTFCorruptListOverride.rtf": "116a782d02a7f25010a15cbbb189bf98e6b89855",
"testEXCEL_custom_props.xls": "b5584d9b13ab1566ce539238dc75e7eb3449ba7f",
"testPDF_Version.7.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2",
"testPDFEmbeddingAndEmbedded.docx": "e7b648adb15cd16cdd84437c2b9524a8eeb213e4",
"testHTMLNoisyMetaEncoding_1.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a",
"testWORD_3imgs.doc": "818aa8c6c44dd78c49100c3c38e95abdf3812981",
"testRTFEmbeddedLink.rtf": "2720ffb5ff3a6bbb2c5c1cb43fb4922362ed788a",
"testKeynote.key": "11387b59fc6339bb73653fcbb26d387521b98ec9",
"testPDF.pdf": "5a377554685367764eaf73d093408ace323fcec7",
"protectedSheets.xlsx": "",
"testWORD.doc": "cdd41377e699287cbbe17fbb1498cfe5814dde23",
"testComment.xlsx": "d4be580bb97c1c90be379281179c7932b37a18c0",
"testPDFPackage.pdf": "75d6fa216b4e2880a65ced55d17ca2b599d2606c",
"testWORD_embeded.doc": "",
"testHTML.html": "6548b16c5ea33e907577615ce60ca4876a3936ef",
"testEXCEL_5.xls": "a174f098333c659d331317641d4d1d9d83055288",
"pictures.ppt": "95bbfdbf2f60f74371285c337d3445d0acd59a9b",
"testPPT_masterText2.ppt": "f5ff5e2d45ccb180cf371ed99b7dfeb2a93539b3",
"testPDF-custommetadata.pdf": "a84b914655db55574e6002b6f37209ecd4c3d462",
"testWORD_embeded.docx": "",
"testStyles.odt": "c25dd05633e3aab7132d2f5608126e2b4b03848f",
"testPDF_multiFormatEmbFiles.pdf": "2103b2c30b44d5bb3aa790ab04a6741a10ea235a",
"testXML2.xml": "a8c85a327716fad93faa4eb0f993057597d6f471",
"testPagesComments.pages": "cbb45131cf45b9c454e754a07af3ae927b1a69cc",
"testEXCEL_4.xls": "8d5e6156222151faaccb079d46ddb5393dd25771",
"testWORD_no_format.doc": "88feaf03fe58ee5cc667916c6a54cbd5d605cc1c",
"testPages.pages": "288e6db2f39604e372a2095257509c78dba22cbb",
"footnotes.docx": "33b01b73a12f9e14efbcc340890b11ee332dca8e",
"testWORD_bold_character_runs.doc": "f10e562d8825ec2e17e0d9f58646f8084a658cfa",
"testWORD_custom_props.doc": "e7a737a5237a6aa9c6b3fc677eb8fa65c30d6dfe",
"testPDF_Version.11.x.PDFA-1b.pdf": "71853c6197a6a7f222db0f1978c7cb232b87c5ee",
"testAnnotations.pdf": "5f599e7916198540e1b52c3e472a525f50fd45f6",
"tika434.html": "7d74122631f52f003a48018cc376026ccd8d984e",
"testPagesHeadersFootersAlphaLower.pages": "fc1d766908134ff4689fa63fa3e91c3e9b08d975",
"testRTFRegularImages.rtf": "756b1db45cb05357ceaf9c8efcf0b76e3913e190",
"testRTFUmlautSpaces2.rtf": "1fcd029357062241d74d789e93477c101ff24e3f",
"testWORD_numbered_list.doc": "e06656dd9b79ac970f3cd065fa8b630a4981556f",
"testPPT_autodate.ppt": "05b93967ea0248ad263b2f24586e125df353fd3d",
"testBulletPoints.key": "92242d67c3dbc1b22aac3f98e47061d09e7719f9",
"testMasterSlideTable.key": "1d61e2fa3c3f3615500c7f72f62971391b9e9a2f",
"testWORD_various.doc": "8cbdf1a4e0d78471eb90403612c4e92866acf0cb",
"testEXCEL_textbox.xlsx": "1e81121e91e58a74d838e414ae0fc0055a4b4100",
"big-preamble.html": "a9d759b46b6c6c1857d0d89c3a75ee2f3ace70c9",
"testWORD.docx": "f72140bef19475e950e56084d1ab1cb926697b19",
"testComment.rtf": "f6351d0f1f20c4ee0fff70adca6abbc6e638610e",
"testRTFUnicodeUCNControlWordCharacterDoubling.rtf": "3e6f2f38682e38ffc96a476ca51bec2291a27fa7",
"testPDF_Version.5.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2",
"testPPTX_Thumbnail.pptx": "6aa019154289317c7b7832fe46556e6d61cd0a9f",
"testRTFTableCellSeparation.rtf": "5647290a3197c1855fad10201dc7be60ea7b0e42",
"testRTFControls.rtf": "aee6afb80e8b09cf49f056020c037f70c2757e49",
"testEXCEL.xls": "",
"testRTFJapanese.rtf": "08976f9a7d6d3a155cad84d7fa23295cb972a17a",
"testPageNumber.pdf": "96b03d2cc6782eba653af28228045964e68422b5",
"testOptionalHyphen.pdf": "12edd450ea76ea4e79f80ebd3442999ec2180dbc",
"testPDFFileEmbInAnnotation.pdf": "97a6e5781bbaa6aea040546d797c4916f9d90c86",
"testFontAfterBufferedText.rtf": "d1c8757b3ed91f2d7795234405c43005868affa3",
"testPPT_masterFooter.ppt": "8c9104385820c2631ddda20814231808fac03d4d",
"testWORD_various.docx": "189df989e80afb09281901aefc458c6630a8530b",
"testComment.ppt": "21842dd9cb8a7d4af0f102543c192861c9789705",
"testPopupAnnotation.pdf": "1717b1d16c0a4b9ff5790cac90fc8e0fba170a35",
"testWORD_bold_character_runs.docx": "f10e562d8825ec2e17e0d9f58646f8084a658cfa",
"testOverlappingText.pdf": "726da7d6c184512ed8d44af2a5085d65523c4572",
"testRTF.rtf": "91e830ceba556741116c9e83b0c69a0d6c5c9304",
"testRTFIgnoredControlWord.rtf": "1eb6a2f2fd32b1bb4227c0c02a35cb6027d9ec8c",
"testComment.xls": "4de962f16452159ce302fc4a412b06a06cf9a0f6",
"testPPT.ppsm": "71333ef84f7825d8ad6aba2ba993d04b4bab41c6",
"boilerplate.html": "b3558f02c3179e4aeeb6057594d87bda79964e7b",
"testEXCEL_embeded.xls": "",
"testEXCEL.xlsx": "",
"testPPT_2imgs.ppt": "9a68072ffcf171389e78cf8bc018c4b568a6202d",
"testComment.pptx": "6ae6052f469b8f901fd4fd8bc70f8e267255a58e",
"testPDF_Version.6.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2",
"testPPT.pptx": "71333ef84f7825d8ad6aba2ba993d04b4bab41c6",
"testPPT_custom_props.ppt": "edf196acc12701accc7be5dfe63e053436db45e6",
"testPPT_embeded.pptx": "",
"testRTFListLibreOffice.rtf": "4c38d9e2f0a8c9a4c2cc8d2a52db9591ab759abe",
"testPDF_Version.9.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2",
"testRTFHexEscapeInsideWord.rtf": "6cffda07e774c55b5465d8134a0bdcb8c30f3386",
"testRTFNewlines.rtf": "2375ca14e2b0d8f7ff6bbda5191544b3ee7c09fb",
"testRTF-ms932.rtf": "5f9db1b83bf8e9c4c6abb065adaeb151307d33f2",
"test_TIKA-1251.doc": "5a9394c34274964055fdd9272b4f7dc314b99ecf",
"test_list_override.rtf": "9fe8b4a36c5222fe7ed2e9b54e2330aec8fa9423"
}
Loading