From f51ae0aef10a052308d6830fcefad77347bc5ebd Mon Sep 17 00:00:00 2001
From: tallison
Date: Tue, 14 Apr 2020 11:09:52 -0400
Subject: [PATCH] TIKA-3091 prevent npe in PDFParserConfig by initializing three parameters with default values.

# Conflicts:
#	tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
---
 .../org/apache/tika/parser/pdf/PDFParser.java | 11 ++++++++
 .../tika/parser/pdf/PDFParserConfig.java      | 28 +++++++++++++++++--
 .../apache/tika/parser/pdf/PDFParserTest.java | 10 +++++++
 3 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 2e637e07d0..6d8b5b1211 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -523,6 +523,17 @@ void setExtractInlineImages(boolean extractInlineImages) {
         defaultConfig.setExtractInlineImages(extractInlineImages);
     }
 
+    @Field
+    void setAverageCharTolerance(float averageCharTolerance) {
+        defaultConfig.setAverageCharTolerance(averageCharTolerance);
+    }
+
+    @Field
+    void setSpacingTolerance(float spacingTolerance) {
+        defaultConfig.setSpacingTolerance(spacingTolerance);
+    }
+
+
     @Field
     void setCatchIntermediateExceptions(boolean catchIntermediateExceptions) {
         defaultConfig.setCatchIntermediateIOExceptions(catchIntermediateExceptions);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index b5d6824afc..da8b3097dc 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -114,10 +114,16 @@ private static OCR_STRATEGY parse(String s) {
     private boolean extractMarkedContent = false;
 
     //The character width-based tolerance value used to estimate where spaces in text should be added
-    private Float averageCharTolerance;
+    //Default taken from PDFBox.
+    private Float averageCharTolerance = 0.5f;
 
     //The space width-based tolerance value used to estimate where spaces in text should be added
-    private Float spacingTolerance;
+    //Default taken from PDFBox.
+    private Float spacingTolerance = 0.3f;
+
+    //The multiplication factor for line height to decide when a new paragraph starts.
+    //Default taken from PDFBox.
+    private Float dropThreshold = 2.5f;
 
     //If the PDF has an XFA element, process only that and skip extracting
     //content from elsewhere in the document.
@@ -238,6 +244,10 @@ private void init(InputStream is) {
 
         setSetKCMS(getBooleanProp(props.getProperty("setKCMS"), false));
 
+        setAverageCharTolerance(getFloatProp(props.getProperty("averageCharTolerance"), averageCharTolerance));
+        setSpacingTolerance(getFloatProp(props.getProperty("spacingTolerance"), spacingTolerance));
+        setDropThreshold(getFloatProp(props.getProperty("dropThreshold"), dropThreshold));
+
         boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
         boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);
 
@@ -287,6 +297,9 @@ public void configure(PDF2XHTML pdf2XHTML) {
         if (getSpacingTolerance() != null) {
             pdf2XHTML.setSpacingTolerance(getSpacingTolerance());
         }
+        if (getDropThreshold() != null) {
+            pdf2XHTML.setDropThreshold(getDropThreshold());
+        }
         pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
     }
 
@@ -513,6 +526,14 @@ public void setSpacingTolerance(Float spacingTolerance) {
         this.spacingTolerance = spacingTolerance;
     }
 
+    public Float getDropThreshold() {
+        return dropThreshold;
+    }
+
+    public void setDropThreshold(float dropThreshold) {
+        this.dropThreshold = dropThreshold;
+    }
+
     public AccessChecker getAccessChecker() {
         return accessChecker;
     }
@@ -824,6 +845,7 @@ public boolean equals(Object o) {
         if (getCatchIntermediateIOExceptions() != config.getCatchIntermediateIOExceptions()) return false;
         if (!getAverageCharTolerance().equals(config.getAverageCharTolerance())) return false;
         if (!getSpacingTolerance().equals(config.getSpacingTolerance())) return false;
+        if (!getDropThreshold().equals(config.getDropThreshold())) return false;
         if (!getOcrStrategy().equals(config.getOcrStrategy())) return false;
         if (getOcrImageType() != config.getOcrImageType()) return false;
         if (!getOcrImageFormatName().equals(config.getOcrImageFormatName())) return false;
@@ -844,6 +866,7 @@ public int hashCode() {
         result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0);
         result = 31 * result + getAverageCharTolerance().hashCode();
         result = 31 * result + getSpacingTolerance().hashCode();
+        result = 31 * result + getDropThreshold().hashCode();
         result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0);
         result = 31 * result + ocrStrategy.hashCode();
         result = 31 * result + getOcrDPI();
@@ -869,6 +892,7 @@ public String toString() {
                 ", extractUniqueInlineImagesOnly=" + extractUniqueInlineImagesOnly +
                 ", averageCharTolerance=" + averageCharTolerance +
                 ", spacingTolerance=" + spacingTolerance +
+                ", dropThreshold=" + dropThreshold +
                 ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA +
                 ", ocrStrategy=" + ocrStrategy +
                 ", ocrDPI=" + ocrDPI +
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index edcd51320c..4e2e3c5d57 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1528,6 +1528,16 @@ public void testUnmappedUnicodeStats() throws Exception {
 
     }
 
+    @Test
+    public void testNPEInPDFParserConfig() {
+        //TIKA-3091
+        PDFParserConfig config = new PDFParserConfig();
+        //don't care about values; want to make sure no NPE is thrown
+        config.toString();
+        config.hashCode();
+        config.equals(new PDFParserConfig());
+    }
+
     @Test //TIKA-3041
     @Ignore("turn back on if we add file from PDFBOX-52")
     public void testPDFBox52() throws Exception {