Merge remote-tracking branch 'origin/branch_1x' into branch_1x

apache · Jun 9, 2021 · e8ec223 · e8ec223
2 parents d7fa2cd + 10c94ff
commit e8ec223
Show file tree

Hide file tree

Showing 6 changed files with 59 additions and 3 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.27 - ???
 
+   * Deprecate experimental PDFPreflightParser (TIKA-3437).
+
    * Apply encoding detection to zip entry names via Ryan421 (TIKA-3374).
 
    * Add json output for /tika endpoint in tika-server (TIKA-3352).

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -432,6 +432,7 @@ public boolean getExtractAnnotationText() {
      * If true (the default), text in annotations will be
      * extracted.
      */
+    @Field
     public void setExtractAnnotationText(boolean v) {
         defaultConfig.setExtractAnnotationText(v);
     }

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFPreflightParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFPreflightParser.java
@@ -45,6 +45,11 @@
 
 import static org.apache.pdfbox.preflight.PreflightConstants.DICTIONARY_KEY_LINEARIZED;
 
+/**
+ * @deprecated This will be removed in 2.x.  The PDFBox community voted
+ * to retire the preflight parser in PDFBox 4.x.
+ */
+@Deprecated
 public class PDFPreflightParser extends PDFParser {
 
     private static final PDFPreflightParserConfig DEFAULT = new PDFPreflightParserConfig();

diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
@@ -18,10 +18,12 @@
 
 import static org.junit.Assert.assertEquals;
 
+import java.io.EOFException;
 import java.io.InputStream;
 import java.nio.file.Paths;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -104,8 +106,13 @@ public void testMP4ParsingAudio() throws Exception {
     public void testInfiniteLoop() throws Exception {
         //test that a truncated mp4 doesn't cause an infinite loop
         //TIKA-1931 and TIKA-1924
-        XMLResult r = getXML("testMP4_truncated.m4a");
-        assertEquals("audio/mp4", r.metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("M4A", r.metadata.get(XMPDM.AUDIO_COMPRESSOR));
+        try {
+            XMLResult r = getXML("testMP4_truncated.m4a");
+            assertEquals("audio/mp4", r.metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("M4A", r.metadata.get(XMPDM.AUDIO_COMPRESSOR));
+        } catch (TikaException e) {
+            //happens with Java 8
+            //should be eof
+        }
     }
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -358,6 +358,7 @@ public void testAnnotations() throws Exception {
         try(InputStream stream = getResourceAsStream("/test-documents/testAnnotations.pdf")) {
             content = getText(stream, pdfParser);
         }
+
         content = content.replaceAll("[\\s\u00a0]+", " ");
         assertContains("Here is some text", content);
         assertEquals(-1, content.indexOf("Here is a comment"));
@@ -374,6 +375,18 @@ public void testAnnotations() throws Exception {
         assertContains("Here is some text", content);
         assertEquals(-1, content.indexOf("Here is a comment"));
 
+        //test turning off via config
+        InputStream is = getClass().getResourceAsStream(
+                "/org/apache/tika/parser/pdf/tika-skip-annotations-config.xml");
+        assertNotNull(is);
+        TikaConfig tikaConfig = new TikaConfig(is);
+        Parser p = new AutoDetectParser(tikaConfig);
+        try (InputStream stream = getResourceAsStream("/test-documents/testAnnotations.pdf")) {
+            content = getText(stream, p, context);
+        }
+        content = content.replaceAll("[\\s\u00a0]+", " ");
+        assertContains("Here is some text", content);
+        assertEquals(-1, content.indexOf("Here is a comment"));
 
         // TIKA-738: make sure no extra </p> tags
         String xml = getXML("testAnnotations.pdf").xml;

diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-skip-annotations-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-skip-annotations-config.xml
@@ -0,0 +1,28 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+    </parser>
+    <parser class="org.apache.tika.parser.pdf.PDFParser">
+      <params>
+        <param name="extractAnnotationText" type="bool">false</param>
+      </params>
+    </parser>
+  </parsers>
+</properties>