Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/branch_1x' into branch_1x
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison committed Jun 9, 2021
2 parents d7fa2cd + 10c94ff commit e8ec223
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 3 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
Release 1.27 - ???

* Deprecate experimental PDFPreflightParser (TIKA-3437).

* Apply encoding detection to zip entry names via Ryan421 (TIKA-3374).

* Add json output for /tika endpoint in tika-server (TIKA-3352).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,7 @@ public boolean getExtractAnnotationText() {
* If true (the default), text in annotations will be
* extracted.
*/
@Field
public void setExtractAnnotationText(boolean v) {
// Forwards the flag to defaultConfig rather than storing it on this object.
// NOTE(review): @Field presumably lets a config <param name="extractAnnotationText">
// reach this setter; confirm against the config loader.
defaultConfig.setExtractAnnotationText(v);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@

import static org.apache.pdfbox.preflight.PreflightConstants.DICTIONARY_KEY_LINEARIZED;

/**
* @deprecated This will be removed in 2.x. The PDFBox community voted
* to retire the preflight parser in PDFBox 4.x.
*/
@Deprecated
public class PDFPreflightParser extends PDFParser {

private static final PDFPreflightParserConfig DEFAULT = new PDFPreflightParserConfig();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@

import static org.junit.Assert.assertEquals;

import java.io.EOFException;
import java.io.InputStream;
import java.nio.file.Paths;

import org.apache.tika.TikaTest;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
Expand Down Expand Up @@ -104,8 +106,13 @@ public void testMP4ParsingAudio() throws Exception {
public void testInfiniteLoop() throws Exception {
//test that a truncated mp4 doesn't cause an infinite loop
//TIKA-1931 and TIKA-1924
//NOTE(review): the next three statements are repeated verbatim inside the try
//block below. This looks like the pre-change and post-change lines of a diff
//shown together; confirm which version is live.
XMLResult r = getXML("testMP4_truncated.m4a");
assertEquals("audio/mp4", r.metadata.get(Metadata.CONTENT_TYPE));
assertEquals("M4A", r.metadata.get(XMPDM.AUDIO_COMPRESSOR));
try {
XMLResult r = getXML("testMP4_truncated.m4a");
assertEquals("audio/mp4", r.metadata.get(Metadata.CONTENT_TYPE));
assertEquals("M4A", r.metadata.get(XMPDM.AUDIO_COMPRESSOR));
} catch (TikaException e) {
//happens with Java 8
//should be eof
//NOTE(review): the exception is swallowed without being inspected, so any
//TikaException lets this test pass. The note above suggests an EOF cause is
//expected; consider checking the cause before ignoring it.
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ public void testAnnotations() throws Exception {
try(InputStream stream = getResourceAsStream("/test-documents/testAnnotations.pdf")) {
content = getText(stream, pdfParser);
}

content = content.replaceAll("[\\s\u00a0]+", " ");
assertContains("Here is some text", content);
assertEquals(-1, content.indexOf("Here is a comment"));
Expand All @@ -374,6 +375,18 @@ public void testAnnotations() throws Exception {
assertContains("Here is some text", content);
assertEquals(-1, content.indexOf("Here is a comment"));

//test turning off via config
InputStream is = getClass().getResourceAsStream(
"/org/apache/tika/parser/pdf/tika-skip-annotations-config.xml");
assertNotNull(is);
TikaConfig tikaConfig = new TikaConfig(is);
Parser p = new AutoDetectParser(tikaConfig);
try (InputStream stream = getResourceAsStream("/test-documents/testAnnotations.pdf")) {
content = getText(stream, p, context);
}
content = content.replaceAll("[\\s\u00a0]+", " ");
assertContains("Here is some text", content);
assertEquals(-1, content.indexOf("Here is a comment"));

// TIKA-738: make sure no extra </p> tags
String xml = getXML("testAnnotations.pdf").xml;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<properties>
<parsers>
<!-- The default parser set is declared with PDFParser excluded, presumably so
that PDFs are handled only by the explicitly configured PDFParser below. -->
<parser class="org.apache.tika.parser.DefaultParser">
<parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
</parser>
<!-- PDFParser declared on its own with extractAnnotationText set to false. -->
<parser class="org.apache.tika.parser.pdf.PDFParser">
<params>
<param name="extractAnnotationText" type="bool">false</param>
</params>
</parser>
</parsers>
</properties>

0 comments on commit e8ec223

Please sign in to comment.