Skip to content

Commit

Permalink
Ingest Attachment: Upgrade Tika to 1.18 (#31252)
Browse files Browse the repository at this point in the history
Fixes an issue where ES hangs when a bad zip file is loaded through Tika.
  • Loading branch information
jdconrad authored and rjernst committed Jun 24, 2018
1 parent 2b7cb63 commit 67f6a57
Show file tree
Hide file tree
Showing 19 changed files with 27 additions and 14 deletions.
18 changes: 11 additions & 7 deletions plugins/ingest-attachment/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ esplugin {
}

versions << [
'tika': '1.17',
'pdfbox': '2.0.8',
'tika': '1.18',
'pdfbox': '2.0.9',
'bouncycastle': '1.55',
'poi': '3.17',
'mime4j': '0.8.1'
Expand All @@ -33,9 +33,10 @@ versions << [
dependencies {
// mandatory for tika
compile "org.apache.tika:tika-core:${versions.tika}"
// built against Jackson 2.9.5, but still works on our current version
compile "org.apache.tika:tika-parsers:${versions.tika}"
compile 'org.tukaani:xz:1.6'
compile 'commons-io:commons-io:2.5'
compile 'org.tukaani:xz:1.8'
compile 'commons-io:commons-io:2.6'
compile "org.slf4j:slf4j-api:${versions.slf4j}"

// character set detection
Expand All @@ -62,7 +63,7 @@ dependencies {
// MS Office
compile "org.apache.poi:poi-scratchpad:${versions.poi}"
// Apple iWork
compile 'org.apache.commons:commons-compress:1.14'
compile 'org.apache.commons:commons-compress:1.16.1'
// Outlook documents
compile "org.apache.james:apache-mime4j-core:${versions.mime4j}"
compile "org.apache.james:apache-mime4j-dom:${versions.mime4j}"
Expand Down Expand Up @@ -118,6 +119,10 @@ thirdPartyAudit.excludes = [
'com.drew.metadata.jpeg.JpegDirectory',
'com.github.junrar.Archive',
'com.github.junrar.rarfile.FileHeader',
'com.github.luben.zstd.ZstdInputStream',
'com.github.luben.zstd.ZstdOutputStream',
'com.github.openjson.JSONArray',
'com.github.openjson.JSONObject',
'com.google.common.reflect.TypeToken',
'com.google.gson.Gson',
'com.googlecode.mp4parser.DataSource',
Expand Down Expand Up @@ -531,6 +536,7 @@ thirdPartyAudit.excludes = [
'org.apache.commons.exec.PumpStreamHandler',
'org.apache.commons.exec.environment.EnvironmentUtils',
'org.apache.commons.lang.StringUtils',
'org.apache.commons.lang.SystemUtils',
'org.apache.ctakes.typesystem.type.refsem.UmlsConcept',
'org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation',
'org.apache.cxf.jaxrs.client.WebClient',
Expand Down Expand Up @@ -635,8 +641,6 @@ thirdPartyAudit.excludes = [
'org.etsi.uri.x01903.v13.impl.UnsignedSignaturePropertiesTypeImpl$1SignatureTimeStampList',
'org.etsi.uri.x01903.v14.ValidationDataType$Factory',
'org.etsi.uri.x01903.v14.ValidationDataType',
'org.json.JSONArray',
'org.json.JSONObject',
'org.json.simple.JSONArray',
'org.json.simple.JSONObject',
'org.json.simple.parser.JSONParser',
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
7b5cdabadb4cf12f5ee0f801399e70635583193f
1 change: 0 additions & 1 deletion plugins/ingest-attachment/licenses/commons-io-2.5.jar.sha1

This file was deleted.

1 change: 1 addition & 0 deletions plugins/ingest-attachment/licenses/commons-io-2.6.jar.sha1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
815893df5f31da2ece4040fe0a12fd44b577afaf
1 change: 0 additions & 1 deletion plugins/ingest-attachment/licenses/fontbox-2.0.8.jar.sha1

This file was deleted.

1 change: 1 addition & 0 deletions plugins/ingest-attachment/licenses/fontbox-2.0.9.jar.sha1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
f961f17ebdbc307e9055e3cf7c0e207f0895ae55
1 change: 0 additions & 1 deletion plugins/ingest-attachment/licenses/pdfbox-2.0.8.jar.sha1

This file was deleted.

1 change: 1 addition & 0 deletions plugins/ingest-attachment/licenses/pdfbox-2.0.9.jar.sha1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
d0425578218624388f2ec84a0b3a11efd55df0f5
1 change: 0 additions & 1 deletion plugins/ingest-attachment/licenses/tika-core-1.17.jar.sha1

This file was deleted.

1 change: 1 addition & 0 deletions plugins/ingest-attachment/licenses/tika-core-1.18.jar.sha1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
69556697de96cf0b22df846e970dafd29866eee0

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
7d9b6dea91d783165f3313d320d3aaaa9a4dfc13
1 change: 0 additions & 1 deletion plugins/ingest-attachment/licenses/xz-1.6.jar.sha1

This file was deleted.

1 change: 1 addition & 0 deletions plugins/ingest-attachment/licenses/xz-1.8.jar.sha1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
c4f7d054303948eb6a4066194253886c8af07128
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ static PermissionCollection getRestrictedPermissions() {
perms.add(new SecurityPermission("putProviderProperty.BC"));
perms.add(new SecurityPermission("insertProvider"));
perms.add(new ReflectPermission("suppressAccessChecks"));
perms.add(new RuntimePermission("accessClassInPackage.sun.java2d.cmm.kcms"));
// xmlbeans, used by POI, needs to get the context classloader
perms.add(new RuntimePermission("getClassLoader"));
// ZipFile needs accessDeclaredMembers on JDK 10; cf. https://bugs.openjdk.java.net/browse/JDK-8187485
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,6 @@ grant {
permission java.lang.RuntimePermission "getClassLoader";
// ZipFile needs accessDeclaredMembers on Java 10
permission java.lang.RuntimePermission "accessDeclaredMembers";
// PDFBox checks for the existence of this class
permission java.lang.RuntimePermission "accessClassInPackage.sun.java2d.cmm.kcms";
};
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,12 @@ public void testAsciidocDocument() throws Exception {
assertThat(attachmentData.get("content_type").toString(), containsString("text/plain"));
}

// Regression test: parsing the malformed archive must end with an exception
// rather than never returning. Background on the underlying defect, which makes
// a zip file hang in Tika versions prior to 1.18, is tracked at
// https://issues.apache.org/jira/browse/COMPRESS-432.
public void testZipFileDoesNotHang() {
    // Fixture file name, resolved by parseDocument.
    final String malformedArchive = "bad_tika.zip";
    expectThrows(Exception.class, () -> parseDocument(malformedArchive, processor));
}

public void testParseAsBytesArray() throws Exception {
String path = "/org/elasticsearch/ingest/attachment/test/sample-files/text-in-english.txt";
byte[] bytes;
Expand Down
Binary file not shown.

0 comments on commit 67f6a57

Please sign in to comment.