> parserOptions;
+ /**
+ * Configuration of the supported file formats for the individual parsers.
+ * Currently this is used only for the ooxml parser, because the high number of
+ * formats it supports leads to an OOM error during the native build.
+ *
+ * Example:
+ *
+ *
+ * quarkus.tika.parsers = ooxml
+ * quarkus.tika.parser-file-format-support.ooxml.docx = true
+ * quarkus.tika.parser-file-format-support.ooxml.pptx = false
+ * quarkus.tika.parser-file-format-support.ooxml.xlsx = true
+ */
+ @ConfigItem
+ public Map> parserFileFormatSupport;
+
/**
* Full parser class name for a given parser abbreviation.
* For example:
diff --git a/integration-tests/tika/src/main/java/io/quarkus/it/tika/TikaEmdeddedContentResource.java b/integration-tests/tika/src/main/java/io/quarkus/it/tika/TikaEmdeddedContentResource.java
index 995ecbb18e524..e85313ad0dee5 100644
--- a/integration-tests/tika/src/main/java/io/quarkus/it/tika/TikaEmdeddedContentResource.java
+++ b/integration-tests/tika/src/main/java/io/quarkus/it/tika/TikaEmdeddedContentResource.java
@@ -6,14 +6,17 @@
import javax.ws.rs.POST;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
+import javax.ws.rs.core.HttpHeaders;
import javax.ws.rs.core.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.apache.tika.parser.pdf.PDFParser;
import io.quarkus.tika.TikaContent;
+import io.quarkus.tika.TikaMetadata;
import io.quarkus.tika.TikaParser;
@Path("/embedded")
@@ -21,7 +24,7 @@ public class TikaEmdeddedContentResource {
// Avoiding the injection, otherwise the recorded tika-config.xml intended for TikaPdfInvoiceTest is used
TikaParser parser = new TikaParser(new RecursiveParserWrapper(
- new AutoDetectParser(new OfficeParser(), new PDFParser()), true), false);
+ new AutoDetectParser(new OfficeParser(), new PDFParser(), new OOXMLParser()), true), false);
@POST
@Path("/outerText")
@@ -40,4 +43,12 @@ public String extractInnerText(InputStream stream) {
TikaContent content = parser.parse(stream);
return content.getEmbeddedContent().get(0).getText();
}
+
+ /**
+ * Reads the metadata of the posted document through the shared parser and
+ * returns the single value stored under the Content-Type key.
+ */
+ @POST
+ @Path("/contentType")
+ @Produces(MediaType.TEXT_PLAIN)
+ public String contentType(InputStream stream) {
+ final TikaMetadata documentMetadata = parser.getMetadata(stream);
+ final String detectedType = documentMetadata.getSingleValue(HttpHeaders.CONTENT_TYPE);
+ return detectedType;
+ }
}
diff --git a/integration-tests/tika/src/main/resources/application.properties b/integration-tests/tika/src/main/resources/application.properties
index e93410248ccb4..17b19ac77d34c 100644
--- a/integration-tests/tika/src/main/resources/application.properties
+++ b/integration-tests/tika/src/main/resources/application.properties
@@ -1,2 +1,5 @@
-quarkus.tika.parsers=pdf
+quarkus.tika.parsers=pdf,ooxml
quarkus.tika.parser-options.pdf.sort-by-position=true
+quarkus.tika.parser-file-format-support.ooxml.docx=true
+quarkus.tika.parser-file-format-support.ooxml.pptx=false
+quarkus.tika.parser-file-format-support.ooxml.xlsx=false
diff --git a/integration-tests/tika/src/main/resources/testDOCX_embedded.docx b/integration-tests/tika/src/main/resources/testDOCX_embedded.docx
new file mode 100644
index 0000000000000..cb386bde4aaeb
Binary files /dev/null and b/integration-tests/tika/src/main/resources/testDOCX_embedded.docx differ
diff --git a/integration-tests/tika/src/main/resources/testPPTX_embedded.pptx b/integration-tests/tika/src/main/resources/testPPTX_embedded.pptx
new file mode 100644
index 0000000000000..f116cca833b2a
Binary files /dev/null and b/integration-tests/tika/src/main/resources/testPPTX_embedded.pptx differ
diff --git a/integration-tests/tika/src/main/resources/testXLSX_embedded.xlsx b/integration-tests/tika/src/main/resources/testXLSX_embedded.xlsx
new file mode 100644
index 0000000000000..32f3b24dde28e
Binary files /dev/null and b/integration-tests/tika/src/main/resources/testXLSX_embedded.xlsx differ
diff --git a/integration-tests/tika/src/test/java/io/quarkus/it/tika/TikaEmbeddedContentTest.java b/integration-tests/tika/src/test/java/io/quarkus/it/tika/TikaEmbeddedContentTest.java
index 96df69f9577ab..29e454617d2fe 100644
--- a/integration-tests/tika/src/test/java/io/quarkus/it/tika/TikaEmbeddedContentTest.java
+++ b/integration-tests/tika/src/test/java/io/quarkus/it/tika/TikaEmbeddedContentTest.java
@@ -5,7 +5,11 @@
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
+import java.util.Optional;
+import javax.inject.Inject;
+
+import org.eclipse.microprofile.config.inject.ConfigProperty;
import org.junit.jupiter.api.Test;
import io.quarkus.test.junit.QuarkusTest;
@@ -13,6 +17,13 @@
@QuarkusTest
public class TikaEmbeddedContentTest {
+ // Value of the ooxml pptx support flag; empty when the property is not configured.
+ @Inject
+ @ConfigProperty(name = "quarkus.tika.parser-file-format-support.ooxml.pptx")
+ Optional<Boolean> parsePptx;
+ // Value of the ooxml xlsx support flag; empty when the property is not configured.
+ @Inject
+ @ConfigProperty(name = "quarkus.tika.parser-file-format-support.ooxml.xlsx")
+ Optional<Boolean> parseXlsx;
+
@Test
public void testGetOuterText() throws Exception {
given()
@@ -35,6 +46,44 @@ public void testGetInnerText() throws Exception {
.body(containsString("The quick brown fox jumps over the lazy dog"));
}
+ /**
+ * Checks the content type reported for the embedded PPTX sample.
+ * The check only runs when pptx support is enabled via
+ * quarkus.tika.parser-file-format-support.ooxml.pptx (enabled is assumed when the
+ * property is absent).
+ */
+ @Test
+ public void contentTypePPTXText() throws Exception {
+ // A null field means the flag was not injected into this test instance;
+ // skip in that case rather than fail with a NullPointerException.
+ if (parsePptx == null || !parsePptx.orElse(true)) {
+ return;
+ }
+ contentTypeText("testPPTX_embedded.pptx",
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation");
+ }
+
+ /**
+ * Checks the content type reported for the embedded XLSX sample.
+ * The check only runs when xlsx support is enabled via
+ * quarkus.tika.parser-file-format-support.ooxml.xlsx (enabled is assumed when the
+ * property is absent).
+ */
+ @Test
+ public void contentTypeXLSXText() throws Exception {
+ // A null field means the flag was not injected into this test instance;
+ // skip in that case rather than fail with a NullPointerException.
+ if (parseXlsx == null || !parseXlsx.orElse(true)) {
+ return;
+ }
+ contentTypeText("testXLSX_embedded.xlsx",
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+ }
+
+ /** Expects the DOCX sample to be reported with the wordprocessingml document content type. */
+ @Test
+ public void contentTypeDOCXText() throws Exception {
+ final String docxSample = "testDOCX_embedded.docx";
+ final String docxContentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
+ contentTypeText(docxSample, docxContentType);
+ }
+
+ /** Expects the legacy XLS sample to be reported with the ms-excel content type. */
+ @Test
+ public void contentTypeEXCELText() throws Exception {
+ final String xlsSample = "testEXCEL_embeded.xls";
+ final String xlsContentType = "application/vnd.ms-excel";
+ contentTypeText(xlsSample, xlsContentType);
+ }
+
+ /**
+ * Posts the named test resource to /embedded/contentType and expects a 200
+ * response whose body contains the given text.
+ *
+ * @param fileName name of the test resource sent as the request body
+ * @param expected text that must be contained in the response body
+ * @throws Exception propagated from reading the resource or issuing the request
+ */
+ private void contentTypeText(String fileName, String expected) throws Exception {
+ final byte[] payload = readTestFile(fileName);
+ given()
+ .when()
+ .body(payload)
+ .post("/embedded/contentType")
+ .then()
+ .statusCode(200)
+ .body(containsString(expected));
+ }
+
private byte[] readTestFile(String fileName) throws Exception {
try (InputStream is = getClass().getClassLoader().getResourceAsStream(fileName)) {
return readBytes(is);