From 8e0a21fa49852f1d76971c099437d1028faeced0 Mon Sep 17 00:00:00 2001 From: Manyanda Chitimbo Date: Thu, 23 Jan 2020 15:20:32 +0100 Subject: [PATCH] feat(tika): generate tika xml configuration during build time test: TikaProcessorTest should use Quarkus test framework Fixes https://github.com/quarkusio/quarkus/issues/6746 Fixes https://github.com/quarkusio/quarkus/issues/5700 --- .../TikaParsersConfigBuildItem.java | 21 ---- .../tika/deployment/TikaProcessor.java | 115 ++++++++++++------ .../tika/deployment/TikaProcessorTest.java | 46 ++----- .../tika/runtime/TikaParserParameter.java | 41 ------- .../io/quarkus/tika/runtime/TikaRecorder.java | 48 ++------ 5 files changed, 100 insertions(+), 171 deletions(-) delete mode 100644 extensions/tika/deployment/src/main/java/io/quarkus/tika/deployment/TikaParsersConfigBuildItem.java delete mode 100644 extensions/tika/runtime/src/main/java/io/quarkus/tika/runtime/TikaParserParameter.java diff --git a/extensions/tika/deployment/src/main/java/io/quarkus/tika/deployment/TikaParsersConfigBuildItem.java b/extensions/tika/deployment/src/main/java/io/quarkus/tika/deployment/TikaParsersConfigBuildItem.java deleted file mode 100644 index 69952bb6c7463..0000000000000 --- a/extensions/tika/deployment/src/main/java/io/quarkus/tika/deployment/TikaParsersConfigBuildItem.java +++ /dev/null @@ -1,21 +0,0 @@ -package io.quarkus.tika.deployment; - -import java.util.List; -import java.util.Map; - -import io.quarkus.builder.item.SimpleBuildItem; -import io.quarkus.tika.runtime.TikaParserParameter; - -public final class TikaParsersConfigBuildItem extends SimpleBuildItem { - - private final Map> parsersConfig; - - public TikaParsersConfigBuildItem(Map> parsersConfig) { - this.parsersConfig = parsersConfig; - } - - public Map> getConfiguration() { - return parsersConfig; - } - -} diff --git a/extensions/tika/deployment/src/main/java/io/quarkus/tika/deployment/TikaProcessor.java b/extensions/tika/deployment/src/main/java/io/quarkus/tika/deployment/TikaProcessor.java index 154f4288f37f7..95d3627b19501 100644 --- a/extensions/tika/deployment/src/main/java/io/quarkus/tika/deployment/TikaProcessor.java +++ b/extensions/tika/deployment/src/main/java/io/quarkus/tika/deployment/TikaProcessor.java @@ -6,6 +6,7 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Optional; import java.util.Set; import java.util.function.Function; @@ -32,7 +33,6 @@ import io.quarkus.deployment.util.ServiceUtil; import io.quarkus.tika.TikaParseException; import io.quarkus.tika.runtime.TikaConfiguration; -import io.quarkus.tika.runtime.TikaParserParameter; import io.quarkus.tika.runtime.TikaParserProducer; import io.quarkus.tika.runtime.TikaRecorder; @@ -52,21 +52,9 @@ public class TikaProcessor { { "odf", "org.apache.tika.parser.odf.OpenDocumentParser" } }).collect(Collectors.toMap(kv -> kv[0], kv -> kv[1])); - private TikaConfiguration config; - @BuildStep AdditionalBeanBuildItem beans() { - return AdditionalBeanBuildItem.builder().addBeanClasses(TikaParserProducer.class).build(); - } - - @BuildStep - @Record(ExecutionTime.STATIC_INIT) - TikaParsersConfigBuildItem initializeTikaParser(BeanContainerBuildItem beanContainer, TikaRecorder recorder) - throws Exception { - Map> parsersConfig = getSupportedParserConfig(config.tikaConfigPath, config.parsers, - config.parserOptions, config.parser); - recorder.initTikaParser(beanContainer.getValue(), config, parsersConfig); - return new TikaParsersConfigBuildItem(parsersConfig); + return AdditionalBeanBuildItem.unremovableOf(TikaParserProducer.class); } @BuildStep @@ -91,42 +79,48 @@ public void registerRuntimeInitializedClasses(BuildProducer resource) throws Exception { + public void registerTikaCoreResources(BuildProducer resource) { resource.produce(new NativeImageResourceBuildItem("org/apache/tika/mime/tika-mimetypes.xml")); resource.produce(new NativeImageResourceBuildItem("org/apache/tika/parser/external/tika-external-parsers.xml")); } @BuildStep - public void registerTikaParsersResources(BuildProducer resource) throws Exception { + public void registerTikaParsersResources(BuildProducer resource) { resource.produce(new NativeImageResourceBuildItem("org/apache/tika/parser/pdf/PDFParser.properties")); } @BuildStep - public void registerPdfBoxResources(BuildProducer resource) throws Exception { + public void registerPdfBoxResources(BuildProducer resource) { resource.produce(new NativeImageResourceBuildItem("org/apache/pdfbox/resources/glyphlist/additional.txt")); resource.produce(new NativeImageResourceBuildItem("org/apache/pdfbox/resources/glyphlist/glyphlist.txt")); resource.produce(new NativeImageResourceBuildItem("org/apache/pdfbox/resources/glyphlist/zapfdingbats.txt")); } @BuildStep - public void registerTikaProviders(BuildProducer serviceProvider, - TikaParsersConfigBuildItem parserConfigItem) throws Exception { - serviceProvider.produce( - new ServiceProviderBuildItem(Parser.class.getName(), - new ArrayList<>(parserConfigItem.getConfiguration().keySet()))); - serviceProvider.produce( - new ServiceProviderBuildItem(Detector.class.getName(), getProviderNames(Detector.class.getName()))); - serviceProvider.produce( - new ServiceProviderBuildItem(EncodingDetector.class.getName(), - getProviderNames(EncodingDetector.class.getName()))); - } - - static List getProviderNames(String serviceProviderName) throws Exception { + @Record(ExecutionTime.STATIC_INIT) + void initializeTikaParser(BeanContainerBuildItem beanContainer, TikaRecorder recorder, + BuildProducer serviceProvider, TikaConfiguration configuration) + throws Exception { + Map> parsers = getSupportedParserConfig(configuration.tikaConfigPath, + configuration.parsers, + configuration.parserOptions, configuration.parser); + String tikaXmlConfiguration = generateTikaXmlConfiguration(parsers); + + serviceProvider.produce(new ServiceProviderBuildItem(Parser.class.getName(), new ArrayList<>(parsers.keySet()))); + serviceProvider + .produce(new ServiceProviderBuildItem(Detector.class.getName(), getProviderNames(Detector.class.getName()))); + serviceProvider.produce(new ServiceProviderBuildItem(EncodingDetector.class.getName(), + getProviderNames(EncodingDetector.class.getName()))); + + recorder.initTikaParser(beanContainer.getValue(), configuration, tikaXmlConfiguration); + } + + private static List getProviderNames(String serviceProviderName) throws Exception { return new ArrayList<>(ServiceUtil.classNamesNamedIn(TikaProcessor.class.getClassLoader(), "META-INF/services/" + serviceProviderName)); } - static Map> getSupportedParserConfig(Optional tikaConfigPath, + public static Map> getSupportedParserConfig(Optional tikaConfigPath, Optional requiredParsers, Map> parserParamMaps, Map parserAbbreviations) throws Exception { @@ -140,14 +134,41 @@ static Map> getSupportedParserConfig(Optional< .collect(Collectors.toList()); Map fullNamesAndAbbreviations = abbreviations.stream() .collect(Collectors.toMap(p -> getParserNameFromConfig(p, parserAbbreviations), Function.identity())); - return providerNames.stream().filter(pred).filter(p -> fullNamesAndAbbreviations.containsKey(p)) .collect(Collectors.toMap(Function.identity(), p -> getParserConfig(p, parserParamMaps.get(fullNamesAndAbbreviations.get(p))))); } } - static List getParserConfig(String parserName, Map parserParamMap) { + private static String generateTikaXmlConfiguration(Map> parserConfig) { + StringBuilder tikaXmlConfigurationBuilder = new StringBuilder(); + tikaXmlConfigurationBuilder.append(""); + tikaXmlConfigurationBuilder.append(""); + for (Entry> parserEntry : parserConfig.entrySet()) { + tikaXmlConfigurationBuilder.append(""); + if (!parserEntry.getValue().isEmpty()) { + appendParserParameters(tikaXmlConfigurationBuilder, parserEntry.getValue()); + } + tikaXmlConfigurationBuilder.append(""); + } + tikaXmlConfigurationBuilder.append(""); + tikaXmlConfigurationBuilder.append(""); + return tikaXmlConfigurationBuilder.toString(); + } + + private static void appendParserParameters(StringBuilder tikaXmlConfigurationBuilder, + List parserParams) { + tikaXmlConfigurationBuilder.append(""); + for (TikaParserParameter parserParam : parserParams) { + tikaXmlConfigurationBuilder.append(""); + tikaXmlConfigurationBuilder.append(parserParam.getValue()); + tikaXmlConfigurationBuilder.append(""); + } + tikaXmlConfigurationBuilder.append(""); + } + + private static List getParserConfig(String parserName, Map parserParamMap) { List parserParams = new LinkedList<>(); if (parserParamMap != null) { for (Map.Entry entry : parserParamMap.entrySet()) { @@ -173,8 +194,8 @@ private static String getParserNameFromConfig(String abbreviation, Map> parserConfig = getParserConfig(null, "pdf", + Map> parserConfig = getParserConfig(null, "pdf", Collections.singletonMap("pdf", Collections.singletonMap("sort-by-position", "true")), Collections.emptyMap()); @@ -115,13 +87,19 @@ public void testSupportedParserNamesWithTikaConfigPath() throws Exception { assertEquals(69, names.size()); } + @Test + public void testUnhyphenation() { + assertEquals("sortByPosition", TikaProcessor.unhyphenate("sort-by-position")); + assertEquals("position", TikaProcessor.unhyphenate("position")); + } + private Set getParserNames(String tikaConfigPath, String parsers) throws Exception { return TikaProcessor.getSupportedParserConfig( Optional.ofNullable(tikaConfigPath), Optional.ofNullable(parsers), Collections.emptyMap(), Collections.emptyMap()).keySet(); } - private Map> getParserConfig(String tikaConfigPath, String parsers, + private Map> getParserConfig(String tikaConfigPath, String parsers, Map> parserParamMaps, Map parserAbbreviations) throws Exception { return TikaProcessor.getSupportedParserConfig( diff --git a/extensions/tika/runtime/src/main/java/io/quarkus/tika/runtime/TikaParserParameter.java b/extensions/tika/runtime/src/main/java/io/quarkus/tika/runtime/TikaParserParameter.java deleted file mode 100644 index 057532068b8ad..0000000000000 --- a/extensions/tika/runtime/src/main/java/io/quarkus/tika/runtime/TikaParserParameter.java +++ /dev/null @@ -1,41 +0,0 @@ -package io.quarkus.tika.runtime; - -public class TikaParserParameter { - private String name; - private String value; - private String type; - - public TikaParserParameter() { - - } - - public TikaParserParameter(String name, String value, String type) { - this.name = name; - this.value = value; - this.type = type; - } - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public String getType() { - return type; - } - - public void setType(String type) { - this.type = type; - } - - public String getValue() { - return value; - } - - public void setValue(String value) { - this.value = value; - } -} diff --git a/extensions/tika/runtime/src/main/java/io/quarkus/tika/runtime/TikaRecorder.java b/extensions/tika/runtime/src/main/java/io/quarkus/tika/runtime/TikaRecorder.java index f0330428e3cf6..ef449afba8f83 100644 --- a/extensions/tika/runtime/src/main/java/io/quarkus/tika/runtime/TikaRecorder.java +++ b/extensions/tika/runtime/src/main/java/io/quarkus/tika/runtime/TikaRecorder.java @@ -3,9 +3,6 @@ import java.io.ByteArrayInputStream; import java.io.InputStream; import java.nio.charset.StandardCharsets; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; import org.apache.tika.config.TikaConfig; import org.apache.tika.parser.AutoDetectParser; @@ -20,17 +17,16 @@ @Recorder public class TikaRecorder { - public void initTikaParser(BeanContainer container, TikaConfiguration config, - Map> parserConfig) { - TikaParser parser = initializeParser(config, parserConfig); + public void initTikaParser(BeanContainer container, TikaConfiguration config, String tikaXmlConfiguration) { + TikaParser parser = initializeParser(config, tikaXmlConfiguration); TikaParserProducer producer = container.instance(TikaParserProducer.class); producer.initialize(parser); } - private TikaParser initializeParser(TikaConfiguration config, Map> parserConfig) { - TikaConfig tikaConfig = null; + private TikaParser initializeParser(TikaConfiguration config, String tikaXmlConfiguration) { + TikaConfig tikaConfig; - try (InputStream stream = getTikaConfigStream(config, parserConfig)) { + try (InputStream stream = getTikaConfigStream(config, tikaXmlConfiguration)) { tikaConfig = new TikaConfig(stream); } catch (Exception ex) { final String errorMessage = "Invalid tika-config.xml"; @@ -47,10 +43,9 @@ private TikaParser initializeParser(TikaConfiguration config, Map> parserConfig) { + private static InputStream getTikaConfigStream(TikaConfiguration config, String tikaXmlConfiguration) { // Load tika-config.xml resource - InputStream is = null; + InputStream is; if (config.tikaConfigPath.isPresent()) { is = TikaRecorder.class.getResourceAsStream( config.tikaConfigPath.get().startsWith("/") ? config.tikaConfigPath.get() @@ -60,35 +55,8 @@ private static InputStream getTikaConfigStream(TikaConfiguration config, throw new TikaParseException(errorMessage); } } else { - is = generateTikaConfig(parserConfig); + is = new ByteArrayInputStream(tikaXmlConfiguration.getBytes(StandardCharsets.UTF_8)); } return is; } - - private static InputStream generateTikaConfig(Map> parserConfig) { - StringBuilder sb = new StringBuilder(); - sb.append(""); - sb.append(""); - for (Entry> parserEntry : parserConfig.entrySet()) { - sb.append(""); - if (!parserEntry.getValue().isEmpty()) { - appendParserParameters(sb, parserEntry.getValue()); - } - sb.append(""); - } - sb.append(""); - sb.append(""); - return new ByteArrayInputStream(sb.toString().getBytes(StandardCharsets.UTF_8)); - } - - private static void appendParserParameters(StringBuilder sb, List parserParams) { - sb.append(""); - for (TikaParserParameter parserParam : parserParams) { - sb.append(""); - sb.append(parserParam.getValue()); - sb.append(""); - } - sb.append(""); - } }