Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate Tika XML configuration during build time #6752

Merged
merged 1 commit into from
Jan 31, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;
import java.util.function.Function;
Expand All @@ -32,7 +33,6 @@
import io.quarkus.deployment.util.ServiceUtil;
import io.quarkus.tika.TikaParseException;
import io.quarkus.tika.runtime.TikaConfiguration;
import io.quarkus.tika.runtime.TikaParserParameter;
import io.quarkus.tika.runtime.TikaParserProducer;
import io.quarkus.tika.runtime.TikaRecorder;

Expand All @@ -52,21 +52,9 @@ public class TikaProcessor {
{ "odf", "org.apache.tika.parser.odf.OpenDocumentParser" }
}).collect(Collectors.toMap(kv -> kv[0], kv -> kv[1]));

private TikaConfiguration config;

@BuildStep
AdditionalBeanBuildItem beans() {
return AdditionalBeanBuildItem.builder().addBeanClasses(TikaParserProducer.class).build();
}

@BuildStep
@Record(ExecutionTime.STATIC_INIT)
TikaParsersConfigBuildItem initializeTikaParser(BeanContainerBuildItem beanContainer, TikaRecorder recorder)
throws Exception {
Map<String, List<TikaParserParameter>> parsersConfig = getSupportedParserConfig(config.tikaConfigPath, config.parsers,
config.parserOptions, config.parser);
recorder.initTikaParser(beanContainer.getValue(), config, parsersConfig);
return new TikaParsersConfigBuildItem(parsersConfig);
return AdditionalBeanBuildItem.unremovableOf(TikaParserProducer.class);
}

@BuildStep
Expand All @@ -91,42 +79,48 @@ public void registerRuntimeInitializedClasses(BuildProducer<RuntimeInitializedCl
}

@BuildStep
public void registerTikaCoreResources(BuildProducer<NativeImageResourceBuildItem> resource) throws Exception {
public void registerTikaCoreResources(BuildProducer<NativeImageResourceBuildItem> resource) {
    // Classpath resources shipped with Tika core that are registered as native image resources.
    final String[] coreResourcePaths = {
            "org/apache/tika/mime/tika-mimetypes.xml",
            "org/apache/tika/parser/external/tika-external-parsers.xml"
    };
    for (String coreResourcePath : coreResourcePaths) {
        resource.produce(new NativeImageResourceBuildItem(coreResourcePath));
    }
}

@BuildStep
public void registerTikaParsersResources(BuildProducer<NativeImageResourceBuildItem> resource) throws Exception {
public void registerTikaParsersResources(BuildProducer<NativeImageResourceBuildItem> resource) {
    // Properties file of the Tika PDF parser, registered as a native image resource.
    final NativeImageResourceBuildItem pdfParserProperties = new NativeImageResourceBuildItem(
            "org/apache/tika/parser/pdf/PDFParser.properties");
    resource.produce(pdfParserProperties);
}

@BuildStep
public void registerPdfBoxResources(BuildProducer<NativeImageResourceBuildItem> resource) throws Exception {
public void registerPdfBoxResources(BuildProducer<NativeImageResourceBuildItem> resource) {
    // PDFBox glyph list files, each registered as its own native image resource.
    final String[] glyphListPaths = {
            "org/apache/pdfbox/resources/glyphlist/additional.txt",
            "org/apache/pdfbox/resources/glyphlist/glyphlist.txt",
            "org/apache/pdfbox/resources/glyphlist/zapfdingbats.txt"
    };
    for (String glyphListPath : glyphListPaths) {
        resource.produce(new NativeImageResourceBuildItem(glyphListPath));
    }
}

@BuildStep
public void registerTikaProviders(BuildProducer<ServiceProviderBuildItem> serviceProvider,
TikaParsersConfigBuildItem parserConfigItem) throws Exception {
serviceProvider.produce(
new ServiceProviderBuildItem(Parser.class.getName(),
new ArrayList<>(parserConfigItem.getConfiguration().keySet())));
serviceProvider.produce(
new ServiceProviderBuildItem(Detector.class.getName(), getProviderNames(Detector.class.getName())));
serviceProvider.produce(
new ServiceProviderBuildItem(EncodingDetector.class.getName(),
getProviderNames(EncodingDetector.class.getName())));
}

static List<String> getProviderNames(String serviceProviderName) throws Exception {
@Record(ExecutionTime.STATIC_INIT)
void initializeTikaParser(BeanContainerBuildItem beanContainer, TikaRecorder recorder,
BuildProducer<ServiceProviderBuildItem> serviceProvider, TikaConfiguration configuration)
throws Exception {
Map<String, List<TikaParserParameter>> parsers = getSupportedParserConfig(configuration.tikaConfigPath,
Copy link
Member

@sberyozkin sberyozkin Jan 23, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@machi1990 first of all, thanks for this effort :-)
I'm not sure about removing a build item containing this Map. My plan, with the current master, has been to make this Map visible to various steps dealing with the individual parsers. For example, I suggested to @tpenakov in the issue where he works on OOXML support to group all PDF related build steps into a single method like preparePdfParser and check this map if PDF parser key is available and only then do all those PDF specific native registrations, the same for OOXML.

For example, this method, this one and this one are specific to PDF and the plan has been to make a single method preparePDFParser out of 3 of them and make all of that optional depending on whether org.apache.tika.parser.pdf.PDFParser key is in the map or not.

So I think that build item still has to be retained. Let me know please if it makes sense. CC @gsmet

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@machi1990 first of all, thanks for this effort :-)

Hi @sberyozkin thank you.

I'm not sure about removing a build item containing this Map. My plan, with the current master, has been to make this Map visible to various steps dealing with the individual parsers. For example, I suggested to @tpenakov in the issue where he works on OOXML support to group all PDF related build steps into a single method like preparePdfParser and check this map if PDF parser key is available and only then do all those PDF specific native registrations, the same for OOXML.

Oh sorry, I was not aware of the future plans around this class.

If I understood the description above correctly, it seems that these various methods (e.g. preparePdfParser) will be consuming this build item from within the TikaProcessor class, correct?

If so, could the same be achieved by invoking a private method and passing the parsers' map as a parameter?

Copy link
Member

@sberyozkin sberyozkin Jan 23, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@machi1990 sorry, had to sign off for a bit.
Yes, I actually thought about it while being offline :-), but then TikaProcessor would just have a large single method :-), calling preparePdfParser, prepareOOXmlParser, etc. It just looks a bit cleaner to me to have dedicated build steps per specific parser, or does the new CL model not allow for it as far as Tika is concerned? Not a big deal, but I wonder what the best approach is here.

Copy link
Member

@sberyozkin sberyozkin Jan 24, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @machi1990 @gsmet so what do you think? I've been thinking today that I'd like to have a build step for each parser which requires some support. The main build step is the one which generates the Tika config. Maybe it could be named accordingly, generateTikaConfig or createTikaConfig; it would return a build item with the Map, and then there would be a PDF build step, an OOXML build step, etc. as required, which would optionally register the extra bits if the map has the parser key. Otherwise it would indeed have to be this master build step that keeps adding private calls to preparePDF, etc., and it would feel a bit less cool to me :-)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@machi1990 sorry, had to sign off for a bit.

It is okay, I missed your ping too.

Yes, I actually thought about it while being offline :-), but then TikaProcessor would just have a large single method :-), calling preparePdfParser, prepareOOXmlParser, etc. It just looks a bit cleaner to me to have dedicated build steps per specific parser, or does the new CL model not allow for it as far as Tika is concerned?

Hi @sberyozkin the change is not CL related but on whether we need the BuildItem to achieve the above.

Hi @machi1990 @gsmet so what do you think? I've been thinking today that I'd like to have a build step for each parser which requires some support. The main build step is the one which generates the Tika config. Maybe it could be named accordingly, generateTikaConfig or createTikaConfig; it would return a build item with the Map, and then there would be a PDF build step, an OOXML build step, etc. as required, which would optionally register the extra bits if the map has the parser key. Otherwise it would indeed have to be this master build step that keeps adding private calls to preparePDF, etc., and it would feel a bit less cool to me :-)

If you prefer to keep it around, I can bring it back. @gsmet WDYT?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@machi1990 sorry, missed your comment. Yeah, it would be nice. I reckon it is not really related to the class loading issue, though I can see why Guillaume saw it as redundant, because the optimization plans I'm referring to above are not yet implemented :-) and that item was only used to facilitate the sub-optimal Tika config generation.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not about being redundant. We certainly cannot pass those classes from build time to runtime and expect them to work.

I don't see the need for a build item if everything is done in the same extension.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gsmet This PR itself has preserved a reasonable number of build steps, one generates the configuration now, there are few build steps which deal with loading some resources, common to all of the Tika parsers, and some - common to PDF, and there would be OOXML specific build step coming in, we are not going ahead with collapsing them all into a single step in this PR. So it is clean and nice. Now, if this build item is removed, then for me to implement the optimization idea I had in mind I'd have to basically collapse everything (but a single step involving the Tika-common resources) into a single build step (generate the config, optionally load PDF/OOXML specific resources).
I appreciate a build item is great for coordinating between different extensions, but in this case it would help the tika processor keep the parser specific build steps separate, keeping the native optimization in mind. Np if you don't agree, please go ahead with the merge.
@machi1990 thanks for fix, cheers

configuration.parsers,
configuration.parserOptions, configuration.parser);
String tikaXmlConfiguration = generateTikaXmlConfiguration(parsers);

serviceProvider.produce(new ServiceProviderBuildItem(Parser.class.getName(), new ArrayList<>(parsers.keySet())));
serviceProvider
.produce(new ServiceProviderBuildItem(Detector.class.getName(), getProviderNames(Detector.class.getName())));
serviceProvider.produce(new ServiceProviderBuildItem(EncodingDetector.class.getName(),
getProviderNames(EncodingDetector.class.getName())));

recorder.initTikaParser(beanContainer.getValue(), configuration, tikaXmlConfiguration);
}

/**
 * Lists the class names declared in the {@code META-INF/services/} file of the given service.
 *
 * @param serviceProviderName name of the service whose provider file is read
 * @return a new list holding the class names returned by {@code ServiceUtil.classNamesNamedIn}
 * @throws Exception if the lookup performed by {@code ServiceUtil} fails
 */
private static List<String> getProviderNames(String serviceProviderName) throws Exception {
    final ClassLoader processorClassLoader = TikaProcessor.class.getClassLoader();
    final String providerFilePath = "META-INF/services/" + serviceProviderName;
    return new ArrayList<>(ServiceUtil.classNamesNamedIn(processorClassLoader, providerFilePath));
}

static Map<String, List<TikaParserParameter>> getSupportedParserConfig(Optional<String> tikaConfigPath,
public static Map<String, List<TikaParserParameter>> getSupportedParserConfig(Optional<String> tikaConfigPath,
Optional<String> requiredParsers,
Map<String, Map<String, String>> parserParamMaps,
Map<String, String> parserAbbreviations) throws Exception {
Expand All @@ -140,14 +134,41 @@ static Map<String, List<TikaParserParameter>> getSupportedParserConfig(Optional<
.collect(Collectors.toList());
Map<String, String> fullNamesAndAbbreviations = abbreviations.stream()
.collect(Collectors.toMap(p -> getParserNameFromConfig(p, parserAbbreviations), Function.identity()));

return providerNames.stream().filter(pred).filter(p -> fullNamesAndAbbreviations.containsKey(p))
.collect(Collectors.toMap(Function.identity(),
p -> getParserConfig(p, parserParamMaps.get(fullNamesAndAbbreviations.get(p)))));
}
}

static List<TikaParserParameter> getParserConfig(String parserName, Map<String, String> parserParamMap) {
/**
 * Renders the parser configuration as a single-line XML document of the form
 * {@code <properties><parsers><parser class="...">...</parser></parsers></properties>}.
 *
 * @param parserConfig parser class names mapped to their parameters; a parser with an empty
 *        parameter list is rendered as an empty {@code <parser>} element
 * @return the generated XML text
 */
private static String generateTikaXmlConfiguration(Map<String, List<TikaParserParameter>> parserConfig) {
    final StringBuilder xml = new StringBuilder();
    xml.append("<properties>").append("<parsers>");
    for (Entry<String, List<TikaParserParameter>> parser : parserConfig.entrySet()) {
        xml.append("<parser class=\"").append(parser.getKey()).append("\">");
        final List<TikaParserParameter> parameters = parser.getValue();
        // The <params> wrapper is only written for parsers that actually have parameters.
        if (!parameters.isEmpty()) {
            appendParserParameters(xml, parameters);
        }
        xml.append("</parser>");
    }
    return xml.append("</parsers>").append("</properties>").toString();
}

/**
 * Appends a {@code <params>} element containing one {@code <param>} child per parser parameter.
 * <p>
 * The parameter name, type and value are XML-escaped before being written, so that a value
 * containing markup characters (for example {@code &} or {@code <}) cannot produce a malformed
 * configuration document. Text without such characters is written unchanged.
 *
 * @param tikaXmlConfigurationBuilder builder the XML fragment is appended to
 * @param parserParams parameters to render, in iteration order
 */
private static void appendParserParameters(StringBuilder tikaXmlConfigurationBuilder,
        List<TikaParserParameter> parserParams) {
    tikaXmlConfigurationBuilder.append("<params>");
    for (TikaParserParameter parserParam : parserParams) {
        tikaXmlConfigurationBuilder.append("<param name=\"").append(escapeXml(parserParam.getName()));
        tikaXmlConfigurationBuilder.append("\" type=\"").append(escapeXml(parserParam.getType())).append("\">");
        tikaXmlConfigurationBuilder.append(escapeXml(parserParam.getValue()));
        tikaXmlConfigurationBuilder.append("</param>");
    }
    tikaXmlConfigurationBuilder.append("</params>");
}

// Replaces the XML markup characters &, <, > and " with their predefined entity references.
private static String escapeXml(String raw) {
    // A null keeps rendering as the literal "null", exactly as StringBuilder.append(String) would.
    final String text = String.valueOf(raw);
    final StringBuilder escaped = new StringBuilder(text.length());
    for (int i = 0; i < text.length(); i++) {
        final char c = text.charAt(i);
        switch (c) {
            case '&':
                escaped.append("&amp;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '>':
                escaped.append("&gt;");
                break;
            case '"':
                escaped.append("&quot;");
                break;
            default:
                escaped.append(c);
        }
    }
    return escaped.toString();
}

private static List<TikaParserParameter> getParserConfig(String parserName, Map<String, String> parserParamMap) {
List<TikaParserParameter> parserParams = new LinkedList<>();
if (parserParamMap != null) {
for (Map.Entry<String, String> entry : parserParamMap.entrySet()) {
Expand All @@ -173,8 +194,8 @@ private static String getParserNameFromConfig(String abbreviation, Map<String, S
+ "quarkus.tika.parser-name." + abbreviation + " property");
}

// Convert a property name such as "sort-by-position" to "sortByPosition"
private static String unhyphenate(String paramName) {
// Convert a property name such as "sort-by-position" to "sortByPosition"
public static String unhyphenate(String paramName) {
StringBuilder sb = new StringBuilder();
String[] words = paramName.split("-");
for (int i = 0; i < words.length; i++) {
Expand Down Expand Up @@ -217,4 +238,28 @@ private static String getParserParamType(String parserName, String paramName) {
throw new TikaParseException(errorMessage);
}
}

/**
 * Immutable description of a single Tika parser parameter: its name, its value and its type,
 * all kept as strings.
 */
public static class TikaParserParameter {
    // Assigned once in the constructor and never modified afterwards.
    private final String name;
    private final String value;
    private final String type;

    /**
     * @param name the parameter name
     * @param value the parameter value
     * @param type the parameter type
     */
    public TikaParserParameter(String name, String value, String type) {
        this.name = name;
        this.value = value;
        this.type = type;
    }

    /** @return the parameter name */
    public String getName() {
        return name;
    }

    /** @return the parameter type */
    public String getType() {
        return type;
    }

    /** @return the parameter value */
    public String getValue() {
        return value;
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,43 +10,15 @@
import java.util.Optional;
import java.util.Set;

import org.eclipse.microprofile.config.Config;
import org.eclipse.microprofile.config.spi.ConfigProviderResolver;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.RegisterExtension;

import io.quarkus.runtime.configuration.QuarkusConfigFactory;
import io.quarkus.tika.runtime.TikaParserParameter;
import io.smallrye.config.SmallRyeConfig;
import io.smallrye.config.SmallRyeConfigBuilder;
import io.quarkus.test.QuarkusUnitTest;

public class TikaProcessorTest {

// We must register a configuration otherwise we'll get an exception.

static volatile SmallRyeConfig config;

@BeforeAll
public static void setItUp() {
final SmallRyeConfigBuilder builder = new SmallRyeConfigBuilder();
builder.addDefaultSources();
builder.addDiscoveredConverters();
builder.addDiscoveredSources();
config = builder.build();
QuarkusConfigFactory.setConfig(config);
ConfigProviderResolver cpr = ConfigProviderResolver.instance();
final Config existingConfig = cpr.getConfig();
if (existingConfig != TikaProcessorTest.config) {
cpr.releaseConfig(existingConfig);
}
}

@AfterAll
public static void tearItDown() {
ConfigProviderResolver cpr = ConfigProviderResolver.instance();
cpr.releaseConfig(config);
}
@RegisterExtension
static final QuarkusUnitTest quarkusUnitTest = new QuarkusUnitTest();

@Test
public void testPDFParserName() throws Exception {
Expand Down Expand Up @@ -82,7 +54,7 @@ public void testResolvableCustomAbbreviation() throws Exception {

@Test
public void testPdfParserConfig() throws Exception {
Map<String, List<TikaParserParameter>> parserConfig = getParserConfig(null, "pdf",
Map<String, List<TikaProcessor.TikaParserParameter>> parserConfig = getParserConfig(null, "pdf",
Collections.singletonMap("pdf",
Collections.singletonMap("sort-by-position", "true")),
Collections.emptyMap());
Expand Down Expand Up @@ -115,13 +87,19 @@ public void testSupportedParserNamesWithTikaConfigPath() throws Exception {
assertEquals(69, names.size());
}

@Test
public void testUnhyphenation() {
    // Each row is { input property name, expected result of TikaProcessor.unhyphenate }.
    final String[][] cases = {
            { "sort-by-position", "sortByPosition" },
            { "position", "position" }
    };
    for (String[] unhyphenationCase : cases) {
        assertEquals(unhyphenationCase[1], TikaProcessor.unhyphenate(unhyphenationCase[0]));
    }
}

// Resolves the supported parser configuration with no parser options and no abbreviations,
// and returns the key set of the resulting map.
private Set<String> getParserNames(String tikaConfigPath, String parsers) throws Exception {
    final Optional<String> configPath = Optional.ofNullable(tikaConfigPath);
    final Optional<String> requiredParsers = Optional.ofNullable(parsers);
    final Map<String, Map<String, String>> noParserOptions = Collections.emptyMap();
    final Map<String, String> noAbbreviations = Collections.emptyMap();
    return TikaProcessor
            .getSupportedParserConfig(configPath, requiredParsers, noParserOptions, noAbbreviations)
            .keySet();
}

private Map<String, List<TikaParserParameter>> getParserConfig(String tikaConfigPath, String parsers,
private Map<String, List<TikaProcessor.TikaParserParameter>> getParserConfig(String tikaConfigPath, String parsers,
Map<String, Map<String, String>> parserParamMaps,
Map<String, String> parserAbbreviations) throws Exception {
return TikaProcessor.getSupportedParserConfig(
Expand Down

This file was deleted.

Loading