diff --git a/src/main/java/io/quarkus/search/app/SearchService.java b/src/main/java/io/quarkus/search/app/SearchService.java index 19947b63..32682ac7 100644 --- a/src/main/java/io/quarkus/search/app/SearchService.java +++ b/src/main/java/io/quarkus/search/app/SearchService.java @@ -78,6 +78,7 @@ public SearchResult search(@RestQuery @DefaultValue(QuarkusVersi .field(localizedField("title_autocomplete", language)).boost(1.0f) .field(localizedField("summary_autocomplete", language)).boost(0.5f) .field(localizedField("fullContent_autocomplete", language)).boost(0.1f) + .field(localizedField("fullContent_configProperties", language)).boost(2.0f) .matching(q) // See: https://github.com/elastic/elasticsearch/issues/39905#issuecomment-471578025 // while the issue is about stopwords the same problem is observed for synonyms on search-analyzer side. diff --git a/src/main/java/io/quarkus/search/app/entity/Guide.java b/src/main/java/io/quarkus/search/app/entity/Guide.java index 811056b2..9d76a85a 100644 --- a/src/main/java/io/quarkus/search/app/entity/Guide.java +++ b/src/main/java/io/quarkus/search/app/entity/Guide.java @@ -72,6 +72,7 @@ public class Guide { @I18nFullTextField(name = "fullContent", valueBridge = @ValueBridgeRef(type = InputProviderHtmlBodyTextBridge.class), highlightable = Highlightable.UNIFIED, analyzerPrefix = AnalysisConfigurer.DEFAULT, searchAnalyzerPrefix = AnalysisConfigurer.DEFAULT_SEARCH) @I18nFullTextField(name = "fullContent_autocomplete", valueBridge = @ValueBridgeRef(type = InputProviderHtmlBodyTextBridge.class), analyzerPrefix = AnalysisConfigurer.AUTOCOMPLETE, searchAnalyzerPrefix = AnalysisConfigurer.DEFAULT_SEARCH) + @I18nFullTextField(name = "fullContent_configProperties", valueBridge = @ValueBridgeRef(type = InputProviderHtmlBodyTextBridge.class), analyzerPrefix = AnalysisConfigurer.CONFIG_PROPERTIES, searchAnalyzerPrefix = AnalysisConfigurer.CONFIG_PROPERTIES_SEARCH) @Transient @IndexingDependency(reindexOnUpdate = ReindexOnUpdate.NO) 
public InputProvider htmlFullContentProvider; diff --git a/src/main/java/io/quarkus/search/app/hibernate/AnalysisConfigurer.java b/src/main/java/io/quarkus/search/app/hibernate/AnalysisConfigurer.java index 48ab9f25..5c77bbdb 100644 --- a/src/main/java/io/quarkus/search/app/hibernate/AnalysisConfigurer.java +++ b/src/main/java/io/quarkus/search/app/hibernate/AnalysisConfigurer.java @@ -22,6 +22,8 @@ public class AnalysisConfigurer implements ElasticsearchAnalysisConfigurer { public static final String DEFAULT = "basic_analyzer"; public static final String DEFAULT_SEARCH = DEFAULT + "_search"; public static final String AUTOCOMPLETE = "autocomplete"; + public static final String CONFIG_PROPERTIES = "config_properties"; + public static final String CONFIG_PROPERTIES_SEARCH = "config_properties_search"; public static final String SORT = "sort"; public static String defaultAnalyzer(Language language) { @@ -36,6 +38,14 @@ public static String autocompleteAnalyzer(Language language) { return localizedAnalyzer(AUTOCOMPLETE, language); } + public static String configPropertiesAnalyzer(Language language) { + return localizedAnalyzer(CONFIG_PROPERTIES, language); + } + + public static String configPropertiesSearchAnalyzer(Language language) { + return localizedAnalyzer(CONFIG_PROPERTIES_SEARCH, language); + } + public static String localizedAnalyzer(String prefix, Language language) { return "%s_%s".formatted(prefix, language.code); } @@ -44,6 +54,15 @@ public static String localizedAnalyzer(String prefix, Language language) { public void configure(ElasticsearchAnalysisConfigurationContext context) { // for en/es/pt we are going to use the same english configuration since guides are not translated EnumSet englishLanguages = EnumSet.of(Language.ENGLISH, Language.PORTUGUESE, Language.SPANISH); + + context.tokenizer("config_properties_tokenizer") + .type("simple_pattern") + .param("pattern", "(quarkus(\\.[a-z\\-\\\"]+)+)|(QUARKUS(_[A-Z_]+)+)"); + 
context.tokenFilter("autocomplete_config_properties") + .type("edge_ngram") + .param("min_gram", 2) + .param("max_gram", 70); + for (Language language : englishLanguages) { SharedFilters result = sharedFilters(context, language); @@ -68,6 +87,13 @@ public void configure(ElasticsearchAnalysisConfigurationContext context) { .tokenFilters(result.possessiveStemmer(), "lowercase", "asciifolding", result.stop(), result.regularStemmer(), result.autocompleteEdgeNgram()) .charFilters("html_strip"); + + // config properties + context.analyzer(configPropertiesAnalyzer(language)).custom() + .tokenizer("config_properties_tokenizer") + .tokenFilters("autocomplete_config_properties"); + context.analyzer(configPropertiesSearchAnalyzer(language)).custom() + .tokenizer("keyword"); } // japanese @@ -96,6 +122,12 @@ public void configure(ElasticsearchAnalysisConfigurationContext context) { japanese.autocompleteEdgeNgram()) .charFilters("icu_normalizer", "html_strip"); + context.analyzer(configPropertiesAnalyzer(Language.JAPANESE)).custom() + .tokenizer("config_properties_tokenizer") + .tokenFilters("autocomplete_config_properties"); + context.analyzer(configPropertiesSearchAnalyzer(Language.JAPANESE)).custom() + .tokenizer("keyword"); + // chinese // https://www.elastic.co/guide/en/elasticsearch/plugins/current/_reimplementing_and_extending_the_analyzers.html SharedFilters chinese = sharedFilters(context, Language.CHINESE); @@ -123,6 +155,12 @@ public void configure(ElasticsearchAnalysisConfigurationContext context) { chinese.regularStemmer(), chinese.autocompleteEdgeNgram()) .charFilters("html_strip"); + context.analyzer(configPropertiesAnalyzer(Language.CHINESE)).custom() + .tokenizer("config_properties_tokenizer") + .tokenFilters("autocomplete_config_properties"); + context.analyzer(configPropertiesSearchAnalyzer(Language.CHINESE)).custom() + .tokenizer("keyword"); + context.normalizer(SORT).custom() .tokenFilters("lowercase"); } @@ -144,7 +182,7 @@ private static SharedFilters 
sharedFilters(ElasticsearchAnalysisConfigurationCon .param("language", "possessive_english"); context.tokenFilter(autocompleteEdgeNgram) .type("edge_ngram") - .param("min_gram", 1) + .param("min_gram", 2) .param("max_gram", 10); context.tokenFilter(synonymsGraphFilter) // See https://www.elastic.co/guide/en/elasticsearch/reference/8.11/analysis-synonym-graph-tokenfilter.html#analysis-synonym-graph-tokenfilter diff --git a/src/main/java/io/quarkus/search/app/hibernate/InputProviderHtmlBodyTextBridge.java b/src/main/java/io/quarkus/search/app/hibernate/InputProviderHtmlBodyTextBridge.java index 8712b935..be50f8a4 100644 --- a/src/main/java/io/quarkus/search/app/hibernate/InputProviderHtmlBodyTextBridge.java +++ b/src/main/java/io/quarkus/search/app/hibernate/InputProviderHtmlBodyTextBridge.java @@ -19,9 +19,16 @@ public String toIndexedValue(InputProvider provider, ValueBridgeToIndexedValueCo try (var in = provider.open()) { Element body = Jsoup.parse(in, StandardCharsets.UTF_8.name(), "/").body(); // Content div has two grid columns: actual content and TOC. There's not much use of the TOC, we want the content only: - Element content = body.selectFirst(".content .grid__item"); + Element content = body.selectFirst(".guide"); if (content != null) { - // Means we've found a guide content column. 
hence let's use that to have only real content: + // We may be looking at a guide with/without a TOC; + // if it is one with the TOC, there's no much point in indexing the TOC itself, hence we don't include it: + + // column (grid items) are not present in a guide like all-config + Element guideColumn = content.selectFirst(".grid__item"); + if (guideColumn != null) { + content = guideColumn; + } return encode(content); } else { // we might be looking at a quarkiverse guide; in such case: diff --git a/src/main/resources/indexes/mapping-template.json b/src/main/resources/indexes/mapping-template.json index 112ff473..2a7873ad 100644 --- a/src/main/resources/indexes/mapping-template.json +++ b/src/main/resources/indexes/mapping-template.json @@ -1,7 +1,8 @@ { "_source": { "excludes": [ - "fullContent_autocomplete_*" + "fullContent_autocomplete_*", + "fullContent_configProperties_*" ] } } diff --git a/src/test/java/io/quarkus/search/app/SearchServiceTest.java b/src/test/java/io/quarkus/search/app/SearchServiceTest.java index 100b7864..e69c0f86 100644 --- a/src/test/java/io/quarkus/search/app/SearchServiceTest.java +++ b/src/test/java/io/quarkus/search/app/SearchServiceTest.java @@ -8,6 +8,7 @@ import java.time.Duration; import java.util.List; import java.util.Locale; +import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -90,8 +91,9 @@ void queryMatchingFullTerm() { GuideRef.HIBERNATE_SEARCH_ORM_ELASTICSEARCH, GuideRef.HIBERNATE_REACTIVE, GuideRef.HIBERNATE_REACTIVE_PANACHE, - GuideRef.SPRING_DATA_JPA)); - assertThat(result.total()).isEqualTo(7); + GuideRef.SPRING_DATA_JPA, + GuideRef.ALL_CONFIG)); + assertThat(result.total()).isEqualTo(8); } @Test @@ -102,8 +104,8 @@ void queryMatchingIncludedAdoc() { // (or... the full rendered HTML). 
var result = search("quarkus.hibernate-orm.validate-in-dev-mode"); assertThat(result.hits()).extracting(GuideSearchHit::url).containsExactlyInAnyOrder(GuideRef.urls( - GuideRef.HIBERNATE_ORM, GuideRef.HIBERNATE_REACTIVE)); - assertThat(result.total()).isEqualTo(2); + GuideRef.HIBERNATE_ORM, GuideRef.HIBERNATE_REACTIVE, GuideRef.ALL_CONFIG)); + assertThat(result.total()).isEqualTo(3); } @Test @@ -118,8 +120,9 @@ void queryMatchingPrefixTerm() { GuideRef.HIBERNATE_REACTIVE, GuideRef.HIBERNATE_REACTIVE_PANACHE, GuideRef.SPRING_DATA_JPA, - GuideRef.DUPLICATED_CONTEXT)); - assertThat(result.total()).isEqualTo(8); + GuideRef.DUPLICATED_CONTEXT, + GuideRef.ALL_CONFIG)); + assertThat(result.total()).isEqualTo(9); } @Test @@ -127,8 +130,8 @@ void queryMatchingTwoTerms() { var result = search("orm elasticsearch"); // We expect an AND by default assertThat(result.hits()).extracting(GuideSearchHit::url) - .containsExactlyInAnyOrder(GuideRef.urls(GuideRef.HIBERNATE_SEARCH_ORM_ELASTICSEARCH)); - assertThat(result.total()).isEqualTo(1); + .containsExactlyInAnyOrder(GuideRef.urls(GuideRef.HIBERNATE_SEARCH_ORM_ELASTICSEARCH, GuideRef.ALL_CONFIG)); + assertThat(result.total()).isEqualTo(2); } @Test @@ -136,7 +139,7 @@ void queryEmptyString() { var result = search(""); assertThat(result.hits()).extracting(GuideSearchHit::url) .containsExactlyInAnyOrder(GuideRef.urls(QuarkusIOSample.SearchServiceFilterDefinition.guides())); - assertThat(result.total()).isEqualTo(10); + assertThat(result.total()).isEqualTo(11); } @Test @@ -147,7 +150,7 @@ void queryNotProvided() { .extract().body().as(SEARCH_RESULT_SEARCH_HITS); assertThat(result.hits()).extracting(GuideSearchHit::url) .containsExactlyInAnyOrder(GuideRef.urls(QuarkusIOSample.SearchServiceFilterDefinition.guides())); - assertThat(result.total()).isEqualTo(10); + assertThat(result.total()).isEqualTo(11); } @ParameterizedTest @@ -212,14 +215,16 @@ private static List relevance() { GuideRef.HIBERNATE_SEARCH_ORM_ELASTICSEARCH, 
GuideRef.HIBERNATE_REACTIVE_PANACHE, GuideRef.HIBERNATE_REACTIVE, + GuideRef.ALL_CONFIG, GuideRef.SPRING_DATA_JPA)), Arguments.of("reactive", GuideRef.urls( GuideRef.HIBERNATE_REACTIVE, GuideRef.HIBERNATE_REACTIVE_PANACHE, GuideRef.DUPLICATED_CONTEXT, // contains "Hibernate Reactive" + GuideRef.ALL_CONFIG, GuideRef.HIBERNATE_ORM_PANACHE, - GuideRef.STORK_REFERENCE, GuideRef.HIBERNATE_SEARCH_ORM_ELASTICSEARCH, + GuideRef.STORK_REFERENCE, GuideRef.HIBERNATE_ORM, GuideRef.SPRING_DATA_JPA)), Arguments.of("hiber", GuideRef.urls( @@ -231,14 +236,16 @@ private static List relevance() { GuideRef.HIBERNATE_ORM_PANACHE, GuideRef.HIBERNATE_ORM_PANACHE_KOTLIN, GuideRef.HIBERNATE_ORM, + GuideRef.ALL_CONFIG, GuideRef.DUPLICATED_CONTEXT, // contains "Hibernate Reactive" GuideRef.SPRING_DATA_JPA)), Arguments.of("jpa", GuideRef.urls( // TODO we'd probably want ORM before Panache? - GuideRef.HIBERNATE_ORM_PANACHE_KOTLIN, GuideRef.HIBERNATE_REACTIVE_PANACHE, // contains a reference to jpa-modelgen GuideRef.HIBERNATE_ORM_PANACHE, + GuideRef.HIBERNATE_ORM_PANACHE_KOTLIN, GuideRef.HIBERNATE_ORM, + GuideRef.ALL_CONFIG, GuideRef.SPRING_DATA_JPA)), Arguments.of("search", GuideRef.urls( GuideRef.HIBERNATE_SEARCH_ORM_ELASTICSEARCH)), @@ -260,8 +267,9 @@ void projections() { GuideRef.HIBERNATE_REACTIVE, GuideRef.HIBERNATE_REACTIVE_PANACHE, GuideRef.SPRING_DATA_JPA, - GuideRef.DUPLICATED_CONTEXT)); - assertThat(result.total()).isEqualTo(8); + GuideRef.DUPLICATED_CONTEXT, + GuideRef.ALL_CONFIG)); + assertThat(result.total()).isEqualTo(9); } @Test @@ -369,10 +377,10 @@ void highlight_content() { .extract().body().as(SEARCH_RESULT_SEARCH_HITS); AtomicInteger matches = new AtomicInteger(0); - assertThat(result.hits()).extracting(GuideSearchHit::content).hasSize(7) + assertThat(result.hits()).extracting(GuideSearchHit::content).hasSize(8) .allSatisfy(content -> assertThat(content).hasSize(1) .allSatisfy(hitsHaveCorrectWordHighlighted(matches, "orm", "highlighted-content"))); - 
assertThat(matches.get()).isEqualTo(8); + assertThat(matches.get()).isEqualTo(9); } @Test @@ -442,6 +450,34 @@ void searchForPhrase() { "Duplicated context, context locals, asynchronous processing and propagation"); } + @Test + void findEnvVariable() { + var result = given() + // the variable that we are "planning" to find is actually QUARKUS_DATASOURCE_JDBC_TRACING_IGNORE_FOR_TRACING + // But we'll be looking only for a part of it. + .queryParam("q", "QUARKUS_DATASOURCE_JDBC_TRACING_") + .when().get(GUIDES_SEARCH) + .then() + .statusCode(200) + .extract().body().as(SEARCH_RESULT_SEARCH_HITS); + assertThat(result.hits()).extracting(GuideSearchHit::content) + // empty set since we are not looking for an entire var name, and our autocomplete on text is only producing grams up to 10 chars + .containsOnly(Set.of()); + } + + @Test + void findConfigProperty() { + var result = given() + .queryParam("q", "quarkus.websocket.max-frame-size") + .when().get(GUIDES_SEARCH) + .then() + .statusCode(200) + .extract().body().as(SEARCH_RESULT_SEARCH_HITS); + assertThat(result.hits()).extracting(GuideSearchHit::content) + .containsOnly( + Set.of("Environment variable: QUARKUS_VIRTUAL_THREADS_ENABLED Show more boolean true WebSockets Client Type Default quarkus.websocket.max-frame-size")); + } + private static ThrowingConsumer hitsHaveCorrectWordHighlighted(AtomicInteger matches, String word, String cssClass) { return sentence -> { diff --git a/src/test/java/io/quarkus/search/app/testsupport/GuideRef.java b/src/test/java/io/quarkus/search/app/testsupport/GuideRef.java index 19ad16fb..a550cbe4 100644 --- a/src/test/java/io/quarkus/search/app/testsupport/GuideRef.java +++ b/src/test/java/io/quarkus/search/app/testsupport/GuideRef.java @@ -28,6 +28,7 @@ public record GuideRef(String name) { public static final GuideRef DEV_SERVICES_REFERENCE = create("dev-services"); public static final GuideRef RESTEASY_REACTIVE_REFERENCE = create("resteasy-reactive"); public static final GuideRef 
VERTX_REFERENCE = create("vertx-reference"); + public static final GuideRef ALL_CONFIG = create("all-config"); public static final GuideRef QUARKIVERSE_AMAZON_S3 = createQuarkiverse( "https://quarkiverse.github.io/quarkiverse-docs/quarkus-amazon-services/dev/amazon-s3.html"); // NOTE: when adding new constants here, don't forget to run the main() method in QuarkusIOFigure diff --git a/src/test/java/io/quarkus/search/app/testsupport/QuarkusIOSample.java b/src/test/java/io/quarkus/search/app/testsupport/QuarkusIOSample.java index f155ab12..3c090045 100644 --- a/src/test/java/io/quarkus/search/app/testsupport/QuarkusIOSample.java +++ b/src/test/java/io/quarkus/search/app/testsupport/QuarkusIOSample.java @@ -324,7 +324,8 @@ public static class SearchServiceFilterDefinition extends AbstractGuideRefSetFil GuideRef.SPRING_DATA_JPA, GuideRef.DUPLICATED_CONTEXT, GuideRef.SECURITY_OIDC_BEARER_TOKEN_AUTHENTICATION, - GuideRef.STORK_REFERENCE + GuideRef.STORK_REFERENCE, + GuideRef.ALL_CONFIG }; public static GuideRef[] guides() { diff --git a/src/test/resources/quarkusio-sample-cn.zip b/src/test/resources/quarkusio-sample-cn.zip index e324974c..3cd0f8fa 100644 Binary files a/src/test/resources/quarkusio-sample-cn.zip and b/src/test/resources/quarkusio-sample-cn.zip differ diff --git a/src/test/resources/quarkusio-sample-es.zip b/src/test/resources/quarkusio-sample-es.zip index 4b66a396..2c45b9f0 100644 Binary files a/src/test/resources/quarkusio-sample-es.zip and b/src/test/resources/quarkusio-sample-es.zip differ diff --git a/src/test/resources/quarkusio-sample-ja.zip b/src/test/resources/quarkusio-sample-ja.zip index 04cee9a0..a525d311 100644 Binary files a/src/test/resources/quarkusio-sample-ja.zip and b/src/test/resources/quarkusio-sample-ja.zip differ diff --git a/src/test/resources/quarkusio-sample-pt.zip b/src/test/resources/quarkusio-sample-pt.zip index 5053eabb..8562f6b3 100644 Binary files a/src/test/resources/quarkusio-sample-pt.zip and 
b/src/test/resources/quarkusio-sample-pt.zip differ diff --git a/src/test/resources/quarkusio-sample.zip b/src/test/resources/quarkusio-sample.zip index 03535c27..d7025882 100644 Binary files a/src/test/resources/quarkusio-sample.zip and b/src/test/resources/quarkusio-sample.zip differ