Skip to content

Commit

Permalink
Merge pull request #117 from marko-bekhta/i103-searching-for-config-p…
Browse files Browse the repository at this point in the history
…roperties

Add a config property/env variable specific index field
  • Loading branch information
yrodiere authored Jan 8, 2024
2 parents be39669 + 5edfff7 commit 7730eb6
Show file tree
Hide file tree
Showing 13 changed files with 107 additions and 21 deletions.
1 change: 1 addition & 0 deletions src/main/java/io/quarkus/search/app/SearchService.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ public SearchResult<GuideSearchHit> search(@RestQuery @DefaultValue(QuarkusVersi
.field(localizedField("title_autocomplete", language)).boost(1.0f)
.field(localizedField("summary_autocomplete", language)).boost(0.5f)
.field(localizedField("fullContent_autocomplete", language)).boost(0.1f)
.field(localizedField("fullContent_configProperties", language)).boost(2.0f)
.matching(q)
// See: https://github.com/elastic/elasticsearch/issues/39905#issuecomment-471578025
// while the issue is about stopwords the same problem is observed for synonyms on search-analyzer side.
Expand Down
1 change: 1 addition & 0 deletions src/main/java/io/quarkus/search/app/entity/Guide.java
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ public class Guide {

@I18nFullTextField(name = "fullContent", valueBridge = @ValueBridgeRef(type = InputProviderHtmlBodyTextBridge.class), highlightable = Highlightable.UNIFIED, analyzerPrefix = AnalysisConfigurer.DEFAULT, searchAnalyzerPrefix = AnalysisConfigurer.DEFAULT_SEARCH)
@I18nFullTextField(name = "fullContent_autocomplete", valueBridge = @ValueBridgeRef(type = InputProviderHtmlBodyTextBridge.class), analyzerPrefix = AnalysisConfigurer.AUTOCOMPLETE, searchAnalyzerPrefix = AnalysisConfigurer.DEFAULT_SEARCH)
@I18nFullTextField(name = "fullContent_configProperties", valueBridge = @ValueBridgeRef(type = InputProviderHtmlBodyTextBridge.class), analyzerPrefix = AnalysisConfigurer.CONFIG_PROPERTIES, searchAnalyzerPrefix = AnalysisConfigurer.CONFIG_PROPERTIES_SEARCH)
@Transient
@IndexingDependency(reindexOnUpdate = ReindexOnUpdate.NO)
public InputProvider htmlFullContentProvider;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ public class AnalysisConfigurer implements ElasticsearchAnalysisConfigurer {
public static final String DEFAULT = "basic_analyzer";
public static final String DEFAULT_SEARCH = DEFAULT + "_search";
public static final String AUTOCOMPLETE = "autocomplete";
public static final String CONFIG_PROPERTIES = "config_properties";
public static final String CONFIG_PROPERTIES_SEARCH = "config_properties_search";
public static final String SORT = "sort";

public static String defaultAnalyzer(Language language) {
Expand All @@ -36,6 +38,14 @@ public static String autocompleteAnalyzer(Language language) {
return localizedAnalyzer(AUTOCOMPLETE, language);
}

/**
 * Builds the language-specific name of the config-properties analyzer.
 *
 * @param language the language whose code is appended to the {@link #CONFIG_PROPERTIES} prefix
 * @return the analyzer name produced by {@link #localizedAnalyzer(String, Language)}
 */
public static String configPropertiesAnalyzer(Language language) {
    final String analyzerName = localizedAnalyzer(CONFIG_PROPERTIES, language);
    return analyzerName;
}

/**
 * Builds the language-specific name of the config-properties search analyzer.
 *
 * @param language the language whose code is appended to the {@link #CONFIG_PROPERTIES_SEARCH} prefix
 * @return the analyzer name produced by {@link #localizedAnalyzer(String, Language)}
 */
public static String configPropertiesSearchAnalyzer(Language language) {
    final String searchAnalyzerName = localizedAnalyzer(CONFIG_PROPERTIES_SEARCH, language);
    return searchAnalyzerName;
}

/**
 * Joins an analyzer prefix and a language code with an underscore.
 *
 * @param prefix the analyzer name prefix
 * @param language the language whose {@code code} becomes the suffix
 * @return a name of the form {@code <prefix>_<language code>}
 */
public static String localizedAnalyzer(String prefix, Language language) {
    return String.format("%s_%s", prefix, language.code);
}
Expand All @@ -44,6 +54,15 @@ public static String localizedAnalyzer(String prefix, Language language) {
public void configure(ElasticsearchAnalysisConfigurationContext context) {
// for en/es/pt we are going to use the same english configuration since guides are not translated
EnumSet<Language> englishLanguages = EnumSet.of(Language.ENGLISH, Language.PORTUGUESE, Language.SPANISH);

context.tokenizer("config_properties_tokenizer")
.type("simple_pattern")
.param("pattern", "(quarkus(\\.[a-z\\-\\\"]+)+)|(QUARKUS(_[A-Z_]+)+)");
context.tokenFilter("autocomplete_config_properties")
.type("edge_ngram")
.param("min_gram", 2)
.param("max_gram", 70);

for (Language language : englishLanguages) {
SharedFilters result = sharedFilters(context, language);

Expand All @@ -68,6 +87,13 @@ public void configure(ElasticsearchAnalysisConfigurationContext context) {
.tokenFilters(result.possessiveStemmer(), "lowercase", "asciifolding", result.stop(),
result.regularStemmer(), result.autocompleteEdgeNgram())
.charFilters("html_strip");

// config properties
context.analyzer(configPropertiesAnalyzer(language)).custom()
.tokenizer("config_properties_tokenizer")
.tokenFilters("autocomplete_config_properties");
context.analyzer(configPropertiesSearchAnalyzer(language)).custom()
.tokenizer("keyword");
}

// japanese
Expand Down Expand Up @@ -96,6 +122,12 @@ public void configure(ElasticsearchAnalysisConfigurationContext context) {
japanese.autocompleteEdgeNgram())
.charFilters("icu_normalizer", "html_strip");

context.analyzer(configPropertiesAnalyzer(Language.JAPANESE)).custom()
.tokenizer("config_properties_tokenizer")
.tokenFilters("autocomplete_config_properties");
context.analyzer(configPropertiesSearchAnalyzer(Language.JAPANESE)).custom()
.tokenizer("keyword");

// chinese
// https://www.elastic.co/guide/en/elasticsearch/plugins/current/_reimplementing_and_extending_the_analyzers.html
SharedFilters chinese = sharedFilters(context, Language.CHINESE);
Expand Down Expand Up @@ -123,6 +155,12 @@ public void configure(ElasticsearchAnalysisConfigurationContext context) {
chinese.regularStemmer(), chinese.autocompleteEdgeNgram())
.charFilters("html_strip");

context.analyzer(configPropertiesAnalyzer(Language.CHINESE)).custom()
.tokenizer("config_properties_tokenizer")
.tokenFilters("autocomplete_config_properties");
context.analyzer(configPropertiesSearchAnalyzer(Language.CHINESE)).custom()
.tokenizer("keyword");

context.normalizer(SORT).custom()
.tokenFilters("lowercase");
}
Expand All @@ -144,7 +182,7 @@ private static SharedFilters sharedFilters(ElasticsearchAnalysisConfigurationCon
.param("language", "possessive_english");
context.tokenFilter(autocompleteEdgeNgram)
.type("edge_ngram")
.param("min_gram", 1)
.param("min_gram", 2)
.param("max_gram", 10);
context.tokenFilter(synonymsGraphFilter)
// See https://www.elastic.co/guide/en/elasticsearch/reference/8.11/analysis-synonym-graph-tokenfilter.html#analysis-synonym-graph-tokenfilter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,16 @@ public String toIndexedValue(InputProvider provider, ValueBridgeToIndexedValueCo
try (var in = provider.open()) {
Element body = Jsoup.parse(in, StandardCharsets.UTF_8.name(), "/").body();
// Content div has two grid columns: actual content and TOC. There's not much use of the TOC, we want the content only:
Element content = body.selectFirst(".content .grid__item");
Element content = body.selectFirst(".guide");
if (content != null) {
// Means we've found a guide content column. hence let's use that to have only real content:
// We may be looking at a guide with/without a TOC;
// if it is one with the TOC, there's not much point in indexing the TOC itself, hence we don't include it:

// column (grid items) are not present in a guide like all-config
Element guideColumn = content.selectFirst(".grid__item");
if (guideColumn != null) {
content = guideColumn;
}
return encode(content);
} else {
// we might be looking at a quarkiverse guide; in such case:
Expand Down
3 changes: 2 additions & 1 deletion src/main/resources/indexes/mapping-template.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
{
"_source": {
"excludes": [
"fullContent_autocomplete_*"
"fullContent_autocomplete_*",
"fullContent_configProperties_*"
]
}
}
Expand Down
68 changes: 52 additions & 16 deletions src/test/java/io/quarkus/search/app/SearchServiceTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import java.time.Duration;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
Expand Down Expand Up @@ -90,8 +91,9 @@ void queryMatchingFullTerm() {
GuideRef.HIBERNATE_SEARCH_ORM_ELASTICSEARCH,
GuideRef.HIBERNATE_REACTIVE,
GuideRef.HIBERNATE_REACTIVE_PANACHE,
GuideRef.SPRING_DATA_JPA));
assertThat(result.total()).isEqualTo(7);
GuideRef.SPRING_DATA_JPA,
GuideRef.ALL_CONFIG));
assertThat(result.total()).isEqualTo(8);
}

@Test
Expand All @@ -102,8 +104,8 @@ void queryMatchingIncludedAdoc() {
// (or... the full rendered HTML).
var result = search("quarkus.hibernate-orm.validate-in-dev-mode");
assertThat(result.hits()).extracting(GuideSearchHit::url).containsExactlyInAnyOrder(GuideRef.urls(
GuideRef.HIBERNATE_ORM, GuideRef.HIBERNATE_REACTIVE));
assertThat(result.total()).isEqualTo(2);
GuideRef.HIBERNATE_ORM, GuideRef.HIBERNATE_REACTIVE, GuideRef.ALL_CONFIG));
assertThat(result.total()).isEqualTo(3);
}

@Test
Expand All @@ -118,25 +120,26 @@ void queryMatchingPrefixTerm() {
GuideRef.HIBERNATE_REACTIVE,
GuideRef.HIBERNATE_REACTIVE_PANACHE,
GuideRef.SPRING_DATA_JPA,
GuideRef.DUPLICATED_CONTEXT));
assertThat(result.total()).isEqualTo(8);
GuideRef.DUPLICATED_CONTEXT,
GuideRef.ALL_CONFIG));
assertThat(result.total()).isEqualTo(9);
}

@Test
void queryMatchingTwoTerms() {
var result = search("orm elasticsearch");
// We expect an AND by default
assertThat(result.hits()).extracting(GuideSearchHit::url)
.containsExactlyInAnyOrder(GuideRef.urls(GuideRef.HIBERNATE_SEARCH_ORM_ELASTICSEARCH));
assertThat(result.total()).isEqualTo(1);
.containsExactlyInAnyOrder(GuideRef.urls(GuideRef.HIBERNATE_SEARCH_ORM_ELASTICSEARCH, GuideRef.ALL_CONFIG));
assertThat(result.total()).isEqualTo(2);
}

@Test
void queryEmptyString() {
var result = search("");
assertThat(result.hits()).extracting(GuideSearchHit::url)
.containsExactlyInAnyOrder(GuideRef.urls(QuarkusIOSample.SearchServiceFilterDefinition.guides()));
assertThat(result.total()).isEqualTo(10);
assertThat(result.total()).isEqualTo(11);
}

@Test
Expand All @@ -147,7 +150,7 @@ void queryNotProvided() {
.extract().body().as(SEARCH_RESULT_SEARCH_HITS);
assertThat(result.hits()).extracting(GuideSearchHit::url)
.containsExactlyInAnyOrder(GuideRef.urls(QuarkusIOSample.SearchServiceFilterDefinition.guides()));
assertThat(result.total()).isEqualTo(10);
assertThat(result.total()).isEqualTo(11);
}

@ParameterizedTest
Expand Down Expand Up @@ -212,14 +215,16 @@ private static List<Arguments> relevance() {
GuideRef.HIBERNATE_SEARCH_ORM_ELASTICSEARCH,
GuideRef.HIBERNATE_REACTIVE_PANACHE,
GuideRef.HIBERNATE_REACTIVE,
GuideRef.ALL_CONFIG,
GuideRef.SPRING_DATA_JPA)),
Arguments.of("reactive", GuideRef.urls(
GuideRef.HIBERNATE_REACTIVE,
GuideRef.HIBERNATE_REACTIVE_PANACHE,
GuideRef.DUPLICATED_CONTEXT, // contains "Hibernate Reactive"
GuideRef.ALL_CONFIG,
GuideRef.HIBERNATE_ORM_PANACHE,
GuideRef.STORK_REFERENCE,
GuideRef.HIBERNATE_SEARCH_ORM_ELASTICSEARCH,
GuideRef.STORK_REFERENCE,
GuideRef.HIBERNATE_ORM,
GuideRef.SPRING_DATA_JPA)),
Arguments.of("hiber", GuideRef.urls(
Expand All @@ -231,14 +236,16 @@ private static List<Arguments> relevance() {
GuideRef.HIBERNATE_ORM_PANACHE,
GuideRef.HIBERNATE_ORM_PANACHE_KOTLIN,
GuideRef.HIBERNATE_ORM,
GuideRef.ALL_CONFIG,
GuideRef.DUPLICATED_CONTEXT, // contains "Hibernate Reactive"
GuideRef.SPRING_DATA_JPA)),
Arguments.of("jpa", GuideRef.urls(
// TODO we'd probably want ORM before Panache?
GuideRef.HIBERNATE_ORM_PANACHE_KOTLIN,
GuideRef.HIBERNATE_REACTIVE_PANACHE, // contains a reference to jpa-modelgen
GuideRef.HIBERNATE_ORM_PANACHE,
GuideRef.HIBERNATE_ORM_PANACHE_KOTLIN,
GuideRef.HIBERNATE_ORM,
GuideRef.ALL_CONFIG,
GuideRef.SPRING_DATA_JPA)),
Arguments.of("search", GuideRef.urls(
GuideRef.HIBERNATE_SEARCH_ORM_ELASTICSEARCH)),
Expand All @@ -260,8 +267,9 @@ void projections() {
GuideRef.HIBERNATE_REACTIVE,
GuideRef.HIBERNATE_REACTIVE_PANACHE,
GuideRef.SPRING_DATA_JPA,
GuideRef.DUPLICATED_CONTEXT));
assertThat(result.total()).isEqualTo(8);
GuideRef.DUPLICATED_CONTEXT,
GuideRef.ALL_CONFIG));
assertThat(result.total()).isEqualTo(9);
}

@Test
Expand Down Expand Up @@ -369,10 +377,10 @@ void highlight_content() {
.extract().body().as(SEARCH_RESULT_SEARCH_HITS);

AtomicInteger matches = new AtomicInteger(0);
assertThat(result.hits()).extracting(GuideSearchHit::content).hasSize(7)
assertThat(result.hits()).extracting(GuideSearchHit::content).hasSize(8)
.allSatisfy(content -> assertThat(content).hasSize(1)
.allSatisfy(hitsHaveCorrectWordHighlighted(matches, "orm", "highlighted-content")));
assertThat(matches.get()).isEqualTo(8);
assertThat(matches.get()).isEqualTo(9);
}

@Test
Expand Down Expand Up @@ -442,6 +450,34 @@ void searchForPhrase() {
"Duplicated context, context locals, <span class=\"highlighted\">asynchronous</span> <span class=\"highlighted\">processing</span> and <span class=\"highlighted\">propagation</span>");
}

/**
 * Searches for a leading part of an environment variable name and checks that
 * every returned hit carries an empty set of content highlights.
 */
@Test
void findEnvVariable() {
    // The variable we are "planning" to find is actually QUARKUS_DATASOURCE_JDBC_TRACING_IGNORE_FOR_TRACING,
    // but the query only contains a leading part of it.
    String partialEnvVariable = "QUARKUS_DATASOURCE_JDBC_TRACING_";

    var response = given()
            .queryParam("q", partialEnvVariable)
            .when().get(GUIDES_SEARCH)
            .then()
            .statusCode(200);
    var result = response.extract().body().as(SEARCH_RESULT_SEARCH_HITS);

    // An empty set is expected: the query is not an entire variable name,
    // and the autocomplete on text only produces grams up to 10 chars.
    assertThat(result.hits())
            .extracting(GuideSearchHit::content)
            .containsOnly(Set.of());
}

/**
 * Searches for a full configuration property name and checks the exact
 * highlighted content snippet of the returned hits.
 */
@Test
void findConfigProperty() {
    String configProperty = "quarkus.websocket.max-frame-size";
    // The only highlighted snippet each hit is expected to contain.
    String expectedHighlight = "Environment variable: QUARKUS_VIRTUAL_THREADS_ENABLED Show more boolean true WebSockets Client Type Default <span class=\"highlighted\">quarkus.websocket.max</span>-<span class=\"highlighted\">frame</span>-<span class=\"highlighted\">size</span>";

    var response = given()
            .queryParam("q", configProperty)
            .when().get(GUIDES_SEARCH)
            .then()
            .statusCode(200);
    var result = response.extract().body().as(SEARCH_RESULT_SEARCH_HITS);

    assertThat(result.hits())
            .extracting(GuideSearchHit::content)
            .containsOnly(Set.of(expectedHighlight));
}

private static ThrowingConsumer<String> hitsHaveCorrectWordHighlighted(AtomicInteger matches, String word,
String cssClass) {
return sentence -> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ public record GuideRef(String name) {
public static final GuideRef DEV_SERVICES_REFERENCE = create("dev-services");
public static final GuideRef RESTEASY_REACTIVE_REFERENCE = create("resteasy-reactive");
public static final GuideRef VERTX_REFERENCE = create("vertx-reference");
public static final GuideRef ALL_CONFIG = create("all-config");
public static final GuideRef QUARKIVERSE_AMAZON_S3 = createQuarkiverse(
"https://quarkiverse.github.io/quarkiverse-docs/quarkus-amazon-services/dev/amazon-s3.html");
// NOTE: when adding new constants here, don't forget to run the main() method in QuarkusIOFigure
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,8 @@ public static class SearchServiceFilterDefinition extends AbstractGuideRefSetFil
GuideRef.SPRING_DATA_JPA,
GuideRef.DUPLICATED_CONTEXT,
GuideRef.SECURITY_OIDC_BEARER_TOKEN_AUTHENTICATION,
GuideRef.STORK_REFERENCE
GuideRef.STORK_REFERENCE,
GuideRef.ALL_CONFIG
};

public static GuideRef[] guides() {
Expand Down
Binary file modified src/test/resources/quarkusio-sample-cn.zip
Binary file not shown.
Binary file modified src/test/resources/quarkusio-sample-es.zip
Binary file not shown.
Binary file modified src/test/resources/quarkusio-sample-ja.zip
Binary file not shown.
Binary file modified src/test/resources/quarkusio-sample-pt.zip
Binary file not shown.
Binary file modified src/test/resources/quarkusio-sample.zip
Binary file not shown.

0 comments on commit 7730eb6

Please sign in to comment.