From 772878cd3015958c439a9d9af8b7e821d126744f Mon Sep 17 00:00:00 2001 From: Aurelien FOUCRET Date: Thu, 29 Aug 2024 14:28:46 +0200 Subject: [PATCH] Ensure _termStats is supported with function_score --- .../191_term_statistics_function_score.yml | 680 ++++++++++++++++++ .../search/function/FunctionScoreQuery.java | 12 + .../search/function/ScriptScoreFunction.java | 19 + 3 files changed, 711 insertions(+) create mode 100644 modules/lang-painless/src/yamlRestTest/resources/rest-api-spec/test/painless/191_term_statistics_function_score.yml diff --git a/modules/lang-painless/src/yamlRestTest/resources/rest-api-spec/test/painless/191_term_statistics_function_score.yml b/modules/lang-painless/src/yamlRestTest/resources/rest-api-spec/test/painless/191_term_statistics_function_score.yml new file mode 100644 index 0000000000000..de4d6530f4a92 --- /dev/null +++ b/modules/lang-painless/src/yamlRestTest/resources/rest-api-spec/test/painless/191_term_statistics_function_score.yml @@ -0,0 +1,680 @@ +setup: + - requires: + cluster_features: ["script.term_stats"] + reason: "support for term stats has been added in 8.16" + + - do: + indices.create: + index: test-index + body: + settings: + number_of_shards: "2" + mappings: + properties: + title: + type: text + genre: + type: text + fields: + keyword: + type: keyword + + - do: + index: { refresh: true, index: test-index, id: "1", routing: 0, body: {"title": "Star wars", "genre": "Sci-fi"} } + - do: + index: { refresh: true, index: test-index, id: "2", routing: 1, body: {"title": "Star trek", "genre": "Sci-fi"} } + - do: + index: { refresh: true, index: test-index, id: "3", routing: 1, body: {"title": "Rambo", "genre": "War movie"} } + - do: + index: { refresh: true, index: test-index, id: "4", routing: 1, body: {"title": "Rambo II", "genre": "War movie"} } + +--- +"match query: uniqueTermsCount without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: 
replace + query: { match: { "title": "Star wars" } } + script_score: + script: + source: "return _termStats.uniqueTermsCount()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 2 } + +--- +"match query: uniqueTermsCount with DFS": + - do: + search: + search_type: dfs_query_then_fetch + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { match: { "title": "Star wars" } } + script_score: + script: + source: "return _termStats.uniqueTermsCount()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 2 } + +--- +"match query: matchedTermsCount without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { match: { "title": "Star wars" } } + script_score: + script: + source: "return _termStats.matchedTermsCount()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 1 } + +--- +"match query: matchedTermsCount with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { match: { "title": "Star wars" } } + script_score: + script: + source: "return _termStats.matchedTermsCount()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 1 } + +--- +"match query: docFreq min without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { match: { "title": "Star wars" } } + script_score: + script: + source: "return _termStats.docFreq().getMin()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 0 } + +--- +"match query: docFreq min with DFS": + - do: + search: + rest_total_hits_as_int: true + 
search_type: dfs_query_then_fetch + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { match: { "title": "Star wars" } } + script_score: + script: + source: "return _termStats.docFreq().getMin()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"match query: docFreq max without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { match: { "title": "Star wars" } } + script_score: + script: + source: "return _termStats.docFreq().getMax()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"match query: docFreq max with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { match: { "title": "Star wars" } } + script_score: + script: + source: "return _termStats.docFreq().getMax()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 2 } + +--- +"match query: totalTermFreq sum without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { match: { "title": "Star wars" } } + script_score: + script: + source: "return _termStats.totalTermFreq().getSum()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 1 } + +--- +"match query: totalTermFreq sum with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { match: { "title": "Star wars" } } + script_score: + script: + source: "return _termStats.totalTermFreq().getSum()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 3 } + - match: { 
hits.hits.1._score: 3 } + +--- +"match query: termFreq sum without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { match: { "title": "Star wars" } } + script_score: + script: + source: "return _termStats.termFreq().getSum()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 1 } + +--- +"match query: termFreq sum with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { match: { "title": "Star wars" } } + script_score: + script: + source: "return _termStats.termFreq().getSum()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 1 } + +--- +"match query: termPositions avg without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { match: { "title": "Star wars" } } + script_score: + script: + source: "return _termStats.termPositions().getAverage()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1.5 } + - match: { hits.hits.1._score: 1 } + +--- +"match query: termPositions avg with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { match: { "title": "Star wars" } } + script_score: + script: + source: "return _termStats.termPositions().getAverage()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1.5 } + - match: { hits.hits.1._score: 1 } + +--- +"term query: uniqueTermsCount without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { term: { "genre.keyword": "Sci-fi" } } + script_score: + script: + source: "return 
_termStats.uniqueTermsCount()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"term query: uniqueTermsCount with DFS": + - do: + search: + search_type: dfs_query_then_fetch + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { term: { "genre.keyword": "Sci-fi" } } + script_score: + script: + source: "return _termStats.uniqueTermsCount()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"term query: matchedTermsCount without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { term: { "genre.keyword": "Sci-fi" } } + script_score: + script: + source: "return _termStats.matchedTermsCount()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"term query: matchedTermsCount with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { term: { "genre.keyword": "Sci-fi" } } + script_score: + script: + source: "return _termStats.matchedTermsCount()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"term query: docFreq min without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { term: { "genre.keyword": "Sci-fi" } } + script_score: + script: + source: "return _termStats.docFreq().getMin()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"term query: docFreq min with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + 
function_score: + boost_mode: replace + query: { term: { "genre.keyword": "Sci-fi" } } + script_score: + script: + source: "return _termStats.docFreq().getMin()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 2 } + +--- +"term query: docFreq max without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { term: { "genre.keyword": "Sci-fi" } } + script_score: + script: + source: "return _termStats.docFreq().getMax()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"term query: docFreq max with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { term: { "genre.keyword": "Sci-fi" } } + script_score: + script: + source: "return _termStats.docFreq().getMax()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 2 } + +--- +"term query: totalTermFreq sum without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { term: { "genre.keyword": "Sci-fi" } } + script_score: + script: + source: "return _termStats.totalTermFreq().getSum()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"term query: totalTermFreq sum with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { term: { "genre.keyword": "Sci-fi" } } + script_score: + script: + source: "return _termStats.totalTermFreq().getSum()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 2 } + +--- +"term query: termFreq sum without 
DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { term: { "genre.keyword": "Sci-fi" } } + script_score: + script: + source: "return _termStats.termFreq().getSum()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"term query: termFreq sum with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { term: { "genre.keyword": "Sci-fi" } } + script_score: + script: + source: "return _termStats.termFreq().getSum()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"term query: termPositions avg without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { term: { "genre.keyword": "Sci-fi" } } + script_score: + script: + source: "return _termStats.termPositions().getAverage()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 0 } + - match: { hits.hits.1._score: 0 } + +--- +"term query: termPositions avg with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + function_score: + boost_mode: replace + query: { term: { "genre.keyword": "Sci-fi" } } + script_score: + script: + source: "return _termStats.termPositions().getAverage()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 0 } + - match: { hits.hits.1._score: 0 } + +--- +"Complex bool query: uniqueTermsCount": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: + bool: + must: + match: { "title": "star wars" } + should: + term: { "genre.keyword": "Sci-fi" } + filter: + match: { "genre" : "sci"} + must_not: + term: 
{ "genre.keyword": "War" } + script_score: + script: + source: "return _termStats.uniqueTermsCount()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 4 } + - match: { hits.hits.1._score: 4 } + + +--- +"match_all query: uniqueTermsCount": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: + match_all: {} + script_score: + script: + source: "return _termStats.uniqueTermsCount()" + - match: { hits.total: 4 } + - match: { hits.hits.0._score: 0 } + - match: { hits.hits.1._score: 0 } + - match: { hits.hits.2._score: 0 } + - match: { hits.hits.3._score: 0 } + +--- +"match_all query: docFreq": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: + match_all: {} + script_score: + script: + source: "return _termStats.docFreq().getMax()" + - match: { hits.total: 4 } + - match: { hits.hits.0._score: 0 } + - match: { hits.hits.1._score: 0 } + - match: { hits.hits.2._score: 0 } + - match: { hits.hits.3._score: 0 } + +--- +"match_all query: totalTermFreq": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: + match_all: {} + script_score: + script: + source: "return _termStats.totalTermFreq().getSum()" + - match: { hits.total: 4 } + - match: { hits.hits.0._score: 0 } + - match: { hits.hits.1._score: 0 } + - match: { hits.hits.2._score: 0 } + - match: { hits.hits.3._score: 0 } + +--- +"match_all query: termFreq": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: + match_all: {} + script_score: + script: + source: "return _termStats.termFreq().getMax()" + - match: { hits.total: 4 } + - match: { hits.hits.0._score: 0 } + - match: { hits.hits.1._score: 0 } + - match: { hits.hits.2._score: 0 } + - match: { hits.hits.3._score: 0 } + +--- 
+"match_all query: termPositions": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + function_score: + boost_mode: replace + query: + match_all: {} + script_score: + script: + source: "return _termStats.termPositions().getSum()" + - match: { hits.total: 4 } + - match: { hits.hits.0._score: 0 } + - match: { hits.hits.1._score: 0 } + - match: { hits.hits.2._score: 0 } + - match: { hits.hits.3._score: 0 } diff --git a/server/src/main/java/org/elasticsearch/common/lucene/search/function/FunctionScoreQuery.java b/server/src/main/java/org/elasticsearch/common/lucene/search/function/FunctionScoreQuery.java index aed11297d4285..c47d0154fe048 100644 --- a/server/src/main/java/org/elasticsearch/common/lucene/search/function/FunctionScoreQuery.java +++ b/server/src/main/java/org/elasticsearch/common/lucene/search/function/FunctionScoreQuery.java @@ -10,6 +10,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.FilterScorer; @@ -25,14 +26,17 @@ import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.Writeable; import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.script.ScriptTermStats; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Objects; +import java.util.Set; /** * A query that allows for a pluggable boost function / filter. 
If it matches @@ -241,6 +245,14 @@ public Weight createWeight(IndexSearcher searcher, org.apache.lucene.search.Scor 1f ); } + if (functions[i] instanceof ScriptScoreFunction scriptScoreFunction && scriptScoreFunction.needsTermStats()) { + subQueryScoreMode = org.apache.lucene.search.ScoreMode.COMPLETE; + // We collect the different terms used in the child query. + final Set<Term> terms = new HashSet<>(); + this.visit(QueryVisitor.termCollector(terms)); + scriptScoreFunction.setTermStatsFactory((ctx, docIdSupplier) -> new ScriptTermStats(searcher, ctx, docIdSupplier, terms)); + + } } Weight subQueryWeight = subQuery.createWeight(searcher, subQueryScoreMode, boost); return new CustomBoostFactorWeight(this, subQueryWeight, filterWeights, subQueryScoreMode.needsScores()); diff --git a/server/src/main/java/org/elasticsearch/common/lucene/search/function/ScriptScoreFunction.java b/server/src/main/java/org/elasticsearch/common/lucene/search/function/ScriptScoreFunction.java index 6b8a75337b8ee..70233ed8ead6d 100644 --- a/server/src/main/java/org/elasticsearch/common/lucene/search/function/ScriptScoreFunction.java +++ b/server/src/main/java/org/elasticsearch/common/lucene/search/function/ScriptScoreFunction.java @@ -15,10 +15,13 @@ import org.elasticsearch.script.ExplainableScoreScript; import org.elasticsearch.script.ScoreScript; import org.elasticsearch.script.Script; +import org.elasticsearch.script.ScriptTermStats; import org.elasticsearch.search.lookup.SearchLookup; import java.io.IOException; import java.util.Objects; +import java.util.function.BiFunction; +import java.util.function.IntSupplier; public class ScriptScoreFunction extends ScoreFunction { @@ -45,6 +48,8 @@ public float score() { private final int shardId; private final String indexName; + private BiFunction<LeafReaderContext, IntSupplier, ScriptTermStats> termStatsFactory; + public ScriptScoreFunction(Script sScript, ScoreScript.LeafFactory script, SearchLookup lookup, String indexName, int shardId) { super(CombineFunction.REPLACE); this.sScript = sScript; 
@@ -61,6 +66,12 @@ public LeafScoreFunction getLeafScoreFunction(LeafReaderContext ctx) throws IOEx leafScript.setScorer(scorer); leafScript._setIndexName(indexName); leafScript._setShard(shardId); + + if (script.needs_termStats()) { + assert termStatsFactory != null; + leafScript._setTermStats(termStatsFactory.apply(ctx, scorer::docID)); + } + return new LeafScoreFunction() { private double score(int docId, float subQueryScore, ScoreScript.ExplanationHolder holder) throws IOException { @@ -111,6 +122,14 @@ public boolean needsScores() { return script.needs_score(); } + public boolean needsTermStats() { + return script.needs_termStats(); + } + + public void setTermStatsFactory(BiFunction<LeafReaderContext, IntSupplier, ScriptTermStats> termStatsFactory) { + this.termStatsFactory = termStatsFactory; + } + @Override public String toString() { return "script" + sScript.toString();