From 3f2842c3e76229ac1e65e3e9729189d6b35a84b1 Mon Sep 17 00:00:00 2001 From: Aurelien FOUCRET Date: Thu, 18 Jul 2024 12:16:26 +0200 Subject: [PATCH] Adding tests. --- .../org.elasticsearch.script.score.txt | 6 - .../190_term_statistics_script_score.yml | 499 ++++++++++++++++++ .../elasticsearch/script/TermStatsReader.java | 16 +- 3 files changed, 505 insertions(+), 16 deletions(-) create mode 100644 modules/lang-painless/src/yamlRestTest/resources/rest-api-spec/test/painless/190_term_statistics_script_score.yml diff --git a/modules/lang-painless/src/main/resources/org/elasticsearch/painless/org.elasticsearch.script.score.txt b/modules/lang-painless/src/main/resources/org/elasticsearch/painless/org.elasticsearch.script.score.txt index 380b100dd3545..ceb5e41938475 100644 --- a/modules/lang-painless/src/main/resources/org/elasticsearch/painless/org.elasticsearch.script.score.txt +++ b/modules/lang-painless/src/main/resources/org/elasticsearch/painless/org.elasticsearch.script.score.txt @@ -13,13 +13,7 @@ class org.elasticsearch.script.ScoreScript @no_import { class org.elasticsearch.script.ScoreScript$Factory @no_import { } -class org.apache.lucene.index.Term { - String field() - String text() -} - class org.elasticsearch.script.TermStatsReader { - Set terms() long uniqueTermsCount() long matchedTermsCount() DoubleSummaryStatistics docFreq() diff --git a/modules/lang-painless/src/yamlRestTest/resources/rest-api-spec/test/painless/190_term_statistics_script_score.yml b/modules/lang-painless/src/yamlRestTest/resources/rest-api-spec/test/painless/190_term_statistics_script_score.yml new file mode 100644 index 0000000000000..a6f2d2da49acb --- /dev/null +++ b/modules/lang-painless/src/yamlRestTest/resources/rest-api-spec/test/painless/190_term_statistics_script_score.yml @@ -0,0 +1,499 @@ +setup: + - requires: + cluster_features: ["gte_v8.16.0"] + reason: "support for term statistics in script_score added in 8.16" + + - do: + indices.create: + index: test-index + body: + settings: + 
number_of_shards: "2" + mappings: + properties: + title: + type: text + genre: + type: text + fields: + keyword: + type: keyword + + - do: + index: { refresh: true, index: test-index, id: "1", routing: 0, body: {"title": "Star wars", "genre": "Sci-fi"} } + - do: + index: { refresh: true, index: test-index, id: "2", routing: 1, body: {"title": "Star trek", "genre": "Sci-fi"} } + - do: + index: { refresh: true, index: test-index, id: "3", routing: 1, body: {"title": "Rambo", "genre": "War movie"} } + - do: + index: { refresh: true, index: test-index, id: "4", routing: 1, body: {"title": "Rambo II", "genre": "War movie"} } + +--- +"Match query: uniqueTermsCount without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + script_score: + query: { match: { "title": "Star wars" } } + script: + source: "return _termStatistics.uniqueTermsCount()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 2 } + +--- +"Match query: uniqueTermsCount with DFS": + - do: + search: + search_type: dfs_query_then_fetch + rest_total_hits_as_int: true + index: test-index + body: + query: + script_score: + query: { match: { "title": "Star wars" } } + script: + source: "return _termStatistics.uniqueTermsCount()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 2 } + +--- +"Match query: matchedTermsCount without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + script_score: + query: { match: { "title": "Star wars" } } + script: + source: "return _termStatistics.matchedTermsCount()" + - match: { hits.total: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._id: "2" } + - match: { hits.hits.1._score: 1 } + +--- +"Match query: matchedTermsCount with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: 
+ query: + script_score: + query: { match: { "title": "Star wars" } } + script: + source: "return _termStatistics.matchedTermsCount()" + - match: { hits.total: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._id: "2" } + - match: { hits.hits.1._score: 1 } + +--- +"Match query: docFreq min without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + script_score: + query: { match: { "title": "Star wars" } } + script: + source: "return _termStatistics.docFreq().getMin()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 0 } + +--- +"Match query: docFreq min with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + script_score: + query: { match: { "title": "Star wars" } } + script: + source: "return _termStatistics.docFreq().getMin()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"Match query: docFreq max without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + script_score: + query: { match: { "title": "Star wars" } } + script: + source: "return _termStatistics.docFreq().getMax()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"Match query: docFreq max with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + script_score: + query: { match: { "title": "Star wars" } } + script: + source: "return _termStatistics.docFreq().getMax()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 2 } + +--- +"Match query: totalTermFreq sum without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + script_score: + query: { match: { 
"title": "Star wars" } } + script: + source: "return _termStatistics.totalTermFreq().getSum()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 1 } + +--- +"Match query: totalTermFreq sum with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + script_score: + query: { match: { "title": "Star wars" } } + script: + source: "return _termStatistics.totalTermFreq().getSum()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 3 } + - match: { hits.hits.1._score: 3 } + +--- +"Match query: termFreq sum without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + script_score: + query: { match: { "title": "Star wars" } } + script: + source: "return _termStatistics.termFreq().getSum()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 1 } + +--- +"Match query: termFreq sum with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + script_score: + query: { match: { "title": "Star wars" } } + script: + source: "return _termStatistics.termFreq().getSum()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 1 } + +--- +"Match query: termPositions avg without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + script_score: + query: { match: { "title": "Star wars" } } + script: + source: "return _termStatistics.termPositions().getAverage()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1.5 } + - match: { hits.hits.1._score: 1 } + +--- +"Match query: termPositions avg with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + script_score: + query: { match: { "title": "Star wars" } } + script: + source: 
"return _termStatistics.termPositions().getAverage()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1.5 } + - match: { hits.hits.1._score: 1 } + +--- +"Term query: uniqueTermsCount without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + script_score: + query: { term: { "genre.keyword": "Sci-fi" } } + script: + source: "return _termStatistics.uniqueTermsCount()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"Term query: uniqueTermsCount with DFS": + - do: + search: + search_type: dfs_query_then_fetch + rest_total_hits_as_int: true + index: test-index + body: + query: + script_score: + query: { term: { "genre.keyword": "Sci-fi" } } + script: + source: "return _termStatistics.uniqueTermsCount()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"Term query: matchedTermsCount without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + script_score: + query: { term: { "genre.keyword": "Sci-fi" } } + script: + source: "return _termStatistics.matchedTermsCount()" + - match: { hits.total: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._id: "2" } + - match: { hits.hits.1._score: 1 } + +--- +"Term query: matchedTermsCount with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + script_score: + query: { term: { "genre.keyword": "Sci-fi" } } + script: + source: "return _termStatistics.matchedTermsCount()" + - match: { hits.total: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._id: "2" } + - match: { hits.hits.1._score: 1 } + +--- +"Term query: docFreq min without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + 
script_score: + query: { term: { "genre.keyword": "Sci-fi" } } + script: + source: "return _termStatistics.docFreq().getMin()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"Term query: docFreq min with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + script_score: + query: { term: { "genre.keyword": "Sci-fi" } } + script: + source: "return _termStatistics.docFreq().getMin()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 2 } + +--- +"Term query: docFreq max without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + script_score: + query: { term: { "genre.keyword": "Sci-fi" } } + script: + source: "return _termStatistics.docFreq().getMax()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"Term query: docFreq max with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + script_score: + query: { term: { "genre.keyword": "Sci-fi" } } + script: + source: "return _termStatistics.docFreq().getMax()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 2 } + +--- +"Term query: totalTermFreq sum without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + script_score: + query: { term: { "genre.keyword": "Sci-fi" } } + script: + source: "return _termStatistics.totalTermFreq().getSum()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"Term query: totalTermFreq sum with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + script_score: + query: { term: { "genre.keyword": 
"Sci-fi" } } + script: + source: "return _termStatistics.totalTermFreq().getSum()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 2 } + - match: { hits.hits.1._score: 2 } + +--- +"Term query: termFreq sum without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + script_score: + query: { term: { "genre.keyword": "Sci-fi" } } + script: + source: "return _termStatistics.termFreq().getSum()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"Term query: termFreq sum with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + script_score: + query: { term: { "genre.keyword": "Sci-fi" } } + script: + source: "return _termStatistics.termFreq().getSum()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 1 } + - match: { hits.hits.1._score: 1 } + +--- +"Term query: termPositions avg without DFS": + - do: + search: + rest_total_hits_as_int: true + index: test-index + body: + query: + script_score: + query: { term: { "genre.keyword": "Sci-fi" } } + script: + source: "return _termStatistics.termPositions().getAverage()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 0 } + - match: { hits.hits.1._score: 0 } + +--- +"Term query: termPositions avg with DFS": + - do: + search: + rest_total_hits_as_int: true + search_type: dfs_query_then_fetch + index: test-index + body: + query: + script_score: + query: { term: { "genre.keyword": "Sci-fi" } } + script: + source: "return _termStatistics.termPositions().getAverage()" + - match: { hits.total: 2 } + - match: { hits.hits.0._score: 0 } + - match: { hits.hits.1._score: 0 } diff --git a/server/src/main/java/org/elasticsearch/script/TermStatsReader.java b/server/src/main/java/org/elasticsearch/script/TermStatsReader.java index bfe882352c254..6d0fc085c036e 100644 --- 
a/server/src/main/java/org/elasticsearch/script/TermStatsReader.java +++ b/server/src/main/java/org/elasticsearch/script/TermStatsReader.java @@ -40,20 +40,16 @@ public TermStatsReader(IndexSearcher searcher, Supplier<Integer> docIdSupplier, this.terms = terms; } - public Set<Term> terms() { - return terms; - } - public long uniqueTermsCount() { return terms.size(); } public long matchedTermsCount() { - return terms().stream().filter(term -> { + return terms.stream().filter(term -> { try { PostingsEnum postingsEnum = postings(term); int docId = docIdSupplier.get(); - return postingsEnum != null && postingsEnum.advance(docId) == docId; + return postingsEnum != null && postingsEnum.advance(docId) == docId && postingsEnum.freq() > 0; } catch (IOException e) { throw new UncheckedIOException(e); } @@ -62,7 +58,7 @@ public long matchedTermsCount() { public DoubleSummaryStatistics docFreq() { DoubleSummaryStatistics docFreqStatistics = new DoubleSummaryStatistics(); - for (Term term : terms()) { + for (Term term : terms) { TermStatistics termStats = termStatistics(term); docFreqStatistics.accept(termStats != null ? termStats.docFreq() : 0); } @@ -71,7 +67,7 @@ public DoubleSummaryStatistics docFreq() { public DoubleSummaryStatistics totalTermFreq() { DoubleSummaryStatistics totalTermFreqStatistics = new DoubleSummaryStatistics(); - for (Term term : terms()) { + for (Term term : terms) { TermStatistics termStats = termStatistics(term); totalTermFreqStatistics.accept(termStats != null ? 
termStats.totalTermFreq() : 0); } @@ -81,7 +77,7 @@ public DoubleSummaryStatistics totalTermFreq() { public DoubleSummaryStatistics termFreq() { DoubleSummaryStatistics termFreqStatistics = new DoubleSummaryStatistics(); - for (Term term : terms()) { + for (Term term : terms) { try { PostingsEnum postingsEnum = postings(term); int docId = docIdSupplier.get(); @@ -101,7 +97,7 @@ public DoubleSummaryStatistics termFreq() { public DoubleSummaryStatistics termPositions() { DoubleSummaryStatistics termPositionsStatistics = new DoubleSummaryStatistics(); - for (Term term : terms()) { + for (Term term : terms) { try { PostingsEnum postingsEnum = postings(term); int docId = docIdSupplier.get();