From 4c7b59621c7851b867aa949363dcbb11267e24d4 Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Mon, 18 Mar 2019 14:13:34 -0400 Subject: [PATCH 1/4] Add randomScore function in script_score query To make script_score query to have the same features as function_score query, we need to add randomScore function. This function should be able to produce different random scores on different index shards. It also needs to be able to produce random scores based on the internal Lucene Document Ids. To achieve this three variables have been added to the score script context: - _doc for the internal Lucene doc id - _shard for the shard id - _indexName for the index name Closes #31461 --- .../query-dsl/script-score-query.asciidoc | 67 ++++++---------- .../painless/spi/org.elasticsearch.score.txt | 3 +- .../test/painless/80_script_score.yml | 55 -------------- .../painless/85_script_score_random_score.yml | 76 +++++++++++++++++++ .../search/function/ScriptScoreFunction.java | 15 ++++ .../ScriptScoreFunctionBuilder.java | 2 +- .../org/elasticsearch/script/ScoreScript.java | 53 +++++++++++++ .../script/ScoreScriptUtils.java | 29 ++----- 8 files changed, 177 insertions(+), 123 deletions(-) create mode 100644 modules/lang-painless/src/test/resources/rest-api-spec/test/painless/85_script_score_random_score.yml diff --git a/docs/reference/query-dsl/script-score-query.asciidoc b/docs/reference/query-dsl/script-score-query.asciidoc index ee68d3e40fe13..3bf3a19475cd7 100644 --- a/docs/reference/query-dsl/script-score-query.asciidoc +++ b/docs/reference/query-dsl/script-score-query.asciidoc @@ -182,62 +182,45 @@ different from the query's vector, 0 is used for missing dimensions in the calculations of vector functions. -[[random-functions]] -===== Random functions -There are two predefined ways to produce random values: -`randomNotReproducible` and `randomReproducible`. - -`randomNotReproducible()` uses `java.util.Random` class -to generate a random value of the type `long`. 
-The generated values are not reproducible between requests' invocations. - -[source,js] --------------------------------------------------- -"script" : { - "source" : "randomNotReproducible()" -} --------------------------------------------------- -// NOTCONSOLE - - -`randomReproducible(String seedValue, int seed)` produces -reproducible random values of type `long`. This function requires -more computational time and memory than the non-reproducible version. - -A good candidate for the `seedValue` is document field values that -are unique across documents and already pre-calculated and preloaded -in the memory. For example, values of the document's `_seq_no` field -is a good candidate, as documents on the same shard have unique values -for the `_seq_no` field. +[[random-score-function]] +===== Random score function +`random_score` function generates scores that are uniformly distributed +from 0 up to but not including 1. + +`randomScore` function has the following syntax: +`randomScore(String docValue, String indexName, int shardId, int seed)`. +It requires a document value, an index name, shard id, and a seed. +For the document value, you can use `_doc` which represents +the internal Lucene doc ids; for the index name, you can use `_index` +which represents the index name of a corresponding document; +and for the shard id, you can use `_shard` which represents +the shard id of a corresponding document. [source,js] -------------------------------------------------- "script" : { - "source" : "randomReproducible(Long.toString(doc['_seq_no'].value), 100)" + "source" : "randomScore(String.valueOf(_doc), _index, _shard, 100)" } -------------------------------------------------- // NOTCONSOLE - -A drawback of using `_seq_no` is that generated values change if -documents are updated. Another drawback is not absolute uniqueness, as -documents from different shards with the same sequence numbers -generate the same random values. 
- -If you need random values to be distinct across different shards, -you can use a field with unique values across shards, -such as `_id`, but watch out for the memory usage as all -these unique values need to be loaded into memory. +Using the internal Lucene doc ids as a source of randomness is very efficient, +but unfortunately not reproducible since documents might be renumbered +by merges. Note that documents that are within the same shard and have the +same value for field will get the same score, so it is usually desirable +to use a field that has unique values for all documents across a shard. +A good default choice might be to use the `_seq_no` +field, whose only drawback is that scores will change if the document is +updated since update operations also update the value of the _seq_no field. [source,js] -------------------------------------------------- "script" : { - "source" : "randomReproducible(doc['_id'].value, 100)" + "source" : "randomScore(String.valueOf(doc['_seq_no'].value), _index, _shard, 100)" } -------------------------------------------------- // NOTCONSOLE - [[decay-functions]] ===== Decay functions for numeric fields You can read more about decay functions @@ -349,8 +332,8 @@ the following script: ===== `random_score` -Use `randomReproducible` and `randomNotReproducible` functions -as described in <<random-functions>>. +Use `randomScore` function +as described in <<random-score-function>>. 
===== `field_value_factor` diff --git a/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt b/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt index 3d7b29826c747..3ccb6ca7c8743 100644 --- a/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt +++ b/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt @@ -22,8 +22,7 @@ static_import { double saturation(double, double) from_class org.elasticsearch.script.ScoreScriptUtils double sigmoid(double, double, double) from_class org.elasticsearch.script.ScoreScriptUtils - double randomReproducible(String, int) from_class org.elasticsearch.script.ScoreScriptUtils - double randomNotReproducible() bound_to org.elasticsearch.script.ScoreScriptUtils$RandomNotReproducible + double randomScore(String, String, int, int) from_class org.elasticsearch.script.ScoreScriptUtils double decayGeoLinear(String, String, String, double, GeoPoint) bound_to org.elasticsearch.script.ScoreScriptUtils$DecayGeoLinear double decayGeoExp(String, String, String, double, GeoPoint) bound_to org.elasticsearch.script.ScoreScriptUtils$DecayGeoExp double decayGeoGauss(String, String, String, double, GeoPoint) bound_to org.elasticsearch.script.ScoreScriptUtils$DecayGeoGauss diff --git a/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/80_script_score.yml b/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/80_script_score.yml index a3135777c952c..cf55810058d92 100644 --- a/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/80_script_score.yml +++ b/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/80_script_score.yml @@ -72,61 +72,6 @@ setup: - match: { hits.hits.1._id: d2 } - match: { hits.hits.2._id: d1 } ---- -"Random functions": - - do: - indices.create: - index: test - body: - settings: - 
number_of_shards: 2 - mappings: - properties: - f1: - type: keyword - - do: - index: - index: test - id: 1 - body: {"f1": "v1"} - - do: - index: - index: test - id: 2 - body: {"f1": "v2"} - - do: - index: - index: test - id: 3 - body: {"f1": "v3"} - - - do: - indices.refresh: {} - - - do: - search: - rest_total_hits_as_int: true - index: test - body: - query: - script_score: - query: {match_all: {} } - script: - source: "randomReproducible(Long.toString(doc['_seq_no'].value), 100)" - - match: { hits.total: 3 } - - - do: - search: - rest_total_hits_as_int: true - index: test - body: - query: - script_score: - query: {match_all: {} } - script: - source: "randomNotReproducible()" - - match: { hits.total: 3 } - --- "Decay geo functions": - do: diff --git a/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/85_script_score_random_score.yml b/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/85_script_score_random_score.yml new file mode 100644 index 0000000000000..a2b00665e8144 --- /dev/null +++ b/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/85_script_score_random_score.yml @@ -0,0 +1,76 @@ +# Integration tests for ScriptScoreQuery using Painless + +setup: +- skip: + version: " - 7.99.99" # correct to 7.09.99 after backporting to 7.1 + reason: "random score function of script score was added in 7.1" + +--- +"Random score function": + - do: + indices.create: + index: test + body: + settings: + number_of_shards: 2 + mappings: + properties: + f1: + type: keyword + + - do: + bulk: + refresh: true + body: + - '{"index": {"_index": "test"}}' + - '{"f1": "v0"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v1"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v2"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v3"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v4"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v5"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v6"}' + + - do: + search: + 
rest_total_hits_as_int: true + index: test + body: + query: + script_score: + query: {match_all: {} } + script: + source: "randomScore(String.valueOf(doc['_seq_no'].value), _index, _shard, 100)" + # stash ids to check for reproducibility of ranking + - set: { hits.hits.0._id: id0 } + - set: { hits.hits.1._id: id1 } + - set: { hits.hits.2._id: id2 } + - set: { hits.hits.3._id: id3 } + - set: { hits.hits.4._id: id4 } + - set: { hits.hits.5._id: id5 } + - set: { hits.hits.6._id: id6 } + + # check that ranking is reproducible + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + script_score: + query: {match_all: {} } + script: + source: "randomScore(String.valueOf(doc['_seq_no'].value), _index, _shard, 100)" + - match: { hits.hits.0._id: $id0 } + - match: { hits.hits.1._id: $id1 } + - match: { hits.hits.2._id: $id2 } + - match: { hits.hits.3._id: $id3 } + - match: { hits.hits.4._id: $id4 } + - match: { hits.hits.5._id: $id5 } + - match: { hits.hits.6._id: $id6 } diff --git a/server/src/main/java/org/elasticsearch/common/lucene/search/function/ScriptScoreFunction.java b/server/src/main/java/org/elasticsearch/common/lucene/search/function/ScriptScoreFunction.java index 8e51bc5951d59..d66d9d94ff4bc 100644 --- a/server/src/main/java/org/elasticsearch/common/lucene/search/function/ScriptScoreFunction.java +++ b/server/src/main/java/org/elasticsearch/common/lucene/search/function/ScriptScoreFunction.java @@ -50,11 +50,24 @@ public float score() { private final ScoreScript.LeafFactory script; + private final int shardId; + private final String indexName; + public ScriptScoreFunction(Script sScript, ScoreScript.LeafFactory script) { super(CombineFunction.REPLACE); this.sScript = sScript; this.script = script; + this.indexName = null; + this.shardId = -1; + } + + public ScriptScoreFunction(Script sScript, ScoreScript.LeafFactory script, String indexName, int shardId) { + super(CombineFunction.REPLACE); + this.sScript = sScript; + this.script = 
script; + this.indexName = indexName; + this.shardId = shardId; } @Override @@ -62,6 +75,8 @@ public LeafScoreFunction getLeafScoreFunction(LeafReaderContext ctx) throws IOEx final ScoreScript leafScript = script.newInstance(ctx); final CannedScorer scorer = new CannedScorer(); leafScript.setScorer(scorer); + leafScript.setIndexName(indexName); + leafScript.setShard(shardId); return new LeafScoreFunction() { @Override public double score(int docId, float subQueryScore) throws IOException { diff --git a/server/src/main/java/org/elasticsearch/index/query/functionscore/ScriptScoreFunctionBuilder.java b/server/src/main/java/org/elasticsearch/index/query/functionscore/ScriptScoreFunctionBuilder.java index a860bd19d7c5f..accfd2f656999 100644 --- a/server/src/main/java/org/elasticsearch/index/query/functionscore/ScriptScoreFunctionBuilder.java +++ b/server/src/main/java/org/elasticsearch/index/query/functionscore/ScriptScoreFunctionBuilder.java @@ -94,7 +94,7 @@ protected ScoreFunction doToFunction(QueryShardContext context) { try { ScoreScript.Factory factory = context.getScriptService().compile(script, ScoreScript.CONTEXT); ScoreScript.LeafFactory searchScript = factory.newFactory(script.getParams(), context.lookup()); - return new ScriptScoreFunction(script, searchScript); + return new ScriptScoreFunction(script, searchScript, context.index().getName(), context.getShardId()); } catch (Exception e) { throw new QueryShardException(context, "script_score: the script could not be loaded", e); } diff --git a/server/src/main/java/org/elasticsearch/script/ScoreScript.java b/server/src/main/java/org/elasticsearch/script/ScoreScript.java index 6ac5935826bf7..f75d58266072c 100644 --- a/server/src/main/java/org/elasticsearch/script/ScoreScript.java +++ b/server/src/main/java/org/elasticsearch/script/ScoreScript.java @@ -62,6 +62,11 @@ public abstract class ScoreScript { private DoubleSupplier scoreSupplier = () -> 0.0; + private final int docBase; + private int docId; + private 
int shardId = -1; + private String indexName = null; + public ScoreScript(Map params, SearchLookup lookup, LeafReaderContext leafContext) { // null check needed b/c of expression engine subclass if (lookup == null) { @@ -69,11 +74,13 @@ public ScoreScript(Map params, SearchLookup lookup, LeafReaderCo assert leafContext == null; this.params = null; this.leafLookup = null; + this.docBase = 0; } else { this.leafLookup = lookup.getLeafSearchLookup(leafContext); params = new HashMap<>(params); params.putAll(leafLookup.asMap()); this.params = new DeprecationMap(params, DEPRECATIONS, "score-script"); + this.docBase = leafContext.docBase; } } @@ -91,6 +98,7 @@ public final Map> getDoc() { /** Set the current document to run the script on next. */ public void setDocument(int docid) { + this.docId = docid; leafLookup.setDocument(docid); } @@ -104,10 +112,55 @@ public void setScorer(Scorable scorer) { }; } + /** + * Accessed as _score in the painless script + * @return the score of the inner query + */ public double get_score() { return scoreSupplier.getAsDouble(); } + /** + * Accessed as _doc in the painless script + * @return the internal document ID + */ + public int get_doc() { + return docBase + docId; + } + + /** + * Accessed as _shard in the painless script + * @return shard id or throws an exception if shard is not set up for this script instance + */ + public int get_shard() { + if (shardId > -1) { + return shardId; + } else { + throw new IllegalArgumentException("shard id can not be looked up!"); + } + } + + /** + * Accessed as _index in the painless script + * @return index name or throws an exception if the index name is not set up for this script instance + */ + public String get_index() { + if (indexName != null) { + return indexName; + } else { + throw new IllegalArgumentException("index name can not be looked up!"); + } + } + + public void setShard(int shardId) { + this.shardId = shardId; + } + + public void setIndexName(String indexName) { + this.indexName = 
indexName; + } + + /** A factory to construct {@link ScoreScript} instances. */ public interface LeafFactory { diff --git a/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java b/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java index 273b8fcf8559d..ed13488a60977 100644 --- a/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java +++ b/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java @@ -21,7 +21,6 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.StringHelper; -import org.elasticsearch.common.Randomness; import org.elasticsearch.common.geo.GeoDistance; import org.elasticsearch.common.geo.GeoPoint; import org.elasticsearch.common.geo.GeoUtils; @@ -31,12 +30,9 @@ import org.elasticsearch.index.mapper.DateFieldMapper; import java.time.ZoneId; -import java.util.Random; -/** - * ScoringScriptImpl can be used as {@link ScoreScript} - * to run a previously compiled Painless script. - */ +import static com.carrotsearch.hppc.BitMixer.mix32; + public final class ScoreScriptUtils { /****** STATIC FUNCTIONS that can be used by users for score calculations **/ @@ -53,27 +49,14 @@ public static double sigmoid(double value, double k, double a){ return Math.pow(value,a) / (Math.pow(k,a) + Math.pow(value,a)); } - // reproducible random - public static double randomReproducible(String seedValue, int seed) { - int hash = StringHelper.murmurhash3_x86_32(new BytesRef(seedValue), seed); + public static double randomScore(String seedValue, String indexName, int shardId, int seed) { + int salt = (indexName.hashCode() << 10) | shardId; + int saltedSeed = mix32(salt ^ seed); + int hash = StringHelper.murmurhash3_x86_32(new BytesRef(seedValue), saltedSeed); return (hash & 0x00FFFFFF) / (float)(1 << 24); // only use the lower 24 bits to construct a float from 0.0-1.0 } - // not reproducible random - public static final class RandomNotReproducible { - private final Random rnd; - - public 
RandomNotReproducible() { - this.rnd = Randomness.get(); - } - - public double randomNotReproducible() { - return rnd.nextDouble(); - } - } - - // **** Decay functions on geo field public static final class DecayGeoLinear { // cached variables calculated once per script execution From 06330b758292d27fa65ca062231b18fc5b0e0aed Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Mon, 25 Mar 2019 11:49:02 -0400 Subject: [PATCH 2/4] Add ScoreScript as binding --- .../query-dsl/script-score-query.asciidoc | 32 ++++---- .../painless/spi/org.elasticsearch.score.txt | 6 +- .../painless/85_script_score_random_score.yml | 76 ++++++++++++++++++- .../org/elasticsearch/script/ScoreScript.java | 9 +-- .../script/ScoreScriptUtils.java | 30 ++++++++ 5 files changed, 127 insertions(+), 26 deletions(-) diff --git a/docs/reference/query-dsl/script-score-query.asciidoc b/docs/reference/query-dsl/script-score-query.asciidoc index 3bf3a19475cd7..44108fde32469 100644 --- a/docs/reference/query-dsl/script-score-query.asciidoc +++ b/docs/reference/query-dsl/script-score-query.asciidoc @@ -188,39 +188,39 @@ in the calculations of vector functions. from 0 up to but not including 1. `randomScore` function has the following syntax: -`randomScore(String docValue, String indexName, int shardId, int seed)`. -It requires a document value, an index name, shard id, and a seed. -For the document value, you can use `_doc` which represents -the internal Lucene doc ids; for the index name, you can use `_index` -which represents the index name of a corresponding document; -and for the shard id, you can use `_shard` which represents -the shard id of a corresponding document. +`randomScore(String docValue, int seed)`. 
+It requires a document value as a string, and a seed: [source,js] -------------------------------------------------- "script" : { - "source" : "randomScore(String.valueOf(_doc), _index, _shard, 100)" + "source" : "randomScore(String.valueOf(doc['_seq_no'].value), 100)" } -------------------------------------------------- // NOTCONSOLE -Using the internal Lucene doc ids as a source of randomness is very efficient, +`docValue` is an optional parameter, and if omitted the internal Lucene +document ids will be used as a source of randomness. This is very efficient, but unfortunately not reproducible since documents might be renumbered -by merges. Note that documents that are within the same shard and have the -same value for field will get the same score, so it is usually desirable -to use a field that has unique values for all documents across a shard. -A good default choice might be to use the `_seq_no` -field, whose only drawback is that scores will change if the document is -updated since update operations also update the value of the _seq_no field. +by merges. [source,js] -------------------------------------------------- "script" : { - "source" : "randomScore(String.valueOf(doc['_seq_no'].value), _index, _shard, 100)" + "source" : "randomScore(100)" } -------------------------------------------------- // NOTCONSOLE + +Note that documents that are within the same shard and have the +same value for field will get the same score, so it is usually desirable +to use a field that has unique values for all documents across a shard. +A good default choice might be to use the `_seq_no` +field, whose only drawback is that scores will change if the document is +updated since update operations also update the value of the `_seq_no` field. 
+ + [[decay-functions]] ===== Decay functions for numeric fields You can read more about decay functions diff --git a/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt b/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt index 3ccb6ca7c8743..c340afd6a0f2a 100644 --- a/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt +++ b/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt @@ -19,10 +19,14 @@ # This file contains a whitelist for functions to be used in Score context +class org.elasticsearch.script.ScoreScript no_import { +} + static_import { double saturation(double, double) from_class org.elasticsearch.script.ScoreScriptUtils double sigmoid(double, double, double) from_class org.elasticsearch.script.ScoreScriptUtils - double randomScore(String, String, int, int) from_class org.elasticsearch.script.ScoreScriptUtils + double randomScore(org.elasticsearch.script.ScoreScript, String, int) bound_to org.elasticsearch.script.ScoreScriptUtils$RandomScore + double randomScore(org.elasticsearch.script.ScoreScript, int) bound_to org.elasticsearch.script.ScoreScriptUtils$RandomScoreDoc double decayGeoLinear(String, String, String, double, GeoPoint) bound_to org.elasticsearch.script.ScoreScriptUtils$DecayGeoLinear double decayGeoExp(String, String, String, double, GeoPoint) bound_to org.elasticsearch.script.ScoreScriptUtils$DecayGeoExp double decayGeoGauss(String, String, String, double, GeoPoint) bound_to org.elasticsearch.script.ScoreScriptUtils$DecayGeoGauss diff --git a/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/85_script_score_random_score.yml b/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/85_script_score_random_score.yml index a2b00665e8144..4d84e251bf228 100644 --- 
a/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/85_script_score_random_score.yml +++ b/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/85_script_score_random_score.yml @@ -6,7 +6,7 @@ setup: reason: "random score function of script score was added in 7.1" --- -"Random score function": +"Random score function with _seq_no field": - do: indices.create: index: test @@ -46,7 +46,7 @@ setup: script_score: query: {match_all: {} } script: - source: "randomScore(String.valueOf(doc['_seq_no'].value), _index, _shard, 100)" + source: "randomScore(String.valueOf(doc['_seq_no'].value), 100)" # stash ids to check for reproducibility of ranking - set: { hits.hits.0._id: id0 } - set: { hits.hits.1._id: id1 } @@ -66,7 +66,77 @@ setup: script_score: query: {match_all: {} } script: - source: "randomScore(String.valueOf(doc['_seq_no'].value), _index, _shard, 100)" + source: "randomScore(String.valueOf(doc['_seq_no'].value), 100)" + - match: { hits.hits.0._id: $id0 } + - match: { hits.hits.1._id: $id1 } + - match: { hits.hits.2._id: $id2 } + - match: { hits.hits.3._id: $id3 } + - match: { hits.hits.4._id: $id4 } + - match: { hits.hits.5._id: $id5 } + - match: { hits.hits.6._id: $id6 } + +--- +"Random score function with internal doc Ids": + - do: + indices.create: + index: test + body: + settings: + number_of_shards: 1 + mappings: + properties: + f1: + type: keyword + + - do: + bulk: + refresh: true + body: + - '{"index": {"_index": "test"}}' + - '{"f1": "v0"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v1"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v2"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v3"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v4"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v5"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v6"}' + + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + script_score: + query: {match_all: {} } + script: + source: 
"randomScore(100)" + # stash ids to check for reproducibility of ranking + - set: { hits.hits.0._id: id0 } + - set: { hits.hits.1._id: id1 } + - set: { hits.hits.2._id: id2 } + - set: { hits.hits.3._id: id3 } + - set: { hits.hits.4._id: id4 } + - set: { hits.hits.5._id: id5 } + - set: { hits.hits.6._id: id6 } + + # check that ranking is reproducible + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + script_score: + query: {match_all: {} } + script: + source: "randomScore(100)" - match: { hits.hits.0._id: $id0 } - match: { hits.hits.1._id: $id1 } - match: { hits.hits.2._id: $id2 } diff --git a/server/src/main/java/org/elasticsearch/script/ScoreScript.java b/server/src/main/java/org/elasticsearch/script/ScoreScript.java index f75d58266072c..8a1fc30f5d73f 100644 --- a/server/src/main/java/org/elasticsearch/script/ScoreScript.java +++ b/server/src/main/java/org/elasticsearch/script/ScoreScript.java @@ -121,18 +121,16 @@ public double get_score() { } /** - * Accessed as _doc in the painless script * @return the internal document ID */ - public int get_doc() { + public int getDocId() { return docBase + docId; } /** - * Accessed as _shard in the painless script * @return shard id or throws an exception if shard is not set up for this script instance */ - public int get_shard() { + public int getShardId() { if (shardId > -1) { return shardId; } else { @@ -141,10 +139,9 @@ public int get_shard() { } /** - * Accessed as _index in the painless script * @return index name or throws an exception if the index name is not set up for this script instance */ - public String get_index() { + public String getIndex() { if (indexName != null) { return indexName; } else { diff --git a/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java b/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java index ed13488a60977..1314d92309e07 100644 --- a/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java +++ 
b/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java @@ -57,6 +57,36 @@ public static double randomScore(String seedValue, String indexName, int shardId return (hash & 0x00FFFFFF) / (float)(1 << 24); // only use the lower 24 bits to construct a float from 0.0-1.0 } + public static final class RandomScore { + private ScoreScript scoreScript; + public RandomScore(ScoreScript scoreScript) { + this.scoreScript = scoreScript; + } + + public double randomScore(String seedValue, int seed) { + int salt = (scoreScript.getIndex().hashCode() << 10) | scoreScript.getShardId(); + int saltedSeed = mix32(salt ^ seed); + int hash = StringHelper.murmurhash3_x86_32(new BytesRef(seedValue), saltedSeed); + return (hash & 0x00FFFFFF) / (float)(1 << 24); // only use the lower 24 bits to construct a float from 0.0-1.0 + } + } + + + // random score based on the internal Lucene document Ids + public static final class RandomScoreDoc { + private ScoreScript scoreScript; + public RandomScoreDoc(ScoreScript scoreScript) { + this.scoreScript = scoreScript; + } + public double randomScore(int seed) { + String seedValue = Integer.toString(scoreScript.getDocId()); + int salt = (scoreScript.getIndex().hashCode() << 10) | scoreScript.getShardId(); + int saltedSeed = mix32(salt ^ seed); + int hash = StringHelper.murmurhash3_x86_32(new BytesRef(seedValue), saltedSeed); + return (hash & 0x00FFFFFF) / (float)(1 << 24); // only use the lower 24 bits to construct a float from 0.0-1.0 + } + } + // **** Decay functions on geo field public static final class DecayGeoLinear { // cached variables calculated once per script execution From a9b9363317c5e1d1543bc6fdfc120ff68aff8a0b Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Mon, 25 Mar 2019 17:00:32 -0400 Subject: [PATCH 3/4] Address Jack's feedback --- .../query-dsl/script-score-query.asciidoc | 9 ++-- .../painless/spi/org.elasticsearch.score.txt | 2 +- .../painless/85_script_score_random_score.yml | 4 +- 
.../search/function/ScriptScoreFunction.java | 4 +- .../org/elasticsearch/script/ScoreScript.java | 32 ++++++++++-- .../script/ScoreScriptUtils.java | 52 +++++++++++-------- 6 files changed, 67 insertions(+), 36 deletions(-) diff --git a/docs/reference/query-dsl/script-score-query.asciidoc b/docs/reference/query-dsl/script-score-query.asciidoc index 44108fde32469..56c4f7c41b8ee 100644 --- a/docs/reference/query-dsl/script-score-query.asciidoc +++ b/docs/reference/query-dsl/script-score-query.asciidoc @@ -188,18 +188,19 @@ in the calculations of vector functions. from 0 up to but not including 1. `randomScore` function has the following syntax: -`randomScore(String docValue, int seed)`. -It requires a document value as a string, and a seed: +`randomScore(<seed>, <fieldName>)`. +It has a required parameter - `seed` as an integer value, +and an optional parameter - `fieldName` as a string value. [source,js] -------------------------------------------------- "script" : { - "source" : "randomScore(String.valueOf(doc['_seq_no'].value), 100)" + "source" : "randomScore(100, '_seq_no')" } -------------------------------------------------- // NOTCONSOLE -`docValue` is an optional parameter, and if omitted the internal Lucene +If the `fieldName` parameter is omitted, the internal Lucene document ids will be used as a source of randomness. This is very efficient, but unfortunately not reproducible since documents might be renumbered by merges. 
diff --git a/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt b/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt index c340afd6a0f2a..a98b86185d723 100644 --- a/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt +++ b/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt @@ -25,7 +25,7 @@ class org.elasticsearch.script.ScoreScript no_import { static_import { double saturation(double, double) from_class org.elasticsearch.script.ScoreScriptUtils double sigmoid(double, double, double) from_class org.elasticsearch.script.ScoreScriptUtils - double randomScore(org.elasticsearch.script.ScoreScript, String, int) bound_to org.elasticsearch.script.ScoreScriptUtils$RandomScore + double randomScore(org.elasticsearch.script.ScoreScript, int, String) bound_to org.elasticsearch.script.ScoreScriptUtils$RandomScore double randomScore(org.elasticsearch.script.ScoreScript, int) bound_to org.elasticsearch.script.ScoreScriptUtils$RandomScoreDoc double decayGeoLinear(String, String, String, double, GeoPoint) bound_to org.elasticsearch.script.ScoreScriptUtils$DecayGeoLinear double decayGeoExp(String, String, String, double, GeoPoint) bound_to org.elasticsearch.script.ScoreScriptUtils$DecayGeoExp diff --git a/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/85_script_score_random_score.yml b/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/85_script_score_random_score.yml index 4d84e251bf228..2879d50fedebc 100644 --- a/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/85_script_score_random_score.yml +++ b/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/85_script_score_random_score.yml @@ -46,7 +46,7 @@ setup: script_score: query: {match_all: {} } script: - source: "randomScore(String.valueOf(doc['_seq_no'].value), 
100)" + source: "randomScore(100, '_seq_no')" # stash ids to check for reproducibility of ranking - set: { hits.hits.0._id: id0 } - set: { hits.hits.1._id: id1 } @@ -66,7 +66,7 @@ setup: script_score: query: {match_all: {} } script: - source: "randomScore(String.valueOf(doc['_seq_no'].value), 100)" + source: "randomScore(100, '_seq_no')" - match: { hits.hits.0._id: $id0 } - match: { hits.hits.1._id: $id1 } - match: { hits.hits.2._id: $id2 } diff --git a/server/src/main/java/org/elasticsearch/common/lucene/search/function/ScriptScoreFunction.java b/server/src/main/java/org/elasticsearch/common/lucene/search/function/ScriptScoreFunction.java index d66d9d94ff4bc..960df44a62514 100644 --- a/server/src/main/java/org/elasticsearch/common/lucene/search/function/ScriptScoreFunction.java +++ b/server/src/main/java/org/elasticsearch/common/lucene/search/function/ScriptScoreFunction.java @@ -75,8 +75,8 @@ public LeafScoreFunction getLeafScoreFunction(LeafReaderContext ctx) throws IOEx final ScoreScript leafScript = script.newInstance(ctx); final CannedScorer scorer = new CannedScorer(); leafScript.setScorer(scorer); - leafScript.setIndexName(indexName); - leafScript.setShard(shardId); + leafScript._setIndexName(indexName); + leafScript._setShard(shardId); return new LeafScoreFunction() { @Override public double score(int docId, float subQueryScore) throws IOException { diff --git a/server/src/main/java/org/elasticsearch/script/ScoreScript.java b/server/src/main/java/org/elasticsearch/script/ScoreScript.java index 8a1fc30f5d73f..f31af4c008c74 100644 --- a/server/src/main/java/org/elasticsearch/script/ScoreScript.java +++ b/server/src/main/java/org/elasticsearch/script/ScoreScript.java @@ -120,17 +120,31 @@ public double get_score() { return scoreSupplier.getAsDouble(); } + /** + * Starting a name with underscore, so that the user cannot access this function directly through a script + * It is only used within predefined painless functions. 
* @return the internal document ID */ - public int getDocId() { + public int _getDocId() { + return docId; + } + + /** + * Starting a name with underscore, so that the user cannot access this function directly through a script + * It is only used within predefined painless functions. + * @return the internal document ID with the base + */ + public int _getDocBaseId() { return docBase + docId; } /** + * Starting a name with underscore, so that the user cannot access this function directly through a script + * It is only used within predefined painless functions. * @return shard id or throws an exception if shard is not set up for this script instance */ - public int getShardId() { + public int _getShardId() { if (shardId > -1) { return shardId; } else { @@ -139,9 +153,11 @@ public int getShardId() { } /** + * Starting a name with underscore, so that the user cannot access this function directly through a script + * It is only used within predefined painless functions. * @return index name or throws an exception if the index name is not set up for this script instance */ - public String getIndex() { + public String _getIndex() { if (indexName != null) { return indexName; } else { @@ -149,11 +165,17 @@ public String getIndex() { } } - public void setShard(int shardId) { + /** + * Starting a name with underscore, so that the user cannot access this function directly through a script + */ + public void _setShard(int shardId) { this.shardId = shardId; } - public void setIndexName(String indexName) { + /** + * Starting a name with underscore, so that the user cannot access this function directly through a script + */ + public void _setIndexName(String indexName) { this.indexName = indexName; } diff --git a/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java b/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java index 1314d92309e07..5802af2029896 100644 --- a/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java +++ 
b/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java @@ -21,12 +21,14 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.StringHelper; +import org.elasticsearch.ExceptionsHelper; import org.elasticsearch.common.geo.GeoDistance; import org.elasticsearch.common.geo.GeoPoint; import org.elasticsearch.common.geo.GeoUtils; import org.elasticsearch.common.time.DateMathParser; import org.elasticsearch.common.unit.DistanceUnit; import org.elasticsearch.common.unit.TimeValue; +import org.elasticsearch.index.fielddata.ScriptDocValues; import org.elasticsearch.index.mapper.DateFieldMapper; import java.time.ZoneId; @@ -49,39 +51,45 @@ public static double sigmoid(double value, double k, double a){ return Math.pow(value,a) / (Math.pow(k,a) + Math.pow(value,a)); } - // reproducible random - public static double randomScore(String seedValue, String indexName, int shardId, int seed) { - int salt = (indexName.hashCode() << 10) | shardId; - int saltedSeed = mix32(salt ^ seed); - int hash = StringHelper.murmurhash3_x86_32(new BytesRef(seedValue), saltedSeed); - return (hash & 0x00FFFFFF) / (float)(1 << 24); // only use the lower 24 bits to construct a float from 0.0-1.0 - } - public static final class RandomScore { - private ScoreScript scoreScript; - public RandomScore(ScoreScript scoreScript) { + private final ScoreScript scoreScript; + private final ScriptDocValues docValues; + private final int saltedSeed; + + + public RandomScore(ScoreScript scoreScript, int seed, String fieldName) { this.scoreScript = scoreScript; + this.docValues = scoreScript.getDoc().get(fieldName); + int salt = (scoreScript._getIndex().hashCode() << 10) | scoreScript._getShardId(); + this.saltedSeed = mix32(salt ^ seed); + } - public double randomScore(String seedValue, int seed) { - int salt = (scoreScript.getIndex().hashCode() << 10) | scoreScript.getShardId(); - int saltedSeed = mix32(salt ^ seed); - int hash = StringHelper.murmurhash3_x86_32(new 
BytesRef(seedValue), saltedSeed); - return (hash & 0x00FFFFFF) / (float)(1 << 24); // only use the lower 24 bits to construct a float from 0.0-1.0 + public double randomScore() { + try { + docValues.setNextDocId(scoreScript._getDocId()); + String seedValue = String.valueOf(docValues.get(0)); + int hash = StringHelper.murmurhash3_x86_32(new BytesRef(seedValue), saltedSeed); + return (hash & 0x00FFFFFF) / (float)(1 << 24); // only use the lower 24 bits to construct a float from 0.0-1.0 + } catch (Exception e) { + throw ExceptionsHelper.convertToElastic(e); + } } } - // random score based on the internal Lucene document Ids public static final class RandomScoreDoc { - private ScoreScript scoreScript; - public RandomScoreDoc(ScoreScript scoreScript) { + private final ScoreScript scoreScript; + private final int saltedSeed; + + public RandomScoreDoc(ScoreScript scoreScript, int seed) { this.scoreScript = scoreScript; + int salt = (scoreScript._getIndex().hashCode() << 10) | scoreScript._getShardId(); + this.saltedSeed = mix32(salt ^ seed); } - public double randomScore(int seed) { - String seedValue = Integer.toString(scoreScript.getDocId()); - int salt = (scoreScript.getIndex().hashCode() << 10) | scoreScript.getShardId(); - int saltedSeed = mix32(salt ^ seed); + + public double randomScore() { + String seedValue = Integer.toString(scoreScript._getDocBaseId()); int hash = StringHelper.murmurhash3_x86_32(new BytesRef(seedValue), saltedSeed); return (hash & 0x00FFFFFF) / (float)(1 << 24); // only use the lower 24 bits to construct a float from 0.0-1.0 } From ce6aaa12e09680dff877157e3e80740d6f565dd9 Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Wed, 27 Mar 2019 18:06:21 -0400 Subject: [PATCH 4/4] Rename RandomScore to RandomScoreField --- .../elasticsearch/painless/spi/org.elasticsearch.score.txt | 2 +- .../main/java/org/elasticsearch/script/ScoreScriptUtils.java | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git 
a/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt b/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt index a98b86185d723..03ec9275aa8b7 100644 --- a/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt +++ b/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt @@ -25,7 +25,7 @@ class org.elasticsearch.script.ScoreScript no_import { static_import { double saturation(double, double) from_class org.elasticsearch.script.ScoreScriptUtils double sigmoid(double, double, double) from_class org.elasticsearch.script.ScoreScriptUtils - double randomScore(org.elasticsearch.script.ScoreScript, int, String) bound_to org.elasticsearch.script.ScoreScriptUtils$RandomScore + double randomScore(org.elasticsearch.script.ScoreScript, int, String) bound_to org.elasticsearch.script.ScoreScriptUtils$RandomScoreField double randomScore(org.elasticsearch.script.ScoreScript, int) bound_to org.elasticsearch.script.ScoreScriptUtils$RandomScoreDoc double decayGeoLinear(String, String, String, double, GeoPoint) bound_to org.elasticsearch.script.ScoreScriptUtils$DecayGeoLinear double decayGeoExp(String, String, String, double, GeoPoint) bound_to org.elasticsearch.script.ScoreScriptUtils$DecayGeoExp diff --git a/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java b/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java index 5802af2029896..c7d6e889397ff 100644 --- a/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java +++ b/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java @@ -51,13 +51,14 @@ public static double sigmoid(double value, double k, double a){ return Math.pow(value,a) / (Math.pow(k,a) + Math.pow(value,a)); } - public static final class RandomScore { + // random score based on the documents' values of the given field + public static 
final class RandomScoreField { private final ScoreScript scoreScript; private final ScriptDocValues docValues; private final int saltedSeed; - public RandomScore(ScoreScript scoreScript, int seed, String fieldName) { + public RandomScoreField(ScoreScript scoreScript, int seed, String fieldName) { this.scoreScript = scoreScript; this.docValues = scoreScript.getDoc().get(fieldName); int salt = (scoreScript._getIndex().hashCode() << 10) | scoreScript._getShardId();