Commit

Merge remote-tracking branch 'es/main' into block_source_loader_ignore_above
martijnvg committed Oct 21, 2024
2 parents a496588 + 183ad88 commit adb5233
Showing 665 changed files with 8,823 additions and 3,642 deletions.
1 change: 0 additions & 1 deletion .buildkite/pipelines/lucene-snapshot/run-tests.yml
@@ -56,7 +56,6 @@ steps:
matrix:
setup:
BWC_VERSION:
- 7.17.13
- 8.9.1
- 8.10.0
agents:
@@ -19,7 +19,7 @@
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.hnsw.RandomVectorScorer;
import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
import org.apache.lucene.util.quantization.RandomAccessQuantizedByteVectorValues;
import org.apache.lucene.util.quantization.QuantizedByteVectorValues;
import org.apache.lucene.util.quantization.ScalarQuantizer;
import org.elasticsearch.common.logging.LogConfigurator;
import org.elasticsearch.core.IOUtils;
@@ -217,19 +217,17 @@ public float squareDistanceScalar() {
return 1 / (1f + adjustedDistance);
}

RandomAccessQuantizedByteVectorValues vectorValues(int dims, int size, IndexInput in, VectorSimilarityFunction sim) throws IOException {
QuantizedByteVectorValues vectorValues(int dims, int size, IndexInput in, VectorSimilarityFunction sim) throws IOException {
var sq = new ScalarQuantizer(0.1f, 0.9f, (byte) 7);
var slice = in.slice("values", 0, in.length());
return new OffHeapQuantizedByteVectorValues.DenseOffHeapVectorValues(dims, size, sq, false, sim, null, slice);
}

RandomVectorScorerSupplier luceneScoreSupplier(RandomAccessQuantizedByteVectorValues values, VectorSimilarityFunction sim)
throws IOException {
RandomVectorScorerSupplier luceneScoreSupplier(QuantizedByteVectorValues values, VectorSimilarityFunction sim) throws IOException {
return new Lucene99ScalarQuantizedVectorScorer(null).getRandomVectorScorerSupplier(sim, values);
}

RandomVectorScorer luceneScorer(RandomAccessQuantizedByteVectorValues values, VectorSimilarityFunction sim, float[] queryVec)
throws IOException {
RandomVectorScorer luceneScorer(QuantizedByteVectorValues values, VectorSimilarityFunction sim, float[] queryVec) throws IOException {
return new Lucene99ScalarQuantizedVectorScorer(null).getRandomVectorScorer(sim, values, queryVec);
}

@@ -59,10 +59,6 @@ org.apache.lucene.util.Version#parseLeniently(java.lang.String)

org.apache.lucene.index.NoMergePolicy#INSTANCE @ explicit use of NoMergePolicy risks forgetting to configure NoMergeScheduler; use org.elasticsearch.common.lucene.Lucene#indexWriterConfigWithNoMerging() instead.

@defaultMessage Spawns a new thread which is solely under Lucene's control; use ThreadPool#relativeTimeInMillis instead
org.apache.lucene.search.TimeLimitingCollector#getGlobalTimerThread()
org.apache.lucene.search.TimeLimitingCollector#getGlobalCounter()

@defaultMessage Don't interrupt threads use FutureUtils#cancel(Future<T>) instead
java.util.concurrent.Future#cancel(boolean)

2 changes: 1 addition & 1 deletion build-tools-internal/version.properties
@@ -1,5 +1,5 @@
elasticsearch = 9.0.0
lucene = 9.12.0
lucene = 10.0.0

bundled_jdk_vendor = openjdk
bundled_jdk = 22.0.1+8@c7ec1332f7bb44aeba2eb341ae18aca4
3 changes: 3 additions & 0 deletions distribution/src/config/jvm.options
@@ -62,6 +62,9 @@
23:-XX:CompileCommand=dontinline,java/lang/invoke/MethodHandle.setAsTypeCache
23:-XX:CompileCommand=dontinline,java/lang/invoke/MethodHandle.asTypeUncached

# Lucene 10: apply MADV_NORMAL advice to enable more aggressive readahead
-Dorg.apache.lucene.store.defaultReadAdvice=normal

## heap dumps

# generate a heap dump when an allocation from the Java heap fails; heap dumps
4 changes: 2 additions & 2 deletions docs/Versions.asciidoc
@@ -1,8 +1,8 @@

include::{docs-root}/shared/versions/stack/{source_branch}.asciidoc[]

:lucene_version: 9.12.0
:lucene_version_path: 9_12_0
:lucene_version: 10.0.0
:lucene_version_path: 10_0_0
:jdk: 11.0.2
:jdk_major: 11
:build_type: tar
27 changes: 27 additions & 0 deletions docs/changelog/113482.yaml
@@ -0,0 +1,27 @@
pr: 113482
summary: The 'persian' analyzer has a stemmer by default
area: Analysis
type: breaking
issues:
- 113050
breaking:
title: The 'persian' analyzer has a stemmer by default
area: Analysis
details: >-
Lucene 10 has added a final stemming step to its PersianAnalyzer, which Elasticsearch
exposes as the 'persian' analyzer. Existing indices will keep the old
non-stemming behaviour while new indices will see the updated behaviour with
added stemming.
Users that wish to maintain the non-stemming behaviour need to define their
own analyzer as outlined in
https://www.elastic.co/guide/en/elasticsearch/reference/8.15/analysis-lang-analyzer.html#persian-analyzer.
Users that wish to use the new stemming behaviour for existing indices will
have to reindex their data.
impact: >-
Indexing with the 'persian' analyzer will produce slightly different tokens.
Users should check if this impacts their search results. If they wish to
maintain the legacy non-stemming behaviour, they can define their own
equivalent analyzer as explained in
https://www.elastic.co/guide/en/elasticsearch/reference/8.15/analysis-lang-analyzer.html#persian-analyzer.
notable: false
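
For reference, a minimal sketch of such a custom analyzer, based on the filter chain shown in the lang-analyzer.asciidoc change further down. The index and analyzer names are illustrative, the full documented example also includes a zero-width-space char_filter that is omitted here, and `persian_stem` is simply left out to keep the pre-upgrade, non-stemming behaviour:

[source,console]
--------------------------------------------------
PUT /persian_no_stem_example
{
  "settings": {
    "analysis": {
      "filter": {
        "persian_stop": {
          "type": "stop",
          "stopwords": "_persian_"
        }
      },
      "analyzer": {
        "rebuilt_persian_no_stem": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "decimal_digit",
            "arabic_normalization",
            "persian_normalization",
            "persian_stop"
          ]
        }
      }
    }
  }
}
--------------------------------------------------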

18 changes: 18 additions & 0 deletions docs/changelog/113614.yaml
@@ -0,0 +1,18 @@
pr: 113614
summary: The 'german2' stemmer is now an alias for the 'german' snowball stemmer
area: Analysis
type: breaking
issues: []
breaking:
title: The "german2" snowball stemmer is now an alias for the "german" stemmer
area: Analysis
details: >-
Lucene 10 has merged the improved "german2" snowball language stemmer with the
"german" stemmer. For Elasticsearch, "german2" is now a deprecated alias for
"german". This may results in slightly different tokens being generated for
terms with umlaut substitution (like "ue" for "ü" etc...)
impact: >-
Replace usages of "german2" with "german" in analysis configuration. Old
indices that use the "german" stemmer should be reindexed if possible.
notable: false
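
A sketch of what this change looks like in a custom stemmer token filter (index and filter names are illustrative); the only adjustment is the `language` value, since "german2" is now a deprecated alias:

[source,console]
--------------------------------------------------
PUT /german_stemmer_example
{
  "settings": {
    "analysis": {
      "filter": {
        "my_german_stemmer": {
          "type": "stemmer",
          "language": "german"
        }
      },
      "analyzer": {
        "rebuilt_german": {
          "tokenizer": "standard",
          "filter": [ "lowercase", "my_german_stemmer" ]
        }
      }
    }
  }
}
--------------------------------------------------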

18 changes: 18 additions & 0 deletions docs/changelog/114124.yaml
@@ -0,0 +1,18 @@
pr: 114124
summary: The Korean dictionary for Nori has been updated
area: Analysis
type: breaking
issues: []
breaking:
title: The Korean dictionary for Nori has been updated
area: Analysis
details: >-
Lucene 10 ships with an updated Korean dictionary (mecab-ko-dic-2.1.1).
For details see https://github.com/apache/lucene/issues/11452. Users
experiencing changes in search behaviour on existing data are advised to
reindex.
impact: >-
The change is small and should generally provide better analysis results.
Existing indices for full-text use cases should be reindexed though.
notable: false

20 changes: 20 additions & 0 deletions docs/changelog/114146.yaml
@@ -0,0 +1,20 @@
pr: 114146
summary: Snowball stemmers have been upgraded
area: Analysis
type: breaking
issues: []
breaking:
title: Snowball stemmers have been upgraded
area: Analysis
details: >-
Lucene 10 ships with an upgrade of its Snowball stemmers.
For details see https://github.com/apache/lucene/issues/13209. Users of
Snowball stemmers who are experiencing changes in search behaviour on
existing data are advised to reindex.
impact: >-
The upgrade should generally provide improved stemming results. Small changes
in token analysis can lead to mismatches with previously indexed data, so
existing indices using Snowball stemmers as part of their analysis chain
should be reindexed.
notable: false
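
Several of the analysis entries above advise reindexing existing indices. A minimal sketch of that step using the reindex API (index names are placeholders; the destination index must be created with the desired analysis settings first):

[source,console]
--------------------------------------------------
POST _reindex
{
  "source": { "index": "my-old-index" },
  "dest": { "index": "my-new-index" }
}
--------------------------------------------------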

5 changes: 5 additions & 0 deletions docs/changelog/114741.yaml
@@ -0,0 +1,5 @@
pr: 114741
summary: Upgrade to Lucene 10
area: Search
type: upgrade
issues: []
12 changes: 6 additions & 6 deletions docs/plugins/analysis-nori.asciidoc
@@ -244,11 +244,11 @@ Which responds with:
"end_offset": 3,
"type": "word",
"position": 1,
"leftPOS": "J(Ending Particle)",
"leftPOS": "JKS(Subject case marker)",
"morphemes": null,
"posType": "MORPHEME",
"reading": null,
"rightPOS": "J(Ending Particle)"
"rightPOS": "JKS(Subject case marker)"
},
{
"token": "깊",
@@ -268,11 +268,11 @@ Which responds with:
"end_offset": 6,
"type": "word",
"position": 3,
"leftPOS": "E(Verbal endings)",
"leftPOS": "ETM(Adnominal form transformative ending)",
"morphemes": null,
"posType": "MORPHEME",
"reading": null,
"rightPOS": "E(Verbal endings)"
"rightPOS": "ETM(Adnominal form transformative ending)"
},
{
"token": "나무",
@@ -292,11 +292,11 @@ Which responds with:
"end_offset": 10,
"type": "word",
"position": 5,
"leftPOS": "J(Ending Particle)",
"leftPOS": "JX(Auxiliary postpositional particle)",
"morphemes": null,
"posType": "MORPHEME",
"reading": null,
"rightPOS": "J(Ending Particle)"
"rightPOS": "JX(Auxiliary postpositional particle)"
}
]
},
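
The POS tags above come from the explain-mode `_analyze` example in analysis-nori.asciidoc. Roughly, a request along the following lines produces such output; the attribute list is an assumption, the exact request lives in the parts of the page not shown here, and the nori analysis plugin must be installed:

[source,console]
--------------------------------------------------
GET _analyze
{
  "tokenizer": "nori_tokenizer",
  "text": "뿌리가 깊은 나무는",
  "attributes": [ "leftPOS", "rightPOS", "morphemes", "posType", "reading" ],
  "explain": true
}
--------------------------------------------------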
3 changes: 2 additions & 1 deletion docs/reference/analysis/analyzers/lang-analyzer.asciidoc
@@ -1430,7 +1430,8 @@ PUT /persian_example
"decimal_digit",
"arabic_normalization",
"persian_normalization",
"persian_stop"
"persian_stop",
"persian_stem"
]
}
}
@@ -173,7 +173,6 @@ http://bvg.udc.es/recursos_lingua/stemming.jsp[`minimal_galician`] (Plural step
German::
https://dl.acm.org/citation.cfm?id=1141523[*`light_german`*],
https://snowballstem.org/algorithms/german/stemmer.html[`german`],
https://snowballstem.org/algorithms/german2/stemmer.html[`german2`],
http://members.unine.ch/jacques.savoy/clef/morpho.pdf[`minimal_german`]
Greek::
24 changes: 12 additions & 12 deletions docs/reference/analysis/tokenizers/pathhierarchy-tokenizer.asciidoc
@@ -40,14 +40,14 @@ POST _analyze
"start_offset": 0,
"end_offset": 8,
"type": "word",
"position": 0
"position": 1
},
{
"token": "/one/two/three",
"start_offset": 0,
"end_offset": 14,
"type": "word",
"position": 0
"position": 2
}
]
}
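
For context, output like the above is produced by a plain `path_hierarchy` `_analyze` request; a sketch, with the input string inferred from the tokens shown:

[source,console]
--------------------------------------------------
POST _analyze
{
  "tokenizer": "path_hierarchy",
  "text": "/one/two/three"
}
--------------------------------------------------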
@@ -144,14 +144,14 @@ POST my-index-000001/_analyze
"start_offset": 7,
"end_offset": 18,
"type": "word",
"position": 0
"position": 1
},
{
"token": "/three/four/five",
"start_offset": 7,
"end_offset": 23,
"type": "word",
"position": 0
"position": 2
}
]
}
@@ -178,14 +178,14 @@ If we were to set `reverse` to `true`, it would produce the following:
[[analysis-pathhierarchy-tokenizer-detailed-examples]]
=== Detailed examples

A common use-case for the `path_hierarchy` tokenizer is filtering results by
file paths. If indexing a file path along with the data, the use of the
`path_hierarchy` tokenizer to analyze the path allows filtering the results
by different parts of the file path string.


This example configures an index to have two custom analyzers and applies
those analyzers to multifields of the `file_path` text field that will
store filenames. One of the two analyzers uses reverse tokenization.
Some sample documents are then indexed to represent some file paths
for photos inside photo folders of two different users.
@@ -264,8 +264,8 @@ POST file-path-test/_doc/5
--------------------------------------------------


A search for a particular file path string against the text field matches all
the example documents, with Bob's documents ranking highest due to `bob` also
being one of the terms created by the standard analyzer, which boosts relevance for
Bob's documents.

@@ -301,7 +301,7 @@ GET file-path-test/_search
With the reverse parameter for this tokenizer, it's also possible to match
from the other end of the file path, such as individual file names or a deep
level subdirectory. The following example shows a search for all files named
`my_photo1.jpg` within any directory via the `file_path.tree_reversed` field
configured to use the reverse parameter in the mapping.
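
A sketch of the kind of query that paragraph describes, reusing the field and file name from the surrounding example:

[source,console]
--------------------------------------------------
GET file-path-test/_search
{
  "query": {
    "term": {
      "file_path.tree_reversed": {
        "value": "my_photo1.jpg"
      }
    }
  }
}
--------------------------------------------------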


@@ -342,7 +342,7 @@ POST file-path-test/_analyze


It's also useful to be able to filter with file paths when combined with other
types of searches, such as this example looking for any file paths with `16`
that also must be in Alice's photo directory.

[source,console]
2 changes: 1 addition & 1 deletion docs/reference/search/profile.asciidoc
@@ -1298,7 +1298,7 @@ One of the `dfs.knn` sections for a shard looks like the following:
"query" : [
{
"type" : "DocAndScoreQuery",
"description" : "DocAndScore[100]",
"description" : "DocAndScoreQuery[0,...][0.008961825,...],0.008961825",
"time_in_nanos" : 444414,
"breakdown" : {
"set_min_competitive_score_count" : 0,
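
The `DocAndScoreQuery` entry shown here is produced when profiling a kNN search. A rough sketch of such a request (the index name, vector field, query vector, and sizes are placeholders):

[source,console]
--------------------------------------------------
POST my-index/_search
{
  "profile": true,
  "knn": {
    "field": "my_vector",
    "query_vector": [ 0.1, 0.2, 0.3 ],
    "k": 10,
    "num_candidates": 100
  }
}
--------------------------------------------------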
