Merge branch 'main' into refactor-knn

dungba88 · Aug 6, 2024 · 7dd5b40 · 7dd5b40
2 parents b7f9680 + e0e5d81
commit 7dd5b40
Show file tree

Hide file tree

Showing 142 changed files with 9,283 additions and 1,299 deletions.
diff --git a/gradle/generation/forUtil.gradle b/gradle/generation/forUtil.gradle
@@ -23,7 +23,7 @@ configure(project(":lucene:core")) {
     description "Regenerate gen_ForUtil.py"
     group "generation"
 
-    def genDir = file("src/java/org/apache/lucene/codecs/lucene99")
+    def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
     def genScript = file("${genDir}/gen_ForUtil.py")
     def genOutput = file("${genDir}/ForUtil.java")
 
@@ -96,5 +96,30 @@ configure(project(":lucene:backward-codecs")) {
           andThenTasks: ["spotlessJava", "spotlessJavaApply"],
           mustRunBefore: [ "compileJava" ]
   ])
+
+  task generateForUtil99Internal() {
+    description "Regenerate gen_ForUtil.py"
+    group "generation"
+
+    def genDir = file("src/java/org/apache/lucene/backward_codecs/lucene99")
+    def genScript = file("${genDir}/gen_ForUtil.py")
+    def genOutput = file("${genDir}/ForUtil.java")
+
+    inputs.file genScript
+    outputs.file genOutput
+
+    doLast {
+      quietExec {
+        workingDir genDir
+        executable project.externalTool("python3")
+        args = [ '-B', genScript ]
+      }
+    }
+  }
+
+  regenerate.dependsOn wrapWithPersistentChecksums(generateForUtil99Internal, [
+          andThenTasks: ["spotlessJava", "spotlessJavaApply"],
+          mustRunBefore: [ "compileJava" ]
+  ])
 }
 
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -108,6 +108,9 @@ API Changes
 
 * GITHUB#13410: Removed Scorer#getWeight (Sanjay Dutt, Adrien Grand)
 
+* GITHUB#13499: Remove deprecated TopScoreDocCollector + TopFieldCollector methods (#create, #createSharedManager) (Jakub Slowinski)
+
+
 New Features
 ---------------------
 
@@ -134,6 +137,8 @@ New Features
 * GITHUB#13597: Align doc value skipper interval boundaries when an interval contains a constant
   value. (Ignacio Vera)
 
+* GITHUB#13604: Add Kmeans clustering on vectors (Mayya Sharipova, Jim Ferenczi, Tom Veasey)
+
 Improvements
 ---------------------
 
@@ -244,14 +249,22 @@ Other
 
 * GITHUB#13332: Improve MissingDoclet linter to check records correctly. (Uwe Schindler)
 
+* GITHUB#13499: Remove usage of TopScoreDocCollector + TopFieldCollector deprecated methods (#create, #createSharedManager) (Jakub Slowinski)
+
 ======================== Lucene 9.12.0 =======================
 
 API Changes
 ---------------------
 
-* GITHUB#13281: Mark COSINE VectorSimilarityFunction as deprecated. (Pulkit Gupta)
+* GITHUB#13469: Expose FlatVectorsFormat as a first-class format; can be configured using a custom Codec. (Michael Sokolov)
 
-* GITHUB#13469: Expose FlatVectorsFormat as a first-class format; can be configured using a custom Codec. (Michael Sokolov) 
+* GITHUB#13612: Hunspell: add Suggester#proceedPastRep to avoid losing relevant suggestions. (Peter Gromov)
+
+* GITHUB#13603: Introduced `IndexSearcher#searchLeaf(LeafReaderContext, Weight, Collector)` protected method to
+  facilitate customizing per-leaf behavior of search without having to override
+  `search(LeafReaderContext[], Weight, Collector)`, which requires overriding the entire loop across the leaves (Luca Cavanna)
+
+* GITHUB#13559: Add BitSet#nextSetBit(int, int) to get the index of the first set bit in range. (Egor Potemkin)
 
 New Features
 ---------------------
@@ -273,6 +286,11 @@ Improvements
 * GITHUB#13562: Add Intervals.regexp and Intervals.range methods to produce  IntervalsSource
   for regexp and range queries. (Mayya Sharipova)
 
+* GITHUB#13625: Remove BitSet#nextSetBit code duplication. (Greg Miller)
+
+* GITHUB#13285: Terminate graph searches of AbstractVectorSimilarityQuery early to honor the timeout set via
+  IndexSearcher#setTimeout(QueryTimeout). (Kaival Parikh)
+
 Optimizations
 ---------------------
 
@@ -300,6 +318,21 @@ Optimizations
 * GITHUB#13582: Stop requiring MaxScoreBulkScorer's outer window from having at
   least INNER_WINDOW_SIZE docs. (Adrien Grand)
 
+* GITHUB#13570, GITHUB#13574, GITHUB#13535: Avoid performance degradation with closing shared Arenas.
+  Closing many individual index files can potentially lead to a degradation in execution performance.
+  Index files are mmapped one-to-one with the JDK's foreign shared Arena. The JVM deoptimizes the top
+  few frames of all threads when closing a shared Arena (see JDK-8335480). We mitigate this situation
+  by 1) using a confined Arena where appropriate, and 2) grouping files from the same segment to a
+  single shared Arena. (Chris Hegarty, Michael Gibney, Uwe Schindler)
+
+* GITHUB#13585: Lucene912PostingsFormat, the new default postings format, now
+  only has 2 levels of skip data, which are inlined into postings instead of
+  being stored at the end of postings lists. This translates into better
+  performance for queries that need skipping such as conjunctions.
+  (Adrien Grand)
+
+* GITHUB#13581: OnHeapHnswGraph no longer allocates a lock for every graph node (Mike Sokolov)
+
 Changes in runtime behavior
 ---------------------
 
@@ -320,6 +353,9 @@ Bug Fixes
 * GITHUB#13553: Correct RamUsageEstimate for scalar quantized knn vector formats so that raw vectors are correctly
   accounted for. (Ben Trent)
 
+* GITHUB#13615: Correct scalar quantization when used in conjunction with COSINE similarity. Vectors are normalized
+  before quantization to ensure the cosine similarity is correctly calculated. (Ben Trent)
+
 Other
 --------------------
 (No changes)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@@ -31,6 +31,7 @@ class ModifyingSuggester {
   private final String misspelled;
   private final WordCase wordCase;
   private final FragmentChecker fragmentChecker;
+  private final boolean proceedPastRep;
   private final char[] tryChars;
   private final Hunspell speller;
 
@@ -39,13 +40,15 @@ class ModifyingSuggester {
       LinkedHashSet<Suggestion> result,
       String misspelled,
       WordCase wordCase,
-      FragmentChecker checker) {
+      FragmentChecker checker,
+      boolean proceedPastRep) {
     this.speller = speller;
     tryChars = speller.dictionary.tryChars.toCharArray();
     this.result = result;
     this.misspelled = misspelled;
     this.wordCase = wordCase;
     fragmentChecker = checker;
+    this.proceedPastRep = proceedPastRep;
   }
 
   /**
@@ -125,9 +128,9 @@ private boolean tryVariationsOf(String word) {
     boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
 
     GradedSuggestions repResult = tryRep(word);
-    if (repResult == GradedSuggestions.Best) return true;
+    if (repResult == GradedSuggestions.Best && !proceedPastRep) return true;
 
-    hasGoodSuggestions |= repResult == GradedSuggestions.Normal;
+    hasGoodSuggestions |= repResult != GradedSuggestions.None;
 
     if (!speller.dictionary.mapTable.isEmpty()) {
       enumerateMapReplacements(word, "", 0);

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Suggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Suggester.java
@@ -53,16 +53,21 @@ public class Suggester {
   private final Dictionary dictionary;
   private final SuggestibleEntryCache suggestibleCache;
   private final FragmentChecker fragmentChecker;
+  private final boolean proceedPastRep;
 
   public Suggester(Dictionary dictionary) {
-    this(dictionary, null, FragmentChecker.EVERYTHING_POSSIBLE);
+    this(dictionary, null, FragmentChecker.EVERYTHING_POSSIBLE, false);
   }
 
   private Suggester(
-      Dictionary dictionary, SuggestibleEntryCache suggestibleCache, FragmentChecker checker) {
+      Dictionary dictionary,
+      SuggestibleEntryCache suggestibleCache,
+      FragmentChecker checker,
+      boolean proceedPastRep) {
     this.dictionary = dictionary;
     this.suggestibleCache = suggestibleCache;
     this.fragmentChecker = checker;
+    this.proceedPastRep = proceedPastRep;
   }
 
   /**
@@ -71,16 +76,26 @@ private Suggester(
    * entries are stored as fast-to-iterate plain words instead of highly compressed prefix trees.
    */
   public Suggester withSuggestibleEntryCache() {
-    return new Suggester(
-        dictionary, SuggestibleEntryCache.buildCache(dictionary.words), fragmentChecker);
+    SuggestibleEntryCache cache = SuggestibleEntryCache.buildCache(dictionary.words);
+    return new Suggester(dictionary, cache, fragmentChecker, proceedPastRep);
   }
 
   /**
    * Returns a copy of this suggester instance with {@link FragmentChecker} hint that can improve
   * the performance of the "Modification" phase.
    */
   public Suggester withFragmentChecker(FragmentChecker checker) {
-    return new Suggester(dictionary, suggestibleCache, checker);
+    return new Suggester(dictionary, suggestibleCache, checker, proceedPastRep);
+  }
+
+  /**
+   * Returns a copy of this suggester instance that doesn't stop after encountering acceptable words
+   * after applying REP rules. By default, Hunspell stops when it finds any, but this behavior may
+   * not always be desirable, e.g., if we have "REP i ea", "tims" would be replaced only by "teams"
+   * and not "times", which could also be meant.
+   */
+  public Suggester proceedPastRep() {
+    return new Suggester(dictionary, suggestibleCache, fragmentChecker, true);
   }
 
   /**
@@ -174,7 +189,8 @@ Root<CharsRef> findStem(
     }
 
     boolean hasGoodSuggestions =
-        new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase, fragmentChecker)
+        new ModifyingSuggester(
+                suggestionSpeller, suggestions, word, wordCase, fragmentChecker, proceedPastRep)
             .suggest();
 
     if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {

diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
@@ -59,6 +59,14 @@ public void testAllcaps() throws Exception {
 
   public void testRepSuggestions() throws Exception {
     doTest("rep");
+
+    //noinspection DataFlowIssue
+    Path aff = Path.of(getClass().getResource("rep.aff").toURI());
+    Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
+    Suggester suggester = new Suggester(dictionary);
+    assertEquals(List.of("auto's"), suggester.suggestNoTimeout("autos", () -> {}));
+    assertEquals(
+        List.of("auto's", "auto"), suggester.proceedPastRep().suggestNoTimeout("autos", () -> {}));
   }
 
   public void testPhSuggestions() throws Exception {

diff --git a/lucene/backward-codecs/src/generated/checksums/generateForUtil99.json b/lucene/backward-codecs/src/generated/checksums/generateForUtil99.json
@@ -0,0 +1,4 @@
+{
+    "lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/ForUtil.java": "f31797842f047626df6a1a6b97167bec60269fec",
+    "lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/gen_ForUtil.py": "325f2610974b0e76e278b6445405a098a3763feb"
+}
diff --git a/lucene/backward-codecs/src/java/module-info.java b/lucene/backward-codecs/src/java/module-info.java
@@ -35,6 +35,7 @@
   exports org.apache.lucene.backward_codecs.lucene92;
   exports org.apache.lucene.backward_codecs.lucene94;
   exports org.apache.lucene.backward_codecs.lucene95;
+  exports org.apache.lucene.backward_codecs.lucene99;
   exports org.apache.lucene.backward_codecs.packed;
   exports org.apache.lucene.backward_codecs.store;
 
@@ -43,7 +44,8 @@
   provides org.apache.lucene.codecs.PostingsFormat with
       org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat,
       org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat,
-      org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat;
+      org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat,
+      org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat;
   provides org.apache.lucene.codecs.KnnVectorsFormat with
       org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat,
       org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsFormat,
@@ -59,5 +61,6 @@
       org.apache.lucene.backward_codecs.lucene91.Lucene91Codec,
       org.apache.lucene.backward_codecs.lucene92.Lucene92Codec,
       org.apache.lucene.backward_codecs.lucene94.Lucene94Codec,
-      org.apache.lucene.backward_codecs.lucene95.Lucene95Codec;
+      org.apache.lucene.backward_codecs.lucene95.Lucene95Codec,
+      org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
 }
diff --git a/.../lucene/codecs/lucene99/ForDeltaUtil.java → ...ackward_codecs/lucene99/ForDeltaUtil.java b/.../lucene/codecs/lucene99/ForDeltaUtil.java → ...ackward_codecs/lucene99/ForDeltaUtil.java
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene99;
+package org.apache.lucene.backward_codecs.lucene99;
 
 import java.io.IOException;
 import org.apache.lucene.store.DataInput;