Skip to content

Commit

Permalink
Merge branch 'main' into refactor-knn
Browse files Browse the repository at this point in the history
  • Loading branch information
dungba88 authored Aug 6, 2024
2 parents b7f9680 + e0e5d81 commit 7dd5b40
Show file tree
Hide file tree
Showing 142 changed files with 9,283 additions and 1,299 deletions.
27 changes: 26 additions & 1 deletion gradle/generation/forUtil.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ configure(project(":lucene:core")) {
description "Regenerate gen_ForUtil.py"
group "generation"

def genDir = file("src/java/org/apache/lucene/codecs/lucene99")
def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
def genScript = file("${genDir}/gen_ForUtil.py")
def genOutput = file("${genDir}/ForUtil.java")

Expand Down Expand Up @@ -96,5 +96,30 @@ configure(project(":lucene:backward-codecs")) {
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])

// Runs the gen_ForUtil.py generator that sits next to the backward-codecs lucene99 ForUtil.java.
// NOTE(review): the description below names the script, while the declared output of the task is
// ForUtil.java - confirm the wording is intended.
task generateForUtil99Internal() {
description "Regenerate gen_ForUtil.py"
group "generation"

// The generator script and the declared output file live in the same package directory.
def genDir = file("src/java/org/apache/lucene/backward_codecs/lucene99")
def genScript = file("${genDir}/gen_ForUtil.py")
def genOutput = file("${genDir}/ForUtil.java")

// Register the script as the task input and the Java file as the task output.
inputs.file genScript
outputs.file genOutput

doLast {
// Invoke the script with python3 from inside genDir; -B keeps Python from writing .pyc files.
quietExec {
workingDir genDir
executable project.externalTool("python3")
args = [ '-B', genScript ]
}
}
}

regenerate.dependsOn wrapWithPersistentChecksums(generateForUtil99Internal, [
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])
}

40 changes: 38 additions & 2 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ API Changes

* GITHUB#13410: Removed Scorer#getWeight (Sanjay Dutt, Adrien Grand)

* GITHUB#13499: Remove deprecated TopScoreDocCollector + TopFieldCollector methods (#create, #createSharedManager) (Jakub Slowinski)


New Features
---------------------

Expand All @@ -134,6 +137,8 @@ New Features
* GITHUB#13597: Align doc value skipper interval boundaries when an interval contains a constant
value. (Ignacio Vera)

* GITHUB#13604: Add Kmeans clustering on vectors (Mayya Sharipova, Jim Ferenczi, Tom Veasey)

Improvements
---------------------

Expand Down Expand Up @@ -244,14 +249,22 @@ Other

* GITHUB#13332: Improve MissingDoclet linter to check records correctly. (Uwe Schindler)

* GITHUB#13499: Remove usage of TopScoreDocCollector + TopFieldCollector deprecated methods (#create, #createSharedManager) (Jakub Slowinski)

======================== Lucene 9.12.0 =======================

API Changes
---------------------

* GITHUB#13281: Mark COSINE VectorSimilarityFunction as deprecated. (Pulkit Gupta)
* GITHUB#13469: Expose FlatVectorsFormat as a first-class format; can be configured using a custom Codec. (Michael Sokolov)

* GITHUB#13469: Expose FlatVectorsFormat as a first-class format; can be configured using a custom Codec. (Michael Sokolov)
* GITHUB#13612: Hunspell: add Suggester#proceedPastRep to avoid losing relevant suggestions. (Peter Gromov)

* GITHUB#13603: Introduced `IndexSearcher#searchLeaf(LeafReaderContext, Weight, Collector)` protected method to
facilitate customizing per-leaf behavior of search without having to override
`search(LeafReaderContext[], Weight, Collector)`, which requires overriding the entire loop across the leaves (Luca Cavanna)

* GITHUB#13559: Add BitSet#nextSetBit(int, int) to get the index of the first set bit in range. (Egor Potemkin)

New Features
---------------------
Expand All @@ -273,6 +286,11 @@ Improvements
* GITHUB#13562: Add Intervals.regexp and Intervals.range methods to produce IntervalsSource
for regexp and range queries. (Mayya Sharipova)

* GITHUB#13625: Remove BitSet#nextSetBit code duplication. (Greg Miller)

* GITHUB#13285: Early terminate graph searches of AbstractVectorSimilarityQuery to follow timeout set from
IndexSearcher#setTimeout(QueryTimeout). (Kaival Parikh)

Optimizations
---------------------

Expand Down Expand Up @@ -300,6 +318,21 @@ Optimizations
* GITHUB#13582: Stop requiring MaxScoreBulkScorer's outer window from having at
least INNER_WINDOW_SIZE docs. (Adrien Grand)

* GITHUB#13570, GITHUB#13574, GITHUB#13535: Avoid performance degradation with closing shared Arenas.
Closing many individual index files can potentially lead to a degradation in execution performance.
Index files are mmapped one-to-one with the JDK's foreign shared Arena. The JVM deoptimizes the top
few frames of all threads when closing a shared Arena (see JDK-8335480). We mitigate this situation
by 1) using a confined Arena where appropriate, and 2) grouping files from the same segment to a
single shared Arena. (Chris Hegarty, Michael Gibney, Uwe Schindler)

* GITHUB#13585: Lucene912PostingsFormat, the new default postings format, now
only has 2 levels of skip data, which are inlined into postings instead of
being stored at the end of postings lists. This translates into better
performance for queries that need skipping such as conjunctions.
(Adrien Grand)

* GITHUB#13581: OnHeapHnswGraph no longer allocates a lock for every graph node (Mike Sokolov)

Changes in runtime behavior
---------------------

Expand All @@ -320,6 +353,9 @@ Bug Fixes
* GITHUB#13553: Correct RamUsageEstimate for scalar quantized knn vector formats so that raw vectors are correctly
accounted for. (Ben Trent)

* GITHUB#13615: Correct scalar quantization when used in conjunction with COSINE similarity. Vectors are normalized
before quantization to ensure the cosine similarity is correctly calculated. (Ben Trent)

Other
--------------------
(No changes)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class ModifyingSuggester {
private final String misspelled;
private final WordCase wordCase;
private final FragmentChecker fragmentChecker;
private final boolean proceedPastRep;
private final char[] tryChars;
private final Hunspell speller;

Expand All @@ -39,13 +40,15 @@ class ModifyingSuggester {
LinkedHashSet<Suggestion> result,
String misspelled,
WordCase wordCase,
FragmentChecker checker) {
FragmentChecker checker,
boolean proceedPastRep) {
this.speller = speller;
tryChars = speller.dictionary.tryChars.toCharArray();
this.result = result;
this.misspelled = misspelled;
this.wordCase = wordCase;
fragmentChecker = checker;
this.proceedPastRep = proceedPastRep;
}

/**
Expand Down Expand Up @@ -125,9 +128,9 @@ private boolean tryVariationsOf(String word) {
boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));

GradedSuggestions repResult = tryRep(word);
if (repResult == GradedSuggestions.Best) return true;
if (repResult == GradedSuggestions.Best && !proceedPastRep) return true;

hasGoodSuggestions |= repResult == GradedSuggestions.Normal;
hasGoodSuggestions |= repResult != GradedSuggestions.None;

if (!speller.dictionary.mapTable.isEmpty()) {
enumerateMapReplacements(word, "", 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,16 +53,21 @@ public class Suggester {
private final Dictionary dictionary;
private final SuggestibleEntryCache suggestibleCache;
private final FragmentChecker fragmentChecker;
private final boolean proceedPastRep;

public Suggester(Dictionary dictionary) {
this(dictionary, null, FragmentChecker.EVERYTHING_POSSIBLE);
this(dictionary, null, FragmentChecker.EVERYTHING_POSSIBLE, false);
}

private Suggester(
Dictionary dictionary, SuggestibleEntryCache suggestibleCache, FragmentChecker checker) {
Dictionary dictionary,
SuggestibleEntryCache suggestibleCache,
FragmentChecker checker,
boolean proceedPastRep) {
this.dictionary = dictionary;
this.suggestibleCache = suggestibleCache;
this.fragmentChecker = checker;
this.proceedPastRep = proceedPastRep;
}

/**
Expand All @@ -71,16 +76,26 @@ private Suggester(
* entries are stored as fast-to-iterate plain words instead of highly compressed prefix trees.
*/
public Suggester withSuggestibleEntryCache() {
return new Suggester(
dictionary, SuggestibleEntryCache.buildCache(dictionary.words), fragmentChecker);
SuggestibleEntryCache cache = SuggestibleEntryCache.buildCache(dictionary.words);
return new Suggester(dictionary, cache, fragmentChecker, proceedPastRep);
}

/**
* Returns a copy of this suggester instance with {@link FragmentChecker} hint that can improve
* the performance of the "Modification" phase.
*/
public Suggester withFragmentChecker(FragmentChecker checker) {
return new Suggester(dictionary, suggestibleCache, checker);
return new Suggester(dictionary, suggestibleCache, checker, proceedPastRep);
}

/**
* Returns a copy of this suggester instance that doesn't stop after encountering acceptable words
* after applying REP rules. By default, Hunspell stops when it finds any, but this behavior may
* not always be desirable, e.g., if we have "REP i ea", "tims" would be replaced only by "teams"
* and not "times", which could also be meant.
*
* @return a new {@link Suggester} sharing this instance's dictionary, suggestible-entry cache and
*     fragment checker, with the proceed-past-REP flag enabled
*/
public Suggester proceedPastRep() {
return new Suggester(dictionary, suggestibleCache, fragmentChecker, true);
}

/**
Expand Down Expand Up @@ -174,7 +189,8 @@ Root<CharsRef> findStem(
}

boolean hasGoodSuggestions =
new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase, fragmentChecker)
new ModifyingSuggester(
suggestionSpeller, suggestions, word, wordCase, fragmentChecker, proceedPastRep)
.suggest();

if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,14 @@ public void testAllcaps() throws Exception {

/**
* Runs the shared {@code doTest("rep")} check, then loads the {@code rep.aff} resource directly
* and compares the suggestions for "autos" from a default {@link Suggester} with those from a
* copy obtained through {@code proceedPastRep()}.
*/
public void testRepSuggestions() throws Exception {
doTest("rep");

//noinspection DataFlowIssue
Path aff = Path.of(getClass().getResource("rep.aff").toURI());
Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
Suggester suggester = new Suggester(dictionary);
// Default suggester: exactly one suggestion is expected.
assertEquals(List.of("auto's"), suggester.suggestNoTimeout("autos", () -> {}));
// After proceedPastRep(): the same first suggestion, followed by an additional one.
assertEquals(
List.of("auto's", "auto"), suggester.proceedPastRep().suggestNoTimeout("autos", () -> {}));
}

public void testPhSuggestions() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/ForUtil.java": "f31797842f047626df6a1a6b97167bec60269fec",
"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/gen_ForUtil.py": "325f2610974b0e76e278b6445405a098a3763feb"
}
7 changes: 5 additions & 2 deletions lucene/backward-codecs/src/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
exports org.apache.lucene.backward_codecs.lucene92;
exports org.apache.lucene.backward_codecs.lucene94;
exports org.apache.lucene.backward_codecs.lucene95;
exports org.apache.lucene.backward_codecs.lucene99;
exports org.apache.lucene.backward_codecs.packed;
exports org.apache.lucene.backward_codecs.store;

Expand All @@ -43,7 +44,8 @@
provides org.apache.lucene.codecs.PostingsFormat with
org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat,
org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat,
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat;
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat,
org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat;
provides org.apache.lucene.codecs.KnnVectorsFormat with
org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat,
org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsFormat,
Expand All @@ -59,5 +61,6 @@
org.apache.lucene.backward_codecs.lucene91.Lucene91Codec,
org.apache.lucene.backward_codecs.lucene92.Lucene92Codec,
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec,
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec;
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec,
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;

import java.io.IOException;
import org.apache.lucene.store.DataInput;
Expand Down
Loading

0 comments on commit 7dd5b40

Please sign in to comment.