Skip to content

Commit

Permalink
Add auto-detection of Lucene 8 indexes (#1965)
Browse files Browse the repository at this point in the history
+ Instead of needing an explicit flag in SearchCollection, the class automatically detects a 
  Lucene 8 index and disables consistent tie-breaking.
+ Add test cases.
+ Fix misspelling in index name.
  • Loading branch information
lintool authored Aug 20, 2022
1 parent 4625f89 commit 5480dc8
Show file tree
Hide file tree
Showing 46 changed files with 72 additions and 10 deletions.
3 changes: 0 additions & 3 deletions src/main/java/io/anserini/search/SearchArgs.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,6 @@ public class SearchArgs {
@Option(name = "-topicreader", required = true, usage = "TopicReader to use.")
public String topicReader;

@Option(name = "-lucene8", usage = "Enable Lucene 8 index compatibility.")
public Boolean lucene8 = false;

// optional arguments
@Option(name = "-querygenerator", usage = "QueryGenerator to use.")
public String queryGenerator = "BagOfWordsQueryGenerator";
Expand Down
7 changes: 4 additions & 3 deletions src/main/java/io/anserini/search/SearchCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -494,9 +494,10 @@ public SearchCollection(SearchArgs args) throws IOException {
loadQrels(args.rf_qrels);
}

// See https://github.com/castorini/anserini/issues/1952
// The solution to the issue described above is to turn off deterministic tie-breaking.
if (args.lucene8) {
// Fix for index compatibility issue between Lucene 8 and 9: https://github.com/castorini/anserini/issues/1952
// If we detect an older index version, we turn off consistent tie-breaking, which avoids accessing docvalues,
// which is the source of the incompatibility.
if (!reader.toString().contains("lucene.version=9")) {
args.arbitraryScoreTieBreak = true;
args.axiom_deterministic = false;
}
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/io/anserini/search/SimpleImpactSearcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ public SimpleImpactSearcher(String indexDir) throws IOException {
// Fix for index compatibility issue between Lucene 8 and 9: https://github.com/castorini/anserini/issues/1952
// If we detect an older index version, we turn off consistent tie-breaking, which avoids accessing docvalues,
// which is the source of the incompatibility.
this.backwardsCompatibilityLucene8 = !reader.toString().contains("Lucene9");
this.backwardsCompatibilityLucene8 = !reader.toString().contains("lucene.version=9");

// Default to using ImpactSimilarity.
this.similarity = new ImpactSimilarity();
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/io/anserini/search/SimpleSearcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ public SimpleSearcher(String indexDir, Analyzer analyzer) throws IOException {
// Fix for index compatibility issue between Lucene 8 and 9: https://github.com/castorini/anserini/issues/1952
// If we detect an older index version, we turn off consistent tie-breaking, which avoids accessing docvalues,
// which is the source of the incompatibility.
this.backwardsCompatibilityLucene8 = !reader.toString().contains("Lucene9");
this.backwardsCompatibilityLucene8 = !reader.toString().contains("lucene.version=9");

// Default to using BM25.
this.similarity = new BM25Similarity(Float.parseFloat(defaults.bm25_k1[0]), Float.parseFloat(defaults.bm25_b[0]));
Expand Down
3 changes: 3 additions & 0 deletions src/test/java/io/anserini/IndexerTestBase.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@

import java.io.IOException;
import java.nio.file.Path;
import java.util.Locale;

public class IndexerTestBase extends LuceneTestCase {
protected Path tempDir1;
Expand Down Expand Up @@ -97,6 +98,8 @@ private void buildTestIndex() throws IOException {
// Per-test fixture setup: runs the superclass setup, pins the default locale, then prepares the test index.
public void setUp() throws Exception {
super.setUp();

// Force the JVM-wide default locale to US.
// NOTE(review): presumably this keeps locale-sensitive output (e.g. formatted scores) stable across
// environments -- confirm against the tests that depend on it.
Locale.setDefault(Locale.US);

// Allocate a temp directory and then call buildTestIndex().
// NOTE(review): buildTestIndex() presumably writes the index into tempDir1 -- its body is not visible here.
tempDir1 = createTempDir();
buildTestIndex();
}
Expand Down
61 changes: 61 additions & 0 deletions src/test/java/io/anserini/search/SearchCollectionTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,15 @@

import org.junit.Test;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Locale;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

public class SearchCollectionTest {
Expand Down Expand Up @@ -103,4 +109,59 @@ public void testMutallyExclusive() throws Exception {

restoreStderr();
}

/**
 * Runs {@code SearchCollection} end-to-end against the prebuilt Lucene 9 sample indexes and compares the
 * resulting run file with the expected ranking lines: first a {@code -bm25} run over the TREC sample index,
 * then a {@code -pretokenized -impact} run over the tokenized JSON sample index.
 *
 * <p>Each run writes to {@code run.test}; the file is removed in a {@code finally} block so that a failing
 * assertion does not leave a stale run file behind.
 *
 * @throws Exception if searching or reading the run file fails
 */
@Test
public void testSearchLucene9() throws Exception {
  try {
    SearchCollection.main(
        new String[] {"-index", "src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_collection2/",
            "-topics", "src/test/resources/sample_topics/Trec",
            "-topicreader", "Trec", "-output", "run.test", "-bm25"});
    check("run.test", new String[]{
        "1 Q0 DOC222 1 0.343200 Anserini",
        "1 Q0 TREC_DOC_1 2 0.333400 Anserini",
        "1 Q0 WSJ_1 3 0.068700 Anserini"});
  } finally {
    // Clean up even when the search or the comparison above throws.
    new File("run.test").delete();
  }

  try {
    SearchCollection.main(
        new String[] {"-index", "src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/",
            "-topics", "src/test/resources/sample_topics/json_topics1.tsv",
            "-topicreader", "TsvInt", "-output", "run.test", "-pretokenized", "-impact"});
    check("run.test", new String[]{
        "1 Q0 2000001 1 4.000000 Anserini",});
  } finally {
    // Clean up even when the search or the comparison above throws.
    new File("run.test").delete();
  }
}

/**
 * Runs {@code SearchCollection} end-to-end against the prebuilt Lucene 8 sample indexes, without passing any
 * Lucene 8 specific flag, and compares the resulting run file with the expected ranking lines: first a
 * {@code -bm25} run over the TREC sample index, then a {@code -pretokenized -impact} run over the tokenized
 * JSON sample index.
 *
 * <p>Each run writes to {@code run.test}; the file is removed in a {@code finally} block so that a failing
 * assertion does not leave a stale run file behind.
 *
 * @throws Exception if searching or reading the run file fails
 */
@Test
public void testSearchLucene8() throws Exception {
  try {
    SearchCollection.main(
        new String[] {"-index", "src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_collection2/",
            "-topics", "src/test/resources/sample_topics/Trec",
            "-topicreader", "Trec", "-output", "run.test", "-bm25"});
    check("run.test", new String[]{
        "1 Q0 DOC222 1 0.343192 Anserini",
        "1 Q0 TREC_DOC_1 2 0.333445 Anserini",
        "1 Q0 WSJ_1 3 0.068654 Anserini"});
  } finally {
    // Clean up even when the search or the comparison above throws.
    new File("run.test").delete();
  }

  try {
    SearchCollection.main(
        new String[] {"-index", "src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/",
            "-topics", "src/test/resources/sample_topics/json_topics1.tsv",
            "-topicreader", "TsvInt", "-output", "run.test", "-pretokenized", "-impact"});
    check("run.test", new String[]{
        "1 Q0 2000001 1 4.000000 Anserini",});
  } finally {
    // Clean up even when the search or the comparison above throws.
    new File("run.test").delete();
  }
}

/**
 * Asserts that the file at {@code output} consists of exactly the lines in {@code ref}, in the same order.
 *
 * @param output path of the run file to read
 * @param ref expected contents, one array element per line of the file
 * @throws IOException if the file cannot be opened or read
 */
protected void check(String output, String[] ref) throws IOException {
  int cnt = 0;

  // try-with-resources closes the reader even when an assertion fails part-way through the file.
  try (BufferedReader br = new BufferedReader(new FileReader(output))) {
    String s;
    while ((s = br.readLine()) != null) {
      // Compare only while a reference line exists; surplus lines are reported by the count
      // assertion below instead of surfacing as an ArrayIndexOutOfBoundsException.
      if (cnt < ref.length) {
        assertEquals(ref[cnt], s);
      }
      cnt++;
    }
  }

  // Expected value first, so a failure message reads "expected:<ref.length> but was:<cnt>".
  assertEquals(ref.length, cnt);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ public class SimpleSearcherPrebuiltLucene8Test {
@Test
public void testSearch1() throws Exception {
SimpleSearcher searcher =
new SimpleSearcher("src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2");
new SimpleSearcher("src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_collection2");
assertEquals(3, searcher.get_total_num_docs());

SimpleSearcher.Result[] hits;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ public class SimpleSearcherPrebuiltLucene9Test {
@Test
public void testSearch1() throws Exception {
SimpleSearcher searcher =
new SimpleSearcher("src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2");
new SimpleSearcher("src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_collection2");
assertEquals(3, searcher.get_total_num_docs());

SimpleSearcher.Result[] hits;
Expand Down

0 comments on commit 5480dc8

Please sign in to comment.