Skip to content

Commit

Permalink
add _sort to DocValues (#317)
Browse files: browse the repository at this point in the history
* add _sort to DocValues
* Sort start/end using genomicPosition values for contig-aware sorting

---------

Co-authored-by: Sebastian Benjamin <[email protected]>
  • Loading branch information
hextraza and Sebastian Benjamin authored May 6, 2024
1 parent 49cbf2b commit 23d6f44
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 27 deletions.
45 changes: 22 additions & 23 deletions src/main/java/com/github/discvrseq/walkers/VcfToLuceneIndexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,8 @@
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
Expand Down Expand Up @@ -104,7 +102,7 @@ public void onTraversalStart() {
}

IndexWriterConfig config = new IndexWriterConfig(analyzer);
config.setIndexSort(new Sort(new SortField("genomicPosition", SortField.Type.INT, false)));
config.setIndexSort(new Sort(new SortField("genomicPosition_sort", SortField.Type.INT, false)));

try {
writer = new IndexWriter(index, config);
Expand Down Expand Up @@ -245,26 +243,27 @@ else if (line.getCountType() == VCFHeaderLineCount.INTEGER || line.getCountType(

// Add standard fields
doc.add(new TextField("contig", variant.getContig(), Field.Store.YES));
doc.add(new SortedDocValuesField("contig", new BytesRef(variant.getContig())));
doc.add(new SortedDocValuesField("contig_sort", new BytesRef(variant.getContig())));

doc.add(new TextField("ref", variant.getReference().getDisplayString(), Field.Store.YES));
doc.add(new SortedDocValuesField("ref", new BytesRef(variant.getReference().getDisplayString())));
doc.add(new SortedDocValuesField("ref_sort", new BytesRef(variant.getReference().getDisplayString())));

doc.add(new TextField("alt", alt.getDisplayString(), Field.Store.YES));
doc.add(new SortedDocValuesField("alt", new BytesRef(alt.getDisplayString())));
doc.add(new SortedDocValuesField("alt_sort", new BytesRef(alt.getDisplayString())));

final int genomicPositionStart = getGenomicPosition(variant.getContig(), variant.getStart());
doc.add(new IntPoint("start", variant.getStart()));
doc.add(new StoredField("start", variant.getStart()));
doc.add(new NumericDocValuesField("start", variant.getStart()));
doc.add(new NumericDocValuesField("start_sort", genomicPositionStart));

final int genomicPositionEnd = getGenomicPosition(variant.getContig(), variant.getEnd());
doc.add(new IntPoint("end", variant.getEnd()));
doc.add(new StoredField("end", variant.getEnd()));
doc.add(new NumericDocValuesField("end", variant.getEnd()));
doc.add(new NumericDocValuesField("end_sort", genomicPositionEnd));

final int genomicPosition = getGenomicPosition(variant.getContig(), variant.getStart());
doc.add(new IntPoint("genomicPosition", genomicPosition));
doc.add(new StoredField("genomicPosition", genomicPosition));
doc.add(new NumericDocValuesField("genomicPosition", genomicPosition));
doc.add(new IntPoint("genomicPosition", genomicPositionStart));
doc.add(new StoredField("genomicPosition", genomicPositionStart));
doc.add(new NumericDocValuesField("genomicPosition_sort", genomicPositionStart));

if (variant.hasGenotypes()) {
AtomicReference<String> docValue = new AtomicReference<>(null);
Expand All @@ -278,7 +277,7 @@ else if (line.getCountType() == VCFHeaderLineCount.INTEGER || line.getCountType(
});

if (docValue.get() != null) {
doc.add(new SortedDocValuesField("variableSamples", new BytesRef(docValue.get())));
doc.add(new SortedDocValuesField("variableSamples_sort", new BytesRef(docValue.get())));
docValue.set(null);
}

Expand All @@ -291,29 +290,29 @@ else if (line.getCountType() == VCFHeaderLineCount.INTEGER || line.getCountType(
});

if (docValue.get() != null) {
doc.add(new SortedDocValuesField("homozygousVarSamples", new BytesRef(docValue.get())));
doc.add(new SortedDocValuesField("homozygousVarSamples_sort", new BytesRef(docValue.get())));
docValue.set(null);
}

long nHet = variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall() && g.getAlleles().contains(alt) && g.isHet()).count();
doc.add(new IntPoint("nHet", (int)nHet));
doc.add(new StoredField("nHet", (int)nHet));
doc.add(new NumericDocValuesField("nHet", (int)nHet));
doc.add(new NumericDocValuesField("nHet_sort", (int)nHet));

long nHomVar = variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall() && g.getAlleles().contains(alt) && g.isHomVar()).count();
doc.add(new IntPoint("nHomVar", (int)nHomVar));
doc.add(new StoredField("nHomVar", (int)nHomVar));
doc.add(new NumericDocValuesField("nHomVar", (int)nHomVar));
doc.add(new NumericDocValuesField("nHomVar_sort", (int)nHomVar));

long nCalled = variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall()).count();
doc.add(new IntPoint("nCalled", (int)nCalled));
doc.add(new StoredField("nCalled", (int)nCalled));
doc.add(new NumericDocValuesField("nCalled", (int)nCalled));
doc.add(new NumericDocValuesField("nCalled_sort", (int)nCalled));

float fractionHet = (float) nHet / (float) (nHet + nHomVar);
doc.add(new DoublePoint("fractionHet", fractionHet));
doc.add(new StoredField("fractionHet", fractionHet));
doc.add(new NumericDocValuesField("fractionHet", NumericUtils.doubleToSortableLong(fractionHet)));
doc.add(new NumericDocValuesField("fractionHet_sort", NumericUtils.doubleToSortableLong(fractionHet)));
}

try {
Expand Down Expand Up @@ -408,7 +407,7 @@ synchronized private void addFieldToDocument(Document doc, VCFHeaderLineType var
doc.add(new StringField(key, String.valueOf(value), Field.Store.YES));

if (indexDocValue.get()) {
doc.add(new SortedDocValuesField(key, new BytesRef(String.valueOf(value))));
doc.add(new SortedDocValuesField(key + "_sort", new BytesRef(String.valueOf(value))));
indexDocValue.set(false);
}
}
Expand All @@ -417,7 +416,7 @@ synchronized private void addFieldToDocument(Document doc, VCFHeaderLineType var
doc.add(new IntPoint(key, x));

if (indexDocValue.get()) {
doc.add(new NumericDocValuesField(key, x));
doc.add(new NumericDocValuesField(key + "_sort", x));
indexDocValue.set(false);
}
}
Expand All @@ -436,7 +435,7 @@ synchronized private void addFieldToDocument(Document doc, VCFHeaderLineType var
});

if (docValue.get() != null && indexDocValue.get()) {
doc.add(new NumericDocValuesField(key, NumericUtils.doubleToSortableLong(docValue.get())));
doc.add(new NumericDocValuesField(key + "_sort", NumericUtils.doubleToSortableLong(docValue.get())));
indexDocValue.set(false);
}
}
Expand All @@ -455,7 +454,7 @@ synchronized private void addFieldToDocument(Document doc, VCFHeaderLineType var
});

if (docValue.get() != null && indexDocValue.get()) {
doc.add(new NumericDocValuesField(key, docValue.get()));
doc.add(new NumericDocValuesField(key + "_sort", docValue.get()));
indexDocValue.set(false);
}
}
Expand All @@ -464,7 +463,7 @@ synchronized private void addFieldToDocument(Document doc, VCFHeaderLineType var
doc.add(new TextField(key, String.valueOf(value), Field.Store.YES));

if (indexDocValue.get()) {
doc.add(new SortedDocValuesField(key, new BytesRef(String.valueOf(value))));
doc.add(new SortedDocValuesField(key +"_sort", new BytesRef(String.valueOf(value))));
indexDocValue.set(false);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ public void doExtendedTest() throws Exception {
Assert.assertEquals(topDocs.totalHits.value, 1L);

// Top 6 hits are sorted by genomicPosition
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("genomicPosition", SortField.Type.INT)));
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("genomicPosition_sort", SortField.Type.INT)));
Assert.assertEquals(6, topDocs.scoreDocs.length);

int lastGenomicPosition = -1;
Expand All @@ -485,7 +485,7 @@ public void doExtendedTest() throws Exception {
}

// Results are sorted by REFFIELD
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("REFFIELD", SortField.Type.STRING)));
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("REFFIELD_sort", SortField.Type.STRING)));
Assert.assertEquals(6, topDocs.scoreDocs.length);

String lastRefField = null;
Expand All @@ -499,7 +499,7 @@ public void doExtendedTest() throws Exception {
}

// Results are sorted by start
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("start", SortField.Type.INT)));
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("start_sort", SortField.Type.INT)));
Assert.assertEquals(6, topDocs.scoreDocs.length);

int lastStart = -1;
Expand All @@ -511,7 +511,7 @@ public void doExtendedTest() throws Exception {
}

// Results are sorted by HaplotypeScore
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("HaplotypeScore", SortField.Type.DOUBLE)));
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("HaplotypeScore_sort", SortField.Type.DOUBLE)));
Assert.assertEquals(6, topDocs.scoreDocs.length);

float lastHaplotypeScore = -1.0f;
Expand Down

0 comments on commit 23d6f44

Please sign in to comment.