Skip to content

Commit

Permalink
Add sort fields every time we doc.add a value (#305)
Browse files Browse the repository at this point in the history
* Add sort fields every time we doc.add a value

---------

Co-authored-by: Sebastian Benjamin <[email protected]>
  • Loading branch information
bbimber and Sebastian Benjamin authored Mar 25, 2024
1 parent e67f88f commit 40912da
Show file tree
Hide file tree
Showing 2 changed files with 162 additions and 12 deletions.
103 changes: 95 additions & 8 deletions src/main/java/com/github/discvrseq/walkers/VcfToLuceneIndexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,13 @@
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
Expand All @@ -33,6 +36,8 @@
import java.util.concurrent.Callable;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;

/**
Expand Down Expand Up @@ -99,7 +104,7 @@ public void onTraversalStart() {
}

IndexWriterConfig config = new IndexWriterConfig(analyzer);
config.setIndexSort(new Sort(new SortedNumericSortField("genomicPosition", SortField.Type.INT, false)));
config.setIndexSort(new Sort(new SortField("genomicPosition", SortField.Type.INT, false)));

try {
writer = new IndexWriter(index, config);
Expand Down Expand Up @@ -240,39 +245,75 @@ else if (line.getCountType() == VCFHeaderLineCount.INTEGER || line.getCountType(

// Add standard fields
doc.add(new TextField("contig", variant.getContig(), Field.Store.YES));
doc.add(new SortedDocValuesField("contig", new BytesRef(variant.getContig())));

doc.add(new TextField("ref", variant.getReference().getDisplayString(), Field.Store.YES));
doc.add(new SortedDocValuesField("ref", new BytesRef(variant.getReference().getDisplayString())));

doc.add(new TextField("alt", alt.getDisplayString(), Field.Store.YES));
doc.add(new SortedDocValuesField("alt", new BytesRef(alt.getDisplayString())));

doc.add(new IntPoint("start", variant.getStart()));
doc.add(new StoredField("start", variant.getStart()));
doc.add(new NumericDocValuesField("start", variant.getStart()));

doc.add(new IntPoint("end", variant.getEnd()));
doc.add(new StoredField("end", variant.getEnd()));
doc.add(new NumericDocValuesField("end", variant.getEnd()));

final int genomicPosition = getGenomicPosition(variant.getContig(), variant.getStart());
doc.add(new IntPoint("genomicPosition", genomicPosition));
doc.add(new StoredField("genomicPosition", genomicPosition));
doc.add(new SortedNumericDocValuesField("genomicPosition", genomicPosition));
doc.add(new NumericDocValuesField("genomicPosition", genomicPosition));

if (variant.hasGenotypes()) {
variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall() && g.getAlleles().contains(alt)).map(Genotype::getSampleName).sorted().forEach(sample -> doc.add(new TextField("variableSamples", sample, Field.Store.YES)));
variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall() && g.getAlleles().contains(alt) && g.isHomVar()).map(Genotype::getSampleName).sorted().forEach(sample -> doc.add(new TextField("homozygousVarSamples", sample, Field.Store.YES)));
AtomicReference<String> docValue = new AtomicReference<>(null);

variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall() && g.getAlleles().contains(alt)).map(Genotype::getSampleName).sorted().forEach(sample -> {
doc.add(new TextField("variableSamples", sample, Field.Store.YES));

if (docValue.get() == null) {
docValue.set(sample);
}
});

if (docValue.get() != null) {
doc.add(new SortedDocValuesField("variableSamples", new BytesRef(docValue.get())));
docValue.set(null);
}

variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall() && g.getAlleles().contains(alt) && g.isHomVar()).map(Genotype::getSampleName).sorted().forEach(sample -> {
doc.add(new TextField("homozygousVarSamples", sample, Field.Store.YES));

if (docValue.get() == null) {
docValue.set(sample);
}
});

if (docValue.get() != null) {
doc.add(new SortedDocValuesField("homozygousVarSamples", new BytesRef(docValue.get())));
docValue.set(null);
}

long nHet = variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall() && g.getAlleles().contains(alt) && g.isHet()).count();
doc.add(new IntPoint("nHet", (int)nHet));
doc.add(new StoredField("nHet", (int)nHet));
doc.add(new NumericDocValuesField("nHet", (int)nHet));

long nHomVar = variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall() && g.getAlleles().contains(alt) && g.isHomVar()).count();
doc.add(new IntPoint("nHomVar", (int)nHomVar));
doc.add(new StoredField("nHomVar", (int)nHomVar));
doc.add(new NumericDocValuesField("nHomVar", (int)nHomVar));

long nCalled = variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall()).count();
doc.add(new IntPoint("nCalled", (int)nCalled));
doc.add(new StoredField("nCalled", (int)nCalled));
doc.add(new NumericDocValuesField("nCalled", (int)nCalled));

float fractionHet = (float) nHet / (float) (nHet + nHomVar);
doc.add(new FloatPoint("fractionHet", fractionHet));
doc.add(new DoublePoint("fractionHet", fractionHet));
doc.add(new StoredField("fractionHet", fractionHet));
doc.add(new NumericDocValuesField("fractionHet", NumericUtils.doubleToSortableLong(fractionHet)));
}

try {
Expand Down Expand Up @@ -353,34 +394,80 @@ synchronized private void addFieldToDocument(Document doc, VCFHeaderLineType var
}

Collection<?> values = fieldValue instanceof Collection ? (Collection<?>) fieldValue : Collections.singleton(fieldValue);

AtomicBoolean indexDocValue = new AtomicBoolean(true);

values.forEach(value -> {
if (value == null || "".equals(value) || VCFConstants.EMPTY_INFO_FIELD.equals(value)) {
return;
}

try {
switch (variantHeaderLineType) {
case Character -> doc.add(new StringField(key, String.valueOf(value), Field.Store.YES));
case Flag -> doc.add(new IntPoint(key, Boolean.parseBoolean(value.toString()) ? 1 : 0));
case Character -> {
doc.add(new StringField(key, String.valueOf(value), Field.Store.YES));

if (indexDocValue.get()) {
doc.add(new SortedDocValuesField(key, new BytesRef(String.valueOf(value))));
indexDocValue.set(false);
}
}
case Flag -> {
int x = Boolean.parseBoolean(value.toString()) ? 1 : 0;
doc.add(new IntPoint(key, x));

if (indexDocValue.get()) {
doc.add(new NumericDocValuesField(key, x));
indexDocValue.set(false);
}
}
case Float -> {
Collection<Double> parsedVals = attemptToFixNumericValue(key, value, Double.class);
if (parsedVals != null) {
AtomicReference<Double> docValue = new AtomicReference<>(null);

parsedVals.forEach(x -> {
doc.add(new DoublePoint(key, x));
doc.add(new StoredField(key, x));

if (docValue.get() == null) {
docValue.set(x);
}
});

if (docValue.get() != null && indexDocValue.get()) {
doc.add(new NumericDocValuesField(key, NumericUtils.doubleToSortableLong(docValue.get())));
indexDocValue.set(false);
}
}
}
case Integer -> {
Collection<Integer> parsedVals = attemptToFixNumericValue(key, value, Integer.class);
if (parsedVals != null) {
AtomicReference<Integer> docValue = new AtomicReference<>(null);
parsedVals.forEach(x -> {
doc.add(new IntPoint(key, x));
doc.add(new StoredField(key, x));

if (docValue.get() == null) {
docValue.set(x);
}
});

if (docValue.get() != null && indexDocValue.get()) {
doc.add(new NumericDocValuesField(key, docValue.get()));
indexDocValue.set(false);
}
}
}
case String -> {
doc.add(new TextField(key, String.valueOf(value), Field.Store.YES));

if (indexDocValue.get()) {
doc.add(new SortedDocValuesField(key, new BytesRef(String.valueOf(value))));
indexDocValue.set(false);
}
}
case String -> doc.add(new TextField(key, String.valueOf(value), Field.Store.YES));
default -> possiblyReportBadValue(new Exception("VCF header type was not expected: " + variantHeaderLineType.name()), key, value);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.queryparser.flexible.standard.config.PointsConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
Expand Down Expand Up @@ -465,6 +462,72 @@ public void doExtendedTest() throws Exception {
// Documents where HaplotypeScore == 0.12, with query syntax.
topDocs = indexSearcher.search(numericQueryParser.parse("HaplotypeScore:[0.12 TO 0.12]", ""), 10);
Assert.assertEquals(topDocs.totalHits.value, 1L);

// Top 6 hits are sorted by genomicPosition
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("genomicPosition", SortField.Type.INT)));
Assert.assertEquals(6, topDocs.scoreDocs.length);

int lastGenomicPosition = -1;
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
Document document = indexSearcher.doc(scoreDoc.doc);
int currentGenomicPosition = Integer.parseInt(document.get("genomicPosition"));
if (lastGenomicPosition != -1) {
Assert.assertTrue(lastGenomicPosition <= currentGenomicPosition);
}
lastGenomicPosition = currentGenomicPosition;
}

// Results are sorted by REFFIELD
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("REFFIELD", SortField.Type.STRING)));
Assert.assertEquals(6, topDocs.scoreDocs.length);

String lastRefField = null;
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
Document document = indexSearcher.doc(scoreDoc.doc);
String currentRefField = document.get("REFFIELD");
if (lastRefField != null) {
Assert.assertTrue(lastRefField.compareTo(currentRefField) <= 0);
}
lastRefField = currentRefField;
}

// Results are sorted by start
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("start", SortField.Type.INT)));
Assert.assertEquals(6, topDocs.scoreDocs.length);

int lastStart = -1;
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
Document document = indexSearcher.doc(scoreDoc.doc);
int currentStart = Integer.parseInt(document.get("start"));
Assert.assertTrue(lastStart <= currentStart);
lastStart = currentStart;
}

// Results are sorted by HaplotypeScore
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("HaplotypeScore", SortField.Type.DOUBLE)));
Assert.assertEquals(6, topDocs.scoreDocs.length);

float lastHaplotypeScore = -1.0f;
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
Document document = indexSearcher.doc(scoreDoc.doc);
float currentHaplotypeScore = Float.parseFloat(document.get("HaplotypeScore"));
Assert.assertTrue(lastHaplotypeScore <= currentHaplotypeScore);
lastHaplotypeScore = currentHaplotypeScore;
}

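// Results in index order (Sort.INDEXORDER) are expected to be ascending by genomicPosition, since the index is written with an index sort on that field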
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, Sort.INDEXORDER);
Assert.assertTrue(topDocs.scoreDocs.length > 0);

lastGenomicPosition = -1;
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
Document document = indexSearcher.doc(scoreDoc.doc);
int currentGenomicPosition = Integer.parseInt(document.get("genomicPosition"));
if (lastGenomicPosition != -1) {
Assert.assertTrue(lastGenomicPosition <= currentGenomicPosition);
}
lastGenomicPosition = currentGenomicPosition;
}
}
}
}

0 comments on commit 40912da

Please sign in to comment.