From 40912da2e37d83daf8ab5b28a314a5e874d593b2 Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 25 Mar 2024 11:08:31 -0500 Subject: [PATCH] Add sort fields every time we doc.add a value (#305) * Add sort fields every time we doc.add a value --------- Co-authored-by: Sebastian Benjamin --- .../discvrseq/walkers/VcfToLuceneIndexer.java | 103 ++++++++++++++++-- .../VcfToLuceneIndexerIntegrationTest.java | 71 +++++++++++- 2 files changed, 162 insertions(+), 12 deletions(-) diff --git a/src/main/java/com/github/discvrseq/walkers/VcfToLuceneIndexer.java b/src/main/java/com/github/discvrseq/walkers/VcfToLuceneIndexer.java index 66307fd7..3fd9bfee 100644 --- a/src/main/java/com/github/discvrseq/walkers/VcfToLuceneIndexer.java +++ b/src/main/java/com/github/discvrseq/walkers/VcfToLuceneIndexer.java @@ -16,10 +16,13 @@ import org.apache.lucene.document.*; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexableField; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortedNumericSortField; import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.NumericUtils; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.barclay.help.DocumentedFeature; @@ -33,6 +36,8 @@ import java.util.concurrent.Callable; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; import java.util.stream.Collectors; /** @@ -99,7 +104,7 @@ public void onTraversalStart() { } IndexWriterConfig config = new IndexWriterConfig(analyzer); - config.setIndexSort(new Sort(new SortedNumericSortField("genomicPosition", SortField.Type.INT, false))); + config.setIndexSort(new Sort(new SortField("genomicPosition", SortField.Type.INT, false))); try { writer = new IndexWriter(index, config); @@ -240,39 +245,75 @@ else if (line.getCountType() == VCFHeaderLineCount.INTEGER || line.getCountType( // Add standard fields doc.add(new TextField("contig", variant.getContig(), Field.Store.YES)); + doc.add(new SortedDocValuesField("contig", new BytesRef(variant.getContig()))); + doc.add(new TextField("ref", variant.getReference().getDisplayString(), Field.Store.YES)); + doc.add(new SortedDocValuesField("ref", new BytesRef(variant.getReference().getDisplayString()))); + doc.add(new TextField("alt", alt.getDisplayString(), Field.Store.YES)); + doc.add(new SortedDocValuesField("alt", new BytesRef(alt.getDisplayString()))); doc.add(new IntPoint("start", variant.getStart())); doc.add(new StoredField("start", variant.getStart())); + doc.add(new NumericDocValuesField("start", variant.getStart())); doc.add(new IntPoint("end", variant.getEnd())); doc.add(new StoredField("end", variant.getEnd())); + doc.add(new NumericDocValuesField("end", variant.getEnd())); final int genomicPosition = getGenomicPosition(variant.getContig(), variant.getStart()); doc.add(new IntPoint("genomicPosition", genomicPosition)); doc.add(new StoredField("genomicPosition", genomicPosition)); - doc.add(new SortedNumericDocValuesField("genomicPosition", genomicPosition)); + doc.add(new NumericDocValuesField("genomicPosition", genomicPosition)); if (variant.hasGenotypes()) { - variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall() && g.getAlleles().contains(alt)).map(Genotype::getSampleName).sorted().forEach(sample -> doc.add(new TextField("variableSamples", sample, Field.Store.YES))); - variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall() && g.getAlleles().contains(alt) && g.isHomVar()).map(Genotype::getSampleName).sorted().forEach(sample -> doc.add(new TextField("homozygousVarSamples", sample, Field.Store.YES))); + AtomicReference docValue = new AtomicReference<>(null); + + variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall() && g.getAlleles().contains(alt)).map(Genotype::getSampleName).sorted().forEach(sample -> { + doc.add(new TextField("variableSamples", sample, Field.Store.YES)); + + if (docValue.get() == null) { + docValue.set(sample); + } + }); + + if (docValue.get() != null) { + doc.add(new SortedDocValuesField("variableSamples", new BytesRef(docValue.get()))); + docValue.set(null); + } + + variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall() && g.getAlleles().contains(alt) && g.isHomVar()).map(Genotype::getSampleName).sorted().forEach(sample -> { + doc.add(new TextField("homozygousVarSamples", sample, Field.Store.YES)); + + if (docValue.get() == null) { + docValue.set(sample); + } + }); + + if (docValue.get() != null) { + doc.add(new SortedDocValuesField("homozygousVarSamples", new BytesRef(docValue.get()))); + docValue.set(null); + } long nHet = variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall() && g.getAlleles().contains(alt) && g.isHet()).count(); doc.add(new IntPoint("nHet", (int)nHet)); doc.add(new StoredField("nHet", (int)nHet)); + doc.add(new NumericDocValuesField("nHet", (int)nHet)); long nHomVar = variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall() && g.getAlleles().contains(alt) && g.isHomVar()).count(); doc.add(new IntPoint("nHomVar", (int)nHomVar)); doc.add(new StoredField("nHomVar", (int)nHomVar)); + doc.add(new NumericDocValuesField("nHomVar", (int)nHomVar)); long nCalled = variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall()).count(); doc.add(new IntPoint("nCalled", (int)nCalled)); doc.add(new StoredField("nCalled", (int)nCalled)); + doc.add(new NumericDocValuesField("nCalled", (int)nCalled)); float fractionHet = (float) nHet / (float) (nHet + nHomVar); - doc.add(new FloatPoint("fractionHet", fractionHet)); + doc.add(new DoublePoint("fractionHet", fractionHet)); doc.add(new StoredField("fractionHet", fractionHet)); + doc.add(new NumericDocValuesField("fractionHet", NumericUtils.doubleToSortableLong(fractionHet))); } try { @@ -353,6 +394,9 @@ synchronized private void addFieldToDocument(Document doc, VCFHeaderLineType var } Collection values = fieldValue instanceof Collection ? (Collection) fieldValue : Collections.singleton(fieldValue); + + AtomicBoolean indexDocValue = new AtomicBoolean(true); + values.forEach(value -> { if (value == null || "".equals(value) || VCFConstants.EMPTY_INFO_FIELD.equals(value)) { return; @@ -360,27 +404,70 @@ synchronized private void addFieldToDocument(Document doc, VCFHeaderLineType var try { switch (variantHeaderLineType) { - case Character -> doc.add(new StringField(key, String.valueOf(value), Field.Store.YES)); - case Flag -> doc.add(new IntPoint(key, Boolean.parseBoolean(value.toString()) ? 1 : 0)); + case Character -> { + doc.add(new StringField(key, String.valueOf(value), Field.Store.YES)); + + if (indexDocValue.get()) { + doc.add(new SortedDocValuesField(key, new BytesRef(String.valueOf(value)))); + indexDocValue.set(false); + } + } + case Flag -> { + int x = Boolean.parseBoolean(value.toString()) ? 1 : 0; + doc.add(new IntPoint(key, x)); + + if (indexDocValue.get()) { + doc.add(new NumericDocValuesField(key, x)); + indexDocValue.set(false); + } + } case Float -> { Collection parsedVals = attemptToFixNumericValue(key, value, Double.class); if (parsedVals != null) { + AtomicReference docValue = new AtomicReference<>(null); + parsedVals.forEach(x -> { doc.add(new DoublePoint(key, x)); doc.add(new StoredField(key, x)); + + if (docValue.get() == null) { + docValue.set(x); + } }); + + if (docValue.get() != null && indexDocValue.get()) { + doc.add(new NumericDocValuesField(key, NumericUtils.doubleToSortableLong(docValue.get()))); + indexDocValue.set(false); + } } } case Integer -> { Collection parsedVals = attemptToFixNumericValue(key, value, Integer.class); if (parsedVals != null) { + AtomicReference docValue = new AtomicReference<>(null); parsedVals.forEach(x -> { doc.add(new IntPoint(key, x)); doc.add(new StoredField(key, x)); + + if (docValue.get() == null) { + docValue.set(x); + } }); + + if (docValue.get() != null && indexDocValue.get()) { + doc.add(new NumericDocValuesField(key, docValue.get())); + indexDocValue.set(false); + } + } + } + case String -> { + doc.add(new TextField(key, String.valueOf(value), Field.Store.YES)); + + if (indexDocValue.get()) { + doc.add(new SortedDocValuesField(key, new BytesRef(String.valueOf(value)))); + indexDocValue.set(false); } } - case String -> doc.add(new TextField(key, String.valueOf(value), Field.Store.YES)); default -> possiblyReportBadValue(new Exception("VCF header type was not expected: " + variantHeaderLineType.name()), key, value); } } diff --git a/src/test/java/com/github/discvrseq/walkers/VcfToLuceneIndexerIntegrationTest.java b/src/test/java/com/github/discvrseq/walkers/VcfToLuceneIndexerIntegrationTest.java index cfbdfb81..641eb19e 100644 --- a/src/test/java/com/github/discvrseq/walkers/VcfToLuceneIndexerIntegrationTest.java +++ b/src/test/java/com/github/discvrseq/walkers/VcfToLuceneIndexerIntegrationTest.java @@ -11,10 +11,7 @@ import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; import org.apache.lucene.queryparser.flexible.standard.config.PointsConfig; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.*; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; @@ -465,6 +462,72 @@ public void doExtendedTest() throws Exception { // Documents where HaplotypeScore == 0.12, with query syntax. topDocs = indexSearcher.search(numericQueryParser.parse("HaplotypeScore:[0.12 TO 0.12]", ""), 10); Assert.assertEquals(topDocs.totalHits.value, 1L); + + // Top 50 hits are sorted by genomicPosition + topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("genomicPosition", SortField.Type.INT))); + Assert.assertEquals(6, topDocs.scoreDocs.length); + + int lastGenomicPosition = -1; + for (ScoreDoc scoreDoc : topDocs.scoreDocs) { + Document document = indexSearcher.doc(scoreDoc.doc); + int currentGenomicPosition = Integer.parseInt(document.get("genomicPosition")); + if (lastGenomicPosition != -1) { + Assert.assertTrue(lastGenomicPosition <= currentGenomicPosition); + } + lastGenomicPosition = currentGenomicPosition; + } + + // Results are sorted by REFFIELD + topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("REFFIELD", SortField.Type.STRING))); + Assert.assertEquals(6, topDocs.scoreDocs.length); + + String lastRefField = null; + for (ScoreDoc scoreDoc : topDocs.scoreDocs) { + Document document = indexSearcher.doc(scoreDoc.doc); + String currentRefField = document.get("REFFIELD"); + if (lastRefField != null) { + Assert.assertTrue(lastRefField.compareTo(currentRefField) <= 0); + } + lastRefField = currentRefField; + } + + // Results are sorted by start + topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("start", SortField.Type.INT))); + Assert.assertEquals(6, topDocs.scoreDocs.length); + + int lastStart = -1; + for (ScoreDoc scoreDoc : topDocs.scoreDocs) { + Document document = indexSearcher.doc(scoreDoc.doc); + int currentStart = Integer.parseInt(document.get("start")); + Assert.assertTrue(lastStart <= currentStart); + lastStart = currentStart; + } + + // Results are sorted by HaplotypeScore + topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("HaplotypeScore", SortField.Type.DOUBLE))); + Assert.assertEquals(6, topDocs.scoreDocs.length); + + float lastHaplotypeScore = -1.0f; + for (ScoreDoc scoreDoc : topDocs.scoreDocs) { + Document document = indexSearcher.doc(scoreDoc.doc); + float currentHaplotypeScore = Float.parseFloat(document.get("HaplotypeScore")); + Assert.assertTrue(lastHaplotypeScore <= currentHaplotypeScore); + lastHaplotypeScore = currentHaplotypeScore; + } + + // Results are sorted by genomicPosition + topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, Sort.INDEXORDER); + Assert.assertTrue(topDocs.scoreDocs.length > 0); + + lastGenomicPosition = -1; + for (ScoreDoc scoreDoc : topDocs.scoreDocs) { + Document document = indexSearcher.doc(scoreDoc.doc); + int currentGenomicPosition = Integer.parseInt(document.get("genomicPosition")); + if (lastGenomicPosition != -1) { + Assert.assertTrue(lastGenomicPosition <= currentGenomicPosition); + } + lastGenomicPosition = currentGenomicPosition; + } } } }