From 5b119d52f06639ad4a211fb624f71bf2537c41cc Mon Sep 17 00:00:00 2001 From: Sebastian Benjamin Date: Mon, 29 Apr 2024 11:45:12 -0700 Subject: [PATCH 1/2] add _sort to DocValues --- .../discvrseq/walkers/VcfToLuceneIndexer.java | 36 +++++++++---------- .../VcfToLuceneIndexerIntegrationTest.java | 8 ++--- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/main/java/com/github/discvrseq/walkers/VcfToLuceneIndexer.java b/src/main/java/com/github/discvrseq/walkers/VcfToLuceneIndexer.java index 3fd9bfee..30ee09e0 100644 --- a/src/main/java/com/github/discvrseq/walkers/VcfToLuceneIndexer.java +++ b/src/main/java/com/github/discvrseq/walkers/VcfToLuceneIndexer.java @@ -104,7 +104,7 @@ public void onTraversalStart() { } IndexWriterConfig config = new IndexWriterConfig(analyzer); - config.setIndexSort(new Sort(new SortField("genomicPosition", SortField.Type.INT, false))); + config.setIndexSort(new Sort(new SortField("genomicPosition_sort", SortField.Type.INT, false))); try { writer = new IndexWriter(index, config); @@ -245,26 +245,26 @@ else if (line.getCountType() == VCFHeaderLineCount.INTEGER || line.getCountType( // Add standard fields doc.add(new TextField("contig", variant.getContig(), Field.Store.YES)); - doc.add(new SortedDocValuesField("contig", new BytesRef(variant.getContig()))); + doc.add(new SortedDocValuesField("contig_sort", new BytesRef(variant.getContig()))); doc.add(new TextField("ref", variant.getReference().getDisplayString(), Field.Store.YES)); - doc.add(new SortedDocValuesField("ref", new BytesRef(variant.getReference().getDisplayString()))); + doc.add(new SortedDocValuesField("ref_sort", new BytesRef(variant.getReference().getDisplayString()))); doc.add(new TextField("alt", alt.getDisplayString(), Field.Store.YES)); - doc.add(new SortedDocValuesField("alt", new BytesRef(alt.getDisplayString()))); + doc.add(new SortedDocValuesField("alt_sort", new BytesRef(alt.getDisplayString()))); doc.add(new IntPoint("start", variant.getStart())); doc.add(new StoredField("start", variant.getStart())); - doc.add(new NumericDocValuesField("start", variant.getStart())); + doc.add(new NumericDocValuesField("start_sort", variant.getStart())); doc.add(new IntPoint("end", variant.getEnd())); doc.add(new StoredField("end", variant.getEnd())); - doc.add(new NumericDocValuesField("end", variant.getEnd())); + doc.add(new NumericDocValuesField("end_sort", variant.getEnd())); final int genomicPosition = getGenomicPosition(variant.getContig(), variant.getStart()); doc.add(new IntPoint("genomicPosition", genomicPosition)); doc.add(new StoredField("genomicPosition", genomicPosition)); - doc.add(new NumericDocValuesField("genomicPosition", genomicPosition)); + doc.add(new NumericDocValuesField("genomicPosition_sort", genomicPosition)); if (variant.hasGenotypes()) { AtomicReference docValue = new AtomicReference<>(null); @@ -278,7 +278,7 @@ else if (line.getCountType() == VCFHeaderLineCount.INTEGER || line.getCountType( }); if (docValue.get() != null) { - doc.add(new SortedDocValuesField("variableSamples", new BytesRef(docValue.get()))); + doc.add(new SortedDocValuesField("variableSamples_sort", new BytesRef(docValue.get()))); docValue.set(null); } @@ -291,29 +291,29 @@ else if (line.getCountType() == VCFHeaderLineCount.INTEGER || line.getCountType( }); if (docValue.get() != null) { - doc.add(new SortedDocValuesField("homozygousVarSamples", new BytesRef(docValue.get()))); + doc.add(new SortedDocValuesField("homozygousVarSamples_sort", new BytesRef(docValue.get()))); docValue.set(null); } long nHet = variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall() && g.getAlleles().contains(alt) && g.isHet()).count(); doc.add(new IntPoint("nHet", (int)nHet)); doc.add(new StoredField("nHet", (int)nHet)); - doc.add(new NumericDocValuesField("nHet", (int)nHet)); + doc.add(new NumericDocValuesField("nHet_sort", (int)nHet)); long nHomVar = variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall() && g.getAlleles().contains(alt) && g.isHomVar()).count(); doc.add(new IntPoint("nHomVar", (int)nHomVar)); doc.add(new StoredField("nHomVar", (int)nHomVar)); - doc.add(new NumericDocValuesField("nHomVar", (int)nHomVar)); + doc.add(new NumericDocValuesField("nHomVar_sort", (int)nHomVar)); long nCalled = variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall()).count(); doc.add(new IntPoint("nCalled", (int)nCalled)); doc.add(new StoredField("nCalled", (int)nCalled)); - doc.add(new NumericDocValuesField("nCalled", (int)nCalled)); + doc.add(new NumericDocValuesField("nCalled_sort", (int)nCalled)); float fractionHet = (float) nHet / (float) (nHet + nHomVar); doc.add(new DoublePoint("fractionHet", fractionHet)); doc.add(new StoredField("fractionHet", fractionHet)); - doc.add(new NumericDocValuesField("fractionHet", NumericUtils.doubleToSortableLong(fractionHet))); + doc.add(new NumericDocValuesField("fractionHet_sort", NumericUtils.doubleToSortableLong(fractionHet))); } try { @@ -408,7 +408,7 @@ synchronized private void addFieldToDocument(Document doc, VCFHeaderLineType var doc.add(new StringField(key, String.valueOf(value), Field.Store.YES)); if (indexDocValue.get()) { - doc.add(new SortedDocValuesField(key, new BytesRef(String.valueOf(value)))); + doc.add(new SortedDocValuesField(key + "_sort", new BytesRef(String.valueOf(value)))); indexDocValue.set(false); } } @@ -417,7 +417,7 @@ synchronized private void addFieldToDocument(Document doc, VCFHeaderLineType var doc.add(new IntPoint(key, x)); if (indexDocValue.get()) { - doc.add(new NumericDocValuesField(key, x)); + doc.add(new NumericDocValuesField(key + "_sort", x)); indexDocValue.set(false); } } @@ -436,7 +436,7 @@ synchronized private void addFieldToDocument(Document doc, VCFHeaderLineType var }); if (docValue.get() != null && indexDocValue.get()) { - doc.add(new NumericDocValuesField(key, NumericUtils.doubleToSortableLong(docValue.get()))); + doc.add(new NumericDocValuesField(key + "_sort", NumericUtils.doubleToSortableLong(docValue.get()))); indexDocValue.set(false); } } @@ -455,7 +455,7 @@ synchronized private void addFieldToDocument(Document doc, VCFHeaderLineType var }); if (docValue.get() != null && indexDocValue.get()) { - doc.add(new NumericDocValuesField(key, docValue.get())); + doc.add(new NumericDocValuesField(key + "_sort", docValue.get())); indexDocValue.set(false); } } @@ -464,7 +464,7 @@ synchronized private void addFieldToDocument(Document doc, VCFHeaderLineType var doc.add(new TextField(key, String.valueOf(value), Field.Store.YES)); if (indexDocValue.get()) { - doc.add(new SortedDocValuesField(key, new BytesRef(String.valueOf(value)))); + doc.add(new SortedDocValuesField(key +"_sort", new BytesRef(String.valueOf(value)))); indexDocValue.set(false); } } diff --git a/src/test/java/com/github/discvrseq/walkers/VcfToLuceneIndexerIntegrationTest.java b/src/test/java/com/github/discvrseq/walkers/VcfToLuceneIndexerIntegrationTest.java index bf2f00de..551bb9e9 100644 --- a/src/test/java/com/github/discvrseq/walkers/VcfToLuceneIndexerIntegrationTest.java +++ b/src/test/java/com/github/discvrseq/walkers/VcfToLuceneIndexerIntegrationTest.java @@ -471,7 +471,7 @@ public void doExtendedTest() throws Exception { Assert.assertEquals(topDocs.totalHits.value, 1L); // Top 50 hits are sorted by genomicPosition - topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("genomicPosition", SortField.Type.INT))); + topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("genomicPosition_sort", SortField.Type.INT))); Assert.assertEquals(6, topDocs.scoreDocs.length); int lastGenomicPosition = -1; @@ -485,7 +485,7 @@ public void doExtendedTest() throws Exception { } // Results are sorted by REFFIELD - topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("REFFIELD", SortField.Type.STRING))); + topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("REFFIELD_sort", SortField.Type.STRING))); Assert.assertEquals(6, topDocs.scoreDocs.length); String lastRefField = null; @@ -499,7 +499,7 @@ public void doExtendedTest() throws Exception { } // Results are sorted by start - topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("start", SortField.Type.INT))); + topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("start_sort", SortField.Type.INT))); Assert.assertEquals(6, topDocs.scoreDocs.length); int lastStart = -1; @@ -511,7 +511,7 @@ public void doExtendedTest() throws Exception { } // Results are sorted by HaplotypeScore - topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("HaplotypeScore", SortField.Type.DOUBLE))); + topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("HaplotypeScore_sort", SortField.Type.DOUBLE))); Assert.assertEquals(6, topDocs.scoreDocs.length); float lastHaplotypeScore = -1.0f; From 8461a21d9703b5928b86ea43b8931c11d1cf75f8 Mon Sep 17 00:00:00 2001 From: Sebastian Benjamin Date: Fri, 3 May 2024 15:06:57 -0700 Subject: [PATCH 2/2] Sort start/end using genomicPosition values for contig-aware sorting --- .../discvrseq/walkers/VcfToLuceneIndexer.java | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/main/java/com/github/discvrseq/walkers/VcfToLuceneIndexer.java b/src/main/java/com/github/discvrseq/walkers/VcfToLuceneIndexer.java index 30ee09e0..aa965381 100644 --- a/src/main/java/com/github/discvrseq/walkers/VcfToLuceneIndexer.java +++ b/src/main/java/com/github/discvrseq/walkers/VcfToLuceneIndexer.java @@ -16,10 +16,8 @@ import org.apache.lucene.document.*; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.IndexableField; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; -import org.apache.lucene.search.SortedNumericSortField; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.NumericUtils; @@ -253,18 +251,19 @@ else if (line.getCountType() == VCFHeaderLineCount.INTEGER || line.getCountType( doc.add(new TextField("alt", alt.getDisplayString(), Field.Store.YES)); doc.add(new SortedDocValuesField("alt_sort", new BytesRef(alt.getDisplayString()))); + final int genomicPositionStart = getGenomicPosition(variant.getContig(), variant.getStart()); doc.add(new IntPoint("start", variant.getStart())); doc.add(new StoredField("start", variant.getStart())); - doc.add(new NumericDocValuesField("start_sort", variant.getStart())); + doc.add(new NumericDocValuesField("start_sort", genomicPositionStart)); + final int genomicPositionEnd = getGenomicPosition(variant.getContig(), variant.getEnd()); doc.add(new IntPoint("end", variant.getEnd())); doc.add(new StoredField("end", variant.getEnd())); - doc.add(new NumericDocValuesField("end_sort", variant.getEnd())); + doc.add(new NumericDocValuesField("end_sort", genomicPositionEnd)); - final int genomicPosition = getGenomicPosition(variant.getContig(), variant.getStart()); - doc.add(new IntPoint("genomicPosition", genomicPosition)); - doc.add(new StoredField("genomicPosition", genomicPosition)); - doc.add(new NumericDocValuesField("genomicPosition_sort", genomicPosition)); + doc.add(new IntPoint("genomicPosition", genomicPositionStart)); + doc.add(new StoredField("genomicPosition", genomicPositionStart)); + doc.add(new NumericDocValuesField("genomicPosition_sort", genomicPositionStart)); if (variant.hasGenotypes()) { AtomicReference docValue = new AtomicReference<>(null);