Skip to content

Commit

Permalink
Solution to ticket #91
Browse files Browse the repository at this point in the history
Removed custom_iterator and refactored GenomeTree to RegionVCFIter.
Better reuse of code and don't need the weird __next__/__iter__.
Should be faster, particularly with includebeds
Found an error where TruScore precision lost from float to int casting lost optimal matches.
  • Loading branch information
ACEnglish committed Dec 21, 2021
1 parent 8a7b0a5 commit 771c05b
Show file tree
Hide file tree
Showing 9 changed files with 32 additions and 62 deletions.
2 changes: 1 addition & 1 deletion repo_utils/test_files/answer_key/bench23/fp.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
chr20 278930 . C CGGAGGGAGGGCGGGACGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGAGGGACGGAGGGCGGGACGGCGGGAGGGCGGGACGGAGGGACGGAGGGAGGGCGGGACGGAGGGCGGGAGGGCGGGACGGAGGGCGGGAG 60 . QNAME=cluster23_scaffold_2;QSTART=25417363;QSTRAND=-;SVTYPE=INS;SVLEN=188;PctSeqSimilarity=0.667291;PctSizeSimilarity=0.346225;PctRecOverlap=0;SizeDiff=355;StartDistance=139;EndDistance=139;TruScore=42;MatchId=4.0.0 GT 0/1
chr20 279098 . G GGGAGGGAGGGCGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGAGGGAGGGACGGAGGGCGGGACGGAGGGAGGGAGGGCGGAGGGAGGGAGGGGCGGGACGGAGGGAGGGAGGGCGGGAGGGATGGAGGGAGGGAGGGCGGGACGGAGGGAGGGACGGAGGGCGGGACGGAGGGAC 60 . QNAME=cluster23_scaffold_3;QSTART=25432493;QSTRAND=-;SVTYPE=INS;SVLEN=179;PctSeqSimilarity=0.540302;PctSizeSimilarity=0.32965;PctRecOverlap=0;SizeDiff=364;StartDistance=-29;EndDistance=-29;TruScore=35;MatchId=4.0.2 GT 1/0
chr20 280211 . CAACAACAACAATTGTACTTCCCTAAGGTTACACCCAGCAGGTGCATAAAACCTACAGTAACAAT C 60 . QNAME=cluster23_scaffold_3;QSTART=25431380;QSTRAND=-;SVTYPE=DEL;SVLEN=-64;PctSeqSimilarity=.;PctSizeSimilarity=.;PctRecOverlap=.;SizeDiff=.;StartDistance=.;EndDistance=.;TruScore=.;MatchId=5._.0 GT 1/0
chr20 641906 . G GGCGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGAGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGAGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGGGTTGCCT 60 . QNAME=cluster23_scaffold_2;QSTART=25053389;QSTRAND=-;SVTYPE=INS;SVLEN=163;PctSeqSimilarity=0.692156;PctSizeSimilarity=0.25873;PctRecOverlap=0;SizeDiff=467;StartDistance=301;EndDistance=301;GTMatch;TruScore=41;MatchId=10.0.0 GT 0/1
chr20 641906 . G GGCGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGAGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGAGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGGGTTGCCT 60 . QNAME=cluster23_scaffold_2;QSTART=25053389;QSTRAND=-;SVTYPE=INS;SVLEN=163;PctSeqSimilarity=0.701844;PctSizeSimilarity=0.257911;PctRecOverlap=0;SizeDiff=469;StartDistance=324;EndDistance=324;TruScore=41;MatchId=10.1.0 GT 0/1
chr20 642068 . G GTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGGGCCCAGCGGGGGTGGGGTTGCCTGGGGGGGGCCCAGCGGGGGTGGAGTTGCC 60 . QNAME=cluster23_scaffold_2;QSTART=25053097;QSTRAND=-;SVTYPE=INS;SVLEN=129;PctSeqSimilarity=0.562989;PctSizeSimilarity=0.204114;PctRecOverlap=0;SizeDiff=503;StartDistance=162;EndDistance=162;TruScore=33;MatchId=10.1.1 GT 0/1
chr20 642271 . G GGCCCAGCGGGGGTGGGGTTGCCTGGGGGGGGCCCAGCGGGGGTGGGGTTGCCTGGGGGGGGAGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGAGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGTGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGGGGGAGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGATGGGGTTGCCTGGGGGGGGGC 60 . QNAME=cluster23_scaffold_3;QSTART=25068731;QSTRAND=-;SVTYPE=INS;SVLEN=257;PctSeqSimilarity=0.641691;PctSizeSimilarity=0.407937;PctRecOverlap=0;SizeDiff=373;StartDistance=-64;EndDistance=-64;TruScore=42;MatchId=10.0.2 GT 1/0
chr20 642330 . G GCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGGGGGAGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGTGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGGGGGGGGGGC 60 . QNAME=cluster23_scaffold_2;QSTART=25052609;QSTRAND=-;SVTYPE=INS;SVLEN=226;PctSeqSimilarity=0.649306;PctSizeSimilarity=0.35873;PctRecOverlap=0;SizeDiff=404;StartDistance=-123;EndDistance=-123;GTMatch;TruScore=41;MatchId=10.0.3 GT 0/1
Expand Down
2 changes: 1 addition & 1 deletion repo_utils/test_files/answer_key/multi_removed_common.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@
##INFO=<ID=MatchId,Number=1,Type=String,Description="Id to help tie base/comp calls together {chunkid}.{baseid}.{compid}">
##INFO=<ID=Multi,Number=0,Type=Flag,Description="Call is false due to non-multimatching">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA24385 NA12878 HG00733
chr20 149073 . G GAATCCTGACCCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCA 60 . QNAME=cluster23_scaffold_2;QSTART=25547471;QSTRAND=-;SVTYPE=INS;SVLEN=69;PctSeqSimilarity=0.996479;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-60;EndDistance=-60;TruScore=74;MatchId=1.0 GT ./. ./. 0/1
chr20 149095 . G GGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCGAATCCTGACCCATATTTGGGAA 60 . QNAME=cluster19_000000F;QSTART=25613718;QSTRAND=-;SVTYPE=INS;SVLEN=69;PctSeqSimilarity=0.997024;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-82;EndDistance=-82;TruScore=74;MatchId=1.0 GT ./. 1/1 ./.
chr20 149073 . G GAATCCTGACCCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCA 60 . QNAME=cluster23_scaffold_2;QSTART=25547471;QSTRAND=-;SVTYPE=INS;SVLEN=69;PctSeqSimilarity=0.996479;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-60;EndDistance=-60;TruScore=74;MatchId=1.0 GT ./. ./. 0/1
chr20 278930 . C CGGAGGGAGGGCGGGACGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGAGGGACGGAGGGCGGGACGGCGGGAGGGCGGGACGGAGGGACGGAGGGAGGGCGGGACGGAGGGCGGGAGGGCGGGACGGAGGGCGGGAG 60 . QNAME=cluster23_scaffold_2;QSTART=25417363;QSTRAND=-;SVTYPE=INS;SVLEN=188;PctSeqSimilarity=0.986631;PctSizeSimilarity=0.978723;PctRecOverlap=1;SizeDiff=-4;StartDistance=0;EndDistance=0;TruScore=98;MatchId=2.0 GT ./. ./. 0/1
chr20 306268 . A ACCAGGCTGGAGTGCAGTGGCTCACTGCGTGGCTCGCTACAGCCTACAACTCCTGGGCTCCAGCAATCCTGCTGCCCCAGCCTCCTGTGTAACTGAGACTACAGGCACGCACCACCACACCCAGCTAATGTTTTCTTTCTTTTTTTTTTTTTTGAGATGAACTCTCACTCTGTTGC 60 . QNAME=cluster19_000000F;QSTART=25455794;QSTRAND=-;SVTYPE=INS;SVLEN=175;PctSeqSimilarity=0.997159;PctSizeSimilarity=1;PctRecOverlap=1;SizeDiff=0;StartDistance=0;EndDistance=0;TruScore=99;MatchId=4.0 GT ./. 1/1 ./.
chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCCATCCCCCGTCCGCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 61 . QNAME=HG002-S9-H2-000001F;QSTART=374905;QSTRAND=+;SVTYPE=INS;SVLEN=227;PctSeqSimilarity=0.995604;PctSizeSimilarity=0.995595;PctRecOverlap=1;SizeDiff=-1;StartDistance=0;EndDistance=0;TruScore=99;MatchId=6.0 GT 0/1 ./. 1/1
Expand Down
2 changes: 1 addition & 1 deletion repo_utils/test_files/answer_key/multi_removed_first.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@
##INFO=<ID=MatchId,Number=1,Type=String,Description="Id to help tie base/comp calls together {chunkid}.{baseid}.{compid}">
##INFO=<ID=Multi,Number=0,Type=Flag,Description="Call is false due to non-multimatching">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA24385 NA12878 HG00733
chr20 149073 . G GAATCCTGACCCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCA 60 . QNAME=cluster23_scaffold_2;QSTART=25547471;QSTRAND=-;SVTYPE=INS;SVLEN=69;PctSeqSimilarity=0.996479;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-60;EndDistance=-60;TruScore=74;MatchId=1.0 GT ./. ./. 0/1
chr20 149095 . G GGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCGAATCCTGACCCATATTTGGGAA 60 . QNAME=cluster19_000000F;QSTART=25613718;QSTRAND=-;SVTYPE=INS;SVLEN=69;PctSeqSimilarity=0.997024;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-82;EndDistance=-82;TruScore=74;MatchId=1.0 GT ./. 1/1 ./.
chr20 149073 . G GAATCCTGACCCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCA 60 . QNAME=cluster23_scaffold_2;QSTART=25547471;QSTRAND=-;SVTYPE=INS;SVLEN=69;PctSeqSimilarity=0.996479;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-60;EndDistance=-60;TruScore=74;MatchId=1.0 GT ./. ./. 0/1
chr20 278930 . C CGGAGGGAGGGCGGGACGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGAGGGACGGAGGGCGGGACGGCGGGAGGGCGGGACGGAGGGACGGAGGGAGGGCGGGACGGAGGGCGGGAGGGCGGGACGGAGGGCGGGAG 60 . QNAME=cluster23_scaffold_2;QSTART=25417363;QSTRAND=-;SVTYPE=INS;SVLEN=188;PctSeqSimilarity=0.986631;PctSizeSimilarity=0.978723;PctRecOverlap=1;SizeDiff=-4;StartDistance=0;EndDistance=0;TruScore=98;MatchId=2.0 GT ./. ./. 0/1
chr20 306268 . A ACCAGGCTGGAGTGCAGTGGCTCACTGCGTGGCTCGCTACAGCCTACAACTCCTGGGCTCCAGCAATCCTGCTGCCCCAGCCTCCTGTGTAACTGAGACTACAGGCACGCACCACCACACCCAGCTAATGTTTTCTTTCTTTTTTTTTTTTTTGAGATGAACTCTCACTCTGTTGC 60 . QNAME=cluster19_000000F;QSTART=25455794;QSTRAND=-;SVTYPE=INS;SVLEN=175;PctSeqSimilarity=0.997159;PctSizeSimilarity=1;PctRecOverlap=1;SizeDiff=0;StartDistance=0;EndDistance=0;TruScore=99;MatchId=4.0 GT ./. 1/1 ./.
chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCCATCCCCCGTCCGCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 61 . QNAME=HG002-S9-H2-000001F;QSTART=374905;QSTRAND=+;SVTYPE=INS;SVLEN=227;PctSeqSimilarity=0.995604;PctSizeSimilarity=0.995595;PctRecOverlap=1;SizeDiff=-1;StartDistance=0;EndDistance=0;TruScore=99;MatchId=6.0 GT 0/1 ./. 1/1
Expand Down
2 changes: 1 addition & 1 deletion repo_utils/test_files/answer_key/multi_removed_maxqual.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@
##INFO=<ID=MatchId,Number=1,Type=String,Description="Id to help tie base/comp calls together {chunkid}.{baseid}.{compid}">
##INFO=<ID=Multi,Number=0,Type=Flag,Description="Call is false due to non-multimatching">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA24385 NA12878 HG00733
chr20 149073 . G GAATCCTGACCCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCA 60 . QNAME=cluster23_scaffold_2;QSTART=25547471;QSTRAND=-;SVTYPE=INS;SVLEN=69;PctSeqSimilarity=0.996479;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-60;EndDistance=-60;TruScore=74;MatchId=1.0 GT ./. ./. 0/1
chr20 149095 . G GGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCGAATCCTGACCCATATTTGGGAA 60 . QNAME=cluster19_000000F;QSTART=25613718;QSTRAND=-;SVTYPE=INS;SVLEN=69;PctSeqSimilarity=0.997024;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-82;EndDistance=-82;TruScore=74;MatchId=1.0 GT ./. 1/1 ./.
chr20 149073 . G GAATCCTGACCCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCA 60 . QNAME=cluster23_scaffold_2;QSTART=25547471;QSTRAND=-;SVTYPE=INS;SVLEN=69;PctSeqSimilarity=0.996479;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-60;EndDistance=-60;TruScore=74;MatchId=1.0 GT ./. ./. 0/1
chr20 278930 . C CGGAGGGAGGGCGGGACGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGAGGGACGGAGGGCGGGACGGCGGGAGGGCGGGACGGAGGGACGGAGGGAGGGCGGGACGGAGGGCGGGAGGGCGGGACGGAGGGCGGGAG 60 . QNAME=cluster23_scaffold_2;QSTART=25417363;QSTRAND=-;SVTYPE=INS;SVLEN=188;PctSeqSimilarity=0.986631;PctSizeSimilarity=0.978723;PctRecOverlap=1;SizeDiff=-4;StartDistance=0;EndDistance=0;TruScore=98;MatchId=2.0 GT ./. ./. 0/1
chr20 306268 . A ACCAGGCTGGAGTGCAGTGGCTCACTGCGTGGCTCGCTACAGCCTACAACTCCTGGGCTCCAGCAATCCTGCTGCCCCAGCCTCCTGTGTAACTGAGACTACAGGCACGCACCACCACACCCAGCTAATGTTTTCTTTCTTTTTTTTTTTTTTGAGATGAACTCTCACTCTGTTGC 60 . QNAME=cluster19_000000F;QSTART=25455794;QSTRAND=-;SVTYPE=INS;SVLEN=175;PctSeqSimilarity=0.997159;PctSizeSimilarity=1;PctRecOverlap=1;SizeDiff=0;StartDistance=0;EndDistance=0;TruScore=99;MatchId=4.0 GT ./. 1/1 ./.
chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCCATCCCCCGTCCGCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 61 . QNAME=HG002-S9-H2-000001F;QSTART=374905;QSTRAND=+;SVTYPE=INS;SVLEN=227;PctSeqSimilarity=0.995604;PctSizeSimilarity=0.995595;PctRecOverlap=1;SizeDiff=-1;StartDistance=0;EndDistance=0;TruScore=99;MatchId=6.0 GT 0/1 ./. 1/1
Expand Down
Binary file modified repo_utils/test_files/answer_key/truv2df.jl
Binary file not shown.
6 changes: 3 additions & 3 deletions truvari/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
Objects:
:class:`GT`
:class:`GenomeTree`
:class:`RegionVCFIterator`
:class:`LogFileStderr`
:class:`MatchResult`
:class:`Matcher`
Expand All @@ -73,8 +73,8 @@
Matcher
)

from truvari.genome_tree import (
GenomeTree,
from truvari.region_vcf_iter import (
RegionVCFIterator,
)

from truvari.comparisons import (
Expand Down
54 changes: 10 additions & 44 deletions truvari/bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import truvari
from truvari.giab_report import make_giabreport


@total_ordering
class MatchResult(): # pylint: disable=too-many-instance-attributes
"""
Expand Down Expand Up @@ -115,7 +114,6 @@ def make_match_params():
ret.sizemax = 50000
ret.passonly = False
ret.no_ref = False
ret.includebed = None
ret.multimatch = False
return ret

Expand Down Expand Up @@ -152,15 +150,14 @@ def filter_call(self, entry, base=False):
Returns True if the call should be filtered
Base has different filtering requirements, so let the method know
"""
# See if it hits the regions, first
if self.params.includebed and not self.params.includebed.include(entry):
size = truvari.entry_size(entry)
if size > self.params.sizemax:
return True

size = truvari.entry_size(entry)
if base and size < self.params.sizemin or size > self.params.sizemax:
if base and size < self.params.sizemin:
return True

if not base and size < self.params.sizefilt or size > self.params.sizemax:
if not base and size < self.params.sizefilt:
return True

samp = self.params.bSample if base else self.params.cSample
Expand Down Expand Up @@ -525,7 +522,7 @@ def annotate_entry(entry, match, header):
entry.info["StartDistance"] = match.st_dist
entry.info["EndDistance"] = match.ed_dist
entry.info["GTMatch"] = match.gt_match
entry.info["TruScore"] = match.score
entry.info["TruScore"] = int(match.score) if match.score else None
entry.info["MatchId"] = match.matid
entry.info["Multi"] = match.multi
return entry
Expand Down Expand Up @@ -757,37 +754,6 @@ def close_outputs(outputs):
outputs["fp_out"].close()


class custom_iterator(): # pylint: disable=too-few-public-methods
"""
Iterate vcfs in the same order, regardless of sort order
"""
def __init__(self, fn):
"""
Wrap VariantFile
"""
self.vcf = pysam.VariantFile(fn)
self.all_chroms = sorted(self.vcf.header.contigs)
self.gen = None
self.pos = None

def __next__(self):
"""
Wrap next
"""
if self.pos is None:
self.pos = 0
self.gen = self.vcf.fetch(self.all_chroms[self.pos])
elif self.pos >= len(self.all_chroms):
raise StopIteration
try:
return next(self.gen)
except StopIteration as stop_iter:
self.pos += 1
if self.pos >= len(self.all_chroms):
raise StopIteration from stop_iter
self.gen = self.vcf.fetch(self.all_chroms[self.pos])
return next(self)

def bench_main(cmdargs):
"""
Main
Expand All @@ -803,11 +769,11 @@ def bench_main(cmdargs):

base = pysam.VariantFile(args.base)
comp = pysam.VariantFile(args.comp)
matcher.params.includebed = truvari.GenomeTree(base, comp,
args.includebed,
args.sizemax)
base_i = custom_iterator(args.base)
comp_i = custom_iterator(args.comp)

regions = truvari.RegionVCFIterator(base, comp, args.includebed, args.sizemax)
base_i = regions.iterate(base)
comp_i = regions.iterate(comp)

chunks = chunker(matcher, ('base', base_i), ('comp', comp_i))
for call in itertools.chain.from_iterable(map(compare_chunk, chunks)):
output_writer(call, outputs, args.sizemin)
Expand Down
2 changes: 1 addition & 1 deletion truvari/comparisons.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@ def weighted_score(sim, size, ovl):
:rtype: float
"""
score = (2 * sim + 1 * size + 1 * ovl) / 3.0
new_score = int(score / 1.333333 * 100)
new_score = score / 1.333333 * 100
return new_score


Expand Down
24 changes: 14 additions & 10 deletions truvari/genome_tree.py → truvari/region_vcf_iter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@
from intervaltree import IntervalTree
import truvari.comparisons as tcomp


class GenomeTree():
class RegionVCFIterator():
"""
Helper class to specify included regions of the genome when iterating events.
Helper class to specify include regions of the genome when iterating a VCF
Subset to only events less than max_span.
Subset to only events on contigs listed in vcfA left-join vcfB
"""

def __init__(self, vcfA, vcfB, includebed=None, max_span=None):
def __init__(self, vcfA, vcfB=None, includebed=None, max_span=None):
""" init """
self.includebed = includebed
self.max_span = max_span
Expand All @@ -24,7 +25,10 @@ def __build_tree(self, vcfA, vcfB):
Build the include regions
"""
contigA_set = set(vcfA.header.contigs.keys())
contigB_set = set(vcfB.header.contigs.keys())
if vcfB is not None:
contigB_set = set(vcfB.header.contigs.keys())
else:
contigB_set = contigA_set
all_regions = defaultdict(IntervalTree)
if self.includebed is not None:
counter = 0
Expand Down Expand Up @@ -53,10 +57,10 @@ def __build_tree(self, vcfA, vcfB):

def iterate(self, vcf_file):
"""
Iterates a vcf and yields only the entries that overlap an 'include' region
Iterates a vcf and yields only the entries that overlap included regions
"""
for chrom in self.tree:
for intv in self.tree[chrom]:
for chrom in sorted(self.tree.keys()):
for intv in sorted(self.tree[chrom]):
for entry in vcf_file.fetch(chrom, intv.begin, intv.end):
if self.includebed is None or self.include(entry):
yield entry
Expand All @@ -70,8 +74,8 @@ def include(self, entry):
# Filter these early so we don't have to keep checking overlaps
if self.max_span is None or aend - astart > self.max_span:
return False
overlaps = self.tree[entry.chrom].overlaps(
astart) and self.tree[entry.chrom].overlaps(aend)
overlaps = self.tree[entry.chrom].overlaps(astart) \
and self.tree[entry.chrom].overlaps(aend)
if astart == aend:
return overlaps
return overlaps and len(self.tree[entry.chrom].overlap(astart, aend)) == 1

0 comments on commit 771c05b

Please sign in to comment.