diff --git a/repo_utils/test_files/answer_key/bench23/fp.vcf b/repo_utils/test_files/answer_key/bench23/fp.vcf index 98842ceb..3635e373 100644 --- a/repo_utils/test_files/answer_key/bench23/fp.vcf +++ b/repo_utils/test_files/answer_key/bench23/fp.vcf @@ -49,7 +49,7 @@ chr20 278930 . C CGGAGGGAGGGCGGGACGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGAGGGACGGAGGGCGGGACGGCGGGAGGGCGGGACGGAGGGACGGAGGGAGGGCGGGACGGAGGGCGGGAGGGCGGGACGGAGGGCGGGAG 60 . QNAME=cluster23_scaffold_2;QSTART=25417363;QSTRAND=-;SVTYPE=INS;SVLEN=188;PctSeqSimilarity=0.667291;PctSizeSimilarity=0.346225;PctRecOverlap=0;SizeDiff=355;StartDistance=139;EndDistance=139;TruScore=42;MatchId=4.0.0 GT 0/1 chr20 279098 . G GGGAGGGAGGGCGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGAGGGAGGGACGGAGGGCGGGACGGAGGGAGGGAGGGCGGAGGGAGGGAGGGGCGGGACGGAGGGAGGGAGGGCGGGAGGGATGGAGGGAGGGAGGGCGGGACGGAGGGAGGGACGGAGGGCGGGACGGAGGGAC 60 . QNAME=cluster23_scaffold_3;QSTART=25432493;QSTRAND=-;SVTYPE=INS;SVLEN=179;PctSeqSimilarity=0.540302;PctSizeSimilarity=0.32965;PctRecOverlap=0;SizeDiff=364;StartDistance=-29;EndDistance=-29;TruScore=35;MatchId=4.0.2 GT 1/0 chr20 280211 . CAACAACAACAATTGTACTTCCCTAAGGTTACACCCAGCAGGTGCATAAAACCTACAGTAACAAT C 60 . QNAME=cluster23_scaffold_3;QSTART=25431380;QSTRAND=-;SVTYPE=DEL;SVLEN=-64;PctSeqSimilarity=.;PctSizeSimilarity=.;PctRecOverlap=.;SizeDiff=.;StartDistance=.;EndDistance=.;TruScore=.;MatchId=5._.0 GT 1/0 -chr20 641906 . G GGCGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGAGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGAGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGGGTTGCCT 60 . QNAME=cluster23_scaffold_2;QSTART=25053389;QSTRAND=-;SVTYPE=INS;SVLEN=163;PctSeqSimilarity=0.692156;PctSizeSimilarity=0.25873;PctRecOverlap=0;SizeDiff=467;StartDistance=301;EndDistance=301;GTMatch;TruScore=41;MatchId=10.0.0 GT 0/1 +chr20 641906 . G GGCGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGAGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGAGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGGGTTGCCT 60 . QNAME=cluster23_scaffold_2;QSTART=25053389;QSTRAND=-;SVTYPE=INS;SVLEN=163;PctSeqSimilarity=0.701844;PctSizeSimilarity=0.257911;PctRecOverlap=0;SizeDiff=469;StartDistance=324;EndDistance=324;TruScore=41;MatchId=10.1.0 GT 0/1 chr20 642068 . G GTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGGGCCCAGCGGGGGTGGGGTTGCCTGGGGGGGGCCCAGCGGGGGTGGAGTTGCC 60 . QNAME=cluster23_scaffold_2;QSTART=25053097;QSTRAND=-;SVTYPE=INS;SVLEN=129;PctSeqSimilarity=0.562989;PctSizeSimilarity=0.204114;PctRecOverlap=0;SizeDiff=503;StartDistance=162;EndDistance=162;TruScore=33;MatchId=10.1.1 GT 0/1 chr20 642271 . G GGCCCAGCGGGGGTGGGGTTGCCTGGGGGGGGCCCAGCGGGGGTGGGGTTGCCTGGGGGGGGAGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGAGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGTGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGGGGGAGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGATGGGGTTGCCTGGGGGGGGGC 60 . QNAME=cluster23_scaffold_3;QSTART=25068731;QSTRAND=-;SVTYPE=INS;SVLEN=257;PctSeqSimilarity=0.641691;PctSizeSimilarity=0.407937;PctRecOverlap=0;SizeDiff=373;StartDistance=-64;EndDistance=-64;TruScore=42;MatchId=10.0.2 GT 1/0 chr20 642330 . G GCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGGGGGAGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGTGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGGGGGGGGGGC 60 . QNAME=cluster23_scaffold_2;QSTART=25052609;QSTRAND=-;SVTYPE=INS;SVLEN=226;PctSeqSimilarity=0.649306;PctSizeSimilarity=0.35873;PctRecOverlap=0;SizeDiff=404;StartDistance=-123;EndDistance=-123;GTMatch;TruScore=41;MatchId=10.0.3 GT 0/1 diff --git a/repo_utils/test_files/answer_key/multi_removed_common.vcf b/repo_utils/test_files/answer_key/multi_removed_common.vcf index 76e5bbc1..8291c4d4 100644 --- a/repo_utils/test_files/answer_key/multi_removed_common.vcf +++ b/repo_utils/test_files/answer_key/multi_removed_common.vcf @@ -47,8 +47,8 @@ ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA24385 NA12878 HG00733 -chr20 149073 . G GAATCCTGACCCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCA 60 . QNAME=cluster23_scaffold_2;QSTART=25547471;QSTRAND=-;SVTYPE=INS;SVLEN=69;PctSeqSimilarity=0.996479;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-60;EndDistance=-60;TruScore=74;MatchId=1.0 GT ./. ./. 0/1 chr20 149095 . G GGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCGAATCCTGACCCATATTTGGGAA 60 . QNAME=cluster19_000000F;QSTART=25613718;QSTRAND=-;SVTYPE=INS;SVLEN=69;PctSeqSimilarity=0.997024;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-82;EndDistance=-82;TruScore=74;MatchId=1.0 GT ./. 1/1 ./. +chr20 149073 . G GAATCCTGACCCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCA 60 . QNAME=cluster23_scaffold_2;QSTART=25547471;QSTRAND=-;SVTYPE=INS;SVLEN=69;PctSeqSimilarity=0.996479;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-60;EndDistance=-60;TruScore=74;MatchId=1.0 GT ./. ./. 0/1 chr20 278930 . C CGGAGGGAGGGCGGGACGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGAGGGACGGAGGGCGGGACGGCGGGAGGGCGGGACGGAGGGACGGAGGGAGGGCGGGACGGAGGGCGGGAGGGCGGGACGGAGGGCGGGAG 60 . QNAME=cluster23_scaffold_2;QSTART=25417363;QSTRAND=-;SVTYPE=INS;SVLEN=188;PctSeqSimilarity=0.986631;PctSizeSimilarity=0.978723;PctRecOverlap=1;SizeDiff=-4;StartDistance=0;EndDistance=0;TruScore=98;MatchId=2.0 GT ./. ./. 0/1 chr20 306268 . A ACCAGGCTGGAGTGCAGTGGCTCACTGCGTGGCTCGCTACAGCCTACAACTCCTGGGCTCCAGCAATCCTGCTGCCCCAGCCTCCTGTGTAACTGAGACTACAGGCACGCACCACCACACCCAGCTAATGTTTTCTTTCTTTTTTTTTTTTTTGAGATGAACTCTCACTCTGTTGC 60 . QNAME=cluster19_000000F;QSTART=25455794;QSTRAND=-;SVTYPE=INS;SVLEN=175;PctSeqSimilarity=0.997159;PctSizeSimilarity=1;PctRecOverlap=1;SizeDiff=0;StartDistance=0;EndDistance=0;TruScore=99;MatchId=4.0 GT ./. 1/1 ./. chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCCATCCCCCGTCCGCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 61 . QNAME=HG002-S9-H2-000001F;QSTART=374905;QSTRAND=+;SVTYPE=INS;SVLEN=227;PctSeqSimilarity=0.995604;PctSizeSimilarity=0.995595;PctRecOverlap=1;SizeDiff=-1;StartDistance=0;EndDistance=0;TruScore=99;MatchId=6.0 GT 0/1 ./. 1/1 diff --git a/repo_utils/test_files/answer_key/multi_removed_first.vcf b/repo_utils/test_files/answer_key/multi_removed_first.vcf index 76e5bbc1..8291c4d4 100644 --- a/repo_utils/test_files/answer_key/multi_removed_first.vcf +++ b/repo_utils/test_files/answer_key/multi_removed_first.vcf @@ -47,8 +47,8 @@ ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA24385 NA12878 HG00733 -chr20 149073 . G GAATCCTGACCCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCA 60 . QNAME=cluster23_scaffold_2;QSTART=25547471;QSTRAND=-;SVTYPE=INS;SVLEN=69;PctSeqSimilarity=0.996479;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-60;EndDistance=-60;TruScore=74;MatchId=1.0 GT ./. ./. 0/1 chr20 149095 . G GGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCGAATCCTGACCCATATTTGGGAA 60 . QNAME=cluster19_000000F;QSTART=25613718;QSTRAND=-;SVTYPE=INS;SVLEN=69;PctSeqSimilarity=0.997024;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-82;EndDistance=-82;TruScore=74;MatchId=1.0 GT ./. 1/1 ./. +chr20 149073 . G GAATCCTGACCCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCA 60 . QNAME=cluster23_scaffold_2;QSTART=25547471;QSTRAND=-;SVTYPE=INS;SVLEN=69;PctSeqSimilarity=0.996479;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-60;EndDistance=-60;TruScore=74;MatchId=1.0 GT ./. ./. 0/1 chr20 278930 . C CGGAGGGAGGGCGGGACGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGAGGGACGGAGGGCGGGACGGCGGGAGGGCGGGACGGAGGGACGGAGGGAGGGCGGGACGGAGGGCGGGAGGGCGGGACGGAGGGCGGGAG 60 . QNAME=cluster23_scaffold_2;QSTART=25417363;QSTRAND=-;SVTYPE=INS;SVLEN=188;PctSeqSimilarity=0.986631;PctSizeSimilarity=0.978723;PctRecOverlap=1;SizeDiff=-4;StartDistance=0;EndDistance=0;TruScore=98;MatchId=2.0 GT ./. ./. 0/1 chr20 306268 . A ACCAGGCTGGAGTGCAGTGGCTCACTGCGTGGCTCGCTACAGCCTACAACTCCTGGGCTCCAGCAATCCTGCTGCCCCAGCCTCCTGTGTAACTGAGACTACAGGCACGCACCACCACACCCAGCTAATGTTTTCTTTCTTTTTTTTTTTTTTGAGATGAACTCTCACTCTGTTGC 60 . QNAME=cluster19_000000F;QSTART=25455794;QSTRAND=-;SVTYPE=INS;SVLEN=175;PctSeqSimilarity=0.997159;PctSizeSimilarity=1;PctRecOverlap=1;SizeDiff=0;StartDistance=0;EndDistance=0;TruScore=99;MatchId=4.0 GT ./. 1/1 ./. chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCCATCCCCCGTCCGCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 61 . QNAME=HG002-S9-H2-000001F;QSTART=374905;QSTRAND=+;SVTYPE=INS;SVLEN=227;PctSeqSimilarity=0.995604;PctSizeSimilarity=0.995595;PctRecOverlap=1;SizeDiff=-1;StartDistance=0;EndDistance=0;TruScore=99;MatchId=6.0 GT 0/1 ./. 1/1 diff --git a/repo_utils/test_files/answer_key/multi_removed_maxqual.vcf b/repo_utils/test_files/answer_key/multi_removed_maxqual.vcf index 76e5bbc1..8291c4d4 100644 --- a/repo_utils/test_files/answer_key/multi_removed_maxqual.vcf +++ b/repo_utils/test_files/answer_key/multi_removed_maxqual.vcf @@ -47,8 +47,8 @@ ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA24385 NA12878 HG00733 -chr20 149073 . G GAATCCTGACCCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCA 60 . QNAME=cluster23_scaffold_2;QSTART=25547471;QSTRAND=-;SVTYPE=INS;SVLEN=69;PctSeqSimilarity=0.996479;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-60;EndDistance=-60;TruScore=74;MatchId=1.0 GT ./. ./. 0/1 chr20 149095 . G GGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCGAATCCTGACCCATATTTGGGAA 60 . QNAME=cluster19_000000F;QSTART=25613718;QSTRAND=-;SVTYPE=INS;SVLEN=69;PctSeqSimilarity=0.997024;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-82;EndDistance=-82;TruScore=74;MatchId=1.0 GT ./. 1/1 ./. +chr20 149073 . G GAATCCTGACCCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCA 60 . QNAME=cluster23_scaffold_2;QSTART=25547471;QSTRAND=-;SVTYPE=INS;SVLEN=69;PctSeqSimilarity=0.996479;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-60;EndDistance=-60;TruScore=74;MatchId=1.0 GT ./. ./. 0/1 chr20 278930 . C CGGAGGGAGGGCGGGACGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGCGGGACGGAGGGAGGGAGGGAGGGACGGAGGGCGGGACGGCGGGAGGGCGGGACGGAGGGACGGAGGGAGGGCGGGACGGAGGGCGGGAGGGCGGGACGGAGGGCGGGAG 60 . QNAME=cluster23_scaffold_2;QSTART=25417363;QSTRAND=-;SVTYPE=INS;SVLEN=188;PctSeqSimilarity=0.986631;PctSizeSimilarity=0.978723;PctRecOverlap=1;SizeDiff=-4;StartDistance=0;EndDistance=0;TruScore=98;MatchId=2.0 GT ./. ./. 0/1 chr20 306268 . A ACCAGGCTGGAGTGCAGTGGCTCACTGCGTGGCTCGCTACAGCCTACAACTCCTGGGCTCCAGCAATCCTGCTGCCCCAGCCTCCTGTGTAACTGAGACTACAGGCACGCACCACCACACCCAGCTAATGTTTTCTTTCTTTTTTTTTTTTTTGAGATGAACTCTCACTCTGTTGC 60 . QNAME=cluster19_000000F;QSTART=25455794;QSTRAND=-;SVTYPE=INS;SVLEN=175;PctSeqSimilarity=0.997159;PctSizeSimilarity=1;PctRecOverlap=1;SizeDiff=0;StartDistance=0;EndDistance=0;TruScore=99;MatchId=4.0 GT ./. 1/1 ./. chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCCATCCCCCGTCCGCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 61 . QNAME=HG002-S9-H2-000001F;QSTART=374905;QSTRAND=+;SVTYPE=INS;SVLEN=227;PctSeqSimilarity=0.995604;PctSizeSimilarity=0.995595;PctRecOverlap=1;SizeDiff=-1;StartDistance=0;EndDistance=0;TruScore=99;MatchId=6.0 GT 0/1 ./. 1/1 diff --git a/repo_utils/test_files/answer_key/truv2df.jl b/repo_utils/test_files/answer_key/truv2df.jl index 13a3388c..e598a89c 100644 Binary files a/repo_utils/test_files/answer_key/truv2df.jl and b/repo_utils/test_files/answer_key/truv2df.jl differ diff --git a/truvari/__init__.py b/truvari/__init__.py index 2788fd2a..8db78626 100644 --- a/truvari/__init__.py +++ b/truvari/__init__.py @@ -46,7 +46,7 @@ Objects: :class:`GT` -:class:`GenomeTree` +:class:`RegionVCFIterator` :class:`LogFileStderr` :class:`MatchResult` :class:`Matcher` @@ -73,8 +73,8 @@ Matcher ) -from truvari.genome_tree import ( - GenomeTree, +from truvari.region_vcf_iter import ( + RegionVCFIterator, ) from truvari.comparisons import ( diff --git a/truvari/bench.py b/truvari/bench.py index 3d6a478e..9598fa7c 100644 --- a/truvari/bench.py +++ b/truvari/bench.py @@ -20,7 +20,6 @@ import truvari from truvari.giab_report import make_giabreport - @total_ordering class MatchResult(): # pylint: disable=too-many-instance-attributes """ @@ -115,7 +114,6 @@ def make_match_params(): ret.sizemax = 50000 ret.passonly = False ret.no_ref = False - ret.includebed = None ret.multimatch = False return ret @@ -152,15 +150,14 @@ def filter_call(self, entry, base=False): Returns True if the call should be filtered Base has different filtering requirements, so let the method know """ - # See if it hits the regions, first - if self.params.includebed and not self.params.includebed.include(entry): + size = truvari.entry_size(entry) + if size > self.params.sizemax: return True - size = truvari.entry_size(entry) - if base and size < self.params.sizemin or size > self.params.sizemax: + if base and size < self.params.sizemin: return True - if not base and size < self.params.sizefilt or size > self.params.sizemax: + if not base and size < self.params.sizefilt: return True samp = self.params.bSample if base else self.params.cSample @@ -525,7 +522,7 @@ def annotate_entry(entry, match, header): entry.info["StartDistance"] = match.st_dist entry.info["EndDistance"] = match.ed_dist entry.info["GTMatch"] = match.gt_match - entry.info["TruScore"] = match.score + entry.info["TruScore"] = int(match.score) if match.score else None entry.info["MatchId"] = match.matid entry.info["Multi"] = match.multi return entry @@ -757,37 +754,6 @@ def close_outputs(outputs): outputs["fp_out"].close() -class custom_iterator(): # pylint: disable=too-few-public-methods - """ - Iterate vcfs in the same order, regardless of sort order - """ - def __init__(self, fn): - """ - Wrap VariantFile - """ - self.vcf = pysam.VariantFile(fn) - self.all_chroms = sorted(self.vcf.header.contigs) - self.gen = None - self.pos = None - - def __next__(self): - """ - Wrap next - """ - if self.pos is None: - self.pos = 0 - self.gen = self.vcf.fetch(self.all_chroms[self.pos]) - elif self.pos >= len(self.all_chroms): - raise StopIteration - try: - return next(self.gen) - except StopIteration as stop_iter: - self.pos += 1 - if self.pos >= len(self.all_chroms): - raise StopIteration from stop_iter - self.gen = self.vcf.fetch(self.all_chroms[self.pos]) - return next(self) - def bench_main(cmdargs): """ Main @@ -803,11 +769,11 @@ def bench_main(cmdargs): base = pysam.VariantFile(args.base) comp = pysam.VariantFile(args.comp) - matcher.params.includebed = truvari.GenomeTree(base, comp, - args.includebed, - args.sizemax) - base_i = custom_iterator(args.base) - comp_i = custom_iterator(args.comp) + + regions = truvari.RegionVCFIterator(base, comp, args.includebed, args.sizemax) + base_i = regions.iterate(base) + comp_i = regions.iterate(comp) + chunks = chunker(matcher, ('base', base_i), ('comp', comp_i)) for call in itertools.chain.from_iterable(map(compare_chunk, chunks)): output_writer(call, outputs, args.sizemin) diff --git a/truvari/comparisons.py b/truvari/comparisons.py index 16da9e93..0f9c3d7d 100644 --- a/truvari/comparisons.py +++ b/truvari/comparisons.py @@ -437,7 +437,7 @@ def weighted_score(sim, size, ovl): :rtype: float """ score = (2 * sim + 1 * size + 1 * ovl) / 3.0 - new_score = int(score / 1.333333 * 100) + new_score = score / 1.333333 * 100 return new_score diff --git a/truvari/genome_tree.py b/truvari/region_vcf_iter.py similarity index 78% rename from truvari/genome_tree.py rename to truvari/region_vcf_iter.py index b321d40b..7953c524 100644 --- a/truvari/genome_tree.py +++ b/truvari/region_vcf_iter.py @@ -7,13 +7,14 @@ from intervaltree import IntervalTree import truvari.comparisons as tcomp - -class GenomeTree(): +class RegionVCFIterator(): """ - Helper class to specify included regions of the genome when iterating events. + Helper class to specify include regions of the genome when iterating a VCF + Subset to only events less than max_span. + Subset to only events on contigs listed in vcfA left-join vcfB """ - def __init__(self, vcfA, vcfB, includebed=None, max_span=None): + def __init__(self, vcfA, vcfB=None, includebed=None, max_span=None): """ init """ self.includebed = includebed self.max_span = max_span @@ -24,7 +25,10 @@ def __build_tree(self, vcfA, vcfB): Build the include regions """ contigA_set = set(vcfA.header.contigs.keys()) - contigB_set = set(vcfB.header.contigs.keys()) + if vcfB is not None: + contigB_set = set(vcfB.header.contigs.keys()) + else: + contigB_set = contigA_set all_regions = defaultdict(IntervalTree) if self.includebed is not None: counter = 0 @@ -53,10 +57,10 @@ def __build_tree(self, vcfA, vcfB): def iterate(self, vcf_file): """ - Iterates a vcf and yields only the entries that overlap an 'include' region + Iterates a vcf and yields only the entries that overlap included regions """ - for chrom in self.tree: - for intv in self.tree[chrom]: + for chrom in sorted(self.tree.keys()): + for intv in sorted(self.tree[chrom]): for entry in vcf_file.fetch(chrom, intv.begin, intv.end): if self.includebed is None or self.include(entry): yield entry @@ -70,8 +74,8 @@ def include(self, entry): # Filter these early so we don't have to keep checking overlaps if self.max_span is None or aend - astart > self.max_span: return False - overlaps = self.tree[entry.chrom].overlaps( - astart) and self.tree[entry.chrom].overlaps(aend) + overlaps = self.tree[entry.chrom].overlaps(astart) \ + and self.tree[entry.chrom].overlaps(aend) if astart == aend: return overlaps return overlaps and len(self.tree[entry.chrom].overlap(astart, aend)) == 1