Skip to content

Commit

Permalink
Merge pull request #35 from oxfordmmm/fix/allow-vcf-row-overlap
Browse files Browse the repository at this point in the history
Fix/allow vcf row overlap
  • Loading branch information
JeremyWesthead authored Jul 5, 2024
2 parents 4e1bd25 + ca8629a commit 7160454
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 41 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
1.3.2
1.3.3

13 changes: 8 additions & 5 deletions gumpy/difference.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from abc import ABC # Python library for abstract classes
from typing import Dict, List, Tuple
import numpy

from tqdm import tqdm
import gumpy


Expand Down Expand Up @@ -370,10 +370,13 @@ def __get_variants(self):

# for now we simply allow the ref and alt to be different e.g. can have a
# SNP on a NULL (x>t)
for r, idx, a in zip(
self.genome1.nucleotide_sequence[mask],
self.genome1.nucleotide_index[mask],
self.genome2.nucleotide_sequence[mask],
for r, idx, a in tqdm(
zip(
self.genome1.nucleotide_sequence[mask],
self.genome1.nucleotide_index[mask],
self.genome2.nucleotide_sequence[mask],
),
total=len(self.genome1.nucleotide_sequence[mask]),
):
variants.append(str(idx) + r + ">" + a)
refs.append(r)
Expand Down
34 changes: 21 additions & 13 deletions gumpy/variantfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import warnings
from collections import defaultdict
from typing import Collection, Dict, Iterable, List, Tuple
from tqdm import tqdm

import numpy
import pandas
Expand Down Expand Up @@ -419,14 +420,14 @@ def _find_minor_populations(self):
)
pos = self.calls[(idx, type_)]["pos"]
simple_calls.append((idx, pos, t, bases))
seen = []
seen = set()

for idx, type_ in self.calls.keys():
for idx, type_ in tqdm(self.calls.keys()):
# Check if we've delt with this vcf already
if self.calls[(idx, type_)]["original_vcf_row"] in seen:
if str(self.calls[(idx, type_)]["original_vcf_row"]) in seen:
continue
else:
seen.append(self.calls[(idx, type_)]["original_vcf_row"])
seen.add(str(self.calls[(idx, type_)]["original_vcf_row"]))

# Pull out depth tag from the specific row's format fields
# as the file metadata isn't a guarantee of the actual fields of this row
Expand Down Expand Up @@ -515,7 +516,6 @@ def __find_calls(self):
"""

self.calls = {}
record_positions = set()

for record in self.records:
# VCF files are 1 indexed but keep for now
Expand Down Expand Up @@ -576,12 +576,6 @@ def __find_calls(self):
variant = record.ref
variant_type = "ref"

if index in record_positions:
raise ValueError(
"Multiple calls at position " + str(index) + " in VCF file"
)
record_positions.add(index)

# if the REF, ALT pair are the same length, check if we can decompose
# into SNPs
if len(record.ref) == len(variant):
Expand All @@ -595,6 +589,10 @@ def __find_calls(self):
vcf_info["REF"] = record.ref
vcf_info["ALTS"] = record.alts
metadata["original_vcf_row"] = vcf_info
if self.calls.get((index + counter, variant_type)) is not None:
raise ValueError(
"Multiple calls at position " + str(index) + " in VCF file"
)
self.calls[(index + counter, variant_type)] = metadata

# otherwise the REF, ALT pair are different lengths
Expand All @@ -614,7 +612,12 @@ def __find_calls(self):
vcf_info["REF"] = record.ref
vcf_info["ALTS"] = record.alts
metadata["original_vcf_row"] = vcf_info

if self.calls.get((index + p, "indel")) is not None:
raise ValueError(
"Multiple calls at position "
+ str(index)
+ " in VCF file"
)
self.calls[(index + p, "indel")] = metadata

else:
Expand All @@ -627,7 +630,12 @@ def __find_calls(self):
vcf_info["REF"] = record.ref
vcf_info["ALTS"] = record.alts
metadata["original_vcf_row"] = vcf_info

if self.calls.get((index + p, variant_type)) is not None:
raise ValueError(
"Multiple calls at position "
+ str(index)
+ " in VCF file"
)
self.calls[(index + p, variant_type)] = metadata

def _simplify_call(self, ref: str, alt: str) -> List[Tuple[int, str, str]]:
Expand Down
4 changes: 2 additions & 2 deletions tests/test-cases/TEST-DNA-5.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
##contig=<ID=TEST_DNA,length=99>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 03.vcf
TEST_DNA 4 . A C,CT . PASS KMER=15 GT:DP:COV:GT_CONF 0/0:3:1,1,1:613.77
TEST_DNA 24 . G T,C,A . PASS KMER=15 GT:DP:COV:GT_CONF 0/0:4:1,1,1,1:613.77
TEST_DNA 26 . GG AA,C,AT . PASS KMER=15 GT:DP:COV:GT_CONF 1/1:4:1,1,1,1:475.54
TEST_DNA 21 . G T,C,A . PASS KMER=15 GT:DP:COV:GT_CONF 0/0:4:1,1,1,1:613.77
TEST_DNA 25 . GG AA,C,AT . PASS KMER=15 GT:DP:COV:GT_CONF 1/1:4:1,1,1,1:475.54
TEST_DNA 27 . GG AA,C,AT . PASS KMER=15 GT:DP:AD:GT_CONF 1/1:4:1,1,1,1:475.54
TEST_DNA 29 . G . . PASS KMER=15 GT:DP:AD:GT_CONF 0:1:1:475.54
6 changes: 4 additions & 2 deletions tests/unit/test_instantiation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2736,7 +2736,8 @@ def test_min_dp():
before = gumpy.VCFFile("tests/test-cases/TEST-DNA-5.vcf")
assert sorted(list(before.calls.keys())) == [
(4, "ref"),
(24, "ref"),
(21, "ref"),
(25, "snp"),
(26, "snp"),
(27, "snp"),
(28, "snp"),
Expand All @@ -2746,7 +2747,8 @@ def test_min_dp():
after = gumpy.VCFFile("tests/test-cases/TEST-DNA-5.vcf", min_dp=2)
assert sorted(list(after.calls.keys())) == [
(4, "null"),
(24, "null"),
(21, "null"),
(25, "null"),
(26, "null"),
(27, "null"),
(28, "null"),
Expand Down
24 changes: 6 additions & 18 deletions tests/unit/test_minor_populations.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,24 +31,12 @@ def make_reproducable(list_: [str]) -> [str]:

def test_double_del():
"""Testing for expected crashes with deletions at the same base"""
ref = gumpy.Genome("config/TEST-DNA.gbk")
vcf = gumpy.VCFFile(
"tests/test-cases/TEST-DNA-double-del.vcf",
ignore_filter=True,
minor_population_indices={3, 4, 5, 6, 7},
)
sample = ref + vcf

ref_a = ref.build_gene("A")
sample_a = sample.build_gene("A")

diff = ref_a - sample_a
assert sorted(diff.minor_populations()) == sorted(
["1_del_aa:3", "2_indel:3", "3_del_a:3"]
)
assert sorted(sample_a.minority_populations_GARC()) == sorted(
diff.minor_populations()
)
with pytest.raises(ValueError):
gumpy.VCFFile(
"tests/test-cases/TEST-DNA-double-del.vcf",
ignore_filter=True,
minor_population_indices={3, 4, 5, 6, 7},
)


@pytest.mark.slow
Expand Down

0 comments on commit 7160454

Please sign in to comment.