Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/allow vcf row overlap #35

Merged
merged 3 commits into from
Jul 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
1.3.2
1.3.3

13 changes: 8 additions & 5 deletions gumpy/difference.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from abc import ABC # Python library for abstract classes
from typing import Dict, List, Tuple
import numpy

from tqdm import tqdm
import gumpy


Expand Down Expand Up @@ -370,10 +370,13 @@ def __get_variants(self):

# for now we simply allow the ref and alt to be different e.g. can have a
# SNP on a NULL (x>t)
for r, idx, a in zip(
self.genome1.nucleotide_sequence[mask],
self.genome1.nucleotide_index[mask],
self.genome2.nucleotide_sequence[mask],
for r, idx, a in tqdm(
zip(
self.genome1.nucleotide_sequence[mask],
self.genome1.nucleotide_index[mask],
self.genome2.nucleotide_sequence[mask],
),
total=len(self.genome1.nucleotide_sequence[mask]),
):
variants.append(str(idx) + r + ">" + a)
refs.append(r)
Expand Down
34 changes: 21 additions & 13 deletions gumpy/variantfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import warnings
from collections import defaultdict
from typing import Collection, Dict, Iterable, List, Tuple
from tqdm import tqdm

import numpy
import pandas
Expand Down Expand Up @@ -419,14 +420,14 @@ def _find_minor_populations(self):
)
pos = self.calls[(idx, type_)]["pos"]
simple_calls.append((idx, pos, t, bases))
seen = []
seen = set()

for idx, type_ in self.calls.keys():
for idx, type_ in tqdm(self.calls.keys()):
# Check if we've delt with this vcf already
if self.calls[(idx, type_)]["original_vcf_row"] in seen:
if str(self.calls[(idx, type_)]["original_vcf_row"]) in seen:
continue
else:
seen.append(self.calls[(idx, type_)]["original_vcf_row"])
seen.add(str(self.calls[(idx, type_)]["original_vcf_row"]))

# Pull out depth tag from the specific row's format fields
# as the file metadata isn't a guarantee of the actual fields of this row
Expand Down Expand Up @@ -515,7 +516,6 @@ def __find_calls(self):
"""

self.calls = {}
record_positions = set()

for record in self.records:
# VCF files are 1 indexed but keep for now
Expand Down Expand Up @@ -576,12 +576,6 @@ def __find_calls(self):
variant = record.ref
variant_type = "ref"

if index in record_positions:
raise ValueError(
"Multiple calls at position " + str(index) + " in VCF file"
)
record_positions.add(index)

# if the REF, ALT pair are the same length, check if we can decompose
# into SNPs
if len(record.ref) == len(variant):
Expand All @@ -595,6 +589,10 @@ def __find_calls(self):
vcf_info["REF"] = record.ref
vcf_info["ALTS"] = record.alts
metadata["original_vcf_row"] = vcf_info
if self.calls.get((index + counter, variant_type)) is not None:
raise ValueError(
"Multiple calls at position " + str(index) + " in VCF file"
)
self.calls[(index + counter, variant_type)] = metadata

# otherwise the REF, ALT pair are different lengths
Expand All @@ -614,7 +612,12 @@ def __find_calls(self):
vcf_info["REF"] = record.ref
vcf_info["ALTS"] = record.alts
metadata["original_vcf_row"] = vcf_info

if self.calls.get((index + p, "indel")) is not None:
raise ValueError(
"Multiple calls at position "
+ str(index)
+ " in VCF file"
)
self.calls[(index + p, "indel")] = metadata

else:
Expand All @@ -627,7 +630,12 @@ def __find_calls(self):
vcf_info["REF"] = record.ref
vcf_info["ALTS"] = record.alts
metadata["original_vcf_row"] = vcf_info

if self.calls.get((index + p, variant_type)) is not None:
raise ValueError(
"Multiple calls at position "
+ str(index)
+ " in VCF file"
)
self.calls[(index + p, variant_type)] = metadata

def _simplify_call(self, ref: str, alt: str) -> List[Tuple[int, str, str]]:
Expand Down
4 changes: 2 additions & 2 deletions tests/test-cases/TEST-DNA-5.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
##contig=<ID=TEST_DNA,length=99>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 03.vcf
TEST_DNA 4 . A C,CT . PASS KMER=15 GT:DP:COV:GT_CONF 0/0:3:1,1,1:613.77
TEST_DNA 24 . G T,C,A . PASS KMER=15 GT:DP:COV:GT_CONF 0/0:4:1,1,1,1:613.77
TEST_DNA 26 . GG AA,C,AT . PASS KMER=15 GT:DP:COV:GT_CONF 1/1:4:1,1,1,1:475.54
TEST_DNA 21 . G T,C,A . PASS KMER=15 GT:DP:COV:GT_CONF 0/0:4:1,1,1,1:613.77
TEST_DNA 25 . GG AA,C,AT . PASS KMER=15 GT:DP:COV:GT_CONF 1/1:4:1,1,1,1:475.54
TEST_DNA 27 . GG AA,C,AT . PASS KMER=15 GT:DP:AD:GT_CONF 1/1:4:1,1,1,1:475.54
TEST_DNA 29 . G . . PASS KMER=15 GT:DP:AD:GT_CONF 0:1:1:475.54
6 changes: 4 additions & 2 deletions tests/unit/test_instantiation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2736,7 +2736,8 @@ def test_min_dp():
before = gumpy.VCFFile("tests/test-cases/TEST-DNA-5.vcf")
assert sorted(list(before.calls.keys())) == [
(4, "ref"),
(24, "ref"),
(21, "ref"),
(25, "snp"),
(26, "snp"),
(27, "snp"),
(28, "snp"),
Expand All @@ -2746,7 +2747,8 @@ def test_min_dp():
after = gumpy.VCFFile("tests/test-cases/TEST-DNA-5.vcf", min_dp=2)
assert sorted(list(after.calls.keys())) == [
(4, "null"),
(24, "null"),
(21, "null"),
(25, "null"),
(26, "null"),
(27, "null"),
(28, "null"),
Expand Down
24 changes: 6 additions & 18 deletions tests/unit/test_minor_populations.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,24 +31,12 @@ def make_reproducable(list_: [str]) -> [str]:

def test_double_del():
"""Testing for expected crashes with deletions at the same base"""
ref = gumpy.Genome("config/TEST-DNA.gbk")
vcf = gumpy.VCFFile(
"tests/test-cases/TEST-DNA-double-del.vcf",
ignore_filter=True,
minor_population_indices={3, 4, 5, 6, 7},
)
sample = ref + vcf

ref_a = ref.build_gene("A")
sample_a = sample.build_gene("A")

diff = ref_a - sample_a
assert sorted(diff.minor_populations()) == sorted(
["1_del_aa:3", "2_indel:3", "3_del_a:3"]
)
assert sorted(sample_a.minority_populations_GARC()) == sorted(
diff.minor_populations()
)
with pytest.raises(ValueError):
gumpy.VCFFile(
"tests/test-cases/TEST-DNA-double-del.vcf",
ignore_filter=True,
minor_population_indices={3, 4, 5, 6, 7},
)


@pytest.mark.slow
Expand Down
Loading