Skip to content

Commit

Permalink
Fix MergeSTR bug where alt allele woud be the same as reference
Browse files Browse the repository at this point in the history
  • Loading branch information
LiterallyUniqueLogin committed Mar 20, 2023
1 parent 7ea6ff4 commit 13a6f5a
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 4 deletions.
17 changes: 14 additions & 3 deletions trtools/mergeSTR/mergeSTR.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,12 +215,14 @@ def GetAltsByKey(current_records: List[trh.TRRecord], mergelist: List[bool], key
return result


def GetAltAlleles(current_records: List[trh.TRRecord], mergelist: List[bool], vcftype: Union[str, trh.VcfTypes]) \
def GetAltAlleles(ref_allele: str, current_records: List[trh.TRRecord], mergelist: List[bool], vcftype: Union[str, trh.VcfTypes]) \
-> Tuple[List[str], List[np.ndarray]]:
r"""Get list of alt alleles
Parameters
----------
ref_allele:
The (flank trimmed) reference allele, in upper case
current_records : list of vcf.Record
List of records being merged
mergelist : list of bool
Expand Down Expand Up @@ -260,6 +262,12 @@ def HipstrKey(record: trh.TRRecord):
alt_picker = HipstrKey

alts = GetAltsByKey(current_records, mergelist, alt_picker)
# normally the reference allele is distinct from all alt alleles
# however, if the record has flanking base pairs which are then removed,
# then the ref allele AAAAT and alt allele AAAAC would both be AAAA and
# would be duplicate, which shouldn't occur
if ref_allele in alts:
alts.remove(ref_allele)

if vcftype == trh.VcfTypes.eh:
# EH alleles look like <STR42> where 42 is the
Expand All @@ -272,12 +280,15 @@ def HipstrKey(record: trh.TRRecord):
else:
out_alts = sorted(alts, key=lambda x: (len(x), x))

alleles = [ref_allele]
alleles.extend(out_alts)

mappings = []
for i in range(len(mergelist)):
if mergelist[i]:
ralts = alt_picker(current_records[i])
mappings.append(
np.array([0] + [out_alts.index(ralt.upper()) + 1 for ralt in ralts]).astype(str)
np.array([0] + [alleles.index(ralt.upper()) for ralt in ralts]).astype(str)
)
return out_alts, mappings

Expand Down Expand Up @@ -466,7 +477,7 @@ def MergeRecords(readers: cyvcf2.VCF, vcftype: Union[str, trh.VcfTypes], num_sam
common.WARNING("Conflicting refs found at {}:{}. Skipping.".format(chrom, pos))
return

alt_alleles, mappings = GetAltAlleles(current_records, mergelist, vcftype)
alt_alleles, mappings = GetAltAlleles(ref_allele, current_records, mergelist, vcftype)

# Set common fields
vcfw.write(chrom) # CHROM
Expand Down
16 changes: 15 additions & 1 deletion trtools/mergeSTR/tests/test_mergeSTR.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import argparse
import os

import cyvcf2
import numpy as np
import pytest

Expand Down Expand Up @@ -199,9 +201,20 @@ def test_MissingFieldWarnings(capsys, args, mrgvcfdir):
captured = capsys.readouterr()
assert "Expected format field DP not found" in captured.err

def test_alt_same_len_as_ref_different_flanking(args, mrgvcfdir):
fname1 = os.path.join(mrgvcfdir, "test_file_hipstr1.vcf.gz")
fname2 = os.path.join(mrgvcfdir, "test_file_hipstr2_alt_v_ref.vcf.gz")
args.vcfs = fname1 + "," + fname2
main(args)

vcf = cyvcf2.VCF(args.out + '.vcf')
var = next(vcf)
for alt in var.ALT:
assert alt != var.REF

def test_ConflictingRefs():
# Set up dummy records
dummy_records = []
dummy_records = []
dummy_records.append(DummyHarmonizedRecord('chr1', 100, 'CAGCAG'))
dummy_records.append(DummyHarmonizedRecord('chr1', 100, 'CAGCAG'))
dummy_records.append(DummyHarmonizedRecord('chr1', 100, 'CAG'))
Expand Down Expand Up @@ -337,3 +350,4 @@ def test_popstr_output(args, mrgvcfdir):
# what if there are multiple records at the same location in the same VCF
# if a genotype is no called but there is other format info,
# we aren't emitting it. Is that intended?
# TODO write test where sample and allele orderings change
Binary file not shown.
Binary file not shown.

0 comments on commit 13a6f5a

Please sign in to comment.