Skip to content

Commit

Permalink
Add option to make annotaTR less strict on Beagle AP field checks
Browse files Browse the repository at this point in the history
  • Loading branch information
Melissa Gymrek authored and Melissa Gymrek committed Aug 30, 2024
1 parent ae3ca15 commit e9f23ae
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 13 deletions.
7 changes: 6 additions & 1 deletion trtools/annotaTR/annotaTR.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,11 @@ def getargs(): # pragma: no cover
help="Compute genotype dosages. "
"Optionally specify how. Options=%s"%[str(item) for item in trh.TRDosageTypes.__members__],
type=str)
annot_group.add_argument(
"--warn-on-AP-error",
help="Output a warning but don't crash on error computing on AP field",
action="store_true"
)
annot_group.add_argument(
"--ref-panel",
help="Annotate Beagle-imputed VCF with TR metadata from the reference panel. "
Expand Down Expand Up @@ -555,7 +560,7 @@ def main(args):
minlen = trrecord.min_allele_length
maxlen = trrecord.max_allele_length
if dosage_type is not None:
dosages = trrecord.GetDosages(dosage_type)
dosages = trrecord.GetDosages(dosage_type, strict=(not args.warn_on_AP_error))
# Update record
record.INFO["DSLEN"] = "{minlen},{maxlen}".format(minlen=minlen, maxlen=maxlen)
record.set_format("TRDS", np.array(dosages))
Expand Down
12 changes: 12 additions & 0 deletions trtools/annotaTR/tests/test_annotaTR.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def args(tmpdir):
args.ignore_duplicates = False
args.debug = False
args.chunk_size = 1000
args.warn_on_AP_error = False
return args

@pytest.fixture
Expand Down Expand Up @@ -125,6 +126,17 @@ def test_DosageType(args, vcfdir):
args.dosages = "beagleap_norm"
retcode = main(args)
assert retcode==1
fname = os.path.join(vcfdir, "beagle", "1kg_snpstr_21_first_100k_second_50_STRs_imputed.vcf.gz")
args.vcf = fname
args.vcftype = "hipstr"
args.ref_panel = os.path.join(vcfdir, "beagle", "1kg_snpstr_21_first_100k_first_50_annotated.vcf.gz")
args.warn_on_AP_error = True # should't fail with missing AP
args.outtype = ["pgen","vcf"]
retcode = main(args)
assert retcode==0
args.warn_on_AP_error = False # set back
with pytest.raises(ValueError):
main(args)
# Beagle VCF
fname = os.path.join(vcfdir, "beagle", "beagle_imputed_withap.vcf.gz")
args.vcf = fname
Expand Down
9 changes: 8 additions & 1 deletion trtools/utils/tests/test_trharmonizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,9 +389,13 @@ def test_TRRecord_GetGenotypes_Dosages():
diploid_record.FORMAT["AP1"] = np.array([[10, 0.5], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0]])
with pytest.raises(ValueError):
rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap)
# If not in strict mode these should be nan
assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap, strict=False)).all()
diploid_record.FORMAT["AP1"] = np.array([[-0.5, 0.5], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0]])
with pytest.raises(ValueError):
rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap)
# If not in strict mode these should be nan
assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap, strict=False)).all()

# Test triploid example where alt=[]
triploid_record = get_triploid_record()
Expand Down Expand Up @@ -421,7 +425,10 @@ def test_TRRecord_GetGenotypes_Dosages():
rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap)
with pytest.raises(ValueError):
rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap_norm)

# If not in strict mode, should instead return NA
assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap, strict=False)).all()
assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap_norm, strict=False)).all()

# Test example with fewer alt_alleles than the max genotype index
with pytest.raises(ValueError):
trh.TRRecord(dummy_record, ref_allele, [], "CAG", "", None)
Expand Down
44 changes: 33 additions & 11 deletions trtools/utils/tr_harmonizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import numpy as np

import trtools.utils.utils as utils

import trtools.utils.common as common

# List of supported VCF types
# TODO: add Beagle
Expand Down Expand Up @@ -1084,7 +1084,7 @@ def UniqueStringGenotypes(self) -> Set[int]:
return set(self.UniqueStringGenotypeMapping().values())

def GetDosages(self,
dosagetype: TRDosageTypes = TRDosageTypes.bestguess) -> Optional[np.ndarray]:
dosagetype: TRDosageTypes = TRDosageTypes.bestguess, strict: bool = True) -> Optional[np.ndarray]:
"""
Get an array of genotype dosages for each sample.
Expand All @@ -1105,6 +1105,9 @@ def GetDosages(self,
----------
dosagetype : Enum
Which TRDosageType to compute. Default bestguess
strict : bool
If False, output a warning but do not die on errors validating AP field
If errors are encountered, return nan dosage values
Returns
-------
Expand All @@ -1115,10 +1118,14 @@ def GetDosages(self,
if self.GetNumSamples() == 0:
return None
if (dosagetype in [TRDosageTypes.beagleap, TRDosageTypes.beagleap_norm]) and \
(self.vcfrecord.format("AP1") is None or self.vcfrecord.format("AP2") is None):
raise ValueError(
"Requested Beagle dosages for record at {}:{} but AP1/AP2 fields not found.".format(self.chrom, self.pos)
)
(("AP1" not in self.vcfrecord.FORMAT or "AP2" not in self.vcfrecord.FORMAT) or \
(self.vcfrecord.format("AP1") is None or self.vcfrecord.format("AP2") is None)):
error_msg = "Requested Beagle dosages for record at {}:{} but AP1/AP2 fields not found.".format(self.chrom, self.pos)
if strict:
raise ValueError(error_msg)
else:
common.WARNING(error_msg)
return np.array([np.nan]*self.GetNumSamples())
if dosagetype in [TRDosageTypes.bestguess, TRDosageTypes.bestguess_norm]:
# Get length gts and replace -1 (missing) and -2 (low ploidy) with 0
# But if normalizing set those to np.nan since unclear
Expand All @@ -1141,10 +1148,19 @@ def GetDosages(self,

# Check AP field. allow wiggle room for rounding
if np.any(np.sum(ap1, axis=1) > 1.1) or np.any(np.sum(ap2, axis=1) > 1.1):
raise ValueError("AP1 or AP2 field summing to more than 1 detected")
error_msg = "{}:{} AP1 or AP2 field summing to more than 1 detected".format(self.chrom, self.pos)
if strict:
raise ValueError(error_msg)
else:
common.WARNING(error_msg)
return np.array([np.nan]*self.GetNumSamples())
if np.any(ap1 < 0) or np.any(ap2 < 0):
raise ValueError("Negative AP1 or AP2 fields detected")

error_msg = "{}:{} Negative AP1 or AP2 fields detected".format(self.chrom, self.pos)
if strict:
raise ValueError("Negative AP1 or AP2 fields detected")
else:
common.WARNING(error_msg)
return np.array([np.nan]*self.GetNumSamples())
# Get haplotype dosages
if len(self.alt_allele_lengths) > 0:
max_alt_len = max(self.alt_allele_lengths)
Expand All @@ -1167,12 +1183,18 @@ def GetDosages(self,
else:
# Normalize to be between 0 and 2
dosages = (unnorm_dosages-2*self.min_allele_length)/(self.max_allele_length-self.min_allele_length)
assert not (np.any(dosages>=2.1) or np.any(dosages<=-0.1))
if (np.any(dosages>=2.1) or np.any(dosages<=-0.1)):
error_msg = "{}:{} Error normalizing dosages: value >=2.1 or <=-0.1 detected".format(self.chrom, self.pos)
if strict:
raise ValueError(error_msg)
else:
common.WARNING(error_msg)
return np.array([np.nan]*self.GetNumSamples())
dosages = np.clip(dosages, 0, 2)
else:
dosages = unnorm_dosages
return dosages

def GetLengthGenotypes(self) -> Optional[np.ndarray]:
"""
Get an array of length genotypes for each sample.
Expand Down

0 comments on commit e9f23ae

Please sign in to comment.