Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add option to make annotaTR less strict on Beagle AP field checks #233

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion trtools/annotaTR/annotaTR.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,11 @@ def getargs(): # pragma: no cover
help="Compute genotype dosages. "
"Optionally specify how. Options=%s"%[str(item) for item in trh.TRDosageTypes.__members__],
type=str)
annot_group.add_argument(
"--warn-on-AP-error",
help="Output a warning but don't crash on error computing on AP field",
action="store_true"
)
annot_group.add_argument(
"--ref-panel",
help="Annotate Beagle-imputed VCF with TR metadata from the reference panel. "
Expand Down Expand Up @@ -556,7 +561,7 @@ def main(args):
minlen = trrecord.min_allele_length
maxlen = trrecord.max_allele_length
if dosage_type is not None:
dosages = trrecord.GetDosages(dosage_type)
dosages = trrecord.GetDosages(dosage_type, strict=(not args.warn_on_AP_error))
# Update record
record.INFO["DSLEN"] = "{minlen},{maxlen}".format(minlen=minlen, maxlen=maxlen)
record.set_format("TRDS", np.array(dosages))
Expand Down
12 changes: 12 additions & 0 deletions trtools/annotaTR/tests/test_annotaTR.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def args(tmpdir):
args.ignore_duplicates = False
args.debug = False
args.chunk_size = 1000
args.warn_on_AP_error = False
return args

@pytest.fixture
Expand Down Expand Up @@ -125,6 +126,17 @@ def test_DosageType(args, vcfdir):
args.dosages = "beagleap_norm"
retcode = main(args)
assert retcode==1
fname = os.path.join(vcfdir, "beagle", "1kg_snpstr_21_first_100k_second_50_STRs_imputed.vcf.gz")
args.vcf = fname
args.vcftype = "hipstr"
args.ref_panel = os.path.join(vcfdir, "beagle", "1kg_snpstr_21_first_100k_first_50_annotated.vcf.gz")
args.warn_on_AP_error = True # should't fail with missing AP
args.outtype = ["pgen","vcf"]
retcode = main(args)
assert retcode==0
args.warn_on_AP_error = False # set back
with pytest.raises(ValueError):
main(args)
# Beagle VCF
fname = os.path.join(vcfdir, "beagle", "beagle_imputed_withap.vcf.gz")
args.vcf = fname
Expand Down
7 changes: 7 additions & 0 deletions trtools/utils/tests/test_trharmonizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,9 +389,13 @@ def test_TRRecord_GetGenotypes_Dosages():
diploid_record.FORMAT["AP1"] = np.array([[10, 0.5], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0]])
with pytest.raises(ValueError):
rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap)
# If not in strict mode these should be nan
assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap, strict=False)).all()
diploid_record.FORMAT["AP1"] = np.array([[-0.5, 0.5], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0]])
with pytest.raises(ValueError):
rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap)
# If not in strict mode these should be nan
assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap, strict=False)).all()

# Test triploid example where alt=[]
triploid_record = get_triploid_record()
Expand Down Expand Up @@ -421,6 +425,9 @@ def test_TRRecord_GetGenotypes_Dosages():
rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap)
with pytest.raises(ValueError):
rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap_norm)
# If not in strict mode, should instead return NA
assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap, strict=False)).all()
assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap_norm, strict=False)).all()

# Test example with fewer alt_alleles than the max genotype index
with pytest.raises(ValueError):
Expand Down
42 changes: 32 additions & 10 deletions trtools/utils/tr_harmonizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import numpy as np

import trtools.utils.utils as utils

import trtools.utils.common as common

# List of supported VCF types
# TODO: add Beagle
Expand Down Expand Up @@ -1097,7 +1097,7 @@ def UniqueStringGenotypes(self) -> Set[int]:
return set(self.UniqueStringGenotypeMapping().values())

def GetDosages(self,
dosagetype: TRDosageTypes = TRDosageTypes.bestguess) -> Optional[np.ndarray]:
dosagetype: TRDosageTypes = TRDosageTypes.bestguess, strict: bool = True) -> Optional[np.ndarray]:
"""
Get an array of genotype dosages for each sample.

Expand All @@ -1118,6 +1118,9 @@ def GetDosages(self,
----------
dosagetype : Enum
Which TRDosageType to compute. Default bestguess
strict : bool
If False, output a warning but do not die on errors validating AP field
If errors are encountered, return nan dosage values

Returns
-------
Expand All @@ -1128,10 +1131,14 @@ def GetDosages(self,
if self.GetNumSamples() == 0:
return None
if (dosagetype in [TRDosageTypes.beagleap, TRDosageTypes.beagleap_norm]) and \
(self.vcfrecord.format("AP1") is None or self.vcfrecord.format("AP2") is None):
raise ValueError(
"Requested Beagle dosages for record at {}:{} but AP1/AP2 fields not found.".format(self.chrom, self.pos)
)
(("AP1" not in self.vcfrecord.FORMAT or "AP2" not in self.vcfrecord.FORMAT) or \
(self.vcfrecord.format("AP1") is None or self.vcfrecord.format("AP2") is None)):
error_msg = "Requested Beagle dosages for record at {}:{} but AP1/AP2 fields not found.".format(self.chrom, self.pos)
if strict:
raise ValueError(error_msg)
else:
common.WARNING(error_msg)
return np.array([np.nan]*self.GetNumSamples())
if dosagetype in [TRDosageTypes.bestguess, TRDosageTypes.bestguess_norm]:
# Get length gts and replace -1 (missing) and -2 (low ploidy) with 0
# But if normalizing set those to np.nan since unclear
Expand All @@ -1154,10 +1161,19 @@ def GetDosages(self,

# Check AP field. allow wiggle room for rounding
if np.any(np.sum(ap1, axis=1) > 1.1) or np.any(np.sum(ap2, axis=1) > 1.1):
raise ValueError("AP1 or AP2 field summing to more than 1 detected")
error_msg = "{}:{} AP1 or AP2 field summing to more than 1 detected".format(self.chrom, self.pos)
if strict:
raise ValueError(error_msg)
else:
common.WARNING(error_msg)
return np.array([np.nan]*self.GetNumSamples())
if np.any(ap1 < 0) or np.any(ap2 < 0):
raise ValueError("Negative AP1 or AP2 fields detected")

error_msg = "{}:{} Negative AP1 or AP2 fields detected".format(self.chrom, self.pos)
if strict:
raise ValueError("Negative AP1 or AP2 fields detected")
else:
common.WARNING(error_msg)
return np.array([np.nan]*self.GetNumSamples())
# Get haplotype dosages
if len(self.alt_allele_lengths) > 0:
max_alt_len = max(self.alt_allele_lengths)
Expand All @@ -1180,7 +1196,13 @@ def GetDosages(self,
else:
# Normalize to be between 0 and 2
dosages = (unnorm_dosages-2*self.min_allele_length)/(self.max_allele_length-self.min_allele_length)
assert not (np.any(dosages>=2.1) or np.any(dosages<=-0.1))
if (np.any(dosages>=2.1) or np.any(dosages<=-0.1)):
error_msg = "{}:{} Error normalizing dosages: value >=2.1 or <=-0.1 detected".format(self.chrom, self.pos)
if strict:
raise ValueError(error_msg)
else:
common.WARNING(error_msg)
return np.array([np.nan]*self.GetNumSamples())
dosages = np.clip(dosages, 0, 2)
else:
dosages = unnorm_dosages
Expand Down
Loading