From e9f23ae476ddff85c0b9cf2405d8fbc16bbcb062 Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Fri, 30 Aug 2024 16:29:12 -0700 Subject: [PATCH] Add option to make annotaTR less strict on Beagle AP field checks --- trtools/annotaTR/annotaTR.py | 7 +++- trtools/annotaTR/tests/test_annotaTR.py | 12 +++++++ trtools/utils/tests/test_trharmonizer.py | 9 ++++- trtools/utils/tr_harmonizer.py | 44 ++++++++++++++++++------ 4 files changed, 59 insertions(+), 13 deletions(-) diff --git a/trtools/annotaTR/annotaTR.py b/trtools/annotaTR/annotaTR.py index a47eac76..4e9dd9c7 100644 --- a/trtools/annotaTR/annotaTR.py +++ b/trtools/annotaTR/annotaTR.py @@ -370,6 +370,11 @@ def getargs(): # pragma: no cover help="Compute genotype dosages. " "Optionally specify how. Options=%s"%[str(item) for item in trh.TRDosageTypes.__members__], type=str) + annot_group.add_argument( + "--warn-on-AP-error", + help="Output a warning but don't crash on error computing on AP field", + action="store_true" + ) annot_group.add_argument( "--ref-panel", help="Annotate Beagle-imputed VCF with TR metadata from the reference panel. " @@ -555,7 +560,7 @@ def main(args): minlen = trrecord.min_allele_length maxlen = trrecord.max_allele_length if dosage_type is not None: - dosages = trrecord.GetDosages(dosage_type) + dosages = trrecord.GetDosages(dosage_type, strict=(not args.warn_on_AP_error)) # Update record record.INFO["DSLEN"] = "{minlen},{maxlen}".format(minlen=minlen, maxlen=maxlen) record.set_format("TRDS", np.array(dosages)) diff --git a/trtools/annotaTR/tests/test_annotaTR.py b/trtools/annotaTR/tests/test_annotaTR.py index eda81650..675df652 100644 --- a/trtools/annotaTR/tests/test_annotaTR.py +++ b/trtools/annotaTR/tests/test_annotaTR.py @@ -25,6 +25,7 @@ def args(tmpdir): args.ignore_duplicates = False args.debug = False args.chunk_size = 1000 + args.warn_on_AP_error = False return args @pytest.fixture @@ -125,6 +126,17 @@ def test_DosageType(args, vcfdir): args.dosages = "beagleap_norm" retcode = main(args) assert retcode==1 + fname = os.path.join(vcfdir, "beagle", "1kg_snpstr_21_first_100k_second_50_STRs_imputed.vcf.gz") + args.vcf = fname + args.vcftype = "hipstr" + args.ref_panel = os.path.join(vcfdir, "beagle", "1kg_snpstr_21_first_100k_first_50_annotated.vcf.gz") + args.warn_on_AP_error = True # should't fail with missing AP + args.outtype = ["pgen","vcf"] + retcode = main(args) + assert retcode==0 + args.warn_on_AP_error = False # set back + with pytest.raises(ValueError): + main(args) # Beagle VCF fname = os.path.join(vcfdir, "beagle", "beagle_imputed_withap.vcf.gz") args.vcf = fname diff --git a/trtools/utils/tests/test_trharmonizer.py b/trtools/utils/tests/test_trharmonizer.py index dba2f29f..43d3c117 100644 --- a/trtools/utils/tests/test_trharmonizer.py +++ b/trtools/utils/tests/test_trharmonizer.py @@ -389,9 +389,13 @@ def test_TRRecord_GetGenotypes_Dosages(): diploid_record.FORMAT["AP1"] = np.array([[10, 0.5], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0]]) with pytest.raises(ValueError): rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap) + # If not in strict mode these should be nan + assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap, strict=False)).all() diploid_record.FORMAT["AP1"] = np.array([[-0.5, 0.5], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0]]) with pytest.raises(ValueError): rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap) + # If not in strict mode these should be nan + assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap, strict=False)).all() # Test triploid example where alt=[] triploid_record = get_triploid_record() @@ -421,7 +425,10 @@ def test_TRRecord_GetGenotypes_Dosages(): rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap) with pytest.raises(ValueError): rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap_norm) - + # If not in strict mode, should instead return NA + assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap, strict=False)).all() + assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap_norm, strict=False)).all() + # Test example with fewer alt_alleles than the max genotype index with pytest.raises(ValueError): trh.TRRecord(dummy_record, ref_allele, [], "CAG", "", None) diff --git a/trtools/utils/tr_harmonizer.py b/trtools/utils/tr_harmonizer.py index 5fe8c8f7..3ff9eea2 100644 --- a/trtools/utils/tr_harmonizer.py +++ b/trtools/utils/tr_harmonizer.py @@ -12,7 +12,7 @@ import numpy as np import trtools.utils.utils as utils - +import trtools.utils.common as common # List of supported VCF types # TODO: add Beagle @@ -1084,7 +1084,7 @@ def UniqueStringGenotypes(self) -> Set[int]: return set(self.UniqueStringGenotypeMapping().values()) def GetDosages(self, - dosagetype: TRDosageTypes = TRDosageTypes.bestguess) -> Optional[np.ndarray]: + dosagetype: TRDosageTypes = TRDosageTypes.bestguess, strict: bool = True) -> Optional[np.ndarray]: """ Get an array of genotype dosages for each sample. @@ -1105,6 +1105,9 @@ def GetDosages(self, ---------- dosagetype : Enum Which TRDosageType to compute. Default bestguess + strict : bool + If False, output a warning but do not die on errors validating AP field + If errors are encountered, return nan dosage values Returns ------- @@ -1115,10 +1118,14 @@ def GetDosages(self, if self.GetNumSamples() == 0: return None if (dosagetype in [TRDosageTypes.beagleap, TRDosageTypes.beagleap_norm]) and \ - (self.vcfrecord.format("AP1") is None or self.vcfrecord.format("AP2") is None): - raise ValueError( - "Requested Beagle dosages for record at {}:{} but AP1/AP2 fields not found.".format(self.chrom, self.pos) - ) + (("AP1" not in self.vcfrecord.FORMAT or "AP2" not in self.vcfrecord.FORMAT) or \ + (self.vcfrecord.format("AP1") is None or self.vcfrecord.format("AP2") is None)): + error_msg = "Requested Beagle dosages for record at {}:{} but AP1/AP2 fields not found.".format(self.chrom, self.pos) + if strict: + raise ValueError(error_msg) + else: + common.WARNING(error_msg) + return np.array([np.nan]*self.GetNumSamples()) if dosagetype in [TRDosageTypes.bestguess, TRDosageTypes.bestguess_norm]: # Get length gts and replace -1 (missing) and -2 (low ploidy) with 0 # But if normalizing set those to np.nan since unclear @@ -1141,10 +1148,19 @@ def GetDosages(self, # Check AP field. allow wiggle room for rounding if np.any(np.sum(ap1, axis=1) > 1.1) or np.any(np.sum(ap2, axis=1) > 1.1): - raise ValueError("AP1 or AP2 field summing to more than 1 detected") + error_msg = "{}:{} AP1 or AP2 field summing to more than 1 detected".format(self.chrom, self.pos) + if strict: + raise ValueError(error_msg) + else: + common.WARNING(error_msg) + return np.array([np.nan]*self.GetNumSamples()) if np.any(ap1 < 0) or np.any(ap2 < 0): - raise ValueError("Negative AP1 or AP2 fields detected") - + error_msg = "{}:{} Negative AP1 or AP2 fields detected".format(self.chrom, self.pos) + if strict: + raise ValueError("Negative AP1 or AP2 fields detected") + else: + common.WARNING(error_msg) + return np.array([np.nan]*self.GetNumSamples()) # Get haplotype dosages if len(self.alt_allele_lengths) > 0: max_alt_len = max(self.alt_allele_lengths) @@ -1167,12 +1183,18 @@ def GetDosages(self, else: # Normalize to be between 0 and 2 dosages = (unnorm_dosages-2*self.min_allele_length)/(self.max_allele_length-self.min_allele_length) - assert not (np.any(dosages>=2.1) or np.any(dosages<=-0.1)) + if (np.any(dosages>=2.1) or np.any(dosages<=-0.1)): + error_msg = "{}:{} Error normalizing dosages: value >=2.1 or <=-0.1 detected".format(self.chrom, self.pos) + if strict: + raise ValueError(error_msg) + else: + common.WARNING(error_msg) + return np.array([np.nan]*self.GetNumSamples()) dosages = np.clip(dosages, 0, 2) else: dosages = unnorm_dosages return dosages - + def GetLengthGenotypes(self) -> Optional[np.ndarray]: """ Get an array of length genotypes for each sample.