From e9f23ae476ddff85c0b9cf2405d8fbc16bbcb062 Mon Sep 17 00:00:00 2001
From: Melissa Gymrek <melissagymrek@Melissas-MacBook-Pro-2.local>
Date: Fri, 30 Aug 2024 16:29:12 -0700
Subject: [PATCH] Add option to make annotaTR less strict on Beagle AP field
 checks

---
 trtools/annotaTR/annotaTR.py             |  7 +++-
 trtools/annotaTR/tests/test_annotaTR.py  | 12 +++++++
 trtools/utils/tests/test_trharmonizer.py |  9 ++++-
 trtools/utils/tr_harmonizer.py           | 44 ++++++++++++++++++------
 4 files changed, 59 insertions(+), 13 deletions(-)

diff --git a/trtools/annotaTR/annotaTR.py b/trtools/annotaTR/annotaTR.py
index a47eac76..4e9dd9c7 100644
--- a/trtools/annotaTR/annotaTR.py
+++ b/trtools/annotaTR/annotaTR.py
@@ -370,6 +370,11 @@ def getargs(): # pragma: no cover
         help="Compute genotype dosages. " 
              "Optionally specify how. Options=%s"%[str(item) for item in trh.TRDosageTypes.__members__],
         type=str)
+    annot_group.add_argument(
+        "--warn-on-AP-error",
+        help="Output a warning but don't crash on error computing on AP field",
+        action="store_true"
+    )
     annot_group.add_argument(
         "--ref-panel", 
         help="Annotate Beagle-imputed VCF with TR metadata from the reference panel. "
@@ -555,7 +560,7 @@ def main(args):
         minlen = trrecord.min_allele_length
         maxlen = trrecord.max_allele_length
         if dosage_type is not None:
-            dosages = trrecord.GetDosages(dosage_type)
+            dosages = trrecord.GetDosages(dosage_type, strict=(not args.warn_on_AP_error))
             # Update record
             record.INFO["DSLEN"] = "{minlen},{maxlen}".format(minlen=minlen, maxlen=maxlen)
             record.set_format("TRDS", np.array(dosages))
diff --git a/trtools/annotaTR/tests/test_annotaTR.py b/trtools/annotaTR/tests/test_annotaTR.py
index eda81650..675df652 100644
--- a/trtools/annotaTR/tests/test_annotaTR.py
+++ b/trtools/annotaTR/tests/test_annotaTR.py
@@ -25,6 +25,7 @@ def args(tmpdir):
     args.ignore_duplicates = False
     args.debug = False
     args.chunk_size = 1000
+    args.warn_on_AP_error = False
     return args
 
 @pytest.fixture
@@ -125,6 +126,17 @@ def test_DosageType(args, vcfdir):
     args.dosages = "beagleap_norm"
     retcode = main(args)
     assert retcode==1
+    fname = os.path.join(vcfdir, "beagle", "1kg_snpstr_21_first_100k_second_50_STRs_imputed.vcf.gz")
+    args.vcf = fname
+    args.vcftype = "hipstr"
+    args.ref_panel = os.path.join(vcfdir, "beagle", "1kg_snpstr_21_first_100k_first_50_annotated.vcf.gz")
+    args.warn_on_AP_error = True # should't fail with missing AP
+    args.outtype = ["pgen","vcf"]
+    retcode = main(args)
+    assert retcode==0
+    args.warn_on_AP_error = False # set back
+    with pytest.raises(ValueError):
+        main(args)
     # Beagle VCF
     fname = os.path.join(vcfdir, "beagle", "beagle_imputed_withap.vcf.gz")
     args.vcf = fname
diff --git a/trtools/utils/tests/test_trharmonizer.py b/trtools/utils/tests/test_trharmonizer.py
index dba2f29f..43d3c117 100644
--- a/trtools/utils/tests/test_trharmonizer.py
+++ b/trtools/utils/tests/test_trharmonizer.py
@@ -389,9 +389,13 @@ def test_TRRecord_GetGenotypes_Dosages():
     diploid_record.FORMAT["AP1"] = np.array([[10, 0.5], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0]])
     with pytest.raises(ValueError):
         rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap)
+    # If not in strict mode these should be nan
+    assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap, strict=False)).all()
     diploid_record.FORMAT["AP1"] = np.array([[-0.5, 0.5], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0]])
     with pytest.raises(ValueError):
         rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap)
+    # If not in strict mode these should be nan
+    assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap, strict=False)).all()
 
     # Test triploid example where alt=[]
     triploid_record = get_triploid_record()
@@ -421,7 +425,10 @@ def test_TRRecord_GetGenotypes_Dosages():
         rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap)
     with pytest.raises(ValueError):
         rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap_norm)
-
+    # If not in strict mode, should instead return NA
+    assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap, strict=False)).all()
+    assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap_norm, strict=False)).all()
+    
     # Test example with fewer alt_alleles than the max genotype index
     with pytest.raises(ValueError):
         trh.TRRecord(dummy_record, ref_allele, [], "CAG", "", None)
diff --git a/trtools/utils/tr_harmonizer.py b/trtools/utils/tr_harmonizer.py
index 5fe8c8f7..3ff9eea2 100644
--- a/trtools/utils/tr_harmonizer.py
+++ b/trtools/utils/tr_harmonizer.py
@@ -12,7 +12,7 @@
 import numpy as np
 
 import trtools.utils.utils as utils
-
+import trtools.utils.common as common
 
 # List of supported VCF types
 # TODO: add Beagle
@@ -1084,7 +1084,7 @@ def UniqueStringGenotypes(self) -> Set[int]:
         return set(self.UniqueStringGenotypeMapping().values())
 
     def GetDosages(self, 
-            dosagetype: TRDosageTypes = TRDosageTypes.bestguess) -> Optional[np.ndarray]:
+            dosagetype: TRDosageTypes = TRDosageTypes.bestguess, strict: bool = True) -> Optional[np.ndarray]:
         """
         Get an array of genotype dosages for each sample.
 
@@ -1105,6 +1105,9 @@ def GetDosages(self,
         ----------
         dosagetype : Enum
             Which TRDosageType to compute. Default bestguess
+        strict : bool
+            If False, output a warning but do not die on errors validating AP field
+            If errors are encountered, return nan dosage values
  
         Returns
         -------
@@ -1115,10 +1118,14 @@ def GetDosages(self,
         if self.GetNumSamples() == 0:
             return None
         if (dosagetype in [TRDosageTypes.beagleap, TRDosageTypes.beagleap_norm]) and \
-            (self.vcfrecord.format("AP1") is None or self.vcfrecord.format("AP2") is None):
-                raise ValueError(
-                "Requested Beagle dosages for record at {}:{} but AP1/AP2 fields not found.".format(self.chrom, self.pos)
-                )
+            (("AP1" not in self.vcfrecord.FORMAT or "AP2" not in self.vcfrecord.FORMAT) or \
+            (self.vcfrecord.format("AP1") is None or self.vcfrecord.format("AP2") is None)):
+                error_msg = "Requested Beagle dosages for record at {}:{} but AP1/AP2 fields not found.".format(self.chrom, self.pos)
+                if strict:
+                    raise ValueError(error_msg)
+                else:
+                    common.WARNING(error_msg)
+                    return np.array([np.nan]*self.GetNumSamples())
         if dosagetype in [TRDosageTypes.bestguess, TRDosageTypes.bestguess_norm]:
             # Get length gts and replace -1 (missing) and -2 (low ploidy) with 0
             # But if normalizing set those to np.nan since unclear
@@ -1141,10 +1148,19 @@ def GetDosages(self,
 
             # Check AP field. allow wiggle room for rounding
             if np.any(np.sum(ap1, axis=1) > 1.1) or np.any(np.sum(ap2, axis=1) > 1.1):
-                raise ValueError("AP1 or AP2 field summing to more than 1 detected")
+                error_msg = "{}:{} AP1 or AP2 field summing to more than 1 detected".format(self.chrom, self.pos)
+                if strict:
+                    raise ValueError(error_msg)
+                else:
+                    common.WARNING(error_msg)
+                    return np.array([np.nan]*self.GetNumSamples())
             if np.any(ap1 < 0) or np.any(ap2 < 0):
-                raise ValueError("Negative AP1 or AP2 fields detected")
-
+                error_msg = "{}:{} Negative AP1 or AP2 fields detected".format(self.chrom, self.pos)
+                if strict:
+                    raise ValueError("Negative AP1 or AP2 fields detected")
+                else:
+                    common.WARNING(error_msg)
+                    return np.array([np.nan]*self.GetNumSamples())
             # Get haplotype dosages
             if len(self.alt_allele_lengths) > 0:
                 max_alt_len = max(self.alt_allele_lengths)
@@ -1167,12 +1183,18 @@ def GetDosages(self,
             else:
                 # Normalize to be between 0 and 2
                 dosages = (unnorm_dosages-2*self.min_allele_length)/(self.max_allele_length-self.min_allele_length)
-                assert not (np.any(dosages>=2.1) or np.any(dosages<=-0.1))
+                if (np.any(dosages>=2.1) or np.any(dosages<=-0.1)):
+                    error_msg = "{}:{} Error normalizing dosages: value >=2.1 or <=-0.1 detected".format(self.chrom, self.pos)
+                    if strict:
+                        raise ValueError(error_msg)
+                    else:
+                        common.WARNING(error_msg)
+                        return np.array([np.nan]*self.GetNumSamples())
                 dosages = np.clip(dosages, 0, 2)
         else:
             dosages = unnorm_dosages
         return dosages
-
+        
     def GetLengthGenotypes(self) -> Optional[np.ndarray]:
         """
         Get an array of length genotypes for each sample.