gymrek-lab · gymreklab · Sep 17, 2024 · Aug 30, 2024 · Aug 31, 2024 · Aug 31, 2024
diff --git a/trtools/annotaTR/README.rst b/trtools/annotaTR/README.rst
@@ -32,7 +32,8 @@ Required parameters:
 Other general parameters:
 
 * :code:`--vcftype <string>`: Which genotyping tool generated the input VCF. Default = :code:`auto`. Necessary if it cannot be automatically inferred. One of: :code:`gangstr`, :code:`advntr`, :code:`hipstr`, :code:`eh`, :code:`popstr`.
-* :code:`--outtype <string>`: Which output format to generate. Supported output types are :code:`vcf` or :code:`pgen`. If multiple types are listed (e.g. :code:`--outtype vcf pgen`), all specified output formats are generated. By default, only VCF output is generated at :code:`$outprefix.vcf`. If PGEN output is specified, the files :code:`$outprefix.pgen`, :code:`$outprefix.pvar`, and :code:`$outprefix.psam` are generated. See more on the `pgen format here <https://www.cog-genomics.org/plink/2.0/formats#pgen>`_
+* :code:`--outtype <string>`: Which output format to generate. Supported output types are :code:`vcf`, :code:`gzvcf`, or :code:`pgen`. If multiple types are listed (e.g. :code:`--outtype vcf pgen`), all specified output formats are generated. By default, only VCF output is generated at :code:`$outprefix.vcf`. You cannot output both VCF and gzipped VCF in the same command. If PGEN output is specified, the files :code:`$outprefix.pgen`, :code:`$outprefix.pvar`, and :code:`$outprefix.psam` are generated. See more on the `pgen format here <https://www.cog-genomics.org/plink/2.0/formats#pgen>`_
+* :code:`--region <str>`: Restrict to processing a specific genomic region. Syntax: chr:start-end.
 * :code:`--chunk-size <int>`: If writing PGEN output, load dosages in chunks of this many variants at a type. This can avoid out of memory errors if you are processing very large VCF files.
 
 In addition to specifying input and output options above, you must specify at least one annotation operation to perform. These are described below.
@@ -64,6 +65,10 @@ where the argument to the :code:`--dosages` option specifies the method to compu
 * :code:`beagpleap`: dosages for imputed calls are computed in a way that takes into account imputation uncertainty. This option requires the Beagle VCF FORMAT fields :code:`AP1` and :code:`AP2` to be present, which give the probability of each alternate allele on each of the two haplotypes of an individual. AP-based dosages are generated by computing a weighted sum of allele lengths for each haplotype and added together to get the total dosage for the genotype (:math:`\sum_{hap=1,2} \sum_{allele a} P(a_{hap})len(a)`). In the case of imputation with no uncertainty (all AP fields are either 0 or 1), dosages computed in this way should match best guess dosages.
 * :code:`beagleap_norm`: same as above, but scaled to be between 0 and 2.
 
+Additional dosage options:
+
+* :code:`--warn-on-AP-error`: Output a warning, rather than quit, if invalid AP fields are encountered (i.e. if AP values are not found, do not sum to 1 or are negative, or if invalid dosage values are encountered after normalizing).
+
 If annotating dosages, the following fields are added to VCF output files:
 
 * FORMAT field :code:`TRDS`. This is a float value giving the estimated TR dosage for each call.
@@ -101,6 +106,7 @@ Additional relevant options:
     * **rawalleles** means loci are matched on :code:`chrom:pos:ref:alt`
     * **trimmedalleles** means loci are matched on :code:`chrom:pos:ref:alt` but ref and alt alleles are trimmed to remove common prefixes/suffixes. The trimmedalleles option must be used if you merged samples in your target VCF file using :code:`bcftools merge`, since that tool will modify alleles to remove common sequence (see `this issue <https://github.com/samtools/bcftools/issues/726>`_)
 * :code:`--ignore-duplicates`: This flag outputs a warning if duplicate loci are detected in the reference. If this flag is not set and a duplicate locus is detected, the program quits.
+* :code:`--update-ref-alt`: Update the REF/ALT allele sequences from the reference panel. Fixes issue with alleles being chopped after bcftools merge. Use with caution as this assumes allele order is exactly the same between the refpanel and target VCF. Only works when matching on locus id.
 
 If generating a VCF output file, this command will output a new file containing only STRs, with the following fields added back depending on the genotyper used to generate the reference panel:
 

diff --git a/trtools/annotaTR/annotaTR.py b/trtools/annotaTR/annotaTR.py
@@ -34,6 +34,7 @@ class OutputFileTypes(enum.Enum):
     """Different supported output file types."""
     vcf = "vcf"
     pgen = "pgen"
+    gzvcf = "gzvcf"
     def __repr__(self):
         return '<{}.{}>'.format(self.__class__.__name__, self.name)
 
@@ -45,6 +46,50 @@ class RefMatchTypes(enum.Enum):
     def __repr__(self):
         return '<{}.{}>'.format(self.__class__.__name__, self.name)
 
+def CheckAlleleCompatibility(record_ref, record_alt, panel_ref, panel_alt):
+    """
+    Check if the REF and ALT alleles of the record and
+    reference panel are compatible.
+
+    In the case of running imputation with Beagle followed by
+    bcftools merge, allele order is maintained but sequences
+    themselves may be trimmed by bcftools. This causes problems
+    when harmonizing HipSTR records, since the START/END coords
+    are not updated accordingly. Using annotaTR option --update-ref-alt
+    can restore the original allele sequences from the refpanel.
+    This function provides basic checks to make sure the ref/alts
+    of the panel and target VCF are compatible. In particular:
+    - is the number of ALT alleles the same
+    - are all alleles are offset by the same number of bp
+    - are all the ALTs in the target VCF substrings of the refpanel ALTS.
+    If any of these fail then the alleles are deemed incompatible.
+
+    Parameters
+    ----------
+    record_ref : str
+       REF allele of the target VCF
+    record_alt : list of str
+       ALT alleles of the target VCF
+    panel_ref : str
+       REF allele of the ref panel
+    panel_alt : list of str
+       ALT alleles of the ref panel
+
+    Returns
+    -------
+    is_compatible : bool
+       True if all checks pass, otherwise False
+    """
+    if len(record_alt) != len(panel_alt):
+        return False
+    len_offset = len(panel_ref)-len(record_ref)
+    for i in range(len(panel_alt)):
+        if (len(panel_alt[i])-len(record_alt[i])) != len_offset:
+            return False
+        if record_alt[i].upper() not in panel_alt[i].upper():
+            return False
+    return True
+
 def UpdateVCFHeader(reader, command, vcftype, dosage_type=None, refreader=None):
     """
     Update the VCF header of the reader to include:
@@ -246,6 +291,7 @@ def LoadMetadataFromRefPanel(refreader, vcftype, match_on=RefMatchTypes.locid,
         The key depends on the match_on parameter (see above)
         Values is a Dict[str, str] with key=infofield and
         value=value of that info field in the reference panel
+        Also includes REF/ALT to check against alleles in imputed VCF
     variant_ct : int
         Total number of variants
 
@@ -277,6 +323,8 @@ def LoadMetadataFromRefPanel(refreader, vcftype, match_on=RefMatchTypes.locid,
                 raise ValueError(
                     "Error: duplicate locus detected in refpanel: {locus}".format(locus=locuskey)
                     )
+        locdata["REF"] = record.REF
+        locdata["ALT"] = record.ALT
         metadata[locuskey] = locdata
         variant_ct += 1
     return metadata, variant_ct
@@ -364,12 +412,18 @@ def getargs(): # pragma: no cover
     inout_group.add_argument("--out", help="Prefix for output files", type=str, required=True)
     inout_group.add_argument("--outtype", help="Options=%s"%[str(item) for item in OutputFileTypes.__members__],
         type=str, nargs="+", default=["vcf"])
+    inout_group.add_argument("--region", help="Restrict analysis to this region. Syntax: chr:start-end", type=str)
     annot_group = parser.add_argument_group("Annotations")
     annot_group.add_argument(
         "--dosages", 
         help="Compute genotype dosages. " 
              "Optionally specify how. Options=%s"%[str(item) for item in trh.TRDosageTypes.__members__],
         type=str)
+    annot_group.add_argument(
+        "--warn-on-AP-error",
+        help="Output a warning but don't crash on error computing on AP field",
+        action="store_true"
+    )
     annot_group.add_argument(
         "--ref-panel", 
         help="Annotate Beagle-imputed VCF with TR metadata from the reference panel. "
@@ -387,6 +441,12 @@ def getargs(): # pragma: no cover
         help="Output a warning but do not crash if duplicate loci in refpanel",
         action="store_true"
         )
+    annot_group.add_argument("--update-ref-alt", help="Update the REF/ALT allele sequences from the "
+                                                      "reference panel. Fixes issue with alleles being "
+                                                      "chopped after bcftools merge. Use with caution "
+                                                      "as this assumes allele order is exactly the same "
+                                                      "between the refpanel and target VCF. Only works when "
+                                                      "matching on locus id", action="store_true")
     other_group = parser.add_argument_group("Other options")
     other_group.add_argument(
         "--chunk-size",
@@ -416,6 +476,14 @@ def main(args):
     if args.ref_panel is not None and not os.path.exists(args.ref_panel):
         common.WARNING("Error: %s does not exist"%args.ref_panel)
         return 1
+    if args.match_refpanel_on != "locid" and args.update_ref_alt:
+        common.WARNING("Error: you cannot use --update-ref-alt unless "
+                       " --match-refpanel-on is set to locid")
+        return 1
+    if args.update_ref_alt and args.ref_panel is None:
+        common.WARNING("Error: --update-ref-alt only works with "
+                       " --ref-panel.")
+        return 1
 
     outtypes = set()
     for outtype in args.outtype:
@@ -425,6 +493,9 @@ def main(args):
         except KeyError:
             common.WARNING("Invalid output type")
             return 1
+    if OutputFileTypes.vcf in outtypes and OutputFileTypes.gzvcf in outtypes:
+        common.WARNING("Please specify only either vcf or gzvcf output, not both")
+        return 1
     if args.vcftype != 'auto':
         if args.vcftype not in trh.VcfTypes.__members__:
             common.WARNING("Invalid vcftype")
@@ -466,6 +537,8 @@ def main(args):
             common.WARNING("Error: reference panel annotation not "
                            "currently supported for popSTR")
             return 1
+        if args.region is not None:
+            refreader = refreader(args.region)
         try:
             match_on = RefMatchTypes[args.match_refpanel_on]
         except KeyError:
@@ -474,7 +547,9 @@ def main(args):
         refpanel_metadata, ref_variant_ct = LoadMetadataFromRefPanel(refreader, refpanel_vcftype, 
             match_on=match_on, ignore_duplicates=args.ignore_duplicates)
         if len(refpanel_metadata.keys()) == 0:
-            common.WARNING("Error: No TRs detected in reference panel. Was the right vcftype specified? Quitting")
+            common.WARNING("Error: No TRs detected in reference panel. Check: "
+                           "Was the right vcftype specified? "
+                           "Was an invalid region specified? Quitting")
             return 1
         common.MSG("Loaded " + str(ref_variant_ct) + " TR loci from ref panel",
             debug=True)
@@ -502,12 +577,20 @@ def main(args):
     # Update reader header, even if not writing VCF output
     # This is because we might add VCF fields for parsing
     # with TRHarmonizer along the way
+    # Note need to set up new refreader in case we set the region 
+    # above in which case refreader is an iterator
+    tmp_refreader = None
+    if args.ref_panel is not None:
+        tmp_refreader = utils.LoadSingleReader(args.ref_panel, lazy=True, samples=set())
     if not UpdateVCFHeader(reader, " ".join(sys.argv), vcftype,
-                        dosage_type=dosage_type, refreader=refreader):
+                        dosage_type=dosage_type, 
+                        refreader=tmp_refreader):
         common.WARNING("Error: problem initializing vcf header.")
         return 1
     if OutputFileTypes.vcf in outtypes:
         vcf_writer = cyvcf2.Writer(args.out+".vcf", reader)
+    if OutputFileTypes.gzvcf in outtypes:
+        vcf_writer = cyvcf2.Writer(args.out+".vcf.gz", reader, mode="wz")
 
     # variant_ct needed for pgen
     # If using a ref panel, assume we have same number
@@ -523,7 +606,10 @@ def main(args):
     ###### Process each record #######
     num_variants_processed_batch = 0
     num_variants_processed = 0
-    dosages_batch = np.empty((args.chunk_size, len(reader.samples)), dtype=np.float32)
+    num_samples = len(reader.samples)
+    dosages_batch = np.empty((args.chunk_size, num_samples), dtype=np.float32)
+    if args.region:
+        reader = reader(args.region)
     for record in reader:
         # If using refpanel, first add required fields
         # In that case, only process records in the refpanel
@@ -545,6 +631,16 @@ def main(args):
                 continue
             for infofield in INFOFIELDS[vcftype]:
                 record.INFO[infofield] = refpanel_metadata[locuskey][infofield]
+            if args.update_ref_alt:
+                # Update allele sequences to be exactly as in the
+                # reference panel. 
+                if not CheckAlleleCompatibility(record.REF, record.ALT,
+                    refpanel_metadata[locuskey]["REF"], refpanel_metadata[locuskey]["ALT"]):
+                    raise ValueError("--update-ref-alt set but the REF/ALT fields"
+                                     " at {chrom}:{pos} are incompatible between the"
+                                     " refpanel and target VCF".format(chrom=record.CHROM, pos=record.POS))
+                record.REF = refpanel_metadata[locuskey]["REF"]
+                record.ALT = refpanel_metadata[locuskey]["ALT"]
         try:
             trrecord = trh.HarmonizeRecord(vcfrecord=record, vcftype=vcftype)
         except:
@@ -554,16 +650,24 @@ def main(args):
             return 1
         minlen = trrecord.min_allele_length
         maxlen = trrecord.max_allele_length
+        # Add this check to warn us when bad things happen when parsing alleles
+        if minlen == maxlen and len(trrecord.ref_allele) < 5:
+            common.WARNING("Warning: Suspicious allele lengths found at "
+                "{chrom}:{pos}. If you imputed then used bcftools merge "
+                "and alleles were trimmed, consider using option "
+                "--update-ref-alt. Otherwise dosage values may be invalid. "
+                "Parsed alleles: ref={ref}, alt={alt}".format(chrom=record.CHROM, pos=record.POS, \
+                    ref=trrecord.ref_allele, alt=",".join(trrecord.alt_alleles)))
         if dosage_type is not None:
-            dosages = trrecord.GetDosages(dosage_type)
+            dosages = trrecord.GetDosages(dosage_type, strict=(not args.warn_on_AP_error))
             # Update record
             record.INFO["DSLEN"] = "{minlen},{maxlen}".format(minlen=minlen, maxlen=maxlen)
             record.set_format("TRDS", np.array(dosages))
             # Update batch
             dosages_batch[num_variants_processed_batch] = dosages
 
         # Write to VCF if using vcf output
-        if OutputFileTypes.vcf in outtypes:
+        if OutputFileTypes.vcf in outtypes or OutputFileTypes.gzvcf in outtypes:
             vcf_writer.write_record(record)
 
         # Write pvar if using pgen output
@@ -581,7 +685,7 @@ def main(args):
             if OutputFileTypes.pgen in outtypes:
                 pgen_writer.append_dosages_batch(dosages_batch[:num_variants_processed_batch])
             # Reset
-            dosages_batch = np.empty((args.chunk_size, len(reader.samples)), dtype=np.float32)
+            dosages_batch = np.empty((args.chunk_size, num_samples), dtype=np.float32)
             num_variants_processed_batch = 0
 
     ###### Cleanup #######
@@ -594,7 +698,7 @@ def main(args):
                 "with option --match-refpanel-on trimmedalleles or --match-refpanel-on locid.")
             return 1
         pvar_writer.close()
-    if OutputFileTypes.vcf in outtypes:
+    if OutputFileTypes.vcf in outtypes or OutputFileTypes.gzvcf in outtypes:
         vcf_writer.close()
     return 0