From 0d5d96b8699d043aa88dc62866ded0465e0ba713 Mon Sep 17 00:00:00 2001 From: Gymrek Lab Date: Tue, 17 Sep 2024 19:45:56 +0200 Subject: [PATCH] feat: Add options to facilitate debugging annotaTR runs on large files (#234) Co-authored-by: Melissa Gymrek Co-authored-by: Melissa Gymrek Co-authored-by: Arya Massarat <23412689+aryarm@users.noreply.github.com> --- test/cmdline_tests.sh | 2 + trtools/annotaTR/README.rst | 9 +- trtools/annotaTR/annotaTR.py | 128 ++++++++- trtools/annotaTR/tests/test_annotaTR.py | 245 +++++++++++++++++- .../sample_vcfs/beagle/beagle_badap.vcf.gz | Bin 0 -> 17577 bytes .../beagle/beagle_badap.vcf.gz.tbi | Bin 0 -> 196 bytes .../beagle/beagle_imputed_badalleles.vcf.gz | Bin 0 -> 825 bytes .../beagle_imputed_badalleles.vcf.gz.tbi | Bin 0 -> 154 bytes .../beagle/beagle_imputed_goodalleles.vcf.gz | Bin 0 -> 824 bytes .../beagle_imputed_goodalleles.vcf.gz.tbi | Bin 0 -> 154 bytes .../beagle/beagle_imputed_noTRs.vcf.gz | Bin 0 -> 787 bytes .../beagle/beagle_imputed_noTRs.vcf.gz.tbi | Bin 0 -> 139 bytes .../beagle/beagle_tinyrefpanel.vcf.gz | Bin 0 -> 3131 bytes .../beagle/beagle_tinyrefpanel.vcf.gz.tbi | Bin 0 -> 158 bytes trtools/utils/tests/test_trharmonizer.py | 9 +- trtools/utils/tr_harmonizer.py | 49 +++- 16 files changed, 414 insertions(+), 28 deletions(-) create mode 100644 trtools/testsupport/sample_vcfs/beagle/beagle_badap.vcf.gz create mode 100644 trtools/testsupport/sample_vcfs/beagle/beagle_badap.vcf.gz.tbi create mode 100644 trtools/testsupport/sample_vcfs/beagle/beagle_imputed_badalleles.vcf.gz create mode 100644 trtools/testsupport/sample_vcfs/beagle/beagle_imputed_badalleles.vcf.gz.tbi create mode 100644 trtools/testsupport/sample_vcfs/beagle/beagle_imputed_goodalleles.vcf.gz create mode 100644 trtools/testsupport/sample_vcfs/beagle/beagle_imputed_goodalleles.vcf.gz.tbi create mode 100644 trtools/testsupport/sample_vcfs/beagle/beagle_imputed_noTRs.vcf.gz create mode 100644 trtools/testsupport/sample_vcfs/beagle/beagle_imputed_noTRs.vcf.gz.tbi create mode 100644 trtools/testsupport/sample_vcfs/beagle/beagle_tinyrefpanel.vcf.gz create mode 100644 trtools/testsupport/sample_vcfs/beagle/beagle_tinyrefpanel.vcf.gz.tbi diff --git a/test/cmdline_tests.sh b/test/cmdline_tests.sh index 3679b9ab..b52a58e6 100755 --- a/test/cmdline_tests.sh +++ b/test/cmdline_tests.sh @@ -57,6 +57,8 @@ runcmd_pass "annotaTR --vcf ${BEAGLEDIR}/beagle_imputed_withap.vcf.gz --vcftype runcmd_fail "annotaTR --vcf ${EXDATADIR}/trio_chr21_gangstr.sorted.vcf.gz --out ${TMPDIR}/test" runcmd_fail "annotaTR --vcf ${EXDATADIR}/trio_chr21_gangstr.sorted.vcf.gz --dosages beagleap --outtype pgen --out ${TMPDIR}/test" runcmd_fail "annotaTR --vcf ${EXDATADIR}/trio_chr21_gangstr.sorted.vcf.gz --dosages beagleap_norm --outtype pgen --out ${TMPDIR}/test" +runcmd_fail "annotaTR --vcf ${BEAGLEDIR}/beagle_badap.vcf.gz --vcftype hipstr --ref-panel ${BEAGLEDIR}/beagle_refpanel.vcf.gz --match-refpanel-on rawalleles --dosages beagleap_norm --outtype pgen --out ${TMPDIR}/test" +runcmd_pass "annotaTR --vcf ${BEAGLEDIR}/beagle_badap.vcf.gz --vcftype hipstr --ref-panel ${BEAGLEDIR}/beagle_refpanel.vcf.gz --match-refpanel-on rawalleles --dosages beagleap_norm --outtype pgen --out ${TMPDIR}/test --warn-on-AP-error" # If file has SNPs+TRs but no refpanel, annotatr should fail runcmd_fail "annotaTR --vcf ${BEAGLEDIR}/beagle_imputed_withap.vcf.gz --vcftype hipstr --dosages beagleap --out ${TMPDIR}/test" diff --git a/trtools/annotaTR/README.rst b/trtools/annotaTR/README.rst index 1ac0ace7..b3163a92 100644 --- a/trtools/annotaTR/README.rst +++ b/trtools/annotaTR/README.rst @@ -33,7 +33,9 @@ Other general parameters: * :code:`--vcftype `: Which genotyping tool generated the input VCF. Default = :code:`auto`. Necessary if it cannot be automatically inferred. One of: :code:`gangstr`, :code:`advntr`, :code:`hipstr`, :code:`eh`, :code:`popstr`. * :code:`--outtype `: Which output format to generate. Supported output types are :code:`vcf` or :code:`pgen`. If multiple types are listed (e.g. :code:`--outtype vcf pgen`), all specified output formats are generated. By default, only VCF output is generated at :code:`$outprefix.vcf`. If PGEN output is specified, the files :code:`$outprefix.pgen`, :code:`$outprefix.pvar`, and :code:`$outprefix.psam` are generated. See more on the `pgen format here `_ -* :code:`--chunk-size `: If writing PGEN output, load dosages in chunks of this many variants at a type. This can avoid out of memory errors if you are processing very large VCF files. +* :code:`--vcf-outtype `: Type of VCF output to produce. This option is ignored unless using :code:`--outtype vcf`. Options: z=compressed VCF, v=uncompressed VCF, b=compressed BCF, u=uncompressed BCF, s=stdout +* :code:`--region `: Restrict to processing a specific genomic region. Syntax: chr:start-end. +* :code:`--chunk-size `: If writing PGEN output, load dosages in chunks of this many variants at a time. This can avoid out-of-memory errors if you are processing very large VCF files. In addition to specifying input and output options above, you must specify at least one annotation operation to perform. These are described below. @@ -64,6 +66,10 @@ where the argument to the :code:`--dosages` option specifies the method to compu * :code:`beagpleap`: dosages for imputed calls are computed in a way that takes into account imputation uncertainty. This option requires the Beagle VCF FORMAT fields :code:`AP1` and :code:`AP2` to be present, which give the probability of each alternate allele on each of the two haplotypes of an individual. AP-based dosages are generated by computing a weighted sum of allele lengths for each haplotype and added together to get the total dosage for the genotype (:math:`\sum_{hap=1,2} \sum_{allele a} P(a_{hap})len(a)`). In the case of imputation with no uncertainty (all AP fields are either 0 or 1), dosages computed in this way should match best guess dosages. * :code:`beagleap_norm`: same as above, but scaled to be between 0 and 2. +Additional dosage options: + +* :code:`--warn-on-AP-error`: Output a warning, rather than quit, if invalid AP fields are encountered (i.e. if AP values are not found, do not sum to 1 or are negative, or if invalid dosage values are encountered after normalizing). + If annotating dosages, the following fields are added to VCF output files: * FORMAT field :code:`TRDS`. This is a float value giving the estimated TR dosage for each call. @@ -101,6 +107,7 @@ Additional relevant options: * **rawalleles** means loci are matched on :code:`chrom:pos:ref:alt` * **trimmedalleles** means loci are matched on :code:`chrom:pos:ref:alt` but ref and alt alleles are trimmed to remove common prefixes/suffixes. The trimmedalleles option must be used if you merged samples in your target VCF file using :code:`bcftools merge`, since that tool will modify alleles to remove common sequence (see `this issue `_) * :code:`--ignore-duplicates`: This flag outputs a warning if duplicate loci are detected in the reference. If this flag is not set and a duplicate locus is detected, the program quits. +* :code:`--update-ref-alt`: Update the REF/ALT allele sequences from the reference panel. Fixes issue with alleles being chopped after bcftools merge. Use with caution as this assumes allele order is exactly the same between the refpanel and target VCF. Only works when matching on locus id. If generating a VCF output file, this command will output a new file containing only STRs, with the following fields added back depending on the genotyper used to generate the reference panel: diff --git a/trtools/annotaTR/annotaTR.py b/trtools/annotaTR/annotaTR.py index a47eac76..5b96ae65 100644 --- a/trtools/annotaTR/annotaTR.py +++ b/trtools/annotaTR/annotaTR.py @@ -45,6 +45,50 @@ class RefMatchTypes(enum.Enum): def __repr__(self): return '<{}.{}>'.format(self.__class__.__name__, self.name) +def CheckAlleleCompatibility(record_ref, record_alt, panel_ref, panel_alt): + """ + Check if the REF and ALT alleles of the record and + reference panel are compatible. + + In the case of running imputation with Beagle followed by + bcftools merge, allele order is maintained but sequences + themselves may be trimmed by bcftools. This causes problems + when harmonizing HipSTR records, since the START/END coords + are not updated accordingly. Using annotaTR option --update-ref-alt + can restore the original allele sequences from the refpanel. + This function provides basic checks to make sure the ref/alts + of the panel and target VCF are compatible. In particular: + - is the number of ALT alleles the same + - are all alleles offset by the same number of bp + - are all the ALTs in the target VCF substrings of the refpanel ALTS. + If any of these fail then the alleles are deemed incompatible. + + Parameters + ---------- + record_ref : str + REF allele of the target VCF + record_alt : list of str + ALT alleles of the target VCF + panel_ref : str + REF allele of the ref panel + panel_alt : list of str + ALT alleles of the ref panel + + Returns + ------- + is_compatible : bool + True if all checks pass, otherwise False + """ + if len(record_alt) != len(panel_alt): + return False + len_offset = len(panel_ref)-len(record_ref) + for i in range(len(panel_alt)): + if (len(panel_alt[i])-len(record_alt[i])) != len_offset: + return False + if record_alt[i].upper() not in panel_alt[i].upper(): + return False + return True + def UpdateVCFHeader(reader, command, vcftype, dosage_type=None, refreader=None): """ Update the VCF header of the reader to include: @@ -246,6 +290,7 @@ def LoadMetadataFromRefPanel(refreader, vcftype, match_on=RefMatchTypes.locid, The key depends on the match_on parameter (see above) Values is a Dict[str, str] with key=infofield and value=value of that info field in the reference panel + Also includes REF/ALT to check against alleles in imputed VCF variant_ct : int Total number of variants @@ -277,6 +322,8 @@ def LoadMetadataFromRefPanel(refreader, vcftype, match_on=RefMatchTypes.locid, raise ValueError( "Error: duplicate locus detected in refpanel: {locus}".format(locus=locuskey) ) + locdata["REF"] = record.REF + locdata["ALT"] = record.ALT metadata[locuskey] = locdata variant_ct += 1 return metadata, variant_ct @@ -364,12 +411,21 @@ def getargs(): # pragma: no cover inout_group.add_argument("--out", help="Prefix for output files", type=str, required=True) inout_group.add_argument("--outtype", help="Options=%s"%[str(item) for item in OutputFileTypes.__members__], type=str, nargs="+", default=["vcf"]) + inout_group.add_argument("--vcf-outtype", help="Type of VCF output to produce. " + "z=compressed VCF, v=uncompressed VCF, " + "b=compressed BCF, u=uncompressed BCF, s=stdout", type=str, default="v") + inout_group.add_argument("--region", help="Restrict analysis to this region. Syntax: chr:start-end", type=str) annot_group = parser.add_argument_group("Annotations") annot_group.add_argument( "--dosages", help="Compute genotype dosages. " "Optionally specify how. Options=%s"%[str(item) for item in trh.TRDosageTypes.__members__], type=str) + annot_group.add_argument( + "--warn-on-AP-error", + help="Output a warning but don't crash on error computing on AP field", + action="store_true" + ) annot_group.add_argument( "--ref-panel", help="Annotate Beagle-imputed VCF with TR metadata from the reference panel. " @@ -387,6 +443,12 @@ def getargs(): # pragma: no cover help="Output a warning but do not crash if duplicate loci in refpanel", action="store_true" ) + annot_group.add_argument("--update-ref-alt", help="Update the REF/ALT allele sequences from the " + "reference panel. Fixes issue with alleles being " + "chopped after bcftools merge. Use with caution " + "as this assumes allele order is exactly the same " + "between the refpanel and target VCF. Only works when " + "matching on locus id", action="store_true") other_group = parser.add_argument_group("Other options") other_group.add_argument( "--chunk-size", @@ -416,6 +478,14 @@ def main(args): if args.ref_panel is not None and not os.path.exists(args.ref_panel): common.WARNING("Error: %s does not exist"%args.ref_panel) return 1 + if args.match_refpanel_on != "locid" and args.update_ref_alt: + common.WARNING("Error: you cannot use --update-ref-alt unless " + " --match-refpanel-on is set to locid") + return 1 + if args.update_ref_alt and args.ref_panel is None: + common.WARNING("Error: --update-ref-alt only works with " + " --ref-panel.") + return 1 outtypes = set() for outtype in args.outtype: @@ -425,6 +495,9 @@ def main(args): except KeyError: common.WARNING("Invalid output type") return 1 + if args.vcf_outtype not in ["z","v","u","b","s"]: + common.WARNING("Invalid VCF output type specified: {vcf_outtype}".format(vcf_outtype=args.vcf_outtype)) + return 1 if args.vcftype != 'auto': if args.vcftype not in trh.VcfTypes.__members__: common.WARNING("Invalid vcftype") @@ -466,6 +539,8 @@ def main(args): common.WARNING("Error: reference panel annotation not " "currently supported for popSTR") return 1 + if args.region is not None: + refreader = refreader(args.region) try: match_on = RefMatchTypes[args.match_refpanel_on] except KeyError: @@ -474,7 +549,9 @@ def main(args): refpanel_metadata, ref_variant_ct = LoadMetadataFromRefPanel(refreader, refpanel_vcftype, match_on=match_on, ignore_duplicates=args.ignore_duplicates) if len(refpanel_metadata.keys()) == 0: - common.WARNING("Error: No TRs detected in reference panel. Was the right vcftype specified? Quitting") + common.WARNING("Error: No TRs detected in reference panel. Check: " + "Was the right vcftype specified? " + "Was an invalid region specified? Quitting") return 1 common.MSG("Loaded " + str(ref_variant_ct) + " TR loci from ref panel", debug=True) @@ -502,13 +579,29 @@ def main(args): # Update reader header, even if not writing VCF output # This is because we might add VCF fields for parsing # with TRHarmonizer along the way + # Note need to set up new refreader in case we set the region + # above in which case refreader is an iterator + tmp_refreader = None + if args.ref_panel is not None: + tmp_refreader = utils.LoadSingleReader(args.ref_panel, lazy=True, samples=set()) if not UpdateVCFHeader(reader, " ".join(sys.argv), vcftype, - dosage_type=dosage_type, refreader=refreader): + dosage_type=dosage_type, + refreader=tmp_refreader): common.WARNING("Error: problem initializing vcf header.") return 1 if OutputFileTypes.vcf in outtypes: - vcf_writer = cyvcf2.Writer(args.out+".vcf", reader) - + if args.vcf_outtype == "v": + vcf_writer = cyvcf2.Writer(args.out+".vcf", reader) + elif args.vcf_outtype == "z": + vcf_writer = cyvcf2.Writer(args.out+".vcf.gz", reader, mode="wz") + elif args.vcf_outtype == "b": + vcf_writer = cyvcf2.Writer(args.out+".bcf", reader, mode="wb") + elif args.vcf_outtype == "u": + vcf_writer = cyvcf2.Writer(args.out+".bcf", reader, mode="wbu") + elif args.vcf_outtype == "s": + vcf_writer = cyvcf2.Writer("-", reader) + else: + raise ValueError("Encountered invalid VCF output type") # variant_ct needed for pgen # If using a ref panel, assume we have same number # of TRs as the panel @@ -523,7 +616,10 @@ def main(args): ###### Process each record ####### num_variants_processed_batch = 0 num_variants_processed = 0 - dosages_batch = np.empty((args.chunk_size, len(reader.samples)), dtype=np.float32) + num_samples = len(reader.samples) + dosages_batch = np.empty((args.chunk_size, num_samples), dtype=np.float32) + if args.region: + reader = reader(args.region) for record in reader: # If using refpanel, first add required fields # In that case, only process records in the refpanel @@ -545,6 +641,16 @@ def main(args): continue for infofield in INFOFIELDS[vcftype]: record.INFO[infofield] = refpanel_metadata[locuskey][infofield] + if args.update_ref_alt: + # Update allele sequences to be exactly as in the + # reference panel. + if not CheckAlleleCompatibility(record.REF, record.ALT, + refpanel_metadata[locuskey]["REF"], refpanel_metadata[locuskey]["ALT"]): + raise ValueError("--update-ref-alt set but the REF/ALT fields" + " at {chrom}:{pos} are incompatible between the" + " refpanel and target VCF".format(chrom=record.CHROM, pos=record.POS)) + record.REF = refpanel_metadata[locuskey]["REF"] + record.ALT = refpanel_metadata[locuskey]["ALT"] try: trrecord = trh.HarmonizeRecord(vcfrecord=record, vcftype=vcftype) except: @@ -554,8 +660,16 @@ def main(args): return 1 minlen = trrecord.min_allele_length maxlen = trrecord.max_allele_length + # Add this check to warn us when bad things happen when parsing alleles + if minlen == maxlen and len(trrecord.ref_allele) < 5: + common.WARNING("Warning: Suspicious allele lengths found at " + "{chrom}:{pos}. If you imputed then used bcftools merge " + "and alleles were trimmed, consider using option " + "--update-ref-alt. Otherwise dosage values may be invalid. " + "Parsed alleles: ref={ref}, alt={alt}".format(chrom=record.CHROM, pos=record.POS, \ + ref=trrecord.ref_allele, alt=",".join(trrecord.alt_alleles))) if dosage_type is not None: - dosages = trrecord.GetDosages(dosage_type) + dosages = trrecord.GetDosages(dosage_type, strict=(not args.warn_on_AP_error)) # Update record record.INFO["DSLEN"] = "{minlen},{maxlen}".format(minlen=minlen, maxlen=maxlen) record.set_format("TRDS", np.array(dosages)) @@ -581,7 +695,7 @@ def main(args): if OutputFileTypes.pgen in outtypes: pgen_writer.append_dosages_batch(dosages_batch[:num_variants_processed_batch]) # Reset - dosages_batch = np.empty((args.chunk_size, len(reader.samples)), dtype=np.float32) + dosages_batch = np.empty((args.chunk_size, num_samples), dtype=np.float32) num_variants_processed_batch = 0 ###### Cleanup ####### diff --git a/trtools/annotaTR/tests/test_annotaTR.py b/trtools/annotaTR/tests/test_annotaTR.py index eda81650..e93294d6 100644 --- a/trtools/annotaTR/tests/test_annotaTR.py +++ b/trtools/annotaTR/tests/test_annotaTR.py @@ -1,9 +1,9 @@ -import argparse -import cyvcf2 -import gzip import os +import argparse import pytest +import numpy as np +from pgenlib import PgenReader from ..annotaTR import * from trtools.testsupport.utils import assert_same_vcf @@ -17,7 +17,10 @@ def args(tmpdir): args = argparse.ArgumentParser() args.vcf = None args.vcftype = "auto" + args.vcf_outtype = "v" + args.region = None args.out = str(tmpdir / "test") + args.update_ref_alt = False args.outtype = ["vcf"] args.dosages = None args.ref_panel = None @@ -25,6 +28,7 @@ def args(tmpdir): args.ignore_duplicates = False args.debug = False args.chunk_size = 1000 + args.warn_on_AP_error = False return args @pytest.fixture @@ -85,6 +89,36 @@ def test_OutTypes(args, vcfdir): args.outtype = ["pgen"] retcode = main(args) assert retcode==0 + args.vcf_outtype = "z" + retcode = main(args) + assert retcode==0 + args.vcf_outtype = "s" + retcode = main(args) + assert retcode==0 + args.vcf_outtype = "b" + retcode = main(args) + assert retcode==0 + args.vcf_outtype = "u" + retcode = main(args) + assert retcode==0 + args.vcf_outtype = "l" + retcode = main(args) + assert retcode==1 + # set back + args.vcf_outtype = "v" + # Should get pgen error if input VCF has fewer + # TRs than ref panel + args.vcf = os.path.join(vcfdir, "beagle", "beagle_imputed_noTRs.vcf.gz") + args.ref_panel = os.path.join(vcfdir, "beagle", "beagle_tinyrefpanel.vcf.gz") + args.match_refpanel_on = "locid" + args.dosages = "bestguess_norm" + args.outtype = ["pgen"] + args.out = "test" + retcode = main(args) + assert retcode==1 + for fname_ext in ("pgen", "pvar", "psam"): + if os.path.exists("test."+fname_ext): + os.remove("test."+fname_ext) # Check the pvar file can be ready by cyvcf2? #pvarfile = args.out + ".pvar" #test_reader = cyvcf2.VCF(pvarfile) @@ -108,6 +142,39 @@ def test_VCFType(args, vcfdir): retcode = main(args) assert retcode==0 +def test_UpdateRefAlt(args, vcfdir): + fname = os.path.join(vcfdir, "beagle", "beagle_imputed_withap.vcf.gz") + args.vcf = fname + args.vcftype = "hipstr" + args.ref_panel = os.path.join(vcfdir, "beagle", "beagle_refpanel.vcf.gz") + args.dosages = "beagleap" + args.update_ref_alt = True + # Won't work if matching on anything besides locid + args.match_refpanel_on = "rawalleles" + retcode = main(args) + assert retcode==1 + # Won't work with no ref panel + args.match_refpanel_on = "locid" + args.ref_panel = None + retcode = main(args) + assert retcode==1 + + # Try on good file with alleles that do match refpanel + args.match_refpanel_on = "locid" + fname = os.path.join(vcfdir, "beagle", "beagle_imputed_goodalleles.vcf.gz") + args.ref_panel = os.path.join(vcfdir, "beagle", "beagle_tinyrefpanel.vcf.gz") + args.vcf = fname + retcode = main(args) + assert retcode==0 + + # Try on dummy file with bad alleles that don't match refpanel + fname = os.path.join(vcfdir, "beagle", "beagle_imputed_badalleles.vcf.gz") + args.ref_panel = os.path.join(vcfdir, "beagle", "beagle_tinyrefpanel.vcf.gz") + args.vcf = fname + with pytest.raises(ValueError): + main(args) + + def test_DosageType(args, vcfdir): # Non-beagle VCF fname = os.path.join(vcfdir, "dumpSTR_vcfs", "trio_chr21_gangstr.sorted.vcf.gz") @@ -125,6 +192,17 @@ def test_DosageType(args, vcfdir): args.dosages = "beagleap_norm" retcode = main(args) assert retcode==1 + fname = os.path.join(vcfdir, "beagle", "1kg_snpstr_21_first_100k_second_50_STRs_imputed.vcf.gz") + args.vcf = fname + args.vcftype = "hipstr" + args.ref_panel = os.path.join(vcfdir, "beagle", "1kg_snpstr_21_first_100k_first_50_annotated.vcf.gz") + args.warn_on_AP_error = True # should't fail with missing AP + args.outtype = ["pgen","vcf"] + retcode = main(args) + assert retcode==0 + args.warn_on_AP_error = False # set back + with pytest.raises(ValueError): + main(args) # Beagle VCF fname = os.path.join(vcfdir, "beagle", "beagle_imputed_withap.vcf.gz") args.vcf = fname @@ -146,13 +224,52 @@ def test_DosageType(args, vcfdir): args.dosages = "beagleap_norm" retcode = main(args) assert retcode==0 + +def test_LoadRegion(args, vcfdir): + # Test good region + fname = os.path.join(vcfdir, "dumpSTR_vcfs", "trio_chr21_gangstr.sorted.vcf.gz") + args.vcf = fname + args.vcftype = "gangstr" + args.dosages = "bestguess" + args.region = "chr21:9489666-9546720" + retcode = main(args) + assert retcode==0 -def test_LoadRefpanel(args, vcfdir): + # Test region with refpanel fname = os.path.join(vcfdir, "beagle", "beagle_imputed_withap.vcf.gz") args.vcf = fname args.vcftype = "gangstr" args.ref_panel = os.path.join(vcfdir, "beagle", "beagle_refpanel.vcf.gz") retcode = main(args) + args.region = "chr21:14282813-14303433" + retcode = main(args) + assert retcode==0 + + # Region not in ref panel + # Fails because we find no TRs + args.region = "chr19:14282813-14303433" + retcode = main(args) + assert retcode==1 + + # Malformatted region + # Fails because we find no TRs + args.region = "XXXXX" + retcode = main(args) + assert retcode==1 + + # Note, cyvcf2 doesn't complain about malformatted regions + # and just will not return any intervals + # TODO: We might want to check regions here and in other tools + # like statstr and prancstr where users can set a region + +def test_LoadRefpanel(args, vcfdir): + fname = os.path.join(vcfdir, "beagle", "beagle_imputed_withap.vcf.gz") + args.vcf = fname + args.vcftype = "hipstr" + args.ref_panel = os.path.join(vcfdir, "beagle", "beagle_refpanel.vcf.gz") + args.match_refpanel_on = "rawalleles" + args.debug = True + retcode = main(args) assert retcode == 0 args.vcftype = "auto" retcode = main(args) @@ -163,7 +280,29 @@ def test_LoadRefpanel(args, vcfdir): args.match_refpanel_on = "locid" with pytest.raises(ValueError): main(args) + # Test on example where locid should work + args.vcf = os.path.join(vcfdir, "beagle", "1kg_snpstr_21_first_100k_second_50_STRs_imputed.vcf.gz") + args.ref_panel = os.path.join(vcfdir, "beagle", "1kg_snpstr_21_first_100k_first_50_annotated.vcf.gz") + args.match_refpanel_on = "locid" + args.vcftype = "hipstr" + retcode = main(args) + assert retcode == 0 + # Invalid match option + args.match_refpanel_on = "badoption" + with pytest.raises(ValueError): + GetLocusKey(None, match_on="bad match") + retcode = main(args) + assert retcode == 1 args.match_refpanel_on = "rawalleles" # set back for future tests + # Load mix of SNPs/TRs but no ref panel + fname = os.path.join(vcfdir, "beagle", "beagle_imputed_withap.vcf.gz") + args.vcf = fname + args.vcftype = "hipstr" + args.dosages = "bestguess" + args.ref_panel = None + args.debug = False + retcode = main(args) + assert retcode == 1 # Bad refpanel args.ref_panel = os.path.join(vcfdir, "missing_samples.txt") retcode = main(args) @@ -225,6 +364,60 @@ def test_TrimAlleles(): assert(new_ref == "TAAA") assert(new_alt[0] == ".") +def test_CheckAlleleCompatibility(): + # Alleles identical + panel_ref = "AAT" + panel_alt = ["AATAAT","AATAATAAT"] + record_ref = "AAT" + record_alt = ["AATAAT","AATAATAAT"] + assert CheckAlleleCompatibility(record_ref, record_alt, panel_ref, panel_alt) + + # Alleles in target subset of those in panel + panel_ref = "AATAAT" + panel_alt = ["AATAATAAT","AATAATAATAAT"] + record_ref = "AAT" + record_alt = ["AATAAT","AATAATAAT"] + assert CheckAlleleCompatibility(record_ref, record_alt, panel_ref, panel_alt) + + panel_ref = "AATAAG" + panel_alt = ["AATAATAAG","AATAATAATAAG"] + record_ref = "AAG" + record_alt = ["AATAAG","AATAATAAG"] + assert CheckAlleleCompatibility(record_ref, record_alt, panel_ref, panel_alt) + + panel_ref = "AAGAAT" + panel_alt = ["AAGAATAAT","AATAATAATAAT"] + record_ref = "AAG" + record_alt = ["AAGAAT","AATAATAAT"] + assert CheckAlleleCompatibility(record_ref, record_alt, panel_ref, panel_alt) + + # Different numbers of alleles + panel_ref = "AAGAAT" + panel_alt = ["AAGAATAAT"] + record_ref = "AAG" + record_alt = ["AAGAAT","AATAATAAT"] + assert not CheckAlleleCompatibility(record_ref, record_alt, panel_ref, panel_alt) + + panel_ref = "AAGAAT" + panel_alt = ["AAGAATAAT","AATAATAATAAT"] + record_ref = "AAG" + record_alt = ["AAGAAT"] + assert not CheckAlleleCompatibility(record_ref, record_alt, panel_ref, panel_alt) + + # Alleles in target not a subset of refpanel + panel_ref = "AATAAT" + panel_alt = ["AATAAGAAT","AATAATAATAAT"] + record_ref = "AAT" + record_alt = ["AATAAT","AATAATAAT"] + assert not CheckAlleleCompatibility(record_ref, record_alt, panel_ref, panel_alt) + + # Offsets not the same + panel_ref = "AATAAT" + panel_alt = ["AATAATAAT","AATAATAATAATAAT"] + record_ref = "AAT" + record_alt = ["AATAAT","AATAATAAT"] + assert not CheckAlleleCompatibility(record_ref, record_alt, panel_ref, panel_alt) + """ These tests run annotaTR and compare its output to output that has been generated by a previous version of @@ -237,7 +430,7 @@ def test_TrimAlleles(): it should prevent any unexpected changes in output. If you've reviewed the change in output and find it acceptable, use trtools/testsupport/sample_vcfs/annotaTR_vcfs/create_test_files.sh -to regenerate the test files with the new version of mergeSTR. +to regenerate the test files with the new version of annotaTR. """ def test_OutputFilesSame(args, vcfdir, antrvcfdir): @@ -271,3 +464,45 @@ def test_OutputFilesSame(args, vcfdir, antrvcfdir): args.match_refpanel_on = "trimmedalleles" assert main(args) == 0 assert_same_vcf(args.out + ".vcf", antrvcfdir + "/beagleap_trimmed.vcf", max_lines_to_compare=200) + +def test_PGENOutputSame(args, vcfdir): + # what are the expected dosages? shape: num_variants x num_samples + exp_dosages = np.array([ + [-9, -9, -9], + [1.333313, 1.333313, 1], + [0, 0, 0], + [1.5, 1.5, 1.5], + [1.083313, 1, 0.916687], + [1.25, 1.5, 1], + [1.5, 1.5, 1.375], + [1, 1.333313, 1.333313], + [1.2000122, 1.2000122, 1.4000244], + [2, 2, 2], + [0.75, 1, 1.25], + [1, 1, 1], + [2, 2, 2], + [0.875, 1.15625, 1.2625122], + [0.833313, 1.166687, 0.666687], + [0.7999878, 0.5999756, 0.7999878], + [1, 1, 1], + [2, 1, 2], + ], dtype=np.float32) + + args.vcf = os.path.join(vcfdir, "beagle", "beagle_badap.vcf.gz") + args.ref_panel = os.path.join(vcfdir, "beagle", "beagle_refpanel.vcf.gz") + args.vcftype = "hipstr" + args.dosages = "beagleap_norm" + args.match_refpanel_on = "rawalleles" + args.outtype = ["pgen"] + args.warn_on_AP_error = True + assert main(args) == 0 + pgen = PgenReader(bytes(args.out + ".pgen", "utf8")) + num_vars = pgen.get_variant_ct() + dosages = np.empty((num_vars, pgen.get_raw_sample_ct()), dtype=np.float32) + pgen.read_dosages_list(np.arange(num_vars, dtype=np.uint32), dosages) + np.testing.assert_allclose(dosages, exp_dosages) + + for fname_ext in ("pgen", "pvar", "psam"): + if os.path.exists("test."+fname_ext): + os.remove("test."+fname_ext) + diff --git a/trtools/testsupport/sample_vcfs/beagle/beagle_badap.vcf.gz b/trtools/testsupport/sample_vcfs/beagle/beagle_badap.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..f1778feae34de78c92bfce5f3e655f0538dd2e9c GIT binary patch literal 17577 zcmZsiRa6{27p`%4cP;L20~B{?ao1AZZ3cIDcP*~P-3!ItT?e;8il6VlJ?G>qH(6Op z-u33m-r1BfNN`a9Yo7>El2R~Gt}uD2CQdn=cz+7U%G*&m&a-JuFZCO_#!!mlQUhr9 zOv;#DcM-aSR2&wnUhi=ewb9R)E7e+Sd;*^S_5ZfH(v26PMy+~3{|@xB?GRl+V_0|) zyc1W02~g0kmI!1{uUS-f?o+!X^JHCN#81^^QhqV<{t2sCA-mJErSUrW=v`JRO=ofc zYVgY@KK&p`y|d1PVq+PT`BiiwcXCLE#|nUn0I6CnzRw$YDUuEt*=U=KP!AJkK%#I2aFC@u$+bT)HqZu(wL$=7U(vm882Ej6Z!x2VQbOv1U?8q&d|PK#GXr4f-xr9(;a)Kv!z?fQ)n z#Q0B;ie$WFr(vV}HXH$&r!q4-F&l&sKc)0lyKzwW=2u8O{0ghXtP8zSx!A^t2%B^r zzZcqd{r4)8Gu>^MGOg)WhWI0oYT05&MzuF@Oa|f~rx7tKS>MCPj`eK$o13rp^U^+E z;3V=|@Rz<;;gOM&SpkpHFB86juc-)1rsGV|mc>whSF|IHP;Uj?snYy?3sCPQ$Ug4R zk;D=yL^oX8Y~N-ee={L-%hx+z|9n&)SSoec*nTcF>iOwmPr5#8HMCA-f0Z@e$v+L} z!0vYP?Z0dvmS-$4DbZp zaSLO`KOMMGl$%ksnn0fl-ooAV(+0W_gUyJK*#J<;{j{EdeR>MnT_&OYdBKDUPRt4Z zZI|}YyFewz18(DR%Xa%F9>m@6t^h?gPi_M^V;M??re|KL_<=oOxmP+=w`9DIb?(Rn zr-ML2dRkt4Tm)@A=RSooK}I+$z<9)_-CjxLYhbe*J0@V-GIs=&ng5Iomc;yv$1}(+ zVvtp-@Dn8wWo%oCZbHy##3pl_S{AJ&xkYmEmYaR(Xo?e;D>l0D0688as5S(c)ZkFS zW5PGJQIM5aTgs&bX1l>AoH9S(cYN8N7Twb_(##sM%ni5f8saQHxKBF-(v^z(hez)5 z(8n#bqiKw)+AjStTg*n|(sPV0GXRo0STEU|p&1OJbRt?6!Dr@iQX&l9$6r3oMnj6@#D8T;Hdmv=pOce*H zQWObLQa|Q{lGI(gHAg0Rjl-4H!pouuLj|V~EHhuZcx^ldr`Xzw?y5erQ254Bb%#xv zD+iSvK$wYr7xZwGPzQkN3^R)3DQp!%znGF79@B7j>h$H<&@a$T30?y8Q6xiy@gEi58E;rbb>6BSC-HWd#2#(F~Yy}`1szvhU|&p;{N z0pQ_Jz!rxsi!L4W*eJ%_Yy&K?Zv4+uNbqM)tC zsml;4%x$M*l%atNWE^7zB4&S3Pjf@z$?CpZAeB$#52tM4SuWjj&q!j*jC~y%peiWmq5Ly4;#Hvqzfx$ z3&U#GFIGR@fY)G2W@VDwJwmCcd;ZG`4NUVlQEy5Jzt2)<;*-%&veVWkZL`l&-fVk4 zIzd*gK3-Tff>SD$N^4EyrAdyBPD82ZJ&nl(#^s6`#GRpYD9<)HXfjXo$GQ7*D^n$y zu?2k%87W>Fd-_JHV!97pjz>V-w~;JFq?dd;i3p7%=Wy{kDLVja?tMSIG*V5S+G}jd zQ1vCNzk4VP-4aD)vbru75`cB34R1*m#o|1wIcT1ou+16%hP_!mZ7K^O){94M_+n9e)BMV6ZI7Sd0;@~&PmigXe%4+*x^RZsU!Uhq?DqEL z60Ns&Ixr`ON6^5#ftNmkCZy|h~53p#ZKgID9&XdjH8j?@CC4g(v?T>~XB&Tt=lsBrbBl zh6~{G%55SyPi%S%JtvD!DE~r`cig3$*@7=7)w47Rsy&&!?r>XAO|NQAqK3SA2Kqwt z=2%1i0Kj$tm|V!QbS*=gv>nk3X`P`jj3IEvW))dNFK*71EFENx^AAy1hssYrxWpe8 z+*TvLD?ulA^;r*t0+^4|hI>|ol8q`kG9C}t%FA>!7e%DyV_rD!o7s$I?- z-0B)W^9X{k(wTp9SPv>!K5>p;C=vd&iIMu+jnib{nDscLsO9a+Fa`e0OaWa9*x zdM@_Szf#ljqaD=u$jS)^-#r22pbb)d<0YkvGrnexB^D}jwf}mx`HB1& z(hxo7$;J}NG$cUvP0d!kI~fOvs`eM3NJX`qN%D$SvS!8W$$ME_s%8~}5h`QtkvaPUsVM}mREi=L{~y4xEF)m(Xrgk#@_P8V za>{Mt)b;tL{bBOxyz*tT@#$a2wF%{3+rPB>guJo^OIW)Yw)4sk;fF;9HD~^=k3rnr zraDSl2Em-%-R^v`fsWaSV9fxiCkp>y9^Iq|*k?E^#_oK${s$?@i%Smz@HM@z?GS(0 zpT&>^rpMn0?;!0$pE7UvCAe+J#J`n}J?`I!TAjEB@UE8uYR>GE!P$`4dy*n{IKkk98tODj}@&a$k^qNXc#*vSD$zqAh72 z{%h2$f{{-~ewxJM;_bAQ<8I@jgWbGL_Df<|QxxIh`Kn^-{L-F=Ub`{V0`*6b2P8$ikYk5-mkO^l3-}oxOfidm7>ZGDb z?=kR|Nl+YLec~;!E%c8RekFeQm#AR(Cws=1M#JyzUZ0xoIw9r@OISl{noi=2+OJWi z?z$9sqUA>4bJDO<{|Smmb@ySlyu@tVE|Zre_`Ye{t{e^pLgCT5`6nMOzvaUv9F!j3 zCE`4J_SmR9_5FHeakrnk!dQr{6=GUjBKK5AXgV*dKO#gwO|6nX--GUlwJKZ<L{|+&vXo+PTNN|p->}agjE)?)9t@>@kxo1oldH}4mC!Dk~Ot;A%vD6js zN)u;3w7Za;^^Ab8nL{xPi(nh|QBNDsile6Z7<@Y0hP8lXPAE z(@y>}nsRkMnEb0)E&G1Uz69|$x@|As2U`Qq&v>K#*Ux8nCQkrNvnF7)|5=-2IaBt=M{p#UCOo;?daDoGu9amuYnSIZRQJ9|gVI1* z9voZYd)3RzA5Ufs*;%W)1cHe-7WZBAZ1N_DE~uEo-*}@erB7sLT1Bj|pp=IiO$phC zrP-!@bgxzwq`xYhcSrR}sRAArB?l6$9pL`?craD4^2l3f&^7PW0{7HH_hQaVZ=c0; zrm7pwQ-uSgU?$yuR{ov##j z-4E^L6If7Be;% z>mClgrUvgZ_&lh2!?Xhl2F-F>s8dG^!@u#zWNO_eOT^$vag;Ps zJk=IDMV`)OMl$g&F-$=N|HFd`pPrb<*w(rK4QO? zXt?@W?t-;^0{nva=9dIIC=Q+bqeUNnX)o8B(F4~9vYrQ0_{o3r;qf-B9Yo1yG)_a( z^FG)iOhSH44le?GxnwvNaNSplpHM5C!^l7Ke@a1zQJxuc4tar18egSnQG2=lDWvs!vy*ZKcVWgIG@ncH%2cv zAqqy6uz|h;QpCqq=k|1_-45F`5TVwY4^h#h4yBiqER=+acI6>%)_Uz}A9XU0RDd(A+jVs# zT*;-qsTpkAwv(A;jUN?OJ#g2H2{lf4XoG?&Ell?fZe_VAe`E7qj(l6yaqlP*`REd$ z!c$`em<##)hqW+2qPP1;an!riFJ~90hJ#o7U-;M`e7Y@=tcxf_i~sq=@`>;(0VEsS zy#!aJ#Xr0ITt9_WUaiEs9>*TkuHO!Y3e5aoWM`h(C;@w0&mArg9>pR{LgQY1_M*%T zf*Wisp!eD8Rs^tXThVOQ0Cr;DZY-2%Fpn8?^>Ui3^T=d1d)blqRaIHh^4h3%%Y1lR zmG-WlRtawId~YYc_?^n-n!J>M!G(xpzEK+cGxgHWoIpXli)CQB*PFbJUEe{9ydob% z^rZ}61R88jZ=l7NQQ?TGewfWA&54jHHv7_`i6_k`CzR@+J<{V8-HZB^RmM_F7-2bB?V4Cb_pQq~nOZ=8)eJ46 ztJbvKq?IF+1Ob6^aZ-3x1d!OC_a^%S9#r{u{IA&!8WrTc?$G&-7zw881Ix>9nauCr z!z6X18w58rb3OF0&WasA?>e-q>}0ujf(a1UtTHo-x&qQx@nIbELEu2Th-cjA3x+d6 zrM^(;(Me-E&EYy8>Q(_mP3y4cwc8X|;z>cgDR^*?IHGm9?= z4qrHYI`*`+ex80mJMEP1d<;9gczGR1?z8hX5=jcildqwB78Y=@7@9uE=UrUol3G3S~o4;L!(a$MYf_M(D!S-L$(;|89b#uuuQm z_T(00*9&Khzj+)!p5$z7=P7z+XgP)x?S`B4aHn?xboUxmpIh-Wz*ichRnnljtSu`Y z-UXh2_=q+&v04^Tsw3F2GjN&Y52_&fG$Z=7FL~%`m7F4EZr4}?n$3dw>7GtZ&_EgT`sVcD|K5Pp zlY5iGgvbp)A~69&p22p)>=#Rp8RstGxJlNKjVvBro}e#*!e7cUDt+wzW!p4<7(Vql zT-@UKtGI_qjF7+DeZBE6i!P8}RO}mye56%7`rBOQYM5@RC$-8xbZ`^rja`)7+b!{d zf7NyIr*rvW`n6M6H8y^9)dxEzg(KT_e|v*j?~!KwzZri88p;?QVcIfMj=EXcPp>v6 zOJgyN4Sexn=e1o!C9WUcvJ5gBn2DX;_>w~0F=$6nB^_6o@aI;0bl5=8$^oC?!&(P+ zq6S|aU;j^!3}fThlEod~<>JOA5++G+I_obbLp+*Zro#Vxzt%U}9_3uJ(s< z4tGMuobw}#Yqnd9;zU@7%zkbDS&@6Up)|Q#vyeUHmi~vcOj1xb=?ncaQn%q=ODChO zG$mL2_;CNK2p|IcUr4Z!+Cp*Vep_)X~h4i#fX zo>;)-TimF!-_G$JPhLHx6%3`jI7Fp*@JE%zas~#GFfTy#BuAQ6d~0}1w)46PlH>Py zhnuiQea@F?p@P&f%F*EC{KA&Do22SIa|#$BJ|r3PXZMtxVQaTKt0-R7%R6oL(5rj` zz;+G*SCLj;bJs3u==&1RUh{e6C{KSS%;M7fFmC$>s|9SUt`vxy2aGlrbUj@ki)w2f zkk?XcgJRc_^f9%ILg^md{8ohBFQE3*0IR- z7Orm8DZjR{wJv06R{o3Lfm`n006p{uAfy8Lq$;g?2kq5480-n;Ba{Rm#8Ixa=w6sS zrFRmY($|0bK)34cas-2)f>sezSq4)T^rJakIl}oZ{2cPx+Q>q9UK1A&-I>3SaN-6J zP!e*mLC``cli6fy8es3*5qrVhh?WLfE$f4a;nRokiHR%@fo|#3ZOta1Muv4hUm zJl4)@LrRds#)8QvS97~2v!T|x32V?3DufHwK?bRZU|nj;CPH;zT(%j^al_205J_%q z00!3-Zz>qHqHBNE^uTs~c~+*W{3^mKhPa#6#J3hNCsIh2`bP1PA96;ZnC@=k*T>##vb=4uqq=bapj}`5ER48q$8D5H=X^{m zR3(rg!*u?~u4RG;>epzrVdEJg=15(@Alr%m{Q$WdzhB$xYe#lit|6yj+BO0%HV1^c zF&*Sb=_rE@9ErEu6sa_BPDa2#>)8szy9sKBx>vnR*+*4iTJ-VymcksMsF=JhM5eyF zQSU5L%p-TBv3k2qa(-PRLqxT`|m$m)` z^S_xMj2+9`zNh0in~q7Rdd&cyo*vigcP~)wU4%};TlS{A#r0HjSNw^X5#B<^{9h;H zTR&Qd^L3urs=CTeA&>9TT0O%R={Ib0|C_pIIV4og6!@nH~kuci>*tk zNPya#d!1EVJm25Hs7fIeS7p06MTLOr{}d!3Xp&yKL!&9-A&5EQ$~V6)R{s0~`t%b8 zy^FJ%e9W|cn$)4!gL^+WIlyXhUU>m`Sr2Y)J08OKbvwWOuH;`gz_M-c177bZYvKbd zM_t!1-Vl!6ftemFM9|rsS_TyHdz9GHLDtl#1BEYdzVze!Q3=@ZGfN5l+xCO_B_KLu zqMouOBmO-=hz54y70kbWHqo-exnEsWT_^KVfunPGFyCnsE5^PB8<9joIA`Zy;L+o` z69#ptpJ!OLYJS)}@;lY1-^HK5R8%smPZo1Usba&#F?4iu21X0f4hv$jR?3+0PF+Xe znjq!gw%GO7Fyu#u{MuXbFXy!n+oFs8{Vw6!;%)Q2pl?CIA~68rplc>Ty?0Bi=$omb zrE`l3s}n2H-<9m8W{16p*$(Z$alsKBjH%1Vy}wrAH>qceXap%FC)VT5{t7}($v-u@6R%|Toj9m9}9`MPaS-~VYBa1yE zL^hhaSdrXWNqGuZdl(Hk74cX4kJ$uF&wER6MSPV4{M*j*4=BYgUV^yZso*KG%4hUX zCPrtm;jo%{u^(XMgAJDB-FRQX$ql+OB%1w_CUT|&bD^(nv~3Cp8Gn%q3vE3K8a3LaRtm(@_VH;MWe24KfBvtH+~Eh&zf2Htb^ zp}859+RYc0xtFp94s(rux8$q4`>S*#dBin>r=hJ-n!$~|iMC{1W(o+=!NqYLn9=PH zRHQB=W3XI6&9JIVZACjPbA7iP0_mdnjvpel!;_d2OrfO&3ih`Jd_0dn4bHV}lbEg& zcn9vCzW!a(<_fm`7Cc)$KnEyhKuxoHJu>Mx<2-{Otyqbs-IUa|B}*P-kF6W%bgVt7 zfl?CyTncM1$*1T&kYI{)st-q^h*OylM+uPAirp$(XqOy_{`AJl`%JZ(O0 z`yEzAWJOZ;`#tcM&EOsX0jXIYuY3su(KRdo$gr6;!* zJsNU1A>-)3goC?Opn}ZcF_&n4oLyr7+b1EmA1uXH+{t0w!M1(#pVqYh9 ziDDiyqK>=7t|^g5m~Q<)@cH3H0;J^JFTWjuohf>?bY7j~$ek}$g#!-{xD~F-8t|Ew zD62KDxm+-S{c+l>yY5$Yukvv13mY=+{Gn=}&!SVY@Sdv@=@+58=Rdz=ET*5_6H1ta zYF!Z7G&Tg)(ymTj9sJ0tFenD~Lliy7+5V06<{K%yXr6Y?VVW^X#L8%?P?zMRj zkqFv4!Tqh)eH*vr<+SqGwUh@?ev9LtUf0=`L=}GyR?)!2sH23WXhIs;tGoW|ols#I zIPm2q>PB)1^}Pd)rC+K7u$c{Bj+4R}<5%5?UTU?|W;0t3Oe1u5GvA78wIZVW$`%i8 z7%+yT@V_xD@dP|nCv=}w*6sN^J-ESKIvH|?1l_65H9U}AGyiD< zkG3E#K#0KngNCiYj8(KnbP35qxuvoa!m_Ho#m<+86|PeE1clPnE8B_9_*yhW0sQ)W z?P!m1X!&3;$#%TOEf_RW9FG$(blf%0T+in>Da`UjhqwDIE4AZftDw6m zxikY~vVHH;0*+RFIv2;Ab;=rI+w=Eh>4$W#FPxuW5;t_72CmvD7N4^5>gM2>41Euw z-87(gGZw8IBWxFk=NMc-S9=1NPPZW-RcFv_lAZ|GQzY@ACgh&B)WNx;x=iH`NA?U_ z&F1Mi>CGDquEYoPVEI%5)O0cs^p_*kZ(8j5iRDX-fFurWcX-aSzvlP=eJl0J*4#2{9>k!&bMLRaQhqaT1&e@IeHa}B-5O+J;uUW|7k^xk2YI8}RP{y%JVUi!7XtYHWq<7J z)LANOLyyY3{br-xVXmdhK`UCe2^Vmx%*Ckey0Zoy27t>vlZt(A)9qbC!>zA4M_}J3 zR|D4W3|HzW8`b{6$GQY(G|Bv=bX9A^UmXcD?EU_$EKX)w$b-i%HeVI zLG72v?c4jX3xRC7_TX}rD^1=A4HCDg>fXXoN8G0~Ka+1In5%vPZzK)7uKjjxD8B!G zTmO45D%5wzQ503t;9c5(HG$bSy9G~>;Ixp9zj-`(nKXX$O(v#CZ;L`;#bs}$_=pD^ zGs<(ZRZq8KDzhAN7=?HUf$?3_=DYIgU%-h*@uYg6UX&zVIup6wp3;}2`DBVIi+(URyF25b~oNLmTkJ^_xh?v7nYP|ccy95MLIraP@qW&Wu;I*P1v2d*k z1CCelvILUw67LcW-uMO@+$f#6M8mSKTypk3t8$o_JyUlBCe&QoefMw%(48R~e_0cn zU{xvG!;TOsc8@{5h6k_Ej@xOJ<*%qHl$b2g#=o++(ZV4bJrt&$=AVR7G_i_oH>9Uv+LJ%~W8u&;~l>A#p z29hi2=0rPoE;kSnr;%m> zR!Yfv%@eyNblFb1@M_(J3`aX2d)$w>_F9&b7BBj$#_;{-C$!+PNV$7t!3iC{i6inq zpIY`1qKyv9Ipvk(`crj7ww^h6!184`00)J9-G>Y;>LQ{gUQL6>HQERV zOBaF(Lp5if0aPDUhe*%Ig{l4z^$)`44PUDd^}>FmqNW|A0=94mC9TMW>G#w>WwYft zKQrKqd+qMwleeGhanS6Vyy(UHB3he-LYw=4B6zBduwBc)H+!63Z%bt3Us>*bigF%*OcpjPbniKShT~?R1)yB~c}Mz2+{dlrc& zYHr%|7TQMv4qujOiL$k%`M^5E_AuSetkBvcPw7ITqSRx2AX=$Rv-Q|%t)UjYbJ(xI z^?)J;66efBT6pZ^mYCn+v23kEI;(hzrc$hWy)msCFVuu*D=&6${{th;s$AY(Je=~D z8kf->bg5X!3dCouD$Xi`^nU6~YyPJOfxhFf%g<9zKgu_;duD-qWanTZrt z^3i-6or-8XSWMZwLgvYz{)8{$R&Maeuc(J-!w(v7$GPz2aAPG$wv=QGzDRQ|Rj0f^ z4x^M1SPAX0<;ewt1|A%Lp#yS&mU@}#&+LgEsW8UZtZZah`~aGj=I2?cT!0A?zx;z_pytv`P%PHO>y{p|G7hML+X~)l9YA zAM@VkHP<{F-`#8&)TQ_wepEcahOaLSVZc>02L*Hss1#-Q)Zna)QjC-66 zfQgBJtHq+WJi0ZHRcWX|(0^1vE%kCMe#uG6RneT8GPA`%98t(SCQk&hvM%~d*Q$qD zk|e*}Lwb-?0fnDY%Gtrn%C}3ct--3@Mt(m}=ai}OL-8JQBv`^I@#Xk2M_-Pq_3vic zH(DJE3JO*>-bW74Gd5cNjGbNsuIVPTfAyxZ62f!q*A~|Sl1EGQe@wl1x5)}Eail?O zQ#!f3rdR!QZBfaq@~x0?s7vO*;oPu7s-CiF;iXstxf0xQw zy|BkM+Pp{-y94L{IT05m$z<_X)S*D5zY~KPss^;<2_5@S<#Xfpld(yG3wJme)k7NY zjo~C#Z*P!lTD7YfF}zMUF|;_-&(C$Ev_{#HQ+8-;EGk{y>{Z zcpKTpt)Iu#WDs%CB7(?M^erxui(|dy^3!*yWnP%qC~38*xbHJv7qzt&1hqL~RE~g` z##I)`9XiFfXIL#nC(m>-ZLos)bAEs)(QOKJEbzIc`1ki2^r2_L2MI zG4!eQY?HLUu{JDUH;2cgkB1}NuAA5MvSx~Zc)pH`e}1pj||JnOn8sk>LFv}Z1aVLE1B@1af0>Y{KfN~ucV~Q z%kNP3vV@gAP^xNNLZOm5<{6LXwBPf`YD+VAXfjjt%is?6If~$grC&@Sy2EZ5JxBBM zFGp)i+q-X=%+cPk9I*UkaXKptuG#ByRV^`9MdE92zKt57pDh^1z`NpH2pl*=P-pT( zb8*E=dyG9S-=?0|>`6JdZ9%NP$Uv*nK}zThMm@L`mbuK&|1Z zHDA!-*d6zxqfLLOLdSYD8flI;sfezM$hL_o#=_SCQBtF9^f)5opLikF=6Gk1KHg8K z*ua4h-Wk{vstU=OHmA;P)Nl2u-%VjTXh1IC{5pdoU?~7(&K<^1K4|?9kI&nww!}gU zwWeg-D4twy6DZO~ez?Spv>^3oo!k6xZsRDJnyR_3mV53sE)r4ooWCR@_sCY4j_reJc8Wrii_y{yuk$!3B?4i60K}22SR->YxJ%>#mWFl9XcRuoxxCt5n!}J zJ_3YOx|}(B73oZGB>q-(Ws_GOD;>I7i#dk28kqgxzgkNxWAm$f|5l+-Pp$JbsWPI;RD3fcef zhN*h~&@_3u^$BycOFj@?!6(&ikOjK`d1or&q;`r~xR|gKPtZm-UgAjd$;!$}!bO zJ`7dFwe5DZ=x1tpH7c#bp5tq-&=k~fZZ|nwY=l}mF@PcUD%E0gKsIp62-!S$S{sV&fXmo9c4_V{koFaK z^m~9gEQi{T;13VRFI2z0kjyl7`u3h26at=*o`5eSN=^tce@Lq}v*V=TvKnw&ZWlT2 zxifzTu;)4MgUe-5Dz0WMUyF-CR8g>cWI1q=9ph9uOJc?GE9IF0BS!%B`mj$PC`qgr zfkLz{_X#{+zB*r^EM%N{*l5&x4jz5{ZDc~SX}w*P`!=~G@)Yxy92X?IypOxlx2n+B zD-u((=A_M088=#9kM)dy%yIi@ht`NMYL;=?X_9CXpkn7=*pcu?ODgDe_c z&LRHqYAC?oO8kXE?1SO6K46~`5O8xM)%Mx=y{-JE;id=YVrITd2=S9h9O_!@``jZ; zS;(~(@9Fn>+)pACJ?B7sp`ZTCGjGojZ@9gq%Ftam>P~RqC(&owJI}X|9H)Q)$}|o! zjkgbnH(PzHnuA8Kv!ln~pB?%?C=)FPM|Eq+kSkJ_e!(tlyFzO=4;QgvutW2zY;30z zGQdE+$X5@_Is8fFzY*NLs^4C@`yubn!lX^MXfC&pQO2Ylgxj~KzUHe6(+H(ks)C&x994xtg7#s6|?bKpha zyIp*z$*>uMKh37tz8+O1QH&w~C5I`vx;|Zj3tW6foXdjh4aH>ndfz{Wj^2Lyz_hU+ z4D+=r6;O+ly<{VRhpB{{VGyP=^at*I1q|M>;<|;g14V{^<(3f@9ae#_MkH zVM%1Vl$%sl5?iiyxqCXi@Fy#`|11|!A52|1dHB||A=zpp#VbgZi_WvNnGA0Ag?>sJ z`ZJ-6DJ@Ew+MmzP?-{$aw+#>q?eIi;vj69p3pgnD3w`s^GEx%Zx-%T0)+PG(6-Ue}96yg5*wF@VQNDz=(r3e8@^!M^0qu-J9SrEtZ`oT;3b9 z6mYxlAki0ijYsei>PD&^RdB<)1g&H;Ai1O~I-Sp5ku=R`{p6KjQm$ZKe)~KAVz^jK zlEsK@4gYdJ&{z=_i#uZWG5G1ZTHx$)TH=kMsX1%9)9?RMBO_B}fa9u>vCe_i=~I$>xWkad}7HQI1W>e8Di+VkkP z@i;tr8MlN%jw`gvV4D^ZC4oMo^W&r=boi`WEo)Hoiz29;K;TzAAnhb^2rVi;sf05b z8CFNm@f5GS=xH}D4d$RQRnJ7xXL)7*pEq60VNbf#o3?v}m4f%+%Us-7)D(@Tub{;i ziwJ=msAR;Wx)AdEDlgIa|b8lT>m_eBuF zsV)hvDvmUg?PGVXy75-wGkLap{UzBf>jOq~z~mM0{hu`SuW=096nw3Xf^-$-Q6G#x3VzIu;g2%j-n`Q6D-IbxtIJDc&^emY_Bz-?rm~1UqMrO{I$ezp|As}r z{k`0k_)#zzBo|8hg1c~FD`@W_R;JPj3jgu&dTw2C_$sBB`t={>(DUX?-Tx;+GLO-d zHx+-_jvhEO-GqnohD&G5iCDtVbeQ;u=3HP^XY=nNe;xz^Gpe|eaDbJkF zaqs1Ov3^5KFS^hTSEuyIr&#c1WKKWt9*eT8)=|N2!=8e6a z#bT)q{Lu)!k`D{Knn?6A4mZqr$xKfu;nnQjM9X+HZ#RWgONM+|vdSw5kA8%7>D{wV zD#mM7Ddm+PIj??UKOKGhr!Wer(Jk7KSf*AZ$tQ~<9%mUAsmEIB5G#9jUM-Y%w@$Hu`7KGvSPwaQI=IKTLJ#uf~E%>G*%3mv$&!;^? zwPNKr4Y4pj27#1k#`>}-CoF`T>-29rH|fRfRoX>VTzkb-YEE^OD#gyZMQ3WBA8gd2 zZt`@X8rVIu$}NSL84+qI;WxJ!p^pAeM5QH?P^7(NfSss?#&xlfkA9D={Q3%T2YcbXGFau zF360!i#c=2RE5O6SH8sjoCJc@wS?4|(X%cLkb;Gq^;^M2Ag~mIAfL zzdSc29sH#nUK33DxhOxPip20UwXBQ~7sKS9O<^cSGh@^+!v-{I<#G%4qhvnS#hr6W z69lewjUdD+&X+@NI!b@1+|j+eK?j3RIKRr?)M7MRrqvN^yOS0A%xWHc+UA> z`<`HaQkKjh$Wn-HfV{GoJ;97z;S3jd$dH>gs_5wj!eySojA4*ML!YqpP2-cvIOyNM z``FkP(N!hdZyt7K*$5Gf$N&p72d|kMMnt0KU`JEMood8HB*O4Q=b;}&y~7!z(Dn|P zVzUgK_X^glm=w~=l3u!0?1wT)tp`{&UH4Wk zNcsHT#=4YD3zkX|ne)>_j@jr|dKno0X^-=yb@E~=iHr%x!ij%LjCaPvZWL)v#zF*& zLn(AkHtI2!C6D1?FO?C_!tr3VF@C6nlRQ1LVP0lS{@Ha^YeemqhG*2YasQlN0mXGC z&EsWpC-&f z`luARni4Unsj}dwzivX=+r_2?-oV@m_-IR5cth}F5okB)9q5@wzf$nxDM_~U4& zQ%4E)_IP`6-S=_l=Ck^xtJkT)FHiCF16HoKRvg10CZ_;+8{j)@ZZXQY=DZ`DmRrud z8DAa2_s#8zmBTL3scQM=A`$V4X6{M_!K;${zJ!z@XaS*obcKtANh3A2bRga0$hO@v zDz`JgEw4*k69%{`KlWZzgEx>;#v}8Z)Q)6q0nkU$PHi_Cw686FDhe zo5gR{p>R{COfSP!63DG)GS%6Aidy#6k)21%=j--2mH84Vho11_SApIAYg(xX*Gpx? zuR+Phkk|CH5ND^yA{T8TX)G^*o8BuaZORV-*=~l zcc<(qk@zJ#8t0IMk+DB4T`E|gIwvAPD8Q33*JbxpO~c%H zAyrsT4f;-@M6Jl5k18E1zX1|@-UvvSGb($4eDQ$p%K)C$)>opkPD!WoF*O~`!kzF% z&BT0p9}j`HRr%A!l{P1n7un;!xqbz1N7zjkWLH>;9f8WRsVFP)3N!2n!MisPjfBmC zV(aVR!)n4xc_)MOWD364gR|Y3;#6ber-^<@72t;V2nD&nhqM0XQ_ZPWH7JM5pzG*H zwZwHPc4r~MZ-qkED4APq;U;#2nwbTK59OgY$rjM}1poBxS)@X4=b`N{o%YBtfVgFs zUqa;Tq2;V$I`)p$3m^b-@KktxEeB%X_2?#^26YD9Ten`}x2;=~S*PDL%iSA2N;(Lo z9d}IUX)_>PZ~tY`ozpSMKmWKSO1e@a+iL6_X|;U=Or1B5^|JAALw1LOyQgRY$$34r z?piWH`=u)l;aOa#SHyWsY?@N?3~Y9DA%?b(WmF+PvMDBatrz&kTDy3(5-zC9?tsO2 zCx<6K(86PGhhlH(8X}iU_7J%YDE9lr{)D-tel00-?{XBd$d>JY_VIqc)3v#-mFElUYI`mpz(v27LOPy^}Zny8=;h3ih;37}1RwS`C~BIr_ex0)#K7XW>gp zQ>?e>b&swW{XR3#DvGuxwztL60PTF|QGc4W5b@V6rk+wAsct1G9us}r!Sn7J-?9`R6-RUu!O#)SIIao-^jn~T6HCG9s)yFdPVT2&FSwSQXh99syBxn zyfJO^-*>L}mH!hz1ipb>TYhlp(-0Ct-&r?BzrV$%E zcr6}uzTzZ_;5CApj;;(L(yVX-!%LQlG#-PL;dh86ksIGcdAL3ki}SdjlwKr&*Nm5B z!|$9)Q>Rc;|IS%X={irFfTj`-mromiNc;X!#BFoY?ER-%yB;Qh2JI6*QuEpFx9IMb zL~-+U-%g^pU*3Msx4HRsdp7OQ&E@l7e+gLFZ7l$D&*cVpz#Hr_%dIAf()!N;ur~I} zB%;Mv$QyjSHp%U@9sP43)Y%wQE&bdvi5RSy!|N7&3!r6m=`!-k`)@ILbxPSU z`K65M_?0yyu-ag?K9?tGRF+Ssy~Q7i zu9vs`#FL{iINVjD&5UWIR_$LErHF1U&9EuMgN%xP310C+$dw@tIYeA!wM zlh;uHds5+Jey(4ibG{$1e&+W2|Ma73( literal 0 HcmV?d00001 diff --git a/trtools/testsupport/sample_vcfs/beagle/beagle_badap.vcf.gz.tbi b/trtools/testsupport/sample_vcfs/beagle/beagle_badap.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..36580ff2394cb0c65fa58bad9242b28b46009fe9 GIT binary patch literal 196 zcmb2|=3rp}f&Xj_PR>jW%NgF@)a7yt6li<+U27Nf!GvPQMG3~8uA67Db{EWcTYWWY z^KORvS*wNL&1Kx1U9sFIy{*4^+5E3Hks=P<9*k{?;y2#%PcAI2`ts_^%w$oU-0b;= z=l|L5=4FU8JiqTv{W6C4Gt&8|vmQ{}rtt23dD!(kYu*{#zxoiK`rCZ*DrAuP`Mln1 XZasFvI3pPb26;4hN;5Enoed%YbTm#U literal 0 HcmV?d00001 diff --git a/trtools/testsupport/sample_vcfs/beagle/beagle_imputed_badalleles.vcf.gz b/trtools/testsupport/sample_vcfs/beagle/beagle_imputed_badalleles.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..67e6c0f0b67134488ef906e32d9bbe2de7be7194 GIT binary patch literal 825 zcmV-91IGLxiwFb&00000{{{d;LjnMD0-cm!Z<{a_#h=NixC$@pnj&EBTCy?83ZadZ zes{QLxcz0^KJ<|*) z)Acy=KTiUGNmi>tfGU=()J$-{Px2ggEY~O%lvF89`d@lx14^hy-?iOS`@;5mrl~|F zWAyu5q-l0AtVlo7L8lu`Gh=r7x)4Bn+sP7T^v#MwuLLwBx_vO2Y9KkWT@0ahQ=;#J0Pe98G zpK1;TPG-wTf%c<7njY$#6lGOUDX9F3vgr>O(TuDc!sWVS`$|j3JBUl4DYW|sv7E8-f^Z?&bB9x<#nq@q9`a&{CW>38D!~)i%pqPg~{ewYBsdA?&6)5 z{`v$#owXI!K&liR8yshN<_+zSQ0MO;7NgFMUkuhh3bClT2F4-XfAUTu$ozj&x)z$| zz?(u4JD?3*)!ew=CmsA>9CMyo=Gy=EOu^NqZQI`Ypk2LYb!;2j$7jRZ7;NsD&bl_v z1aED0!EuLY?$Gu2Hjb{M#mrbPR>mYSqH$=D>Du`EjZBU9jWlX!jP?gJ_6qi^XY3X9 zegpbJ7-(h%001A02m}BC000301^_}s0syH1oz6`Q!Y~X5;B)*jBZ*tvtyT!FP7jV^ z)vJdgc-9^L@fJ~;;Ay;sd_dsouQw*3lFO<>4VyNd&Ro*Gs=>n|9ODuSh>{Y7Jv%IF zV`ZH?BC=g|!R1$@6=}QOMJwK+mCh+>4^2poM9Rg04{TofH_iHw!60b`!{t5@h#v(+ ze)fDBspw5&R9_O3G1Ey;gq4HF76AYNABzYC000000RIL6LPG)o8vp|U0000000000 D0}gw3 literal 0 HcmV?d00001 diff --git a/trtools/testsupport/sample_vcfs/beagle/beagle_imputed_badalleles.vcf.gz.tbi b/trtools/testsupport/sample_vcfs/beagle/beagle_imputed_badalleles.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..8aa0b1afd7e9893ae6b3a307b22a89d14d58b90d GIT binary patch literal 154 zcmb2|=3rp}f&Xj_PR>jWwG401Zscuo;BmQler<_tOogDR180hWvXnxjNYnx*t%WX3 z*O;n)_{~e{JzjfRjP(PfQ-jfO)?=9~wS@})th;h-R`Y+B3vUa5-gA#WGhx4rUe&WN fdqR=H)!W&A$6Kw6Z%H#S$fMaV&A<$H42S>#s{QLxcz0^KJ<|*) z)Acy=KTiUGNmi>tfGU=()J$-{Px2ggEY~O%lvF89`d@lx14^hy-?iOS`@;5mrl~|F zWAyu5q-l0AtVlo7L8lu`Gh=r7x)4Bn+sP7T^v#MwuLLwBx_vO2Y9KkWT@0ahQ=;#J0Pe98G zpK1;TPG-wTf%c<7njY$#6lGOUDX9F3vgr>O(TuDc!sWVS`$|j3JBUl4DYW|sv7E8-f^Z?&bB9x<#nq@q9`a&{CW>38D!~)i%pqPg~{ewYBsdA?&6)5 z{`v$#owXI!K&liR8yshN<_+zSQ0MO;7NgFMUkuhh3bClT2F4-XfAUTu$ozj&x)z$| zz?(u4JD?3*)!ew=CmsA>9CMyo=Gy=EOu^NqZQI`Ypk2LYb!;2j$7jRZ7;NsD&bl_v z1aED0!EuLY?$Gu2Hjb{M#mrbPR>mYSqH$=D>Du`EjZBU9jWlX!jP?gJ_6qi^XY3X9 zegpbJ7-(h%001A02m}BC000301^_}s0syE0ozAfe!Y~j9;Q4%*a(A{#+8E(Tv^rRV zQCEi|IBQ2A-XJO&oaE=YyWxj#yxy3AYOb3GHEjBHI&)3)rUeg+aEwc+AWBLQ_Uy1~ zjg@uYh{$#|1edEuE7Eqmi&nftD}z(eAG(k_iIj%{AK1O}>za)pgF(^?rptXI5MKmD z%C8X7o5QHS93*4@=R5(>utHH10RR9WiwFb&00000{{{d;LjnLB00RI30000000006 CY<#i+ literal 0 HcmV?d00001 diff --git a/trtools/testsupport/sample_vcfs/beagle/beagle_imputed_goodalleles.vcf.gz.tbi b/trtools/testsupport/sample_vcfs/beagle/beagle_imputed_goodalleles.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..2677f52137150b39d984758e781f92997b5d13db GIT binary patch literal 154 zcmb2|=3rp}f&Xj_PR>jWwG401Zscuo;BmQler<_tOqryp180hWvXnxjNYnx*t%WX3 z*O;n)_{~e{JzjfRjP(PfQ-jfO)?=9~wS@})th;h-R`Y+B3vUa5-gA#WGhx4rUe&WN fdqR=H)!W&A-XC@q-I8WtkVms!nt>VY7!Uyf^*c4k literal 0 HcmV?d00001 diff --git a/trtools/testsupport/sample_vcfs/beagle/beagle_imputed_noTRs.vcf.gz b/trtools/testsupport/sample_vcfs/beagle/beagle_imputed_noTRs.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..725bad1f0d051bc772930402b29b0b212ca08b0b GIT binary patch literal 787 zcmV+u1MK`CiwFb&00000{{{d;LjnMD0-cm!Z<{a_#h=NixC$@pnj&EBTCy?83ZadZ zes{QLxcz0^KJ<|*) z)Acy=KTiUGNmi>tfGU=()J$-{Px2ggEY~O%lvF89`d@lx14^hy-?iOS`@;5mrl~|F zWAyu5q-l0AtVlo7L8lu`Gh=r7x)4Bn+sP7T^v#MwuLLwBx_vO2Y9KkWT@0ahQ=;#J0Pe98G zpK1;TPG-wTf%c<7njY$#6lGOUDX9F3vgr>O(TuDc!sWVS`$|j3JBUl4DYW|sv7E8-f^Z?&bB9x<#nq@q9`a&{CW>38D!~)i%pqPg~{ewYBsdA?&6)5 z{`v$#owXI!K&liR8yshN<_+zSQ0MO;7NgFMUkuhh3bClT2F4-XfAUTu$ozj&x)z$| zz?(u4JD?3*)!ew=CmsA>9CMyo=Gy=EOu^NqZQI`Ypk2LYb!;2j$7jRZ7;NsD&bl_v z1aED0!EuLY?$Gu2Hjb{M#mrbPR>mYSqH$=D>Du`EjZBU9jWlX!jP?gJ_6qi^XY3X9 zegpbJ7-(h%001A02m}BC000301^_}s0sw;mZHqAu!Y~j7>-|i0ci3^_!$_wX0tJzU zftG@XnwrOBrH)psnP%RfuQ1q&^PU3EbA7(V@y?H!u+NWjK!;+rBHtuYHx6p2Mj{eD zE;+Sy2t5bAT#MS2NNq_y=6lRFTH$s9{V1;6vOhaFjUs~p001A02m}BC000301^_}s R0stET0{{R300000003oIYKs5> literal 0 HcmV?d00001 diff --git a/trtools/testsupport/sample_vcfs/beagle/beagle_imputed_noTRs.vcf.gz.tbi b/trtools/testsupport/sample_vcfs/beagle/beagle_imputed_noTRs.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..f6ee16b0f0efcf4aa648b8410663ad1c498bb9e1 GIT binary patch literal 139 zcmb2|=3rp}f&Xj_PR>jWc?@q)ALMm#5OKX|ts)i3sCr|g81Ib@ONGzg@;c@-L+;(} zIZyt7f0oF@xS~PFVRlYKsr4_tx@lLMj78tSD0!M%$v@@h>w5Q4WMF^aW+!*N^&x2n P26;5=q#2mOwu1-&JEbo# literal 0 HcmV?d00001 diff --git a/trtools/testsupport/sample_vcfs/beagle/beagle_tinyrefpanel.vcf.gz b/trtools/testsupport/sample_vcfs/beagle/beagle_tinyrefpanel.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..d684ee6b9ef2a4c21d40f9c14d33971268711492 GIT binary patch literal 3131 zcmV-B48-#viwFb&00000{{{d;LjnN43C&w=bD~%l{+#(0chglf$xcu%F9OcGwUkOH zQ!#Nx(>t}bYoT7183YC}lk~5ja{=)M)JxQ^>8iAfhx78BbDse%4BfbUS}sN;Xy|wS zcb_y2$FrjCsO>PaAxSbp5T7^%83FlyN!+nb!Z`lgTTfWI!+YJuPV6!2apvz!DMSF+)Ndx6IE1pPj7mQ1;E>&&k<*$DvZUiWE_Q6 zKjdcbi-7#*SflD32F$7wHzbRZQ^h>b(~K7+29d_Hjhi5jQ#GOTRH1=#we*n;frr*F;bC_3;_F}k7I_=3{F~{wJ_yU(z zbRe*Ruu_+lI;H5wt3gt8i369Kupv#IRrFuDbA@6ED$*AgP*MlLVpQ%4}^T zU`><-EpH>PY_O&Znk>r&HqzRL7qA3Ic^f&iQL!wlLXn?ZW~1;bFKbGHjgr~Onka#d zn72_g8%f5ZS`%w|8!fXDRZ)`^tmJJlzcvyTO(_HsC8k+9)OP-KxWC=IW% z#*$c*L_sYuT7%6iyd)~P$VgrrVNJtWEee;xt??p&Ajt(b8QiKU$ciN40vk26;Y9_K zF7lJXEvq%G)NnyZgbeMP1pjbNDX__K1gNnp)e4IL&w4^?yu`!w=WTw;+$e&eNLb#! zfo0n0^}XABvvd4*nz{^Nzf5j2JK6Qo&aE`lX50rVagZAu(k(f?wxX9>WBPo8X6db3 zxa)q{*3C1ApvS;#E|T8qT_6&8&DS)zL?rN3VtAR=ZdM2rg}VlX4p+@vp^k*jHp zBV^UwB&!em%~MlzlbSxB?;!7YgNQz!uaWm_o&jI80N)(RoPtKFaXD95_T|#d6UZqu z#U9o?TeW7XYK^m1TZRhmy~Ri9APz)7yr7@7EW_i}K;wLX9uCGSqKdel!Fl5pQAb>l zaproA;yC7+`(y6u&xh&d7<#40@e*m2S&ZqlULcK9RFpRV0%@3~(J(H8MoFqVTkyPd z5`CNpv(KEFm=Gd;mWSqSwqd1az_CA?=`g9^A9>TE1+$W=X znM6wQKr*=(Nu(SvCPQ$MM9k8N7#BjsD2K=y!|%8^3Yc-9I0e2lOBF8f#RO;+kRPM7 zJjWRY{KlHuAnzvJXtB+{Ek;X_l+fI7gMt<8%L| z%|`B26%kRz!m4lG&O@t?WkuCQzIOW)!i|~dP3{oi7+I5i5GK;!3=(&$WDus!V5^cr z`5;W2#8xFn`5^e!NhiRx-xiAU`egJk_kd`qkvey`JN3Q z48sAM)6T`o$wY_oxu~-QY-#yK|z>emNa0k2RL9fY#eBvj=j_yavenl!f zCBHnh+D~hz2a;<#RK_1tuYu)GyamORj!?MteJ_X-&wfDc#nO*rG5Ro-_J?!JCw3%N zgo>~!8-opXu~yq@Yk{4%Yqb)W(dUS3A?~?I=@xpl)>OdMTKZc$w_UTm%FQ4nBw(s? zs$2w_b53C6S#XQ zk((w*e~3kFwIlL+j!^jJ)8CeSmSGuq%OuaLFO)5xrU*=PHxFEqS z1`giNl>)=_6h=5!bw{7MxEAV5-(sl+VB0Pm3NHl zz+-(+{m~^EtI&s`>!)Ds!RVOePc77d9$(CPrRW@$=~9}I38SSiX9)B0a+T|1ES#5O z5mn;--u6xC(1gm&(_;sI;<1shz#mRHL{B=_6ji-T9yOHwatZo9g_TL1?NmFkFT|+= zc6=;^RMS*pw|MScb)U_vht^fEZCvS({i}Zu^vA2@r&U*!0IrgszFeg-(?{*QKLK|= z`nQ!e001A02m}BC000301^_}s0svnEZB*Ne+b|G)R=*{Lku2Y$s-Vhpys&YMWT*6L zS<=vj?xk$pr4RY|MHeTEcL_0bW{%FBk%aaSuLL7Z2~AVTQL{F?J3?`imPn$n)zE!I zOSD8na$ut7umpevlLS&GHyo2?l48O{%PAA>of3v)oP&+UM)R_8p&B&DB+Fd7)*R=B zi;bldAc1rD@A>uc$T2_`#4ed4NQO*S(jX8LWnKjJX-z^16ScGCTo)bl@F|X&f>tDA1Z98`oF*I ztxa3M=d=foJw#~_9J`L$^^`WTwwbigDGtlBEDK7RYAcQ@De?^G% zEvE!yV6xGK@9opV#O->kZ~A6q_^-p?@nP|F|2*C^vmLZl{P+F%=lHfbzKt(0k4L6< zH^uUE=8`U@4%5XTr37_o!;lQ-2h2&Y8rL0lC6;1L4 zQnzuP-7r$pl(0q*t!>+Kiq`$AGOKlGhgrc2FJJ+2{0?ZNgqV*7001A02m}BC00030 V1^_}s0stET0{{R300000008;S4OIXD literal 0 HcmV?d00001 diff --git a/trtools/testsupport/sample_vcfs/beagle/beagle_tinyrefpanel.vcf.gz.tbi b/trtools/testsupport/sample_vcfs/beagle/beagle_tinyrefpanel.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..a60fb23c70e9c42b26780010e63540a2c103beb8 GIT binary patch literal 158 zcmb2|=3rp}f&Xj_PR>jWjSO$k?&fU{5MaG{Fk+gHW$Bv(q7IxX0?JYfjZ=a$SgaP9 zHXY}Bk!WUlsqWpj(7CAxm^m2#wd*gCU$#ks{aXI5sCl=o&(_N~r1hPyw!gG!*JJND juIFaN=Z7MLuO?UdHi@RD-;!ovkVkWYGy^l(K_CJE>Pa>5 literal 0 HcmV?d00001 diff --git a/trtools/utils/tests/test_trharmonizer.py b/trtools/utils/tests/test_trharmonizer.py index dba2f29f..43d3c117 100644 --- a/trtools/utils/tests/test_trharmonizer.py +++ b/trtools/utils/tests/test_trharmonizer.py @@ -389,9 +389,13 @@ def test_TRRecord_GetGenotypes_Dosages(): diploid_record.FORMAT["AP1"] = np.array([[10, 0.5], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0]]) with pytest.raises(ValueError): rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap) + # If not in strict mode these should be nan + assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap, strict=False)).all() diploid_record.FORMAT["AP1"] = np.array([[-0.5, 0.5], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0]]) with pytest.raises(ValueError): rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap) + # If not in strict mode these should be nan + assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap, strict=False)).all() # Test triploid example where alt=[] triploid_record = get_triploid_record() @@ -421,7 +425,10 @@ def test_TRRecord_GetGenotypes_Dosages(): rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap) with pytest.raises(ValueError): rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap_norm) - + # If not in strict mode, should instead return NA + assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap, strict=False)).all() + assert np.isnan(rec.GetDosages(dosagetype=trh.TRDosageTypes.beagleap_norm, strict=False)).all() + # Test example with fewer alt_alleles than the max genotype index with pytest.raises(ValueError): trh.TRRecord(dummy_record, ref_allele, [], "CAG", "", None) diff --git a/trtools/utils/tr_harmonizer.py b/trtools/utils/tr_harmonizer.py index 5fe8c8f7..5e63c176 100644 --- a/trtools/utils/tr_harmonizer.py +++ b/trtools/utils/tr_harmonizer.py @@ -12,7 +12,7 @@ import numpy as np import trtools.utils.utils as utils - +import trtools.utils.common as common # List of supported VCF types # TODO: add Beagle @@ -261,6 +261,8 @@ def HarmonizeRecord(vcftype: Union[str, VcfTypes], vcfrecord: cyvcf2.Variant): ---------- vcfrecord : A cyvcf2.Variant Object + vcftype : VcfTypes + Type of the VCF file Returns ------- @@ -342,7 +344,6 @@ def _HarmonizeHipSTRRecord(vcfrecord: cyvcf2.Variant): start_offset = int(vcfrecord.INFO['START']) - pos pos_end_offset = int(vcfrecord.INFO['END']) - pos neg_end_offset = pos_end_offset + 1 - len(vcfrecord.REF) - if start_offset == 0 and neg_end_offset == 0: full_alleles = None else: @@ -353,7 +354,6 @@ def _HarmonizeHipSTRRecord(vcfrecord: cyvcf2.Variant): full_alleles = (vcfrecord.REF.upper(), full_alts) - # neg_end_offset is the number of flanking non repeat bp to remove # from the end of each allele # e.g. 'AAAT'[0:-1] == 'AAA' @@ -379,7 +379,6 @@ def _HarmonizeHipSTRRecord(vcfrecord: cyvcf2.Variant): ) else: alt_alleles = [] - # Get the motif. # Hipstr doesn't tell us this explicitly, so figure it out motif = utils.InferRepeatSequence(ref_allele[start_offset:], @@ -1084,7 +1083,7 @@ def UniqueStringGenotypes(self) -> Set[int]: return set(self.UniqueStringGenotypeMapping().values()) def GetDosages(self, - dosagetype: TRDosageTypes = TRDosageTypes.bestguess) -> Optional[np.ndarray]: + dosagetype: TRDosageTypes = TRDosageTypes.bestguess, strict: bool = True) -> Optional[np.ndarray]: """ Get an array of genotype dosages for each sample. @@ -1105,6 +1104,9 @@ def GetDosages(self, ---------- dosagetype : Enum Which TRDosageType to compute. Default bestguess + strict : bool + If False, output a warning but do not die on errors validating AP field + If errors are encountered, return nan dosage values Returns ------- @@ -1115,10 +1117,14 @@ def GetDosages(self, if self.GetNumSamples() == 0: return None if (dosagetype in [TRDosageTypes.beagleap, TRDosageTypes.beagleap_norm]) and \ - (self.vcfrecord.format("AP1") is None or self.vcfrecord.format("AP2") is None): - raise ValueError( - "Requested Beagle dosages for record at {}:{} but AP1/AP2 fields not found.".format(self.chrom, self.pos) - ) + (("AP1" not in self.vcfrecord.FORMAT or "AP2" not in self.vcfrecord.FORMAT) or \ + (self.vcfrecord.format("AP1") is None or self.vcfrecord.format("AP2") is None)): + error_msg = "Requested Beagle dosages for record at {}:{} but AP1/AP2 fields not found.".format(self.chrom, self.pos) + if strict: + raise ValueError(error_msg) + else: + common.WARNING(error_msg) + return np.array([np.nan]*self.GetNumSamples()) if dosagetype in [TRDosageTypes.bestguess, TRDosageTypes.bestguess_norm]: # Get length gts and replace -1 (missing) and -2 (low ploidy) with 0 # But if normalizing set those to np.nan since unclear @@ -1141,10 +1147,19 @@ def GetDosages(self, # Check AP field. allow wiggle room for rounding if np.any(np.sum(ap1, axis=1) > 1.1) or np.any(np.sum(ap2, axis=1) > 1.1): - raise ValueError("AP1 or AP2 field summing to more than 1 detected") + error_msg = "{}:{} AP1 or AP2 field summing to more than 1 detected".format(self.chrom, self.pos) + if strict: + raise ValueError(error_msg) + else: + common.WARNING(error_msg) + return np.array([np.nan]*self.GetNumSamples()) if np.any(ap1 < 0) or np.any(ap2 < 0): - raise ValueError("Negative AP1 or AP2 fields detected") - + error_msg = "{}:{} Negative AP1 or AP2 fields detected".format(self.chrom, self.pos) + if strict: + raise ValueError("Negative AP1 or AP2 fields detected") + else: + common.WARNING(error_msg) + return np.array([np.nan]*self.GetNumSamples()) # Get haplotype dosages if len(self.alt_allele_lengths) > 0: max_alt_len = max(self.alt_allele_lengths) @@ -1167,12 +1182,18 @@ def GetDosages(self, else: # Normalize to be between 0 and 2 dosages = (unnorm_dosages-2*self.min_allele_length)/(self.max_allele_length-self.min_allele_length) - assert not (np.any(dosages>=2.1) or np.any(dosages<=-0.1)) + if (np.any(dosages>=2.1) or np.any(dosages<=-0.1)): + error_msg = "{}:{} Error normalizing dosages: value >=2.1 or <=-0.1 detected".format(self.chrom, self.pos) + if strict: + raise ValueError(error_msg) + else: + common.WARNING(error_msg) + return np.array([np.nan]*self.GetNumSamples()) dosages = np.clip(dosages, 0, 2) else: dosages = unnorm_dosages return dosages - + def GetLengthGenotypes(self) -> Optional[np.ndarray]: """ Get an array of length genotypes for each sample.