From 084c3967dacd43a9ca67d4bfffa3ec985dd24258 Mon Sep 17 00:00:00 2001 From: Buuxx Date: Tue, 27 Aug 2024 00:56:48 +0800 Subject: [PATCH] Add only_passing option to statSTR --- trtools/prancSTR/README.rst | 6 +++--- trtools/statSTR/README.rst | 1 + trtools/statSTR/statSTR.py | 6 ++++++ trtools/statSTR/tests/test_statSTR.py | 12 ++++++++++++ 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/trtools/prancSTR/README.rst b/trtools/prancSTR/README.rst index c2e7a485..6d00bba6 100644 --- a/trtools/prancSTR/README.rst +++ b/trtools/prancSTR/README.rst @@ -45,12 +45,12 @@ Other general parameters: * :code:`--region `: Restrict to the region chr:start-end. VCF file must be bgzipped and indexed to use this option. * :code:`--samples `: Restrict to the given list of samples. Samples are comma separated. * :code:`--vcftype `: Specify the tool which generated the vcf call file for STRs. Currently this will fail if using anything other than :code:`hipstr` VCFs. -* :code:`--only-passing`: Filters out the VCF records with non-passing FILTER column -* :code:`--output-all`: Force tool to output results for all loci. Overrides :code:``--only-passing``. +* :code:`--only-passing`: Filters out the VCF records with non-passing FILTER column. +* :code:`--output-all`: Force tool to output results for all loci. Overrides :code:`--only-passing`. * :code:`--readfield `: Specify which VCF format field output by HipSTR to utilize for extracting read information. We recommend setting this to "MALLREADS". "ALLREADS" is also accepted but we have found that it produces unreliable results. * :code:`--debug`: Print helpful debug messages. * :code:`--quiet`: Restrict printing of any messages. -* :code:`--version`: Print the version of the tool +* :code:`--version`: Print the version of the tool. Notes: diff --git a/trtools/statSTR/README.rst b/trtools/statSTR/README.rst index dc256ec9..3d239c08 100644 --- a/trtools/statSTR/README.rst +++ b/trtools/statSTR/README.rst @@ -30,6 +30,7 @@ Optional general parameters: * :code:`--sample-prefixes `: The prefixes to name output for each samples group. By default uses 1, 2, 3 etc. Must be sample length as :code:`--samples`. * :code:`--region `: Restrict to specific regions (chrom:start-end). Requires the input VCF to be bgzipped and tabix indexed. * :code:`--precision `: How much precision to use when writing stats (default = 3) +* :code:`--only-passing`: Filters out the VCF records with non-passing FILTER column. For specific statistics available, see below. diff --git a/trtools/statSTR/statSTR.py b/trtools/statSTR/statSTR.py index 1750e10a..b642b175 100644 --- a/trtools/statSTR/statSTR.py +++ b/trtools/statSTR/statSTR.py @@ -453,6 +453,8 @@ def getargs(): # pragma: no cover filter_group.add_argument("--region", help="Restrict to the region " "chrom:start-end. Requires file to bgzipped and" " tabix indexed.", type=str) + filter_group.add_argument("--only-passing", help="Only process records " + " where FILTER==PASS", action="store_true") stat_group_name = "Stats group" stat_group = parser.add_argument_group(stat_group_name) stat_group.add_argument("--thresh", help="Output threshold field (max allele size, used for GangSTR strinfo).", action="store_true") @@ -574,6 +576,10 @@ def main(args): nrecords += 1 trrecord = trh.HarmonizeRecord(vcftype, record) + + if args.only_passing and record.FILTER is not None: + continue + if args.plot_afreq and num_plotted <= MAXPLOTS: PlotAlleleFreqs(trrecord, args.out, sample_indexes=sample_indexes, sampleprefixes=sample_prefixes) num_plotted += 1 diff --git a/trtools/statSTR/tests/test_statSTR.py b/trtools/statSTR/tests/test_statSTR.py index 0a108a17..3a7886b7 100644 --- a/trtools/statSTR/tests/test_statSTR.py +++ b/trtools/statSTR/tests/test_statSTR.py @@ -17,6 +17,7 @@ def args(tmpdir): args.sample_prefixes = None args.plot_afreq = False args.region = None + args.only_passing = False args.thresh = False args.afreq = False args.acount = False @@ -49,6 +50,17 @@ def test_RightFile(args, vcfdir): retcode = main(args) assert retcode==0 +# Test rhe only passing option +def test_OnlyPassing(args, vcfdir): + fname = os.path.join(vcfdir, "CEU_test.vcf.gz") + args.vcf = fname + + # With only passing + args.only_passing = True + args.region = None + retcode = main(args) + assert retcode==0 + # Test all the statSTR options def test_Stats(args, vcfdir, capsys): fname = os.path.join(vcfdir, "few_samples_few_loci.vcf.gz")