From 9216de914695a9e9ca95c2c5f26472fee5e281f3 Mon Sep 17 00:00:00 2001 From: danejo3 <33472323+danejo3@users.noreply.github.com> Date: Thu, 27 Oct 2022 14:32:44 -0400 Subject: [PATCH] Improved auto downsampling with custom coverage and avg read length (#23) --- CHANGELOG.md | 1 + yeat/Snakefile | 31 +++++++++++++++--------- yeat/cli.py | 36 +++++++++++++++++++++------- yeat/tests/test_cli.py | 54 ++++++++++++++++++++++++++++++++++++++---- 4 files changed, 98 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8eb9a16..5afebb5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Support for assembly configuration file (#16) - Commands to run a short or long test suite (#20) - Custom downsampling flag (#21) +- Custom coverage flag (#23) ## [0.1] 2021-12-01 diff --git a/yeat/Snakefile b/yeat/Snakefile index 750cbad..fc97ed8 100644 --- a/yeat/Snakefile +++ b/yeat/Snakefile @@ -7,9 +7,11 @@ # Development Center. # ------------------------------------------------------------------------------------------------- +import json from shutil import copyfile import pandas as pd from pathlib import Path +from random import randint rule all: @@ -81,7 +83,9 @@ rule downsample: read2="seq/fastp/{sample}.R2.fq.gz", mash_report="seq/mash/{sample}.report.tsv" params: - downsample=config["downsample"] + coverage=config["coverage"], + downsample=config["downsample"], + fastp_report="seq/fastp/fastp.json" run: if params.downsample == -1: p = Path("seq/downsample") @@ -89,20 +93,25 @@ rule downsample: copyfile(input.read1, output.sub1) copyfile(input.read2, output.sub2) return - result = input.mash_report - df = pd.read_csv(result, sep="\t") + df = pd.read_csv(input.mash_report, sep="\t") genome_size = df["Length"].iloc[0] - print(f"genome size: {genome_size}") + print(f"genome size: {genome_size}") + with open(params.fastp_report, "r") as fh: + qcdata = json.load(fh) + base_count = qcdata["summary"]["after_filtering"]["total_bases"] + read_count = qcdata["summary"]["after_filtering"]["total_reads"] + avg_read_length = base_count / read_count if params.downsample == 0: - coverage = 150 - read_length = 250 - down = (genome_size * coverage) // (2 * read_length) - print(f"auto down: {down}") + down = int((genome_size * params.coverage) / (2 * avg_read_length)) else: down = params.downsample - print(f"custom down: {down}") - shell("seqtk sample {input.read1} {down} > seq/downsample/{wildcards.sample}.R1.fq") - shell("seqtk sample {input.read2} {down} > seq/downsample/{wildcards.sample}.R2.fq") + seed = randint(1, 2**16-1) + print(f"[yeat] average read length: {avg_read_length}") + print(f"[yeat] target depth of coverage: {params.coverage}x") + print(f"[yeat] number of reads to sample: {down}") + print(f"[yeat] random seed for sampling: {seed}") + shell("seqtk sample -s {seed} {input.read1} {down} > seq/downsample/{wildcards.sample}.R1.fq") + shell("seqtk sample -s {seed} {input.read2} {down} > seq/downsample/{wildcards.sample}.R2.fq") shell("gzip -f seq/downsample/*") diff --git a/yeat/cli.py b/yeat/cli.py index 999a772..a0b6ed9 100644 --- a/yeat/cli.py +++ b/yeat/cli.py @@ -7,8 +7,7 @@ # Development Center. # ------------------------------------------------------------------------------------------------- -import argparse -from argparse import Action +from argparse import Action, ArgumentParser, ArgumentTypeError import json from pathlib import Path from pkg_resources import resource_filename @@ -46,6 +45,7 @@ def run( sample="sample", dryrun="dry", downsample=0, + coverage=150, ): snakefile = resource_filename("yeat", "Snakefile") r1 = Path(fastq1).resolve() @@ -66,6 +66,7 @@ def run( sample=sample, dryrun=dryrun, downsample=downsample, + coverage=coverage, ) success = snakemake( snakefile, @@ -79,8 +80,18 @@ def run( raise RuntimeError("Snakemake Failed") -def get_parser(): - parser = argparse.ArgumentParser() +def check_positive(value): + try: + value = int(value) + if value <= 0: + raise ArgumentTypeError(f"{value} is not a positive integer") + except ValueError: + raise ArgumentTypeError(f"{value} is not an integer") + return value + + +def get_parser(exit_on_error=True): + parser = ArgumentParser(exit_on_error=exit_on_error) parser.add_argument("-v", "--version", action="version", version=f"YEAT v{yeat.__version__}") parser.add_argument( "-o", @@ -88,7 +99,7 @@ def get_parser(): type=str, metavar="DIR", default=".", - help="output directory; default is '.'", + help="output directory; default is current working directory", ) parser.add_argument( "-t", @@ -96,14 +107,14 @@ def get_parser(): type=int, metavar="T", default=1, - help="number of threads for Snakemake; default is 1", + help="execute workflow with T threads; by default T=1", ) parser.add_argument( "--sample", type=str, metavar="S", default="sample", - help="sample name for Snakemake; default is 'sample'", + help="specify a unique sample name S for storing assembly results in the working directory; by default S=sample", ) parser.add_argument( "-n", @@ -117,7 +128,15 @@ def get_parser(): type=int, metavar="D", default=0, - help="downsample reads down to the desired number; by default, D=0; when set to default, YEAT will auto downsample; set D=-1 to not downsample", + help="randomly sample D reads from the input rather than assembling the full set; set D=0 to perform auto-downsampling to a desired level of coverage (see --coverage); set D=-1 to disable downsampling; by default D=0", + ) + parser.add_argument( + "-c", + "--coverage", + type=check_positive, + metavar="C", + default=150, + help="target an average depth of coverage Cx when auto-downsampling; by default, C=150", ) parser.add_argument( "--init", @@ -142,4 +161,5 @@ def main(args=None): sample=args.sample, dryrun=args.dry_run, downsample=args.downsample, + coverage=args.coverage, ) diff --git a/yeat/tests/test_cli.py b/yeat/tests/test_cli.py index 45bd65e..bc9c873 100644 --- a/yeat/tests/test_cli.py +++ b/yeat/tests/test_cli.py @@ -7,6 +7,7 @@ # Development Center. # ------------------------------------------------------------------------------------------------- +from argparse import ArgumentError import json import pandas as pd from pathlib import Path @@ -40,11 +41,6 @@ def test_basic_dry_run(tmp_path): cli.main(args) -def test_no_args(): - with pytest.raises(SystemExit, match=r"2"): - cli.main(None) - - def test_invalid_read1_file(): with pytest.raises(Exception, match=r"No such file:.*read1\'"): read1 = "read1" @@ -120,3 +116,51 @@ def test_custom_downsample_input( assert df.iloc[12]["sample_contigs"] == num_contigs assert df.iloc[13]["sample_contigs"] == largest_contig assert df.iloc[14]["sample_contigs"] == total_len + + +@pytest.mark.parametrize("coverage", [("-1"), ("0")]) +def test_invalid_custom_coverage_negative(coverage): + arglist = [ + data_file("megahit.cfg"), + data_file("short_reads_1.fastq.gz"), + data_file("short_reads_2.fastq.gz"), + "--coverage", + coverage, + ] + with pytest.raises(ArgumentError, match=rf"{coverage} is not a positive integer"): + args = cli.get_parser(exit_on_error=False).parse_args(arglist) + + +@pytest.mark.parametrize("coverage", [("string"), ("3.14")]) +def test_invalid_custom_coverage_noninteger(coverage): + arglist = [ + data_file("megahit.cfg"), + data_file("short_reads_1.fastq.gz"), + data_file("short_reads_2.fastq.gz"), + "--coverage", + coverage, + ] + with pytest.raises(ArgumentError, match=rf"{coverage} is not an integer"): + args = cli.get_parser(exit_on_error=False).parse_args(arglist) + + +@pytest.mark.long +@pytest.mark.parametrize("coverage", [("150"), ("75"), ("10")]) +def test_custom_coverage_input(coverage, capsys, tmp_path): + wd = str(tmp_path) + arglist = [ + data_file("megahit.cfg"), + data_file("short_reads_1.fastq.gz"), + data_file("short_reads_2.fastq.gz"), + "--outdir", + wd, + "-c", + coverage, + ] + args = cli.get_parser().parse_args(arglist) + cli.main(args) + quast_report = Path(wd).resolve() / "analysis" / "quast" / "megahit" / "report.tsv" + df = pd.read_csv(quast_report, sep="\t") + assert df.iloc[12]["sample_contigs"] == 56 # num_contigs + assert df.iloc[13]["sample_contigs"] == 35168 # largest_contig + assert df.iloc[14]["sample_contigs"] == 199940 # total_len