Skip to content

Commit

Permalink
Improved auto downsampling with custom coverage and avg read length (#23
Browse files Browse the repository at this point in the history
)
  • Loading branch information
danejo3 authored Oct 27, 2022
1 parent a79e3a5 commit 9216de9
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 24 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Support for assembly configuration file (#16)
- Commands to run a short or long test suite (#20)
- Custom downsampling flag (#21)
- Custom coverage flag (#23)


## [0.1] 2021-12-01
Expand Down
31 changes: 20 additions & 11 deletions yeat/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
# Development Center.
# -------------------------------------------------------------------------------------------------

import json
from shutil import copyfile
import pandas as pd
from pathlib import Path
from random import randint


rule all:
Expand Down Expand Up @@ -81,28 +83,35 @@ rule downsample:
read2="seq/fastp/{sample}.R2.fq.gz",
mash_report="seq/mash/{sample}.report.tsv"
params:
downsample=config["downsample"]
coverage=config["coverage"],
downsample=config["downsample"],
fastp_report="seq/fastp/fastp.json"
run:
if params.downsample == -1:
p = Path("seq/downsample")
p.mkdir(parents=True, exist_ok=True)
copyfile(input.read1, output.sub1)
copyfile(input.read2, output.sub2)
return
result = input.mash_report
df = pd.read_csv(result, sep="\t")
df = pd.read_csv(input.mash_report, sep="\t")
genome_size = df["Length"].iloc[0]
print(f"genome size: {genome_size}")
print(f"genome size: {genome_size}")
with open(params.fastp_report, "r") as fh:
qcdata = json.load(fh)
base_count = qcdata["summary"]["after_filtering"]["total_bases"]
read_count = qcdata["summary"]["after_filtering"]["total_reads"]
avg_read_length = base_count / read_count
if params.downsample == 0:
coverage = 150
read_length = 250
down = (genome_size * coverage) // (2 * read_length)
print(f"auto down: {down}")
down = int((genome_size * params.coverage) / (2 * avg_read_length))
else:
down = params.downsample
print(f"custom down: {down}")
shell("seqtk sample {input.read1} {down} > seq/downsample/{wildcards.sample}.R1.fq")
shell("seqtk sample {input.read2} {down} > seq/downsample/{wildcards.sample}.R2.fq")
seed = randint(1, 2**16-1)
print(f"[yeat] average read length: {avg_read_length}")
print(f"[yeat] target depth of coverage: {params.coverage}x")
print(f"[yeat] number of reads to sample: {down}")
print(f"[yeat] random seed for sampling: {seed}")
shell("seqtk sample -s {seed} {input.read1} {down} > seq/downsample/{wildcards.sample}.R1.fq")
shell("seqtk sample -s {seed} {input.read2} {down} > seq/downsample/{wildcards.sample}.R2.fq")
shell("gzip -f seq/downsample/*")


Expand Down
36 changes: 28 additions & 8 deletions yeat/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@
# Development Center.
# -------------------------------------------------------------------------------------------------

import argparse
from argparse import Action
from argparse import Action, ArgumentParser, ArgumentTypeError
import json
from pathlib import Path
from pkg_resources import resource_filename
Expand Down Expand Up @@ -46,6 +45,7 @@ def run(
sample="sample",
dryrun="dry",
downsample=0,
coverage=150,
):
snakefile = resource_filename("yeat", "Snakefile")
r1 = Path(fastq1).resolve()
Expand All @@ -66,6 +66,7 @@ def run(
sample=sample,
dryrun=dryrun,
downsample=downsample,
coverage=coverage,
)
success = snakemake(
snakefile,
Expand All @@ -79,31 +80,41 @@ def run(
raise RuntimeError("Snakemake Failed")


def get_parser():
parser = argparse.ArgumentParser()
def check_positive(value):
try:
value = int(value)
if value <= 0:
raise ArgumentTypeError(f"{value} is not a positive integer")
except ValueError:
raise ArgumentTypeError(f"{value} is not an integer")
return value


def get_parser(exit_on_error=True):
parser = ArgumentParser(exit_on_error=exit_on_error)
parser.add_argument("-v", "--version", action="version", version=f"YEAT v{yeat.__version__}")
parser.add_argument(
"-o",
"--outdir",
type=str,
metavar="DIR",
default=".",
help="output directory; default is '.'",
help="output directory; default is current working directory",
)
parser.add_argument(
"-t",
"--threads",
type=int,
metavar="T",
default=1,
help="number of threads for Snakemake; default is 1",
help="execute workflow with T threads; by default T=1",
)
parser.add_argument(
"--sample",
type=str,
metavar="S",
default="sample",
help="sample name for Snakemake; default is 'sample'",
help="specify a unique sample name S for storing assembly results in the working directory; by default S=sample",
)
parser.add_argument(
"-n",
Expand All @@ -117,7 +128,15 @@ def get_parser():
type=int,
metavar="D",
default=0,
help="downsample reads down to the desired number; by default, D=0; when set to default, YEAT will auto downsample; set D=-1 to not downsample",
help="randomly sample D reads from the input rather than assembling the full set; set D=0 to perform auto-downsampling to a desired level of coverage (see --coverage); set D=-1 to disable downsampling; by default D=0",
)
parser.add_argument(
"-c",
"--coverage",
type=check_positive,
metavar="C",
default=150,
help="target an average depth of coverage Cx when auto-downsampling; by default, C=150",
)
parser.add_argument(
"--init",
Expand All @@ -142,4 +161,5 @@ def main(args=None):
sample=args.sample,
dryrun=args.dry_run,
downsample=args.downsample,
coverage=args.coverage,
)
54 changes: 49 additions & 5 deletions yeat/tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Development Center.
# -------------------------------------------------------------------------------------------------

from argparse import ArgumentError
import json
import pandas as pd
from pathlib import Path
Expand Down Expand Up @@ -40,11 +41,6 @@ def test_basic_dry_run(tmp_path):
cli.main(args)


def test_no_args():
with pytest.raises(SystemExit, match=r"2"):
cli.main(None)


def test_invalid_read1_file():
with pytest.raises(Exception, match=r"No such file:.*read1\'"):
read1 = "read1"
Expand Down Expand Up @@ -120,3 +116,51 @@ def test_custom_downsample_input(
assert df.iloc[12]["sample_contigs"] == num_contigs
assert df.iloc[13]["sample_contigs"] == largest_contig
assert df.iloc[14]["sample_contigs"] == total_len


@pytest.mark.parametrize("coverage", [("-1"), ("0")])
def test_invalid_custom_coverage_negative(coverage):
arglist = [
data_file("megahit.cfg"),
data_file("short_reads_1.fastq.gz"),
data_file("short_reads_2.fastq.gz"),
"--coverage",
coverage,
]
with pytest.raises(ArgumentError, match=rf"{coverage} is not a positive integer"):
args = cli.get_parser(exit_on_error=False).parse_args(arglist)


@pytest.mark.parametrize("coverage", [("string"), ("3.14")])
def test_invalid_custom_coverage_noninteger(coverage):
arglist = [
data_file("megahit.cfg"),
data_file("short_reads_1.fastq.gz"),
data_file("short_reads_2.fastq.gz"),
"--coverage",
coverage,
]
with pytest.raises(ArgumentError, match=rf"{coverage} is not an integer"):
args = cli.get_parser(exit_on_error=False).parse_args(arglist)


@pytest.mark.long
@pytest.mark.parametrize("coverage", [("150"), ("75"), ("10")])
def test_custom_coverage_input(coverage, capsys, tmp_path):
wd = str(tmp_path)
arglist = [
data_file("megahit.cfg"),
data_file("short_reads_1.fastq.gz"),
data_file("short_reads_2.fastq.gz"),
"--outdir",
wd,
"-c",
coverage,
]
args = cli.get_parser().parse_args(arglist)
cli.main(args)
quast_report = Path(wd).resolve() / "analysis" / "quast" / "megahit" / "report.tsv"
df = pd.read_csv(quast_report, sep="\t")
assert df.iloc[12]["sample_contigs"] == 56 # num_contigs
assert df.iloc[13]["sample_contigs"] == 35168 # largest_contig
assert df.iloc[14]["sample_contigs"] == 199940 # total_len

0 comments on commit 9216de9

Please sign in to comment.