Skip to content

Commit

Permalink
Adding custom downsample flag (#21)
Browse files Browse the repository at this point in the history
  • Loading branch information
danejo3 authored Sep 26, 2022
1 parent cee6f6d commit a79e3a5
Show file tree
Hide file tree
Showing 7 changed files with 78 additions and 10 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Support for Unicycler (#14)
- Support for assembly configuration file (#16)
- Commands to run a short or long test suite (#20)
- Custom downsampling flag (#21)


## [0.1] 2021-12-01
Expand Down
23 changes: 18 additions & 5 deletions yeat/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from shutil import copyfile
import pandas as pd
from pathlib import Path


rule all:
Expand Down Expand Up @@ -79,15 +80,27 @@ rule downsample:
read1="seq/fastp/{sample}.R1.fq.gz",
read2="seq/fastp/{sample}.R2.fq.gz",
mash_report="seq/mash/{sample}.report.tsv"
params:
downsample=config["downsample"]
run:
if params.downsample == -1:
p = Path("seq/downsample")
p.mkdir(parents=True, exist_ok=True)
copyfile(input.read1, output.sub1)
copyfile(input.read2, output.sub2)
return
result = input.mash_report
df = pd.read_csv(result, sep="\t")
genome_size = df["Length"].iloc[0]
coverage = 150
read_length = 250
down = (genome_size * 150) // (2 * read_length)
print(f"genome size: {genome_size}")
print(f"down: {down}")
print(f"genome size: {genome_size}")
if params.downsample == 0:
coverage = 150
read_length = 250
down = (genome_size * coverage) // (2 * read_length)
print(f"auto down: {down}")
else:
down = params.downsample
print(f"custom down: {down}")
shell("seqtk sample {input.read1} {down} > seq/downsample/{wildcards.sample}.R1.fq")
shell("seqtk sample {input.read2} {down} > seq/downsample/{wildcards.sample}.R2.fq")
shell("gzip -f seq/downsample/*")
Expand Down
21 changes: 20 additions & 1 deletion yeat/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,16 @@ def __call__(self, parser, namespace, values, option_string=None):
raise SystemExit()


def run(fastq1, fastq2, assembly_configs, outdir=".", cores=1, sample="sample", dryrun="dry"):
def run(
fastq1,
fastq2,
assembly_configs,
outdir=".",
cores=1,
sample="sample",
dryrun="dry",
downsample=0,
):
snakefile = resource_filename("yeat", "Snakefile")
r1 = Path(fastq1).resolve()
if not r1.is_file():
Expand All @@ -56,6 +65,7 @@ def run(fastq1, fastq2, assembly_configs, outdir=".", cores=1, sample="sample",
cores=cores,
sample=sample,
dryrun=dryrun,
downsample=downsample,
)
success = snakemake(
snakefile,
Expand Down Expand Up @@ -101,6 +111,14 @@ def get_parser():
action="store_true",
help="construct workflow DAG and print a summary but do not execute",
)
parser.add_argument(
"-d",
"--downsample",
type=int,
metavar="D",
default=0,
help="downsample reads down to the desired number; by default, D=0; when set to default, YEAT will auto downsample; set D=-1 to not downsample",
)
parser.add_argument(
"--init",
action=InitAction,
Expand All @@ -123,4 +141,5 @@ def main(args=None):
cores=args.threads,
sample=args.sample,
dryrun=args.dry_run,
downsample=args.downsample,
)
5 changes: 3 additions & 2 deletions yeat/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@


ALGORITHMS = ("spades", "megahit", "unicycler")
KEYS = ["algorithm", "extra_args"]


class AssemblyConfigurationError(ValueError):
Expand Down Expand Up @@ -43,8 +44,8 @@ def parse_json(instream):

@staticmethod
def validate(config):
missingkeys = set(["algorithm", "extra_args"]) - set(config.keys())
extrakeys = set(config.keys()) - set(["algorithm", "extra_args"])
missingkeys = set(KEYS) - set(config.keys())
extrakeys = set(config.keys()) - set(KEYS)
if len(missingkeys) > 0:
keystr = ",".join(sorted(missingkeys))
message = f"Missing assembly configuration setting(s) '{keystr}'"
Expand Down
4 changes: 2 additions & 2 deletions yeat/tests/data/config.cfg
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
[
{
"algorithm": "spades",
"extra_args": ""
"extra_args": "--meta"
},
{
"algorithm": "megahit",
"extra_args": ""
"extra_args": "--min-count 5 --min-contig-len 300"
}
]
6 changes: 6 additions & 0 deletions yeat/tests/data/megahit.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[
{
"algorithm": "megahit",
"extra_args": ""
}
]
28 changes: 28 additions & 0 deletions yeat/tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# -------------------------------------------------------------------------------------------------

import json
import pandas as pd
from pathlib import Path
import pytest
from yeat import cli
Expand Down Expand Up @@ -92,3 +93,30 @@ def test_unicycler(capsys, tmp_path):
cli.main(args)
assembly_result = Path(wd).resolve() / "analysis" / "unicycler" / "assembly.fasta"
assert assembly_result.exists()


@pytest.mark.long
@pytest.mark.parametrize(
"downsample,num_contigs,largest_contig,total_len",
[("2000", 71, 5120, 69189), ("-1", 56, 35168, 199940)],
)
def test_custom_downsample_input(
downsample, num_contigs, largest_contig, total_len, capsys, tmp_path
):
wd = str(tmp_path)
arglist = [
data_file("megahit.cfg"),
data_file("short_reads_1.fastq.gz"),
data_file("short_reads_2.fastq.gz"),
"--outdir",
wd,
"-d",
downsample,
]
args = cli.get_parser().parse_args(arglist)
cli.main(args)
quast_report = Path(wd).resolve() / "analysis" / "quast" / "megahit" / "report.tsv"
df = pd.read_csv(quast_report, sep="\t")
assert df.iloc[12]["sample_contigs"] == num_contigs
assert df.iloc[13]["sample_contigs"] == largest_contig
assert df.iloc[14]["sample_contigs"] == total_len

0 comments on commit a79e3a5

Please sign in to comment.