Adding custom downsample flag (#21)

bioforensics · Sep 26, 2022 · a79e3a5 · a79e3a5
1 parent cee6f6d
commit a79e3a5
Show file tree

Hide file tree

Showing 7 changed files with 78 additions and 10 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Support for Unicycler (#14)
 - Support for assembly configuration file (#16)
 - Commands to run a short or long test suite (#20)
+- Custom downsampling flag (#21)
 
 
 ## [0.1] 2021-12-01

diff --git a/yeat/Snakefile b/yeat/Snakefile
@@ -9,6 +9,7 @@
 
 from shutil import copyfile
 import pandas as pd
+from pathlib import Path
 
 
 rule all:
@@ -79,15 +80,27 @@ rule downsample:
         read1="seq/fastp/{sample}.R1.fq.gz",
         read2="seq/fastp/{sample}.R2.fq.gz",
         mash_report="seq/mash/{sample}.report.tsv"
+    params:
+        downsample=config["downsample"]
     run:
+        if params.downsample == -1:
+            p = Path("seq/downsample")
+            p.mkdir(parents=True, exist_ok=True)
+            copyfile(input.read1, output.sub1)
+            copyfile(input.read2, output.sub2)
+            return
         result = input.mash_report
         df = pd.read_csv(result, sep="\t")
         genome_size = df["Length"].iloc[0]
-        coverage = 150
-        read_length = 250
-        down = (genome_size * 150) // (2 * read_length)
-        print(f"genome size: {genome_size}")
-        print(f"down: {down}")
+        print(f"genome size: {genome_size}")    
+        if params.downsample == 0:
+            coverage = 150
+            read_length = 250
+            down = (genome_size * coverage) // (2 * read_length)
+            print(f"auto down: {down}")
+        else:
+            down = params.downsample
+            print(f"custom down: {down}")
         shell("seqtk sample {input.read1} {down} > seq/downsample/{wildcards.sample}.R1.fq")
         shell("seqtk sample {input.read2} {down} > seq/downsample/{wildcards.sample}.R2.fq")
         shell("gzip -f seq/downsample/*")

diff --git a/yeat/cli.py b/yeat/cli.py
@@ -37,7 +37,16 @@ def __call__(self, parser, namespace, values, option_string=None):
         raise SystemExit()
 
 
-def run(fastq1, fastq2, assembly_configs, outdir=".", cores=1, sample="sample", dryrun="dry"):
+def run(
+    fastq1,
+    fastq2,
+    assembly_configs,
+    outdir=".",
+    cores=1,
+    sample="sample",
+    dryrun="dry",
+    downsample=0,
+):
     snakefile = resource_filename("yeat", "Snakefile")
     r1 = Path(fastq1).resolve()
     if not r1.is_file():
@@ -56,6 +65,7 @@ def run(fastq1, fastq2, assembly_configs, outdir=".", cores=1, sample="sample",
         cores=cores,
         sample=sample,
         dryrun=dryrun,
+        downsample=downsample,
     )
     success = snakemake(
         snakefile,
@@ -101,6 +111,14 @@ def get_parser():
         action="store_true",
         help="construct workflow DAG and print a summary but do not execute",
     )
+    parser.add_argument(
+        "-d",
+        "--downsample",
+        type=int,
+        metavar="D",
+        default=0,
+        help="downsample reads down to the desired number; by default, D=0; when set to default, YEAT will auto downsample; set D=-1 to not downsample",
+    )
     parser.add_argument(
         "--init",
         action=InitAction,
@@ -123,4 +141,5 @@ def main(args=None):
         cores=args.threads,
         sample=args.sample,
         dryrun=args.dry_run,
+        downsample=args.downsample,
     )
diff --git a/yeat/config.py b/yeat/config.py
@@ -12,6 +12,7 @@
 
 
 ALGORITHMS = ("spades", "megahit", "unicycler")
+KEYS = ["algorithm", "extra_args"]
 
 
 class AssemblyConfigurationError(ValueError):
@@ -43,8 +44,8 @@ def parse_json(instream):
 
     @staticmethod
     def validate(config):
-        missingkeys = set(["algorithm", "extra_args"]) - set(config.keys())
-        extrakeys = set(config.keys()) - set(["algorithm", "extra_args"])
+        missingkeys = set(KEYS) - set(config.keys())
+        extrakeys = set(config.keys()) - set(KEYS)
         if len(missingkeys) > 0:
             keystr = ",".join(sorted(missingkeys))
             message = f"Missing assembly configuration setting(s) '{keystr}'"

diff --git a/yeat/tests/data/config.cfg b/yeat/tests/data/config.cfg
@@ -1,10 +1,10 @@
 [
     {
         "algorithm": "spades",
-        "extra_args": ""
+        "extra_args": "--meta"
     },
     {
         "algorithm": "megahit",
-        "extra_args": ""
+        "extra_args": "--min-count 5 --min-contig-len 300"
     }
 ]
diff --git a/yeat/tests/data/megahit.cfg b/yeat/tests/data/megahit.cfg
@@ -0,0 +1,6 @@
+[
+    {
+        "algorithm": "megahit",
+        "extra_args": ""
+    }
+]
diff --git a/yeat/tests/test_cli.py b/yeat/tests/test_cli.py
@@ -8,6 +8,7 @@
 # -------------------------------------------------------------------------------------------------
 
 import json
+import pandas as pd
 from pathlib import Path
 import pytest
 from yeat import cli
@@ -92,3 +93,30 @@ def test_unicycler(capsys, tmp_path):
     cli.main(args)
     assembly_result = Path(wd).resolve() / "analysis" / "unicycler" / "assembly.fasta"
     assert assembly_result.exists()
+
+
+@pytest.mark.long
+@pytest.mark.parametrize(
+    "downsample,num_contigs,largest_contig,total_len",
+    [("2000", 71, 5120, 69189), ("-1", 56, 35168, 199940)],
+)
+def test_custom_downsample_input(
+    downsample, num_contigs, largest_contig, total_len, capsys, tmp_path
+):
+    wd = str(tmp_path)
+    arglist = [
+        data_file("megahit.cfg"),
+        data_file("short_reads_1.fastq.gz"),
+        data_file("short_reads_2.fastq.gz"),
+        "--outdir",
+        wd,
+        "-d",
+        downsample,
+    ]
+    args = cli.get_parser().parse_args(arglist)
+    cli.main(args)
+    quast_report = Path(wd).resolve() / "analysis" / "quast" / "megahit" / "report.tsv"
+    df = pd.read_csv(quast_report, sep="\t")
+    assert df.iloc[12]["sample_contigs"] == num_contigs
+    assert df.iloc[13]["sample_contigs"] == largest_contig
+    assert df.iloc[14]["sample_contigs"] == total_len