Adding --seed flag to customize the seed when downsampling (#29)

bioforensics · Nov 4, 2022 · b2cd761 · b2cd761
1 parent 9216de9
commit b2cd761
Show file tree

Hide file tree

Showing 7 changed files with 67 additions and 13 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Commands to run a short or long test suite (#20)
 - Custom downsampling flag (#21)
 - Custom coverage flag (#23)
+- Custom seed flag (#29)
+- Python version pinned to 3.9: the `exit_on_error` parameter requires Python 3.9 or later, and Python 3.10 is incompatible with SPAdes <v3.15.4 (#23, #29)
 
 
 ## [0.1] 2021-12-01

diff --git a/README.md b/README.md
@@ -1,11 +1,17 @@
 # YEAT
 
-YEAT, **Y**our **E**verday **A**ssembly **T**ool, is an update to [`asm_tools`](https://github.com/bioforensics/asm_tools). It uses a Snakemake workflow to preprocess, downsample, and assemble paired-end fastq files with SPAdes.
+YEAT, **Y**our **E**veryday **A**ssembly **T**ool, is an update to [`asm_tools`](https://github.com/bioforensics/asm_tools). It uses a Snakemake workflow to preprocess, downsample, and assemble paired-end fastq files with various assemblers such as SPAdes, MEGAHIT, and Unicycler.
 
-<p align="center">
-  <img width="220" alt="Screen Shot 2022-02-02 at 10 57 31 AM" src="https://user-images.githubusercontent.com/33472323/152189781-2bfdc62b-f554-42d5-8f78-f94ab2b133eb.png">
-</p>
+## Installation
+
+```
+git clone https://github.com/bioforensics/yeat.git
+cd yeat
+conda env create --name yeat --file environment.yml
+conda activate yeat
+pip install .
+```
 
 ## Usage:
 
-```$ yeat {read1} {read2} --outdir {path} --sample {name}```
+```$ yeat {config} {read1} {read2} --outdir {path} --sample {name}```
diff --git a/environment.yml b/environment.yml
@@ -4,13 +4,14 @@ channels:
     - bioconda
     - defaults
 dependencies:
-    - black=21.10b0
+    - black=22.10
     - fastp>=0.23
     - fastqc>=0.11
     - gzip>=1.7
     - mash>=2.3
     - megahit>=1.2
     - pytest-cov>=3.0
+    - python=3.9
     - quast>=5.0
     - seqtk>=1.3
     - snakemake>=6.10

diff --git a/setup.py b/setup.py
@@ -45,7 +45,7 @@
     ],
     zip_safe=True,
     keywords='genome assembly',
-    python_requires='>=3.7',
+    python_requires='>=3.9',
     project_urls={
         'Bug Reports': 'https://github.com/bioforensics/yeat/issues',
         'Source': 'https://github.com/bioforensics/yeat',

diff --git a/yeat/Snakefile b/yeat/Snakefile
@@ -85,7 +85,8 @@ rule downsample:
     params:
         coverage=config["coverage"],
         downsample=config["downsample"],
-        fastp_report="seq/fastp/fastp.json"
+        fastp_report="seq/fastp/fastp.json",
+        seed=config["seed"]
     run:
         if params.downsample == -1:
             p = Path("seq/downsample")
@@ -105,7 +106,10 @@ rule downsample:
             down = int((genome_size * params.coverage) / (2 * avg_read_length))
         else:
             down = params.downsample
-        seed = randint(1, 2**16-1)
+        if params.seed == None:
+            seed = randint(1, 2**16-1)
+        else:
+            seed = params.seed
         print(f"[yeat] average read length: {avg_read_length}")
         print(f"[yeat] target depth of coverage: {params.coverage}x")
         print(f"[yeat] number of reads to sample: {down}")

diff --git a/yeat/cli.py b/yeat/cli.py
@@ -46,6 +46,7 @@ def run(
     dryrun="dry",
     downsample=0,
     coverage=150,
+    seed=None,
 ):
     snakefile = resource_filename("yeat", "Snakefile")
     r1 = Path(fastq1).resolve()
@@ -67,6 +68,7 @@ def run(
         dryrun=dryrun,
         downsample=downsample,
         coverage=coverage,
+        seed=seed,
     )
     success = snakemake(
         snakefile,
@@ -138,6 +140,13 @@ def get_parser(exit_on_error=True):
         default=150,
         help="target an average depth of coverage Cx when auto-downsampling; by default, C=150",
     )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        metavar="S",
+        default=None,
+        help="seed for the random number generator used for downsampling; by default the seed is chosen randomly",
+    )
     parser.add_argument(
         "--init",
         action=InitAction,
@@ -162,4 +171,5 @@ def main(args=None):
         dryrun=args.dry_run,
         downsample=args.downsample,
         coverage=args.coverage,
+        seed=args.seed,
     )
diff --git a/yeat/tests/test_cli.py b/yeat/tests/test_cli.py
@@ -12,6 +12,7 @@
 import pandas as pd
 from pathlib import Path
 import pytest
+from random import randint
 from yeat import cli
 from yeat.cli import InitAction
 from yeat.tests import data_file
@@ -94,7 +95,7 @@ def test_unicycler(capsys, tmp_path):
 @pytest.mark.long
 @pytest.mark.parametrize(
     "downsample,num_contigs,largest_contig,total_len",
-    [("2000", 71, 5120, 69189), ("-1", 56, 35168, 199940)],
+    [("2000", 79, 5294, 70818), ("-1", 56, 35168, 199940)],
 )
 def test_custom_downsample_input(
     downsample, num_contigs, largest_contig, total_len, capsys, tmp_path
@@ -108,6 +109,8 @@ def test_custom_downsample_input(
         wd,
         "-d",
         downsample,
+        "--seed",
+        "0",
     ]
     args = cli.get_parser().parse_args(arglist)
     cli.main(args)
@@ -161,6 +164,34 @@ def test_custom_coverage_input(coverage, capsys, tmp_path):
     cli.main(args)
     quast_report = Path(wd).resolve() / "analysis" / "quast" / "megahit" / "report.tsv"
     df = pd.read_csv(quast_report, sep="\t")
-    assert df.iloc[12]["sample_contigs"] == 56  # num_contigs
-    assert df.iloc[13]["sample_contigs"] == 35168  # largest_contig
-    assert df.iloc[14]["sample_contigs"] == 199940  # total_len
+    num_contigs = df.iloc[12]["sample_contigs"]
+    assert num_contigs == 56
+    largest_contig = df.iloc[13]["sample_contigs"]
+    assert largest_contig == 35168
+    total_len = df.iloc[14]["sample_contigs"]
+    assert total_len == 199940
+
+
+@pytest.mark.long
+@pytest.mark.parametrize("execution_number", range(3))
+def test_random_downsample_seed(execution_number, capsys, tmp_path):
+    wd = str(tmp_path)
+    arglist = [
+        data_file("megahit.cfg"),
+        data_file("short_reads_1.fastq.gz"),
+        data_file("short_reads_2.fastq.gz"),
+        "--outdir",
+        wd,
+        "-d",
+        "2000",
+    ]
+    args = cli.get_parser().parse_args(arglist)
+    cli.main(args)
+    quast_report = Path(wd).resolve() / "analysis" / "quast" / "megahit" / "report.tsv"
+    df = pd.read_csv(quast_report, sep="\t")
+    num_contigs = df.iloc[12]["sample_contigs"]
+    assert num_contigs == pytest.approx(76, abs=15)  # 76 +/- 20%
+    largest_contig = df.iloc[13]["sample_contigs"]
+    assert largest_contig == pytest.approx(5228, abs=1045)  # 5228 +/- 20%
+    total_len = df.iloc[14]["sample_contigs"]
+    assert total_len == pytest.approx(74393, abs=14878)  # 74393 +/- 20%