From 484189c9c6b9f411741c76481a269bf51d0b5c15 Mon Sep 17 00:00:00 2001
From: Dane <danejo3@outlook.com>
Date: Wed, 2 Nov 2022 13:30:30 -0400
Subject: [PATCH 01/10] first commit

---
 README.md              | 16 +++++++++++-----
 environment.yml        |  3 ++-
 setup.py               |  2 +-
 yeat/Snakefile         |  5 ++++-
 yeat/cli.py            | 10 ++++++++++
 yeat/tests/test_cli.py |  4 +++-
 6 files changed, 31 insertions(+), 9 deletions(-)
diff --git a/README.md b/README.md
index 3be1748..f2187b0 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,17 @@
 # YEAT
 
-YEAT, **Y**our **E**verday **A**ssembly **T**ool, is an update to [`asm_tools`](https://github.com/bioforensics/asm_tools). It uses a Snakemake workflow to preprocess, downsample, and assemble paired-end fastq files with SPAdes.
+YEAT, **Y**our **E**veryday **A**ssembly **T**ool, is an update to [`asm_tools`](https://github.com/bioforensics/asm_tools). It uses a Snakemake workflow to preprocess, downsample, and assemble paired-end fastq files with various assemblers such as SPAdes, MEGAHIT, and Unicycler.
 
-<p align="center">
-  <img width="220" alt="Screen Shot 2022-02-02 at 10 57 31 AM" src="https://user-images.githubusercontent.com/33472323/152189781-2bfdc62b-f554-42d5-8f78-f94ab2b133eb.png">
-</p>
+## Installation
+
+```
+git clone https://github.com/bioforensics/yeat.git
+cd yeat
+conda env create --name yeat --file environment.yml
+conda activate yeat
+pip install .
+```
 
 ## Usage:
 
-```$ yeat {read1} {read2} --outdir {path} --sample {name}```
+```$ yeat {config} {read1} {read2} --outdir {path} --sample {name}```
diff --git a/environment.yml b/environment.yml
index 95a854e..6631801 100644
--- a/environment.yml
+++ b/environment.yml
@@ -4,13 +4,14 @@ channels:
     - bioconda
     - defaults
 dependencies:
-    - black=21.10b0
+    - black=22.10
     - fastp>=0.23
     - fastqc>=0.11
     - gzip>=1.7
     - mash>=2.3
     - megahit>=1.2
     - pytest-cov>=3.0
+    - python>=3.9
     - quast>=5.0
     - seqtk>=1.3
     - snakemake>=6.10
diff --git a/setup.py b/setup.py
index 47608a5..9328d39 100644
--- a/setup.py
+++ b/setup.py
@@ -45,7 +45,7 @@
     ],
     zip_safe=True,
     keywords='genome assembly',
-    python_requires='>=3.7',
+    python_requires='>=3.9',
     project_urls={
         'Bug Reports': 'https://github.com/bioforensics/yeat/issues',
         'Source': 'https://github.com/bioforensics/yeat',
diff --git a/yeat/Snakefile b/yeat/Snakefile
index fc97ed8..ee3cf28 100644
--- a/yeat/Snakefile
+++ b/yeat/Snakefile
@@ -11,6 +11,7 @@ import json
 from shutil import copyfile
 import pandas as pd
 from pathlib import Path
+import random
 from random import randint
 
 
@@ -85,7 +86,8 @@ rule downsample:
     params:
         coverage=config["coverage"],
         downsample=config["downsample"],
-        fastp_report="seq/fastp/fastp.json"
+        fastp_report="seq/fastp/fastp.json",
+        seed=config["seed"]
     run:
         if params.downsample == -1:
             p = Path("seq/downsample")
@@ -105,6 +107,7 @@ rule downsample:
             down = int((genome_size * params.coverage) / (2 * avg_read_length))
         else:
             down = params.downsample
+        random.seed(params.seed) # for unit tests
         seed = randint(1, 2**16-1)
         print(f"[yeat] average read length: {avg_read_length}")
         print(f"[yeat] target depth of coverage: {params.coverage}x")
diff --git a/yeat/cli.py b/yeat/cli.py
index a0b6ed9..a046c86 100644
--- a/yeat/cli.py
+++ b/yeat/cli.py
@@ -46,6 +46,7 @@ def run(
     dryrun="dry",
     downsample=0,
     coverage=150,
+    seed=None,
 ):
     snakefile = resource_filename("yeat", "Snakefile")
     r1 = Path(fastq1).resolve()
@@ -67,6 +68,7 @@ def run(
         dryrun=dryrun,
         downsample=downsample,
         coverage=coverage,
+        seed=seed,
     )
     success = snakemake(
         snakefile,
@@ -138,6 +140,13 @@ def get_parser(exit_on_error=True):
         default=150,
         help="target an average depth of coverage Cx when auto-downsampling; by default, C=150",
     )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        metavar="S",
+        default=None,
+        help="random seed; by default, S=None",
+    )
     parser.add_argument(
         "--init",
         action=InitAction,
@@ -162,4 +171,5 @@ def main(args=None):
         dryrun=args.dry_run,
         downsample=args.downsample,
         coverage=args.coverage,
+        seed=args.seed,
     )
diff --git a/yeat/tests/test_cli.py b/yeat/tests/test_cli.py
index bc9c873..447d627 100644
--- a/yeat/tests/test_cli.py
+++ b/yeat/tests/test_cli.py
@@ -94,7 +94,7 @@ def test_unicycler(capsys, tmp_path):
 @pytest.mark.long
 @pytest.mark.parametrize(
     "downsample,num_contigs,largest_contig,total_len",
-    [("2000", 71, 5120, 69189), ("-1", 56, 35168, 199940)],
+    [("2000", 69, 5103, 70075), ("-1", 56, 35168, 199940)],
 )
 def test_custom_downsample_input(
     downsample, num_contigs, largest_contig, total_len, capsys, tmp_path
@@ -108,6 +108,8 @@ def test_custom_downsample_input(
         wd,
         "-d",
         downsample,
+        "--seed",
+        0,
     ]
     args = cli.get_parser().parse_args(arglist)
     cli.main(args)

From 1c85bd4c6767aaf13eef121adc44b47426d6b296 Mon Sep 17 00:00:00 2001
From: Dane <danejo3@outlook.com>
Date: Wed, 2 Nov 2022 15:09:53 -0400
Subject: [PATCH 02/10] small changes

---
 yeat/Snakefile         | 6 ++++--
 yeat/cli.py            | 2 +-
 yeat/tests/test_cli.py | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/yeat/Snakefile b/yeat/Snakefile
index ee3cf28..3e15ba0 100644
--- a/yeat/Snakefile
+++ b/yeat/Snakefile
@@ -107,8 +107,10 @@ rule downsample:
             down = int((genome_size * params.coverage) / (2 * avg_read_length))
         else:
             down = params.downsample
-        random.seed(params.seed) # for unit tests
-        seed = randint(1, 2**16-1)
+        if params.seed == None:
+            seed = randint(1, 2**16-1)
+        else:
+            seed = params.seed
         print(f"[yeat] average read length: {avg_read_length}")
         print(f"[yeat] target depth of coverage: {params.coverage}x")
         print(f"[yeat] number of reads to sample: {down}")
diff --git a/yeat/cli.py b/yeat/cli.py
index a046c86..a4e6f50 100644
--- a/yeat/cli.py
+++ b/yeat/cli.py
@@ -145,7 +145,7 @@ def get_parser(exit_on_error=True):
         type=int,
         metavar="S",
         default=None,
-        help="random seed; by default, S=None",
+        help="override the randomly chosen seed S when downsampling; by default, S=None",
     )
     parser.add_argument(
         "--init",
diff --git a/yeat/tests/test_cli.py b/yeat/tests/test_cli.py
index 447d627..706a6ba 100644
--- a/yeat/tests/test_cli.py
+++ b/yeat/tests/test_cli.py
@@ -94,7 +94,7 @@ def test_unicycler(capsys, tmp_path):
 @pytest.mark.long
 @pytest.mark.parametrize(
     "downsample,num_contigs,largest_contig,total_len",
-    [("2000", 69, 5103, 70075), ("-1", 56, 35168, 199940)],
+    [("2000", 79, 5294, 70818), ("-1", 56, 35168, 199940)],
 )
 def test_custom_downsample_input(
     downsample, num_contigs, largest_contig, total_len, capsys, tmp_path

From 0d2d842e1d0297c81c71d77662f030d2902a7720 Mon Sep 17 00:00:00 2001
From: Dane <danejo3@outlook.com>
Date: Thu, 3 Nov 2022 13:35:39 -0400
Subject: [PATCH 03/10] added tests

---
 yeat/tests/test_cli.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/yeat/tests/test_cli.py b/yeat/tests/test_cli.py
index 706a6ba..cde0927 100644
--- a/yeat/tests/test_cli.py
+++ b/yeat/tests/test_cli.py
@@ -12,6 +12,7 @@
 import pandas as pd
 from pathlib import Path
 import pytest
+from random import randint
 from yeat import cli
 from yeat.cli import InitAction
 from yeat.tests import data_file
@@ -109,7 +110,7 @@ def test_custom_downsample_input(
         "-d",
         downsample,
         "--seed",
-        0,
+        "0",
     ]
     args = cli.get_parser().parse_args(arglist)
     cli.main(args)
@@ -166,3 +167,25 @@ def test_custom_coverage_input(coverage, capsys, tmp_path):
     assert df.iloc[12]["sample_contigs"] == 56  # num_contigs
     assert df.iloc[13]["sample_contigs"] == 35168  # largest_contig
     assert df.iloc[14]["sample_contigs"] == 199940  # total_len
+
+
+@pytest.mark.long
+@pytest.mark.parametrize("execution_number", range(3))
+def test_random_downsample_seed(execution_number, capsys, tmp_path):
+    wd = str(tmp_path)
+    arglist = [
+        data_file("megahit.cfg"),
+        data_file("short_reads_1.fastq.gz"),
+        data_file("short_reads_2.fastq.gz"),
+        "--outdir",
+        wd,
+        "-d",
+        "2000",
+    ]
+    args = cli.get_parser().parse_args(arglist)
+    cli.main(args)
+    quast_report = Path(wd).resolve() / "analysis" / "quast" / "megahit" / "report.tsv"
+    df = pd.read_csv(quast_report, sep="\t")
+    assert 61 <= df.iloc[12]["sample_contigs"] <= 91  # 76 +-20% of avg num_contigs
+    assert 4183 <= df.iloc[13]["sample_contigs"] <= 6273  # 5228 +-20% of avg largest_contig
+    assert 59515 <= df.iloc[14]["sample_contigs"] <= 89271  # 74393 +-20% of avg total_len

From bffb1359992387b5ec9a0d3df706114a891c9528 Mon Sep 17 00:00:00 2001
From: Dane <danejo3@outlook.com>
Date: Thu, 3 Nov 2022 13:36:35 -0400
Subject: [PATCH 04/10] updated changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5afebb5..09ad385 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Commands to run a short or long test suite (#20)
 - Custom downsampling flag (#21)
 - Custom coverage flag (#23)
+- Custom seed flag (#29)
 
 
 ## [0.1] 2021-12-01

From 5c88ad802bd60d6c2ffffdb0c30185c1e5278733 Mon Sep 17 00:00:00 2001
From: Dane <danejo3@outlook.com>
Date: Thu, 3 Nov 2022 13:58:49 -0400
Subject: [PATCH 05/10] cleaning up code

---
 yeat/Snakefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/yeat/Snakefile b/yeat/Snakefile
index 3e15ba0..51c1d01 100644
--- a/yeat/Snakefile
+++ b/yeat/Snakefile
@@ -11,7 +11,6 @@ import json
 from shutil import copyfile
 import pandas as pd
 from pathlib import Path
-import random
 from random import randint
 
 

From e4a5e3172833880e8875980e181fc209d36797f6 Mon Sep 17 00:00:00 2001
From: Dane <danejo3@outlook.com>
Date: Fri, 4 Nov 2022 12:06:41 -0400
Subject: [PATCH 06/10] implemented suggestions

---
 CHANGELOG.md           |  1 +
 environment.yml        |  3 ++-
 yeat/cli.py            |  2 +-
 yeat/tests/test_cli.py | 18 ++++++++++++------
 4 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 09ad385..94542e6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Custom downsampling flag (#21)
 - Custom coverage flag (#23)
 - Custom seed flag (#29)
+- Pinned required Python version to 3.9 for `exit_on_error` parameter and 3.10's incompatibility with SPAdes <v3.15.4 (#23, #29)
 
 
 ## [0.1] 2021-12-01
diff --git a/environment.yml b/environment.yml
index 6631801..af10e93 100644
--- a/environment.yml
+++ b/environment.yml
@@ -5,13 +5,14 @@ channels:
     - defaults
 dependencies:
     - black=22.10
+    - click=8.0
     - fastp>=0.23
     - fastqc>=0.11
     - gzip>=1.7
     - mash>=2.3
     - megahit>=1.2
     - pytest-cov>=3.0
-    - python>=3.9
+    - python=3.9
     - quast>=5.0
     - seqtk>=1.3
     - snakemake>=6.10
diff --git a/yeat/cli.py b/yeat/cli.py
index a4e6f50..c16c801 100644
--- a/yeat/cli.py
+++ b/yeat/cli.py
@@ -145,7 +145,7 @@ def get_parser(exit_on_error=True):
         type=int,
         metavar="S",
         default=None,
-        help="override the randomly chosen seed S when downsampling; by default, S=None",
+        help="seed for the random number generator used for downsampling; by default the seed is chosen randomly",
     )
     parser.add_argument(
         "--init",
diff --git a/yeat/tests/test_cli.py b/yeat/tests/test_cli.py
index cde0927..cd1e123 100644
--- a/yeat/tests/test_cli.py
+++ b/yeat/tests/test_cli.py
@@ -164,9 +164,12 @@ def test_custom_coverage_input(coverage, capsys, tmp_path):
     cli.main(args)
     quast_report = Path(wd).resolve() / "analysis" / "quast" / "megahit" / "report.tsv"
     df = pd.read_csv(quast_report, sep="\t")
-    assert df.iloc[12]["sample_contigs"] == 56  # num_contigs
-    assert df.iloc[13]["sample_contigs"] == 35168  # largest_contig
-    assert df.iloc[14]["sample_contigs"] == 199940  # total_len
+    num_contigs = df.iloc[12]["sample_contigs"]
+    assert num_contigs == 56
+    largest_contig = df.iloc[13]["sample_contigs"]
+    assert largest_contig == 35168
+    total_len = df.iloc[14]["sample_contigs"]
+    assert total_len == 199940
 
 
 @pytest.mark.long
@@ -186,6 +189,9 @@ def test_random_downsample_seed(execution_number, capsys, tmp_path):
     cli.main(args)
     quast_report = Path(wd).resolve() / "analysis" / "quast" / "megahit" / "report.tsv"
     df = pd.read_csv(quast_report, sep="\t")
-    assert 61 <= df.iloc[12]["sample_contigs"] <= 91  # 76 +-20% of avg num_contigs
-    assert 4183 <= df.iloc[13]["sample_contigs"] <= 6273  # 5228 +-20% of avg largest_contig
-    assert 59515 <= df.iloc[14]["sample_contigs"] <= 89271  # 74393 +-20% of avg total_len
+    num_contigs = df.iloc[12]["sample_contigs"]
+    assert num_contigs == pytest.approx(76, abs=15)
+    largest_contig = df.iloc[13]["sample_contigs"]
+    assert largest_contig == pytest.approx(5228, abs=1045)
+    total_len = df.iloc[14]["sample_contigs"]
+    assert total_len == pytest.approx(74393, abs=14878)

From 4f1c23000879eda05393d53ab64d564147178e99 Mon Sep 17 00:00:00 2001
From: Dane <danejo3@outlook.com>
Date: Fri, 4 Nov 2022 12:20:14 -0400
Subject: [PATCH 07/10] updating env.yml

---
 environment.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index af10e93..5302335 100644
--- a/environment.yml
+++ b/environment.yml
@@ -5,7 +5,6 @@ channels:
     - defaults
 dependencies:
     - black=22.10
-    - click=8.0
     - fastp>=0.23
     - fastqc>=0.11
     - gzip>=1.7

From 85e676c97ebaed60406e19cd69ae4f58f99e3aa3 Mon Sep 17 00:00:00 2001
From: Dane <danejo3@outlook.com>
Date: Fri, 4 Nov 2022 12:33:03 -0400
Subject: [PATCH 08/10] updated changelog

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 94542e6..ef4c9df 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,7 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Custom downsampling flag (#21)
 - Custom coverage flag (#23)
 - Custom seed flag (#29)
-- Pinned required Python version to 3.9 for `exit_on_error` parameter and 3.10's incompatibility with SPAdes <v3.15.4 (#23, #29)
+- 3.9 Python version pin for `exit_on_error` parameter and 3.10's incompatibility with SPAdes <v3.15.4 (#23, #29)
 
 
 ## [0.1] 2021-12-01

From b0fb2e52ec58df9f89d6770af406417724c0cf0b Mon Sep 17 00:00:00 2001
From: Dane <danejo3@outlook.com>
Date: Fri, 4 Nov 2022 12:37:56 -0400
Subject: [PATCH 09/10] added comments

---
 yeat/tests/test_cli.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/yeat/tests/test_cli.py b/yeat/tests/test_cli.py
index cd1e123..09b81b7 100644
--- a/yeat/tests/test_cli.py
+++ b/yeat/tests/test_cli.py
@@ -190,8 +190,8 @@ def test_random_downsample_seed(execution_number, capsys, tmp_path):
     quast_report = Path(wd).resolve() / "analysis" / "quast" / "megahit" / "report.tsv"
     df = pd.read_csv(quast_report, sep="\t")
     num_contigs = df.iloc[12]["sample_contigs"]
-    assert num_contigs == pytest.approx(76, abs=15)
+    assert num_contigs == pytest.approx(76, abs=15)  # +/- 20%
     largest_contig = df.iloc[13]["sample_contigs"]
-    assert largest_contig == pytest.approx(5228, abs=1045)
+    assert largest_contig == pytest.approx(5228, abs=1045)  # +/- 20%
     total_len = df.iloc[14]["sample_contigs"]
-    assert total_len == pytest.approx(74393, abs=14878)
+    assert total_len == pytest.approx(74393, abs=14878)  # +/- 20%

From fb2d765da37a4e70c0490fdaebd1d7c1a3baec1e Mon Sep 17 00:00:00 2001
From: Dane <danejo3@outlook.com>
Date: Fri, 4 Nov 2022 12:39:33 -0400
Subject: [PATCH 10/10] added comments

---
 yeat/tests/test_cli.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/yeat/tests/test_cli.py b/yeat/tests/test_cli.py
index 09b81b7..2cf9ef1 100644
--- a/yeat/tests/test_cli.py
+++ b/yeat/tests/test_cli.py
@@ -190,8 +190,8 @@ def test_random_downsample_seed(execution_number, capsys, tmp_path):
     quast_report = Path(wd).resolve() / "analysis" / "quast" / "megahit" / "report.tsv"
     df = pd.read_csv(quast_report, sep="\t")
     num_contigs = df.iloc[12]["sample_contigs"]
-    assert num_contigs == pytest.approx(76, abs=15)  # +/- 20%
+    assert num_contigs == pytest.approx(76, abs=15)  # 76 +/- 20%
     largest_contig = df.iloc[13]["sample_contigs"]
-    assert largest_contig == pytest.approx(5228, abs=1045)  # +/- 20%
+    assert largest_contig == pytest.approx(5228, abs=1045)  # 5228 +/- 20%
     total_len = df.iloc[14]["sample_contigs"]
-    assert total_len == pytest.approx(74393, abs=14878)  # +/- 20%
+    assert total_len == pytest.approx(74393, abs=14878)  # 74393 +/- 20%