From 484189c9c6b9f411741c76481a269bf51d0b5c15 Mon Sep 17 00:00:00 2001 From: Dane Date: Wed, 2 Nov 2022 13:30:30 -0400 Subject: [PATCH 01/10] first commit --- README.md | 16 +++++++++++----- environment.yml | 3 ++- setup.py | 2 +- yeat/Snakefile | 5 ++++- yeat/cli.py | 10 ++++++++++ yeat/tests/test_cli.py | 4 +++- 6 files changed, 31 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 3be1748..f2187b0 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,17 @@ # YEAT -YEAT, **Y**our **E**verday **A**ssembly **T**ool, is an update to [`asm_tools`](https://github.com/bioforensics/asm_tools). It uses a Snakemake workflow to preprocess, downsample, and assemble paired-end fastq files with SPAdes. +YEAT, **Y**our **E**verday **A**ssembly **T**ool, is an update to [`asm_tools`](https://github.com/bioforensics/asm_tools). It uses a Snakemake workflow to preprocess, downsample, and assemble paired-end fastq files with various assemblers such as SPAdes, MEGAHIT, and Unicycler. -

- Screen Shot 2022-02-02 at 10 57 31 AM -

+## Installation + +``` +git clone https://github.com/bioforensics/yeat.git +cd yeat +conda env create --name yeat --file environment.yml +conda activate yeat +pip install . +``` ## Usage: -```$ yeat {read1} {read2} --outdir {path} --sample {name}``` +```$ yeat {config} {read1} {read2} --outdir {path} --sample {name}``` diff --git a/environment.yml b/environment.yml index 95a854e..6631801 100644 --- a/environment.yml +++ b/environment.yml @@ -4,13 +4,14 @@ channels: - bioconda - defaults dependencies: - - black=21.10b0 + - black=22.10 - fastp>=0.23 - fastqc>=0.11 - gzip>=1.7 - mash>=2.3 - megahit>=1.2 - pytest-cov>=3.0 + - python>=3.9 - quast>=5.0 - seqtk>=1.3 - snakemake>=6.10 diff --git a/setup.py b/setup.py index 47608a5..9328d39 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ ], zip_safe=True, keywords='genome assembly', - python_requires='>=3.7', + python_requires='>=3.9', project_urls={ 'Bug Reports': 'https://github.com/bioforensics/yeat/issues', 'Source': 'https://github.com/bioforensics/yeat', diff --git a/yeat/Snakefile b/yeat/Snakefile index fc97ed8..ee3cf28 100644 --- a/yeat/Snakefile +++ b/yeat/Snakefile @@ -11,6 +11,7 @@ import json from shutil import copyfile import pandas as pd from pathlib import Path +import random from random import randint @@ -85,7 +86,8 @@ rule downsample: params: coverage=config["coverage"], downsample=config["downsample"], - fastp_report="seq/fastp/fastp.json" + fastp_report="seq/fastp/fastp.json", + seed=config["seed"] run: if params.downsample == -1: p = Path("seq/downsample") @@ -105,6 +107,7 @@ rule downsample: down = int((genome_size * params.coverage) / (2 * avg_read_length)) else: down = params.downsample + random.seed(params.seed) # for unit tests seed = randint(1, 2**16-1) print(f"[yeat] average read length: {avg_read_length}") print(f"[yeat] target depth of coverage: {params.coverage}x") diff --git a/yeat/cli.py b/yeat/cli.py index a0b6ed9..a046c86 100644 --- a/yeat/cli.py +++ b/yeat/cli.py @@ -46,6 +46,7 @@ def run( dryrun="dry", downsample=0, coverage=150, + seed=None, ): snakefile = resource_filename("yeat", "Snakefile") r1 = Path(fastq1).resolve() @@ -67,6 +68,7 @@ def run( dryrun=dryrun, downsample=downsample, coverage=coverage, + seed=seed, ) success = snakemake( snakefile, @@ -138,6 +140,13 @@ def get_parser(exit_on_error=True): default=150, help="target an average depth of coverage Cx when auto-downsampling; by default, C=150", ) + parser.add_argument( + "--seed", + type=int, + metavar="S", + default=None, + help="random seed; by default, S=None", + ) parser.add_argument( "--init", action=InitAction, @@ -162,4 +171,5 @@ def main(args=None): dryrun=args.dry_run, downsample=args.downsample, coverage=args.coverage, + seed=args.seed, ) diff --git a/yeat/tests/test_cli.py b/yeat/tests/test_cli.py index bc9c873..447d627 100644 --- a/yeat/tests/test_cli.py +++ b/yeat/tests/test_cli.py @@ -94,7 +94,7 @@ def test_unicycler(capsys, tmp_path): @pytest.mark.long @pytest.mark.parametrize( "downsample,num_contigs,largest_contig,total_len", - [("2000", 71, 5120, 69189), ("-1", 56, 35168, 199940)], + [("2000", 69, 5103, 70075), ("-1", 56, 35168, 199940)], ) def test_custom_downsample_input( downsample, num_contigs, largest_contig, total_len, capsys, tmp_path @@ -108,6 +108,8 @@ def test_custom_downsample_input( wd, "-d", downsample, + "--seed", + 0, ] args = cli.get_parser().parse_args(arglist) cli.main(args) From 1c85bd4c6767aaf13eef121adc44b47426d6b296 Mon Sep 17 00:00:00 2001 From: Dane Date: Wed, 2 Nov 2022 15:09:53 -0400 Subject: [PATCH 02/10] small changes --- yeat/Snakefile | 6 ++++-- yeat/cli.py | 2 +- yeat/tests/test_cli.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/yeat/Snakefile b/yeat/Snakefile index ee3cf28..3e15ba0 100644 --- a/yeat/Snakefile +++ b/yeat/Snakefile @@ -107,8 +107,10 @@ rule downsample: down = int((genome_size * params.coverage) / (2 * avg_read_length)) else: down = params.downsample - random.seed(params.seed) # for unit tests - seed = randint(1, 2**16-1) + if params.seed == None: + seed = randint(1, 2**16-1) + else: + seed = params.seed print(f"[yeat] average read length: {avg_read_length}") print(f"[yeat] target depth of coverage: {params.coverage}x") print(f"[yeat] number of reads to sample: {down}") diff --git a/yeat/cli.py b/yeat/cli.py index a046c86..a4e6f50 100644 --- a/yeat/cli.py +++ b/yeat/cli.py @@ -145,7 +145,7 @@ def get_parser(exit_on_error=True): type=int, metavar="S", default=None, - help="random seed; by default, S=None", + help="override the randomly chosen seed S when downsampling; by default, S=None", ) parser.add_argument( "--init", diff --git a/yeat/tests/test_cli.py b/yeat/tests/test_cli.py index 447d627..706a6ba 100644 --- a/yeat/tests/test_cli.py +++ b/yeat/tests/test_cli.py @@ -94,7 +94,7 @@ def test_unicycler(capsys, tmp_path): @pytest.mark.long @pytest.mark.parametrize( "downsample,num_contigs,largest_contig,total_len", - [("2000", 69, 5103, 70075), ("-1", 56, 35168, 199940)], + [("2000", 79, 5294, 70818), ("-1", 56, 35168, 199940)], ) def test_custom_downsample_input( downsample, num_contigs, largest_contig, total_len, capsys, tmp_path From 0d2d842e1d0297c81c71d77662f030d2902a7720 Mon Sep 17 00:00:00 2001 From: Dane Date: Thu, 3 Nov 2022 13:35:39 -0400 Subject: [PATCH 03/10] added tests --- yeat/tests/test_cli.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/yeat/tests/test_cli.py b/yeat/tests/test_cli.py index 706a6ba..cde0927 100644 --- a/yeat/tests/test_cli.py +++ b/yeat/tests/test_cli.py @@ -12,6 +12,7 @@ import pandas as pd from pathlib import Path import pytest +from random import randint from yeat import cli from yeat.cli import InitAction from yeat.tests import data_file @@ -109,7 +110,7 @@ def test_custom_downsample_input( "-d", downsample, "--seed", - 0, + "0", ] args = cli.get_parser().parse_args(arglist) cli.main(args) @@ -166,3 +167,25 @@ def test_custom_coverage_input(coverage, capsys, tmp_path): assert df.iloc[12]["sample_contigs"] == 56 # num_contigs assert df.iloc[13]["sample_contigs"] == 35168 # largest_contig assert df.iloc[14]["sample_contigs"] == 199940 # total_len + + +@pytest.mark.long +@pytest.mark.parametrize("execution_number", range(3)) +def test_random_downsample_seed(execution_number, capsys, tmp_path): + wd = str(tmp_path) + arglist = [ + data_file("megahit.cfg"), + data_file("short_reads_1.fastq.gz"), + data_file("short_reads_2.fastq.gz"), + "--outdir", + wd, + "-d", + "2000", + ] + args = cli.get_parser().parse_args(arglist) + cli.main(args) + quast_report = Path(wd).resolve() / "analysis" / "quast" / "megahit" / "report.tsv" + df = pd.read_csv(quast_report, sep="\t") + assert 61 <= df.iloc[12]["sample_contigs"] <= 91 # 76 +-20% of avg num_contigs + assert 4183 <= df.iloc[13]["sample_contigs"] <= 6273 # 5228 +-20% of avg largest_contig + assert 59515 <= df.iloc[14]["sample_contigs"] <= 89271 # 74393 +-20% of avg total_len From bffb1359992387b5ec9a0d3df706114a891c9528 Mon Sep 17 00:00:00 2001 From: Dane Date: Thu, 3 Nov 2022 13:36:35 -0400 Subject: [PATCH 04/10] updated changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5afebb5..09ad385 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Commands to run a short or long test suite (#20) - Custom downsampling flag (#21) - Custom coverage flag (#23) +- Custom seed flag (#29) ## [0.1] 2021-12-01 From 5c88ad802bd60d6c2ffffdb0c30185c1e5278733 Mon Sep 17 00:00:00 2001 From: Dane Date: Thu, 3 Nov 2022 13:58:49 -0400 Subject: [PATCH 05/10] cleaning up code --- yeat/Snakefile | 1 - 1 file changed, 1 deletion(-) diff --git a/yeat/Snakefile b/yeat/Snakefile index 3e15ba0..51c1d01 100644 --- a/yeat/Snakefile +++ b/yeat/Snakefile @@ -11,7 +11,6 @@ import json from shutil import copyfile import pandas as pd from pathlib import Path -import random from random import randint From e4a5e3172833880e8875980e181fc209d36797f6 Mon Sep 17 00:00:00 2001 From: Dane Date: Fri, 4 Nov 2022 12:06:41 -0400 Subject: [PATCH 06/10] implemented suggestions --- CHANGELOG.md | 1 + environment.yml | 3 ++- yeat/cli.py | 2 +- yeat/tests/test_cli.py | 18 ++++++++++++------ 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 09ad385..94542e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Custom downsampling flag (#21) - Custom coverage flag (#23) - Custom seed flag (#29) +- Pinned required Python version to 3.9 for `exit_on_error` parameter and 3.10's incompatibility with SPAdes =0.23 - fastqc>=0.11 - gzip>=1.7 - mash>=2.3 - megahit>=1.2 - pytest-cov>=3.0 - - python>=3.9 + - python=3.9 - quast>=5.0 - seqtk>=1.3 - snakemake>=6.10 diff --git a/yeat/cli.py b/yeat/cli.py index a4e6f50..c16c801 100644 --- a/yeat/cli.py +++ b/yeat/cli.py @@ -145,7 +145,7 @@ def get_parser(exit_on_error=True): type=int, metavar="S", default=None, - help="override the randomly chosen seed S when downsampling; by default, S=None", + help="seed for the random number generator used for downsampling; by default the seed is chosen randomly", ) parser.add_argument( "--init", diff --git a/yeat/tests/test_cli.py b/yeat/tests/test_cli.py index cde0927..cd1e123 100644 --- a/yeat/tests/test_cli.py +++ b/yeat/tests/test_cli.py @@ -164,9 +164,12 @@ def test_custom_coverage_input(coverage, capsys, tmp_path): cli.main(args) quast_report = Path(wd).resolve() / "analysis" / "quast" / "megahit" / "report.tsv" df = pd.read_csv(quast_report, sep="\t") - assert df.iloc[12]["sample_contigs"] == 56 # num_contigs - assert df.iloc[13]["sample_contigs"] == 35168 # largest_contig - assert df.iloc[14]["sample_contigs"] == 199940 # total_len + num_contigs = df.iloc[12]["sample_contigs"] + assert num_contigs == 56 + largest_contig = df.iloc[13]["sample_contigs"] + assert largest_contig == 35168 + total_len = df.iloc[14]["sample_contigs"] + assert total_len == 199940 @pytest.mark.long @@ -186,6 +189,9 @@ def test_random_downsample_seed(execution_number, capsys, tmp_path): cli.main(args) quast_report = Path(wd).resolve() / "analysis" / "quast" / "megahit" / "report.tsv" df = pd.read_csv(quast_report, sep="\t") - assert 61 <= df.iloc[12]["sample_contigs"] <= 91 # 76 +-20% of avg num_contigs - assert 4183 <= df.iloc[13]["sample_contigs"] <= 6273 # 5228 +-20% of avg largest_contig - assert 59515 <= df.iloc[14]["sample_contigs"] <= 89271 # 74393 +-20% of avg total_len + num_contigs = df.iloc[12]["sample_contigs"] + assert num_contigs == pytest.approx(76, abs=15) + largest_contig = df.iloc[13]["sample_contigs"] + assert largest_contig == pytest.approx(5228, abs=1045) + total_len = df.iloc[14]["sample_contigs"] + assert total_len == pytest.approx(74393, abs=14878) From 4f1c23000879eda05393d53ab64d564147178e99 Mon Sep 17 00:00:00 2001 From: Dane Date: Fri, 4 Nov 2022 12:20:14 -0400 Subject: [PATCH 07/10] updating env.yml --- environment.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/environment.yml b/environment.yml index af10e93..5302335 100644 --- a/environment.yml +++ b/environment.yml @@ -5,7 +5,6 @@ channels: - defaults dependencies: - black=22.10 - - click=8.0 - fastp>=0.23 - fastqc>=0.11 - gzip>=1.7 From 85e676c97ebaed60406e19cd69ae4f58f99e3aa3 Mon Sep 17 00:00:00 2001 From: Dane Date: Fri, 4 Nov 2022 12:33:03 -0400 Subject: [PATCH 08/10] updated changedlog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94542e6..ef4c9df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,7 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Custom downsampling flag (#21) - Custom coverage flag (#23) - Custom seed flag (#29) -- Pinned required Python version to 3.9 for `exit_on_error` parameter and 3.10's incompatibility with SPAdes Date: Fri, 4 Nov 2022 12:37:56 -0400 Subject: [PATCH 09/10] added comments --- yeat/tests/test_cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yeat/tests/test_cli.py b/yeat/tests/test_cli.py index cd1e123..09b81b7 100644 --- a/yeat/tests/test_cli.py +++ b/yeat/tests/test_cli.py @@ -190,8 +190,8 @@ def test_random_downsample_seed(execution_number, capsys, tmp_path): quast_report = Path(wd).resolve() / "analysis" / "quast" / "megahit" / "report.tsv" df = pd.read_csv(quast_report, sep="\t") num_contigs = df.iloc[12]["sample_contigs"] - assert num_contigs == pytest.approx(76, abs=15) + assert num_contigs == pytest.approx(76, abs=15) # +/- 20% largest_contig = df.iloc[13]["sample_contigs"] - assert largest_contig == pytest.approx(5228, abs=1045) + assert largest_contig == pytest.approx(5228, abs=1045) # +/- 20% total_len = df.iloc[14]["sample_contigs"] - assert total_len == pytest.approx(74393, abs=14878) + assert total_len == pytest.approx(74393, abs=14878) # +/- 20% From fb2d765da37a4e70c0490fdaebd1d7c1a3baec1e Mon Sep 17 00:00:00 2001 From: Dane Date: Fri, 4 Nov 2022 12:39:33 -0400 Subject: [PATCH 10/10] added comments --- yeat/tests/test_cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yeat/tests/test_cli.py b/yeat/tests/test_cli.py index 09b81b7..2cf9ef1 100644 --- a/yeat/tests/test_cli.py +++ b/yeat/tests/test_cli.py @@ -190,8 +190,8 @@ def test_random_downsample_seed(execution_number, capsys, tmp_path): quast_report = Path(wd).resolve() / "analysis" / "quast" / "megahit" / "report.tsv" df = pd.read_csv(quast_report, sep="\t") num_contigs = df.iloc[12]["sample_contigs"] - assert num_contigs == pytest.approx(76, abs=15) # +/- 20% + assert num_contigs == pytest.approx(76, abs=15) # 76 +/- 20% largest_contig = df.iloc[13]["sample_contigs"] - assert largest_contig == pytest.approx(5228, abs=1045) # +/- 20% + assert largest_contig == pytest.approx(5228, abs=1045) # 5228 +/- 20% total_len = df.iloc[14]["sample_contigs"] - assert total_len == pytest.approx(74393, abs=14878) # +/- 20% + assert total_len == pytest.approx(74393, abs=14878) # 74393 +/- 20%