Skip to content

Commit

Permalink
Adding --seed flag to customize the seed when downsampling (#29)
Browse files Browse the repository at this point in the history
  • Loading branch information
danejo3 authored Nov 4, 2022
1 parent 9216de9 commit b2cd761
Show file tree
Hide file tree
Showing 7 changed files with 67 additions and 13 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Commands to run a short or long test suite (#20)
- Custom downsampling flag (#21)
- Custom coverage flag (#23)
- Custom seed flag (#29)
- 3.9 Python version pin for `exit_on_error` parameter and 3.10's incompatibility with SPAdes <v3.15.4 (#23, #29)


## [0.1] 2021-12-01
Expand Down
16 changes: 11 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
# YEAT

YEAT, **Y**our **E**verday **A**ssembly **T**ool, is an update to [`asm_tools`](https://github.com/bioforensics/asm_tools). It uses a Snakemake workflow to preprocess, downsample, and assemble paired-end fastq files with SPAdes.
YEAT, **Y**our **E**verday **A**ssembly **T**ool, is an update to [`asm_tools`](https://github.com/bioforensics/asm_tools). It uses a Snakemake workflow to preprocess, downsample, and assemble paired-end fastq files with various assemblers such as SPAdes, MEGAHIT, and Unicycler.

<p align="center">
<img width="220" alt="Screen Shot 2022-02-02 at 10 57 31 AM" src="https://user-images.githubusercontent.com/33472323/152189781-2bfdc62b-f554-42d5-8f78-f94ab2b133eb.png">
</p>
## Installation

```
git clone https://github.com/bioforensics/yeat.git
cd yeat
conda env create --name yeat --file environment.yml
conda activate yeat
pip install .
```

## Usage:

```$ yeat {read1} {read2} --outdir {path} --sample {name}```
```$ yeat {config} {read1} {read2} --outdir {path} --sample {name}```
3 changes: 2 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@ channels:
- bioconda
- defaults
dependencies:
- black=21.10b0
- black=22.10
- fastp>=0.23
- fastqc>=0.11
- gzip>=1.7
- mash>=2.3
- megahit>=1.2
- pytest-cov>=3.0
- python=3.9
- quast>=5.0
- seqtk>=1.3
- snakemake>=6.10
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
],
zip_safe=True,
keywords='genome assembly',
python_requires='>=3.7',
python_requires='>=3.9',
project_urls={
'Bug Reports': 'https://github.com/bioforensics/yeat/issues',
'Source': 'https://github.com/bioforensics/yeat',
Expand Down
8 changes: 6 additions & 2 deletions yeat/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ rule downsample:
params:
coverage=config["coverage"],
downsample=config["downsample"],
fastp_report="seq/fastp/fastp.json"
fastp_report="seq/fastp/fastp.json",
seed=config["seed"]
run:
if params.downsample == -1:
p = Path("seq/downsample")
Expand All @@ -105,7 +106,10 @@ rule downsample:
down = int((genome_size * params.coverage) / (2 * avg_read_length))
else:
down = params.downsample
seed = randint(1, 2**16-1)
if params.seed == None:
seed = randint(1, 2**16-1)
else:
seed = params.seed
print(f"[yeat] average read length: {avg_read_length}")
print(f"[yeat] target depth of coverage: {params.coverage}x")
print(f"[yeat] number of reads to sample: {down}")
Expand Down
10 changes: 10 additions & 0 deletions yeat/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def run(
dryrun="dry",
downsample=0,
coverage=150,
seed=None,
):
snakefile = resource_filename("yeat", "Snakefile")
r1 = Path(fastq1).resolve()
Expand All @@ -67,6 +68,7 @@ def run(
dryrun=dryrun,
downsample=downsample,
coverage=coverage,
seed=seed,
)
success = snakemake(
snakefile,
Expand Down Expand Up @@ -138,6 +140,13 @@ def get_parser(exit_on_error=True):
default=150,
help="target an average depth of coverage Cx when auto-downsampling; by default, C=150",
)
parser.add_argument(
"--seed",
type=int,
metavar="S",
default=None,
help="seed for the random number generator used for downsampling; by default the seed is chosen randomly",
)
parser.add_argument(
"--init",
action=InitAction,
Expand All @@ -162,4 +171,5 @@ def main(args=None):
dryrun=args.dry_run,
downsample=args.downsample,
coverage=args.coverage,
seed=args.seed,
)
39 changes: 35 additions & 4 deletions yeat/tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import pandas as pd
from pathlib import Path
import pytest
from random import randint
from yeat import cli
from yeat.cli import InitAction
from yeat.tests import data_file
Expand Down Expand Up @@ -94,7 +95,7 @@ def test_unicycler(capsys, tmp_path):
@pytest.mark.long
@pytest.mark.parametrize(
"downsample,num_contigs,largest_contig,total_len",
[("2000", 71, 5120, 69189), ("-1", 56, 35168, 199940)],
[("2000", 79, 5294, 70818), ("-1", 56, 35168, 199940)],
)
def test_custom_downsample_input(
downsample, num_contigs, largest_contig, total_len, capsys, tmp_path
Expand All @@ -108,6 +109,8 @@ def test_custom_downsample_input(
wd,
"-d",
downsample,
"--seed",
"0",
]
args = cli.get_parser().parse_args(arglist)
cli.main(args)
Expand Down Expand Up @@ -161,6 +164,34 @@ def test_custom_coverage_input(coverage, capsys, tmp_path):
cli.main(args)
quast_report = Path(wd).resolve() / "analysis" / "quast" / "megahit" / "report.tsv"
df = pd.read_csv(quast_report, sep="\t")
assert df.iloc[12]["sample_contigs"] == 56 # num_contigs
assert df.iloc[13]["sample_contigs"] == 35168 # largest_contig
assert df.iloc[14]["sample_contigs"] == 199940 # total_len
num_contigs = df.iloc[12]["sample_contigs"]
assert num_contigs == 56
largest_contig = df.iloc[13]["sample_contigs"]
assert largest_contig == 35168
total_len = df.iloc[14]["sample_contigs"]
assert total_len == 199940


@pytest.mark.long
@pytest.mark.parametrize("execution_number", range(3))
def test_random_downsample_seed(execution_number, capsys, tmp_path):
wd = str(tmp_path)
arglist = [
data_file("megahit.cfg"),
data_file("short_reads_1.fastq.gz"),
data_file("short_reads_2.fastq.gz"),
"--outdir",
wd,
"-d",
"2000",
]
args = cli.get_parser().parse_args(arglist)
cli.main(args)
quast_report = Path(wd).resolve() / "analysis" / "quast" / "megahit" / "report.tsv"
df = pd.read_csv(quast_report, sep="\t")
num_contigs = df.iloc[12]["sample_contigs"]
assert num_contigs == pytest.approx(76, abs=15) # 76 +/- 20%
largest_contig = df.iloc[13]["sample_contigs"]
assert largest_contig == pytest.approx(5228, abs=1045) # 5228 +/- 20%
total_len = df.iloc[14]["sample_contigs"]
assert total_len == pytest.approx(74393, abs=14878) # 74393 +/- 20%

0 comments on commit b2cd761

Please sign in to comment.