Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: link_reads was not properly accounting for the coassemble flag #139

Merged
merged 4 commits into from
Sep 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,11 @@ that can seamlessly communicate with each other. Each module can be run independ

# Quick Installation

Your conda channels should be configured ideally in this order with strict channel priority order
turned on:
Your conda channels should be configured ideally in this order:
```
conda config --add channels defaults
conda config --add channels bioconda
conda config --add channels conda-forge
conda config --set channel_priority strict
```

Your resulting `.condarc` file should look something like:
Expand All @@ -25,7 +23,6 @@ channels:
- conda-forge
- bioconda
- defaults
channel_priority: strict
```

#### Option 1) Install from Bioconda
Expand Down
50 changes: 33 additions & 17 deletions aviary/aviary.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,16 @@ def main():
default=250,
)

base_group.add_argument(
'--request-gpu', '--request_gpu',
help='Request a GPU for use with the pipeline. This will only work if the pipeline is run on a cluster',
type=str2bool,
nargs='?',
const=True,
dest='request_gpu',
default=False,
)

base_group.add_argument(
'-o', '--output',
help='Output directory',
Expand Down Expand Up @@ -303,32 +313,43 @@ def main():

qc_group.add_argument(
'-r', '--reference-filter', '--reference_filter',
help='Reference filter file to aid in the assembly',
help='One or more reference filter files to aid in the assembly. Remove contaminant reads from the assembly.',
dest="reference_filter",
default='none'
nargs='*',
default=['none']
)

qc_group.add_argument(
'--min-read-size', '--min_read_size',
help='Minimum long read size when filtering using Filtlong',
dest="min_read_size",
default=250
default=100
)

qc_group.add_argument(
'--min-mean-q', '--min_mean_q',
help='Minimum mean quality threshold',
dest="min_mean_q",
default=50
default=10
)

qc_group.add_argument(
'--keep-percent', '--keep_percent',
help='Percentage of reads passing quality thresholds kept by filtlong',
help='DEPRECATED: Percentage of reads passing quality thresholds kept by filtlong',
dest="keep_percent",
default=100
)

qc_group.add_argument(
'--skip-qc', '--skip_qc',
help='Skip quality control steps',
type=str2bool,
nargs='?',
const=True,
dest="skip_qc",
default=False
)


####################################################################

Expand All @@ -339,10 +360,9 @@ def main():
read_group_exclusive.add_argument(
'-1', '--pe-1', '--paired-reads-1', '--paired_reads_1', '--pe1',
help='A space separated list of forwards read files \n'
'NOTE: If performing assembly and multiple files and longreads \n'
' are provided then only the first file will be used for assembly. \n'
'NOTE: If performing assembly and multiple files are provided then only the first file will be used for assembly. \n'
' If no longreads are provided then all samples will be co-assembled \n'
' with megahit or metaspades depending on the --coassemble parameter\n',
' with megahit or metaspades depending on the --coassemble parameter',
dest='pe1',
nargs='*',
default="none"
Expand All @@ -351,8 +371,7 @@ def main():
short_read_group.add_argument(
'-2', '--pe-2', '--paired-reads-2', '--paired_reads_2', '--pe2',
help='A space separated list of reverse read files \n'
'NOTE: If performing assembly and multiple files and longreads \n'
' are provided then only the first file will be used for assembly. \n'
'NOTE: If performing assembly and multiple files are provided then only the first file will be used for assembly. \n'
' If no longreads are provided then all samples will be co-assembled \n'
' with megahit or metaspades depending on the --coassemble parameter',
dest='pe2',
Expand All @@ -363,8 +382,7 @@ def main():
read_group_exclusive.add_argument(
'-i','--interleaved',
help='A space separated list of interleaved read files \n'
'NOTE: If performing assembly and multiple files and longreads \n'
' are provided then only the first file will be used for assembly. \n'
'NOTE: If performing assembly and multiple files are provided then only the first file will be used for assembly. \n'
' If no longreads are provided then all samples will be co-assembled \n'
' with megahit or metaspades depending on the --coassemble parameter',
dest='interleaved',
Expand All @@ -375,8 +393,7 @@ def main():
read_group_exclusive.add_argument(
'-c', '--coupled',
help='Forward and reverse read files in a coupled space separated list. \n'
'NOTE: If performing assembly and multiple files and longreads \n'
' are provided then only the first file will be used for assembly. \n'
'NOTE: If performing assembly and multiple files are provided then only the first file will be used for assembly. \n'
' If no longreads are provided then all samples will be co-assembled \n'
' with megahit or metaspades depending on the --coassemble parameter',
dest='coupled',
Expand All @@ -399,8 +416,7 @@ def main():
long_read_group.add_argument(
'-l', '--longreads', '--long-reads', '--long_reads',
help='A space separated list of long-read read files. '
'NOTE: If performing assembly and multiple long read files are provided, \n'
' then only the first file is used for assembly. This behaviour might change in future.',
'NOTE: The first file will be used for assembly unless --coassemble is set to True. Then all files will be used.',
dest='longreads',
nargs='*',
default="none"
Expand Down Expand Up @@ -694,7 +710,7 @@ def main():
nargs='?',
const=True,
dest='coassemble',
default=True,
default=False,
)

assemble_group.add_argument(
Expand Down
2 changes: 2 additions & 0 deletions aviary/envs/coverm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ channels:
dependencies:
- coverm >= 0.6
- galah >= 0.3
- chopper >= 0.6
- pigz
- parallel
- dashing
- fastani
4 changes: 2 additions & 2 deletions aviary/modules/Snakefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ruleorder: skip_long_assembly > get_reads_list_ref > link_reads > short_only
ruleorder: filtlong_no_reference > link_reads
# ruleorder: skip_long_assembly > get_reads_list_ref > link_reads > short_only
# ruleorder: filtlong_no_reference > link_reads

onsuccess:
print("Aviary finished, no error")
Expand Down
1 change: 1 addition & 0 deletions aviary/modules/annotation/annotation.smk
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ rule checkm2:
resources:
mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 128*1024*attempt),
runtime = lambda wildcards, attempt: 8*60*attempt,
gpus = 1 if config["request_gpu"] else 0
log:
'logs/checkm2.log'
benchmark:
Expand Down
146 changes: 25 additions & 121 deletions aviary/modules/assembly/assembly.smk
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
localrules: get_umapped_reads_ref, no_ref_filter, get_high_cov_contigs, skip_long_assembly, short_only, move_spades_assembly, pool_reads, skip_unicycler, skip_unicycler_with_qc, complete_assembly, complete_assembly_with_qc, reset_to_spades_assembly, remove_final_contigs, combine_assemblies, combine_long_only

ruleorder: filter_illumina_assembly > short_only
# ruleorder: fastqc > fastqc_long
ruleorder: skip_unicycler_with_qc > skip_unicycler > combine_assemblies > combine_long_only
ruleorder: skip_long_assembly > get_high_cov_contigs > short_only
ruleorder: skip_long_assembly > filter_illumina_assembly
ruleorder: filter_illumina_ref > no_ref_filter
ruleorder: skip_unicycler_with_qc > skip_unicycler > combine_assemblies > combine_long_only > assemble_short_reads
localrules: get_high_cov_contigs, short_only, move_spades_assembly, pool_reads, skip_unicycler, skip_unicycler_with_qc, complete_assembly, complete_assembly_with_qc, reset_to_spades_assembly, remove_final_contigs, combine_assemblies, combine_long_only

ruleorder: filter_illumina_assembly > short_only > combine_long_only
ruleorder: get_high_cov_contigs > short_only
ruleorder: skip_unicycler_with_qc > skip_unicycler > combine_assemblies > move_spades_assembly > combine_long_only
ruleorder: skip_unicycler_with_qc > skip_unicycler > complete_assembly_with_qc > complete_assembly
ruleorder: skip_unicycler_with_qc > skip_unicycler > combine_assemblies > move_spades_assembly

Expand Down Expand Up @@ -43,76 +39,6 @@ onstart:
if short_reads_2 != "none" and not os.path.exists(short_reads_2[0]):
sys.exit("short_reads_2 does not point to a file")

# Filter reads against a reference, i.e. for removing host contamination from the metagenome
rule map_reads_ref:
input:
fastq = config["long_reads"],
reference_filter = config["reference_filter"]
output:
temp("data/raw_mapped_ref.bam"),
temp("data/raw_mapped_ref.bai")
conda:
"../../envs/coverm.yaml"
benchmark:
"benchmarks/map_reads_ref.benchmark.txt"
params:
mapper = "map-ont" if config["long_read_type"] in ["ont", "ont-hq"] else "map-pb"
threads:
min(config["max_threads"], 64)
resources:
mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 512*1024*attempt),
runtime = lambda wildcards, attempt: 12*60*attempt,
log:
"logs/map_reads_ref.log"
shell:
"minimap2 -ax {params.mapper} --split-prefix=tmp -t {threads} {input.reference_filter} {input.fastq} 2> {log} | "
"samtools view -@ {threads} -b > {output} 2>> {log} && "
"samtools index {output} 2> {log}"


# Get a list of reads that don't map to genome you want to filter
rule get_umapped_reads_ref:
input:
"data/raw_mapped_ref.bam"
output:
temp("data/unmapped_to_ref.list")
params:
"no_full"
conda:
"envs/pysam.yaml"
benchmark:
"benchmarks/get_unmapped_reads_ref.benchmark.txt"
script:
"scripts/filter_read_list.py"


# Create new read file with filtered reads
rule get_reads_list_ref:
input:
fastq = config["long_reads"],
unmapped_list = "data/unmapped_to_ref.list"
output:
temp("data/long_reads.fastq.gz")
threads:
min(config["max_threads"], 64)
resources:
mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 512*1024*attempt),
runtime = lambda wildcards, attempt: 12*60*attempt,
log:
"logs/get_reads_list_ref.log"
conda:
"envs/seqtk.yaml"
benchmark:
"benchmarks/get_reads_list_ref.benchmark.txt"
shell:
"seqtk subseq {input.fastq} {input.unmapped_list} 2> {log} | pigz -p {threads} > {output} 2>> {log}"

# if no reference filter output this done file just to keep the DAG happy
rule no_ref_filter:
output:
filtered = temp("data/short_filter.done")
shell:
"touch {output.filtered}"

# Assembly long reads with metaflye
rule flye_assembly:
Expand Down Expand Up @@ -162,6 +88,7 @@ rule polish_metagenome_flye:
resources:
mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 512*1024*attempt),
runtime = lambda wildcards, attempt: 24*60*attempt,
gpus = 1 if config["request_gpu"] else 0
log:
"logs/polish_metagenome_flye.log"
output:
Expand All @@ -172,31 +99,6 @@ rule polish_metagenome_flye:
"scripts/polish.py"


### Filter illumina reads against provided reference
rule filter_illumina_ref:
input:
reference_filter = config["reference_filter"]
output:
bam = "data/short_unmapped_ref.bam",
fastq = "data/short_reads.fastq.gz",
filtered = "data/short_filter.done"
params:
coassemble = config["coassemble"]
conda:
"../../envs/minimap2.yaml"
threads:
min(config["max_threads"], 64)
resources:
mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 512*1024*attempt),
runtime = lambda wildcards, attempt: 8*60*attempt,
log:
"logs/filter_illumina_ref.log"
benchmark:
"benchmarks/filter_illumina_ref.benchmark.txt"
script:
"scripts/filter_illumina_reference.py"


# Generate BAM file for pilon, discard unmapped reads
rule generate_pilon_sort:
input:
Expand Down Expand Up @@ -389,7 +291,7 @@ rule get_high_cov_contigs:
# Specifically, short reads that do not map to the high coverage long contigs are collected
rule filter_illumina_assembly:
input:
fastq = config['short_reads_1'], # check short reads were supplied
fastq = "data/short_reads.fastq.gz" if config["reference_filter"] != ["none"] else config['short_reads_1'], # check short reads were supplied
fasta = "data/flye_high_cov.fasta"
output:
bam = temp("data/sr_vs_long.sort.bam"),
Expand All @@ -412,25 +314,25 @@ rule filter_illumina_assembly:
"scripts/filter_illumina_assembly.py"


# If unassembled long reads are provided, skip the long read assembly
rule skip_long_assembly:
input:
unassembled_long = config["unassembled_long"]
output:
# fastq = "data/short_reads.filt.fastq.gz",
fasta = "data/flye_high_cov.fasta",
long_reads = temp("data/long_reads.fastq.gz")
shell:
"""
touch {output.fasta} && \
ln {input.unassembled_long} {output.long_reads}
"""
# # If unassembled long reads are provided, skip the long read assembly
# rule skip_long_assembly:
# input:
# unassembled_long = [] if "none" in config["unassembled_long"] else config["unassembled_long"]
# output:
# # fastq = "data/short_reads.filt.fastq.gz",
# fasta = "data/flye_high_cov.fasta",
# long_reads = temp("data/long_reads.fastq.gz")
# shell:
# """
# touch {output.fasta} && \
# ln {input.unassembled_long} {output.long_reads}
# """


# If only short reads are provided
rule short_only:
input:
fastq = config["short_reads_1"]
fastq = "data/short_reads.fastq.gz"
output:
fasta = "data/flye_high_cov.fasta",
# long_reads = temp("data/long_reads.fastq.gz")
Expand Down Expand Up @@ -487,15 +389,15 @@ rule spades_assembly:
cp data/spades_assembly/scaffolds.fasta data/spades_assembly.fasta
fi
else
touch {output.fasta}
mkdir -p {output.spades_folder} && touch {output.fasta}
fi
"""


# Perform short read assembly only with no other steps
rule assemble_short_reads:
input:
fastq = config["short_reads_1"]
fastq = "data/short_reads.fastq.gz"
output:
fasta = "data/short_read_assembly/scaffolds.fasta",
# We cannot mark the output_folder as temp as then it gets deleted,
Expand Down Expand Up @@ -533,6 +435,7 @@ rule move_spades_assembly:
assembly = "data/short_read_assembly/scaffolds.fasta"
output:
out = "data/final_contigs.fasta"
priority: 1
shell:
"cp {input.assembly} {output.out} && rm -rf data/short_read_assembly"

Expand Down Expand Up @@ -703,6 +606,7 @@ rule combine_long_only:
"scripts/combine_assemblies.py"



rule skip_unicycler:
input:
short_fasta = "data/spades_assembly.fasta",
Expand Down
Loading