Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: link_reads was not properly accounting for the coassemble flag #139

Merged
merged 4 commits into from
Sep 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,11 @@ that can seamlessly communicate with each other. Each module can be run independ

# Quick Installation

Your conda channels should be configured ideally in this order with strict channel priority order
turned on:
Your conda channels should be configured ideally in this order:
```
conda config --add channels defaults
conda config --add channels bioconda
conda config --add channels conda-forge
conda config --set channel_priority strict
```

Your resulting `.condarc` file should look something like:
Expand All @@ -25,7 +23,6 @@ channels:
- conda-forge
- bioconda
- defaults
channel_priority: strict
```

#### Option 1) Install from Bioconda
Expand Down
50 changes: 33 additions & 17 deletions aviary/aviary.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,16 @@ def main():
default=250,
)

base_group.add_argument(
'--request-gpu', '--request_gpu',
help='Request a GPU for use with the pipeline. This will only work if the pipeline is run on a cluster',
type=str2bool,
nargs='?',
const=True,
dest='request_gpu',
default=False,
)

base_group.add_argument(
'-o', '--output',
help='Output directory',
Expand Down Expand Up @@ -303,32 +313,43 @@ def main():

qc_group.add_argument(
'-r', '--reference-filter', '--reference_filter',
help='Reference filter file to aid in the assembly',
help='One or more reference filter files to aid in the assembly. Remove contaminant reads from the assembly.',
dest="reference_filter",
default='none'
nargs='*',
default=['none']
)

qc_group.add_argument(
'--min-read-size', '--min_read_size',
help='Minimum long read size when filtering using Filtlong',
dest="min_read_size",
default=250
default=100
)

qc_group.add_argument(
'--min-mean-q', '--min_mean_q',
help='Minimum mean quality threshold',
dest="min_mean_q",
default=50
default=10
)

qc_group.add_argument(
'--keep-percent', '--keep_percent',
help='Percentage of reads passing quality thresholds kept by filtlong',
help='DEPRECATED: Percentage of reads passing quality thresholds kept by filtlong',
dest="keep_percent",
default=100
)

qc_group.add_argument(
'--skip-qc', '--skip_qc',
help='Skip quality control steps',
type=str2bool,
nargs='?',
const=True,
dest="skip_qc",
default=False
)


####################################################################

Expand All @@ -339,10 +360,9 @@ def main():
read_group_exclusive.add_argument(
'-1', '--pe-1', '--paired-reads-1', '--paired_reads_1', '--pe1',
help='A space separated list of forwards read files \n'
'NOTE: If performing assembly and multiple files and longreads \n'
' are provided then only the first file will be used for assembly. \n'
'NOTE: If performing assembly and multiple files are provided then only the first file will be used for assembly. \n'
' If no longreads are provided then all samples will be co-assembled \n'
' with megahit or metaspades depending on the --coassemble parameter\n',
' with megahit or metaspades depending on the --coassemble parameter',
dest='pe1',
nargs='*',
default="none"
Expand All @@ -351,8 +371,7 @@ def main():
short_read_group.add_argument(
'-2', '--pe-2', '--paired-reads-2', '--paired_reads_2', '--pe2',
help='A space separated list of reverse read files \n'
'NOTE: If performing assembly and multiple files and longreads \n'
' are provided then only the first file will be used for assembly. \n'
'NOTE: If performing assembly and multiple files are provided then only the first file will be used for assembly. \n'
' If no longreads are provided then all samples will be co-assembled \n'
' with megahit or metaspades depending on the --coassemble parameter',
dest='pe2',
Expand All @@ -363,8 +382,7 @@ def main():
read_group_exclusive.add_argument(
'-i','--interleaved',
help='A space separated list of interleaved read files \n'
'NOTE: If performing assembly and multiple files and longreads \n'
' are provided then only the first file will be used for assembly. \n'
'NOTE: If performing assembly and multiple files are provided then only the first file will be used for assembly. \n'
' If no longreads are provided then all samples will be co-assembled \n'
' with megahit or metaspades depending on the --coassemble parameter',
dest='interleaved',
Expand All @@ -375,8 +393,7 @@ def main():
read_group_exclusive.add_argument(
'-c', '--coupled',
help='Forward and reverse read files in a coupled space separated list. \n'
'NOTE: If performing assembly and multiple files and longreads \n'
' are provided then only the first file will be used for assembly. \n'
'NOTE: If performing assembly and multiple files are provided then only the first file will be used for assembly. \n'
' If no longreads are provided then all samples will be co-assembled \n'
' with megahit or metaspades depending on the --coassemble parameter',
dest='coupled',
Expand All @@ -399,8 +416,7 @@ def main():
long_read_group.add_argument(
'-l', '--longreads', '--long-reads', '--long_reads',
help='A space separated list of long-read read files. '
'NOTE: If performing assembly and multiple long read files are provided, \n'
' then only the first file is used for assembly. This behaviour might change in future.',
'NOTE: The first file will be used for assembly unless --coassemble is set to True. Then all files will be used.',
dest='longreads',
nargs='*',
default="none"
Expand Down Expand Up @@ -694,7 +710,7 @@ def main():
nargs='?',
const=True,
dest='coassemble',
default=True,
default=False,
)

assemble_group.add_argument(
Expand Down
2 changes: 2 additions & 0 deletions aviary/envs/coverm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ channels:
dependencies:
- coverm >= 0.6
- galah >= 0.3
- chopper >= 0.6
- pigz
- parallel
- dashing
- fastani
4 changes: 2 additions & 2 deletions aviary/modules/Snakefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ruleorder: skip_long_assembly > get_reads_list_ref > link_reads > short_only
ruleorder: filtlong_no_reference > link_reads
# ruleorder: skip_long_assembly > get_reads_list_ref > link_reads > short_only
# ruleorder: filtlong_no_reference > link_reads

onsuccess:
print("Aviary finished, no error")
Expand Down
1 change: 1 addition & 0 deletions aviary/modules/annotation/annotation.smk
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ rule checkm2:
resources:
mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 128*1024*attempt),
runtime = lambda wildcards, attempt: 8*60*attempt,
gpus = 1 if config["request_gpu"] else 0
log:
'logs/checkm2.log'
benchmark:
Expand Down
146 changes: 25 additions & 121 deletions aviary/modules/assembly/assembly.smk
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
localrules: get_umapped_reads_ref, no_ref_filter, get_high_cov_contigs, skip_long_assembly, short_only, move_spades_assembly, pool_reads, skip_unicycler, skip_unicycler_with_qc, complete_assembly, complete_assembly_with_qc, reset_to_spades_assembly, remove_final_contigs, combine_assemblies, combine_long_only

ruleorder: filter_illumina_assembly > short_only
# ruleorder: fastqc > fastqc_long
ruleorder: skip_unicycler_with_qc > skip_unicycler > combine_assemblies > combine_long_only
ruleorder: skip_long_assembly > get_high_cov_contigs > short_only
ruleorder: skip_long_assembly > filter_illumina_assembly
ruleorder: filter_illumina_ref > no_ref_filter
ruleorder: skip_unicycler_with_qc > skip_unicycler > combine_assemblies > combine_long_only > assemble_short_reads
localrules: get_high_cov_contigs, short_only, move_spades_assembly, pool_reads, skip_unicycler, skip_unicycler_with_qc, complete_assembly, complete_assembly_with_qc, reset_to_spades_assembly, remove_final_contigs, combine_assemblies, combine_long_only

ruleorder: filter_illumina_assembly > short_only > combine_long_only
ruleorder: get_high_cov_contigs > short_only
ruleorder: skip_unicycler_with_qc > skip_unicycler > combine_assemblies > move_spades_assembly > combine_long_only
ruleorder: skip_unicycler_with_qc > skip_unicycler > complete_assembly_with_qc > complete_assembly
ruleorder: skip_unicycler_with_qc > skip_unicycler > combine_assemblies > move_spades_assembly

Expand Down Expand Up @@ -43,76 +39,6 @@ onstart:
if short_reads_2 != "none" and not os.path.exists(short_reads_2[0]):
sys.exit("short_reads_2 does not point to a file")

# Filter reads against a reference, i.e. for removing host contamination from the metagenome
rule map_reads_ref:
input:
fastq = config["long_reads"],
reference_filter = config["reference_filter"]
output:
temp("data/raw_mapped_ref.bam"),
temp("data/raw_mapped_ref.bai")
conda:
"../../envs/coverm.yaml"
benchmark:
"benchmarks/map_reads_ref.benchmark.txt"
params:
mapper = "map-ont" if config["long_read_type"] in ["ont", "ont-hq"] else "map-pb"
threads:
min(config["max_threads"], 64)
resources:
mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 512*1024*attempt),
runtime = lambda wildcards, attempt: 12*60*attempt,
log:
"logs/map_reads_ref.log"
shell:
"minimap2 -ax {params.mapper} --split-prefix=tmp -t {threads} {input.reference_filter} {input.fastq} 2> {log} | "
"samtools view -@ {threads} -b > {output} 2>> {log} && "
"samtools index {output} 2> {log}"


# Get a list of reads that don't map to genome you want to filter
rule get_umapped_reads_ref:
input:
"data/raw_mapped_ref.bam"
output:
temp("data/unmapped_to_ref.list")
params:
"no_full"
conda:
"envs/pysam.yaml"
benchmark:
"benchmarks/get_unmapped_reads_ref.benchmark.txt"
script:
"scripts/filter_read_list.py"


# Create new read file with filtered reads
rule get_reads_list_ref:
input:
fastq = config["long_reads"],
unmapped_list = "data/unmapped_to_ref.list"
output:
temp("data/long_reads.fastq.gz")
threads:
min(config["max_threads"], 64)
resources:
mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 512*1024*attempt),
runtime = lambda wildcards, attempt: 12*60*attempt,
log:
"logs/get_reads_list_ref.log"
conda:
"envs/seqtk.yaml"
benchmark:
"benchmarks/get_reads_list_ref.benchmark.txt"
shell:
"seqtk subseq {input.fastq} {input.unmapped_list} 2> {log} | pigz -p {threads} > {output} 2>> {log}"

# if no reference filter output this done file just to keep the DAG happy
rule no_ref_filter:
output:
filtered = temp("data/short_filter.done")
shell:
"touch {output.filtered}"

# Assembly long reads with metaflye
rule flye_assembly:
Expand Down Expand Up @@ -162,6 +88,7 @@ rule polish_metagenome_flye:
resources:
mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 512*1024*attempt),
runtime = lambda wildcards, attempt: 24*60*attempt,
gpus = 1 if config["request_gpu"] else 0
log:
"logs/polish_metagenome_flye.log"
output:
Expand All @@ -172,31 +99,6 @@ rule polish_metagenome_flye:
"scripts/polish.py"


### Filter illumina reads against provided reference
rule filter_illumina_ref:
input:
reference_filter = config["reference_filter"]
output:
bam = "data/short_unmapped_ref.bam",
fastq = "data/short_reads.fastq.gz",
filtered = "data/short_filter.done"
params:
coassemble = config["coassemble"]
conda:
"../../envs/minimap2.yaml"
threads:
min(config["max_threads"], 64)
resources:
mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 512*1024*attempt),
runtime = lambda wildcards, attempt: 8*60*attempt,
log:
"logs/filter_illumina_ref.log"
benchmark:
"benchmarks/filter_illumina_ref.benchmark.txt"
script:
"scripts/filter_illumina_reference.py"


# Generate BAM file for pilon, discard unmapped reads
rule generate_pilon_sort:
input:
Expand Down Expand Up @@ -389,7 +291,7 @@ rule get_high_cov_contigs:
# Specifically, short reads that do not map to the high coverage long contigs are collected
rule filter_illumina_assembly:
input:
fastq = config['short_reads_1'], # check short reads were supplied
fastq = "data/short_reads.fastq.gz" if config["reference_filter"] != ["none"] else config['short_reads_1'], # check short reads were supplied
fasta = "data/flye_high_cov.fasta"
output:
bam = temp("data/sr_vs_long.sort.bam"),
Expand All @@ -412,25 +314,25 @@ rule filter_illumina_assembly:
"scripts/filter_illumina_assembly.py"


# If unassembled long reads are provided, skip the long read assembly
rule skip_long_assembly:
input:
unassembled_long = config["unassembled_long"]
output:
# fastq = "data/short_reads.filt.fastq.gz",
fasta = "data/flye_high_cov.fasta",
long_reads = temp("data/long_reads.fastq.gz")
shell:
"""
touch {output.fasta} && \
ln {input.unassembled_long} {output.long_reads}
"""
# # If unassembled long reads are provided, skip the long read assembly
# rule skip_long_assembly:
# input:
# unassembled_long = [] if "none" in config["unassembled_long"] else config["unassembled_long"]
# output:
# # fastq = "data/short_reads.filt.fastq.gz",
# fasta = "data/flye_high_cov.fasta",
# long_reads = temp("data/long_reads.fastq.gz")
# shell:
# """
# touch {output.fasta} && \
# ln {input.unassembled_long} {output.long_reads}
# """


# If only short reads are provided
rule short_only:
input:
fastq = config["short_reads_1"]
fastq = "data/short_reads.fastq.gz"
output:
fasta = "data/flye_high_cov.fasta",
# long_reads = temp("data/long_reads.fastq.gz")
Expand Down Expand Up @@ -487,15 +389,15 @@ rule spades_assembly:
cp data/spades_assembly/scaffolds.fasta data/spades_assembly.fasta
fi
else
touch {output.fasta}
mkdir -p {output.spades_folder} && touch {output.fasta}
fi
"""


# Perform short read assembly only with no other steps
rule assemble_short_reads:
input:
fastq = config["short_reads_1"]
fastq = "data/short_reads.fastq.gz"
output:
fasta = "data/short_read_assembly/scaffolds.fasta",
# We cannot mark the output_folder as temp as then it gets deleted,
Expand Down Expand Up @@ -533,6 +435,7 @@ rule move_spades_assembly:
assembly = "data/short_read_assembly/scaffolds.fasta"
output:
out = "data/final_contigs.fasta"
priority: 1
shell:
"cp {input.assembly} {output.out} && rm -rf data/short_read_assembly"

Expand Down Expand Up @@ -703,6 +606,7 @@ rule combine_long_only:
"scripts/combine_assemblies.py"



rule skip_unicycler:
input:
short_fasta = "data/spades_assembly.fasta",
Expand Down
Loading