Merge Dev into Main #142

Merged
merged 54 commits — Sep 13, 2023
Changes from all commits (54 commits)
2b0aaff
Merge pull request #128 from rhysnewell/main
AroneyS Aug 28, 2023
22fb1e6
add resources or localrules for each rule
AroneyS Aug 29, 2023
925260c
add multiplier for resources with retry
AroneyS Aug 30, 2023
fc9f55c
add threads to match mem_mb
AroneyS Aug 30, 2023
4e95ad1
run test_integration tests in permanent dir
AroneyS Aug 30, 2023
7dd653e
add logs to each rule
AroneyS Aug 30, 2023
62c0e19
reduce refinery times
AroneyS Aug 30, 2023
8ef6bd1
add rerun triggers to run_workflow
AroneyS Aug 30, 2023
704e652
add rerun_triggers to run_workflow in main
AroneyS Aug 30, 2023
d7d6756
fix rerun_triggers default
AroneyS Aug 30, 2023
69fec3b
add --snakemake-profile and --cluster-retries arguments
AroneyS Aug 30, 2023
a0a3d72
fix logs for "script" rules
AroneyS Aug 30, 2023
076c2fd
add \n to logf.write cmds
AroneyS Aug 30, 2023
6b0e5a2
missed some
AroneyS Aug 30, 2023
2d88935
fix profile check when running snakemake
AroneyS Aug 30, 2023
d4ba3f4
account for occasional long-term semibin2 runs
AroneyS Aug 30, 2023
e240d0a
fix tests
AroneyS Aug 30, 2023
80d4efe
fix programs which log to stdout
AroneyS Aug 30, 2023
ac61051
improve portability
AroneyS Aug 30, 2023
eb18a06
echo CheckM2 database to log
AroneyS Aug 30, 2023
08c8758
add keep-going to snakemake cmd
AroneyS Aug 31, 2023
c7df052
remove groups
AroneyS Aug 31, 2023
58817fb
add resources to qc.smk
AroneyS Aug 31, 2023
242667a
test queue submission across assembly+recovery
AroneyS Aug 31, 2023
ab5747e
fix test
AroneyS Aug 31, 2023
5123ade
consolidate integration tests
AroneyS Aug 31, 2023
0cb1c9f
fix test name
AroneyS Aug 31, 2023
8f35643
remove tmpdir from integration tests
AroneyS Aug 31, 2023
919a96c
add log files to qc rules
AroneyS Sep 1, 2023
a1ad666
remove extra quotes
AroneyS Sep 1, 2023
233f219
add log files to assembly
AroneyS Sep 1, 2023
180c1ac
fix some logging errors
AroneyS Sep 1, 2023
fcbf579
fix another print
AroneyS Sep 1, 2023
823e0b6
add refinery logging when skipping
AroneyS Sep 3, 2023
252755f
move refinery logging to log also for refine_dastool
AroneyS Sep 4, 2023
66c0127
re-add logging that was otherwise missed
AroneyS Sep 4, 2023
c11b501
add example snakemake cluster to docs
AroneyS Sep 5, 2023
99e6bec
rename cluster submission section
AroneyS Sep 5, 2023
0a2d7e0
add memory increase/cap information to RAM control section
AroneyS Sep 5, 2023
de1a48a
Merge pull request #129 from rhysnewell/add-resources
AroneyS Sep 7, 2023
da82faf
remove upper snakemake bound
AroneyS Sep 7, 2023
52776e9
fix spacing
AroneyS Sep 7, 2023
ac984ef
update snakemake upper bound
AroneyS Sep 7, 2023
19b803b
Merge pull request #131 from rhysnewell/remove-upper-snakemake-bound
AroneyS Sep 7, 2023
6d07583
make long_read.fastq.gz a tempfile
rhysnewell Sep 7, 2023
0826558
Merge pull request #132 from rhysnewell/make_long_read_qc_temp
rhysnewell Sep 8, 2023
746d8ba
fix: link reads was not properly accounting for coassemble flag
rhysnewell Sep 11, 2023
f5ee39a
fix #140 replace filtlong with chopper and better handling of long re…
rhysnewell Sep 12, 2023
c027f25
fix dag creation issues, add gpu request support as flag
rhysnewell Sep 13, 2023
649f039
fix duplicated dest keyword
rhysnewell Sep 13, 2023
809cea1
Merge pull request #139 from rhysnewell/link_reads_fix
rhysnewell Sep 13, 2023
5dda456
bump to v0.8.0
rhysnewell Sep 13, 2023
3a262b8
fix integration test for long read assembly
rhysnewell Sep 13, 2023
2c61ba1
Merge pull request #141 from rhysnewell/v0.8.0
rhysnewell Sep 13, 2023
5 changes: 1 addition & 4 deletions README.md
@@ -10,13 +10,11 @@ that can seamlessly communicate with each other. Each module can be run independ

# Quick Installation

Your conda channels should be configured ideally in this order with strict channel priority order
turned on:
Your conda channels should be configured ideally in this order:
```
conda config --add channels defaults
conda config --add channels bioconda
conda config --add channels conda-forge
conda config --set channel_priority strict
```

Your resulting `.condarc` file should look something like:
@@ -25,7 +23,6 @@ channels:
- conda-forge
- bioconda
- defaults
channel_priority: strict
```

#### Option 1) Install from Bioconda
2 changes: 1 addition & 1 deletion aviary.yml
@@ -4,7 +4,7 @@ channels:
- anaconda
dependencies:
- python>=3.8
- snakemake>=6.0.5,<=7.17
- snakemake>=6.0.5,<=7.32.3
- ruamel.yaml>=0.15.99 # needs to be explicit
- numpy
- pandas
2 changes: 1 addition & 1 deletion aviary/__init__.py
@@ -1 +1 @@
__version__ = "0.7.2"
__version__ = "0.8.0"
74 changes: 55 additions & 19 deletions aviary/aviary.py
@@ -164,6 +164,16 @@ def main():
default=250,
)

base_group.add_argument(
'--request-gpu', '--request_gpu',
help='Request a GPU for use with the pipeline. This will only work if the pipeline is run on a cluster',
type=str2bool,
nargs='?',
const=True,
dest='request_gpu',
default=False,
)
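
The `str2bool` helper this flag relies on (as does `--skip-qc` below) is not part of this hunk; the following is a sketch of the common argparse pattern it assumes, not necessarily aviary's actual code:

```python
import argparse

def str2bool(v):
    # Sketch of the truthy/falsy-string parser the --request-gpu argument
    # assumes; aviary's real helper may differ.
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "y", "1"):
        return True
    if v.lower() in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Boolean value expected, got {v!r}")
```

Combined with `nargs='?'` and `const=True`, a bare `--request-gpu` enables the flag, `--request-gpu false` disables it, and omitting the flag falls back to `default=False`.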

base_group.add_argument(
'-o', '--output',
help='Output directory',
@@ -195,6 +205,23 @@ def main():
default=" "
)

base_group.add_argument(
'--snakemake-profile',
help='Snakemake profile (see https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles)\n'
'Create profile as `~/.config/snakemake/[CLUSTER_PROFILE]/config.yaml`. \n'
'Can be used to submit rules as jobs to cluster engine (see https://snakemake.readthedocs.io/en/stable/executing/cluster.html), \n'
'requires cluster, cluster-status, jobs, cluster-cancel. ',
dest='snakemake_profile',
default=""
)
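
As a concrete illustration, a minimal profile satisfying the listed requirements might look like the following (the file path follows the help text; the Slurm commands and the status-script name are assumptions, not taken from the aviary docs):

```yaml
# ~/.config/snakemake/slurm/config.yaml  (hypothetical Slurm profile)
jobs: 100
cluster: "sbatch --parsable --mem={resources.mem_mb} --cpus-per-task={threads} --time={resources.runtime}"
cluster-status: "slurm-status.py"   # user-supplied script that reports job state
cluster-cancel: "scancel"
```

It would then be selected with `--snakemake-profile slurm`.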

base_group.add_argument(
'--cluster-retries',
help='Number of times to retry a failed job when using cluster submission (see `--snakemake-profile`). ',
dest='cluster_retries',
default=0
)

base_group.add_argument(
'--dry-run', '--dry_run', '--dryrun',
help='Perform snakemake dry run, tests workflow order and conda environments',
@@ -250,7 +277,7 @@ def main():
'--rerun-triggers', '--rerun_triggers',
help='Specify which kinds of modifications will trigger rules to rerun',\
dest='rerun_triggers',
default="mtime",
default=["mtime"],
nargs="*",
choices=["mtime","params","input","software-env","code"]
)
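
The switch from `default="mtime"` to `default=["mtime"]` matters because with `nargs="*"` argparse only builds a list from values supplied on the command line; an absent flag leaves the default exactly as written, so a bare string would later be iterated character by character. A standalone repro of the pattern (not aviary's code):

```python
import argparse

# Old (buggy) default: a bare string with nargs="*"
buggy = argparse.ArgumentParser()
buggy.add_argument("--rerun-triggers", dest="rerun_triggers",
                   nargs="*", default="mtime",
                   choices=["mtime", "params", "input", "software-env", "code"])
args = buggy.parse_args([])                # flag omitted: default passes through
assert args.rerun_triggers == "mtime"      # a str, not a list
assert list(args.rerun_triggers) == ["m", "t", "i", "m", "e"]  # iteration goes wrong

# Fixed default: a one-element list
fixed = argparse.ArgumentParser()
fixed.add_argument("--rerun-triggers", dest="rerun_triggers",
                   nargs="*", default=["mtime"],
                   choices=["mtime", "params", "input", "software-env", "code"])
assert fixed.parse_args([]).rerun_triggers == ["mtime"]  # what downstream code expects
```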
@@ -286,32 +313,43 @@ def main():

qc_group.add_argument(
'-r', '--reference-filter', '--reference_filter',
help='Reference filter file to aid in the assembly',
help='One or more reference filter files to aid in the assembly. Remove contaminant reads from the assembly.',
dest="reference_filter",
default='none'
nargs='*',
default=['none']
)

qc_group.add_argument(
'--min-read-size', '--min_read_size',
help='Minimum long read size when filtering using Filtlong',
dest="min_read_size",
default=250
default=100
)

qc_group.add_argument(
'--min-mean-q', '--min_mean_q',
help='Minimum mean quality threshold',
dest="min_mean_q",
default=50
default=10
)

qc_group.add_argument(
'--keep-percent', '--keep_percent',
help='Percentage of reads passing quality thresholds kept by filtlong',
help='DEPRECATED: Percentage of reads passing quality thresholds kept by filtlong',
dest="keep_percent",
default=100
)

qc_group.add_argument(
'--skip-qc', '--skip_qc',
help='Skip quality control steps',
type=str2bool,
nargs='?',
const=True,
dest="skip_qc",
default=False
)


####################################################################

@@ -322,10 +360,9 @@ def main():
read_group_exclusive.add_argument(
'-1', '--pe-1', '--paired-reads-1', '--paired_reads_1', '--pe1',
help='A space separated list of forwards read files \n'
'NOTE: If performing assembly and multiple files and longreads \n'
' are provided then only the first file will be used for assembly. \n'
'NOTE: If performing assembly and multiple files are provided then only the first file will be used for assembly. \n'
' If no longreads are provided then all samples will be co-assembled \n'
' with megahit or metaspades depending on the --coassemble parameter\n',
' with megahit or metaspades depending on the --coassemble parameter',
dest='pe1',
nargs='*',
default="none"
@@ -334,8 +371,7 @@ def main():
short_read_group.add_argument(
'-2', '--pe-2', '--paired-reads-2', '--paired_reads_2', '--pe2',
help='A space separated list of reverse read files \n'
'NOTE: If performing assembly and multiple files and longreads \n'
' are provided then only the first file will be used for assembly. \n'
'NOTE: If performing assembly and multiple files are provided then only the first file will be used for assembly. \n'
' If no longreads are provided then all samples will be co-assembled \n'
' with megahit or metaspades depending on the --coassemble parameter',
dest='pe2',
@@ -346,8 +382,7 @@ def main():
read_group_exclusive.add_argument(
'-i','--interleaved',
help='A space separated list of interleaved read files \n'
'NOTE: If performing assembly and multiple files and longreads \n'
' are provided then only the first file will be used for assembly. \n'
'NOTE: If performing assembly and multiple files are provided then only the first file will be used for assembly. \n'
' If no longreads are provided then all samples will be co-assembled \n'
' with megahit or metaspades depending on the --coassemble parameter',
dest='interleaved',
@@ -358,8 +393,7 @@ def main():
read_group_exclusive.add_argument(
'-c', '--coupled',
help='Forward and reverse read files in a coupled space separated list. \n'
'NOTE: If performing assembly and multiple files and longreads \n'
' are provided then only the first file will be used for assembly. \n'
'NOTE: If performing assembly and multiple files are provided then only the first file will be used for assembly. \n'
' If no longreads are provided then all samples will be co-assembled \n'
' with megahit or metaspades depending on the --coassemble parameter',
dest='coupled',
@@ -382,8 +416,7 @@ def main():
long_read_group.add_argument(
'-l', '--longreads', '--long-reads', '--long_reads',
help='A space separated list of long-read read files. '
'NOTE: If performing assembly and multiple long read files are provided, \n'
' then only the first file is used for assembly. This behaviour might change in future.',
'NOTE: The first file will be used for assembly unless --coassemble is set to True. Then all files will be used.',
dest='longreads',
nargs='*',
default="none"
@@ -677,7 +710,7 @@ def main():
nargs='?',
const=True,
dest='coassemble',
default=True,
default=False,
)

assemble_group.add_argument(
@@ -1182,7 +1215,10 @@ def main():
dryrun=args.dryrun,
clean=args.clean,
conda_frontend=args.conda_frontend,
snakemake_args=args.cmds)
snakemake_args=args.cmds,
rerun_triggers=args.rerun_triggers,
profile=args.snakemake_profile,
cluster_retries=args.cluster_retries)
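
`run_workflow` itself is not shown in this diff; the following is a hedged sketch of how these new keywords could plausibly map onto the Snakemake 7 CLI (`--profile`, `--retries`, `--rerun-triggers`) — the function shape and flag mapping are assumptions for illustration:

```python
def build_snakemake_cmd(cores, rerun_triggers=None, profile="", cluster_retries=0,
                        snakemake_args=""):
    # Hypothetical translation layer; aviary's run_workflow may assemble
    # the command differently.
    cmd = ["snakemake", "--cores", str(cores), "--use-conda", "--keep-going"]
    if rerun_triggers:
        cmd += ["--rerun-triggers", *rerun_triggers]
    if profile:                       # default "" means "no profile configured"
        cmd += ["--profile", profile]
        if int(cluster_retries):      # only meaningful under cluster submission
            cmd += ["--retries", str(cluster_retries)]
    if snakemake_args.strip():        # aviary defaults this to " "
        cmd += snakemake_args.split()
    return cmd
```

For example, `build_snakemake_cmd(32, rerun_triggers=["mtime"], profile="slurm", cluster_retries=3)` would append `--profile slurm --retries 3`, while the defaults leave both flags off.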
else:
process_batch(args, prefix)

2 changes: 2 additions & 0 deletions aviary/envs/coverm.yaml
@@ -4,6 +4,8 @@ channels:
dependencies:
- coverm >= 0.6
- galah >= 0.3
- chopper >= 0.6
- pigz
- parallel
- dashing
- fastani
4 changes: 2 additions & 2 deletions aviary/modules/Snakefile
@@ -1,5 +1,5 @@
ruleorder: skip_long_assembly > get_reads_list_ref > link_reads > short_only
ruleorder: filtlong_no_reference > link_reads
# ruleorder: skip_long_assembly > get_reads_list_ref > link_reads > short_only
# ruleorder: filtlong_no_reference > link_reads

onsuccess:
print("Aviary finished, no error")
39 changes: 27 additions & 12 deletions aviary/modules/annotation/annotation.smk
@@ -1,3 +1,4 @@
localrules: download_databases, download_eggnog_db, download_gtdb, download_checkm2, annotate

onstart:
import os
@@ -133,15 +134,22 @@ rule checkm2:
mag_extension = config['mag_extension'],
checkm2_db_path = config["checkm2_db_folder"]
threads:
config["max_threads"]
min(config["max_threads"], 16)
resources:
mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 128*1024*attempt),
runtime = lambda wildcards, attempt: 8*60*attempt,
gpus = 1 if config["request_gpu"] else 0
log:
'logs/checkm2.log'
benchmark:
'benchmarks/checkm2.benchmark.txt'
conda:
"../../envs/checkm2.yaml"
shell:
'export CHECKM2DB={params.checkm2_db_path}/uniref100.KO.1.dmnd; '
'echo "Using CheckM2 database $CHECKM2DB"; '
'echo "Using CheckM2 database $CHECKM2DB" > {log}; '
'checkm2 predict -i {input.mag_folder}/ -x {params.mag_extension} -o {output.checkm2_folder} -t {threads} --force'
'>> {log} 2>&1 '
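
The retry-scaled resource callables above are worth unpacking: Snakemake passes an incrementing `attempt` each time a failed job is retried, so memory and runtime grow linearly until memory hits the configured cap. A standalone sketch of the arithmetic (the 500 GB `max_memory` value is an assumed example, not a documented default):

```python
MAX_MEMORY_GB = 500  # assumed stand-in for config["max_memory"]

def checkm2_mem_mb(attempt):
    # Mirrors: min(int(config["max_memory"])*1024, 128*1024*attempt)
    return min(MAX_MEMORY_GB * 1024, 128 * 1024 * attempt)

def checkm2_runtime(attempt):
    # Mirrors: 8*60*attempt (minutes), i.e. 8 h, then 16 h, then 24 h, ...
    return 8 * 60 * attempt

for attempt in range(1, 5):
    print(attempt, checkm2_mem_mb(attempt), checkm2_runtime(attempt))
# attempts 1-3 request 128/256/384 GB; attempt 4 hits the 512000 MB cap
```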

rule eggnog:
input:
@@ -151,13 +159,15 @@ rule eggnog:
mag_extension = config['mag_extension'],
eggnog_db = config['eggnog_folder'],
tmpdir = config["tmpdir"]
resources:
mem_mb=int(config["max_memory"])*512
group: 'annotation'
output:
done = 'data/eggnog/done'
threads:
config['max_threads']
min(config["max_threads"], 64)
resources:
mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 512*1024*attempt),
runtime = lambda wildcards, attempt: 24*60*attempt,
log:
'logs/eggnog.log'
benchmark:
'benchmarks/eggnog.benchmark.txt'
conda:
@@ -167,31 +177,36 @@ rule gtdbtk:
'mkdir -p data/eggnog/; '
'find {input.mag_folder}/*.{params.mag_extension} | parallel -j1 \'emapper.py --data_dir {params.eggnog_db} '
'--dmnd_db {params.eggnog_db}/*dmnd --cpu {threads} -m diamond --itype genome --genepred prodigal -i {{}} '
'--output_dir data/eggnog/ --temp_dir {params.tmpdir} -o {{/.}} || echo "Genome already annotated"\'; '
'--output_dir data/eggnog/ --temp_dir {params.tmpdir} -o {{/.}} || echo "Genome already annotated"\' '
'> {log} 2>&1; '
'touch data/eggnog/done; '

rule gtdbtk:
input:
mag_folder = config['mag_directory']
group: 'annotation'
output:
done = "data/gtdbtk/done"
params:
gtdbtk_folder = config['gtdbtk_folder'],
pplacer_threads = config["pplacer_threads"],
extension = config['mag_extension']
resources:
mem_mb=int(config["max_memory"])*1024
conda:
"../../envs/gtdbtk.yaml"
threads:
config["max_threads"]
min(config["max_threads"], 32)
resources:
mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 256*1024*attempt),
runtime = lambda wildcards, attempt: 12*60*attempt,
log:
'logs/gtdbtk.log'
benchmark:
'benchmarks/gtdbtk.benchmark.txt'
shell:
"export GTDBTK_DATA_PATH={params.gtdbtk_folder} && "
"gtdbtk classify_wf --skip_ani_screen --cpus {threads} --pplacer_cpus {params.pplacer_threads} --extension {params.extension} "
"--genome_dir {input.mag_folder} --out_dir data/gtdbtk && touch data/gtdbtk/done"
"--genome_dir {input.mag_folder} --out_dir data/gtdbtk "
"> {log} 2>&1 "
"&& touch data/gtdbtk/done"

rule annotate:
input: