Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding binning-only flag, simplify recipes #192

Merged
merged 20 commits into from
Mar 12, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion aviary/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.8.3"
__version__ = "0.9.0"
51 changes: 47 additions & 4 deletions aviary/aviary.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,23 +598,66 @@ def main():
default=3
)

binning_group.add_argument(
    '--extra-binners', '--extra_binners', '--extra-binner', '--extra_binner',
    # NOTE: adjacent string literals concatenate; each line needs its own
    # trailing separator or the words run together in --help output.
    help='Optional list of extra binning algorithms to run. Can be any combination of: \n'
         'maxbin, maxbin2, concoct \n'
         'These binners are skipped by default as they can have long runtimes. \n'
         'Capitals will be auto-corrected. N.B. specifying "maxbin" and "maxbin2" are equivalent \n',
    dest='extra_binners',
    nargs='*',
    choices=["maxbin", "maxbin2", "concoct"]
)

binning_group.add_argument(
'--skip-binners', '--skip_binners', '--skip_binner', '--skip-binner',
help='Optional list of binning algorithms to skip. Can be any combination of: \n'
'rosella, semibin, metabat1, metabat2, metabat, vamb, concoct, maxbin2, maxbin \n'
'rosella, semibin, metabat1, metabat2, metabat, vamb \n'
'Capitals will be auto-corrected. N.B. specifying "metabat" will skip both \n'
'MetaBAT1 and MetaBAT2.',
dest='skip_binners',
nargs='*'
# default=["maxbin2"]
nargs='*',
choices=["rosella", "semibin", "metabat1", "metabat2", "metabat", "vamb"]
)

binning_group.add_argument(
'--binning-only', '--binning_only',
help='Only run up to the binning stage. Do not run SingleM, GTDB-tk, or CoverM',
type=str2bool,
nargs='?',
const=True,
dest='binning_only',
default=False,
)

binning_group.add_argument(
'--skip-abundances', '--skip_abundances',
help='Skip CoverM post-binning abundance calculations.',
dest='skip_abundances',
type=str2bool,
nargs='?',
const=True,
default=False,
)

binning_group.add_argument(
'--skip-taxonomy', '--skip_taxonomy',
help='Skip GTDB-tk post-binning taxonomy assignment.',
dest='skip_taxonomy',
type=str2bool,
nargs='?',
const=True,
default=False,
)

binning_group.add_argument(
    '--skip-singlem', '--skip_singlem',
    help='Skip SingleM post-binning recovery assessment.',
    dest='skip_singlem',
    # Same optional-boolean pattern as --skip-abundances / --skip-taxonomy.
    # Do NOT also pass action="store_true": argparse raises TypeError when
    # a store_true action is combined with type/nargs/const.
    type=str2bool,
    nargs='?',
    const=True,
    default=False,
)

####################################################################
Expand Down
44 changes: 16 additions & 28 deletions aviary/modules/binning/binning.smk
Original file line number Diff line number Diff line change
Expand Up @@ -695,8 +695,8 @@ rule finalise_stats:
input:
checkm1_done = "bins/checkm.out",
checkm2_done = "bins/checkm2_output/quality_report.tsv",
coverage_file = "data/coverm_abundances.tsv" if not config["skip_abundances"] else [],
gtdbtk_done = "data/gtdbtk/done"
coverage_file = "data/coverm_abundances.tsv" if not (config["skip_abundances"] or config["binning_only"]) else [],
gtdbtk_done = "data/gtdbtk/done" if not (config["binning_only"] or config["skip_taxonomy"]) else []
AroneyS marked this conversation as resolved.
Show resolved Hide resolved
output:
bin_stats = "bins/bin_info.tsv",
checkm_minimal = "bins/checkm_minimal.tsv"
Expand Down Expand Up @@ -746,7 +746,7 @@ rule singlem_pipe_reads:
rule singlem_appraise:
input:
metagenome = "data/singlem_out/metagenome.combined_otu_table.csv",
gtdbtk_done = "data/gtdbtk/done",
# gtdbtk_done = "data/gtdbtk/done",
bins_complete = "bins/checkm.out"
output:
"data/singlem_out/singlem_appraisal.tsv"
Expand Down Expand Up @@ -775,46 +775,34 @@ rule singlem_appraise:
rule recover_mags:
input:
final_bins = "bins/bin_info.tsv",
gtdbtk = "data/gtdbtk/done",
coverm = "data/coverm_abundances.tsv" if not config["skip_abundances"] else [],
singlem = "data/singlem_out/singlem_appraisal.tsv"
gtdbtk = "data/gtdbtk/done" if not (config["binning_only"] or config["skip_taxonomy"]) else [],
coverm = "data/coverm_abundances.tsv" if not (config["skip_abundances"] or config["binning_only"]) else [],
contig_coverage = "data/coverm.cov",
singlem = "data/singlem_out/singlem_appraisal.tsv" if not (config["skip_singlem"] or config["binning_only"]) else [],
conda:
"../../envs/coverm.yaml"
output:
bins = "bins/done",
diversity = 'diversity/done'
bins = "bins/done"
threads:
config["max_threads"]
shell:
"cd bins/; "
"ln -s ../data/coverm_abundances.tsv ./; "
"ln -s ../data/coverm.cov ./; "
"cd ../; "
"ln -sr data/singlem_out/ diversity || echo 'SingleM linked'; "
"ln -sr data/gtdbtk taxonomy || echo 'GTDB-tk linked'; "
"touch bins/done; "
"touch diversity/done; "
"rm -f data/binning_bams/*bam; "
"rm -f data/binning_bams/*bai; "
script:
"scripts/finalise_recover.py"

rule recover_mags_no_singlem:
AroneyS marked this conversation as resolved.
Show resolved Hide resolved
input:
final_bins = "bins/bin_info.tsv",
coverm = "data/coverm_abundances.tsv" if not config["skip_abundances"] else [],
gtdbtk = [],
coverm = "data/coverm_abundances.tsv" if not (config["skip_abundances"] or config["binning_only"]) else [],
contig_coverage = "data/coverm.cov",
singlem = [],
conda:
"../../envs/coverm.yaml"
output:
bins = "bins/done",
threads:
config["max_threads"]
shell:
"cd bins/; "
"ln -s ../data/coverm_abundances.tsv ./; "
"ln -s ../data/coverm.cov ./; "
"cd ../; "
"touch bins/done; "
"rm -f data/binning_bams/*bam; "
"rm -f data/binning_bams/*bai; "
script:
"scripts/finalise_stats.py"

# Special rule to help out with a buggy output
rule dereplicate_and_get_abundances_paired:
Expand Down
27 changes: 2 additions & 25 deletions aviary/modules/binning/envs/rosella.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,8 @@ channels:
- conda-forge
- numba
- bioconda
- defaults
dependencies:
- python >= 3.8, <= 3.10
- gcc
- cxx-compiler
- rosella >= 0.5.2
- numba >= 0.53, <= 0.57
- numpy <= 1.24
- joblib >= 1.1.0, <= 1.3
- scikit-bio >= 0.5.7
- umap-learn >= 0.5.3
- scipy <= 1.11
- pandas >= 1.3
- pynndescent >= 0.5.7
- hdbscan >= 0.8.28
- scikit-learn >= 1.0.2, <= 1.1
- flight-genome >= 1.6.1
- rosella >= 0.5.3
- flight-genome >= 1.6.3
- coverm >= 0.6.1
- seaborn
- imageio
- matplotlib
- tqdm
- tbb
- joblib
- pebble
- threadpoolctl
- biopython
- checkm-genome==1.1.3
29 changes: 29 additions & 0 deletions aviary/modules/binning/scripts/finalise_recovery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Finalise the MAG recovery stage.

Symlinks the optional post-binning outputs (abundances, contig coverage,
GTDB-tk taxonomy, SingleM diversity) into their conventional locations,
marks the binning stage done, and removes the intermediate BAM/BAI files.
Inputs for skipped stages (e.g. via --binning-only) arrive from Snakemake
as empty lists and are ignored.
"""
import os
from pathlib import Path
import glob


def _symlink(target, link_name):
    """Best-effort symlink: tolerate an existing link so re-runs succeed
    (mirrors the former shell rule's `ln -s ... || echo` behaviour)."""
    try:
        os.symlink(target, link_name)
    except FileExistsError:
        pass


if __name__ == '__main__':
    final_bins = snakemake.input.final_bins
    coverage_file = snakemake.input.coverm
    contig_coverage = snakemake.input.contig_coverage
    gtdbtk = snakemake.input.gtdbtk
    singlem = snakemake.input.singlem

    # Optional inputs are empty lists when the corresponding stage was skipped,
    # so len(...) > 0 works for both list and plain-string inputs.
    if len(coverage_file) > 0:
        # The symlink destination must be the new link's name — passing a bare
        # "./" raises FileExistsError. Link the file into bins/ under its own
        # basename, with a ../-relative target as the former shell rule did.
        _symlink(os.path.join("..", str(coverage_file)),
                 os.path.join("bins", os.path.basename(str(coverage_file))))

    if len(contig_coverage) > 0:
        _symlink(os.path.join("..", str(contig_coverage)),
                 os.path.join("bins", os.path.basename(str(contig_coverage))))

    if len(gtdbtk) > 0:
        # gtdbtk is the data/gtdbtk/done sentinel; the taxonomy results live in
        # its parent directory (shell rule: `ln -sr data/gtdbtk taxonomy`).
        _symlink(os.path.dirname(str(gtdbtk)), "taxonomy")
    if len(singlem) > 0:
        # singlem is data/singlem_out/singlem_appraisal.tsv; link the directory
        # (shell rule: `ln -sr data/singlem_out/ diversity`).
        _symlink(os.path.dirname(str(singlem)), "diversity")

    Path('bins/done').touch()

    # Drop the large intermediate mapping files (*.bam, *.bai).
    for f in glob.glob('data/binning_bams/*.ba*'):
        os.remove(f)
9 changes: 6 additions & 3 deletions aviary/modules/binning/scripts/finalise_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,12 @@ def get_taxonomy(rename_columns="Bin Id"):
taxa.append(df_arc)
except (FileNotFoundError, IndexError) as e:
pass

taxa = pd.concat(taxa)
taxa.rename({'user_genome' : rename_columns}, inplace=True, axis=1)

try:
taxa = pd.concat(taxa)
taxa.rename({'user_genome' : rename_columns}, inplace=True, axis=1)
except ValueError:
taxa = pd.DataFrame(columns=[rename_columns])
return taxa


Expand Down
28 changes: 24 additions & 4 deletions aviary/modules/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,16 +112,32 @@ def __init__(self,
self.refinery_max_iterations = args.refinery_max_iterations
self.refinery_max_retries = args.refinery_max_retries
self.skip_abundances = args.skip_abundances
self.skip_taxonomy = args.skip_taxonomy
self.skip_singlem = args.skip_singlem
if args.binning_only:
self.skip_abundances = True
self.skip_taxonomy = True
self.skip_singlem = True
self.binning_only = args.binning_only
AroneyS marked this conversation as resolved.
Show resolved Hide resolved

self.skip_binners = ["maxbin2", "concoct"]
if args.extra_binners:
for binner in args.extra_binners:
binner = binner.lower()
if binner == "maxbin" or binner == "maxbin2":
self.skip_binners.remove("maxbin2")
elif binner == "concoct":
self.skip_binners.remove("concoct")
else:
logging.warning(f"Unknown extra binner {binner} specified. Skipping...")
AroneyS marked this conversation as resolved.
Show resolved Hide resolved

self.skip_binners = []
if args.skip_binners:
for binner in args.skip_binners:
binner = binner.lower()
if binner == "metabat":
self.skip_binners.extend(["metabat_sens", "metabat_ssens", "metabat_spec", "metabat_sspec", "metabat2"])
elif binner == "metabat1":
self.skip_binners.extend(["metabat_sens", "metabat_ssens", "metabat_spec", "metabat_sspec"])
elif binner == "maxbin":
self.skip_binners.append("maxbin2")
else:
self.skip_binners.append(binner)

Expand All @@ -133,6 +149,7 @@ def __init__(self,
self.refinery_max_retries = 3
self.skip_binners = ["none"]
self.skip_abundances = False
self.binning_only = False

try:
self.assembly = args.assembly
Expand Down Expand Up @@ -354,6 +371,9 @@ def make_config(self):
conf["gsa_mappings"] = self.gsa_mappings
conf["skip_binners"] = self.skip_binners
conf["skip_abundances"] = self.skip_abundances
conf["skip_taxonomy"] = self.skip_taxonomy
conf["skip_singlem"] = self.skip_singlem
conf["binning_only"] = self.binning_only
conf["semibin_model"] = self.semibin_model
conf["refinery_max_iterations"] = self.refinery_max_iterations
conf["refinery_max_retries"] = self.refinery_max_retries
Expand Down Expand Up @@ -604,4 +624,4 @@ def fraction_to_percent(val):
val = float(val)
if val <= 1:
return val * 100
return val
return val
59 changes: 44 additions & 15 deletions aviary/modules/quality_control/scripts/qc_short_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,20 +87,49 @@ def combine_reads(

# we've got to concatenate the files together
if coassemble:
# reads 1 first
for reads_1, reads_2 in zip(short_reads_1, short_reads_2):
if reads_1 == "none" or reads_2 == "none":
continue

if not os.path.exists(reads_1):
logf.write(f"Short read file {reads_1} does not exist\n")
exit(1)

if not os.path.exists(reads_2):
logf.write(f"Short read file {reads_2} does not exist\n")
exit(1)
if "none" not in short_reads_2 and "none" not in short_reads_1:
for reads_1, reads_2 in zip(short_reads_1, short_reads_2):
if reads_1 == "none" or reads_2 == "none":
continue

if not os.path.exists(reads_1):
logf.write(f"Short read file {reads_1} does not exist\n")
exit(1)

if not os.path.exists(reads_2):
logf.write(f"Short read file {reads_2} does not exist\n")
exit(1)

setup_interleave(reads_1, reads_2, output_fastq, logf)
elif "none" not in short_reads_1:

setup_interleave(reads_1, reads_2, output_fastq, logf)
gzip_output = False
if not output_fastq.endswith(".gz"):
output_fastq = output_fastq + '.gz'

for reads_1 in short_reads_1:
if reads_1 == "none":
continue

if not os.path.exists(reads_1):
logf.write(f"Short read file {reads_1} does not exist\n")
exit(1)

cat_reads(reads_1, output_fastq, threads, log_file)
elif "none" not in short_reads_2:
gzip_output = False
if not output_fastq.endswith(".gz"):
output_fastq = output_fastq + '.gz'

for reads_2 in short_reads_2:
if reads_2 == "none":
continue

if not os.path.exists(reads_2):
logf.write(f"Short read file {reads_2} does not exist\n")
exit(1)

cat_reads(reads_2, output_fastq, threads, log_file)


else:
Expand Down Expand Up @@ -287,13 +316,13 @@ def filter_illumina_reference(
# run fastp for quality control of reads
se1_string = None
if "none" not in short_reads_1:
combine_reads(short_reads_1, ["none"], "data/short_reads.pre_qc.1.fastq.gz", coassemble, log, threads)
se1_string = "data/short_reads.pre_qc.1.fastq.gz"
combine_reads(short_reads_1, ["none"], se1_string, coassemble, log, threads)

se2_string = None
if "none" not in short_reads_2:
combine_reads(["none"], short_reads_2, "data/short_reads.pre_qc.2.fastq.gz", coassemble, log, threads)
se2_string = "data/short_reads.pre_qc.2.fastq.gz"
combine_reads(["none"], short_reads_2, se2_string, coassemble, log, threads)

run_fastp(
reads_1=se1_string,
Expand Down
4 changes: 2 additions & 2 deletions test/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,8 @@ def test_short_read_recovery_fast(self):
f"-o {output_dir}/aviary_out "
f"-1 {data}/wgsim.1.fq.gz "
f"-2 {data}/wgsim.2.fq.gz "
f"--skip-abundances "
f"--skip-binners concoct rosella vamb maxbin2 metabat "
f"--binning-only "
f"--skip-binners rosella vamb metabat "
f"--skip-qc "
f"--refinery-max-iterations 0 "
f"--conda-prefix {path_to_conda} "
Expand Down
Loading
Loading