From b17aaf1821901e2dab45f9f69e4480bd0da6fb66 Mon Sep 17 00:00:00 2001 From: Ben Woodcroft Date: Mon, 11 Dec 2023 11:06:41 +1000 Subject: [PATCH 1/5] README: Streamline, improve. --- README.md | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 127d25a7..c8391c44 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,12 @@ conda env create -n aviary -f aviary.yml conda activate aviary pip install -e . ``` +The `aviary` executable can then be run from any directory. Since the code in +this directory is then used for running, any updates made there will be +immediately available. We recommend this mode for developing and debugging +aviary. +## Checking installation Whatever option you choose, running `aviary --help` should return the following output: @@ -86,22 +91,6 @@ Utility modules: ``` -Upon first running aviary you will be prompted to input the location for where you would like -your conda environments to be stored, the GTDB release installed on your system, the location of your -EnrichM database, and the location of your BUSCO database. These locations will be stored as environment -variables, but for aviary to be able to use those environment variables you will have to either source your .bashrc -or reactivate your conda environment depending on whether you installed aviary within a conda environment or not: - -``` -conda deactivate; conda activate aviary - -OR - -source ~/.bashrc -``` - -These environment variables can be reset using `aviary configure` - ## Databases Aviary uses programs which require access to locally stored databases. @@ -111,7 +100,7 @@ The **required** databases are as follows: * [GTDB](https://gtdb.ecogenomic.org/downloads) * [EggNog](https://github.com/eggnogdb/eggnog-mapper/wiki/eggNOG-mapper-v2.1.5-to-v2.1.8#setup) * [CheckM2](https://github.com/chklovski/CheckM2) - +* [SingleM](https://wwood.github.io/singlem/) ### Installing databases From 18bbcacbf863ece01763e1fe134abd74d429f222 Mon Sep 17 00:00:00 2001 From: Ben Woodcroft Date: Mon, 11 Dec 2023 11:21:53 +1000 Subject: [PATCH 2/5] aviary: Improve and refactor help. --- aviary/aviary.py | 95 +++++++++++++----------------------------------- 1 file changed, 26 insertions(+), 69 deletions(-) mode change 100644 => 100755 aviary/aviary.py diff --git a/aviary/aviary.py b/aviary/aviary.py old mode 100644 new mode 100755 index 6d8a4f99..56e42eff --- a/aviary/aviary.py +++ b/aviary/aviary.py @@ -100,6 +100,17 @@ def str2bool(v): else: raise argparse.ArgumentTypeError('Boolean value expected.') +def add_workflow_arg(parser, default, help=None): + if help is None: + help = 'Main workflow to run. This is the snakemake target rule to run.' + parser.add_argument( + '-w', '--workflow', + help=help, + dest='workflow', + nargs="+", + default=default, + ) + def main(): if len(sys.argv) == 1 or sys.argv[1] == '-h' or sys.argv[1] == '--help': phelp() @@ -832,13 +843,7 @@ def main(): - assemble_options.add_argument( - '-w', '--workflow', - help='Main workflow to run', - dest='workflow', - nargs="+", - default=['complete_assembly_with_qc'], - ) + add_workflow_arg(assemble_options, ['complete_assembly_with_qc']) ########################## ~ RECOVER ~ ########################### @@ -862,13 +867,7 @@ def main(): required=False, ) - recover_options.add_argument( - '-w', '--workflow', - help='Main workflow to run', - dest='workflow', - nargs="+", - default=['recover_mags'], - ) + add_workflow_arg(recover_options, ['recover_mags']) recover_options.add_argument( '--perform-strain-analysis', '--perform_strain_analysis', @@ -902,13 +901,7 @@ def main(): required=False, ) - annotate_options.add_argument( - '-w', '--workflow', - help='Main workflow to run', - dest='workflow', - nargs="+", - default=['annotate'], - ) + add_workflow_arg(annotate_options, ['annotate']) ########################## ~ diversity ~ ########################### @@ -933,13 +926,7 @@ def main(): required=False, ) - diversity_options.add_argument( - '-w', '--workflow', - help='Main workflow to run', - dest='workflow', - nargs="+", - default=['lorikeet'], - ) + add_workflow_arg(diversity_options, ['lorikeet']) diversity_options.add_argument( '--perform-strain-analysis', '--perform_strain_analysis', @@ -975,13 +962,7 @@ def main(): required=True, ) - cluster_options.add_argument( - '-w', '--workflow', - help='Main workflow to run', - dest='workflow', - nargs="+", - default=['complete_cluster'], - ) + add_workflow_arg(cluster_options, ['complete_cluster']) ########################## ~ VIRAL ~ ########################### @@ -997,13 +978,7 @@ def main(): ''') - viral_options.add_argument( - '-w', '--workflow', - help='Main workflow to run', - dest='workflow', - nargs="+", - default=['create_webpage_genotype'], - ) + add_workflow_arg(viral_options, ['create_webpage_genotype']) ########################## ~ COMPLETE ~ ########################### @@ -1028,13 +1003,7 @@ def main(): required=False, ) - complete_options.add_argument( - '-w', '--workflow', - help='Main workflow to run', - dest='workflow', - nargs="+", - default=['get_bam_indices', 'recover_mags', 'annotate', 'lorikeet'], - ) + add_workflow_arg(complete_options, ['get_bam_indices', 'recover_mags', 'annotate', 'lorikeet']) ########################## ~ ISOLATE ~ ########################### @@ -1050,13 +1019,7 @@ def main(): ''') - isolate_options.add_argument( - '-w', '--workflow', - help='Main workflows to run', - dest='workflow', - nargs="+", - default=['circlator'], - ) + add_workflow_arg(isolate_options, ['circlator']) ########################## ~ BATCH ~ ########################### @@ -1131,12 +1094,10 @@ def main(): default='95' ) - batch_options.add_argument( - '-w', '--workflow', - help='Main workflow to run for each sample', - dest='workflow', - nargs="+", - default=['get_bam_indices', 'recover_mags', 'annotate', 'lorikeet'], + add_workflow_arg( + batch_options, + ['get_bam_indices', 'recover_mags', 'annotate', 'lorikeet'], + help='Main workflow (snakemake target rule) to run for each sample' ) ########################## ~ configure ~ ########################### @@ -1188,13 +1149,7 @@ def main(): required=False, ) - configure_options.add_argument( - '-w', '--workflow', - help=argparse.SUPPRESS, - dest='workflow', - nargs="+", - default=['download_databases'], - ) + add_workflow_arg(configure_options, ['download_databases'], help=argparse.SUPPRESS) ########################################################################### # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # @@ -1303,6 +1258,8 @@ def manage_env_vars(args): args.eggnog_db_path = Config.get_software_db_path('EGGNOG_DATA_DIR', '--eggnog-db-path') if args.checkm2_db_path is None: args.checkm2_db_path = Config.get_software_db_path('CHECKM2DB', '--checkm2-db-path') + if args.singlem_metapackage_path is None: + args.singlem_db_path = Config.get_software_db_path('SINGLEM_METAPACKAGE_PATH', '--singlem-metapackage-path') except AttributeError: pass From c9371d1240e3c68e2dd1fa5d68fe3c212c953568 Mon Sep 17 00:00:00 2001 From: Ben J Woodcroft Date: Fri, 15 Dec 2023 09:26:55 +1000 Subject: [PATCH 3/5] das_tool: Use script not shell. (#187) * das_tool: Use script not shell. * das_tool: Do not croak when there is only 1 binner. --- aviary/modules/binning/binning.smk | 23 ++------- aviary/modules/binning/envs/das_tool.yaml | 1 + aviary/modules/binning/scripts/das_tool.py | 58 ++++++++++++++++++++++ test/test_integration.py | 2 +- 4 files changed, 63 insertions(+), 21 deletions(-) create mode 100644 aviary/modules/binning/scripts/das_tool.py diff --git a/aviary/modules/binning/binning.smk b/aviary/modules/binning/binning.smk index 9bbdfe1d..eb514c38 100644 --- a/aviary/modules/binning/binning.smk +++ b/aviary/modules/binning/binning.smk @@ -629,32 +629,15 @@ rule das_tool: mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 512*1024*attempt), runtime = lambda wildcards, attempt: 12*60*attempt, output: - das_tool_done = "data/das_tool_bins_pre_refine/done" + touch("data/das_tool_bins_pre_refine/done") conda: "envs/das_tool.yaml" log: "logs/das_tool.log" benchmark: "benchmarks/das_tool.benchmark.txt" - shell: - """ - Fasta_to_Scaffolds2Bin.sh -i data/metabat_bins_sspec -e fa > data/metabat_bins_sspec.tsv 2> {log}; - Fasta_to_Scaffolds2Bin.sh -i data/metabat_bins_ssens -e fa > data/metabat_bins_ssens.tsv 2>> {log}; - Fasta_to_Scaffolds2Bin.sh -i data/metabat_bins_sens -e fa > data/metabat_bins_sens.tsv 2>> {log}; - Fasta_to_Scaffolds2Bin.sh -i data/metabat_bins_spec -e fa > data/metabat_bins_spec.tsv 2>> {log}; - Fasta_to_Scaffolds2Bin.sh -i data/concoct_bins -e fa > data/concoct_bins.tsv 2>> {log}; - Fasta_to_Scaffolds2Bin.sh -i data/maxbin2_bins -e fasta > data/maxbin_bins.tsv 2>> {log}; - Fasta_to_Scaffolds2Bin.sh -i data/vamb_bins/bins -e fna > data/vamb_bins.tsv 2>> {log}; - Fasta_to_Scaffolds2Bin.sh -i data/rosella_refined/final_bins/ -e fna > data/rosella_refined_bins.tsv 2>> {log}; - Fasta_to_Scaffolds2Bin.sh -i data/metabat2_refined/final_bins/ -e fna > data/metabat2_refined_bins.tsv 2>> {log}; - Fasta_to_Scaffolds2Bin.sh -i data/semibin_refined/final_bins/ -e fna > data/semibin_refined_bins.tsv 2>> {log}; - scaffold2bin_files=$(find data/*bins*.tsv -not -empty -exec ls {{}} \; | tr "\n" ',' | sed "s/,$//g"); - DAS_Tool --search_engine diamond --write_bin_evals 1 --write_bins 1 -t {threads} --score_threshold -42 \ - -i $scaffold2bin_files \ - -c {input.fasta} \ - -o data/das_tool_bins_pre_refine/das_tool >> {log} 2>&1 && \ - touch data/das_tool_bins_pre_refine/done - """ + script: + "scripts/das_tool.py" rule refine_dastool: input: diff --git a/aviary/modules/binning/envs/das_tool.yaml b/aviary/modules/binning/envs/das_tool.yaml index bf5406ca..e9f4b628 100644 --- a/aviary/modules/binning/envs/das_tool.yaml +++ b/aviary/modules/binning/envs/das_tool.yaml @@ -3,3 +3,4 @@ channels: - bioconda dependencies: - das_tool = 1.1.2 + - extern = 0.4.1 diff --git a/aviary/modules/binning/scripts/das_tool.py b/aviary/modules/binning/scripts/das_tool.py new file mode 100644 index 00000000..1b4186a3 --- /dev/null +++ b/aviary/modules/binning/scripts/das_tool.py @@ -0,0 +1,58 @@ +import logging +import os +import sys + +import extern + +if __name__ == '__main__': + unrefined_binners_to_use = [ + ('concoct', 'fa'), + ('maxbin2', 'fasta'), + ('vamb', 'fna')] + refined_binners_to_use = [ + ('rosella', 'fna'), + ('semibin', 'fna')] + + # N.B. specifying "metabat" will skip both MetaBAT1 and MetaBAT2. + metabats = ['metabat_sspec', 'metabat_ssens', 'metabat_sens', 'metabat_spec'] + + binners = [] + for (binner, extension) in unrefined_binners_to_use: + if binner not in snakemake.config['skip_binners']: + binners.append((f'{binner}_bins/', extension, f'data/{binner}_bins.tsv')) + + for (binner, extension) in refined_binners_to_use: + if binner not in snakemake.config['skip_binners']: + binners.append((binner+'_refined/final_bins/', extension, f'data/{binner}_refined_bins.tsv')) + + for metabat in metabats: + if metabat not in snakemake.config['skip_binners']: + binners.append((metabat.replace('metabat','metabat_bins'), 'fa', f'data/{metabat}_bins.tsv')) + if 'metabat2' not in snakemake.config['skip_binners']: + binners.append(('metabat2_refined/final_bins/', 'fna', f'data/metabat2_refined_bins.tsv')) + + logfile = snakemake.log[0] + logging.basicConfig(filename=logfile, level=logging.INFO) + logging.info("Using the following binners: " + str(binners)) + + if len(binners) == 0: + logging.error("All binners have been skipped, so DAS_tool cannot be run.") + sys.exit(1) + + bin_definition_files = [] + for binner, extension, bin_definition_file in binners: + extern.run(f'Fasta_to_Scaffolds2Bin.sh -i data/{binner} -e {extension} >{bin_definition_file} 2>> {logfile}') + if os.path.getsize(bin_definition_file) == 0: + logging.warning(f'Bin definition file {bin_definition_file} is empty, suggesting that {binner} failed or did not not create any output bins.') + else: + bin_definition_files.append(bin_definition_file) + logging.info("Bin definition files created: " + str(bin_definition_files)) + + scaffold2bin_files = ','.join(bin_definition_files) + + das_tool_command = f'DAS_Tool --search_engine diamond --write_bin_evals 1 --write_bins 1 -t {snakemake.threads} --score_threshold -42 \ + -i {scaffold2bin_files} \ + -c {snakemake.input.fasta} \ + -o data/das_tool_bins_pre_refine/das_tool >> {logfile} 2>&1' + logging.info("Running DAS_Tool with command: " + das_tool_command) + extern.run(das_tool_command) diff --git a/test/test_integration.py b/test/test_integration.py index 93702429..4f868494 100755 --- a/test/test_integration.py +++ b/test/test_integration.py @@ -140,7 +140,7 @@ def test_short_read_recovery_fast(self): f"-1 {data}/wgsim.1.fq.gz " f"-2 {data}/wgsim.2.fq.gz " f"--skip-abundances " - f"--skip-binners concoct rosella vamb metabat maxbin " + f"--skip-binners concoct rosella vamb maxbin2 metabat " f"--skip-qc " f"--refinery-max-iterations 0 " f"--conda-prefix {path_to_conda} " From f89d76c0804e1481661a773510449aeade83cd45 Mon Sep 17 00:00:00 2001 From: AroneyS Date: Tue, 2 Jan 2024 15:44:41 +1000 Subject: [PATCH 4/5] loosen same-group permissions for folders and logs --- aviary/modules/Snakefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/aviary/modules/Snakefile b/aviary/modules/Snakefile index 96675c13..16d0c0bf 100644 --- a/aviary/modules/Snakefile +++ b/aviary/modules/Snakefile @@ -2,9 +2,11 @@ # ruleorder: filtlong_no_reference > link_reads onsuccess: + shell("chmod g+w {log}") print("Aviary finished, no error") onerror: + shell("chmod g+w {log}") print("An error occurred") onstart: @@ -44,6 +46,7 @@ onstart: from snakemake.utils import min_version import os min_version("6.0") +os.umask(0o002) module qc: snakefile: "quality_control/qc.smk" From 6450fc6b52853b7ba1802989ede6ba396a00ea1f Mon Sep 17 00:00:00 2001 From: Rhys Newell Date: Thu, 11 Jan 2024 18:17:24 +1100 Subject: [PATCH 5/5] fix: change binner to lower case to ensure safe checking --- aviary/modules/processor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/aviary/modules/processor.py b/aviary/modules/processor.py index 733049d3..93943dce 100644 --- a/aviary/modules/processor.py +++ b/aviary/modules/processor.py @@ -123,6 +123,7 @@ def __init__(self, self.skip_binners = ["maxbin2", "concoct"] if args.extra_binners: for binner in args.extra_binners: + binner = binner.lower() if binner == "maxbin" or binner == "maxbin2": self.skip_binners.remove("maxbin2") elif binner == "concoct": @@ -132,6 +133,7 @@ def __init__(self, if args.skip_binners: for binner in args.skip_binners: + binner = binner.lower() if binner == "metabat": self.skip_binners.extend(["metabat_sens", "metabat_ssens", "metabat_spec", "metabat_sspec", "metabat2"]) elif binner == "metabat1": @@ -622,4 +624,4 @@ def fraction_to_percent(val): val = float(val) if val <= 1: return val * 100 - return val \ No newline at end of file + return val