diff --git a/README.md b/README.md index 127d25a7..c8391c44 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,12 @@ conda env create -n aviary -f aviary.yml conda activate aviary pip install -e . ``` +The `aviary` executable can then be run from any directory. Since the code in +this directory is then used for running, any updates made there will be +immediately available. We recommend this mode for developing and debugging +aviary. +## Checking installation Whatever option you choose, running `aviary --help` should return the following output: @@ -86,22 +91,6 @@ Utility modules: ``` -Upon first running aviary you will be prompted to input the location for where you would like -your conda environments to be stored, the GTDB release installed on your system, the location of your -EnrichM database, and the location of your BUSCO database. These locations will be stored as environment -variables, but for aviary to be able to use those environment variables you will have to either source your .bashrc -or reactivate your conda environment depending on whether you installed aviary within a conda environment or not: - -``` -conda deactivate; conda activate aviary - -OR - -source ~/.bashrc -``` - -These environment variables can be reset using `aviary configure` - ## Databases Aviary uses programs which require access to locally stored databases. 
@@ -111,7 +100,7 @@ The **required** databases are as follows: * [GTDB](https://gtdb.ecogenomic.org/downloads) * [EggNog](https://github.com/eggnogdb/eggnog-mapper/wiki/eggNOG-mapper-v2.1.5-to-v2.1.8#setup) * [CheckM2](https://github.com/chklovski/CheckM2) - +* [SingleM](https://wwood.github.io/singlem/) ### Installing databases diff --git a/aviary/aviary.py b/aviary/aviary.py old mode 100644 new mode 100755 index d8610a26..bbdba814 --- a/aviary/aviary.py +++ b/aviary/aviary.py @@ -100,6 +100,17 @@ def str2bool(v): else: raise argparse.ArgumentTypeError('Boolean value expected.') +def add_workflow_arg(parser, default, help=None): + if help is None: + help = 'Main workflow to run. This is the snakemake target rule to run.' + parser.add_argument( + '-w', '--workflow', + help=help, + dest='workflow', + nargs="+", + default=default, + ) + def main(): if len(sys.argv) == 1 or sys.argv[1] == '-h' or sys.argv[1] == '--help': phelp() @@ -875,13 +886,7 @@ def main(): - assemble_options.add_argument( - '-w', '--workflow', - help='Main workflow to run', - dest='workflow', - nargs="+", - default=['complete_assembly_with_qc'], - ) + add_workflow_arg(assemble_options, ['complete_assembly_with_qc']) ########################## ~ RECOVER ~ ########################### @@ -905,13 +910,7 @@ def main(): required=False, ) - recover_options.add_argument( - '-w', '--workflow', - help='Main workflow to run', - dest='workflow', - nargs="+", - default=['recover_mags'], - ) + add_workflow_arg(recover_options, ['recover_mags']) recover_options.add_argument( '--perform-strain-analysis', '--perform_strain_analysis', @@ -945,13 +944,7 @@ def main(): required=False, ) - annotate_options.add_argument( - '-w', '--workflow', - help='Main workflow to run', - dest='workflow', - nargs="+", - default=['annotate'], - ) + add_workflow_arg(annotate_options, ['annotate']) ########################## ~ diversity ~ ########################### @@ -976,13 +969,7 @@ def main(): required=False, ) - 
diversity_options.add_argument( - '-w', '--workflow', - help='Main workflow to run', - dest='workflow', - nargs="+", - default=['lorikeet'], - ) + add_workflow_arg(diversity_options, ['lorikeet']) diversity_options.add_argument( '--perform-strain-analysis', '--perform_strain_analysis', @@ -1018,13 +1005,7 @@ def main(): required=True, ) - cluster_options.add_argument( - '-w', '--workflow', - help='Main workflow to run', - dest='workflow', - nargs="+", - default=['complete_cluster'], - ) + add_workflow_arg(cluster_options, ['complete_cluster']) ########################## ~ VIRAL ~ ########################### @@ -1040,13 +1021,7 @@ def main(): ''') - viral_options.add_argument( - '-w', '--workflow', - help='Main workflow to run', - dest='workflow', - nargs="+", - default=['create_webpage_genotype'], - ) + add_workflow_arg(viral_options, ['create_webpage_genotype']) ########################## ~ COMPLETE ~ ########################### @@ -1071,13 +1046,7 @@ def main(): required=False, ) - complete_options.add_argument( - '-w', '--workflow', - help='Main workflow to run', - dest='workflow', - nargs="+", - default=['get_bam_indices', 'recover_mags', 'annotate', 'lorikeet'], - ) + add_workflow_arg(complete_options, ['get_bam_indices', 'recover_mags', 'annotate', 'lorikeet']) ########################## ~ ISOLATE ~ ########################### @@ -1093,13 +1062,7 @@ def main(): ''') - isolate_options.add_argument( - '-w', '--workflow', - help='Main workflows to run', - dest='workflow', - nargs="+", - default=['circlator'], - ) + add_workflow_arg(isolate_options, ['circlator']) ########################## ~ BATCH ~ ########################### @@ -1174,12 +1137,10 @@ def main(): default='95' ) - batch_options.add_argument( - '-w', '--workflow', - help='Main workflow to run for each sample', - dest='workflow', - nargs="+", - default=['get_bam_indices', 'recover_mags', 'annotate', 'lorikeet'], + add_workflow_arg( + batch_options, + ['get_bam_indices', 'recover_mags', 'annotate', 
'lorikeet'], + help='Main workflow (snakemake target rule) to run for each sample' ) ########################## ~ configure ~ ########################### @@ -1231,13 +1192,7 @@ def main(): required=False, ) - configure_options.add_argument( - '-w', '--workflow', - help=argparse.SUPPRESS, - dest='workflow', - nargs="+", - default=['download_databases'], - ) + add_workflow_arg(configure_options, ['download_databases'], help=argparse.SUPPRESS) ########################################################################### # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # @@ -1346,6 +1301,8 @@ def manage_env_vars(args): args.eggnog_db_path = Config.get_software_db_path('EGGNOG_DATA_DIR', '--eggnog-db-path') if args.checkm2_db_path is None: args.checkm2_db_path = Config.get_software_db_path('CHECKM2DB', '--checkm2-db-path') + if args.singlem_metapackage_path is None: + args.singlem_metapackage_path = Config.get_software_db_path('SINGLEM_METAPACKAGE_PATH', '--singlem-metapackage-path') except AttributeError: pass diff --git a/aviary/modules/Snakefile b/aviary/modules/Snakefile index 96675c13..16d0c0bf 100644 --- a/aviary/modules/Snakefile +++ b/aviary/modules/Snakefile @@ -2,9 +2,11 @@ # ruleorder: filtlong_no_reference > link_reads onsuccess: + shell("chmod g+w {log}") print("Aviary finished, no error") onerror: + shell("chmod g+w {log}") print("An error occurred") onstart: @@ -44,6 +46,7 @@ onstart: from snakemake.utils import min_version import os min_version("6.0") +os.umask(0o002) module qc: snakefile: "quality_control/qc.smk" diff --git a/aviary/modules/binning/binning.smk b/aviary/modules/binning/binning.smk index be6060ea..3f110c94 100644 --- a/aviary/modules/binning/binning.smk +++ b/aviary/modules/binning/binning.smk @@ -629,32 +629,15 @@ rule das_tool: mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 512*1024*attempt), runtime = lambda wildcards, attempt: 12*60*attempt, output: - das_tool_done = 
"data/das_tool_bins_pre_refine/done" + touch("data/das_tool_bins_pre_refine/done") conda: "envs/das_tool.yaml" log: "logs/das_tool.log" benchmark: "benchmarks/das_tool.benchmark.txt" - shell: - """ - Fasta_to_Scaffolds2Bin.sh -i data/metabat_bins_sspec -e fa > data/metabat_bins_sspec.tsv 2> {log}; - Fasta_to_Scaffolds2Bin.sh -i data/metabat_bins_ssens -e fa > data/metabat_bins_ssens.tsv 2>> {log}; - Fasta_to_Scaffolds2Bin.sh -i data/metabat_bins_sens -e fa > data/metabat_bins_sens.tsv 2>> {log}; - Fasta_to_Scaffolds2Bin.sh -i data/metabat_bins_spec -e fa > data/metabat_bins_spec.tsv 2>> {log}; - Fasta_to_Scaffolds2Bin.sh -i data/concoct_bins -e fa > data/concoct_bins.tsv 2>> {log}; - Fasta_to_Scaffolds2Bin.sh -i data/maxbin2_bins -e fasta > data/maxbin_bins.tsv 2>> {log}; - Fasta_to_Scaffolds2Bin.sh -i data/vamb_bins/bins -e fna > data/vamb_bins.tsv 2>> {log}; - Fasta_to_Scaffolds2Bin.sh -i data/rosella_refined/final_bins/ -e fna > data/rosella_refined_bins.tsv 2>> {log}; - Fasta_to_Scaffolds2Bin.sh -i data/metabat2_refined/final_bins/ -e fna > data/metabat2_refined_bins.tsv 2>> {log}; - Fasta_to_Scaffolds2Bin.sh -i data/semibin_refined/final_bins/ -e fna > data/semibin_refined_bins.tsv 2>> {log}; - scaffold2bin_files=$(find data/*bins*.tsv -not -empty -exec ls {{}} \; | tr "\n" ',' | sed "s/,$//g"); - DAS_Tool --search_engine diamond --write_bin_evals 1 --write_bins 1 -t {threads} --score_threshold -42 \ - -i $scaffold2bin_files \ - -c {input.fasta} \ - -o data/das_tool_bins_pre_refine/das_tool >> {log} 2>&1 && \ - touch data/das_tool_bins_pre_refine/done - """ + script: + "scripts/das_tool.py" rule refine_dastool: input: diff --git a/aviary/modules/binning/envs/das_tool.yaml b/aviary/modules/binning/envs/das_tool.yaml index bf5406ca..e9f4b628 100644 --- a/aviary/modules/binning/envs/das_tool.yaml +++ b/aviary/modules/binning/envs/das_tool.yaml @@ -3,3 +3,4 @@ channels: - bioconda dependencies: - das_tool = 1.1.2 + - extern = 0.4.1 diff --git 
a/aviary/modules/binning/scripts/das_tool.py b/aviary/modules/binning/scripts/das_tool.py new file mode 100644 index 00000000..1b4186a3 --- /dev/null +++ b/aviary/modules/binning/scripts/das_tool.py @@ -0,0 +1,58 @@ +import logging +import os +import sys + +import extern + +if __name__ == '__main__': + unrefined_binners_to_use = [ + ('concoct', 'fa'), + ('maxbin2', 'fasta'), + ('vamb', 'fna')] + refined_binners_to_use = [ + ('rosella', 'fna'), + ('semibin', 'fna')] + + # N.B. specifying "metabat" will skip both MetaBAT1 and MetaBAT2. + metabats = ['metabat_sspec', 'metabat_ssens', 'metabat_sens', 'metabat_spec'] + + binners = [] + for (binner, extension) in unrefined_binners_to_use: + if binner not in snakemake.config['skip_binners']: + binners.append((f'{binner}_bins/', extension, f'data/{binner}_bins.tsv')) + + for (binner, extension) in refined_binners_to_use: + if binner not in snakemake.config['skip_binners']: + binners.append((binner+'_refined/final_bins/', extension, f'data/{binner}_refined_bins.tsv')) + + for metabat in metabats: + if metabat not in snakemake.config['skip_binners']: + binners.append((metabat.replace('metabat','metabat_bins'), 'fa', f'data/{metabat}_bins.tsv')) + if 'metabat2' not in snakemake.config['skip_binners']: + binners.append(('metabat2_refined/final_bins/', 'fna', f'data/metabat2_refined_bins.tsv')) + + logfile = snakemake.log[0] + logging.basicConfig(filename=logfile, level=logging.INFO) + logging.info("Using the following binners: " + str(binners)) + + if len(binners) == 0: + logging.error("All binners have been skipped, so DAS_tool cannot be run.") + sys.exit(1) + + bin_definition_files = [] + for binner, extension, bin_definition_file in binners: + extern.run(f'Fasta_to_Scaffolds2Bin.sh -i data/{binner} -e {extension} >{bin_definition_file} 2>> {logfile}') + if os.path.getsize(bin_definition_file) == 0: + logging.warning(f'Bin definition file {bin_definition_file} is empty, suggesting that {binner} failed or did not create 
any output bins.') + else: + bin_definition_files.append(bin_definition_file) + logging.info("Bin definition files created: " + str(bin_definition_files)) + + scaffold2bin_files = ','.join(bin_definition_files) + + das_tool_command = f'DAS_Tool --search_engine diamond --write_bin_evals 1 --write_bins 1 -t {snakemake.threads} --score_threshold -42 \ + -i {scaffold2bin_files} \ + -c {snakemake.input.fasta} \ + -o data/das_tool_bins_pre_refine/das_tool >> {logfile} 2>&1' + logging.info("Running DAS_Tool with command: " + das_tool_command) + extern.run(das_tool_command)