Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding binning-only flag, simplify recipes #192

Merged
merged 20 commits into from
Mar 12, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion aviary/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.8.3"
__version__ = "0.9.0"
51 changes: 47 additions & 4 deletions aviary/aviary.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,23 +598,66 @@ def main():
default=3
)

binning_group.add_argument(
    '--extra-binners', '--extra_binners', '--extra-binner', '--extra_binner',
    # NOTE: adjacent string literals concatenate; each line needs its own
    # trailing separator or the words run together in --help output.
    help='Optional list of extra binning algorithms to run. Can be any combination of: \n'
         'maxbin, maxbin2, concoct \n'
         'These binners are skipped by default as they can have long runtimes. \n'
         'Capitals will be auto-corrected. N.B. specifying "maxbin" and "maxbin2" are equivalent \n',
    dest='extra_binners',
    nargs='*',
    choices=["maxbin", "maxbin2", "concoct"]
)

binning_group.add_argument(
'--skip-binners', '--skip_binners', '--skip_binner', '--skip-binner',
help='Optional list of binning algorithms to skip. Can be any combination of: \n'
'rosella, semibin, metabat1, metabat2, metabat, vamb, concoct, maxbin2, maxbin \n'
'rosella, semibin, metabat1, metabat2, metabat, vamb \n'
'Capitals will be auto-corrected. N.B. specifying "metabat" will skip both \n'
'MetaBAT1 and MetaBAT2.',
dest='skip_binners',
nargs='*'
# default=["maxbin2"]
nargs='*',
choices=["rosella", "semibin", "metabat1", "metabat2", "metabat", "vamb"]
)

binning_group.add_argument(
'--binning-only', '--binning_only',
help='Only run up to the binning stage. Do not run SingleM, GTDB-tk, or CoverM',
type=str2bool,
nargs='?',
const=True,
dest='binning_only',
default=False,
)

binning_group.add_argument(
'--skip-abundances', '--skip_abundances',
help='Skip CoverM post-binning abundance calculations.',
dest='skip_abundances',
type=str2bool,
nargs='?',
const=True,
default=False,
)

binning_group.add_argument(
'--skip-taxonomy', '--skip_taxonomy',
help='Skip GTDB-tk post-binning taxonomy assignment.',
dest='skip_taxonomy',
type=str2bool,
nargs='?',
const=True,
default=False,
)

binning_group.add_argument(
    '--skip-singlem', '--skip_singlem',
    help='Skip SingleM post-binning recovery assessment.',
    dest='skip_singlem',
    # Same optional-boolean pattern as --skip-abundances / --skip-taxonomy.
    # Do NOT also pass action="store_true": argparse raises TypeError when
    # a store_true action is combined with type/nargs/const.
    type=str2bool,
    nargs='?',
    const=True,
    default=False,
)

####################################################################
Expand Down
44 changes: 16 additions & 28 deletions aviary/modules/binning/binning.smk
Original file line number Diff line number Diff line change
Expand Up @@ -695,8 +695,8 @@ rule finalise_stats:
input:
checkm1_done = "bins/checkm.out",
checkm2_done = "bins/checkm2_output/quality_report.tsv",
coverage_file = "data/coverm_abundances.tsv" if not config["skip_abundances"] else [],
gtdbtk_done = "data/gtdbtk/done"
coverage_file = "data/coverm_abundances.tsv" if not (config["skip_abundances"] or config["binning_only"]) else [],
gtdbtk_done = "data/gtdbtk/done" if not (config["binning_only"] or config["skip_taxonomy"]) else []
AroneyS marked this conversation as resolved.
Show resolved Hide resolved
output:
bin_stats = "bins/bin_info.tsv",
checkm_minimal = "bins/checkm_minimal.tsv"
Expand Down Expand Up @@ -746,7 +746,7 @@ rule singlem_pipe_reads:
rule singlem_appraise:
input:
metagenome = "data/singlem_out/metagenome.combined_otu_table.csv",
gtdbtk_done = "data/gtdbtk/done",
# gtdbtk_done = "data/gtdbtk/done",
bins_complete = "bins/checkm.out"
output:
"data/singlem_out/singlem_appraisal.tsv"
Expand Down Expand Up @@ -775,46 +775,34 @@ rule singlem_appraise:
rule recover_mags:
input:
final_bins = "bins/bin_info.tsv",
gtdbtk = "data/gtdbtk/done",
coverm = "data/coverm_abundances.tsv" if not config["skip_abundances"] else [],
singlem = "data/singlem_out/singlem_appraisal.tsv"
gtdbtk = "data/gtdbtk/done" if not (config["binning_only"] or config["skip_taxonomy"]) else [],
coverm = "data/coverm_abundances.tsv" if not (config["skip_abundances"] or config["binning_only"]) else [],
contig_coverage = "data/coverm.cov",
singlem = "data/singlem_out/singlem_appraisal.tsv" if not (config["skip_singlem"] or config["binning_only"]) else [],
conda:
"../../envs/coverm.yaml"
output:
bins = "bins/done",
diversity = 'diversity/done'
bins = "bins/done"
threads:
config["max_threads"]
shell:
"cd bins/; "
"ln -s ../data/coverm_abundances.tsv ./; "
"ln -s ../data/coverm.cov ./; "
"cd ../; "
"ln -sr data/singlem_out/ diversity || echo 'SingleM linked'; "
"ln -sr data/gtdbtk taxonomy || echo 'GTDB-tk linked'; "
"touch bins/done; "
"touch diversity/done; "
"rm -f data/binning_bams/*bam; "
"rm -f data/binning_bams/*bai; "
script:
"scripts/finalise_recover.py"

rule recover_mags_no_singlem:
AroneyS marked this conversation as resolved.
Show resolved Hide resolved
input:
final_bins = "bins/bin_info.tsv",
coverm = "data/coverm_abundances.tsv" if not config["skip_abundances"] else [],
gtdbtk = [],
coverm = "data/coverm_abundances.tsv" if not (config["skip_abundances"] or config["binning_only"]) else [],
contig_coverage = "data/coverm.cov",
singlem = [],
conda:
"../../envs/coverm.yaml"
output:
bins = "bins/done",
threads:
config["max_threads"]
shell:
"cd bins/; "
"ln -s ../data/coverm_abundances.tsv ./; "
"ln -s ../data/coverm.cov ./; "
"cd ../; "
"touch bins/done; "
"rm -f data/binning_bams/*bam; "
"rm -f data/binning_bams/*bai; "
script:
"scripts/finalise_stats.py"

# Special rule to help out with a buggy output
rule dereplicate_and_get_abundances_paired:
Expand Down
27 changes: 2 additions & 25 deletions aviary/modules/binning/envs/rosella.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,8 @@ channels:
- conda-forge
- numba
- bioconda
- defaults
dependencies:
- python >= 3.8, <= 3.10
- gcc
- cxx-compiler
- rosella >= 0.5.2
- numba >= 0.53, <= 0.57
- numpy <= 1.24
- joblib >= 1.1.0, <= 1.3
- scikit-bio >= 0.5.7
- umap-learn >= 0.5.3
- scipy <= 1.11
- pandas >= 1.3
- pynndescent >= 0.5.7
- hdbscan >= 0.8.28
- scikit-learn >= 1.0.2, <= 1.1
- flight-genome >= 1.6.1
- rosella >= 0.5.3
- flight-genome >= 1.6.3
- coverm >= 0.6.1
- seaborn
- imageio
- matplotlib
- tqdm
- tbb
- joblib
- pebble
- threadpoolctl
- biopython
- checkm-genome==1.1.3
29 changes: 29 additions & 0 deletions aviary/modules/binning/scripts/finalise_recovery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Finalise the MAG recovery stage.

Symlinks the optional post-binning outputs (abundances, contig coverage,
GTDB-tk taxonomy, SingleM diversity) into their conventional locations,
marks the binning stage done, and removes the intermediate BAM/BAI files.
Inputs for skipped stages (e.g. via --binning-only) arrive from Snakemake
as empty lists and are ignored.
"""
import os
from pathlib import Path
import glob


def _symlink(target, link_name):
    """Best-effort symlink: tolerate an existing link so re-runs succeed
    (mirrors the former shell rule's `ln -s ... || echo` behaviour)."""
    try:
        os.symlink(target, link_name)
    except FileExistsError:
        pass


if __name__ == '__main__':
    final_bins = snakemake.input.final_bins
    coverage_file = snakemake.input.coverm
    contig_coverage = snakemake.input.contig_coverage
    gtdbtk = snakemake.input.gtdbtk
    singlem = snakemake.input.singlem

    # Optional inputs are empty lists when the corresponding stage was skipped,
    # so len(...) > 0 works for both list and plain-string inputs.
    if len(coverage_file) > 0:
        # The symlink destination must be the new link's name — passing a bare
        # "./" raises FileExistsError. Link the file into bins/ under its own
        # basename, with a ../-relative target as the former shell rule did.
        _symlink(os.path.join("..", str(coverage_file)),
                 os.path.join("bins", os.path.basename(str(coverage_file))))

    if len(contig_coverage) > 0:
        _symlink(os.path.join("..", str(contig_coverage)),
                 os.path.join("bins", os.path.basename(str(contig_coverage))))

    if len(gtdbtk) > 0:
        # gtdbtk is the data/gtdbtk/done sentinel; the taxonomy results live in
        # its parent directory (shell rule: `ln -sr data/gtdbtk taxonomy`).
        _symlink(os.path.dirname(str(gtdbtk)), "taxonomy")
    if len(singlem) > 0:
        # singlem is data/singlem_out/singlem_appraisal.tsv; link the directory
        # (shell rule: `ln -sr data/singlem_out/ diversity`).
        _symlink(os.path.dirname(str(singlem)), "diversity")

    Path('bins/done').touch()

    # Drop the large intermediate mapping files (*.bam, *.bai).
    for f in glob.glob('data/binning_bams/*.ba*'):
        os.remove(f)
9 changes: 6 additions & 3 deletions aviary/modules/binning/scripts/finalise_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,12 @@ def get_taxonomy(rename_columns="Bin Id"):
taxa.append(df_arc)
except (FileNotFoundError, IndexError) as e:
pass

taxa = pd.concat(taxa)
taxa.rename({'user_genome' : rename_columns}, inplace=True, axis=1)

try:
taxa = pd.concat(taxa)
taxa.rename({'user_genome' : rename_columns}, inplace=True, axis=1)
except ValueError:
taxa = pd.DataFrame(columns=[rename_columns])
return taxa


Expand Down
28 changes: 24 additions & 4 deletions aviary/modules/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,16 +112,32 @@ def __init__(self,
self.refinery_max_iterations = args.refinery_max_iterations
self.refinery_max_retries = args.refinery_max_retries
self.skip_abundances = args.skip_abundances
self.skip_taxonomy = args.skip_taxonomy
self.skip_singlem = args.skip_singlem
if args.binning_only:
self.skip_abundances = True
self.skip_taxonomy = True
self.skip_singlem = True
self.binning_only = args.binning_only
AroneyS marked this conversation as resolved.
Show resolved Hide resolved

self.skip_binners = ["maxbin2", "concoct"]
if args.extra_binners:
for binner in args.extra_binners:
binner = binner.lower()
if binner == "maxbin" or binner == "maxbin2":
self.skip_binners.remove("maxbin2")
elif binner == "concoct":
self.skip_binners.remove("concoct")
else:
logging.warning(f"Unknown extra binner {binner} specified. Skipping...")
AroneyS marked this conversation as resolved.
Show resolved Hide resolved

self.skip_binners = []
if args.skip_binners:
for binner in args.skip_binners:
binner = binner.lower()
if binner == "metabat":
self.skip_binners.extend(["metabat_sens", "metabat_ssens", "metabat_spec", "metabat_sspec", "metabat2"])
elif binner == "metabat1":
self.skip_binners.extend(["metabat_sens", "metabat_ssens", "metabat_spec", "metabat_sspec"])
elif binner == "maxbin":
self.skip_binners.append("maxbin2")
else:
self.skip_binners.append(binner)

Expand All @@ -133,6 +149,7 @@ def __init__(self,
self.refinery_max_retries = 3
self.skip_binners = ["none"]
self.skip_abundances = False
self.binning_only = False

try:
self.assembly = args.assembly
Expand Down Expand Up @@ -354,6 +371,9 @@ def make_config(self):
conf["gsa_mappings"] = self.gsa_mappings
conf["skip_binners"] = self.skip_binners
conf["skip_abundances"] = self.skip_abundances
conf["skip_taxonomy"] = self.skip_taxonomy
conf["skip_singlem"] = self.skip_singlem
conf["binning_only"] = self.binning_only
conf["semibin_model"] = self.semibin_model
conf["refinery_max_iterations"] = self.refinery_max_iterations
conf["refinery_max_retries"] = self.refinery_max_retries
Expand Down Expand Up @@ -604,4 +624,4 @@ def fraction_to_percent(val):
val = float(val)
if val <= 1:
return val * 100
return val
return val
59 changes: 44 additions & 15 deletions aviary/modules/quality_control/scripts/qc_short_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,20 +87,49 @@ def combine_reads(

# we've got to concatenate the files together
if coassemble:
# reads 1 first
for reads_1, reads_2 in zip(short_reads_1, short_reads_2):
if reads_1 == "none" or reads_2 == "none":
continue

if not os.path.exists(reads_1):
logf.write(f"Short read file {reads_1} does not exist\n")
exit(1)

if not os.path.exists(reads_2):
logf.write(f"Short read file {reads_2} does not exist\n")
exit(1)
if "none" not in short_reads_2 and "none" not in short_reads_1:
for reads_1, reads_2 in zip(short_reads_1, short_reads_2):
if reads_1 == "none" or reads_2 == "none":
continue

if not os.path.exists(reads_1):
logf.write(f"Short read file {reads_1} does not exist\n")
exit(1)

if not os.path.exists(reads_2):
logf.write(f"Short read file {reads_2} does not exist\n")
exit(1)

setup_interleave(reads_1, reads_2, output_fastq, logf)
elif "none" not in short_reads_1:

setup_interleave(reads_1, reads_2, output_fastq, logf)
gzip_output = False
if not output_fastq.endswith(".gz"):
output_fastq = output_fastq + '.gz'

for reads_1 in short_reads_1:
if reads_1 == "none":
continue

if not os.path.exists(reads_1):
logf.write(f"Short read file {reads_1} does not exist\n")
exit(1)

cat_reads(reads_1, output_fastq, threads, log_file)
elif "none" not in short_reads_2:
gzip_output = False
if not output_fastq.endswith(".gz"):
output_fastq = output_fastq + '.gz'

for reads_2 in short_reads_2:
if reads_2 == "none":
continue

if not os.path.exists(reads_2):
logf.write(f"Short read file {reads_2} does not exist\n")
exit(1)

cat_reads(reads_2, output_fastq, threads, log_file)


else:
Expand Down Expand Up @@ -287,13 +316,13 @@ def filter_illumina_reference(
# run fastp for quality control of reads
se1_string = None
if "none" not in short_reads_1:
combine_reads(short_reads_1, ["none"], "data/short_reads.pre_qc.1.fastq.gz", coassemble, log, threads)
se1_string = "data/short_reads.pre_qc.1.fastq.gz"
combine_reads(short_reads_1, ["none"], se1_string, coassemble, log, threads)

se2_string = None
if "none" not in short_reads_2:
combine_reads(["none"], short_reads_2, "data/short_reads.pre_qc.2.fastq.gz", coassemble, log, threads)
se2_string = "data/short_reads.pre_qc.2.fastq.gz"
combine_reads(["none"], short_reads_2, se2_string, coassemble, log, threads)

run_fastp(
reads_1=se1_string,
Expand Down
4 changes: 2 additions & 2 deletions test/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,8 @@ def test_short_read_recovery_fast(self):
f"-o {output_dir}/aviary_out "
f"-1 {data}/wgsim.1.fq.gz "
f"-2 {data}/wgsim.2.fq.gz "
f"--skip-abundances "
f"--skip-binners concoct rosella vamb maxbin2 metabat "
f"--binning-only "
f"--skip-binners rosella vamb metabat "
f"--skip-qc "
f"--refinery-max-iterations 0 "
f"--conda-prefix {path_to_conda} "
Expand Down
Loading
Loading