Skip to content

Commit

Permalink
Merge pull request #199 from rhysnewell/ISS-198
Browse files Browse the repository at this point in the history
fix: batch submission now does not require user to write to script first
  • Loading branch information
rhysnewell authored Apr 10, 2024
2 parents 7195d25 + b5a1d3a commit bf90668
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 20 deletions.
4 changes: 2 additions & 2 deletions aviary/aviary.py
Original file line number Diff line number Diff line change
Expand Up @@ -1078,14 +1078,14 @@ def main():
aviary batch -f batch_file.tsv -t 32 -o batch_test
An example batch file can be found at:
An example batch file can be found at: https://rhysnewell.github.io/aviary/examples
''')

batch_options.add_argument(
'-f', '--batch_file', '--batch-file',
help='The tab or comma separated batch file containing the input samples to assemble and/or recover MAGs from. \n'
'An example batch file can be found at XXX. The heading line is required. \n'
'An example batch file can be found at https://rhysnewell.github.io/aviary/examples. The heading line is required. \n'
'The number of reads provided to each sample is flexible as is the type of assembly being performed (if any). \n'
'Multiple reads can be supplied by providing a comma-separated list (surrounded by double quotes \"\" if using a \n'
'comma separated batch file) within the specific read column.',
Expand Down
38 changes: 26 additions & 12 deletions aviary/modules/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@
from snakemake.io import load_configfile
from ruamel.yaml import YAML # used for yaml reading with comments

BATCH_HEADER=['sample', 'short_reads_1', 'short_reads_2', 'long_reads', 'long_read_type', 'assembly', 'coassemble']

# Debug
debug={1:logging.CRITICAL,
2:logging.ERROR,
Expand Down Expand Up @@ -468,6 +470,8 @@ def run_workflow(self, cores=16, profile=None, cluster_retries=None,
resources=f"--resources mem_mb={int(self.max_memory)*1024} {self.resources}" if not dryrun else ""
)

logging.debug(f"Command: {cmd}")

if write_to_script is not None:
write_to_script.append(cmd)
continue
Expand Down Expand Up @@ -495,20 +499,28 @@ def process_batch(args, prefix):

logging.info(f"Reading batch file: {args.batch_file}")

header=0
header=None
separator=' '
with open(args.batch_file, mode='r') as check_batch:
for line in check_batch.readlines():
if "sample\tshort_reads_1\tshort_reads_2\tlong_reads\tlong_read_type\tassembly\tcoassemble" in line \
or "sample,short_reads_1,short_reads_2,long_reads,long_read_type,assembly,coassemble" in line \
or "sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble" in line \
or "sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble" in line:
header=1
logging.debug("Inferred header")
else:
logging.debug("No heading inferred.")
line = line.strip()
for sep in ['\t', ',', ' ']:
separated = line.split(sep)
if separated == BATCH_HEADER:
header=0
separator=sep
logging.debug("Inferred header")
break
elif len(separated) >= 7:
header=None
separator=sep
logging.debug("Inferred no header")
break
if header is None:
logging.debug("No header found")
break

batch = pd.read_csv(args.batch_file, sep=None, engine='python', skiprows=header)
batch = pd.read_csv(args.batch_file, sep=separator, engine='python', names=BATCH_HEADER, header=header)
if len(batch.columns) != 7:
logging.critical(f"Batch file contains incorrect number of columns ({len(batch.columns)}). Should contain 7.")
logging.critical(f"Current columns: {batch.columns}")
Expand All @@ -525,10 +537,12 @@ def process_batch(args, prefix):

try:
script_file = args.write_script
write_to_script = []
except AttributeError:
script_file = None
write_to_script = None

write_to_script = None
if script_file is not None:
write_to_script = []

runs = []
args.interleaved = "none" # hacky solution to skip attribute error
Expand Down
10 changes: 5 additions & 5 deletions docs/_include/examples/example_batch.tsv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble
sample_1 pe1.fq.gz pe2.fq.gz nanopore.fq.gz ont NA NA
sample_2 interleaved.fq.gz NA pacbio.fq.gz ccs NA NA
sample_3 pe1.1.fq.gz,pe1.2.fq.gz pe2.1.fq.gz,pe2.2.fq.gz n1.fq.gz,n2.fq.gz,n3.fq.gz ont-hq NA True
sample_4 pe1.1.fq.gz,pe1.2.fq.gz pe2.1.fq.gz,pe2.2.fq.gz n1.fq.gz,n2.fq.gz,n3.fq.gz ont-hq assembly.fasta False
sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble
sample_1 pe1.fq.gz pe2.fq.gz nanopore.fq.gz ont NA NA
sample_2 interleaved.fq.gz NA pacbio.fq.gz ccs NA NA
sample_3 pe1.1.fq.gz,pe1.2.fq.gz pe2.1.fq.gz,pe2.2.fq.gz n1.fq.gz,n2.fq.gz,n3.fq.gz ont-hq NA True
sample_4 pe1.1.fq.gz,pe1.2.fq.gz pe2.1.fq.gz,pe2.2.fq.gz n1.fq.gz,n2.fq.gz,n3.fq.gz ont-hq assembly.fasta False
3 changes: 3 additions & 0 deletions test/data/example_batch.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble
sample_1 test/data/wgsim.1.fq.gz test/data/wgsim.2.fq.gz NA ont NA NA
sample_2 test/data/wgsim.1.fq.gz test/data/wgsim.2.fq.gz test/data/pbsim.fq.gz ont NA NA
36 changes: 35 additions & 1 deletion test/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,6 @@ def test_short_read_recovery_queue_submission(self):

cmd = (
f"aviary recover "
f"--output {output_dir} "
f"-o {output_dir}/aviary_out "
f"-1 {data}/wgsim.1.fq.gz "
f"-2 {data}/wgsim.2.fq.gz "
Expand All @@ -180,6 +179,41 @@ def test_short_read_recovery_queue_submission(self):
num_lines = sum(1 for _ in f)
self.assertEqual(num_lines, 3)

def test_batch_recovery(self):
output_dir = os.path.join("example", "test_batch_recovery")
self.setup_output_dir(output_dir)
cmd = (
f"aviary batch "
f"-o {output_dir}/aviary_out "
f"-f {data}/example_batch.tsv "
f"--conda-prefix {path_to_conda} "
f"--skip-binners rosella vamb metabat "
f"--skip-qc "
f"--refinery-max-iterations 0 "
f"--min-read-size 10 --min-mean-q 1 "
f"-n 32 -t 32 "
)
subprocess.run(cmd, shell=True, check=True)

self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/sample_1/data/final_contigs.fasta"))
self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/sample_2/data/final_contigs.fasta"))

bin_info_path_1 = f"{output_dir}/aviary_out/sample_1/bins/bin_info.tsv"
bin_info_path_2 = f"{output_dir}/aviary_out/sample_2/bins/bin_info.tsv"
self.assertTrue(os.path.isfile(bin_info_path_1))
self.assertTrue(os.path.isfile(bin_info_path_2))
with open(bin_info_path_1) as f:
num_lines = sum(1 for _ in f)
self.assertEqual(num_lines, 3)

self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.95"))
self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.97"))
self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.99"))

self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.95/pangenomes"))
self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.97/pangenomes"))
self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.99/pangenomes"))


if __name__ == "__main__":
unittest.main()

0 comments on commit bf90668

Please sign in to comment.