From bfebaf9959989d999636fd693f189836629fb949 Mon Sep 17 00:00:00 2001 From: rhysnewell Date: Mon, 8 Apr 2024 03:13:20 +0000 Subject: [PATCH 1/5] fix: batch submission now does not require user to write to script first --- aviary/aviary.py | 4 ++-- aviary/modules/processor.py | 43 ++++++++++++++++++++++++++----------- test/example_batch.tsv | 3 +++ 3 files changed, 35 insertions(+), 15 deletions(-) create mode 100755 test/example_batch.tsv diff --git a/aviary/aviary.py b/aviary/aviary.py index 1e88a6e1..0364ab62 100755 --- a/aviary/aviary.py +++ b/aviary/aviary.py @@ -1078,14 +1078,14 @@ def main(): aviary batch -f batch_file.tsv -t 32 -o batch_test - An example batch file can be found at: + An example batch file can be found at: https://rhysnewell.github.io/aviary/examples ''') batch_options.add_argument( '-f', '--batch_file', '--batch-file', help='The tab or comma separated batch file containing the input samples to assemble and/or recover MAGs from. \n' - 'An example batch file can be found at XXX. The heading line is required. \n' + 'An example batch file can be found at https://rhysnewell.github.io/aviary/examples. The heading line is required. \n' 'The number of reads provided to each sample is flexible as is the type of assembly being performed (if any). \n' 'Multiple reads can be supplied by providing a comma-separated list (surrounded by double quotes \"\" if using a \n' 'comma separated batch file) within the specific read column.', diff --git a/aviary/modules/processor.py b/aviary/modules/processor.py index bba9dd66..42bd440b 100644 --- a/aviary/modules/processor.py +++ b/aviary/modules/processor.py @@ -468,6 +468,8 @@ def run_workflow(self, cores=16, profile=None, cluster_retries=None, resources=f"--resources mem_mb={int(self.max_memory)*1024} {self.resources}" if not dryrun else "" ) + logging.debug(f"Command: {cmd}") + if write_to_script is not None: write_to_script.append(cmd) continue @@ -495,20 +497,31 @@ def process_batch(args, prefix): logging.info(f"Reading batch file: {args.batch_file}") - header=0 + header=None + separator=' ' with open(args.batch_file, mode='r') as check_batch: for line in check_batch.readlines(): - if "sample\tshort_reads_1\tshort_reads_2\tlong_reads\tlong_read_type\tassembly\tcoassemble" in line \ - or "sample,short_reads_1,short_reads_2,long_reads,long_read_type,assembly,coassemble" in line \ - or "sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble" in line \ - or "sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble" in line: - header=1 - logging.debug("Inferred header") - else: - logging.debug("No heading inferred.") + line = line.strip() + for sep in ['\t', ',', ' ']: + separated = line.split(sep) + print(separated) + if separated == ['sample', 'short_reads_1', 'short_reads_2', 'long_reads', 'long_read_type', 'assembly', 'coassemble']: + header=0 + separator=sep + logging.debug("Inferred header") + break + elif len(separated) >= 7: + header=None + separator=sep + logging.debug("Inferred no header") + break + if header is None: + logging.debug("No header found") break - - batch = pd.read_csv(args.batch_file, sep=None, engine='python', skiprows=header) + if header is not None: + batch = pd.read_csv(args.batch_file, sep=separator, engine='python', header=header) + else: + batch = pd.read_csv(args.batch_file, sep=separator, engine='python', names=['sample', 'short_reads_1', 'short_reads_2', 'long_reads', 'long_read_type', 'assembly', 'coassemble']) if len(batch.columns) != 7: logging.critical(f"Batch file contains incorrect number of columns ({len(batch.columns)}). Should contain 7.") logging.critical(f"Current columns: {batch.columns}") @@ -525,11 +538,15 @@ def process_batch(args, prefix): try: script_file = args.write_script - write_to_script = [] except AttributeError: script_file = None - write_to_script = None + + write_to_script = None + if script_file is not None: + write_to_script = [] + print(script_file) + print(write_to_script) runs = [] args.interleaved = "none" # hacky solution to skip attribute error args.coupled = "none" diff --git a/test/example_batch.tsv b/test/example_batch.tsv new file mode 100755 index 00000000..bb246ed9 --- /dev/null +++ b/test/example_batch.tsv @@ -0,0 +1,3 @@ +sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble +sample_1 test/data/wgsim.1.fq.gz test/data/wgsim.2.fq.gz NA ont NA NA +sample_2 NA NA test/data/pbsim.fq.gz pacbio NA NA \ No newline at end of file From c3d9277c12b04a111b2a4ea02fa8b78ccd2f506c Mon Sep 17 00:00:00 2001 From: rhysnewell Date: Mon, 8 Apr 2024 03:54:14 +0000 Subject: [PATCH 2/5] fix: batch headers now constant var --- aviary/modules/processor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/aviary/modules/processor.py b/aviary/modules/processor.py index 42bd440b..a22c6a0f 100644 --- a/aviary/modules/processor.py +++ b/aviary/modules/processor.py @@ -43,6 +43,8 @@ from snakemake.io import load_configfile from ruamel.yaml import YAML # used for yaml reading with comments +BATCH_HEADER=['sample', 'short_reads_1', 'short_reads_2', 'long_reads', 'long_read_type', 'assembly', 'coassemble'] + # Debug debug={1:logging.CRITICAL, 2:logging.ERROR, @@ -505,7 +507,7 @@ def process_batch(args, prefix): for sep in ['\t', ',', ' ']: separated = line.split(sep) print(separated) - if separated == ['sample', 'short_reads_1', 'short_reads_2', 'long_reads', 'long_read_type', 'assembly', 'coassemble']: + if separated == BATCH_HEADER: header=0 separator=sep logging.debug("Inferred header") @@ -521,7 +523,7 @@ def process_batch(args, prefix): if header is not None: batch = pd.read_csv(args.batch_file, sep=separator, engine='python', header=header) else: - batch = pd.read_csv(args.batch_file, sep=separator, engine='python', names=['sample', 'short_reads_1', 'short_reads_2', 'long_reads', 'long_read_type', 'assembly', 'coassemble']) + batch = pd.read_csv(args.batch_file, sep=separator, engine='python', names=BATCH_HEADER) if len(batch.columns) != 7: logging.critical(f"Batch file contains incorrect number of columns ({len(batch.columns)}). Should contain 7.") logging.critical(f"Current columns: {batch.columns}") From 95f00d3b5879b07f0b516be2cd288b097465d401 Mon Sep 17 00:00:00 2001 From: rhysnewell Date: Mon, 8 Apr 2024 05:35:38 +0000 Subject: [PATCH 3/5] fix: single line for reading in batch tsv --- aviary/modules/processor.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/aviary/modules/processor.py b/aviary/modules/processor.py index a22c6a0f..c881b6ba 100644 --- a/aviary/modules/processor.py +++ b/aviary/modules/processor.py @@ -506,7 +506,6 @@ def process_batch(args, prefix): line = line.strip() for sep in ['\t', ',', ' ']: separated = line.split(sep) - print(separated) if separated == BATCH_HEADER: header=0 separator=sep @@ -520,10 +519,9 @@ def process_batch(args, prefix): if header is None: logging.debug("No header found") break - if header is not None: - batch = pd.read_csv(args.batch_file, sep=separator, engine='python', header=header) - else: - batch = pd.read_csv(args.batch_file, sep=separator, engine='python', names=BATCH_HEADER) + + batch = pd.read_csv(args.batch_file, sep=separator, engine='python', names=BATCH_HEADER, header=header) + print(batch) if len(batch.columns) != 7: logging.critical(f"Batch file contains incorrect number of columns ({len(batch.columns)}). Should contain 7.") logging.critical(f"Current columns: {batch.columns}") @@ -547,8 +545,6 @@ def process_batch(args, prefix): if script_file is not None: write_to_script = [] - print(script_file) - print(write_to_script) runs = [] args.interleaved = "none" # hacky solution to skip attribute error args.coupled = "none" From c50b6b541e554a8e7e198742d62c3ecd41ab0f6e Mon Sep 17 00:00:00 2001 From: rhysnewell Date: Tue, 9 Apr 2024 07:13:13 +0000 Subject: [PATCH 4/5] fix: add batch test to test integration --- test/{ => data}/example_batch.tsv | 2 +- test/test_integration.py | 36 ++++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) rename test/{ => data}/example_batch.tsv (62%) diff --git a/test/example_batch.tsv b/test/data/example_batch.tsv similarity index 62% rename from test/example_batch.tsv rename to test/data/example_batch.tsv index bb246ed9..e014b39a 100755 --- a/test/example_batch.tsv +++ b/test/data/example_batch.tsv @@ -1,3 +1,3 @@ sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble sample_1 test/data/wgsim.1.fq.gz test/data/wgsim.2.fq.gz NA ont NA NA -sample_2 NA NA test/data/pbsim.fq.gz pacbio NA NA \ No newline at end of file +sample_2 test/data/wgsim.1.fq.gz test/data/wgsim.2.fq.gz test/data/pbsim.fq.gz ont NA NA \ No newline at end of file diff --git a/test/test_integration.py b/test/test_integration.py index 687143dd..fe17f77f 100755 --- a/test/test_integration.py +++ b/test/test_integration.py @@ -162,7 +162,6 @@ def test_short_read_recovery_queue_submission(self): cmd = ( f"aviary recover " - f"--output {output_dir} " f"-o {output_dir}/aviary_out " f"-1 {data}/wgsim.1.fq.gz " f"-2 {data}/wgsim.2.fq.gz " @@ -180,6 +179,41 @@ def test_short_read_recovery_queue_submission(self): num_lines = sum(1 for _ in f) self.assertEqual(num_lines, 3) + def test_batch_recovery(self): + output_dir = os.path.join("example", "test_batch_recovery") + self.setup_output_dir(output_dir) + cmd = ( + f"aviary batch " + f"-o {output_dir}/aviary_out " + f"-f {data}/example_batch.tsv " + f"--conda-prefix {path_to_conda} " + f"--skip-binners rosella vamb metabat " + f"--skip-qc " + f"--refinery-max-iterations 0 " + f"--min-read-size 10 --min-mean-q 1 " + f"-n 32 -t 32 " + ) + subprocess.run(cmd, shell=True, check=True) + + self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/sample_1/data/final_contigs.fasta")) + self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/sample_2/data/final_contigs.fasta")) + + bin_info_path_1 = f"{output_dir}/aviary_out/sample_1/bins/bin_info.tsv" + bin_info_path_2 = f"{output_dir}/aviary_out/sample_2/bins/bin_info.tsv" + self.assertTrue(os.path.isfile(bin_info_path_1)) + self.assertTrue(os.path.isfile(bin_info_path_2)) + with open(bin_info_path_1) as f: + num_lines = sum(1 for _ in f) + self.assertEqual(num_lines, 3) + + self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.95")) + self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.97")) + self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.99")) + + self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.95/pangenomes")) + self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.97/pangenomes")) + self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.99/pangenomes")) + if __name__ == "__main__": unittest.main() From b5a1d3a49da777311b9adf7192bf3208d9474d56 Mon Sep 17 00:00:00 2001 From: rhysnewell Date: Tue, 9 Apr 2024 07:14:04 +0000 Subject: [PATCH 5/5] fix: remove debug --- aviary/modules/processor.py | 1 - docs/_include/examples/example_batch.tsv | 10 +++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/aviary/modules/processor.py b/aviary/modules/processor.py index c881b6ba..57df8d40 100644 --- a/aviary/modules/processor.py +++ b/aviary/modules/processor.py @@ -521,7 +521,6 @@ def process_batch(args, prefix): break batch = pd.read_csv(args.batch_file, sep=separator, engine='python', names=BATCH_HEADER, header=header) - print(batch) if len(batch.columns) != 7: logging.critical(f"Batch file contains incorrect number of columns ({len(batch.columns)}). Should contain 7.") logging.critical(f"Current columns: {batch.columns}") diff --git a/docs/_include/examples/example_batch.tsv b/docs/_include/examples/example_batch.tsv index 1749ed9c..a8288358 100755 --- a/docs/_include/examples/example_batch.tsv +++ b/docs/_include/examples/example_batch.tsv @@ -1,5 +1,5 @@ -sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble -sample_1 pe1.fq.gz pe2.fq.gz nanopore.fq.gz ont NA NA -sample_2 interleaved.fq.gz NA pacbio.fq.gz ccs NA NA -sample_3 pe1.1.fq.gz,pe1.2.fq.gz pe2.1.fq.gz,pe2.2.fq.gz n1.fq.gz,n2.fq.gz,n3.fq.gz ont-hq NA True -sample_4 pe1.1.fq.gz,pe1.2.fq.gz pe2.1.fq.gz,pe2.2.fq.gz n1.fq.gz,n2.fq.gz,n3.fq.gz ont-hq assembly.fasta False \ No newline at end of file +sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble +sample_1 pe1.fq.gz pe2.fq.gz nanopore.fq.gz ont NA NA +sample_2 interleaved.fq.gz NA pacbio.fq.gz ccs NA NA +sample_3 pe1.1.fq.gz,pe1.2.fq.gz pe2.1.fq.gz,pe2.2.fq.gz n1.fq.gz,n2.fq.gz,n3.fq.gz ont-hq NA True +sample_4 pe1.1.fq.gz,pe1.2.fq.gz pe2.1.fq.gz,pe2.2.fq.gz n1.fq.gz,n2.fq.gz,n3.fq.gz ont-hq assembly.fasta False \ No newline at end of file