Merge pull request #199 from rhysnewell/ISS-198

fix: batch submission now does not require user to write to script first
rhysnewell · Apr 10, 2024 · bf90668 · bf90668
2 parents 7195d25 + b5a1d3a
commit bf90668
Show file tree

Hide file tree

Showing 5 changed files with 71 additions and 20 deletions.
diff --git a/aviary/aviary.py b/aviary/aviary.py
@@ -1078,14 +1078,14 @@ def main():
 
                                              aviary batch -f batch_file.tsv -t 32 -o batch_test
                                              
-                                             An example batch file can be found at: 
+                                             An example batch file can be found at: https://rhysnewell.github.io/aviary/examples
 
                                              ''')
 
     batch_options.add_argument(
         '-f', '--batch_file', '--batch-file',
         help='The tab or comma separated batch file containing the input samples to assemble and/or recover MAGs from. \n'
-             'An example batch file can be found at XXX. The heading line is required. \n'
+             'An example batch file can be found at https://rhysnewell.github.io/aviary/examples. The heading line is required. \n'
              'The number of reads provided to each sample is flexible as is the type of assembly being performed (if any). \n'
              'Multiple reads can be supplied by providing a comma-separated list (surrounded by double quotes \"\" if using a \n'
              'comma separated batch file) within the specific read column.',

diff --git a/aviary/modules/processor.py b/aviary/modules/processor.py
@@ -43,6 +43,8 @@
 from snakemake.io import load_configfile
 from ruamel.yaml import YAML  # used for yaml reading with comments
 
+BATCH_HEADER=['sample', 'short_reads_1', 'short_reads_2', 'long_reads', 'long_read_type', 'assembly', 'coassemble']
+
 # Debug
 debug={1:logging.CRITICAL,
        2:logging.ERROR,
@@ -468,6 +470,8 @@ def run_workflow(self, cores=16, profile=None, cluster_retries=None,
                 resources=f"--resources mem_mb={int(self.max_memory)*1024} {self.resources}" if not dryrun else ""
             )
 
+            logging.debug(f"Command: {cmd}")
+
             if write_to_script is not None:
                 write_to_script.append(cmd)
                 continue
@@ -495,20 +499,28 @@ def process_batch(args, prefix):
 
     logging.info(f"Reading batch file: {args.batch_file}")
 
-    header=0
+    header=None
+    separator=' '
     with open(args.batch_file, mode='r') as check_batch:
         for line in check_batch.readlines():
-            if "sample\tshort_reads_1\tshort_reads_2\tlong_reads\tlong_read_type\tassembly\tcoassemble" in line \
-                or "sample,short_reads_1,short_reads_2,long_reads,long_read_type,assembly,coassemble" in line \
-                or "sample  short_reads_1   short_reads_2   long_reads      long_read_type  assembly        coassemble" in line \
-                or "sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble" in line:
-               header=1
-               logging.debug("Inferred header")
-            else:
-                logging.debug("No heading inferred.")
+            line = line.strip()
+            for sep in ['\t', ',', ' ']:
+                separated = line.split(sep)
+                if separated == BATCH_HEADER:
+                    header=0
+                    separator=sep
+                    logging.debug("Inferred header")
+                    break
+                elif len(separated) >= 7:
+                    header=None
+                    separator=sep
+                    logging.debug("Inferred no header")
+                    break
+            if header is None:
+                logging.debug("No header found")
             break
 
-    batch = pd.read_csv(args.batch_file, sep=None, engine='python', skiprows=header)
+    batch = pd.read_csv(args.batch_file, sep=separator, engine='python', names=BATCH_HEADER, header=header)
     if len(batch.columns) != 7:
         logging.critical(f"Batch file contains incorrect number of columns ({len(batch.columns)}). Should contain 7.")
         logging.critical(f"Current columns: {batch.columns}")
@@ -525,10 +537,12 @@ def process_batch(args, prefix):
 
     try:
         script_file = args.write_script
-        write_to_script = []
     except AttributeError:
         script_file = None
-        write_to_script = None
+
+    write_to_script = None
+    if script_file is not None:
+        write_to_script = []
 
     runs = []
     args.interleaved = "none" # hacky solution to skip attribute error

diff --git a/docs/_include/examples/example_batch.tsv b/docs/_include/examples/example_batch.tsv
@@ -1,5 +1,5 @@
-sample	short_reads_1	short_reads_2	long_reads	long_read_type	assembly    coassemble
-sample_1	pe1.fq.gz	pe2.fq.gz	nanopore.fq.gz	ont	NA  NA
-sample_2	interleaved.fq.gz	NA	pacbio.fq.gz	ccs	NA  NA
-sample_3	pe1.1.fq.gz,pe1.2.fq.gz	pe2.1.fq.gz,pe2.2.fq.gz	n1.fq.gz,n2.fq.gz,n3.fq.gz	ont-hq	NA  True
-sample_4	pe1.1.fq.gz,pe1.2.fq.gz	pe2.1.fq.gz,pe2.2.fq.gz	n1.fq.gz,n2.fq.gz,n3.fq.gz	ont-hq	assembly.fasta  False
+sample   	short_reads_1           	short_reads_2           	long_reads                 	long_read_type 	assembly    coassemble
+sample_1 	pe1.fq.gz               	pe2.fq.gz               	nanopore.fq.gz             	ont            	NA  NA
+sample_2 	interleaved.fq.gz       	NA                      	pacbio.fq.gz               	ccs            	NA  NA
+sample_3 	pe1.1.fq.gz,pe1.2.fq.gz 	pe2.1.fq.gz,pe2.2.fq.gz 	n1.fq.gz,n2.fq.gz,n3.fq.gz 	ont-hq         	NA  True
+sample_4 	pe1.1.fq.gz,pe1.2.fq.gz 	pe2.1.fq.gz,pe2.2.fq.gz 	n1.fq.gz,n2.fq.gz,n3.fq.gz 	ont-hq         	assembly.fasta  False
diff --git a/test/data/example_batch.tsv b/test/data/example_batch.tsv
@@ -0,0 +1,3 @@
+sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble
+sample_1 test/data/wgsim.1.fq.gz test/data/wgsim.2.fq.gz NA ont NA NA
+sample_2 test/data/wgsim.1.fq.gz test/data/wgsim.2.fq.gz test/data/pbsim.fq.gz ont NA NA
diff --git a/test/test_integration.py b/test/test_integration.py
@@ -162,7 +162,6 @@ def test_short_read_recovery_queue_submission(self):
 
         cmd = (
             f"aviary recover "
-            f"--output {output_dir} "
             f"-o {output_dir}/aviary_out "
             f"-1 {data}/wgsim.1.fq.gz "
             f"-2 {data}/wgsim.2.fq.gz "
@@ -180,6 +179,41 @@ def test_short_read_recovery_queue_submission(self):
             num_lines = sum(1 for _ in f)
         self.assertEqual(num_lines, 3)
 
+    def test_batch_recovery(self):
+        output_dir = os.path.join("example", "test_batch_recovery")
+        self.setup_output_dir(output_dir)
+        cmd = (
+            f"aviary batch "
+            f"-o {output_dir}/aviary_out "
+            f"-f {data}/example_batch.tsv "
+            f"--conda-prefix {path_to_conda} "
+            f"--skip-binners rosella vamb metabat "
+            f"--skip-qc "
+            f"--refinery-max-iterations 0 "
+            f"--min-read-size 10 --min-mean-q 1 "
+            f"-n 32 -t 32 "
+        )
+        subprocess.run(cmd, shell=True, check=True)
+
+        self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/sample_1/data/final_contigs.fasta"))
+        self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/sample_2/data/final_contigs.fasta"))
+
+        bin_info_path_1 = f"{output_dir}/aviary_out/sample_1/bins/bin_info.tsv"
+        bin_info_path_2 = f"{output_dir}/aviary_out/sample_2/bins/bin_info.tsv"
+        self.assertTrue(os.path.isfile(bin_info_path_1))
+        self.assertTrue(os.path.isfile(bin_info_path_2))
+        with open(bin_info_path_1) as f:
+            num_lines = sum(1 for _ in f)
+        self.assertEqual(num_lines, 3)
+
+        self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.95"))
+        self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.97"))
+        self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.99"))
+
+        self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.95/pangenomes"))
+        self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.97/pangenomes"))
+        self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.99/pangenomes"))
+
 
 if __name__ == "__main__":
     unittest.main()