From bfebaf9959989d999636fd693f189836629fb949 Mon Sep 17 00:00:00 2001
From: rhysnewell <rhys.newell94@gmail.com>
Date: Mon, 8 Apr 2024 03:13:20 +0000
Subject: [PATCH 1/5] fix: batch submission now does not require user to write
 to script first

---
 aviary/aviary.py            |  4 ++--
 aviary/modules/processor.py | 43 ++++++++++++++++++++++++++-----------
 test/example_batch.tsv      |  3 +++
 3 files changed, 35 insertions(+), 15 deletions(-)
 create mode 100755 test/example_batch.tsv

diff --git a/aviary/aviary.py b/aviary/aviary.py
index 1e88a6e1..0364ab62 100755
--- a/aviary/aviary.py
+++ b/aviary/aviary.py
@@ -1078,14 +1078,14 @@ def main():
 
                                              aviary batch -f batch_file.tsv -t 32 -o batch_test
                                              
-                                             An example batch file can be found at: 
+                                             An example batch file can be found at: https://rhysnewell.github.io/aviary/examples
 
                                              ''')
 
     batch_options.add_argument(
         '-f', '--batch_file', '--batch-file',
         help='The tab or comma separated batch file containing the input samples to assemble and/or recover MAGs from. \n'
-             'An example batch file can be found at XXX. The heading line is required. \n'
+             'An example batch file can be found at https://rhysnewell.github.io/aviary/examples. The heading line is required. \n'
              'The number of reads provided to each sample is flexible as is the type of assembly being performed (if any). \n'
              'Multiple reads can be supplied by providing a comma-separated list (surrounded by double quotes \"\" if using a \n'
              'comma separated batch file) within the specific read column.',
diff --git a/aviary/modules/processor.py b/aviary/modules/processor.py
index bba9dd66..42bd440b 100644
--- a/aviary/modules/processor.py
+++ b/aviary/modules/processor.py
@@ -468,6 +468,8 @@ def run_workflow(self, cores=16, profile=None, cluster_retries=None,
                 resources=f"--resources mem_mb={int(self.max_memory)*1024} {self.resources}" if not dryrun else ""
             )
 
+            logging.debug(f"Command: {cmd}")
+
             if write_to_script is not None:
                 write_to_script.append(cmd)
                 continue
@@ -495,20 +497,31 @@ def process_batch(args, prefix):
 
     logging.info(f"Reading batch file: {args.batch_file}")
 
-    header=0
+    header=None
+    separator=' '
     with open(args.batch_file, mode='r') as check_batch:
         for line in check_batch.readlines():
-            if "sample\tshort_reads_1\tshort_reads_2\tlong_reads\tlong_read_type\tassembly\tcoassemble" in line \
-                or "sample,short_reads_1,short_reads_2,long_reads,long_read_type,assembly,coassemble" in line \
-                or "sample  short_reads_1   short_reads_2   long_reads      long_read_type  assembly        coassemble" in line \
-                or "sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble" in line:
-               header=1
-               logging.debug("Inferred header")
-            else:
-                logging.debug("No heading inferred.")
+            line = line.strip()
+            for sep in ['\t', ',', ' ']:
+                separated = line.split(sep)
+                print(separated)
+                if separated == ['sample', 'short_reads_1', 'short_reads_2', 'long_reads', 'long_read_type', 'assembly', 'coassemble']:
+                    header=0
+                    separator=sep
+                    logging.debug("Inferred header")
+                    break
+                elif len(separated) >= 7:
+                    header=None
+                    separator=sep
+                    logging.debug("Inferred no header")
+                    break
+            if header is None:
+                logging.debug("No header found")
             break
-
-    batch = pd.read_csv(args.batch_file, sep=None, engine='python', skiprows=header)
+    if header is not None:
+        batch = pd.read_csv(args.batch_file, sep=separator, engine='python', header=header)
+    else:
+        batch = pd.read_csv(args.batch_file, sep=separator, engine='python', names=['sample', 'short_reads_1', 'short_reads_2', 'long_reads', 'long_read_type', 'assembly', 'coassemble'])
     if len(batch.columns) != 7:
         logging.critical(f"Batch file contains incorrect number of columns ({len(batch.columns)}). Should contain 7.")
         logging.critical(f"Current columns: {batch.columns}")
@@ -525,11 +538,15 @@ def process_batch(args, prefix):
 
     try:
         script_file = args.write_script
-        write_to_script = []
     except AttributeError:
         script_file = None
-        write_to_script = None
+    
+    write_to_script = None
+    if script_file is not None:
+        write_to_script = []
 
+    print(script_file)
+    print(write_to_script)
     runs = []
     args.interleaved = "none" # hacky solution to skip attribute error
     args.coupled = "none"
diff --git a/test/example_batch.tsv b/test/example_batch.tsv
new file mode 100755
index 00000000..bb246ed9
--- /dev/null
+++ b/test/example_batch.tsv
@@ -0,0 +1,3 @@
+sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble
+sample_1 test/data/wgsim.1.fq.gz test/data/wgsim.2.fq.gz NA ont NA NA
+sample_2 NA NA test/data/pbsim.fq.gz pacbio NA NA
\ No newline at end of file

From c3d9277c12b04a111b2a4ea02fa8b78ccd2f506c Mon Sep 17 00:00:00 2001
From: rhysnewell <rhys.newell94@gmail.com>
Date: Mon, 8 Apr 2024 03:54:14 +0000
Subject: [PATCH 2/5] fix: batch headers now constant var

---
 aviary/modules/processor.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/aviary/modules/processor.py b/aviary/modules/processor.py
index 42bd440b..a22c6a0f 100644
--- a/aviary/modules/processor.py
+++ b/aviary/modules/processor.py
@@ -43,6 +43,8 @@
 from snakemake.io import load_configfile
 from ruamel.yaml import YAML  # used for yaml reading with comments
 
+BATCH_HEADER=['sample', 'short_reads_1', 'short_reads_2', 'long_reads', 'long_read_type', 'assembly', 'coassemble']
+
 # Debug
 debug={1:logging.CRITICAL,
        2:logging.ERROR,
@@ -505,7 +507,7 @@ def process_batch(args, prefix):
             for sep in ['\t', ',', ' ']:
                 separated = line.split(sep)
                 print(separated)
-                if separated == ['sample', 'short_reads_1', 'short_reads_2', 'long_reads', 'long_read_type', 'assembly', 'coassemble']:
+                if separated == BATCH_HEADER:
                     header=0
                     separator=sep
                     logging.debug("Inferred header")
@@ -521,7 +523,7 @@ def process_batch(args, prefix):
     if header is not None:
         batch = pd.read_csv(args.batch_file, sep=separator, engine='python', header=header)
     else:
-        batch = pd.read_csv(args.batch_file, sep=separator, engine='python', names=['sample', 'short_reads_1', 'short_reads_2', 'long_reads', 'long_read_type', 'assembly', 'coassemble'])
+        batch = pd.read_csv(args.batch_file, sep=separator, engine='python', names=BATCH_HEADER)
     if len(batch.columns) != 7:
         logging.critical(f"Batch file contains incorrect number of columns ({len(batch.columns)}). Should contain 7.")
         logging.critical(f"Current columns: {batch.columns}")

From 95f00d3b5879b07f0b516be2cd288b097465d401 Mon Sep 17 00:00:00 2001
From: rhysnewell <rhys.newell94@gmail.com>
Date: Mon, 8 Apr 2024 05:35:38 +0000
Subject: [PATCH 3/5] fix: single line for reading in batch tsv

---
 aviary/modules/processor.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/aviary/modules/processor.py b/aviary/modules/processor.py
index a22c6a0f..c881b6ba 100644
--- a/aviary/modules/processor.py
+++ b/aviary/modules/processor.py
@@ -506,7 +506,6 @@ def process_batch(args, prefix):
             line = line.strip()
             for sep in ['\t', ',', ' ']:
                 separated = line.split(sep)
-                print(separated)
                 if separated == BATCH_HEADER:
                     header=0
                     separator=sep
@@ -520,10 +519,9 @@ def process_batch(args, prefix):
             if header is None:
                 logging.debug("No header found")
             break
-    if header is not None:
-        batch = pd.read_csv(args.batch_file, sep=separator, engine='python', header=header)
-    else:
-        batch = pd.read_csv(args.batch_file, sep=separator, engine='python', names=BATCH_HEADER)
+
+    batch = pd.read_csv(args.batch_file, sep=separator, engine='python', names=BATCH_HEADER, header=header)
+    print(batch)
     if len(batch.columns) != 7:
         logging.critical(f"Batch file contains incorrect number of columns ({len(batch.columns)}). Should contain 7.")
         logging.critical(f"Current columns: {batch.columns}")
@@ -547,8 +545,6 @@ def process_batch(args, prefix):
     if script_file is not None:
         write_to_script = []
 
-    print(script_file)
-    print(write_to_script)
     runs = []
     args.interleaved = "none" # hacky solution to skip attribute error
     args.coupled = "none"

From c50b6b541e554a8e7e198742d62c3ecd41ab0f6e Mon Sep 17 00:00:00 2001
From: rhysnewell <rhys.newell94@gmail.com>
Date: Tue, 9 Apr 2024 07:13:13 +0000
Subject: [PATCH 4/5] fix: add batch test to test integration

---
 test/{ => data}/example_batch.tsv |  2 +-
 test/test_integration.py          | 36 ++++++++++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 2 deletions(-)
 rename test/{ => data}/example_batch.tsv (62%)

diff --git a/test/example_batch.tsv b/test/data/example_batch.tsv
similarity index 62%
rename from test/example_batch.tsv
rename to test/data/example_batch.tsv
index bb246ed9..e014b39a 100755
--- a/test/example_batch.tsv
+++ b/test/data/example_batch.tsv
@@ -1,3 +1,3 @@
 sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble
 sample_1 test/data/wgsim.1.fq.gz test/data/wgsim.2.fq.gz NA ont NA NA
-sample_2 NA NA test/data/pbsim.fq.gz pacbio NA NA
\ No newline at end of file
+sample_2 test/data/wgsim.1.fq.gz test/data/wgsim.2.fq.gz test/data/pbsim.fq.gz ont NA NA
\ No newline at end of file
diff --git a/test/test_integration.py b/test/test_integration.py
index 687143dd..fe17f77f 100755
--- a/test/test_integration.py
+++ b/test/test_integration.py
@@ -162,7 +162,6 @@ def test_short_read_recovery_queue_submission(self):
 
         cmd = (
             f"aviary recover "
-            f"--output {output_dir} "
             f"-o {output_dir}/aviary_out "
             f"-1 {data}/wgsim.1.fq.gz "
             f"-2 {data}/wgsim.2.fq.gz "
@@ -180,6 +179,41 @@ def test_short_read_recovery_queue_submission(self):
             num_lines = sum(1 for _ in f)
         self.assertEqual(num_lines, 3)
 
+    def test_batch_recovery(self):
+        output_dir = os.path.join("example", "test_batch_recovery")
+        self.setup_output_dir(output_dir)
+        cmd = (
+            f"aviary batch "
+            f"-o {output_dir}/aviary_out "
+            f"-f {data}/example_batch.tsv "
+            f"--conda-prefix {path_to_conda} "
+            f"--skip-binners rosella vamb metabat "
+            f"--skip-qc "
+            f"--refinery-max-iterations 0 "
+            f"--min-read-size 10 --min-mean-q 1 "
+            f"-n 32 -t 32 "
+        )
+        subprocess.run(cmd, shell=True, check=True)
+
+        self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/sample_1/data/final_contigs.fasta"))
+        self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/sample_2/data/final_contigs.fasta"))
+
+        bin_info_path_1 = f"{output_dir}/aviary_out/sample_1/bins/bin_info.tsv"
+        bin_info_path_2 = f"{output_dir}/aviary_out/sample_2/bins/bin_info.tsv"
+        self.assertTrue(os.path.isfile(bin_info_path_1))
+        self.assertTrue(os.path.isfile(bin_info_path_2))
+        with open(bin_info_path_1) as f:
+            num_lines = sum(1 for _ in f)
+        self.assertEqual(num_lines, 3)
+
+        self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.95"))
+        self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.97"))
+        self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.99"))
+
+        self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.95/pangenomes"))
+        self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.97/pangenomes"))
+        self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.99/pangenomes"))
+
 
 if __name__ == "__main__":
     unittest.main()

From b5a1d3a49da777311b9adf7192bf3208d9474d56 Mon Sep 17 00:00:00 2001
From: rhysnewell <rhys.newell94@gmail.com>
Date: Tue, 9 Apr 2024 07:14:04 +0000
Subject: [PATCH 5/5] fix: remove debug

---
 aviary/modules/processor.py              |  1 -
 docs/_include/examples/example_batch.tsv | 10 +++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/aviary/modules/processor.py b/aviary/modules/processor.py
index c881b6ba..57df8d40 100644
--- a/aviary/modules/processor.py
+++ b/aviary/modules/processor.py
@@ -521,7 +521,6 @@ def process_batch(args, prefix):
             break
 
     batch = pd.read_csv(args.batch_file, sep=separator, engine='python', names=BATCH_HEADER, header=header)
-    print(batch)
     if len(batch.columns) != 7:
         logging.critical(f"Batch file contains incorrect number of columns ({len(batch.columns)}). Should contain 7.")
         logging.critical(f"Current columns: {batch.columns}")
diff --git a/docs/_include/examples/example_batch.tsv b/docs/_include/examples/example_batch.tsv
index 1749ed9c..a8288358 100755
--- a/docs/_include/examples/example_batch.tsv
+++ b/docs/_include/examples/example_batch.tsv
@@ -1,5 +1,5 @@
-sample	short_reads_1	short_reads_2	long_reads	long_read_type	assembly    coassemble
-sample_1	pe1.fq.gz	pe2.fq.gz	nanopore.fq.gz	ont	NA  NA
-sample_2	interleaved.fq.gz	NA	pacbio.fq.gz	ccs	NA  NA
-sample_3	pe1.1.fq.gz,pe1.2.fq.gz	pe2.1.fq.gz,pe2.2.fq.gz	n1.fq.gz,n2.fq.gz,n3.fq.gz	ont-hq	NA  True
-sample_4	pe1.1.fq.gz,pe1.2.fq.gz	pe2.1.fq.gz,pe2.2.fq.gz	n1.fq.gz,n2.fq.gz,n3.fq.gz	ont-hq	assembly.fasta  False
\ No newline at end of file
+sample   	short_reads_1           	short_reads_2           	long_reads                 	long_read_type 	assembly    coassemble
+sample_1 	pe1.fq.gz               	pe2.fq.gz               	nanopore.fq.gz             	ont            	NA  NA
+sample_2 	interleaved.fq.gz       	NA                      	pacbio.fq.gz               	ccs            	NA  NA
+sample_3 	pe1.1.fq.gz,pe1.2.fq.gz 	pe2.1.fq.gz,pe2.2.fq.gz 	n1.fq.gz,n2.fq.gz,n3.fq.gz 	ont-hq         	NA  True
+sample_4 	pe1.1.fq.gz,pe1.2.fq.gz 	pe2.1.fq.gz,pe2.2.fq.gz 	n1.fq.gz,n2.fq.gz,n3.fq.gz 	ont-hq         	assembly.fasta  False
\ No newline at end of file