From 35c27daad244bc8f69c346564fa611fbe310a707 Mon Sep 17 00:00:00 2001
From: Cole Lyman <Cole@colelyman.com>
Date: Thu, 14 Mar 2024 13:57:07 -0600
Subject: [PATCH] Fix interleaved fastq input in CRISPRessoPooled and suppress
 CRISPRessoWGS params (#42)

* Extract out split_interleaved_fastq function to CRISPRessoShared

* Implement splitting interleaved fastq files in CRISPRessoPooled

* Suppress split_interleaved_input from CRISPRessoWGS parameters

* Suppress other parameters in CRISPRessoWGS

* Move where interleaved fastq files are split to be trimmed properly
---
 CRISPResso2/CRISPRessoCORE.py       | 22 ++------
 CRISPResso2/CRISPRessoPooledCORE.py | 17 ++++--
 CRISPResso2/CRISPRessoShared.py     | 84 ++++++++++++++++++++++++-----
 CRISPResso2/CRISPRessoWGSCORE.py    | 16 +++---
 4 files changed, 98 insertions(+), 41 deletions(-)

diff --git a/CRISPResso2/CRISPRessoCORE.py b/CRISPResso2/CRISPRessoCORE.py
index 96b916af..f4a5909e 100644
--- a/CRISPResso2/CRISPRessoCORE.py
+++ b/CRISPResso2/CRISPRessoCORE.py
@@ -1000,22 +1000,6 @@ def process_single_fastq_write_bam_out(fastq_input, bam_output, bam_header, vari
     return(aln_stats)
 
 
-def split_interleaved_fastq(fastq_filename, output_filename_r1, output_filename_r2):
-    if fastq_filename.endswith('.gz'):
-        fastq_handle = gzip.open(fastq_filename, 'rt')
-    else:
-        fastq_handle=open(fastq_filename)
-
-    try:
-        fastq_splitted_outfile_r1 = gzip.open(output_filename_r1, 'wt')
-        fastq_splitted_outfile_r2 = gzip.open(output_filename_r2, 'wt')
-        [fastq_splitted_outfile_r1.write(line) if (i % 8 < 4) else fastq_splitted_outfile_r2.write(line) for i, line in enumerate(fastq_handle)]
-    except:
-        raise CRISPRessoShared.BadParameterException('Error in splitting read pairs from a single file')
-
-    return output_filename_r1, output_filename_r2
-
-
 def normalize_name(name, fastq_r1, fastq_r2, bam_input):
     """Normalize the name according to the inputs and clean it.
 
@@ -2193,11 +2177,11 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited
                 raise CRISPRessoShared.BadParameterException('The option --split_interleaved_input is available only when a single fastq file is specified!')
             else:
                 info('Splitting paired end single fastq file into two files...')
-                args.fastq_r1, args.fastq_r2=split_interleaved_fastq(args.fastq_r1,
+                args.fastq_r1, args.fastq_r2 = CRISPRessoShared.split_interleaved_fastq(args.fastq_r1,
                     output_filename_r1=_jp(os.path.basename(args.fastq_r1.replace('.fastq', '')).replace('.gz', '')+'_splitted_r1.fastq.gz'),
                     output_filename_r2=_jp(os.path.basename(args.fastq_r1.replace('.fastq', '')).replace('.gz', '')+'_splitted_r2.fastq.gz'),)
-                files_to_remove+=[args.fastq_r1, args.fastq_r2]
-                N_READS_INPUT = N_READS_INPUT/2
+                files_to_remove += [args.fastq_r1, args.fastq_r2]
+                N_READS_INPUT /= 2
 
                 info('Done!', {'percent_complete': 4})
 
diff --git a/CRISPResso2/CRISPRessoPooledCORE.py b/CRISPResso2/CRISPRessoPooledCORE.py
index f1142488..d84b987e 100644
--- a/CRISPResso2/CRISPRessoPooledCORE.py
+++ b/CRISPResso2/CRISPRessoPooledCORE.py
@@ -338,7 +338,7 @@ def main():
         CRISPRessoShared.set_console_log_level(logger, args.verbosity, args.debug)
 
         crispresso_options = CRISPRessoShared.get_crispresso_options()
-        options_to_ignore = {'fastq_r1', 'fastq_r2', 'amplicon_seq', 'amplicon_name', 'output_folder', 'name', 'zip_output'}
+        options_to_ignore = {'fastq_r1', 'fastq_r2', 'amplicon_seq', 'amplicon_name', 'output_folder', 'name', 'zip_output', 'split_interleaved_input'}
         crispresso_options_for_pooled = list(crispresso_options-options_to_ignore)
 
         files_to_remove = []
@@ -511,6 +511,15 @@ def main():
 
         info('Processing input', {'percent_complete': 5})
 
+        if args.split_interleaved_input:
+            info('Splitting paired end single fastq file into two files...')
+            args.fastq_r1, args.fastq_r2 = CRISPRessoShared.split_interleaved_fastq(
+                args.fastq_r1,
+                output_filename_r1=_jp('{0}_splitted_r1.fastq.gz'.format(os.path.basename(args.fastq_r1).replace('.fastq', '').replace('.gz', ''))),
+                output_filename_r2=_jp('{0}_splitted_r2.fastq.gz'.format(os.path.basename(args.fastq_r1).replace('.fastq', '').replace('.gz', ''))),
+            )
+            files_to_remove += [args.fastq_r1, args.fastq_r2]
+
         # perform read trimming if necessary
         if args.aligned_pooled_bam is not None:
             # don't trim reads in aligned bams
@@ -615,6 +624,8 @@ def main():
                 N_READS_AFTER_PREPROCESSING = N_READS_INPUT
             else:
                 N_READS_INPUT = get_n_reads_fastq(args.fastq_r1)
+                if args.split_interleaved_input:
+                    N_READS_INPUT /= 2
                 N_READS_AFTER_PREPROCESSING = get_n_reads_fastq(processed_output_filename)
 
             crispresso2_info['running_info']['finished_steps']['count_input_reads'] = (N_READS_INPUT, N_READS_AFTER_PREPROCESSING)
@@ -689,7 +700,7 @@ def main():
                 raise CRISPRessoShared.BadParameterException('Incorrect number of columns provided without header.')
             elif has_header and len(unmatched_headers) > 0:
                 raise CRISPRessoShared.BadParameterException('Unable to match headers: ' + str(unmatched_headers))
-            
+
             if not has_header:
                 # Default header
                 headers = []
@@ -886,7 +897,7 @@ def main():
             failed_batch_arr = []
             failed_batch_arr_desc = []
             for cmd in crispresso_cmds:
-                
+
                 # Extract the folder name from the CRISPResso command
                 folder_name_regex = re.search(r'-o\s+\S+\s+--name\s+(\S+)', cmd)
                 if folder_name_regex:
diff --git a/CRISPResso2/CRISPRessoShared.py b/CRISPResso2/CRISPRessoShared.py
index dcddfe2f..da9d1251 100644
--- a/CRISPResso2/CRISPRessoShared.py
+++ b/CRISPResso2/CRISPRessoShared.py
@@ -140,15 +140,19 @@ def set_console_log_level(logger, level, debug=False):
 def getCRISPRessoArgParser(parser_title="CRISPResso Parameters", required_params=[], suppress_params=[]):
     parser = argparse.ArgumentParser(description=parser_title, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--version', action='version', version='%(prog)s ' + __version__)
-    parser.add_argument('-r1', '--fastq_r1', type=str, help='First fastq file', default='',
-                        required='fastq_r1' in required_params)
-    parser.add_argument('-r2', '--fastq_r2', type=str, help='Second fastq file for paired end reads', default='')
-    parser.add_argument('-a', '--amplicon_seq', type=str,
-                        help='Amplicon Sequence (can be comma-separated list of multiple sequences)',
-                        required='amplicon_seq' in required_params)
-    parser.add_argument('-an', '--amplicon_name', type=str,
-                        help='Amplicon Name (can be comma-separated list of multiple names, corresponding to amplicon sequences given in --amplicon_seq',
-                        default='Reference')
+    if 'fastq_r1' not in suppress_params:
+        parser.add_argument('-r1', '--fastq_r1', type=str, help='First fastq file', default='',
+                            required='fastq_r1' in required_params)
+    if 'fastq_r2' not in suppress_params:
+        parser.add_argument('-r2', '--fastq_r2', type=str, help='Second fastq file for paired end reads', default='')
+    if 'amplicon_seq' not in suppress_params:
+        parser.add_argument('-a', '--amplicon_seq', type=str,
+                            help='Amplicon Sequence (can be comma-separated list of multiple sequences)',
+                            required='amplicon_seq' in required_params)
+    if 'amplicon_name' not in suppress_params:
+        parser.add_argument('-an', '--amplicon_name', type=str,
+                            help='Amplicon Name (can be comma-separated list of multiple names, corresponding to amplicon sequences given in --amplicon_seq',
+                            default='Reference')
     parser.add_argument('-amas', '--amplicon_min_alignment_score', type=str,
                         help='Amplicon Minimum Alignment Score; score between 0 and 100; sequences must have at least this homology score with the amplicon to be aligned (can be comma-separated list of multiple scores, corresponding to amplicon sequences given in --amplicon_seq)',
                         default="")
@@ -199,9 +203,10 @@ def getCRISPRessoArgParser(parser_title="CRISPResso Parameters", required_params
     parser.add_argument('-v', '--verbosity', type=int, help='Verbosity level of output to the console (1-4), 4 is the most verbose', default=3)
 
     ## read preprocessing params
-    parser.add_argument('--split_interleaved_input', '--split_paired_end',
-                        help='Splits a single fastq file containing paired end reads into two files before running CRISPResso',
-                        action='store_true')
+    if 'split_interleaved_input' not in suppress_params:
+        parser.add_argument('--split_interleaved_input', '--split_paired_end',
+                            help='Splits a single fastq file containing paired end reads into two files before running CRISPResso',
+                            action='store_true')
     parser.add_argument('--trim_sequences', help='Enable the trimming of Illumina adapters with Trimmomatic',
                         action='store_true')
     parser.add_argument('--trimmomatic_command', type=str, help='Command to run trimmomatic', default='trimmomatic')
@@ -1326,6 +1331,61 @@ def force_merge_pairs(r1_filename, r2_filename, output_filename):
     return (lineCount)
 
 
+def split_interleaved_fastq(fastq_filename, output_filename_r1, output_filename_r2):
+    """Split an interleaved fastq file into two files, one for each read pair.
+
+    This assumes that the input fastq file is interleaved, i.e. that the reads are ordered as follows:
+        R1
+        R2
+        R1
+        R2
+        ...
+
+    And results in two files, one for each read pair:
+        output_filename_r1
+            R1
+            R1
+            ...
+        output_filename_r2
+            R2
+            R2
+            ...
+
+    Parameters
+    ----------
+    fastq_filename : str
+        Path to the input fastq file.
+    output_filename_r1 : str
+        Path to the output fastq file for r1.
+    output_filename_r2 : str
+        Path to the output fastq file for r2.
+
+    Returns
+    -------
+    output_filename_r1 : str
+        Path to the output fastq file for r1.
+    output_filename_r2 : str
+        Path to the output fastq file for r2.
+    """
+    if fastq_filename.endswith('.gz'):
+        fastq_handle = gzip.open(fastq_filename, 'rt')
+    else:
+        fastq_handle = open(fastq_filename)
+
+    try:
+        fastq_splitted_outfile_r1 = gzip.open(output_filename_r1, 'wt')
+        fastq_splitted_outfile_r2 = gzip.open(output_filename_r2, 'wt')
+        [fastq_splitted_outfile_r1.write(line) if (i % 8 < 4) else fastq_splitted_outfile_r2.write(line) for i, line in enumerate(fastq_handle)]
+    except:
+        raise BadParameterException('Error in splitting read pairs from a single file')
+    finally:
+        fastq_handle.close()
+        fastq_splitted_outfile_r1.close()
+        fastq_splitted_outfile_r2.close()
+
+    return output_filename_r1, output_filename_r2
+
+
 ######
 # allele modification functions
 ######
diff --git a/CRISPResso2/CRISPRessoWGSCORE.py b/CRISPResso2/CRISPRessoWGSCORE.py
index 861b124d..75d3ce43 100644
--- a/CRISPResso2/CRISPRessoWGSCORE.py
+++ b/CRISPResso2/CRISPRessoWGSCORE.py
@@ -323,13 +323,15 @@ def print_stacktrace_if_debug():
             sys.exit()
 
         parser = CRISPRessoShared.getCRISPRessoArgParser(parser_title = 'CRISPRessoWGS Parameters', required_params=[],
-                    suppress_params=['bam_input',
-                                   'bam_chr_loc',
-                                   'fastq_r1',
-                                   'fastq_r2',
-                                   'amplicon_seq',
-                                   'amplicon_name',
-                                   ])
+                    suppress_params=[
+                        'bam_input',
+                        'bam_chr_loc',
+                        'fastq_r1',
+                        'fastq_r2',
+                        'amplicon_seq',
+                        'amplicon_name',
+                        'split_interleaved_input',
+                    ])
 
         #tool specific optional
         parser.add_argument('-b', '--bam_file', type=str,  help='WGS aligned bam file', required=True, default='bam filename' )