From 35c27daad244bc8f69c346564fa611fbe310a707 Mon Sep 17 00:00:00 2001 From: Cole Lyman Date: Thu, 14 Mar 2024 13:57:07 -0600 Subject: [PATCH] Fix interleaved fastq input in CRISPRessoPooled and suppress CRISPRessoWGS params (#42) * Extract out split_interleaved_fastq function to CRISPRessoShared * Implement splitting interleaved fastq files in CRISPRessoPooled * Suppress split_interleaved_input from CRISPRessoWGS parameters * Suppress other parameters in CRISPRessoWGS * Move where interleaved fastq files are split to be trimmed properly --- CRISPResso2/CRISPRessoCORE.py | 22 ++------ CRISPResso2/CRISPRessoPooledCORE.py | 17 ++++-- CRISPResso2/CRISPRessoShared.py | 84 ++++++++++++++++++++++++----- CRISPResso2/CRISPRessoWGSCORE.py | 16 +++--- 4 files changed, 98 insertions(+), 41 deletions(-) diff --git a/CRISPResso2/CRISPRessoCORE.py b/CRISPResso2/CRISPRessoCORE.py index 96b916af..f4a5909e 100644 --- a/CRISPResso2/CRISPRessoCORE.py +++ b/CRISPResso2/CRISPRessoCORE.py @@ -1000,22 +1000,6 @@ def process_single_fastq_write_bam_out(fastq_input, bam_output, bam_header, vari return(aln_stats) -def split_interleaved_fastq(fastq_filename, output_filename_r1, output_filename_r2): - if fastq_filename.endswith('.gz'): - fastq_handle = gzip.open(fastq_filename, 'rt') - else: - fastq_handle=open(fastq_filename) - - try: - fastq_splitted_outfile_r1 = gzip.open(output_filename_r1, 'wt') - fastq_splitted_outfile_r2 = gzip.open(output_filename_r2, 'wt') - [fastq_splitted_outfile_r1.write(line) if (i % 8 < 4) else fastq_splitted_outfile_r2.write(line) for i, line in enumerate(fastq_handle)] - except: - raise CRISPRessoShared.BadParameterException('Error in splitting read pairs from a single file') - - return output_filename_r1, output_filename_r2 - - def normalize_name(name, fastq_r1, fastq_r2, bam_input): """Normalize the name according to the inputs and clean it. @@ -2193,11 +2177,11 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited raise CRISPRessoShared.BadParameterException('The option --split_interleaved_input is available only when a single fastq file is specified!') else: info('Splitting paired end single fastq file into two files...') - args.fastq_r1, args.fastq_r2=split_interleaved_fastq(args.fastq_r1, + args.fastq_r1, args.fastq_r2 = CRISPRessoShared.split_interleaved_fastq(args.fastq_r1, output_filename_r1=_jp(os.path.basename(args.fastq_r1.replace('.fastq', '')).replace('.gz', '')+'_splitted_r1.fastq.gz'), output_filename_r2=_jp(os.path.basename(args.fastq_r1.replace('.fastq', '')).replace('.gz', '')+'_splitted_r2.fastq.gz'),) - files_to_remove+=[args.fastq_r1, args.fastq_r2] - N_READS_INPUT = N_READS_INPUT/2 + files_to_remove += [args.fastq_r1, args.fastq_r2] + N_READS_INPUT /= 2 info('Done!', {'percent_complete': 4}) diff --git a/CRISPResso2/CRISPRessoPooledCORE.py b/CRISPResso2/CRISPRessoPooledCORE.py index f1142488..d84b987e 100644 --- a/CRISPResso2/CRISPRessoPooledCORE.py +++ b/CRISPResso2/CRISPRessoPooledCORE.py @@ -338,7 +338,7 @@ def main(): CRISPRessoShared.set_console_log_level(logger, args.verbosity, args.debug) crispresso_options = CRISPRessoShared.get_crispresso_options() - options_to_ignore = {'fastq_r1', 'fastq_r2', 'amplicon_seq', 'amplicon_name', 'output_folder', 'name', 'zip_output'} + options_to_ignore = {'fastq_r1', 'fastq_r2', 'amplicon_seq', 'amplicon_name', 'output_folder', 'name', 'zip_output', 'split_interleaved_input'} crispresso_options_for_pooled = list(crispresso_options-options_to_ignore) files_to_remove = [] @@ -511,6 +511,15 @@ def main(): info('Processing input', {'percent_complete': 5}) + if args.split_interleaved_input: + info('Splitting paired end single fastq file into two files...') + args.fastq_r1, args.fastq_r2 = CRISPRessoShared.split_interleaved_fastq( + args.fastq_r1, + output_filename_r1=_jp('{0}_splitted_r1.fastq.gz'.format(os.path.basename(args.fastq_r1).replace('.fastq', '').replace('.gz', ''))), + output_filename_r2=_jp('{0}_splitted_r2.fastq.gz'.format(os.path.basename(args.fastq_r1).replace('.fastq', '').replace('.gz', ''))), + ) + files_to_remove += [args.fastq_r1, args.fastq_r2] + # perform read trimming if necessary if args.aligned_pooled_bam is not None: # don't trim reads in aligned bams @@ -615,6 +624,8 @@ def main(): N_READS_AFTER_PREPROCESSING = N_READS_INPUT else: N_READS_INPUT = get_n_reads_fastq(args.fastq_r1) + if args.split_interleaved_input: + N_READS_INPUT /= 2 N_READS_AFTER_PREPROCESSING = get_n_reads_fastq(processed_output_filename) crispresso2_info['running_info']['finished_steps']['count_input_reads'] = (N_READS_INPUT, N_READS_AFTER_PREPROCESSING) @@ -689,7 +700,7 @@ def main(): raise CRISPRessoShared.BadParameterException('Incorrect number of columns provided without header.') elif has_header and len(unmatched_headers) > 0: raise CRISPRessoShared.BadParameterException('Unable to match headers: ' + str(unmatched_headers)) - + if not has_header: # Default header headers = [] @@ -886,7 +897,7 @@ def main(): failed_batch_arr = [] failed_batch_arr_desc = [] for cmd in crispresso_cmds: - + # Extract the folder name from the CRISPResso command folder_name_regex = re.search(r'-o\s+\S+\s+--name\s+(\S+)', cmd) if folder_name_regex: diff --git a/CRISPResso2/CRISPRessoShared.py b/CRISPResso2/CRISPRessoShared.py index dcddfe2f..da9d1251 100644 --- a/CRISPResso2/CRISPRessoShared.py +++ b/CRISPResso2/CRISPRessoShared.py @@ -140,15 +140,19 @@ def set_console_log_level(logger, level, debug=False): def getCRISPRessoArgParser(parser_title="CRISPResso Parameters", required_params=[], suppress_params=[]): parser = argparse.ArgumentParser(description=parser_title, formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--version', action='version', version='%(prog)s ' + __version__) - parser.add_argument('-r1', '--fastq_r1', type=str, help='First fastq file', default='', - required='fastq_r1' in required_params) - parser.add_argument('-r2', '--fastq_r2', type=str, help='Second fastq file for paired end reads', default='') - parser.add_argument('-a', '--amplicon_seq', type=str, - help='Amplicon Sequence (can be comma-separated list of multiple sequences)', - required='amplicon_seq' in required_params) - parser.add_argument('-an', '--amplicon_name', type=str, - help='Amplicon Name (can be comma-separated list of multiple names, corresponding to amplicon sequences given in --amplicon_seq', - default='Reference') + if 'fastq_r1' not in suppress_params: + parser.add_argument('-r1', '--fastq_r1', type=str, help='First fastq file', default='', + required='fastq_r1' in required_params) + if 'fastq_r2' not in suppress_params: + parser.add_argument('-r2', '--fastq_r2', type=str, help='Second fastq file for paired end reads', default='') + if 'amplicon_seq' not in suppress_params: + parser.add_argument('-a', '--amplicon_seq', type=str, + help='Amplicon Sequence (can be comma-separated list of multiple sequences)', + required='amplicon_seq' in required_params) + if 'amplicon_name' not in suppress_params: + parser.add_argument('-an', '--amplicon_name', type=str, + help='Amplicon Name (can be comma-separated list of multiple names, corresponding to amplicon sequences given in --amplicon_seq', + default='Reference') parser.add_argument('-amas', '--amplicon_min_alignment_score', type=str, help='Amplicon Minimum Alignment Score; score between 0 and 100; sequences must have at least this homology score with the amplicon to be aligned (can be comma-separated list of multiple scores, corresponding to amplicon sequences given in --amplicon_seq)', default="") @@ -199,9 +203,10 @@ def getCRISPRessoArgParser(parser_title="CRISPResso Parameters", required_params parser.add_argument('-v', '--verbosity', type=int, help='Verbosity level of output to the console (1-4), 4 is the most verbose', default=3) ## read preprocessing params - parser.add_argument('--split_interleaved_input', '--split_paired_end', - help='Splits a single fastq file containing paired end reads into two files before running CRISPResso', - action='store_true') + if 'split_interleaved_input' not in suppress_params: + parser.add_argument('--split_interleaved_input', '--split_paired_end', + help='Splits a single fastq file containing paired end reads into two files before running CRISPResso', + action='store_true') parser.add_argument('--trim_sequences', help='Enable the trimming of Illumina adapters with Trimmomatic', action='store_true') parser.add_argument('--trimmomatic_command', type=str, help='Command to run trimmomatic', default='trimmomatic') @@ -1326,6 +1331,61 @@ def force_merge_pairs(r1_filename, r2_filename, output_filename): return (lineCount) +def split_interleaved_fastq(fastq_filename, output_filename_r1, output_filename_r2): + """Split an interleaved fastq file into two files, one for each read pair. + + This assumes that the input fastq file is interleaved, i.e. that the reads are ordered as follows: + R1 + R2 + R1 + R2 + ... + + And results in two files, one for each read pair: + output_filename_r1 + R1 + R1 + ... + output_filename_r2 + R2 + R2 + ... + + Parameters + ---------- + fastq_filename : str + Path to the input fastq file. + output_filename_r1 : str + Path to the output fastq file for r1. + output_filename_r2 : str + Path to the output fastq file for r2. + + Returns + ------- + output_filename_r1 : str + Path to the output fastq file for r1. + output_filename_r2 : str + Path to the output fastq file for r2. + """ + if fastq_filename.endswith('.gz'): + fastq_handle = gzip.open(fastq_filename, 'rt') + else: + fastq_handle = open(fastq_filename) + + try: + fastq_splitted_outfile_r1 = gzip.open(output_filename_r1, 'wt') + fastq_splitted_outfile_r2 = gzip.open(output_filename_r2, 'wt') + [fastq_splitted_outfile_r1.write(line) if (i % 8 < 4) else fastq_splitted_outfile_r2.write(line) for i, line in enumerate(fastq_handle)] + except: + raise BadParameterException('Error in splitting read pairs from a single file') + finally: + fastq_handle.close() + fastq_splitted_outfile_r1.close() + fastq_splitted_outfile_r2.close() + + return output_filename_r1, output_filename_r2 + + ###### # allele modification functions ###### diff --git a/CRISPResso2/CRISPRessoWGSCORE.py b/CRISPResso2/CRISPRessoWGSCORE.py index 861b124d..75d3ce43 100644 --- a/CRISPResso2/CRISPRessoWGSCORE.py +++ b/CRISPResso2/CRISPRessoWGSCORE.py @@ -323,13 +323,15 @@ def print_stacktrace_if_debug(): sys.exit() parser = CRISPRessoShared.getCRISPRessoArgParser(parser_title = 'CRISPRessoWGS Parameters', required_params=[], - suppress_params=['bam_input', - 'bam_chr_loc', - 'fastq_r1', - 'fastq_r2', - 'amplicon_seq', - 'amplicon_name', - ]) + suppress_params=[ + 'bam_input', + 'bam_chr_loc', + 'fastq_r1', + 'fastq_r2', + 'amplicon_seq', + 'amplicon_name', + 'split_interleaved_input', + ]) #tool specific optional parser.add_argument('-b', '--bam_file', type=str, help='WGS aligned bam file', required=True, default='bam filename' )