Skip to content

Commit

Permalink
Fix interleaved fastq input in CRISPRessoPooled and suppress CRISPRes…
Browse files Browse the repository at this point in the history
…soWGS params (#42)

* Extract out split_interleaved_fastq function to CRISPRessoShared

* Implement splitting interleaved fastq files in CRISPRessoPooled

* Suppress split_interleaved_input from CRISPRessoWGS parameters

* Suppress other parameters in CRISPRessoWGS

* Move where interleaved fastq files are split to be trimmed properly
  • Loading branch information
Colelyman authored and mbowcut2 committed Mar 18, 2024
1 parent e6b74ce commit 35c27da
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 41 deletions.
22 changes: 3 additions & 19 deletions CRISPResso2/CRISPRessoCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -1000,22 +1000,6 @@ def process_single_fastq_write_bam_out(fastq_input, bam_output, bam_header, vari
return(aln_stats)


def split_interleaved_fastq(fastq_filename, output_filename_r1, output_filename_r2):
if fastq_filename.endswith('.gz'):
fastq_handle = gzip.open(fastq_filename, 'rt')
else:
fastq_handle=open(fastq_filename)

try:
fastq_splitted_outfile_r1 = gzip.open(output_filename_r1, 'wt')
fastq_splitted_outfile_r2 = gzip.open(output_filename_r2, 'wt')
[fastq_splitted_outfile_r1.write(line) if (i % 8 < 4) else fastq_splitted_outfile_r2.write(line) for i, line in enumerate(fastq_handle)]
except:
raise CRISPRessoShared.BadParameterException('Error in splitting read pairs from a single file')

return output_filename_r1, output_filename_r2


def normalize_name(name, fastq_r1, fastq_r2, bam_input):
"""Normalize the name according to the inputs and clean it.
Expand Down Expand Up @@ -2193,11 +2177,11 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited
raise CRISPRessoShared.BadParameterException('The option --split_interleaved_input is available only when a single fastq file is specified!')
else:
info('Splitting paired end single fastq file into two files...')
args.fastq_r1, args.fastq_r2=split_interleaved_fastq(args.fastq_r1,
args.fastq_r1, args.fastq_r2 = CRISPRessoShared.split_interleaved_fastq(args.fastq_r1,
output_filename_r1=_jp(os.path.basename(args.fastq_r1.replace('.fastq', '')).replace('.gz', '')+'_splitted_r1.fastq.gz'),
output_filename_r2=_jp(os.path.basename(args.fastq_r1.replace('.fastq', '')).replace('.gz', '')+'_splitted_r2.fastq.gz'),)
files_to_remove+=[args.fastq_r1, args.fastq_r2]
N_READS_INPUT = N_READS_INPUT/2
files_to_remove += [args.fastq_r1, args.fastq_r2]
N_READS_INPUT /= 2

info('Done!', {'percent_complete': 4})

Expand Down
17 changes: 14 additions & 3 deletions CRISPResso2/CRISPRessoPooledCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ def main():
CRISPRessoShared.set_console_log_level(logger, args.verbosity, args.debug)

crispresso_options = CRISPRessoShared.get_crispresso_options()
options_to_ignore = {'fastq_r1', 'fastq_r2', 'amplicon_seq', 'amplicon_name', 'output_folder', 'name', 'zip_output'}
options_to_ignore = {'fastq_r1', 'fastq_r2', 'amplicon_seq', 'amplicon_name', 'output_folder', 'name', 'zip_output', 'split_interleaved_input'}
crispresso_options_for_pooled = list(crispresso_options-options_to_ignore)

files_to_remove = []
Expand Down Expand Up @@ -511,6 +511,15 @@ def main():

info('Processing input', {'percent_complete': 5})

if args.split_interleaved_input:
info('Splitting paired end single fastq file into two files...')
args.fastq_r1, args.fastq_r2 = CRISPRessoShared.split_interleaved_fastq(
args.fastq_r1,
output_filename_r1=_jp('{0}_splitted_r1.fastq.gz'.format(os.path.basename(args.fastq_r1).replace('.fastq', '').replace('.gz', ''))),
output_filename_r2=_jp('{0}_splitted_r2.fastq.gz'.format(os.path.basename(args.fastq_r1).replace('.fastq', '').replace('.gz', ''))),
)
files_to_remove += [args.fastq_r1, args.fastq_r2]

# perform read trimming if necessary
if args.aligned_pooled_bam is not None:
# don't trim reads in aligned bams
Expand Down Expand Up @@ -615,6 +624,8 @@ def main():
N_READS_AFTER_PREPROCESSING = N_READS_INPUT
else:
N_READS_INPUT = get_n_reads_fastq(args.fastq_r1)
if args.split_interleaved_input:
N_READS_INPUT /= 2
N_READS_AFTER_PREPROCESSING = get_n_reads_fastq(processed_output_filename)

crispresso2_info['running_info']['finished_steps']['count_input_reads'] = (N_READS_INPUT, N_READS_AFTER_PREPROCESSING)
Expand Down Expand Up @@ -689,7 +700,7 @@ def main():
raise CRISPRessoShared.BadParameterException('Incorrect number of columns provided without header.')
elif has_header and len(unmatched_headers) > 0:
raise CRISPRessoShared.BadParameterException('Unable to match headers: ' + str(unmatched_headers))

if not has_header:
# Default header
headers = []
Expand Down Expand Up @@ -886,7 +897,7 @@ def main():
failed_batch_arr = []
failed_batch_arr_desc = []
for cmd in crispresso_cmds:

# Extract the folder name from the CRISPResso command
folder_name_regex = re.search(r'-o\s+\S+\s+--name\s+(\S+)', cmd)
if folder_name_regex:
Expand Down
84 changes: 72 additions & 12 deletions CRISPResso2/CRISPRessoShared.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,15 +140,19 @@ def set_console_log_level(logger, level, debug=False):
def getCRISPRessoArgParser(parser_title="CRISPResso Parameters", required_params=[], suppress_params=[]):
parser = argparse.ArgumentParser(description=parser_title, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--version', action='version', version='%(prog)s ' + __version__)
parser.add_argument('-r1', '--fastq_r1', type=str, help='First fastq file', default='',
required='fastq_r1' in required_params)
parser.add_argument('-r2', '--fastq_r2', type=str, help='Second fastq file for paired end reads', default='')
parser.add_argument('-a', '--amplicon_seq', type=str,
help='Amplicon Sequence (can be comma-separated list of multiple sequences)',
required='amplicon_seq' in required_params)
parser.add_argument('-an', '--amplicon_name', type=str,
help='Amplicon Name (can be comma-separated list of multiple names, corresponding to amplicon sequences given in --amplicon_seq',
default='Reference')
if 'fastq_r1' not in suppress_params:
parser.add_argument('-r1', '--fastq_r1', type=str, help='First fastq file', default='',
required='fastq_r1' in required_params)
if 'fastq_r2' not in suppress_params:
parser.add_argument('-r2', '--fastq_r2', type=str, help='Second fastq file for paired end reads', default='')
if 'amplicon_seq' not in suppress_params:
parser.add_argument('-a', '--amplicon_seq', type=str,
help='Amplicon Sequence (can be comma-separated list of multiple sequences)',
required='amplicon_seq' in required_params)
if 'amplicon_name' not in suppress_params:
parser.add_argument('-an', '--amplicon_name', type=str,
help='Amplicon Name (can be comma-separated list of multiple names, corresponding to amplicon sequences given in --amplicon_seq',
default='Reference')
parser.add_argument('-amas', '--amplicon_min_alignment_score', type=str,
help='Amplicon Minimum Alignment Score; score between 0 and 100; sequences must have at least this homology score with the amplicon to be aligned (can be comma-separated list of multiple scores, corresponding to amplicon sequences given in --amplicon_seq)',
default="")
Expand Down Expand Up @@ -199,9 +203,10 @@ def getCRISPRessoArgParser(parser_title="CRISPResso Parameters", required_params
parser.add_argument('-v', '--verbosity', type=int, help='Verbosity level of output to the console (1-4), 4 is the most verbose', default=3)

## read preprocessing params
parser.add_argument('--split_interleaved_input', '--split_paired_end',
help='Splits a single fastq file containing paired end reads into two files before running CRISPResso',
action='store_true')
if 'split_interleaved_input' not in suppress_params:
parser.add_argument('--split_interleaved_input', '--split_paired_end',
help='Splits a single fastq file containing paired end reads into two files before running CRISPResso',
action='store_true')
parser.add_argument('--trim_sequences', help='Enable the trimming of Illumina adapters with Trimmomatic',
action='store_true')
parser.add_argument('--trimmomatic_command', type=str, help='Command to run trimmomatic', default='trimmomatic')
Expand Down Expand Up @@ -1326,6 +1331,61 @@ def force_merge_pairs(r1_filename, r2_filename, output_filename):
return (lineCount)


def split_interleaved_fastq(fastq_filename, output_filename_r1, output_filename_r2):
"""Split an interleaved fastq file into two files, one for each read pair.
This assumes that the input fastq file is interleaved, i.e. that the reads are ordered as follows:
R1
R2
R1
R2
...
And results in two files, one for each read pair:
output_filename_r1
R1
R1
...
output_filename_r2
R2
R2
...
Parameters
----------
fastq_filename : str
Path to the input fastq file.
output_filename_r1 : str
Path to the output fastq file for r1.
output_filename_r2 : str
Path to the output fastq file for r2.
Returns
-------
output_filename_r1 : str
Path to the output fastq file for r1.
output_filename_r2 : str
Path to the output fastq file for r2.
"""
if fastq_filename.endswith('.gz'):
fastq_handle = gzip.open(fastq_filename, 'rt')
else:
fastq_handle = open(fastq_filename)

try:
fastq_splitted_outfile_r1 = gzip.open(output_filename_r1, 'wt')
fastq_splitted_outfile_r2 = gzip.open(output_filename_r2, 'wt')
[fastq_splitted_outfile_r1.write(line) if (i % 8 < 4) else fastq_splitted_outfile_r2.write(line) for i, line in enumerate(fastq_handle)]
except:
raise BadParameterException('Error in splitting read pairs from a single file')
finally:
fastq_handle.close()
fastq_splitted_outfile_r1.close()
fastq_splitted_outfile_r2.close()

return output_filename_r1, output_filename_r2


######
# allele modification functions
######
Expand Down
16 changes: 9 additions & 7 deletions CRISPResso2/CRISPRessoWGSCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,13 +323,15 @@ def print_stacktrace_if_debug():
sys.exit()

parser = CRISPRessoShared.getCRISPRessoArgParser(parser_title = 'CRISPRessoWGS Parameters', required_params=[],
suppress_params=['bam_input',
'bam_chr_loc',
'fastq_r1',
'fastq_r2',
'amplicon_seq',
'amplicon_name',
])
suppress_params=[
'bam_input',
'bam_chr_loc',
'fastq_r1',
'fastq_r2',
'amplicon_seq',
'amplicon_name',
'split_interleaved_input',
])

#tool specific optional
parser.add_argument('-b', '--bam_file', type=str, help='WGS aligned bam file', required=True, default='bam filename' )
Expand Down

0 comments on commit 35c27da

Please sign in to comment.