diff --git a/CRISPResso2/CRISPRessoCORE.py b/CRISPResso2/CRISPRessoCORE.py index ea66e438..6c21ee52 100644 --- a/CRISPResso2/CRISPRessoCORE.py +++ b/CRISPResso2/CRISPRessoCORE.py @@ -138,10 +138,6 @@ def get_avg_read_length_fastq(fastq_filename): p = sb.Popen(cmd, shell=True, stdout=sb.PIPE) return int(p.communicate()[0].strip()) -def get_n_reads_fastq(fastq_filename): - p = sb.Popen(('z' if fastq_filename.endswith('.gz') else '' ) +"cat < \"%s\" | wc -l" % fastq_filename, shell=True, stdout=sb.PIPE) - return int(float(p.communicate()[0])/4.0) - def get_n_reads_bam(bam_filename,bam_chr_loc=""): cmd = "samtools view -c " + bam_filename + " " + bam_chr_loc p = sb.Popen(cmd, shell=True, stdout=sb.PIPE) @@ -2458,7 +2454,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited N_READS_INPUT = 0 if args.fastq_r1: - N_READS_INPUT = get_n_reads_fastq(args.fastq_r1) + N_READS_INPUT = CRISPRessoShared.get_n_reads_fastq(args.fastq_r1) elif args.bam_input: N_READS_INPUT = get_n_reads_bam(args.bam_input, args.bam_chr_loc) @@ -2620,7 +2616,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited if args.bam_input: N_READS_AFTER_PREPROCESSING = N_READS_INPUT else: - N_READS_AFTER_PREPROCESSING=get_n_reads_fastq(processed_output_filename) + N_READS_AFTER_PREPROCESSING=CRISPRessoShared.get_n_reads_fastq(processed_output_filename) if N_READS_AFTER_PREPROCESSING == 0: raise CRISPRessoShared.NoReadsAfterQualityFilteringException('No reads in input or no reads survived the average or single bp quality filtering.') diff --git a/CRISPResso2/CRISPRessoPooledCORE.py b/CRISPResso2/CRISPRessoPooledCORE.py index acd101db..777c5720 100644 --- a/CRISPResso2/CRISPRessoPooledCORE.py +++ b/CRISPResso2/CRISPRessoPooledCORE.py @@ -145,11 +145,6 @@ def get_read_length_from_cigar(cigar_string): result += int(length) return result -def get_n_reads_fastq(fastq_filename): - p = sb.Popen(('z' if fastq_filename.endswith('.gz') else '' ) +"cat < %s | wc -l" % fastq_filename, shell=True, stdout=sb.PIPE) - n_reads = int(float(p.communicate()[0])/4.0) - return n_reads - def get_n_reads_bam(bam_filename): p = sb.Popen("samtools view -c %s" % bam_filename, shell=True, stdout=sb.PIPE) return int(p.communicate()[0]) @@ -629,10 +624,10 @@ def main(): N_READS_INPUT = get_n_reads_bam(args.aligned_pooled_bam) N_READS_AFTER_PREPROCESSING = N_READS_INPUT else: - N_READS_INPUT = get_n_reads_fastq(args.fastq_r1) + N_READS_INPUT = CRISPRessoShared.get_n_reads_fastq(args.fastq_r1) if args.split_interleaved_input: N_READS_INPUT /= 2 - N_READS_AFTER_PREPROCESSING = get_n_reads_fastq(processed_output_filename) + N_READS_AFTER_PREPROCESSING = CRISPRessoShared.get_n_reads_fastq(processed_output_filename) crispresso2_info['running_info']['finished_steps']['count_input_reads'] = (N_READS_INPUT, N_READS_AFTER_PREPROCESSING) CRISPRessoShared.write_crispresso_info( @@ -880,7 +875,7 @@ def main(): n_reads_aligned_amplicons=[] crispresso_cmds = [] for idx, row in df_template.iterrows(): - this_n_reads = get_n_reads_fastq(row['Demultiplexed_fastq.gz_filename']) + this_n_reads = CRISPRessoShared.get_n_reads_fastq(row['Demultiplexed_fastq.gz_filename']) n_reads_aligned_amplicons.append(this_n_reads) info('\n Processing:%s with %d reads'%(idx,this_n_reads)) this_amp_seq = row['amplicon_seq'] @@ -1159,7 +1154,7 @@ def rreplace(s, old, new): END{ \ if (fastq_filename!="NA") {if (num_records < __MIN_READS__){\ record_log_str = record_log_str chr_id"\t"bpstart"\t"bpend"\t"num_records"\tNA\n"} \ - else{printf("%s",fastq_records)>fastq_filename;close(fastq_filename); system("gzip -f "fastq_filename); record_log_str = record_log_str chr_id"\t"bpstart"\t"bpend"\t"num_records"\t"fastq_filename".gz\n"} \ + else{print(fastq_records)>fastq_filename;close(fastq_filename); system("gzip -f "fastq_filename); record_log_str = record_log_str chr_id"\t"bpstart"\t"bpend"\t"num_records"\t"fastq_filename".gz\n"} \ }\ print record_log_str > "__DEMUX_CHR_LOGFILENAME__" \ }' ''' @@ -1579,8 +1574,6 @@ def rreplace(s, old, new): #if many reads weren't aligned, print those out for the user if RUNNING_MODE != 'ONLY_GENOME': - #N_READS_INPUT=get_n_reads_fastq(args.fastq_r1) - #N_READS_AFTER_PREPROCESSING=get_n_reads_fastq(processed_output_filename) tot_reads_aligned = df_summary_quantification['Reads_aligned'].fillna(0).sum() tot_reads = df_summary_quantification['Reads_total'].sum() diff --git a/CRISPResso2/CRISPRessoShared.py b/CRISPResso2/CRISPRessoShared.py index 15d1b33f..2114f8da 100644 --- a/CRISPResso2/CRISPRessoShared.py +++ b/CRISPResso2/CRISPRessoShared.py @@ -492,6 +492,15 @@ def assert_fastq_format(file_path, max_lines_to_check=100): raise InputFileFormatException('File %s is not in fastq format!' % (file_path)) from e +def get_n_reads_fastq(fastq_filename): + if not os.path.exists(fastq_filename) or os.path.getsize(fastq_filename) == 0: + return 0 + + p = sb.Popen(('z' if fastq_filename.endswith('.gz') else '' ) +"cat < %s | grep -c ." % fastq_filename, shell=True, stdout=sb.PIPE) + n_reads = int(float(p.communicate()[0])/4.0) + return n_reads + + def check_output_folder(output_folder): """ Checks to see that the CRISPResso run has completed, and gathers the amplicon info for that run diff --git a/tests/unit_tests/test_CRISPRessoShared.py b/tests/unit_tests/test_CRISPRessoShared.py index 9916f8a2..3d6855d5 100644 --- a/tests/unit_tests/test_CRISPRessoShared.py +++ b/tests/unit_tests/test_CRISPRessoShared.py @@ -1,4 +1,7 @@ from CRISPResso2 import CRISPResso2Align, CRISPRessoShared +import tempfile +import os +import gzip ALN_MATRIX = CRISPResso2Align.read_matrix('./CRISPResso2/EDNAFULL') @@ -28,6 +31,120 @@ def test_get_relative_coordinates(): assert s1inds_gap_right == [0, 1, 2, 3, 4] +def test_get_n_reads_fastq(): + with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f: + f.write("@SEQ_ID\n") + f.write("GATTACA\n") + f.write("+\n") + f.write("AAAAAAA\n") # Ensure the file content is correct and ends with a newline + f.flush() # Flush the content to disk + os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk + + # Since the file needs to be read by another process, we ensure it's closed and written before passing it + assert CRISPRessoShared.get_n_reads_fastq(f.name) == 1 + + # Clean up: delete the file after the test + os.remove(f.name) + +def test_get_n_reads_fastq_gzip(): + with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f: + f.write("@SEQ_ID\n") + f.write("GATTACA\n") + f.write("+\n") + f.write("AAAAAAA\n") # Ensure the file content is correct and ends with a newline + f.flush() # Flush the content to disk + os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk + + # gzip file + with open(f.name, 'rb') as f_in, gzip.open(f.name + '.gz', 'wb') as f_out: + f_out.writelines(f_in) + + # Since the file needs to be read by another process, we ensure it's closed and written before passing it + assert CRISPRessoShared.get_n_reads_fastq(f.name + '.gz') == 1 + + # Clean up: delete the file after the test + os.remove(f.name) + os.remove(f.name + '.gz') + + +def test_get_n_reads_fastq_three_extra_newlines(): + with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f: + f.write("@SEQ_ID\n") + f.write("GATTACA\n") + f.write("+\n") + f.write("AAAAAAA\n") # Ensure the file content is correct and ends with a newline + f.write("\n\n") # Ensure the file content is correct and ends with a newline + f.flush() # Flush the content to disk + os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk + + # Since the file needs to be read by another process, we ensure it's closed and written before passing it + assert CRISPRessoShared.get_n_reads_fastq(f.name) == 1 + + # Clean up: delete the file after the test + os.remove(f.name) + + +def test_get_n_reads_fastq_four_extra_newlines(): + with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f: + f.write("@SEQ_ID\n") + f.write("GATTACA\n") + f.write("+\n") + f.write("AAAAAAA\n") # Ensure the file content is correct and ends with a newline + f.write("\n\n\n\n\n\n\n\n") # Ensure the file content is correct and ends with a newline + f.flush() # Flush the content to disk + os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk + + # Since the file needs to be read by another process, we ensure it's closed and written before passing it + assert CRISPRessoShared.get_n_reads_fastq(f.name) == 1 + + # Clean up: delete the file after the test + os.remove(f.name) + + +def test_get_n_reads_fastq_100_reads(): + with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f: + for i in range(100): + f.write("@SEQ_ID\n") + f.write("GATTACA\n") + f.write("+\n") + f.write("AAAAAAA\n") + f.flush() # Flush the content to disk + os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk + + # Since the file needs to be read by another process, we ensure it's closed and written before passing it + assert CRISPRessoShared.get_n_reads_fastq(f.name) == 100 + + # Clean up: delete the file after the test + os.remove(f.name) + +def test_get_n_reads_fastq_no_newline(): + with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f: + f.write("@SEQ_ID\n") + f.write("GATTACA\n") + f.write("+\n") + f.write("AAAAAAA") # Ensure the file content is correct and ends with a newline + f.flush() # Flush the content to disk + os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk + + # Since the file needs to be read by another process, we ensure it's closed and written before passing it + assert CRISPRessoShared.get_n_reads_fastq(f.name) == 1 + + # Clean up: delete the file after the test + os.remove(f.name) + + +def test_get_n_reads_fastq_empty_file(): + with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f: + f.flush() # Flush the content to disk + os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk + + # Since the file needs to be read by another process, we ensure it's closed and written before passing it + assert CRISPRessoShared.get_n_reads_fastq(f.name) == 0 + + # Clean up: delete the file after the test + os.remove(f.name) + + def test_get_relative_coordinates_to_gap(): # unaligned sequences seq_1 = 'TTCGT' @@ -98,3 +215,4 @@ def test_get_quant_window_ranges_from_include_idxs_single_gap(): def test_get_quant_window_ranges_from_include_idxs_multiple_gaps(): include_idxs = [50, 51, 52, 53, 55, 56, 57, 58, 60] assert CRISPRessoShared.get_quant_window_ranges_from_include_idxs(include_idxs) == [(50, 53), (55, 58), (60, 60)] +