diff --git a/bin/qualimap_bamqc_genome_results_to_csv.py b/bin/qualimap_bamqc_genome_results_to_csv.py index 51a3e48..caee394 100755 --- a/bin/qualimap_bamqc_genome_results_to_csv.py +++ b/bin/qualimap_bamqc_genome_results_to_csv.py @@ -26,58 +26,72 @@ def parse_qualimap_bamqc_genome_results(qualimap_bamqc_genome_results): num_mapped_reads = line.split('=')[1].strip().split(' ')[0].replace(',', '') qualimap_bamqc_genome_results_data['num_mapped_reads'] = int(num_mapped_reads) percent_mapped_reads = line.split('=')[1].strip().split(' ')[1].strip().replace('(', '').replace(')', '').replace('%', '') - qualimap_bamqc_genome_results_data['percent_mapped_reads'] = round(float(percent_mapped_reads), 2) + qualimap_bamqc_genome_results_data['percent_mapped_reads'] = round(float(percent_mapped_reads), 6) + if line.startswith('number of supplementary alignments'): + num_supplementary_alignments = int(line.split('=')[1].strip().split()[0].replace(',', '')) + qualimap_bamqc_genome_results_data['num_supplementary_alignments'] = num_supplementary_alignments + percent_supplementary_alignments = line.split('=')[1].strip().split(' ')[1].strip().replace('(', '').replace(')', '').replace('%', '') + qualimap_bamqc_genome_results_data['percent_supplementary_alignments'] = round(float(percent_supplementary_alignments), 6) if line.startswith('number of secondary alignments'): num_secondary_alignments = int(line.split('=')[1].strip().replace(',', '')) qualimap_bamqc_genome_results_data['num_secondary_alignments'] = num_secondary_alignments - if line.startswith('duplication rate'): - duplication_rate = line.split('=')[1].strip().replace('%', '') - qualimap_bamqc_genome_results_data['duplication_rate_percent'] = round(float(duplication_rate), 2) + if line.startswith('number of mapped bases'): + num_mapped_bases = line.split('=')[1].strip().split()[0].replace(',', '') + qualimap_bamqc_genome_results_data['num_mapped_bases'] = int(num_mapped_bases) + if line.startswith('number of sequenced bases'): + num_sequenced_bases = line.split('=')[1].strip().split()[0].replace(',', '') + qualimap_bamqc_genome_results_data['num_sequenced_bases'] = int(num_sequenced_bases) + if line.startswith('number of duplicated reads'): + num_duplicated_reads = line.split('=')[1].strip().replace(',', '') + qualimap_bamqc_genome_results_data['num_duplicated_reads'] = int(num_duplicated_reads) + num_mapped_reads = qualimap_bamqc_genome_results_data['num_mapped_reads'] + duplication_rate_percent = (int(num_duplicated_reads) / int(num_mapped_reads)) * 100 + qualimap_bamqc_genome_results_data['duplication_rate_percent'] = round(duplication_rate_percent, 6) if line.startswith('mean coverageData'): mean_coverage = line.split('=')[1].strip().strip('X').replace(',', '') - qualimap_bamqc_genome_results_data['mean_depth_coverage'] = round(float(mean_coverage), 2) + qualimap_bamqc_genome_results_data['mean_depth_coverage'] = round(float(mean_coverage), 6) if line.startswith('std coverageData'): stdev_coverage = line.split('=')[1].strip().strip('X').replace(',', '') - qualimap_bamqc_genome_results_data['stdev_depth_coverage'] = round(float(stdev_coverage), 2) + qualimap_bamqc_genome_results_data['stdev_depth_coverage'] = round(float(stdev_coverage), 6) if line.startswith('mean mapping quality'): mean_mapping_quality = line.split('=')[1].strip() - qualimap_bamqc_genome_results_data['mean_mapping_quality'] = round(float(mean_mapping_quality), 2) + qualimap_bamqc_genome_results_data['mean_mapping_quality'] = round(float(mean_mapping_quality), 6) if line.startswith('general error rate'): general_error_rate = line.split('=')[1].strip() - qualimap_bamqc_genome_results_data['error_rate'] = round(float(general_error_rate), 2) + qualimap_bamqc_genome_results_data['error_rate'] = round(float(general_error_rate), 6) if line.startswith('number of mismatches'): number_of_mismatches = line.split('=')[1].strip().replace(',', '') - qualimap_bamqc_genome_results_data['number_of_mismatches'] = int(number_of_mismatches) + qualimap_bamqc_genome_results_data['num_mismatches'] = int(number_of_mismatches) if line.startswith('number of insertions'): number_of_insertions = line.split('=')[1].strip().replace(',', '') - qualimap_bamqc_genome_results_data['number_of_insertions'] = int(number_of_insertions) + qualimap_bamqc_genome_results_data['num_insertions'] = int(number_of_insertions) if line.startswith('mapped reads with insertion percentage'): mapped_reads_with_insertion_percentage = line.split('=')[1].strip().replace('%', '') - qualimap_bamqc_genome_results_data['mapped_reads_with_insertion_percentage'] = round(float(mapped_reads_with_insertion_percentage), 2) + qualimap_bamqc_genome_results_data['mapped_reads_with_insertion_percent'] = round(float(mapped_reads_with_insertion_percentage), 6) if line.startswith('number of deletions'): number_of_deletions = line.split('=')[1].strip().replace(',', '') - qualimap_bamqc_genome_results_data['number_of_deletions'] = int(number_of_deletions) + qualimap_bamqc_genome_results_data['num_deletions'] = int(number_of_deletions) if line.startswith('mapped reads with deletion percentage'): mapped_reads_with_deletion_percentage = line.split('=')[1].strip().replace('%', '') - qualimap_bamqc_genome_results_data['mapped_reads_with_deletion_percentage'] = round(float(mapped_reads_with_deletion_percentage), 2) + qualimap_bamqc_genome_results_data['mapped_reads_with_deletion_percent'] = round(float(mapped_reads_with_deletion_percentage), 6) if 'reference with a coverageData >= 5X' in line: proportion_genome_covered_over_5x = float(line.split(' ')[3].strip('%')) / 100 - qualimap_bamqc_genome_results_data['proportion_genome_covered_over_5x'] = round(proportion_genome_covered_over_5x, 4) + qualimap_bamqc_genome_results_data['proportion_genome_covered_over_5x'] = round(proportion_genome_covered_over_5x, 6) if 'reference with a coverageData >= 10X' in line: proportion_genome_covered_over_10x = float(line.split(' ')[3].strip('%')) / 100 - qualimap_bamqc_genome_results_data['proportion_genome_covered_over_10x'] = round(proportion_genome_covered_over_10x, 4) + qualimap_bamqc_genome_results_data['proportion_genome_covered_over_10x'] = round(proportion_genome_covered_over_10x, 6) if 'reference with a coverageData >= 20X' in line: proportion_genome_covered_over_20x = float(line.split(' ')[3].strip('%')) / 100 - qualimap_bamqc_genome_results_data['proportion_genome_covered_over_20x'] = round(proportion_genome_covered_over_20x, 4) + qualimap_bamqc_genome_results_data['proportion_genome_covered_over_20x'] = round(proportion_genome_covered_over_20x, 6) if 'reference with a coverageData >= 30X' in line: proportion_genome_covered_over_30x = float(line.split(' ')[3].strip('%')) / 100 - qualimap_bamqc_genome_results_data['proportion_genome_covered_over_30x'] = round(proportion_genome_covered_over_30x, 4) + qualimap_bamqc_genome_results_data['proportion_genome_covered_over_30x'] = round(proportion_genome_covered_over_30x, 6) if 'reference with a coverageData >= 40X' in line: proportion_genome_covered_over_40x = float(line.split(' ')[3].strip('%')) / 100 - qualimap_bamqc_genome_results_data['proportion_genome_covered_over_40x'] = round(proportion_genome_covered_over_40x, 4) + qualimap_bamqc_genome_results_data['proportion_genome_covered_over_40x'] = round(proportion_genome_covered_over_40x, 6) if 'reference with a coverageData >= 50X' in line: proportion_genome_covered_over_50x = float(line.split(' ')[3].strip('%')) / 100 - qualimap_bamqc_genome_results_data['proportion_genome_covered_over_50x'] = round(proportion_genome_covered_over_50x, 4) + qualimap_bamqc_genome_results_data['proportion_genome_covered_over_50x'] = round(proportion_genome_covered_over_50x, 6) return qualimap_bamqc_genome_results_data @@ -91,13 +105,18 @@ def main(args): 'num_mapped_reads', 'percent_mapped_reads', 'mean_mapping_quality', + 'num_sequenced_bases', + 'num_mapped_bases', + 'num_mismatches', + 'num_insertions', + 'num_deletions', 'error_rate', - 'number_of_mismatches', - 'number_of_insertions', - 'mapped_reads_with_insertion_percentage', - 'number_of_deletions', - 'mapped_reads_with_deletion_percentage', + 'mapped_reads_with_insertion_percent', + 'mapped_reads_with_deletion_percent', 'num_secondary_alignments', + 'num_supplementary_alignments', + 'percent_supplementary_alignments', + 'num_duplicated_reads', 'duplication_rate_percent', 'proportion_genome_covered_over_5x', 'proportion_genome_covered_over_10x', diff --git a/bin/qualimap_bamqc_genome_results_to_json.py b/bin/qualimap_bamqc_genome_results_to_json.py new file mode 100755 index 0000000..0d45464 --- /dev/null +++ b/bin/qualimap_bamqc_genome_results_to_json.py @@ -0,0 +1,340 @@ +#!/usr/bin/env python + +import argparse +import json +import os +import sys + + +def get_section_lines(report_path, section_name): + """ + """ + section_lines = [] + + with open(report_path, 'r') as f: + # skip lines until we reach the input section + # indicated by the line starting with '>>>>>>> Input' + for line in f: + if line.startswith(f'>>>>>>> {section_name}'): + break + for line in f: + if line.startswith('>>>>>>>'): + break + if line.strip() != '': + section_lines.append(line.strip()) + + return section_lines + + +def parse_input_section(report_path): + """ + """ + section_lines = get_section_lines(report_path, 'Input') + input_section = {} + for line in section_lines: + key, value = line.split('=') + key = key.strip().replace(' ', '_').lower() + value = value.strip() + input_section[key] = value + + return input_section + + +def parse_reference_section(report_path): + """ + """ + section_lines = get_section_lines(report_path, 'Reference') + reference_section = {} + keys_with_units = [ + 'number_of_bases', + ] + int_keys = [ + 'number_of_bases', + 'number_of_contigs', + ] + for line in section_lines: + key, value = line.split('=') + key = key.strip().replace(' ', '_').lower() + value = value.strip() + if key in keys_with_units: + value, unit = value.split() + if key in int_keys: + value = value.replace(',', '') + value = int(value) + reference_section[key] = value + + return reference_section + + +def parse_int_with_percentage(value): + """ + Convert a string like "20,096,173 (99.89%)" to a tuple (20096173, 99.89) + """ + value, percentage = value.split() + value = value.replace(',', '') + value = int(value) + percentage = percentage.strip('()').replace('%', '') + percentage = float(percentage) + + return value, percentage + +def parse_globals_section(report_path): + """ + """ + section_lines = get_section_lines(report_path, 'Globals') + globals_section = {} + int_keys = [ + 'number_of_windows', + 'number_of_reads', + 'number_of_secondary_alignments', + 'number_of_mapped_paired_reads_first_in_pair', + 'number_of_mapped_paired_reads_second_in_pair', + 'number_of_mapped_paired_reads_both_in_pair', + 'number_of_mapped_paired_reads_singletons', + 'number_of_overlapping_read_pairs', + 'number_of_duplicated_reads_flagged', + ] + int_with_unit_keys = [ + 'number_of_mapped_bases', + 'number_of_sequenced_bases', + 'number_of_aligned_bases', + ] + int_with_percentage_keys = [ + 'number_of_mapped_reads', + 'number_of_supplementary_alignments', + ] + for line in section_lines: + key, value = line.split('=') + key = key.strip().replace(' ', '_').replace('(', '').replace(')', '').lower() + value = value.strip() + if key in int_keys: + value = value.replace(',', '') + value = int(value) + elif key in int_with_percentage_keys: + value, percentage = parse_int_with_percentage(value) + percentage_key = key.replace('number_of_', 'percent_') + globals_section[percentage_key] = percentage + elif key in int_with_unit_keys: + value, unit = value.split() + value = value.replace(',', '') + value = int(value) + globals_section[key] = value + + return globals_section + + +def parse_insert_size_section(report_path): + """ + """ + insert_size_section = {} + section_lines = get_section_lines(report_path, 'Insert size') + float_keys = [ + 'mean_insert_size', + 'std_insert_size', + ] + int_keys = [ + 'median_insert_size', + ] + for line in section_lines: + key, value = line.split('=') + key = key.strip().replace(' ', '_').lower() + value = value.strip() + if key in float_keys: + value = value.replace(',', '') + value = float(value) + elif key in int_keys: + value = value.replace(',', '') + value = int(value) + insert_size_section[key] = value + + return insert_size_section + + +def parse_actg_content_section(report_path): + """ + """ + actg_content_section = {} + section_lines = get_section_lines(report_path, 'ACTG content') + int_with_units_and_percentage_keys = [ + 'number_of_a_bases', + 'number_of_c_bases', + 'number_of_g_bases', + 'number_of_t_bases', + 'number_of_n_bases', + ] + percentage_keys = [ + 'gc_percentage', + ] + for line in section_lines: + key, value = line.split('=') + key = key.strip().replace(' ', '_').replace("'", "_base").lower() + value = value.strip() + if key in int_with_units_and_percentage_keys: + value_split = value.split() + value_without_units = ' '.join([value_split[0], value_split[2]]) + value, percentage = parse_int_with_percentage(value_without_units) + percentage_key = key.replace('number_of_', 'percent_') + actg_content_section[percentage_key] = percentage + elif key in percentage_keys: + value = value.replace('%', '') + value = float(value) + actg_content_section[key] = value + + return actg_content_section + + +def parse_mismatches_and_indels_section(report_path): + """ + """ + mismatches_and_indels_section = {} + section_lines = get_section_lines(report_path, 'Mismatches and indels') + int_keys = [ + 'number_of_mismatches', + 'number_of_insertions', + 'number_of_deletions', + ] + float_keys = [ + 'general_error_rate', + ] + percentage_keys = [ + 'mapped_reads_with_insertion_percentage', + 'mapped_reads_with_deletion_percentage', + 'homopolymer_indels', + ] + for line in section_lines: + key, value = line.split('=') + key = key.strip().replace(' ', '_').lower() + value = value.strip() + if key in int_keys: + value = value.replace(',', '') + value = int(value) + elif key in float_keys: + value = float(value) + elif key in percentage_keys: + value = value.replace('%', '') + value = float(value) + if key == 'homopolymer_indels': + key = 'homopolymer_indels_percentage' + mismatches_and_indels_section[key] = value + + return mismatches_and_indels_section + +def parse_percent_coverage_line(line): + """ + Convert lines that look like: + + There is a 100% of reference with a coverageData >= 1X + + ...into a tuple (1, 100) + """ + line_split = line.split() + coverage = line_split[-1] + percentage = line_split[3] + coverage = int(coverage.strip('X')) + percentage = float(percentage.strip('%')) + + return coverage, percentage + +def parse_coverage_section(report_path): + """ + """ + coverage_section = { + 'coverage_profile': [], + } + section_lines = get_section_lines(report_path, 'Coverage') + float_x_keys = [ + 'mean_coverage', + 'std_coverage', + 'paired-end_adapted_mean_coverage', + ] + for line in section_lines: + if line.startswith('There is a'): + coverage, value = parse_percent_coverage_line(line) + coverage_profile_record = { + 'depth_threshold': coverage, + 'percent_reference_covered': value, + } + coverage_section['coverage_profile'].append(coverage_profile_record) + else: + key, value = line.split('=') + key = key.strip().replace(' ', '_').replace('coverageData', 'coverage').lower() + value = value.strip() + if key in float_x_keys: + value = value.replace('X', '') + value = float(value) + coverage_section[key] = value + + return coverage_section + + +def parse_coverage_per_contig_section(report_path): + """ + """ + coverage_per_contig_section = [] + section_lines = get_section_lines(report_path, 'Coverage per contig') + section_headers = [ + 'name', + 'length', + 'mapped_bases', + 'mean_coverage', + 'standard_deviation', + ] + int_keys = [ + 'length', + 'mapped_bases', + ] + float_keys = [ + 'mean_coverage', + 'standard_deviation', + ] + for line in section_lines: + line_split = line.split('\t') + contig_data = {} + for i, header in enumerate(section_headers): + value = line_split[i] + if header in int_keys: + value = int(value) + elif header in float_keys: + value = float(value) + contig_data[header] = value + coverage_per_contig_section.append(contig_data) + + return coverage_per_contig_section + + +def main(args): + + parsed_genome_results = {} + input_section = parse_input_section(args.input) + parsed_genome_results['input'] = input_section + + reference_section = parse_reference_section(args.input) + parsed_genome_results['reference'] = reference_section + + globals_section = parse_globals_section(args.input) + parsed_genome_results['globals'] = globals_section + + insert_size_section = parse_insert_size_section(args.input) + parsed_genome_results['insert_size'] = insert_size_section + + actg_content_section = parse_actg_content_section(args.input) + parsed_genome_results['actg_content'] = actg_content_section + + mismatches_and_indels_section = parse_mismatches_and_indels_section(args.input) + parsed_genome_results['mismatches_and_indels'] = mismatches_and_indels_section + + coverage_section = parse_coverage_section(args.input) + parsed_genome_results['coverage'] = coverage_section + + coverage_per_contig_section = parse_coverage_per_contig_section(args.input) + parsed_genome_results['coverage_per_contig'] = coverage_per_contig_section + + print(json.dumps(parsed_genome_results, indent=4)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Description of your script') + parser.add_argument('--input', type=str, required=True, help='Path to the input file') + args = parser.parse_args() + main(args) + diff --git a/main.nf b/main.nf index 159d4e5..0666747 100644 --- a/main.nf +++ b/main.nf @@ -80,7 +80,7 @@ workflow { merge_nanoq_reports(nanoq_pre_filter.out.report.join(nanoq_post_filter.out.report)) - if (! params.align_unfiltered_reads) { + if (! params.align_untrimmed_reads) { ch_nanopore_reads_to_align = filtlong.out.filtered_reads } else { ch_nanopore_reads_to_align = ch_nanopore_fastq diff --git a/modules/alignment_variants.nf b/modules/alignment_variants.nf index 43b5f30..30b02ea 100644 --- a/modules/alignment_variants.nf +++ b/modules/alignment_variants.nf @@ -142,7 +142,7 @@ process qualimap_bamqc { tag { sample_id + ' / ' + short_long } publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_${short_long}_qualimap_alignment_qc.csv" - publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_${short_long}_qualimap_genome_results.txt" + publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_${short_long}_qualimap_genome_results.{txt,json}" publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_${short_long}_qualimap_report.pdf" input: @@ -152,6 +152,7 @@ process qualimap_bamqc { tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_alignment_qc.csv"), emit: alignment_qc tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_report.pdf"), emit: report, optional: true tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_genome_results.txt"), emit: genome_results, optional: true + tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_genome_results.json"), emit: genome_results_json, optional: true tuple val(sample_id), path("${sample_id}_${short_long}_qualimap_bamqc_provenance.yml"), emit: provenance script: @@ -192,7 +193,13 @@ process qualimap_bamqc { --read-type ${short_long} \ --failed \ > ${sample_id}_${short_long}_qualimap_alignment_qc.csv + + echo '{}' > ${sample_id}_${short_long}_qualimap_genome_results.json else + qualimap_bamqc_genome_results_to_json.py \ + --input ${sample_id}_${short_long}_bamqc/genome_results.txt \ + > ${sample_id}_${short_long}_qualimap_genome_results.json + qualimap_bamqc_genome_results_to_csv.py \ -s ${sample_id} \ --read-type ${short_long} \