diff --git a/README.md b/README.md index a08fd86..0abb30a 100644 --- a/README.md +++ b/README.md @@ -262,3 +262,5 @@ Hardware|Default settings # 8. Credits The key steps of poreCov are carried out using the [ARTIC Network field bioinformatics pipeline](https://github.com/artic-network/fieldbioinformatics). Kudos to all amazing developers for your incredible efforts during this pandemic! Many thanks to all others who have helped out and contributed to poreCov as well. + +The script [convert_VCF_info_fields.py](bin/convert_VCF_info_fields.py) was originally developed by the Intergalactic Utilities Commission in the [Tool Shed repository](https://github.com/galaxyproject/tools-iuc) under a MIT license. \ No newline at end of file diff --git a/bin/convert_VCF_info_fields.py b/bin/convert_VCF_info_fields.py new file mode 100755 index 0000000..19cef64 --- /dev/null +++ b/bin/convert_VCF_info_fields.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 + +# ----------------------------------------------------------------------------- +# The MIT License (MIT) + +# Copyright (c) 2014 galaxy-iuc + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ----------------------------------------------------------------------------- + +# Takes in VCF file annotated with medaka tools annotate and converts +# +# Usage statement: +# python convert_VCF_info_fields.py in_vcf.vcf out_vcf.vcf + +# 10/21/2020 - Nathan P. Roach, natproach@gmail.com + +# adapted June 2024 - Marie Lataretu, lataretum@rki.de +# source for this scripts: +# https://github.com/galaxyproject/tools-iuc/blob/fd148a124034b44d0d61db3eec32ff991d8c152c/tools/medaka/convert_VCF_info_fields.py + +import re +import sys +from collections import OrderedDict +from math import log10 + +import scipy.stats + + +def pval_to_phredqual(pval): + try: + ret = round(-10 * log10(pval)) + except ValueError: + ret = 2147483647 # transform pval of 0.0 to max signed 32 bit int + return ret + + +def parseInfoField(info): + info_fields = info.split(";") + info_dict = OrderedDict() + for info_field in info_fields: + code, val = info_field.split("=") + info_dict[code] = val + return info_dict + + +def annotateVCF(in_vcf_filepath, out_vcf_filepath): + """Postprocess output of medaka tools annotate. + + Splits multiallelic sites into separate records. + Replaces medaka INFO fields that might represent information of the ref + and multiple alternate alleles with simple ref, alt allele counterparts. + """ + + in_vcf = open(in_vcf_filepath, "r") + # medaka INFO fields that do not make sense after splitting of + # multi-allelic records + # DP will be overwritten with the value of DPSP because medaka tools + # annotate currently only calculates the latter correctly + # (https://github.com/nanoporetech/medaka/issues/192). + # DPS, which is as unreliable as DP, gets skipped and the code + # calculates the spanning reads equivalent DPSPS instead. + to_skip = {"SC", "SR", "AR", "DP", "DPSP", "DPS"} + struct_meta_pat = re.compile("##(.+)=") + header_lines = [] + contig_ids = set() + contig_ids_simple = set() + # parse the metadata lines of the input VCF and drop: + # - duplicate lines + # - INFO lines declaring keys we are not going to write + # - redundant contig information + while True: + line = in_vcf.readline() + if line[:2] != "##": + assert line.startswith("#CHROM") + break + if line in header_lines: + # the annotate tool may generate lines already written by + # medaka variant again (example: medaka version line) + continue + match = struct_meta_pat.match(line) + if match: + match_type, match_id, match_misc = match.groups() + if match_type == "INFO": + if match_id == "DPSP": + line = line.replace("DPSP", "DP") + elif match_id in to_skip: + continue + elif match_type == "contig": + contig_ids.add(match_id) + if not match_misc: + # the annotate tools writes its own contig info, + # which is redundant with contig info generated by + # medaka variant, but lacks a length value. + # We don't need the incomplete line. + contig_ids_simple.add(match_id) + continue + header_lines.append(line) + # Lets check the above assumption about each ID-only contig line + # having a more complete counterpart. + assert not (contig_ids_simple - contig_ids) + header_lines.insert(1, "##convert_VCF_info_fields=0.2\n") + header_lines += [ + '##INFO=\n', + '##INFO=\n', + '##INFO=\n', + '##INFO=\n', + '##INFO=\n', + '##INFO=\n', + '##INFO=\n', + line, + ] + + with open(out_vcf_filepath, "w") as out_vcf: + out_vcf.writelines(header_lines) + for line in in_vcf: + fields = line.split("\t") + info_dict = parseInfoField(fields[7]) + sr_list = [int(x) for x in info_dict["SR"].split(",")] + sc_list = [int(x) for x in info_dict["SC"].split(",")] + if len(sr_list) != len(sc_list): + print("WARNING - SR and SC are different lengths, " "skipping variant") + print(line.strip()) # Print the line for debugging purposes + continue + variant_list = fields[4].split(",") + dpsp = int(info_dict["DPSP"]) + ref_fwd, ref_rev = 0, 1 + dpspf, dpspr = (int(x) for x in info_dict["AR"].split(",")) + for i in range(0, len(sr_list), 2): + dpspf += sr_list[i] + dpspr += sr_list[i + 1] + for j, i in enumerate(range(2, len(sr_list), 2)): + dp4 = (sr_list[ref_fwd], sr_list[ref_rev], sr_list[i], sr_list[i + 1]) + dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]] + _, p_val = scipy.stats.fisher_exact(dp2x2) + sb = pval_to_phredqual(p_val) + + as_ = (sc_list[ref_fwd], sc_list[ref_rev], sc_list[i], sc_list[i + 1]) + + info = [] + for code in info_dict: + if code in to_skip: + continue + val = info_dict[code] + info.append("%s=%s" % (code, val)) + + info.append("DP=%d" % dpsp) + info.append("DPSPS=%d,%d" % (dpspf, dpspr)) + + if dpsp == 0: + info.append("AF=.") + else: + af = (dp4[2] + dp4[3]) / dpsp + info.append("AF=%.6f" % af) + if dpspf == 0: + info.append("FAF=.") + else: + faf = dp4[2] / dpspf + info.append("FAF=%.6f" % faf) + if dpspr == 0: + info.append("RAF=.") + else: + raf = dp4[3] / dpspr + info.append("RAF=%.6f" % raf) + info.append("SB=%d" % sb) + info.append("DP4=%d,%d,%d,%d" % dp4) + info.append("AS=%d,%d,%d,%d" % as_) + new_info = ";".join(info) + fields[4] = variant_list[j] + fields[7] = new_info + out_vcf.write("\t".join(fields)) + in_vcf.close() + + +if __name__ == "__main__": + annotateVCF(sys.argv[1], sys.argv[2]) \ No newline at end of file diff --git a/bin/summary_report.py b/bin/summary_report.py index c0884f4..ab4a247 100755 --- a/bin/summary_report.py +++ b/bin/summary_report.py @@ -640,6 +640,9 @@ def unclass_markup(value): self.add_col_description(f'Read classification was determined against a database containing only SARS-CoV-2 and human with Kraken2 (v{self.tool_versions["kraken2"]}).') + def add_mixed_sites_results(self, mixed_site_results): + res_data = pd.read_csv(mixed_site_results, index_col='sample', dtype={'sample': str}) + self.add_column_raw('num_mixed_sites', res_data['num_mixed_sites']) def add_coverage_plots(self, coverage_plots): for coverage_plot in sorted(coverage_plots.split(',')): @@ -826,6 +829,7 @@ def get_lineage_status(self, lineage): parser.add_argument("-n", "--nextclade_results", help="nextclade results") parser.add_argument("-q", "--president_results", help="president results") parser.add_argument("-k", "--kraken2_results", help="kraken2 results") + parser.add_argument("-m", "--mixed_sites_results", help="mixed sites statisics") parser.add_argument("-c", "--coverage_plots", help="coverage plots (comma separated)") parser.add_argument("-s", "--samples", help="sample ids (comma separated)") args = parser.parse_args() @@ -852,6 +856,8 @@ def get_lineage_status(self, lineage): report.add_pangolin_results(args.pangolin_results) if args.nextclade_results: report.add_nextclade_results(args.nextclade_results) + if args.mixed_sites_results: + report.add_mixed_sites_results(args.mixed_sites_results) if args.coverage_plots: report.add_coverage_plots(args.coverage_plots) diff --git a/modules/add_alt_allele_ratio_vcf.nf b/modules/add_alt_allele_ratio_vcf.nf new file mode 100644 index 0000000..396f301 --- /dev/null +++ b/modules/add_alt_allele_ratio_vcf.nf @@ -0,0 +1,46 @@ +process add_alt_allele_ratio_vcf { + label 'artic' + publishDir "${params.output}/${params.lineagedir}/${name}/", mode: 'copy' + input: + tuple val(name), path(bam), path(bai), path(vcf), path(failed_vcf) + path(external_scheme) // primer scheme dir as input + output: + tuple val(name), path("${name}_all-vars-with-aar.vcf"), emit: vcf + tuple val(name), path("mixed_sites_stats.csv"), emit: stats + script: + primer_version_tag = params.primerV.toString().contains(".bed") ? 'V_custom' : "${params.primerV}" + primer_dir = params.primerV.toString().contains(".bed") ? "${external_scheme}" : "${external_scheme}/nCoV-2019" + """ + # reheader failed VCF and change FILTER + echo '##FILTER=' > add-to-hdr.txt + # -c and --rename-annots add a header with a default/wrong description + bcftools annotate -h add-to-hdr.txt -c "FILTER/ARTICFAIL:=FILTER/PASS" ${failed_vcf} | sed '/##FILTER=/d' > tmp_failed_updated-filter.vcf + + # concat failed and passed VCF + bcftools concat ${vcf} tmp_failed_updated-filter.vcf | bcftools sort -o tmp_merged.vcf + + # call medaka tools annotate for each pool and add the alternate allele ratio + for pool in `cut -f5 ${primer_dir}/${primer_version_tag}/nCoV-2019.scheme.bed | sort | uniq`; do + bcftools view -i 'INFO/Pool="'\$pool'"' tmp_merged.vcf -o tmp_\$pool.vcf + medaka tools annotate --dpsp --pad 25 --RG \$pool tmp_\$pool.vcf ${primer_dir}/${primer_version_tag}/nCoV-2019.reference.fasta ${bam} tmp_annotated_\$pool.vcf + convert_VCF_info_fields.py tmp_annotated_\$pool.vcf tmp_aar_\$pool.vcf + done + + # merge pool VCF and sort VCF + bcftools concat tmp_aar_*.vcf | bcftools sort -o ${name}_all-vars-with-aar.vcf + + # count mixed sites + # thresholds for human geotyping: 0.35 <= x <= 0.65 + NUM_MIXED_SITES=\$(bcftools view -H -i 'INFO/DP>${params.min_depth} & INFO/AF>=0.3 & INFO/AF<=0.8' ${name}_all-vars-with-aar.vcf | wc -l) + echo sample,num_mixed_sites > mixed_sites_stats.csv + echo ${name},\$NUM_MIXED_SITES >> mixed_sites_stats.csv + + # remove intermediate files + rm tmp_* + """ + stub: + """ + touch ${name}_all-vars-with-aar.vcf mixed_sites_stats.csv + """ + +} diff --git a/poreCov.nf b/poreCov.nf index 0cf7d5e..1b91e30 100755 --- a/poreCov.nf +++ b/poreCov.nf @@ -309,6 +309,7 @@ include { get_fasta } from './modules/get_fasta_test_data.nf' include { align_to_reference } from './modules/align_to_reference.nf' include { split_fasta } from './modules/split_fasta.nf' include { filter_fastq_by_length } from './modules/filter_fastq_by_length.nf' +include { add_alt_allele_ratio_vcf } from './modules/add_alt_allele_ratio_vcf.nf' /************************** * Workflows @@ -360,7 +361,16 @@ workflow { else if (!params.nanopolish) { artic_ncov_wf(filtered_reads_ch, params.artic_normalize) fasta_input_ch = artic_ncov_wf.out.assembly + + // add alternative allele ratio to the VCF + if (params.primerV.toString().contains(".bed")) { + external_primer_schemes = artic_ncov_wf.out.primer_dir + } + else { + external_primer_schemes = file(workflow.projectDir + "/data/external_primer_schemes", checkIfExists: true, type: 'dir' ) } + add_alt_allele_ratio_vcf(artic_ncov_wf.out.trimmed_bam.join(artic_ncov_wf.out.vcf).join(artic_ncov_wf.out.failed_vcf), external_primer_schemes) + } } // fastq input via dir and or files if ( (params.fastq || params.fastq_pass) || workflow.profile.contains('test_fastq')) { @@ -392,7 +402,16 @@ workflow { else if (!params.nanopolish) { artic_ncov_wf(filtered_reads_ch, params.artic_normalize) fasta_input_ch = artic_ncov_wf.out.assembly + + // add alternative allele ratio to the VCF + if (params.primerV.toString().contains(".bed")) { + external_primer_schemes = artic_ncov_wf.out.primer_dir + } + else { + external_primer_schemes = file(workflow.projectDir + "/data/external_primer_schemes", checkIfExists: true, type: 'dir' ) } + add_alt_allele_ratio_vcf(artic_ncov_wf.out.trimmed_bam.join(artic_ncov_wf.out.vcf).join(artic_ncov_wf.out.failed_vcf), external_primer_schemes) + } } // 2. Genome quality, lineages, clades and mutations @@ -416,6 +435,7 @@ workflow { if (params.fasta || workflow.profile.contains('test_fasta')) { taxonomic_read_classification_ch = Channel.from( ['deactivated', 'deactivated', 'deactivated'] ).collect() alignments_ch = Channel.from( ['deactivated'] ) + } else { taxonomic_read_classification_ch = read_classification_wf.out.kraken if (params.screen_reads) { @@ -428,6 +448,11 @@ workflow { } alignments_ch = align_to_reference(filtered_reads_ch.combine(reference_for_qc_input_ch)) } + if (params.fasta || workflow.profile.contains('test_fasta') || params.nanopolish ) { + alt_allele_ratio_ch = Channel.from( ['deactivated'] ) + } else { + alt_allele_ratio_ch = add_alt_allele_ratio_vcf.out.stats + } /* if (params.samples) { @@ -436,7 +461,7 @@ workflow { else { samples_table_ch = Channel.from( ['deactivated'] ) } */ create_summary_report_wf(determine_lineage_wf.out, genome_quality_wf.out[0], determine_mutations_wf.out, - taxonomic_read_classification_ch, alignments_ch, samples_file_ch) + taxonomic_read_classification_ch, alt_allele_ratio_ch, alignments_ch, samples_file_ch) } diff --git a/workflows/artic_nanopore_nCov19.nf b/workflows/artic_nanopore_nCov19.nf index 8c5feba..b2f5860 100755 --- a/workflows/artic_nanopore_nCov19.nf +++ b/workflows/artic_nanopore_nCov19.nf @@ -15,6 +15,10 @@ workflow artic_ncov_wf { artic_medaka_custom_bed(fastq.combine(external_primer_schemes).combine(primerBed), normalise_threshold) assembly = artic_medaka_custom_bed.out.fasta binary_alignment = artic_medaka_custom_bed.out.fullbam + trimmed_bam = artic_medaka_custom_bed.out.reference_bam + vcf = artic_medaka_custom_bed.out.vcf + failed_vcf = artic_medaka_custom_bed.out.vcf_fail + primer_dir = artic_medaka_custom_bed.out.primer_dir // plot amplicon coverage covarplot_custom_bed(artic_medaka_custom_bed.out.covarplot.combine(primerBed)) @@ -27,6 +31,10 @@ workflow artic_ncov_wf { artic_medaka(fastq.combine(external_primer_schemes), normalise_threshold) assembly = artic_medaka.out.fasta binary_alignment = artic_medaka.out.fullbam + trimmed_bam = artic_medaka.out.reference_bam + vcf = artic_medaka.out.vcf + failed_vcf = artic_medaka.out.vcf_fail + primer_dir = Channel.empty() // plot amplicon coverage covarplot(artic_medaka.out.covarplot.combine(external_primer_schemes)) @@ -40,6 +48,10 @@ workflow artic_ncov_wf { emit: assembly binary_alignment + trimmed_bam + vcf + primer_dir + failed_vcf } workflow artic_ncov_np_wf { diff --git a/workflows/create_summary_report.nf b/workflows/create_summary_report.nf index 806f0ba..9b606c0 100644 --- a/workflows/create_summary_report.nf +++ b/workflows/create_summary_report.nf @@ -8,6 +8,7 @@ workflow create_summary_report_wf { president nextclade kraken2 + mixed_sites alignments samples_table @@ -18,6 +19,7 @@ workflow create_summary_report_wf { pangolin_results = pangolin.map {it -> it[1]}.collectFile(name: 'pangolin_results.csv', skip: 1, keepHeader: true) president_results = president.map {it -> it[1]}.collectFile(name: 'president_results.tsv', skip: 1, keepHeader: true) nextclade_results = nextclade.map {it -> it[1]}.collectFile(name: 'nextclade_results.tsv', skip: 1, keepHeader: true) + mixed_site_results = mixed_sites.map {it -> it[1]}.collectFile(name: 'mixed_sites_results.tsv', skip: 1, keepHeader: true) alignment_files = alignments.map {it -> it[0]}.collect() if (params.fasta || workflow.profile.contains('test_fasta')) { @@ -30,7 +32,7 @@ workflow create_summary_report_wf { coverage_plots = plot_coverages(alignments.map{it -> it[0]}.toSortedList({ a, b -> a.simpleName <=> b.simpleName }).flatten().collate(6), \ alignments.map{it -> it[1]}.toSortedList({ a, b -> a.simpleName <=> b.simpleName }).flatten().collate(6)).collect() - if (params.samples) { summary_report(version_ch, variants_table_ch, pangolin_results, president_results, nextclade_results, kraken2_results, coverage_plots, samples_table) } - else { summary_report_default(version_ch, variants_table_ch, pangolin_results, president_results, nextclade_results, kraken2_results, coverage_plots) } + if (params.samples) { summary_report(version_ch, variants_table_ch, pangolin_results, president_results, nextclade_results, kraken2_results, mixed_site_results, coverage_plots, samples_table) } + else { summary_report_default(version_ch, variants_table_ch, pangolin_results, president_results, nextclade_results, kraken2_results, mixed_site_results, coverage_plots) } } } diff --git a/workflows/process/artic.nf b/workflows/process/artic.nf index d34aa00..6c738a4 100755 --- a/workflows/process/artic.nf +++ b/workflows/process/artic.nf @@ -88,6 +88,7 @@ process artic_medaka_custom_bed { tuple val(name), path("${name}.primersitereport.txt"), emit: primersitereport tuple val(name), path("${name}.coverage_mask.txt"), emit: coverage_mask tuple val(name), path("${name}.fail.vcf"), emit: vcf_fail + path ("primer_scheme/nCoV-2019/"), emit: primer_dir script: def normalise_arg = normalise_threshold ? "--normalise ${normalise_threshold}" : '--normalise 0' """ diff --git a/workflows/process/summary_report.nf b/workflows/process/summary_report.nf index f601060..d0406d7 100644 --- a/workflows/process/summary_report.nf +++ b/workflows/process/summary_report.nf @@ -10,6 +10,7 @@ process summary_report { path(president_results) path(nextclade_results) file(kraken2_results) + file(mixed_sites_results) file(coverage_plots) file(samples_table) output: @@ -43,6 +44,7 @@ process summary_report { -q !{president_results} \ -n !{nextclade_results} \ -k kraken2_results.csv \ + -m !{mixed_sites_results} \ -c $(echo !{coverage_plots} | tr ' ' ',') \ -s !{samples_table} ''' @@ -64,6 +66,7 @@ process summary_report_default { path(president_results) path(nextclade_results) path(kraken2_results) + path(mixed_sites_results) path(coverage_plots) output: path("poreCov_summary_report_*.html") @@ -96,6 +99,7 @@ process summary_report_default { -q !{president_results} \ -n !{nextclade_results} \ -k kraken2_results.csv \ + -m !{mixed_sites_results} \ -c $(echo !{coverage_plots} | tr ' ' ',') ''' stub: