From d0f293d2b8136fb0d8ef3753733ab1d30fb20d56 Mon Sep 17 00:00:00 2001 From: Alexander Thomas <77535027+alethomas@users.noreply.github.com> Date: Thu, 21 Sep 2023 16:07:57 +0200 Subject: [PATCH] fix: switch from jupyter notebook to script (#598) * switch from jupyter notebook to script * change code format * fmt --- .../assembly-benchmark-results.py.ipynb | 201 ------------------ workflow/rules/benchmarking.smk | 4 +- .../scripts/assembly-benchmark-results.py | 167 +++++++++++++++ 3 files changed, 169 insertions(+), 203 deletions(-) delete mode 100644 workflow/notebooks/assembly-benchmark-results.py.ipynb create mode 100644 workflow/scripts/assembly-benchmark-results.py diff --git a/workflow/notebooks/assembly-benchmark-results.py.ipynb b/workflow/notebooks/assembly-benchmark-results.py.ipynb deleted file mode 100644 index 46e7dbc64..000000000 --- a/workflow/notebooks/assembly-benchmark-results.py.ipynb +++ /dev/null @@ -1,201 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "sys.stderr = open(snakemake.log[0], \"w\")\n", - "import pysam\n", - "\n", - "# see https://pysam.readthedocs.io/en/latest/api.html#pysam.AlignedSegment.cigartuples\n", - "BAM_CEQUAL = 7\n", - "BAM_CDIFF = 8\n", - "BAM_CSOFT_CLIP = 4\n", - "BAM_CHARD_CLIP = 5\n", - "BAM_CINS = 1\n", - "BAM_CDEL = 2\n", - "\n", - "N_WINDOW = 100\n", - "\n", - "\n", - "def is_uncertain_region(record, rpos, rend, ref_fasta):\n", - " refseq = ref_fasta.fetch(record.reference_name, rpos - N_WINDOW, rend + N_WINDOW)\n", - " return (\"N\" * 10) in refseq\n", - "\n", - "\n", - "def get_edit_dist(record, ref_fasta):\n", - " edit_dist = 0\n", - " qpos = 0\n", - " rpos = record.reference_start\n", - " for op, length in record.cigartuples:\n", - " if op == BAM_CEQUAL:\n", - " # no edit\n", - " qpos += length\n", - " rpos += length\n", - " else:\n", - " if op == BAM_CDIFF:\n", - " refseq = ref_fasta.fetch(record.reference_name, rpos, rpos + length)\n", - " if not is_uncertain_region(record, rpos, rpos + length, ref_fasta):\n", - " # Only consider edits where the reference has a true nucleotide\n", - " # because IUPAC codes lead to mason simulating an N in the reads.\n", - " edit_dist += sum(base in \"ACGT\" for base in refseq)\n", - " qpos += length\n", - " rpos += length\n", - "\n", - " elif op == BAM_CSOFT_CLIP:\n", - " edit_dist += length\n", - " qpos += length\n", - " elif op == BAM_CHARD_CLIP:\n", - " edit_dist += length\n", - " elif op == BAM_CINS:\n", - " refseq = ref_fasta.fetch(record.reference_name, rpos, rpos + length)\n", - " if not is_uncertain_region(record, rpos, rpos + length, ref_fasta):\n", - " # only count edit distance if the region behind the insert does not\n", - " # contain N stretches in the reference. Rationale: those regions are apparently\n", - " # hard to assemble, and we cannot properly simulate reads for them, so\n", - " # we should not count them in a benchmark.\n", - " edit_dist += length\n", - " qpos += length\n", - " elif op == BAM_CDEL:\n", - " refseq = ref_fasta.fetch(record.reference_name, rpos, rpos + length)\n", - " if not is_uncertain_region(record, rpos, rpos + length, ref_fasta):\n", - " # only count edit distance if the region behind the insert does not\n", - " # contain N stretches in the reference. Rationale: those regions are apparently\n", - " # hard to assemble, and we cannot properly simulate reads for them, so\n", - " # we should not count them in a benchmark.\n", - " edit_dist += length\n", - " rpos += length\n", - " else:\n", - " raise ValueError(f\"Unsupported CIGAR operation: {op}\")\n", - " return edit_dist\n", - "\n", - "\n", - "with open(snakemake.output[0], \"w\") as out:\n", - " print(\n", - " \"Accession\",\n", - " \"Contigs\",\n", - " \"Total contigs\",\n", - " \"Contig length\",\n", - " \"Reference length\",\n", - " \"Contig frac\",\n", - " \"Cum frac\",\n", - " \"Relevant edit dist\",\n", - " \"Cigar string\",\n", - " \"Edit frac\",\n", - " sep=\"\\t\",\n", - " file=out,\n", - " )\n", - " sum_of_edit_dist = 0\n", - " largest_no_contig = 0\n", - " small_larg_cover = 1.0\n", - " small_larg_cover_name = \"asd\"\n", - " smallest_cum_frac = 1.0\n", - "\n", - " # for bam_file in snakemake_input:\n", - " for bam_file, ref_fasta in zip(snakemake.input.bams, snakemake.input.refs):\n", - " current_contig = 1\n", - "\n", - " ref_fasta = pysam.FastaFile(ref_fasta)\n", - "\n", - " with pysam.AlignmentFile(bam_file, \"rb\") as samfile:\n", - " total_contigs = samfile.count()\n", - " accession = samfile.get_reference_name(0)\n", - "\n", - " largest_no_contig = (\n", - " total_contigs\n", - " if total_contigs > largest_no_contig\n", - " else largest_no_contig\n", - " )\n", - "\n", - " with pysam.AlignmentFile(bam_file, \"rb\") as samfile:\n", - " ref_lengths = samfile.lengths[0]\n", - " largest_coverage = 0\n", - " cum_frac = 0\n", - " for read in samfile.fetch():\n", - " query_alignment_length = read.query_alignment_length\n", - " frac = round(query_alignment_length / ref_lengths, 2)\n", - "\n", - " # Calculate edit distance from CIGAR string, because NM tag counts matching Ns as edits.\n", - " edit = get_edit_dist(read, ref_fasta)\n", - "\n", - " cum_frac += frac\n", - " cum_frac = round(cum_frac, 2)\n", - " sum_of_edit_dist = sum_of_edit_dist + edit\n", - " edit_frac = round(edit / query_alignment_length, 5)\n", - " largest_coverage = frac if frac > largest_coverage else largest_coverage\n", - "\n", - " print(\n", - " accession,\n", - " current_contig,\n", - " total_contigs,\n", - " query_alignment_length,\n", - " ref_lengths,\n", - " frac,\n", - " cum_frac,\n", - " edit,\n", - " read.cigarstring,\n", - " edit_frac,\n", - " sep=\"\\t\",\n", - " file=out,\n", - " )\n", - "\n", - " current_contig += 1\n", - " smallest_cum_frac = (\n", - " cum_frac if smallest_cum_frac > cum_frac else smallest_cum_frac\n", - " )\n", - " small_larg_cover = (\n", - " largest_coverage\n", - " if small_larg_cover > largest_coverage\n", - " else small_larg_cover\n", - " )\n", - " small_larg_cover_name = (\n", - " accession\n", - " if small_larg_cover >= largest_coverage\n", - " else small_larg_cover_name\n", - " )\n", - "\n", - " print(\"Largest number of contigs\")\n", - " print(largest_no_contig)\n", - " print(\"Smallest largest coverage in \" + str(small_larg_cover_name))\n", - " print(small_larg_cover)\n", - " print(\"Sum of edit distances\")\n", - " print(sum_of_edit_dist)\n", - " print(\"Smallest cum. fraction of contigs\")\n", - " print(smallest_cum_frac)\n", - " print(\"Largest number of contigs\", file=out)\n", - " print(largest_no_contig, file=out)\n", - " print(\"Smallest largest coverage in \" + str(small_larg_cover_name), file=out)\n", - " print(small_larg_cover, file=out)\n", - " print(\"Sum of edit distances\", file=out)\n", - " print(sum_of_edit_dist, file=out)\n", - " print(\"Smallest cum. fraction of contigs\", file=out)\n", - " print(smallest_cum_frac, file=out)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/workflow/rules/benchmarking.smk b/workflow/rules/benchmarking.smk index c5683ecf1..6f455f0ce 100644 --- a/workflow/rules/benchmarking.smk +++ b/workflow/rules/benchmarking.smk @@ -130,8 +130,8 @@ rule summarize_assembly_results: notebooks=1, conda: "../envs/pysam.yaml" - notebook: - "../notebooks/assembly-benchmark-results.py.ipynb" + script: + "../scripts/assembly-benchmark-results.py" rule test_non_cov2: diff --git a/workflow/scripts/assembly-benchmark-results.py b/workflow/scripts/assembly-benchmark-results.py new file mode 100644 index 000000000..aca8c4c7e --- /dev/null +++ b/workflow/scripts/assembly-benchmark-results.py @@ -0,0 +1,167 @@ +# %% +sys.stderr = open(snakemake.log[0], "w") +import pysam + +# see https://pysam.readthedocs.io/en/latest/api.html#pysam.AlignedSegment.cigartuples +BAM_CEQUAL = 7 +BAM_CDIFF = 8 +BAM_CSOFT_CLIP = 4 +BAM_CHARD_CLIP = 5 +BAM_CINS = 1 +BAM_CDEL = 2 + +N_WINDOW = 100 + + +def is_uncertain_region(record, rpos, rend, ref_fasta): + refseq = ref_fasta.fetch(record.reference_name, rpos - N_WINDOW, rend + N_WINDOW) + return ("N" * 10) in refseq + + +def get_edit_dist(record, ref_fasta): + edit_dist = 0 + qpos = 0 + rpos = record.reference_start + for op, length in record.cigartuples: + if op == BAM_CEQUAL: + # no edit + qpos += length + rpos += length + else: + if op == BAM_CDIFF: + refseq = ref_fasta.fetch(record.reference_name, rpos, rpos + length) + if not is_uncertain_region(record, rpos, rpos + length, ref_fasta): + # Only consider edits where the reference has a true nucleotide + # because IUPAC codes lead to mason simulating an N in the reads. + edit_dist += sum(base in "ACGT" for base in refseq) + qpos += length + rpos += length + + elif op == BAM_CSOFT_CLIP: + edit_dist += length + qpos += length + elif op == BAM_CHARD_CLIP: + edit_dist += length + elif op == BAM_CINS: + refseq = ref_fasta.fetch(record.reference_name, rpos, rpos + length) + if not is_uncertain_region(record, rpos, rpos + length, ref_fasta): + # only count edit distance if the region behind the insert does not + # contain N stretches in the reference. Rationale: those regions are apparently + # hard to assemble, and we cannot properly simulate reads for them, so + # we should not count them in a benchmark. + edit_dist += length + qpos += length + elif op == BAM_CDEL: + refseq = ref_fasta.fetch(record.reference_name, rpos, rpos + length) + if not is_uncertain_region(record, rpos, rpos + length, ref_fasta): + # only count edit distance if the region behind the insert does not + # contain N stretches in the reference. Rationale: those regions are apparently + # hard to assemble, and we cannot properly simulate reads for them, so + # we should not count them in a benchmark. + edit_dist += length + rpos += length + else: + raise ValueError(f"Unsupported CIGAR operation: {op}") + return edit_dist + + +with open(snakemake.output[0], "w") as out: + print( + "Accession", + "Contigs", + "Total contigs", + "Contig length", + "Reference length", + "Contig frac", + "Cum frac", + "Relevant edit dist", + "Cigar string", + "Edit frac", + sep="\t", + file=out, + ) + sum_of_edit_dist = 0 + largest_no_contig = 0 + small_larg_cover = 1.0 + small_larg_cover_name = "asd" + smallest_cum_frac = 1.0 + + # for bam_file in snakemake_input: + for bam_file, ref_fasta in zip(snakemake.input.bams, snakemake.input.refs): + current_contig = 1 + + ref_fasta = pysam.FastaFile(ref_fasta) + + with pysam.AlignmentFile(bam_file, "rb") as samfile: + total_contigs = samfile.count() + accession = samfile.get_reference_name(0) + + largest_no_contig = ( + total_contigs + if total_contigs > largest_no_contig + else largest_no_contig + ) + + with pysam.AlignmentFile(bam_file, "rb") as samfile: + ref_lengths = samfile.lengths[0] + largest_coverage = 0 + cum_frac = 0 + for read in samfile.fetch(): + query_alignment_length = read.query_alignment_length + frac = round(query_alignment_length / ref_lengths, 2) + + # Calculate edit distance from CIGAR string, because NM tag counts matching Ns as edits. + edit = get_edit_dist(read, ref_fasta) + + cum_frac += frac + cum_frac = round(cum_frac, 2) + sum_of_edit_dist = sum_of_edit_dist + edit + edit_frac = round(edit / query_alignment_length, 5) + largest_coverage = frac if frac > largest_coverage else largest_coverage + + print( + accession, + current_contig, + total_contigs, + query_alignment_length, + ref_lengths, + frac, + cum_frac, + edit, + read.cigarstring, + edit_frac, + sep="\t", + file=out, + ) + + current_contig += 1 + smallest_cum_frac = ( + cum_frac if smallest_cum_frac > cum_frac else smallest_cum_frac + ) + small_larg_cover = ( + largest_coverage + if small_larg_cover > largest_coverage + else small_larg_cover + ) + small_larg_cover_name = ( + accession + if small_larg_cover >= largest_coverage + else small_larg_cover_name + ) + + print("Largest number of contigs") + print(largest_no_contig) + print("Smallest largest coverage in " + str(small_larg_cover_name)) + print(small_larg_cover) + print("Sum of edit distances") + print(sum_of_edit_dist) + print("Smallest cum. fraction of contigs") + print(smallest_cum_frac) + print("Largest number of contigs", file=out) + print(largest_no_contig, file=out) + print("Smallest largest coverage in " + str(small_larg_cover_name), file=out) + print(small_larg_cover, file=out) + print("Sum of edit distances", file=out) + print(sum_of_edit_dist, file=out) + print("Smallest cum. fraction of contigs", file=out) + print(smallest_cum_frac, file=out)