diff --git a/misc/bug_hunting/check_align.py b/misc/bug_hunting/check_align.py new file mode 100644 index 0000000..ae495f4 --- /dev/null +++ b/misc/bug_hunting/check_align.py @@ -0,0 +1,56 @@ +import logging +import pickle + +from cool_seq_tool.schemas import Strand + +from dcd_mapping.align import align +from dcd_mapping.resources import LOCAL_STORE_PATH, get_scoreset_metadata + +logging.basicConfig( + filename="dcd-check-align.log", + format="%(asctime)s %(levelname)s:%(name)s:%(message)s", + level=logging.WARNING, + force=True, +) + +_logger = logging.getLogger(__name__) + +with open("notebooks/analysis/results/mave_blat.pickle", "rb") as f: + mave_blat_dict = pickle.load(f) + +with open(LOCAL_STORE_PATH / "human_urns.txt", "r") as f: + urns = [line.strip() for line in f.readlines()] + +strand_reformat = {1: Strand.POSITIVE, -1: Strand.NEGATIVE} + + +def format_chrom(chrom): + return f"chr{chrom}" + + +for urn in urns: + print(f"Checking {urn}...") + try: + metadata = get_scoreset_metadata(urn) + alignment = align(metadata, False, True) + except Exception as e: + _logger.error("%s error: %s", urn, e) + continue + if urn not in mave_blat_dict: + continue + original = mave_blat_dict[urn] + try: + for name, actual, expected in [ + ("chromosome", alignment.chrom, format_chrom(original["chrom"])), + ("strand", alignment.strand, strand_reformat[original["strand"]]), + ]: + if actual != expected: + _logger.error( + "%s %s mismatch: %s (actual) vs %s (expected)", + urn, + name, + actual, + expected, + ) + except Exception as e: + _logger.error("%s exception: %s", urn, e) diff --git a/pyproject.toml b/pyproject.toml index 0d5fd0f..3386039 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ branch = true [tool.ruff] src = ["src"] +exclude = ["misc/*"] # pycodestyle (E, W) # Pyflakes (F) # flake8-annotations (ANN) diff --git a/tests/conftest.py b/tests/conftest.py index 92713db..e323fcd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,11 @@ -"""Provide shared testing utilities.""" +"""Provide shared testing utilities. + + +Notes on test cases: +------------------- + +* urn:mavedb:00000068-a-1: TP53, protein-coding, DNA. +""" import json import os from pathlib import Path diff --git a/tests/fixtures/align_result.json b/tests/fixtures/align_result.json index 71cf8d5..5336206 100644 --- a/tests/fixtures/align_result.json +++ b/tests/fixtures/align_result.json @@ -58,5 +58,37 @@ { "start": 1261199, "end": 12612062 }, { "start": 12618514, "end": 12618568 } ] + }, + "urn:mavedb:00000068-a-1": { + "chrom": "chr17", + "strand": -1, + "coverage": 99.83079526226734, + "ident_pct": 99.91525423728814, + "query_range": { "start": 0, "end": 1180 }, + "query_subranges": [ + { "start": 1100, "end": 1180 }, + { "start": 993, "end": 1100 }, + { "start": 919, "end": 993 }, + { "start": 782, "end": 919 }, + { "start": 672, "end": 782 }, + { "start": 559, "end": 672 }, + { "start": 375, "end": 559 }, + { "start": 96, "end": 375 }, + { "start": 74, "end": 96 }, + { "start": 0, "end": 74 } + ], + "hit_range": { "start": 7676520, "end": 7669690 }, + "hit_subranges": [ + { "start": 7669610, "end": 7669690 }, + { "start": 7670608, "end": 7670715 }, + { "start": 7673534, "end": 7673608 }, + { "start": 7673700, "end": 7673837 }, + { "start": 7674180, "end": 7674290 }, + { "start": 7674858, "end": 7674971 }, + { "start": 7675052, "end": 7675236 }, + { "start": 7675993, "end": 7676272 }, + { "start": 7676381, "end": 7676403 }, + { "start": 7676520, "end": 7676594 } + ] } } diff --git a/tests/fixtures/scoreset_metadata.json b/tests/fixtures/scoreset_metadata.json index 0b2893e..1b36625 100644 --- a/tests/fixtures/scoreset_metadata.json +++ b/tests/fixtures/scoreset_metadata.json @@ -62,6 +62,15 @@ "target_sequence_type": "dna", "target_reference_genome": "hg38", "target_uniprot_ref": { "id": "uniprot:P04049", "offset": 51 } + }, + { + "urn": "urn:mavedb:00000068-a-1", + "target_gene_name": "TP53 (P72R)", + "target_gene_category": "Protein coding", + "target_sequence": "ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCGCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTAG", + "target_sequence_type": "dna", + "target_reference_genome": "hg38", + "target_uniprot_ref": null } ] } diff --git a/tests/fixtures/urn:mavedb:00000068-a-1_scores.csv b/tests/fixtures/urn:mavedb:00000068-a-1_scores.csv new file mode 100644 index 0000000..53b2e6d --- /dev/null +++ b/tests/fixtures/urn:mavedb:00000068-a-1_scores.csv @@ -0,0 +1,48 @@ +accession,hgvs_nt,hgvs_splice,hgvs_pro,score +urn:mavedb:00000068-a-1#528,NA,NA,p.Gly389Ter,-0.8448754497234633 +urn:mavedb:00000068-a-1#546,NA,NA,p.Gly389Cys,-0.5081453414881882 +urn:mavedb:00000068-a-1#612,NA,NA,p.His365Ter,-2.226787236679561 +urn:mavedb:00000068-a-1#547,NA,NA,p.Gly389=,-0.3114032004197305 +urn:mavedb:00000068-a-1#613,NA,NA,p.His365Tyr,-0.36585451364742 +urn:mavedb:00000068-a-1#548,NA,NA,p.Gly389Ala,-0.8482077472283238 +urn:mavedb:00000068-a-1#549,NA,NA,p.Glu388Ter,0.05530191813883238 +urn:mavedb:00000068-a-1#550,NA,NA,p.Glu388Tyr,-0.34329774205619 +urn:mavedb:00000068-a-1#551,NA,NA,p.Glu388Trp,-1.5238647488862949 +urn:mavedb:00000068-a-1#552,NA,NA,p.Glu388Val,-0.6422615384883972 +urn:mavedb:00000068-a-1#553,NA,NA,p.Glu388Thr,-0.8490660813565178 +urn:mavedb:00000068-a-1#554,NA,NA,p.Glu388Ser,-1.2839487661086697 +urn:mavedb:00000068-a-1#555,NA,NA,p.Glu388Arg,-1.024302690973422 +urn:mavedb:00000068-a-1#556,NA,NA,p.Glu388Gln,-0.3114282471795208 +urn:mavedb:00000068-a-1#557,NA,NA,p.Glu388Pro,-0.5571635356150665 +urn:mavedb:00000068-a-1#558,NA,NA,p.Glu388Asn,-0.5151230575285044 +urn:mavedb:00000068-a-1#559,NA,NA,p.Glu388Met,-1.2506258908030141 +urn:mavedb:00000068-a-1#614,NA,NA,p.His365Trp,-1.1230151778981163 +urn:mavedb:00000068-a-1#560,NA,NA,p.Glu388Leu,-0.6969258713954155 +urn:mavedb:00000068-a-1#615,NA,NA,p.His365Val,-0.826647313158234 +urn:mavedb:00000068-a-1#561,NA,NA,p.Glu388Lys,0.044449447270876415 +urn:mavedb:00000068-a-1#562,NA,NA,p.Glu388Ile,-0.6703081212668602 +urn:mavedb:00000068-a-1#616,NA,NA,p.His365Thr,-1.2339323429418732 +urn:mavedb:00000068-a-1#563,NA,NA,p.Glu388His,-1.0228758006971406 +urn:mavedb:00000068-a-1#617,NA,NA,p.His365Ser,-1.207958290024986 +urn:mavedb:00000068-a-1#564,NA,NA,p.Glu388Gly,0.1928432362603335 +urn:mavedb:00000068-a-1#565,NA,NA,p.Glu388Phe,-0.7995695025755253 +urn:mavedb:00000068-a-1#566,NA,NA,p.Glu388Asp,-0.5734467636981624 +urn:mavedb:00000068-a-1#567,NA,NA,p.Glu388Cys,-0.5200106027473981 +urn:mavedb:00000068-a-1#618,NA,NA,p.His365Arg,-0.15648499129119786 +urn:mavedb:00000068-a-1#568,NA,NA,p.Glu388=,0.13439466320551569 +urn:mavedb:00000068-a-1#569,NA,NA,p.Glu388Ala,-0.24796460732041514 +urn:mavedb:00000068-a-1#595,NA,NA,p.Ser366Tyr,-0.11899995564102445 +urn:mavedb:00000068-a-1#596,NA,NA,p.Ser366Trp,-1.419503452209785 +urn:mavedb:00000068-a-1#597,NA,NA,p.Ser366Val,-0.38831518205082655 +urn:mavedb:00000068-a-1#598,NA,NA,p.Ser366Thr,-0.843833245301256 +urn:mavedb:00000068-a-1#599,NA,NA,p.Ser366Arg,-0.6638684644018105 +urn:mavedb:00000068-a-1#600,NA,NA,p.Ser366Gln,-0.4299226799331797 +urn:mavedb:00000068-a-1#601,NA,NA,p.Ser366Pro,-0.5476098144406847 +urn:mavedb:00000068-a-1#602,NA,NA,p.Ser366Lys,-1.1512867723916729 +urn:mavedb:00000068-a-1#603,NA,NA,p.Ser366Ile,-0.6304135202423646 +urn:mavedb:00000068-a-1#604,NA,NA,p.Ser366His,-1.1104041758877286 +urn:mavedb:00000068-a-1#605,NA,NA,p.Ser366Gly,-0.5861940789803509 +urn:mavedb:00000068-a-1#606,NA,NA,p.Ser366Phe,-0.1552908998376018 +urn:mavedb:00000068-a-1#607,NA,NA,p.Ser366Glu,-0.8533090945531668 +urn:mavedb:00000068-a-1#608,NA,NA,p.Ser366Asp,-1.3609765742847353 +urn:mavedb:00000068-a-1#609,NA,NA,p.Ser366Cys,-1.6901296514759063 diff --git a/tests/unit/test_align.py b/tests/unit/test_align.py index 8b93b39..87f7e24 100644 --- a/tests/unit/test_align.py +++ b/tests/unit/test_align.py @@ -6,159 +6,86 @@ """ import pytest -from cool_seq_tool.schemas import Strand from dcd_mapping.align import align +from dcd_mapping.schemas import AlignmentResult -def test_align_src_catalytic_domain(scoreset_metadata_fixture): +def check_alignment_result_equality(actual: AlignmentResult, expected: AlignmentResult): + """Check correctness of alignment result against fixture""" + assert actual.chrom == expected.chrom + assert actual.strand == expected.strand + assert actual.coverage == pytest.approx(expected.coverage) + assert actual.ident_pct == pytest.approx(expected.ident_pct) + assert actual.query_range.start == expected.query_range.start + assert actual.query_range.end == expected.query_range.end + for a, e in zip(actual.query_subranges, expected.query_subranges): + assert a.start == e.start + assert a.end == e.end + assert len(actual.query_subranges) == len(expected.query_subranges) + assert actual.hit_range.start == expected.hit_range.start + assert actual.hit_range.end == expected.hit_range.end + for a, e in zip(actual.hit_subranges, expected.hit_subranges): + assert a.start == e.start + assert a.end == e.end + assert len(actual.hit_subranges) == len(expected.hit_subranges) + + +def test_align_src_catalytic_domain(scoreset_metadata_fixture, align_result_fixture): """Test ``align()`` method on urn:mavedb:00000041-a-1""" - scoreset_metadata = scoreset_metadata_fixture["urn:mavedb:00000041-a-1"] + urn = "urn:mavedb:00000041-a-1" + scoreset_metadata = scoreset_metadata_fixture[urn] align_result = align(scoreset_metadata) + expected = AlignmentResult(**align_result_fixture[urn]) assert align_result - assert align_result.chrom == "chr20" - assert align_result.strand == Strand.POSITIVE - assert align_result.coverage == pytest.approx(100.0) - assert align_result.ident_pct == pytest.approx(99.86) - assert align_result.query_range.start == 0 - assert align_result.query_range.end == 750 - query_subranges = [ - [0, 52], - [52, 232], - [232, 309], - [309, 463], - [463, 595], - [595, 750], - ] - for actual, expected in zip(align_result.query_subranges, query_subranges): - assert actual.start == expected[0] - assert actual.end == expected[1] - assert len(align_result.query_subranges) == len(query_subranges) - assert align_result.hit_range.start == 37397802 - assert align_result.hit_range.end == 37403325 - hit_subranges = [ - [37397802, 37397854], - [37400114, 37400294], - [37401601, 37401678], - [37402434, 37402588], - [37402748, 37402880], - [37403170, 37403325], - ] - for actual, expected in zip(align_result.hit_subranges, hit_subranges): - assert actual.start == expected[0] - assert actual.end == expected[1] - assert len(align_result.hit_subranges) == len(hit_subranges) - - -def test_align_hbb(scoreset_metadata_fixture): + check_alignment_result_equality(align_result, expected) + + +def test_align_hbb(scoreset_metadata_fixture, align_result_fixture): """Test ``align()`` method on urn:mavedb:00000018-a-1""" - scoreset_metadata = scoreset_metadata_fixture["urn:mavedb:00000018-a-1"] + urn = "urn:mavedb:00000018-a-1" + scoreset_metadata = scoreset_metadata_fixture[urn] align_result = align(scoreset_metadata) + expected = AlignmentResult(**align_result_fixture[urn]) assert align_result - assert align_result.chrom == "chr11" - assert align_result.strand == Strand.POSITIVE - assert align_result.coverage == pytest.approx(100.0) - assert align_result.ident_pct == 0 # TODO - assert align_result.query_range.start == 0 - assert align_result.query_range.end == 187 - query_subranges = [[0, 187]] - for actual, expected in zip(align_result.query_subranges, query_subranges): - assert actual.start == expected[0] - assert actual.end == expected[1] - assert len(align_result.query_subranges) == len(query_subranges) - assert align_result.hit_range.start == 5227021 - assert align_result.hit_range.end == 5227208 - hit_subranges = [[5227021, 5227208]] - for actual, expected in zip(align_result.hit_subranges, hit_subranges): - assert actual.start == expected[0] - assert actual.end == expected[1] - assert len(align_result.hit_subranges) == len(hit_subranges) - - -def test_align_ube2i(scoreset_metadata_fixture): + check_alignment_result_equality(align_result, expected) + + +def test_align_ube2i(scoreset_metadata_fixture, align_result_fixture): """Test ``align()`` on urn:mavedb:00000001-a-4""" - scoreset_metadata = scoreset_metadata_fixture["urn:mavedb:00000001-a-4"] + urn = "urn:mavedb:00000001-a-4" + scoreset_metadata = scoreset_metadata_fixture[urn] align_result = align(scoreset_metadata) + expected = AlignmentResult(**align_result_fixture[urn]) assert align_result - assert align_result.chrom == "chr16" - assert align_result.strand == Strand.POSITIVE - assert align_result.coverage == pytest.approx(100.0) - assert align_result.ident_pct == pytest.approx(100.0) - assert align_result.query_range.start == 0 - assert align_result.query_range.end == 477 - query_subranges = [ - [0, 66], - [66, 150], - [150, 223], - [223, 333], - [333, 413], - [413, 477], - ] - for actual, expected in zip(align_result.query_subranges, query_subranges): - assert actual.start == expected[0] - assert actual.end == expected[1] - assert len(align_result.query_subranges) == len(query_subranges) - assert align_result.hit_range.start == 5227021 - assert align_result.hit_range.end == 5227208 - hit_subranges = [ - [1314030, 1314096], - [1314292, 1314376], - [1315653, 1315726], - [1320173, 1320283], - [1320437, 1320517], - [1324729, 1324793], - ] - for actual, expected in zip(align_result.hit_subranges, hit_subranges): - assert actual.start == expected[0] - assert actual.end == expected[1] - assert len(align_result.hit_subranges) == len(hit_subranges) - - -def test_align_scn5a(scoreset_metadata_fixture): + check_alignment_result_equality(align_result, expected) + + +def test_align_scn5a(scoreset_metadata_fixture, align_result_fixture): """Test ``align()`` method on urn:mavedb:00000098-a-1""" - scoreset_metadata = scoreset_metadata_fixture["urn:mavedb:00000098-a-1"] + urn = "urn:mavedb:00000098-a-1" + scoreset_metadata = scoreset_metadata_fixture[urn] align_result = align(scoreset_metadata) + expected = AlignmentResult(**align_result_fixture[urn]) assert align_result - assert align_result.chrom == "chr3" - assert align_result.strand == Strand.NEGATIVE - assert align_result.coverage == pytest.approx(100.0) - assert align_result.ident_pct == pytest.approx(100.0) - assert align_result.query_range.start == 0 - assert align_result.query_range.end == 12 - query_subranges = [[0, 12]] - for actual, expected in zip(align_result.query_subranges, query_subranges): - assert actual.start == expected[0] - assert actual.end == expected[1] - assert len(align_result.query_subranges) == len(query_subranges) - assert align_result.hit_range.start == 38551475 - assert align_result.hit_range.end == 38551511 - hit_subranges = [[38551475, 38551511]] - for actual, expected in zip(align_result.hit_subranges, hit_subranges): - assert actual.start == expected[0] - assert actual.end == expected[1] - assert len(align_result.hit_subranges) == len(hit_subranges) - - -def test_align_raf(scoreset_metadata_fixture): + check_alignment_result_equality(align_result, expected) + + +def test_align_raf(scoreset_metadata_fixture, align_result_fixture): """Test ``align()`` method on urn:mavedb:00000061-h-1.""" - scoreset_metadata = scoreset_metadata_fixture["urn:mavedb:00000061-h-1"] + urn = "urn:mavedb:00000061-h-1" + scoreset_metadata = scoreset_metadata_fixture[urn] align_result = align(scoreset_metadata) + expected = AlignmentResult(**align_result_fixture[urn]) + assert align_result + check_alignment_result_equality(align_result, expected) + + +def test_align_tp53(scoreset_metadata_fixture, align_result_fixture): + """Test ``align()`` method on urn:mavedb:00000068-a-1""" + urn = "urn:mavedb:00000068-a-1" + metadata = scoreset_metadata_fixture[urn] + align_result = align(metadata) + expected = AlignmentResult(**align_result_fixture[urn]) assert align_result - assert align_result.chrom == "chr3" - assert align_result.strand == Strand.NEGATIVE - assert align_result.coverage == pytest.approx(100.0) - assert align_result.ident_pct == pytest.approx(100.0) - assert align_result.query_range.start == 0 - assert align_result.query_range.end == 117 - query_subranges = [[54, 117], [0, 54]] - for actual, expected in zip(align_result.query_subranges, query_subranges): - assert actual.start == expected[0] - assert actual.end == expected[1] - assert len(align_result.query_subranges) == len(query_subranges) - assert align_result.hit_range.start == 12618514 - assert align_result.hit_range.end == 12612062 - hit_subranges = [[12611999, 12612062], [12618514, 12618568]] - for actual, expected in zip(align_result.hit_subranges, hit_subranges): - assert actual.start == expected[0] - assert actual.end == expected[1] - assert len(align_result.hit_subranges) == len(hit_subranges) + check_alignment_result_equality(align_result, expected)