diff --git a/misc/bug_hunting/check_align.py b/misc/bug_hunting/check_align.py index eb2dbcd..05111c7 100644 --- a/misc/bug_hunting/check_align.py +++ b/misc/bug_hunting/check_align.py @@ -18,7 +18,7 @@ with open("notebooks/analysis/results/mave_blat.pickle", "rb") as f: mave_blat_dict = pickle.load(f) -with open("human_urns.txt", "r") as f: +with open("misc/bug_hunting/human_urns.txt", "r") as f: urns = [line.strip() for line in f.readlines()] strand_reformat = {1: Strand.POSITIVE, -1: Strand.NEGATIVE} diff --git a/misc/bug_hunting/check_transcript.py b/misc/bug_hunting/check_transcript.py new file mode 100644 index 0000000..effda4b --- /dev/null +++ b/misc/bug_hunting/check_transcript.py @@ -0,0 +1,84 @@ +import logging +import asyncio +import pickle + +from cool_seq_tool.schemas import Strand, TranscriptPriority + +from dcd_mapping.align import align +from dcd_mapping.resources import get_scoreset_metadata, get_scoreset_records +from dcd_mapping.transcripts import select_transcript + +logging.basicConfig( + filename="dcd-check-transcript.log", + format="%(asctime)s %(levelname)s:%(name)s:%(message)s", + level=logging.DEBUG, + force=True, +) +_logger = logging.getLogger(__name__) + +with open("misc/bug_hunting/human_urns.txt", "r") as f: + urns = [line.strip() for line in f.readlines()] + +with open("notebooks/analysis/results/mave_blat.pickle", "rb") as f: + mave_blat_dict = pickle.load(f) + +with open("notebooks/analysis/results/mappings.pickle", "rb") as f: + expected_mappings = pickle.load(f) + +async def check_tx_results(): + for urn in urns: + try: + # prereqs + metadata = get_scoreset_metadata(urn) + records = get_scoreset_records(urn) + alignment = align(metadata, use_cached=True) + except Exception as e: + _logger.error("%s error before transcript selection: %s", urn, e) + continue + try: + tx_result = await select_transcript(metadata, records, alignment) + except Exception as e: + if urn in expected_mappings: + _logger.error("%s error during transcript selection: %s", urn, e) + else: + _logger.error("%s error during transcript selection (not in expected_mappings): %s", urn, e) + continue + + if urn not in mave_blat_dict: + continue # not in original experiment + if urn not in expected_mappings: + if tx_result is not None: + _logger.error("%s performed transcript selection when it shouldn't have", urn) + else: + _logger.info("Skipping %s as expected", urn) + continue + if urn in expected_mappings and tx_result is None: + _logger.error("%s skipped transcript selection when it shouldn't have", urn) + continue + + def reformat_mane(status: str): + return TranscriptPriority[status.replace(" ", "_").upper()] + + try: + expected = expected_mappings[urn] + for (name, actual, expected) in [ + ("NP accession", tx_result.np, expected[0]), + ("start position", tx_result.start, expected[1]), + ("is full match", tx_result.is_full_match, expected[3]), + ("NM accession", tx_result.nm, expected[4]), + ("Tx priority", tx_result.transcript_mode, reformat_mane(expected[5])) + ]: + if actual != expected: + _logger.error( + "%s %s mismatch: %s (actual) vs %s (expected)", + urn, + name, + actual, + expected, + ) + except Exception as e: + _logger.error("%s exception: %s", urn, e) + + _logger.info("%s completed successfully", urn) # not a warning but w/e + +asyncio.run(check_tx_results()) diff --git a/src/dcd_mapping/transcripts.py b/src/dcd_mapping/transcripts.py index 1f244f0..37bedb7 100644 --- a/src/dcd_mapping/transcripts.py +++ b/src/dcd_mapping/transcripts.py @@ -3,7 +3,7 @@ Outstanding questions/confusion ------------------------------- * ``urn:mavedb:00000097-n-1``: unable to find any matching transcripts -* Lots of scoresets (2023-) giving index errors in offset calculation +* Lots of scoresets (esp. 2023-) giving index errors in offset calculation * Remove MANE sorting once upstream sorting is confirmed """ import logging @@ -274,7 +274,8 @@ def _offset_target_sequence(metadata: ScoresetMetadata, records: List[ScoreRow]) protein_sequence[i + p1 - p0] == amino_acids_by_position[p1], protein_sequence[i + p2 - p0] == amino_acids_by_position[p2], protein_sequence[i + p3 - p0] == amino_acids_by_position[p3], - protein_sequence[i + p4 - p0] == amino_acids_by_position[p4], + protein_sequence[i + p4 - p0] + == amino_acids_by_position[p4], # TODO problem here-ish ] ): if i + 1 == min(amino_acids_by_position.keys()) or i + 2 == min( diff --git a/tests/conftest.py b/tests/conftest.py index e323fcd..38a3aed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,7 +4,13 @@ Notes on test cases: ------------------- -* urn:mavedb:00000068-a-1: TP53, protein-coding, DNA. +* urn:mavedb:00000041-a-1: SRC, protein-coding, dna, uniprot ref +* urn:mavedb:00000018-a-1: HBB promoter, regulatory, DNA +* urn:mavedb:00000001-a-4: UBE2I, protein-coding, dna, uniprot ref +* urn:mavedb:00000113-a-2: APP, protein-coding, protein sequence, uniprot ref. Not in original notebooks. +* urn:mavedb:00000098-a-1: SCN5A, protein-coding, protein sequence, uniprot ref with offset +* urn:mavedb:00000061-h-1: RAF, protein coding, DNA, uniprot ref with offset +* urn:mavedb:00000068-a-1: TP53, protein-coding, DNA """ import json import os diff --git a/tests/fixtures/align_result.json b/tests/fixtures/align_result.json index 5336206..518ddf8 100644 --- a/tests/fixtures/align_result.json +++ b/tests/fixtures/align_result.json @@ -23,16 +23,6 @@ { "start": 37403170, "end": 37403325 } ] }, - "urn:mavedb:00000098-a-1": { - "chrom": "chr3", - "strand": -1, - "coverage": 100.0, - "ident_pct": 100.0, - "query_range": { "start": 0, "end": 12 }, - "query_subranges": [{ "start": 0, "end": 12 }], - "hit_range": { "start": 38551475, "end": 38551511 }, - "hit_subranges": [{ "start": 38551475, "end": 38551511 }] - }, "urn:mavedb:00000018-a-1": { "chrom": "chr11", "strand": 1, @@ -43,6 +33,40 @@ "hit_range": { "start": 5227021, "end": 5227208 }, "hit_subranges": [{ "start": 5227021, "end": 5227208 }] }, + "urn:mavedb:00000001-a-4": { + "chrom": "chr16", + "strand": 1, + "coverage": 100.0, + "ident_pct": 100.0, + "query_range": { "start": 0, "end": 477 }, + "query_subranges": [ + { "start": 0, "end": 66 }, + { "start": 66, "end": 150 }, + { "start": 150, "end": 223 }, + { "start": 223, "end": 333 }, + { "start": 333, "end": 413 }, + { "start": 413, "end": 477 } + ], + "hit_range": { "start": 1314030, "end": 1324793 }, + "hit_subranges": [ + { "start": 1314030, "end": 1314096 }, + { "start": 1314292, "end": 1314376 }, + { "start": 1315653, "end": 1315726 }, + { "start": 1320173, "end": 1320283 }, + { "start": 1320437, "end": 1320517 }, + { "start": 1324729, "end": 1324793 } + ] + }, + "urn:mavedb:00000098-a-1": { + "chrom": "chr3", + "strand": -1, + "coverage": 100.0, + "ident_pct": 100.0, + "query_range": { "start": 0, "end": 12 }, + "query_subranges": [{ "start": 0, "end": 12 }], + "hit_range": { "start": 38551475, "end": 38551511 }, + "hit_subranges": [{ "start": 38551475, "end": 38551511 }] + }, "urn:mavedb:00000061-h-1": { "chrom": "chr3", "strand": -1, diff --git a/tests/fixtures/transcript_result.json b/tests/fixtures/transcript_result.json index 19a510c..f4a6086 100644 --- a/tests/fixtures/transcript_result.json +++ b/tests/fixtures/transcript_result.json @@ -7,28 +7,35 @@ "transcript_mode": "mane_select", "sequence": "TODO" }, - "urn:mavedb:00000001-a-4": { - "nm": "NM_003345.5", - "np": "NP_003336.1", - "start": 0, - "is_full_match": true, - "transcript_mode": "mane_select", - "sequence": "TODO" - }, - "urn:mavedb:00000098-a-1": { - "nm": "NM_000335.5", - "np": "NP_000326.2", - "start": 1619, - "is_full_match": true, - "transcript_mode": "mane_select", - "sequence": "TODO" - }, - "urn:mavedb:00000061-h-1": { - "nm": "NM_002880.4", - "np": "NP_002871.1", - "start": 51, - "is_full_match": true, - "transcript_mode": "mane_select", - "sequence": "TODO" - } + "urn:mavedb:00000001-a-4": { + "nm": "NM_003345.5", + "np": "NP_003336.1", + "start": 0, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "TODO" + }, + "urn:mavedb:00000098-a-1": { + "nm": "NM_000335.5", + "np": "NP_000326.2", + "start": 1619, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "TODO" + }, + "urn:mavedb:00000061-h-1": { + "nm": "NM_002880.4", + "np": "NP_002871.1", + "start": 51, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "TODO" + }, + "urn:mavedb:00000068-a-1": { + "nm": "NM_000546.6", + "np": "NP_000537.3", + "start": 0, + "is_full_match": false, + "transcript_mode": "mane_select" + } } diff --git a/tests/unit/test_transcript.py b/tests/unit/test_transcript.py index 1024c19..da23c00 100644 --- a/tests/unit/test_transcript.py +++ b/tests/unit/test_transcript.py @@ -8,6 +8,15 @@ from dcd_mapping.transcripts import select_transcript +def check_transcript_results_equality(actual: TxSelectResult, expected: TxSelectResult): + """Check equality of transcript selection result vs fixture""" + assert actual.np == expected.np + assert actual.start == expected.start + assert actual.is_full_match is expected.is_full_match + assert actual.nm == expected.nm + assert actual.transcript_mode == expected.transcript_mode + + @pytest.mark.asyncio(scope="module") async def test_tx_src( scoreset_metadata_fixture: Dict[str, ScoresetMetadata], @@ -17,18 +26,12 @@ async def test_tx_src( """Test transcript selection for urn:mavedb:00000041-a-1""" urn = "urn:mavedb:00000041-a-1" metadata = scoreset_metadata_fixture[urn] - records = get_scoreset_records(urn) # TODO real fixture + records = get_scoreset_records(urn) alignment_result = align_result_fixture[urn] expected = transcript_results_fixture[urn] - actual = await select_transcript(metadata, records, alignment_result) - assert actual - assert actual.np == expected.np - assert actual.start == expected.start - assert actual.is_full_match is expected.is_full_match - assert actual.nm == expected.nm - assert actual.transcript_mode == expected.transcript_mode + check_transcript_results_equality(actual, expected) @pytest.mark.asyncio(scope="module") @@ -43,15 +46,9 @@ async def test_tx_scn5a( records = get_scoreset_records(urn) alignment_result = align_result_fixture[urn] expected = transcript_results_fixture[urn] - actual = await select_transcript(metadata, records, alignment_result) - assert actual - assert actual.np == expected.np - assert actual.start == expected.start - assert actual.is_full_match is expected.is_full_match - assert actual.nm == expected.nm - assert actual.transcript_mode == expected.transcript_mode + check_transcript_results_equality(actual, expected) @pytest.mark.asyncio(scope="module") @@ -64,7 +61,6 @@ async def test_tx_hbb( metadata = scoreset_metadata_fixture[urn] records = get_scoreset_records(urn) alignment_result = align_result_fixture[urn] - actual = await select_transcript(metadata, records, alignment_result) assert actual is None @@ -81,11 +77,23 @@ async def test_tx_raf( records = get_scoreset_records(urn) alignment_result = align_result_fixture[urn] expected = transcript_results_fixture[urn] + actual = await select_transcript(metadata, records, alignment_result) + assert actual + check_transcript_results_equality(actual, expected) + +@pytest.mark.asyncio(scope="module") +async def test_tx_tp53( + scoreset_metadata_fixture: Dict[str, ScoresetMetadata], + align_result_fixture: Dict[str, AlignmentResult], + transcript_results_fixture: Dict[str, TxSelectResult], +): + """Test transcript selection for urn:mavedb:00000068-a-1""" + urn = "urn:mavedb:00000068-a-1" + metadata = scoreset_metadata_fixture[urn] + records = get_scoreset_records(urn) + alignment_result = align_result_fixture[urn] + expected = transcript_results_fixture[urn] actual = await select_transcript(metadata, records, alignment_result) assert actual - assert actual.np == expected.np - assert actual.start == expected.start - assert actual.is_full_match is expected.is_full_match - assert actual.nm == expected.nm - assert actual.transcript_mode == expected.transcript_mode + check_transcript_results_equality(actual, expected)