Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test: add transcript selection checks #13

Merged
merged 2 commits into from
Jan 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion misc/bug_hunting/check_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
with open("notebooks/analysis/results/mave_blat.pickle", "rb") as f:
mave_blat_dict = pickle.load(f)

with open("human_urns.txt", "r") as f:
with open("misc/bug_hunting/human_urns.txt", "r") as f:
urns = [line.strip() for line in f.readlines()]

strand_reformat = {1: Strand.POSITIVE, -1: Strand.NEGATIVE}
Expand Down
84 changes: 84 additions & 0 deletions misc/bug_hunting/check_transcript.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import logging
import asyncio
import pickle

from cool_seq_tool.schemas import Strand, TranscriptPriority

from dcd_mapping.align import align
from dcd_mapping.resources import get_scoreset_metadata, get_scoreset_records
from dcd_mapping.transcripts import select_transcript

logging.basicConfig(
filename="dcd-check-transcript.log",
format="%(asctime)s %(levelname)s:%(name)s:%(message)s",
level=logging.DEBUG,
force=True,
)
_logger = logging.getLogger(__name__)

with open("misc/bug_hunting/human_urns.txt", "r") as f:
urns = [line.strip() for line in f.readlines()]

with open("notebooks/analysis/results/mave_blat.pickle", "rb") as f:
mave_blat_dict = pickle.load(f)

with open("notebooks/analysis/results/mappings.pickle", "rb") as f:
expected_mappings = pickle.load(f)

async def check_tx_results():
for urn in urns:
try:
# prereqs
metadata = get_scoreset_metadata(urn)
records = get_scoreset_records(urn)
alignment = align(metadata, use_cached=True)
except Exception as e:
_logger.error("%s error before transcript selection: %s", urn, e)
continue
try:
tx_result = await select_transcript(metadata, records, alignment)
except Exception as e:
if urn in expected_mappings:
_logger.error("%s error during transcript selection: %s", urn, e)
else:
_logger.error("%s error during transcript selection (not in expected_mappings): %s", urn, e)
continue

if urn not in mave_blat_dict:
continue # not in original experiment
if urn not in expected_mappings:
if tx_result is not None:
_logger.error("%s performed transcript selection when it shouldn't have", urn)
else:
_logger.info("Skipping %s as expected", urn)
continue
if urn in expected_mappings and tx_result is None:
_logger.error("%s skipped transcript selection when it shouldn't have", urn)
continue

def reformat_mane(status: str):
return TranscriptPriority[status.replace(" ", "_").upper()]

try:
expected = expected_mappings[urn]
for (name, actual, expected) in [
("NP accession", tx_result.np, expected[0]),
("start position", tx_result.start, expected[1]),
("is full match", tx_result.is_full_match, expected[3]),
("NM accession", tx_result.nm, expected[4]),
("Tx priority", tx_result.transcript_mode, reformat_mane(expected[5]))
]:
if actual != expected:
_logger.error(
"%s %s mismatch: %s (actual) vs %s (expected)",
urn,
name,
actual,
expected,
)
except Exception as e:
_logger.error("%s exception: %s", urn, e)

_logger.info("%s completed successfully", urn) # not a warning but w/e

asyncio.run(check_tx_results())
5 changes: 3 additions & 2 deletions src/dcd_mapping/transcripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Outstanding questions/confusion
-------------------------------
* ``urn:mavedb:00000097-n-1``: unable to find any matching transcripts
* Lots of scoresets (2023-) giving index errors in offset calculation
* Lots of scoresets (esp. 2023-) giving index errors in offset calculation
* Remove MANE sorting once upstream sorting is confirmed
"""
import logging
Expand Down Expand Up @@ -274,7 +274,8 @@ def _offset_target_sequence(metadata: ScoresetMetadata, records: List[ScoreRow])
protein_sequence[i + p1 - p0] == amino_acids_by_position[p1],
protein_sequence[i + p2 - p0] == amino_acids_by_position[p2],
protein_sequence[i + p3 - p0] == amino_acids_by_position[p3],
protein_sequence[i + p4 - p0] == amino_acids_by_position[p4],
protein_sequence[i + p4 - p0]
== amino_acids_by_position[p4], # TODO problem here-ish
]
):
if i + 1 == min(amino_acids_by_position.keys()) or i + 2 == min(
Expand Down
8 changes: 7 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,13 @@
Notes on test cases:
-------------------

* urn:mavedb:00000068-a-1: TP53, protein-coding, DNA.
* urn:mavedb:00000041-a-1: SRC, protein-coding, dna, uniprot ref
* urn:mavedb:00000018-a-1: HBB promoter, regulatory, DNA
* urn:mavedb:00000001-a-4: UBE2I, protein-coding, dna, uniprot ref
* urn:mavedb:00000113-a-2: APP, protein-coding, protein sequence, uniprot ref. Not in original notebooks.
* urn:mavedb:00000098-a-1: SCN5A, protein-coding, protein sequence, uniprot ref with offset
* urn:mavedb:00000061-h-1: RAF, protein coding, DNA, uniprot ref with offset
* urn:mavedb:00000068-a-1: TP53, protein-coding, DNA
"""
import json
import os
Expand Down
44 changes: 34 additions & 10 deletions tests/fixtures/align_result.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,6 @@
{ "start": 37403170, "end": 37403325 }
]
},
"urn:mavedb:00000098-a-1": {
"chrom": "chr3",
"strand": -1,
"coverage": 100.0,
"ident_pct": 100.0,
"query_range": { "start": 0, "end": 12 },
"query_subranges": [{ "start": 0, "end": 12 }],
"hit_range": { "start": 38551475, "end": 38551511 },
"hit_subranges": [{ "start": 38551475, "end": 38551511 }]
},
"urn:mavedb:00000018-a-1": {
"chrom": "chr11",
"strand": 1,
Expand All @@ -43,6 +33,40 @@
"hit_range": { "start": 5227021, "end": 5227208 },
"hit_subranges": [{ "start": 5227021, "end": 5227208 }]
},
"urn:mavedb:00000001-a-4": {
"chrom": "chr16",
"strand": 1,
"coverage": 100.0,
"ident_pct": 100.0,
"query_range": { "start": 0, "end": 477 },
"query_subranges": [
{ "start": 0, "end": 66 },
{ "start": 66, "end": 150 },
{ "start": 150, "end": 223 },
{ "start": 223, "end": 333 },
{ "start": 333, "end": 413 },
{ "start": 413, "end": 477 }
],
"hit_range": { "start": 1314030, "end": 1324793 },
"hit_subranges": [
{ "start": 1314030, "end": 1314096 },
{ "start": 1314292, "end": 1314376 },
{ "start": 1315653, "end": 1315726 },
{ "start": 1320173, "end": 1320283 },
{ "start": 1320437, "end": 1320517 },
{ "start": 1324729, "end": 1324793 }
]
},
"urn:mavedb:00000098-a-1": {
"chrom": "chr3",
"strand": -1,
"coverage": 100.0,
"ident_pct": 100.0,
"query_range": { "start": 0, "end": 12 },
"query_subranges": [{ "start": 0, "end": 12 }],
"hit_range": { "start": 38551475, "end": 38551511 },
"hit_subranges": [{ "start": 38551475, "end": 38551511 }]
},
"urn:mavedb:00000061-h-1": {
"chrom": "chr3",
"strand": -1,
Expand Down
55 changes: 31 additions & 24 deletions tests/fixtures/transcript_result.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,35 @@
"transcript_mode": "mane_select",
"sequence": "TODO"
},
"urn:mavedb:00000001-a-4": {
"nm": "NM_003345.5",
"np": "NP_003336.1",
"start": 0,
"is_full_match": true,
"transcript_mode": "mane_select",
"sequence": "TODO"
},
"urn:mavedb:00000098-a-1": {
"nm": "NM_000335.5",
"np": "NP_000326.2",
"start": 1619,
"is_full_match": true,
"transcript_mode": "mane_select",
"sequence": "TODO"
},
"urn:mavedb:00000061-h-1": {
"nm": "NM_002880.4",
"np": "NP_002871.1",
"start": 51,
"is_full_match": true,
"transcript_mode": "mane_select",
"sequence": "TODO"
}
"urn:mavedb:00000001-a-4": {
"nm": "NM_003345.5",
"np": "NP_003336.1",
"start": 0,
"is_full_match": true,
"transcript_mode": "mane_select",
"sequence": "TODO"
},
"urn:mavedb:00000098-a-1": {
"nm": "NM_000335.5",
"np": "NP_000326.2",
"start": 1619,
"is_full_match": true,
"transcript_mode": "mane_select",
"sequence": "TODO"
},
"urn:mavedb:00000061-h-1": {
"nm": "NM_002880.4",
"np": "NP_002871.1",
"start": 51,
"is_full_match": true,
"transcript_mode": "mane_select",
"sequence": "TODO"
},
"urn:mavedb:00000068-a-1": {
"nm": "NM_000546.6",
"np": "NP_000537.3",
"start": 0,
"is_full_match": false,
"transcript_mode": "mane_select"
}
}
50 changes: 29 additions & 21 deletions tests/unit/test_transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,15 @@
from dcd_mapping.transcripts import select_transcript


def check_transcript_results_equality(actual: TxSelectResult, expected: TxSelectResult):
"""Check equality of transcript selection result vs fixture"""
assert actual.np == expected.np
assert actual.start == expected.start
assert actual.is_full_match is expected.is_full_match
assert actual.nm == expected.nm
assert actual.transcript_mode == expected.transcript_mode


@pytest.mark.asyncio(scope="module")
async def test_tx_src(
scoreset_metadata_fixture: Dict[str, ScoresetMetadata],
Expand All @@ -17,18 +26,12 @@ async def test_tx_src(
"""Test transcript selection for urn:mavedb:00000041-a-1"""
urn = "urn:mavedb:00000041-a-1"
metadata = scoreset_metadata_fixture[urn]
records = get_scoreset_records(urn) # TODO real fixture
records = get_scoreset_records(urn)
alignment_result = align_result_fixture[urn]
expected = transcript_results_fixture[urn]

actual = await select_transcript(metadata, records, alignment_result)

assert actual
assert actual.np == expected.np
assert actual.start == expected.start
assert actual.is_full_match is expected.is_full_match
assert actual.nm == expected.nm
assert actual.transcript_mode == expected.transcript_mode
check_transcript_results_equality(actual, expected)


@pytest.mark.asyncio(scope="module")
Expand All @@ -43,15 +46,9 @@ async def test_tx_scn5a(
records = get_scoreset_records(urn)
alignment_result = align_result_fixture[urn]
expected = transcript_results_fixture[urn]

actual = await select_transcript(metadata, records, alignment_result)

assert actual
assert actual.np == expected.np
assert actual.start == expected.start
assert actual.is_full_match is expected.is_full_match
assert actual.nm == expected.nm
assert actual.transcript_mode == expected.transcript_mode
check_transcript_results_equality(actual, expected)


@pytest.mark.asyncio(scope="module")
Expand All @@ -64,7 +61,6 @@ async def test_tx_hbb(
metadata = scoreset_metadata_fixture[urn]
records = get_scoreset_records(urn)
alignment_result = align_result_fixture[urn]

actual = await select_transcript(metadata, records, alignment_result)
assert actual is None

Expand All @@ -81,11 +77,23 @@ async def test_tx_raf(
records = get_scoreset_records(urn)
alignment_result = align_result_fixture[urn]
expected = transcript_results_fixture[urn]
actual = await select_transcript(metadata, records, alignment_result)
assert actual
check_transcript_results_equality(actual, expected)


@pytest.mark.asyncio(scope="module")
async def test_tx_tp53(
scoreset_metadata_fixture: Dict[str, ScoresetMetadata],
align_result_fixture: Dict[str, AlignmentResult],
transcript_results_fixture: Dict[str, TxSelectResult],
):
"""Test transcript selection for urn:mavedb:00000068-a-1"""
urn = "urn:mavedb:00000068-a-1"
metadata = scoreset_metadata_fixture[urn]
records = get_scoreset_records(urn)
alignment_result = align_result_fixture[urn]
expected = transcript_results_fixture[urn]
actual = await select_transcript(metadata, records, alignment_result)
assert actual
assert actual.np == expected.np
assert actual.start == expected.start
assert actual.is_full_match is expected.is_full_match
assert actual.nm == expected.nm
assert actual.transcript_mode == expected.transcript_mode
check_transcript_results_equality(actual, expected)
Loading