Skip to content

Commit

Permalink
feat(IPVC-3107): detect transcripts missing transl_except (#48)
Browse files Browse the repository at this point in the history
  • Loading branch information
nvta1209 authored Dec 9, 2024
1 parent dea5bd5 commit 2d0eaf8
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 2 deletions.
25 changes: 25 additions & 0 deletions misc/check-transl-except/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
version: '3'

services:
uta:
container_name: uta
image: biocommons/uta:${UTA_VERSION}
environment:
- POSTGRES_HOST_AUTH_METHOD=trust
healthcheck:
test: psql -h localhost -U anonymous -d uta -c "select * from ${UTA_VERSION}.meta"
interval: 10s
retries: 80
network_mode: host
uta-check-transl-except:
image: uta-update
command: uta --conf=etc/global.conf --conf=etc/[email protected] check-transl-except /opt/uta/transcripts.txt ${UTA_VERSION} /opt/uta/check-transl-except.txt 2>&1 | tee /opt/uta/logs
environment:
- UTA_USE_SCHEMA=false
depends_on:
uta:
condition: service_healthy
volumes:
- ${SEQREPO_DIR}:/biocommons/dl.biocommons.org/seqrepo/master
- ${WORK_DIR}:/opt/uta
network_mode: host
2 changes: 2 additions & 0 deletions src/uta/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
uta (-C CONF ...) [options] shell
uta (-C CONF ...) [options] drop-schema
uta (-C CONF ...) [options] check-transcripts [--prefixes=PREFIXES] TXINFO_FILE UTA_SCHEMA OUTPUT_FILE
uta (-C CONF ...) [options] check-transl-except TRANSCRIPT_FILE UTA_SCHEMA OUTPUT_FILE
uta (-C CONF ...) [options] create-schema
uta (-C CONF ...) [options] update-meta-data
uta (-C CONF ...) [options] load-sql FILES ...
Expand Down Expand Up @@ -70,6 +71,7 @@ def main():
("align-exons", ul.align_exons),
("analyze", ul.analyze),
("check-transcripts", ul.check_transcripts),
("check-transl-except", ul.check_transl_except),
("create-schema", ul.create_schema),
("update-meta-data", ul.update_meta_data),
("drop-schema", ul.drop_schema),
Expand Down
85 changes: 83 additions & 2 deletions src/uta/loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@
import itertools
import logging
import time
from typing import Any, Dict, List
from typing import Any, Dict, Generator, List, Optional, Tuple

from biocommons.seqrepo import SeqRepo
from bioutils.coordinates import strand_pm_to_int, MINUS_STRAND
from bioutils.digests import seq_md5
from bioutils.sequences import reverse_complement
from bioutils.sequences import reverse_complement, translate_cds
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session
from sqlalchemy.orm.exc import NoResultFound
Expand Down Expand Up @@ -206,6 +206,87 @@ def check_transcripts(session: Session, opts: Dict, cf: ConfigParser):
output_fp.writelines(f'{t}\n' for t in sorted(result_transcripts))


def check_transl_except(session, opts, cf):
"""
Find transcripts in the given transcript file which are in the given UTA database version
and do not have transl_except entries when they should.
"""
# required opts
transcript_file = opts['TRANSCRIPT_FILE']
uta_schema = opts['UTA_SCHEMA']
output_file = opts['OUTPUT_FILE']

role = cf.get('uta', 'admin_role')
session.execute(text(f"set role {role};"))
session.execute(text(f"set search_path = {uta_schema};"))

Transcript = uta.models.Transcript
AssociatedAccessions = usam.AssociatedAccessions

def transcript_iterator() -> Generator[Tuple[str, int, int, Optional[str]], None, None]:
with open(transcript_file, 'rt') as transcript_fp:
for line in transcript_fp:
transcript_ac = line.strip()
transcript = (
session.query(Transcript)
.filter(Transcript.ac == transcript_ac)
.with_entities(Transcript.ac, Transcript.cds_start_i, Transcript.cds_end_i)
.one()
)
try:
assoc_ac = (
session.query(AssociatedAccessions)
.filter(AssociatedAccessions.tx_ac == transcript.ac)
.with_entities(AssociatedAccessions.tx_ac, AssociatedAccessions.pro_ac)
.one()
)
protein_ac = assoc_ac.pro_ac
except NoResultFound:
logger.warning(f'No NP for transcript: {transcript_ac}')
protein_ac = None

yield transcript.ac, transcript.cds_start_i, transcript.cds_end_i, protein_ac

result_transcripts = set()
warning_transcripts = set()
sf = _get_seqfetcher(cf)
for i, (transcript_ac, transcript_cds_start_i, transcript_cds_end_i, protein_ac) in enumerate(transcript_iterator()):
if i % 1000 == 0:
print(f'Progress: {i}')

if protein_ac is None:
warning_transcripts.add(transcript_ac)
continue

try:
transcript_seq = sf.fetch(transcript_ac, transcript_cds_start_i, transcript_cds_end_i)
except:
logger.warning(f'No sequence for transcript: {transcript_ac}')
transcript_seq = None

if transcript_seq is None:
warning_transcripts.add(transcript_ac)
continue

translated_protein_seq = translate_cds(transcript_seq, full_codons=False, ter_symbol='X').rstrip('*')

try:
protein_seq = sf.fetch(protein_ac)
except:
logger.warning(f'NP has no sequence: {protein_ac}, {transcript_ac}')
protein_seq = None

if protein_seq is None:
warning_transcripts.add(transcript_ac)
continue

if protein_seq != translated_protein_seq:
result_transcripts.add(transcript_ac)

with open(output_file, 'wt') as output_fp:
output_fp.writelines(f'{t}\n' for t in sorted(result_transcripts))


def create_schema(session, opts, cf):
"""Create and populate initial schema"""
session.execute(text("set role {admin_role};".format(
Expand Down

0 comments on commit 2d0eaf8

Please sign in to comment.