Skip to content

Commit

Permalink
feat(IPVC-2386): small changes, skip transcripts with unknown exon st…
Browse files Browse the repository at this point in the history
…ructure
  • Loading branch information
bsgiles73 committed May 3, 2024
1 parent 80d3196 commit b6d5b04
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 34 deletions.
1 change: 0 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ services:
uta:
condition: service_healthy
volumes:
- ${UTA_ETL_NCBI_DIR}:/ncbi-dir
- ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo
- ${UTA_ETL_WORK_DIR}:/uta-load/work
- ${UTA_ETL_LOG_DIR}:/uta-load/logs
Expand Down
8 changes: 2 additions & 6 deletions sbin/exonset-to-seqinfo
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,15 @@
import argparse
import configparser as ConfigParser
import gzip
import importlib_resources
import itertools
import logging
import logging.config
import pkg_resources
import re
import sys

from bioutils.digests import seq_md5
from biocommons.seqrepo import SeqRepo
# from multifastadb import MultiFastaDB

from uta.formats.exonset import ExonSetReader
from uta.formats.seqinfo import SeqInfo, SeqInfoWriter
Expand All @@ -40,8 +39,7 @@ def parse_args(argv):


if __name__ == "__main__":
logging_conf_fn = pkg_resources.resource_filename(
"uta", "etc/logging.conf")
logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf")
logging.config.fileConfig(logging_conf_fn)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
Expand All @@ -60,8 +58,6 @@ if __name__ == "__main__":
esr = ExonSetReader(in_fh)
logger.info("opened " + in_fn)

#fa_dirs = cf.get("sequences", "fasta_directories").strip().splitlines()
#mfdb = MultiFastaDB(fa_dirs, use_meta_index=True)
sr_dir = cf.get("sequences", "seqrepo")
sr = SeqRepo(root_dir=sr_dir)
logger.info("Opened sequence directories: " + sr_dir)
Expand Down
62 changes: 42 additions & 20 deletions sbin/ncbi_extract_gbff.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ def main(gbff_files: Iterable, origin: str, prefix: str, output_dir: str) -> Non
# Protein fasta file
protein_fasta_fh = stack.enter_context(
io.TextIOWrapper(
gzip.open(f"{output_dir}/{prefix}protein.faa.gz", "wb"), encoding="utf-8"
gzip.open(f"{output_dir}/{prefix}protein.faa.gz", "wb"),
encoding="utf-8",
)
)

Expand All @@ -83,37 +84,45 @@ def main(gbff_files: Iterable, origin: str, prefix: str, output_dir: str) -> Non
assocacs_writer = GeneAccessionsWriter(assocacs_fh)

total_genes = set()
skipped_ids = set()
total_skipped = set()
all_prefixes = Counter()
for gbff_fn in gbff_files:
logger.info(f"Processing {gbff_fn}")
gbff_file_handler = stack.enter_context(open_file(gbff_fn))
i = 0
genes = set()
skipped = set()
prefixes = Counter()
for r in Bio.SeqIO.parse(gbff_file_handler, "gb"):
srf = SeqRecordFacade(r)

# skip transcripts where the exon structure is unknown
if not srf.exons_se_i:
skipped.add(srf.id)
continue

prefixes.update([srf.id[:2]])
try:
fna_record = SeqRecord(
Seq(srf.feature_seq), id=srf.id, description=""
)
dna_fasta_fh.write(fna_record.format("fasta"))

geneinfo_writer.write(
GeneInfo(
gene_id=srf.gene_id,
gene_symbol=srf.gene_symbol,
tax_id="9606",
hgnc=srf.gene_symbol,
maploc="",
aliases=srf.gene_synonyms,
type=srf.gene_type,
summary="",
descr="",
xrefs=srf.db_xrefs,
if srf.gene_id not in genes:
geneinfo_writer.write(
GeneInfo(
gene_id=srf.gene_id,
gene_symbol=srf.gene_symbol,
tax_id="9606",
hgnc=srf.gene_symbol,
maploc="",
aliases=srf.gene_synonyms,
type=srf.gene_type,
summary="",
descr="",
xrefs=srf.db_xrefs,
)
)
)

txinfo_writer.write(
TxInfo(
Expand All @@ -132,7 +141,9 @@ def main(gbff_files: Iterable, origin: str, prefix: str, output_dir: str) -> Non
# only write cds features for protein-coding transcripts
if srf.cds_feature is not None:
pro_record = SeqRecord(
Seq(srf.cds_translation), id=srf.cds_protein_id, description=srf.cds_product,
Seq(srf.cds_translation),
id=srf.cds_protein_id,
description=srf.cds_product,
)
protein_fasta_fh.write(pro_record.format("fasta"))

Expand All @@ -149,22 +160,33 @@ def main(gbff_files: Iterable, origin: str, prefix: str, output_dir: str) -> Non
genes.add(srf.gene_id)
i += 1
if i % 5000 == 0:
logger.info(f"Processed {i} records")
logger.info(
f" - {ng} genes in {fn} ({c}); {s} transcripts skipped".format(
ng=len(genes),
fn=gbff_fn,
c=prefixes,
s=len(skipped),
)
)
except SeqRecordFeatureError as e:
logger.error(f"SeqRecordFeatureError processing {sr.id}: {e}")
raise
except ValueError as e:
logger.error(f"ValueError processing {sr.id}: {e}")
raise


logger.info(
"{ng} genes in {fn} ({c})".format(ng=len(genes), fn=gbff_fn, c=prefixes)
"{ng} genes in {fn} ({c}); {s} transcripts skipped".format(
ng=len(genes), fn=gbff_fn, c=prefixes, s=len(skipped)
)
)
total_genes ^= genes
total_skipped ^= skipped
all_prefixes += prefixes
logger.info(
"{ng} genes in {nf} files ({c})".format(
ng=len(total_genes), nf=len(gbff_files), c=all_prefixes
"{ng} genes in {nf} ({c}); {s} transcripts skipped".format(
ng=len(total_genes), nf=len(gbff_files), c=all_prefixes, s=len(total_skipped)
)
)

Expand Down
9 changes: 7 additions & 2 deletions sbin/uta-extract-historical
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ relative_path="refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_hi
file_path="$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz"
download_ncbi_file $file_path $ncbi_dir

#download historical gff file
# download historical gff file
file_path="$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz"
download_ncbi_file $file_path $ncbi_dir

Expand All @@ -43,4 +43,9 @@ python sbin/ncbi_extract_gbff.py "$ncbi_dir/$relative_path/GCF_000001405.40-RS_2

# extract exonset intermediate file from gff file
python sbin/ncbi_parse_genomic_gff.py "$ncbi_dir/$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz" | \
gzip -c > "$working_dir/exonsets.gz" 2>&1 | tee "$log_dir/ncbi-parse-historical-gff.log"
gzip -c > "$working_dir/unfiltered_exonsets.gz" 2>&1 | tee "$log_dir/ncbi-parse-historical-gff.log"

# filter exonset alignments by txinfo
sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets "$working_dir/unfiltered_exonsets.gz" \
--missing-ids "$working_dir/filtered_tx_acs.txt" | gzip -c > "$working_dir/exonsets.gz" 2>&1 | \
tee "$log_dir/filter_exonset_transcripts.log"
4 changes: 2 additions & 2 deletions src/uta/loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ def load_geneinfo(session, opts, cf):
type=gi.type,
xrefs=gi.xrefs,
))
logger.info("Added {gi.gene_symbol}: {gi.gene_id} ({gi.summary})".format(gi=gi))
logger.debug("Added {gi.gene_symbol}: {gi.gene_id} ({gi.summary})".format(gi=gi))
session.commit()


Expand Down Expand Up @@ -834,7 +834,7 @@ def _upsert_exon_set_record(session, tx_ac, alt_ac, strand, method, ess):
returns tuple of (new_record, old_record) as follows:
(new, None) -- no prior record; new was inserted
(None, old) -- prior record and unchaged; nothing was inserted
(None, old) -- prior record and unchanged; nothing was inserted
(new, old) -- prior record existed and was changed
"""
Expand Down
6 changes: 3 additions & 3 deletions src/uta/parsers/seqrecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,11 +143,11 @@ def cds_translation(self):

@property
def exons_se_i(self):
se_i = []
if "exon" in self.features_by_type:
exons = self.features_by_type["exon"]
else:
exons = [self.gene_feature]
return [(f.location.start.real, f.location.end.real) for f in exons]
se_i = [(f.location.start.real, f.location.end.real) for f in exons]
return se_i

@property
def transl_except(self) -> Optional[List[str]]:
Expand Down

0 comments on commit b6d5b04

Please sign in to comment.