feat(IPVC-2386): small changes, skip transcripts with unknown exon st…

…ructure
invitae · May 3, 2024 · b6d5b04 · b6d5b04
1 parent 80d3196
commit b6d5b04
Show file tree

Hide file tree

Showing 6 changed files with 56 additions and 34 deletions.
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -55,7 +55,6 @@ services:
       uta:
         condition: service_healthy
     volumes:
-      - ${UTA_ETL_NCBI_DIR}:/ncbi-dir
       - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo
       - ${UTA_ETL_WORK_DIR}:/uta-load/work
       - ${UTA_ETL_LOG_DIR}:/uta-load/logs

diff --git a/sbin/exonset-to-seqinfo b/sbin/exonset-to-seqinfo
@@ -5,16 +5,15 @@
 import argparse
 import configparser as ConfigParser
 import gzip
+import importlib_resources
 import itertools
 import logging
 import logging.config
-import pkg_resources
 import re
 import sys
 
 from bioutils.digests import seq_md5
 from biocommons.seqrepo import SeqRepo
-# from multifastadb import MultiFastaDB
 
 from uta.formats.exonset import ExonSetReader
 from uta.formats.seqinfo import SeqInfo, SeqInfoWriter
@@ -40,8 +39,7 @@ def parse_args(argv):
 
 
 if __name__ == "__main__":
-    logging_conf_fn = pkg_resources.resource_filename(
-        "uta", "etc/logging.conf")
+    logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf")
     logging.config.fileConfig(logging_conf_fn)
     logger = logging.getLogger(__name__)
     logger.setLevel(logging.INFO)
@@ -60,8 +58,6 @@ if __name__ == "__main__":
     esr = ExonSetReader(in_fh)
     logger.info("opened " + in_fn)
 
-    #fa_dirs = cf.get("sequences", "fasta_directories").strip().splitlines()
-    #mfdb = MultiFastaDB(fa_dirs, use_meta_index=True)
     sr_dir = cf.get("sequences", "seqrepo")
     sr = SeqRepo(root_dir=sr_dir)
     logger.info("Opened sequence directories: " + sr_dir)

diff --git a/sbin/ncbi_extract_gbff.py b/sbin/ncbi_extract_gbff.py
@@ -57,7 +57,8 @@ def main(gbff_files: Iterable, origin: str, prefix: str, output_dir: str) -> Non
         # Protein fasta file
         protein_fasta_fh = stack.enter_context(
             io.TextIOWrapper(
-                gzip.open(f"{output_dir}/{prefix}protein.faa.gz", "wb"), encoding="utf-8"
+                gzip.open(f"{output_dir}/{prefix}protein.faa.gz", "wb"),
+                encoding="utf-8",
             )
         )
 
@@ -83,37 +84,45 @@ def main(gbff_files: Iterable, origin: str, prefix: str, output_dir: str) -> Non
         assocacs_writer = GeneAccessionsWriter(assocacs_fh)
 
         total_genes = set()
-        skipped_ids = set()
+        total_skipped = set()
         all_prefixes = Counter()
         for gbff_fn in gbff_files:
             logger.info(f"Processing {gbff_fn}")
             gbff_file_handler = stack.enter_context(open_file(gbff_fn))
             i = 0
             genes = set()
+            skipped = set()
             prefixes = Counter()
             for r in Bio.SeqIO.parse(gbff_file_handler, "gb"):
                 srf = SeqRecordFacade(r)
+
+                # skip transcripts where the exon structure is unknown
+                if not srf.exons_se_i:
+                    skipped.add(srf.id)
+                    continue
+
                 prefixes.update([srf.id[:2]])
                 try:
                     fna_record = SeqRecord(
                         Seq(srf.feature_seq), id=srf.id, description=""
                     )
                     dna_fasta_fh.write(fna_record.format("fasta"))
 
-                    geneinfo_writer.write(
-                        GeneInfo(
-                            gene_id=srf.gene_id,
-                            gene_symbol=srf.gene_symbol,
-                            tax_id="9606",
-                            hgnc=srf.gene_symbol,
-                            maploc="",
-                            aliases=srf.gene_synonyms,
-                            type=srf.gene_type,
-                            summary="",
-                            descr="",
-                            xrefs=srf.db_xrefs,
+                    if srf.gene_id not in genes:
+                        geneinfo_writer.write(
+                            GeneInfo(
+                                gene_id=srf.gene_id,
+                                gene_symbol=srf.gene_symbol,
+                                tax_id="9606",
+                                hgnc=srf.gene_symbol,
+                                maploc="",
+                                aliases=srf.gene_synonyms,
+                                type=srf.gene_type,
+                                summary="",
+                                descr="",
+                                xrefs=srf.db_xrefs,
+                            )
                         )
-                    )
 
                     txinfo_writer.write(
                         TxInfo(
@@ -132,7 +141,9 @@ def main(gbff_files: Iterable, origin: str, prefix: str, output_dir: str) -> Non
                     # only write cds features for protein-coding transcripts
                     if srf.cds_feature is not None:
                         pro_record = SeqRecord(
-                            Seq(srf.cds_translation), id=srf.cds_protein_id, description=srf.cds_product,
+                            Seq(srf.cds_translation),
+                            id=srf.cds_protein_id,
+                            description=srf.cds_product,
                         )
                         protein_fasta_fh.write(pro_record.format("fasta"))
 
@@ -149,22 +160,33 @@ def main(gbff_files: Iterable, origin: str, prefix: str, output_dir: str) -> Non
                     genes.add(srf.gene_id)
                     i += 1
                     if i % 5000 == 0:
-                        logger.info(f"Processed {i} records")
+                        logger.info(
+                            f"  - {ng} genes in {fn} ({c}); {s} transcripts skipped".format(
+                                ng=len(genes),
+                                fn=gbff_fn,
+                                c=prefixes,
+                                s=len(skipped),
+                            )
+                        )
                 except SeqRecordFeatureError as e:
                     logger.error(f"SeqRecordFeatureError processing {sr.id}: {e}")
                     raise
                 except ValueError as e:
                     logger.error(f"ValueError processing {sr.id}: {e}")
                     raise
 
+
             logger.info(
-                "{ng} genes in {fn} ({c})".format(ng=len(genes), fn=gbff_fn, c=prefixes)
+                "{ng} genes in {fn} ({c}); {s} transcripts skipped".format(
+                    ng=len(genes), fn=gbff_fn, c=prefixes, s=len(skipped)
+                )
             )
             total_genes ^= genes
+            total_skipped ^= skipped
             all_prefixes += prefixes
         logger.info(
-            "{ng} genes in {nf} files ({c})".format(
-                ng=len(total_genes), nf=len(gbff_files), c=all_prefixes
+            "{ng} genes in {nf} ({c}); {s} transcripts skipped".format(
+                ng=len(total_genes), nf=len(gbff_files), c=all_prefixes, s=len(total_skipped)
             )
         )
 

diff --git a/sbin/uta-extract-historical b/sbin/uta-extract-historical
@@ -33,7 +33,7 @@ relative_path="refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_hi
 file_path="$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz"
 download_ncbi_file $file_path $ncbi_dir
 
-#download historical gff file
+# download historical gff file
 file_path="$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz"
 download_ncbi_file $file_path $ncbi_dir
 
@@ -43,4 +43,9 @@ python sbin/ncbi_extract_gbff.py "$ncbi_dir/$relative_path/GCF_000001405.40-RS_2
 
 # extract exonset intermediate file from gff file
 python sbin/ncbi_parse_genomic_gff.py "$ncbi_dir/$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz" | \
-  gzip -c > "$working_dir/exonsets.gz" 2>&1 | tee "$log_dir/ncbi-parse-historical-gff.log"
+  gzip -c > "$working_dir/unfiltered_exonsets.gz" 2>&1 | tee "$log_dir/ncbi-parse-historical-gff.log"
+
+# filter exonset alignments by txinfo
+sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets "$working_dir/unfiltered_exonsets.gz" \
+    --missing-ids "$working_dir/filtered_tx_acs.txt" | gzip -c > "$working_dir/exonsets.gz" 2>&1 | \
+    tee "$log_dir/filter_exonset_transcripts.log"
diff --git a/src/uta/loading.py b/src/uta/loading.py
@@ -346,7 +346,7 @@ def load_geneinfo(session, opts, cf):
                 type=gi.type,
                 xrefs=gi.xrefs,
             ))
-        logger.info("Added {gi.gene_symbol}: {gi.gene_id} ({gi.summary})".format(gi=gi))
+        logger.debug("Added {gi.gene_symbol}: {gi.gene_id} ({gi.summary})".format(gi=gi))
     session.commit()
 
 
@@ -834,7 +834,7 @@ def _upsert_exon_set_record(session, tx_ac, alt_ac, strand, method, ess):
     returns tuple of (new_record, old_record) as follows:
 
     (new, None) -- no prior record; new was inserted
-    (None, old) -- prior record and unchaged; nothing was inserted
+    (None, old) -- prior record and unchanged; nothing was inserted
     (new, old)  -- prior record existed and was changed
 
     """

diff --git a/src/uta/parsers/seqrecord.py b/src/uta/parsers/seqrecord.py
@@ -143,11 +143,11 @@ def cds_translation(self):
 
     @property
     def exons_se_i(self):
+        se_i = []
         if "exon" in self.features_by_type:
             exons = self.features_by_type["exon"]
-        else:
-            exons = [self.gene_feature]
-        return [(f.location.start.real, f.location.end.real) for f in exons]
+            se_i = [(f.location.start.real, f.location.end.real) for f in exons]
+        return se_i
 
     @property
     def transl_except(self) -> Optional[List[str]]: