invitae · bsgiles73 · Apr 18, 2024 · Apr 11, 2024 · Apr 12, 2024 · Apr 12, 2024
diff --git a/.gitignore b/.gitignore
@@ -57,7 +57,6 @@ misc/ncbi-geneinfo/ncbi.geneinfo.tsv.gz
 misc/partial-alignments
 sdist
 tmp
-update/ncbi/
 venv
 nosetests.xml
 .vscode
diff --git a/etc/scripts/create-new-schema.sh b/etc/scripts/create-new-schema.sh
@@ -21,5 +21,4 @@ pg_dump -U uta_admin -h localhost -d uta -n "$source_uta_v" | \
 # create new schema
 gzip -cdq $dumps_dir/"$source_uta_v".pgd.gz | \
  sbin/pg-dump-schema-rename "$source_uta_v" "$dest_uta_v" | \
- sbin/pg-dump-schema-rename "uta_1_1" "$dest_uta_v" | \
- psql -U uta_admin -h localhost -d uta -aeE
+ psql -U uta_admin -h localhost -d uta -aeE
diff --git a/misc/gene-update/backfill_gene_id.py b/misc/gene-update/backfill_gene_id.py
@@ -0,0 +1,116 @@
+import argparse
+import logging
+
+from datetime import datetime
+from sqlalchemy.orm import Session
+from sqlalchemy import text
+
+import uta
+from uta.models import Gene, Transcript
+
+
+logger = None
+n = 50000
+
+
+def backfill_gene(uta_session: Session, gene_update_file: str) -> None:
+    logger.info("Dropping gene table contents")
+    uta_session.execute(text("DELETE FROM uta.gene;"))
+    uta_session.commit()
+
+    logger.info(f"Back filling gene table from {gene_update_file}")
+    now_ts = datetime.now()
+    i = 0
+    new_genes = []
+    with open(gene_update_file) as f:
+        for line in f:
+            if line.startswith("gene_id"):
+                continue
+
+            if i % n == 0:
+                if i > 0:
+                    logger.info(f"Bulk inserting {len(new_genes)} genes")
+                    uta_session.bulk_save_objects(new_genes)
+                    uta_session.commit()
+                logger.info(f"Processing chunk {int(i/n) + 1}")
+                new_genes = []
+
+            gene_id, hgnc, maploc, desc, summary, aliases, added = line.rstrip("\r\n").split("\t")
+            # set timestamp from file string, if empty set to now.
+            if added == "":
+                added_ts = now_ts
+            else:
+                added_ts = datetime.strptime(added, "%Y-%m-%d %H:%M:%S.%f")
+
+            # clean up aliases
+            aliases = aliases.replace("{", "").replace("}", "")
+            if aliases == "-":
+                aliases = None
+
+            gene = Gene(
+                gene_id=gene_id,
+                hgnc=hgnc,
+                maploc=maploc if maploc else None,
+                descr=desc if desc else None,
+                summary=summary if desc else None,
+                aliases=aliases if aliases else None,
+                added=added_ts,
+            )
+            i += 1
+            new_genes.append(gene)
+
+    logger.info(f"Bulk inserting {len(new_genes)} genes")
+    uta_session.bulk_save_objects(new_genes)
+    uta_session.commit()
+    logger.info(f"Inserted {i} total genes")
+
+
+def backfill_transcript(uta_session: Session, transcript_update_file: str) -> None:
+    logger.info("Backfilling gene_id in transcript table")
+    tx_ac_to_gene_id = {}
+
+    logger.info(f"Reading transcript to gene id mappings from {transcript_update_file}")
+    with open(transcript_update_file) as f:
+        for line in f:
+            if line.startswith("origin"):
+                continue
+            _, tx_ac, gene_id, _ = line.rstrip("\r\n").split("\t")
+            tx_ac_to_gene_id[tx_ac] = gene_id
+    logger.info(f"  - {len(tx_ac_to_gene_id)} mappings read")
+
+    i = 0
+    txs = []
+    for tx_ac, gene_id in tx_ac_to_gene_id.items():
+        if i % n == 0:
+            if i > 0:
+                logger.info(f"Updating {len(txs)} transcripts")
+                uta_session.flush()
+
+            logger.info(f"Processing chunk {int(i/n) + 1}")
+            txs = []
+
+        tx = uta_session.query(Transcript).filter(Transcript.ac == tx_ac).one()
+        tx.gene_id = gene_id
+        txs.append(tx)
+        i += 1
+
+    logger.info(f"Updating {len(txs)} transcripts")
+    uta_session.flush()
+    uta_session.commit()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Backfill gene_id in gene and transcript tables")
+    parser.add_argument("db_url", help="URL of the UTA database")
+    parser.add_argument("gene_update_file", type=str, help="File containing gene_id updates for gene table")
+    parser.add_argument("transcript_update_file", type=str, help="File containing gene_id updates for transcript table")
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
+    logger = logging.getLogger("backfill_gene_id")
+
+    session = uta.connect(args.db_url)
+
+    backfill_gene(session, args.gene_update_file)
+    backfill_transcript(session, args.transcript_update_file)
+    session.close()
diff --git a/misc/gene-update/upgrade-uta-schema.sh b/misc/gene-update/upgrade-uta-schema.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# This script is used to upgrade older UTA schemas (specifically uta_20210129b) to a newer version.
+# Part of this upgrade is to introduce gene_id to the gene and transcript tables. The columns are
+# added with a Alembic migration. Then a data migration to back fill the new columns. Then a second
+# Alembic migration to add the constraints to the columns and update primary and foreign keys.
+
+if [ "$#" -lt 1 ]
+then
+    echo "error: too few arguments, you provided $#, 1 required"
+    echo "usage: upgrade-uta-schema.sh <dest_uta_v>"
+    exit 1
+fi
+
+set -euxo pipefail
+
+source_uta_v="uta_20210129b"
+working_uta_v="uta"
+dest_uta_v=$1
+dumps_dir="/workdir/dumps"
+mkdir -p $dumps_dir
+
+## setup working uta schema
+# delete schema if exists
+psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $working_uta_v CASCADE;"
+
+# dump source version
+pg_dump -U uta_admin -h localhost -d uta -n "$source_uta_v" | \
+ gzip -c > $dumps_dir/"$source_uta_v".pgd.gz
+
+# create new schema
+gzip -cdq $dumps_dir/"$source_uta_v".pgd.gz | \
+ sbin/pg-dump-schema-rename "$source_uta_v" "$working_uta_v" | \
+ sbin/pg-dump-schema-rename "uta_1_1" "$working_uta_v" | \
+ psql -U uta_admin -h localhost -d uta -aeE
+
+## upgrade working uta schema
+# set initial Alembic migration so it is not ran.
+alembic -c etc/alembic.ini stamp edadb97f6502
+
+# run Alembic migration to add gene_id to gene and transcript tables
+alembic -c etc/alembic.ini upgrade 595a586e6de7
+
+# run data migration to back fill gene_id
+python misc/gene-update/backfill_gene_id.py \
+  postgresql://uta_admin:@localhost/uta \
+  /workdir/backfill/gene_update.tsv \
+  /workdir/backfill/transcript_update.tsv
+
+# run Alembic migrations to add constraints and update existing views
+alembic -c etc/alembic.ini upgrade head
+
+## Rename schema to destination schema name
+psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $dest_uta_v CASCADE;"
+psql -h localhost -U uta_admin -d uta -c "ALTER SCHEMA uta RENAME TO $dest_uta_v";
diff --git a/sbin/ncbi-parse-geneinfo b/sbin/ncbi-parse-geneinfo
@@ -30,8 +30,6 @@ if __name__ == "__main__":
     giw = GeneInfoWriter(sys.stdout)
 
     for rec in gi_in:
-        if rec["symbol_from_nomenclature_authority"] == "-":
-            continue
         gi = GeneInfo(
             tax_id=rec["tax_id"],
             gene_symbol=rec["symbol"],

diff --git a/sbin/uta-diff b/sbin/uta-diff
@@ -14,7 +14,7 @@ cmp_cols = collections.defaultdict(lambda: ['*'])
 cmp_cols.update({
     "associated_accessions": "tx_ac pro_ac origin".split(),
     "exon_aln": "exon_aln_id tx_exon_id alt_exon_id cigar added".split(),
-    "gene": "hgnc".split(),
+    "gene": "gene_id".split(),
     "transcript": "ac".split(),
     })
 

diff --git a/sbin/uta-extract b/sbin/uta-extract
@@ -38,6 +38,6 @@ sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets
     tee "$log_dir/filter_exonset_transcripts.log"
 
 # move fasta files into same dir
-ln $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz $working_dir/
-ln $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz $working_dir/
-ln $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz $working_dir/
+cp -f $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz $working_dir/
+cp -f $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz $working_dir/
+cp -f $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz $working_dir/
diff --git a/sbin/uta-load b/sbin/uta-load
@@ -32,7 +32,6 @@ etc/scripts/delete-schema.sh "$loading_uta_v"
 etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v"
 
 ## for now set up Alembic for schema migrations
-alembic -c etc/alembic.ini stamp edadb97f6502
 alembic -c etc/alembic.ini upgrade head
 
 # generate seqinfo files from exonsets (this step requires seqrepo)
@@ -65,8 +64,8 @@ uta --conf=etc/global.conf --conf=etc/[email protected] load-exonset "$work
     tee "$log_dir/load-exonsets.log"
 
 # Load seqinfo into the seq and seqanno tables.
-uta --conf=etc/global.conf --conf=etc/[email protected] load-seqinfo "$loading_dir/seqinfo.gz" 2>&1 | \
-    tee "$logs_dir/load-seqinfo.log"
+uta --conf=etc/global.conf --conf=etc/[email protected] load-seqinfo "$working_dir/seqinfo.gz" 2>&1 | \
+    tee "$log_dir/load-seqinfo.log"
 
 # Create cigar strings for all rows in tx_alt_exon_pairs_v view and update exon_aln table.
 uta --conf=etc/global.conf --conf=etc/[email protected] align-exons 2>&1 | \

diff --git a/src/alembic/versions/595a586e6de7_add_gene_id_to_gene_and_transcript.py b/src/alembic/versions/595a586e6de7_add_gene_id_to_gene_and_transcript.py
@@ -0,0 +1,44 @@
+"""add gene_id to gene and transcript
+
+Revision ID: 595a586e6de7
+Revises: a697b584f699
+Create Date: 2024-04-10 19:47:43.685672
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = '595a586e6de7'
+down_revision: Union[str, None] = 'a697b584f699'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('gene', sa.Column('gene_id', sa.Text(), nullable=True), schema='uta')
+    op.add_column('gene', sa.Column('type', sa.Text(), nullable=True), schema='uta')
+    op.add_column('gene', sa.Column('xrefs', sa.Text(), nullable=True), schema='uta')
+    op.add_column('transcript', sa.Column('gene_id', sa.Text(), nullable=True), schema='uta')
+    # ### end Alembic commands ###
+
+    # ### commands to drop existing primary key on gene table ###
+    op.drop_constraint('gene_pkey', 'gene', schema='uta')
+    # ### end of commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('transcript', 'gene_id', schema='uta')
+    op.drop_column('gene', 'xrefs', schema='uta')
+    op.drop_column('gene', 'type', schema='uta')
+    op.drop_column('gene', 'gene_id', schema='uta')
+    # ### end Alembic commands ###
+
+    # ### commands to add primary key on gene table ###
+    op.create_primary_key('gene_pkey', 'gene', ['hgnc'], schema='uta')
+    # ### end of commands ###