feat(IPVC-2264): add in database update script, update loading methods

invitae · Apr 12, 2024 · 0d4d308 · 0d4d308
1 parent ccdd431
commit 0d4d308
Show file tree

Hide file tree

Showing 3 changed files with 180 additions and 2 deletions.
diff --git a/misc/gene-update/backfill_gene_id.py b/misc/gene-update/backfill_gene_id.py
@@ -0,0 +1,116 @@
+import argparse
+import logging
+
+from datetime import datetime
+from sqlalchemy.orm import Session
+from sqlalchemy import text
+
+import uta
+from uta.models import Gene, Transcript
+
+
+logger = None
+n = 50000
+
+
+def backfill_gene(uta_session: Session, gene_update_file: str) -> None:
+    logger.info("Dropping gene table contents")
+    uta_session.execute(text("DELETE FROM uta.gene;"))
+    uta_session.commit()
+
+    logger.info(f"Back filling gene table from {gene_update_file}")
+    now_ts = datetime.now()
+    i = 0
+    new_genes = []
+    with open(gene_update_file) as f:
+        for line in f:
+            if line.startswith("gene_id"):
+                continue
+
+            if i % n == 0:
+                if i > 0:
+                    logger.info(f"Bulk inserting {len(new_genes)} genes")
+                    uta_session.bulk_save_objects(new_genes)
+                    uta_session.commit()
+                logger.info(f"Processing chunk {int(i/n) + 1}")
+                new_genes = []
+
+            gene_id, hgnc, maploc, desc, summary, aliases, added = line.rstrip("\r\n").split("\t")
+            # set timestamp from file string, if empty set to now.
+            if added == "":
+                added_ts = now_ts
+            else:
+                added_ts = datetime.strptime(added, "%Y-%m-%d %H:%M:%S.%f")
+
+            # clean up aliases
+            aliases = aliases.replace("{", "").replace("}", "")
+            if aliases == "-":
+                aliases = None
+
+            gene = Gene(
+                gene_id=gene_id,
+                hgnc=hgnc,
+                maploc=maploc if maploc else None,
+                descr=desc if desc else None,
+                summary=summary if desc else None,
+                aliases=aliases if aliases else None,
+                added=added_ts,
+            )
+            i += 1
+            new_genes.append(gene)
+
+    logger.info(f"Bulk inserting {len(new_genes)} genes")
+    uta_session.bulk_save_objects(new_genes)
+    uta_session.commit()
+    logger.info(f"Inserted {i} total genes")
+
+
+def backfill_transcript(uta_session: Session, transcript_update_file: str) -> None:
+    logger.info("Backfilling gene_id in transcript table")
+    tx_ac_to_gene_id = {}
+
+    logger.info(f"Reading transcript to gene id mappings from {transcript_update_file}")
+    with open(transcript_update_file) as f:
+        for line in f:
+            if line.startswith("origin"):
+                continue
+            _, tx_ac, gene_id, _ = line.rstrip("\r\n").split("\t")
+            tx_ac_to_gene_id[tx_ac] = gene_id
+    logger.info(f"  - {len(tx_ac_to_gene_id)} mappings read")
+
+    i = 0
+    txs = []
+    for tx_ac, gene_id in tx_ac_to_gene_id.items():
+        if i % n == 0:
+            if i > 0:
+                logger.info(f"Updating {len(txs)} transcripts")
+                uta_session.flush()
+
+            logger.info(f"Processing chunk {int(i/n) + 1}")
+            txs = []
+
+        tx = uta_session.query(Transcript).filter(Transcript.ac == tx_ac).one()
+        tx.gene_id = gene_id
+        txs.append(tx)
+        i += 1
+
+    logger.info(f"Updating {len(txs)} transcripts")
+    uta_session.flush()
+    uta_session.commit()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Backfill gene_id in gene and transcript tables")
+    parser.add_argument("db_url", help="URL of the UTA database")
+    parser.add_argument("gene_update_file", type=str, help="File containing gene_id updates for gene table")
+    parser.add_argument("transcript_update_file", type=str, help="File containing gene_id updates for transcript table")
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
+    logger = logging.getLogger("backfill_gene_id")
+
+    session = uta.connect(args.db_url)
+
+    backfill_gene(session, args.gene_update_file)
+    backfill_transcript(session, args.transcript_update_file)
+    session.close()
diff --git a/misc/gene-update/upgrade-uta-schema.sh b/misc/gene-update/upgrade-uta-schema.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+# This script is used to upgrade older UTA schemas (specifically uta_20210129b) to a newer version.
+# Part of this upgrade is to introduce gene_id to the gene and transcript tables. The columns are
+# added with a Alembic migration. Then a data migration to back fill the new columns. Then a second
+# Alembic migration to add the constraints to the columns and update primary and foreign keys.
+
+if [ "$#" -lt 1 ]
+then
+    echo "error: too few arguments, you provided $#, 1 required"
+    echo "usage: upgrade-uta-schema.sh <dest_uta_v>"
+    exit 1
+fi
+
+set -euxo pipefail
+
+source_uta_v="uta_20210129b"
+working_uta_v="uta"
+dest_uta_v=$1
+dumps_dir="/workdir/dumps"
+mkdir -p $dumps_dir
+
+## setup working uta schema
+# delete destination schema if exists
+psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $working_uta_v CASCADE;"
+
+# dump source version
+pg_dump -U uta_admin -h localhost -d uta -n "$source_uta_v" | \
+ gzip -c > $dumps_dir/"$source_uta_v".pgd.gz
+
+# create new schema
+gzip -cdq $dumps_dir/"$source_uta_v".pgd.gz | \
+ sbin/pg-dump-schema-rename "$source_uta_v" "$working_uta_v" | \
+ sbin/pg-dump-schema-rename "uta_1_1" "$working_uta_v" | \
+ psql -U uta_admin -h localhost -d uta -aeE
+
+## upgrade working uta schema
+# set initial Alembic migration so it is not ran.
+alembic -c etc/alembic.ini stamp edadb97f6502
+
+# run Alembic migration to add gene_id to gene and transcript tables
+alembic -c etc/alembic.ini upgrade 595a586e6de7
+
+# run data migration to back fill gene_id
+python misc/gene-update/backfill_gene_id.py \
+  postgresql://uta_admin:@localhost/uta \
+  /workdir/backfill/gene_update.tsv \
+  /workdir/backfill/transcript_update.tsv
+
+# run Alembic migrations to add constraints and update existing views
+alembic -c etc/alembic.ini upgrade head
+
+## Copy data into destination schema
+# dump working schema
+pg_dump -U uta_admin -h localhost -d uta -n "$working_uta_v" | \
+ gzip -c > "$dumps_dir/$working_uta_v".pgd.gz
+
+# copy data into destination schema
+gzip -cdq "$dumps_dir/$working_uta_v".pgd.gz | \
+ sbin/pg-dump-schema-rename "$working_uta_v" "$dest_uta_v" | \
+ psql -U uta_admin -h localhost -d uta -aeE
diff --git a/src/uta/loading.py b/src/uta/loading.py
@@ -336,13 +336,14 @@ def load_geneinfo(session, opts, cf):
     for i_gi, gi in enumerate(gir):
         session.merge(
             usam.Gene(
+                gene_id=gi.gene_id,
                 hgnc=gi.hgnc,
                 maploc=gi.maploc,
                 descr=gi.descr,
                 summary=gi.summary,
                 aliases=gi.aliases,
             ))
-        logger.info("Added {gi.hgnc} ({gi.summary})".format(gi=gi))
+        logger.info("Added {gi.hgnc}: {gi.gene_id} ({gi.summary})".format(gi=gi))
     session.commit()
 
 
@@ -698,7 +699,7 @@ def _fetch_origin_by_name(name):
             u_tx = usam.Transcript(
                 ac=ti.ac,
                 origin=ori,
-                hgnc=ti.gene_symbol,
+                gene_id=ti.gene_id,
                 cds_start_i=cds_start_i,
                 cds_end_i=cds_end_i,
                 cds_md5=cds_md5,