Skip to content

Commit

Permalink
feat(IPVC-2264): add in database update script, update loading methods
Browse files Browse the repository at this point in the history
  • Loading branch information
bsgiles73 committed Apr 12, 2024
1 parent ccdd431 commit 0d4d308
Show file tree
Hide file tree
Showing 3 changed files with 180 additions and 2 deletions.
116 changes: 116 additions & 0 deletions misc/gene-update/backfill_gene_id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import argparse
import logging

from datetime import datetime
from sqlalchemy.orm import Session
from sqlalchemy import text

import uta
from uta.models import Gene, Transcript


logger = None
n = 50000


def backfill_gene(uta_session: Session, gene_update_file: str) -> None:
logger.info("Dropping gene table contents")
uta_session.execute(text("DELETE FROM uta.gene;"))
uta_session.commit()

logger.info(f"Back filling gene table from {gene_update_file}")
now_ts = datetime.now()
i = 0
new_genes = []
with open(gene_update_file) as f:
for line in f:
if line.startswith("gene_id"):
continue

if i % n == 0:
if i > 0:
logger.info(f"Bulk inserting {len(new_genes)} genes")
uta_session.bulk_save_objects(new_genes)
uta_session.commit()
logger.info(f"Processing chunk {int(i/n) + 1}")
new_genes = []

gene_id, hgnc, maploc, desc, summary, aliases, added = line.rstrip("\r\n").split("\t")
# set timestamp from file string, if empty set to now.
if added == "":
added_ts = now_ts
else:
added_ts = datetime.strptime(added, "%Y-%m-%d %H:%M:%S.%f")

# clean up aliases
aliases = aliases.replace("{", "").replace("}", "")
if aliases == "-":
aliases = None

gene = Gene(
gene_id=gene_id,
hgnc=hgnc,
maploc=maploc if maploc else None,
descr=desc if desc else None,
summary=summary if desc else None,
aliases=aliases if aliases else None,
added=added_ts,
)
i += 1
new_genes.append(gene)

logger.info(f"Bulk inserting {len(new_genes)} genes")
uta_session.bulk_save_objects(new_genes)
uta_session.commit()
logger.info(f"Inserted {i} total genes")


def backfill_transcript(uta_session: Session, transcript_update_file: str) -> None:
logger.info("Backfilling gene_id in transcript table")
tx_ac_to_gene_id = {}

logger.info(f"Reading transcript to gene id mappings from {transcript_update_file}")
with open(transcript_update_file) as f:
for line in f:
if line.startswith("origin"):
continue
_, tx_ac, gene_id, _ = line.rstrip("\r\n").split("\t")
tx_ac_to_gene_id[tx_ac] = gene_id
logger.info(f" - {len(tx_ac_to_gene_id)} mappings read")

i = 0
txs = []
for tx_ac, gene_id in tx_ac_to_gene_id.items():
if i % n == 0:
if i > 0:
logger.info(f"Updating {len(txs)} transcripts")
uta_session.flush()

logger.info(f"Processing chunk {int(i/n) + 1}")
txs = []

tx = uta_session.query(Transcript).filter(Transcript.ac == tx_ac).one()
tx.gene_id = gene_id
txs.append(tx)
i += 1

logger.info(f"Updating {len(txs)} transcripts")
uta_session.flush()
uta_session.commit()


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Backfill gene_id in gene and transcript tables")
parser.add_argument("db_url", help="URL of the UTA database")
parser.add_argument("gene_update_file", type=str, help="File containing gene_id updates for gene table")
parser.add_argument("transcript_update_file", type=str, help="File containing gene_id updates for transcript table")
args = parser.parse_args()

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
logger = logging.getLogger("backfill_gene_id")

session = uta.connect(args.db_url)

backfill_gene(session, args.gene_update_file)
backfill_transcript(session, args.transcript_update_file)
session.close()
61 changes: 61 additions & 0 deletions misc/gene-update/upgrade-uta-schema.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/bin/bash

# This script is used to upgrade older UTA schemas (specifically uta_20210129b) to a newer version.
# Part of this upgrade is to introduce gene_id to the gene and transcript tables. The columns are
# added with a Alembic migration. Then a data migration to back fill the new columns. Then a second
# Alembic migration to add the constraints to the columns and update primary and foreign keys.

if [ "$#" -lt 1 ]
then
echo "error: too few arguments, you provided $#, 1 required"
echo "usage: upgrade-uta-schema.sh <dest_uta_v>"
exit 1
fi

set -euxo pipefail

source_uta_v="uta_20210129b"
working_uta_v="uta"
dest_uta_v=$1
dumps_dir="/workdir/dumps"
mkdir -p $dumps_dir

## setup working uta schema
# delete destination schema if exists
psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $working_uta_v CASCADE;"

# dump source version
pg_dump -U uta_admin -h localhost -d uta -n "$source_uta_v" | \
gzip -c > $dumps_dir/"$source_uta_v".pgd.gz

# create new schema
gzip -cdq $dumps_dir/"$source_uta_v".pgd.gz | \
sbin/pg-dump-schema-rename "$source_uta_v" "$working_uta_v" | \
sbin/pg-dump-schema-rename "uta_1_1" "$working_uta_v" | \
psql -U uta_admin -h localhost -d uta -aeE

## upgrade working uta schema
# set initial Alembic migration so it is not ran.
alembic -c etc/alembic.ini stamp edadb97f6502

# run Alembic migration to add gene_id to gene and transcript tables
alembic -c etc/alembic.ini upgrade 595a586e6de7

# run data migration to back fill gene_id
python misc/gene-update/backfill_gene_id.py \
postgresql://uta_admin:@localhost/uta \
/workdir/backfill/gene_update.tsv \
/workdir/backfill/transcript_update.tsv

# run Alembic migrations to add constraints and update existing views
alembic -c etc/alembic.ini upgrade head

## Copy data into destination schema
# dump working schema
pg_dump -U uta_admin -h localhost -d uta -n "$working_uta_v" | \
gzip -c > "$dumps_dir/$working_uta_v".pgd.gz

# copy data into destination schema
gzip -cdq "$dumps_dir/$working_uta_v".pgd.gz | \
sbin/pg-dump-schema-rename "$working_uta_v" "$dest_uta_v" | \
psql -U uta_admin -h localhost -d uta -aeE
5 changes: 3 additions & 2 deletions src/uta/loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,13 +336,14 @@ def load_geneinfo(session, opts, cf):
for i_gi, gi in enumerate(gir):
session.merge(
usam.Gene(
gene_id=gi.gene_id,
hgnc=gi.hgnc,
maploc=gi.maploc,
descr=gi.descr,
summary=gi.summary,
aliases=gi.aliases,
))
logger.info("Added {gi.hgnc} ({gi.summary})".format(gi=gi))
logger.info("Added {gi.hgnc}: {gi.gene_id} ({gi.summary})".format(gi=gi))
session.commit()


Expand Down Expand Up @@ -698,7 +699,7 @@ def _fetch_origin_by_name(name):
u_tx = usam.Transcript(
ac=ti.ac,
origin=ori,
hgnc=ti.gene_symbol,
gene_id=ti.gene_id,
cds_start_i=cds_start_i,
cds_end_i=cds_end_i,
cds_md5=cds_md5,
Expand Down

0 comments on commit 0d4d308

Please sign in to comment.