forked from biocommons/uta
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(IPVC-2264): add in database update script, update loading methods
- Loading branch information
Showing 3 changed files with 180 additions and 2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
import argparse | ||
import logging | ||
|
||
from datetime import datetime | ||
from sqlalchemy.orm import Session | ||
from sqlalchemy import text | ||
|
||
import uta | ||
from uta.models import Gene, Transcript | ||
|
||
|
||
logger = None | ||
n = 50000 | ||
|
||
|
||
def backfill_gene(uta_session: Session, gene_update_file: str) -> None:
    """Replace the contents of the uta.gene table from a TSV export.

    Deletes all existing gene rows, then streams ``gene_update_file``
    (tab-separated; header line starts with "gene_id") and bulk-inserts
    Gene objects in chunks of ``n`` to bound memory use, committing after
    each chunk.

    Args:
        uta_session: open SQLAlchemy session bound to the UTA database.
        gene_update_file: path to a TSV with columns
            gene_id, hgnc, maploc, descr, summary, aliases, added.
    """
    logger.info("Dropping gene table contents")
    uta_session.execute(text("DELETE FROM uta.gene;"))
    uta_session.commit()

    logger.info(f"Back filling gene table from {gene_update_file}")
    now_ts = datetime.now()
    i = 0
    new_genes = []
    with open(gene_update_file) as f:
        for line in f:
            # skip the header row
            if line.startswith("gene_id"):
                continue

            # every n rows, persist the accumulated chunk and start a new one
            if i % n == 0:
                if i > 0:
                    logger.info(f"Bulk inserting {len(new_genes)} genes")
                    uta_session.bulk_save_objects(new_genes)
                    uta_session.commit()
                logger.info(f"Processing chunk {int(i/n) + 1}")
                new_genes = []

            gene_id, hgnc, maploc, desc, summary, aliases, added = line.rstrip("\r\n").split("\t")
            # set timestamp from file string, if empty set to now.
            if added == "":
                added_ts = now_ts
            else:
                added_ts = datetime.strptime(added, "%Y-%m-%d %H:%M:%S.%f")

            # clean up aliases: strip Postgres array braces; "-" means no aliases
            aliases = aliases.replace("{", "").replace("}", "")
            if aliases == "-":
                aliases = None

            gene = Gene(
                gene_id=gene_id,
                hgnc=hgnc,
                maploc=maploc if maploc else None,
                descr=desc if desc else None,
                # BUG FIX: was `summary if desc else None`, which dropped the
                # summary whenever the description was empty (and kept an empty
                # summary whenever a description was present).
                summary=summary if summary else None,
                aliases=aliases if aliases else None,
                added=added_ts,
            )
            i += 1
            new_genes.append(gene)

    # persist the final (possibly partial) chunk
    logger.info(f"Bulk inserting {len(new_genes)} genes")
    uta_session.bulk_save_objects(new_genes)
    uta_session.commit()
    logger.info(f"Inserted {i} total genes")
|
||
|
||
def backfill_transcript(uta_session: Session, transcript_update_file: str) -> None:
    """Populate gene_id on transcript rows from a TSV mapping file.

    Reads transcript-accession -> gene_id pairs from ``transcript_update_file``
    (tab-separated; header line starts with "origin"), then assigns gene_id to
    each matching Transcript row, flushing every ``n`` updates and committing
    once at the end.
    """
    logger.info("Backfilling gene_id in transcript table")
    ac_to_gene = {}

    logger.info(f"Reading transcript to gene id mappings from {transcript_update_file}")
    with open(transcript_update_file) as f:
        for row in f:
            # skip the header row; first and last columns are unused
            if row.startswith("origin"):
                continue
            _, accession, mapped_gene_id, _ = row.rstrip("\r\n").split("\t")
            ac_to_gene[accession] = mapped_gene_id
    logger.info(f" - {len(ac_to_gene)} mappings read")

    updated = []
    for count, (accession, mapped_gene_id) in enumerate(ac_to_gene.items()):
        # at each chunk boundary, flush the prior chunk before starting anew
        if count % n == 0:
            if count > 0:
                logger.info(f"Updating {len(updated)} transcripts")
                uta_session.flush()
            logger.info(f"Processing chunk {count // n + 1}")
            updated = []

        transcript = uta_session.query(Transcript).filter(Transcript.ac == accession).one()
        transcript.gene_id = mapped_gene_id
        updated.append(transcript)

    logger.info(f"Updating {len(updated)} transcripts")
    uta_session.flush()
    uta_session.commit()
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: parse arguments, configure logging, connect
    # to the database, then run both backfills and close the session.
    arg_parser = argparse.ArgumentParser(description="Backfill gene_id in gene and transcript tables")
    arg_parser.add_argument("db_url", help="URL of the UTA database")
    arg_parser.add_argument("gene_update_file", type=str, help="File containing gene_id updates for gene table")
    arg_parser.add_argument("transcript_update_file", type=str, help="File containing gene_id updates for transcript table")
    cli_args = arg_parser.parse_args()

    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
    # binds the module-level logger used by the backfill functions
    logger = logging.getLogger("backfill_gene_id")

    session = uta.connect(cli_args.db_url)

    backfill_gene(session, cli_args.gene_update_file)
    backfill_transcript(session, cli_args.transcript_update_file)
    session.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
#!/bin/bash

# This script is used to upgrade older UTA schemas (specifically uta_20210129b) to a newer version.
# Part of this upgrade is to introduce gene_id to the gene and transcript tables. The columns are
# added with a Alembic migration. Then a data migration to back fill the new columns. Then a second
# Alembic migration to add the constraints to the columns and update primary and foreign keys.
#
# usage: upgrade-uta-schema.sh <dest_uta_v>
# Requires: psql/pg_dump reachable at localhost as uta_admin; alembic config in etc/;
# backfill TSVs under /workdir/backfill/.

# validate arguments before enabling strict mode so the usage message prints cleanly
if [ "$#" -lt 1 ]
then
    echo "error: too few arguments, you provided $#, 1 required"
    echo "usage: upgrade-uta-schema.sh <dest_uta_v>"
    exit 1
fi

set -euxo pipefail

source_uta_v="uta_20210129b"
working_uta_v="uta"
dest_uta_v=$1
dumps_dir="/workdir/dumps"
# FIX: quote path expansions throughout so a dumps_dir containing spaces
# does not word-split (original mixed quoted and unquoted uses).
mkdir -p "$dumps_dir"

## setup working uta schema
# delete destination schema if exists
psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $working_uta_v CASCADE;"

# dump source version
pg_dump -U uta_admin -h localhost -d uta -n "$source_uta_v" | \
  gzip -c > "$dumps_dir/$source_uta_v".pgd.gz

# create new schema
gzip -cdq "$dumps_dir/$source_uta_v".pgd.gz | \
  sbin/pg-dump-schema-rename "$source_uta_v" "$working_uta_v" | \
  sbin/pg-dump-schema-rename "uta_1_1" "$working_uta_v" | \
  psql -U uta_admin -h localhost -d uta -aeE

## upgrade working uta schema
# set initial Alembic migration so it is not ran.
alembic -c etc/alembic.ini stamp edadb97f6502

# run Alembic migration to add gene_id to gene and transcript tables
alembic -c etc/alembic.ini upgrade 595a586e6de7

# run data migration to back fill gene_id
python misc/gene-update/backfill_gene_id.py \
  postgresql://uta_admin:@localhost/uta \
  /workdir/backfill/gene_update.tsv \
  /workdir/backfill/transcript_update.tsv

# run Alembic migrations to add constraints and update existing views
alembic -c etc/alembic.ini upgrade head

## Copy data into destination schema
# dump working schema
pg_dump -U uta_admin -h localhost -d uta -n "$working_uta_v" | \
  gzip -c > "$dumps_dir/$working_uta_v".pgd.gz

# copy data into destination schema
gzip -cdq "$dumps_dir/$working_uta_v".pgd.gz | \
  sbin/pg-dump-schema-rename "$working_uta_v" "$dest_uta_v" | \
  psql -U uta_admin -h localhost -d uta -aeE
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters