diff --git a/misc/gene-update/backfill_gene_id.py b/misc/gene-update/backfill_gene_id.py new file mode 100644 index 0000000..5961cfd --- /dev/null +++ b/misc/gene-update/backfill_gene_id.py @@ -0,0 +1,116 @@ +import argparse +import logging + +from datetime import datetime +from sqlalchemy.orm import Session +from sqlalchemy import text + +import uta +from uta.models import Gene, Transcript + + +logger = None +n = 50000 + + +def backfill_gene(uta_session: Session, gene_update_file: str) -> None: + logger.info("Dropping gene table contents") + uta_session.execute(text("DELETE FROM uta.gene;")) + uta_session.commit() + + logger.info(f"Back filling gene table from {gene_update_file}") + now_ts = datetime.now() + i = 0 + new_genes = [] + with open(gene_update_file) as f: + for line in f: + if line.startswith("gene_id"): + continue + + if i % n == 0: + if i > 0: + logger.info(f"Bulk inserting {len(new_genes)} genes") + uta_session.bulk_save_objects(new_genes) + uta_session.commit() + logger.info(f"Processing chunk {int(i/n) + 1}") + new_genes = [] + + gene_id, hgnc, maploc, desc, summary, aliases, added = line.rstrip("\r\n").split("\t") + # set timestamp from file string, if empty set to now. + if added == "": + added_ts = now_ts + else: + added_ts = datetime.strptime(added, "%Y-%m-%d %H:%M:%S.%f") + + # clean up aliases + aliases = aliases.replace("{", "").replace("}", "") + if aliases == "-": + aliases = None + + gene = Gene( + gene_id=gene_id, + hgnc=hgnc, + maploc=maploc if maploc else None, + descr=desc if desc else None, + summary=summary if desc else None, + aliases=aliases if aliases else None, + added=added_ts, + ) + i += 1 + new_genes.append(gene) + + logger.info(f"Bulk inserting {len(new_genes)} genes") + uta_session.bulk_save_objects(new_genes) + uta_session.commit() + logger.info(f"Inserted {i} total genes") + + +def backfill_transcript(uta_session: Session, transcript_update_file: str) -> None: + logger.info("Backfilling gene_id in transcript table") + tx_ac_to_gene_id = {} + + logger.info(f"Reading transcript to gene id mappings from {transcript_update_file}") + with open(transcript_update_file) as f: + for line in f: + if line.startswith("origin"): + continue + _, tx_ac, gene_id, _ = line.rstrip("\r\n").split("\t") + tx_ac_to_gene_id[tx_ac] = gene_id + logger.info(f" - {len(tx_ac_to_gene_id)} mappings read") + + i = 0 + txs = [] + for tx_ac, gene_id in tx_ac_to_gene_id.items(): + if i % n == 0: + if i > 0: + logger.info(f"Updating {len(txs)} transcripts") + uta_session.flush() + + logger.info(f"Processing chunk {int(i/n) + 1}") + txs = [] + + tx = uta_session.query(Transcript).filter(Transcript.ac == tx_ac).one() + tx.gene_id = gene_id + txs.append(tx) + i += 1 + + logger.info(f"Updating {len(txs)} transcripts") + uta_session.flush() + uta_session.commit() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Backfill gene_id in gene and transcript tables") + parser.add_argument("db_url", help="URL of the UTA database") + parser.add_argument("gene_update_file", type=str, help="File containing gene_id updates for gene table") + parser.add_argument("transcript_update_file", type=str, help="File containing gene_id updates for transcript table") + args = parser.parse_args() + + logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') + logger = logging.getLogger("backfill_gene_id") + + session = uta.connect(args.db_url) + + backfill_gene(session, args.gene_update_file) + backfill_transcript(session, args.transcript_update_file) + session.close() diff --git a/misc/gene-update/upgrade-uta-schema.sh b/misc/gene-update/upgrade-uta-schema.sh new file mode 100644 index 0000000..9472fd9 --- /dev/null +++ b/misc/gene-update/upgrade-uta-schema.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# This script is used to upgrade older UTA schemas (specifically uta_20210129b) to a newer version. +# Part of this upgrade is to introduce gene_id to the gene and transcript tables. The columns are +# added with a Alembic migration. Then a data migration to back fill the new columns. Then a second +# Alembic migration to add the constraints to the columns and update primary and foreign keys. + +if [ "$#" -lt 1 ] +then + echo "error: too few arguments, you provided $#, 1 required" + echo "usage: upgrade-uta-schema.sh " + exit 1 +fi + +set -euxo pipefail + +source_uta_v="uta_20210129b" +working_uta_v="uta" +dest_uta_v=$1 +dumps_dir="/workdir/dumps" +mkdir -p $dumps_dir + +## setup working uta schema +# delete destination schema if exists +psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $working_uta_v CASCADE;" + +# dump source version +pg_dump -U uta_admin -h localhost -d uta -n "$source_uta_v" | \ + gzip -c > $dumps_dir/"$source_uta_v".pgd.gz + +# create new schema +gzip -cdq $dumps_dir/"$source_uta_v".pgd.gz | \ + sbin/pg-dump-schema-rename "$source_uta_v" "$working_uta_v" | \ + sbin/pg-dump-schema-rename "uta_1_1" "$working_uta_v" | \ + psql -U uta_admin -h localhost -d uta -aeE + +## upgrade working uta schema +# set initial Alembic migration so it is not ran. +alembic -c etc/alembic.ini stamp edadb97f6502 + +# run Alembic migration to add gene_id to gene and transcript tables +alembic -c etc/alembic.ini upgrade 595a586e6de7 + +# run data migration to back fill gene_id +python misc/gene-update/backfill_gene_id.py \ + postgresql://uta_admin:@localhost/uta \ + /workdir/backfill/gene_update.tsv \ + /workdir/backfill/transcript_update.tsv + +# run Alembic migrations to add constraints and update existing views +alembic -c etc/alembic.ini upgrade head + +## Copy data into destination schema +# dump working schema +pg_dump -U uta_admin -h localhost -d uta -n "$working_uta_v" | \ + gzip -c > "$dumps_dir/$working_uta_v".pgd.gz + +# copy data into destination schema +gzip -cdq "$dumps_dir/$working_uta_v".pgd.gz | \ + sbin/pg-dump-schema-rename "$working_uta_v" "$dest_uta_v" | \ + psql -U uta_admin -h localhost -d uta -aeE diff --git a/src/uta/loading.py b/src/uta/loading.py index 7ddd354..2465607 100644 --- a/src/uta/loading.py +++ b/src/uta/loading.py @@ -336,13 +336,14 @@ def load_geneinfo(session, opts, cf): for i_gi, gi in enumerate(gir): session.merge( usam.Gene( + gene_id=gi.gene_id, hgnc=gi.hgnc, maploc=gi.maploc, descr=gi.descr, summary=gi.summary, aliases=gi.aliases, )) - logger.info("Added {gi.hgnc} ({gi.summary})".format(gi=gi)) + logger.info("Added {gi.hgnc}: {gi.gene_id} ({gi.summary})".format(gi=gi)) session.commit() @@ -698,7 +699,7 @@ def _fetch_origin_by_name(name): u_tx = usam.Transcript( ac=ti.ac, origin=ori, - hgnc=ti.gene_symbol, + gene_id=ti.gene_id, cds_start_i=cds_start_i, cds_end_i=cds_end_i, cds_md5=cds_md5,