diff --git a/etc/scripts/backfill_gene_id.py b/etc/scripts/backfill_gene_id.py
deleted file mode 100644
index 5961cfd..0000000
--- a/etc/scripts/backfill_gene_id.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import argparse
-import logging
-
-from datetime import datetime
-from sqlalchemy.orm import Session
-from sqlalchemy import text
-
-import uta
-from uta.models import Gene, Transcript
-
-
-logger = None
-n = 50000
-
-
-def backfill_gene(uta_session: Session, gene_update_file: str) -> None:
-    logger.info("Dropping gene table contents")
-    uta_session.execute(text("DELETE FROM uta.gene;"))
-    uta_session.commit()
-
-    logger.info(f"Back filling gene table from {gene_update_file}")
-    now_ts = datetime.now()
-    i = 0
-    new_genes = []
-    with open(gene_update_file) as f:
-        for line in f:
-            if line.startswith("gene_id"):
-                continue
-
-            if i % n == 0:
-                if i > 0:
-                    logger.info(f"Bulk inserting {len(new_genes)} genes")
-                    uta_session.bulk_save_objects(new_genes)
-                    uta_session.commit()
-                logger.info(f"Processing chunk {int(i/n) + 1}")
-                new_genes = []
-
-            gene_id, hgnc, maploc, desc, summary, aliases, added = line.rstrip("\r\n").split("\t")
-            # set timestamp from file string, if empty set to now.
-            if added == "":
-                added_ts = now_ts
-            else:
-                added_ts = datetime.strptime(added, "%Y-%m-%d %H:%M:%S.%f")
-
-            # clean up aliases
-            aliases = aliases.replace("{", "").replace("}", "")
-            if aliases == "-":
-                aliases = None
-
-            gene = Gene(
-                gene_id=gene_id,
-                hgnc=hgnc,
-                maploc=maploc if maploc else None,
-                descr=desc if desc else None,
-                summary=summary if desc else None,
-                aliases=aliases if aliases else None,
-                added=added_ts,
-            )
-            i += 1
-            new_genes.append(gene)
-
-    logger.info(f"Bulk inserting {len(new_genes)} genes")
-    uta_session.bulk_save_objects(new_genes)
-    uta_session.commit()
-    logger.info(f"Inserted {i} total genes")
-
-
-def backfill_transcript(uta_session: Session, transcript_update_file: str) -> None:
-    logger.info("Backfilling gene_id in transcript table")
-    tx_ac_to_gene_id = {}
-
-    logger.info(f"Reading transcript to gene id mappings from {transcript_update_file}")
-    with open(transcript_update_file) as f:
-        for line in f:
-            if line.startswith("origin"):
-                continue
-            _, tx_ac, gene_id, _ = line.rstrip("\r\n").split("\t")
-            tx_ac_to_gene_id[tx_ac] = gene_id
-    logger.info(f" - {len(tx_ac_to_gene_id)} mappings read")
-
-    i = 0
-    txs = []
-    for tx_ac, gene_id in tx_ac_to_gene_id.items():
-        if i % n == 0:
-            if i > 0:
-                logger.info(f"Updating {len(txs)} transcripts")
-                uta_session.flush()
-
-            logger.info(f"Processing chunk {int(i/n) + 1}")
-            txs = []
-
-        tx = uta_session.query(Transcript).filter(Transcript.ac == tx_ac).one()
-        tx.gene_id = gene_id
-        txs.append(tx)
-        i += 1
-
-    logger.info(f"Updating {len(txs)} transcripts")
-    uta_session.flush()
-    uta_session.commit()
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Backfill gene_id in gene and transcript tables")
-    parser.add_argument("db_url", help="URL of the UTA database")
-    parser.add_argument("gene_update_file", type=str, help="File containing gene_id updates for gene table")
-    parser.add_argument("transcript_update_file", type=str, help="File containing gene_id updates for transcript table")
-    args = parser.parse_args()
-
-    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
-    logger = logging.getLogger("backfill_gene_id")
-
-    session = uta.connect(args.db_url)
-
-    backfill_gene(session, args.gene_update_file)
-    backfill_transcript(session, args.transcript_update_file)
-    session.close()
diff --git a/etc/scripts/create-new-schema.sh b/etc/scripts/create-new-schema.sh
index 5200e8d..d13100a 100755
--- a/etc/scripts/create-new-schema.sh
+++ b/etc/scripts/create-new-schema.sh
@@ -21,5 +21,4 @@ pg_dump -U uta_admin -h localhost -d uta -n "$source_uta_v" | \
 # create new schema
 gzip -cdq $dumps_dir/"$source_uta_v".pgd.gz | \
     sbin/pg-dump-schema-rename "$source_uta_v" "$dest_uta_v" | \
-    sbin/pg-dump-schema-rename "uta_1_1" "$dest_uta_v" | \
-    psql -U uta_admin -h localhost -d uta -aeE
\ No newline at end of file
+    psql -U uta_admin -h localhost -d uta -aeE
diff --git a/misc/gene-update/upgrade-uta-schema.sh b/misc/gene-update/upgrade-uta-schema.sh
index 9472fd9..d443141 100644
--- a/misc/gene-update/upgrade-uta-schema.sh
+++ b/misc/gene-update/upgrade-uta-schema.sh
@@ -50,12 +50,5 @@ python misc/gene-update/backfill_gene_id.py \
 # run Alembic migrations to add constraints and update existing views
 alembic -c etc/alembic.ini upgrade head
 
-## Copy data into destination schema
-# dump working schema
-pg_dump -U uta_admin -h localhost -d uta -n "$working_uta_v" | \
-    gzip -c > "$dumps_dir/$working_uta_v".pgd.gz
-
-# copy data into destination schema
-gzip -cdq "$dumps_dir/$working_uta_v".pgd.gz | \
-    sbin/pg-dump-schema-rename "$working_uta_v" "$dest_uta_v" | \
-    psql -U uta_admin -h localhost -d uta -aeE
+## Rename schema to destination schema name
+psql -h localhost -U uta_admin -d uta -c "ALTER SCHEMA uta RENAME TO $dest_uta_v";
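
Not part of the patch: a minimal sketch of the step that replaces the removed dump/rename/restore pipeline in upgrade-uta-schema.sh. It assumes the same host, user, and database as the scripts above; the dest_uta_v value and the follow-up count query are illustrative assumptions, not taken from the repository.

    # rename the working "uta" schema in place instead of dump/rename/restore
    dest_uta_v="uta_20240101"   # hypothetical destination schema name

    psql -h localhost -U uta_admin -d uta -c "ALTER SCHEMA uta RENAME TO ${dest_uta_v};"

    # illustrative sanity check that the renamed schema resolves
    psql -h localhost -U uta_admin -d uta -c "SELECT count(*) FROM ${dest_uta_v}.gene;"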