From ccdd431235f888ac981d1cadfdb006a198153d2e Mon Sep 17 00:00:00 2001 From: Shane Giles Date: Thu, 11 Apr 2024 15:59:26 -0600 Subject: [PATCH 01/10] feat(IPVC-2264): model changes to add gene_id, new Alembic migrations, and a backfill script --- etc/scripts/backfill_gene_id.py | 116 ++++++++ ...6de7_add_gene_id_to_gene_and_transcript.py | 40 +++ ...et_gene_id_and_primary_and_foreign_keys.py | 271 ++++++++++++++++++ src/uta/models.py | 9 +- 4 files changed, 432 insertions(+), 4 deletions(-) create mode 100644 etc/scripts/backfill_gene_id.py create mode 100644 src/alembic/versions/595a586e6de7_add_gene_id_to_gene_and_transcript.py create mode 100644 src/alembic/versions/f85dd97bd9f5_set_gene_id_and_primary_and_foreign_keys.py diff --git a/etc/scripts/backfill_gene_id.py b/etc/scripts/backfill_gene_id.py new file mode 100644 index 0000000..5961cfd --- /dev/null +++ b/etc/scripts/backfill_gene_id.py @@ -0,0 +1,116 @@ +import argparse +import logging + +from datetime import datetime +from sqlalchemy.orm import Session +from sqlalchemy import text + +import uta +from uta.models import Gene, Transcript + + +logger = None +n = 50000 + + +def backfill_gene(uta_session: Session, gene_update_file: str) -> None: + logger.info("Dropping gene table contents") + uta_session.execute(text("DELETE FROM uta.gene;")) + uta_session.commit() + + logger.info(f"Back filling gene table from {gene_update_file}") + now_ts = datetime.now() + i = 0 + new_genes = [] + with open(gene_update_file) as f: + for line in f: + if line.startswith("gene_id"): + continue + + if i % n == 0: + if i > 0: + logger.info(f"Bulk inserting {len(new_genes)} genes") + uta_session.bulk_save_objects(new_genes) + uta_session.commit() + logger.info(f"Processing chunk {int(i/n) + 1}") + new_genes = [] + + gene_id, hgnc, maploc, desc, summary, aliases, added = line.rstrip("\r\n").split("\t") + # set timestamp from file string, if empty set to now. + if added == "": + added_ts = now_ts + else: + added_ts = datetime.strptime(added, "%Y-%m-%d %H:%M:%S.%f") + + # clean up aliases + aliases = aliases.replace("{", "").replace("}", "") + if aliases == "-": + aliases = None + + gene = Gene( + gene_id=gene_id, + hgnc=hgnc, + maploc=maploc if maploc else None, + descr=desc if desc else None, + summary=summary if desc else None, + aliases=aliases if aliases else None, + added=added_ts, + ) + i += 1 + new_genes.append(gene) + + logger.info(f"Bulk inserting {len(new_genes)} genes") + uta_session.bulk_save_objects(new_genes) + uta_session.commit() + logger.info(f"Inserted {i} total genes") + + +def backfill_transcript(uta_session: Session, transcript_update_file: str) -> None: + logger.info("Backfilling gene_id in transcript table") + tx_ac_to_gene_id = {} + + logger.info(f"Reading transcript to gene id mappings from {transcript_update_file}") + with open(transcript_update_file) as f: + for line in f: + if line.startswith("origin"): + continue + _, tx_ac, gene_id, _ = line.rstrip("\r\n").split("\t") + tx_ac_to_gene_id[tx_ac] = gene_id + logger.info(f" - {len(tx_ac_to_gene_id)} mappings read") + + i = 0 + txs = [] + for tx_ac, gene_id in tx_ac_to_gene_id.items(): + if i % n == 0: + if i > 0: + logger.info(f"Updating {len(txs)} transcripts") + uta_session.flush() + + logger.info(f"Processing chunk {int(i/n) + 1}") + txs = [] + + tx = uta_session.query(Transcript).filter(Transcript.ac == tx_ac).one() + tx.gene_id = gene_id + txs.append(tx) + i += 1 + + logger.info(f"Updating {len(txs)} transcripts") + uta_session.flush() + uta_session.commit() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Backfill gene_id in gene and transcript tables") + parser.add_argument("db_url", help="URL of the UTA database") + parser.add_argument("gene_update_file", type=str, help="File containing gene_id updates for gene table") + parser.add_argument("transcript_update_file", type=str, help="File containing gene_id updates for transcript table") + args = parser.parse_args() + + logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') + logger = logging.getLogger("backfill_gene_id") + + session = uta.connect(args.db_url) + + backfill_gene(session, args.gene_update_file) + backfill_transcript(session, args.transcript_update_file) + session.close() diff --git a/src/alembic/versions/595a586e6de7_add_gene_id_to_gene_and_transcript.py b/src/alembic/versions/595a586e6de7_add_gene_id_to_gene_and_transcript.py new file mode 100644 index 0000000..1f6573a --- /dev/null +++ b/src/alembic/versions/595a586e6de7_add_gene_id_to_gene_and_transcript.py @@ -0,0 +1,40 @@ +"""add gene_id to gene and transcript + +Revision ID: 595a586e6de7 +Revises: a697b584f699 +Create Date: 2024-04-10 19:47:43.685672 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '595a586e6de7' +down_revision: Union[str, None] = 'a697b584f699' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('gene', sa.Column('gene_id', sa.Text(), nullable=True), schema='uta') + op.add_column('transcript', sa.Column('gene_id', sa.Text(), nullable=True), schema='uta') + # ### end Alembic commands ### + + # ### commands to drop existing primary key on gene table ### + op.drop_constraint('gene_pkey', 'gene', schema='uta') + # ### end of commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column('transcript', 'gene_id', schema='uta') + op.drop_column('gene', 'gene_id', schema='uta') + # ### end Alembic commands ### + + # ### commands to add primary key on gene table ### + op.create_primary_key('gene_pkey', 'gene', ['hgnc'], schema='uta') + # ### end of commands ### diff --git a/src/alembic/versions/f85dd97bd9f5_set_gene_id_and_primary_and_foreign_keys.py b/src/alembic/versions/f85dd97bd9f5_set_gene_id_and_primary_and_foreign_keys.py new file mode 100644 index 0000000..4669b8b --- /dev/null +++ b/src/alembic/versions/f85dd97bd9f5_set_gene_id_and_primary_and_foreign_keys.py @@ -0,0 +1,271 @@ +"""set gene_id and primary and foreign keys + +Revision ID: f85dd97bd9f5 +Revises: 595a586e6de7 +Create Date: 2024-04-10 22:14:14.055461 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'f85dd97bd9f5' +down_revision: Union[str, None] = '595a586e6de7' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column( + "gene", "gene_id", existing_type=sa.TEXT(), nullable=False, schema="uta" + ) + op.create_primary_key("gene_pkey", "gene", ["gene_id"], schema="uta") + op.create_index(op.f("ix_uta_gene_hgnc"), "gene", ["hgnc"], unique=False, schema="uta") + op.alter_column( + "transcript", "gene_id", existing_type=sa.TEXT(), nullable=False, schema="uta" + ) + op.create_index( + op.f("ix_uta_transcript_gene_id"), + "transcript", + ["gene_id"], + unique=False, + schema="uta", + ) + op.create_foreign_key( + "fk_uta_transcript_gene_gene_id", + "transcript", + "gene", + ["gene_id"], + ["gene_id"], + source_schema="uta", + referent_schema="uta", + onupdate="RESTRICT", + ondelete="RESTRICT", + ) + # ### end Alembic commands ### + + # ### updates required to existing views needed to drop hgnc from transcript. ### + op.execute("""DROP MATERIALIZED VIEW IF EXISTS tx_def_summary_mv CASCADE;""") + op.execute("""DROP VIEW IF EXISTS tx_def_summary_dv CASCADE;""") + op.execute("""DROP MATERIALIZED VIEW IF EXISTS tx_exon_set_summary_mv CASCADE;""") + op.execute("""DROP VIEW IF EXISTS tx_exon_set_summary_dv CASCADE;""") + op.execute("""DROP VIEW IF EXISTS tx_exon_aln_v CASCADE;""") + op.execute("""DROP VIEW IF EXISTS tx_alt_exon_pairs_v CASCADE;""") + op.execute("""DROP VIEW IF EXISTS _discontiguous_tx CASCADE;""") + op.execute(""" + CREATE VIEW _discontiguous_tx AS + SELECT g.hgnc, + g.gene_id, + es.exon_set_id, + es.tx_ac, + format('[%s-%s]'::text, e1.end_i, e2.start_i) AS gap, + e1.exon_id AS e1_exon_id, + e1.ord AS e1_ord, + e1.start_i AS e1_start_i, + e1.end_i AS e1_end_i, + e2.exon_id AS e2_exon_id, + e2.ord AS e2_ord, + e2.start_i AS e2_start_i, + e2.end_i AS e2_end_i + FROM exon_set es + JOIN transcript t ON es.tx_ac = t.ac + JOIN gene as g ON t.gene_id = g.gene_id + JOIN exon e1 ON es.exon_set_id = e1.exon_set_id + JOIN exon e2 ON es.exon_set_id = e2.exon_set_id AND e2.ord = (e1.ord + 1) AND e1.end_i <> e2.start_i + WHERE es.alt_aln_method = 'transcript'::text; + """) + op.execute(""" + CREATE VIEW tx_alt_exon_pairs_v AS + SELECT g.hgnc, g.gene_id,TES.exon_SET_id AS tes_exon_SET_id,AES.exon_SET_id AS aes_exon_SET_id, + TES.tx_ac AS tx_ac,AES.alt_ac AS alt_ac,AES.alt_strand,AES.alt_aln_method, + TEX.ORD AS ORD,TEX.exon_id AS tx_exon_id,AEX.exon_id AS alt_exon_id, + TEX.start_i AS tx_start_i,TEX.END_i AS tx_END_i, AEX.start_i AS alt_start_i,AEX.END_i AS alt_END_i, + EA.exon_aln_id,EA.cigar + FROM exon_SET tes + JOIN transcript t ON tes.tx_ac=t.ac + JOIN gene g ON t.gene_id=g.gene_id + JOIN exon_set aes ON tes.tx_ac=aes.tx_ac AND tes.alt_aln_method='transcript' AND aes.alt_aln_method!='transcript' + JOIN exon tex ON tes.exon_SET_id=tex.exon_SET_id + JOIN exon aex ON aes.exon_SET_id=aex.exon_SET_id AND tex.ORD=aex.ORD + LEFT JOIN exon_aln ea ON ea.tx_exon_id=tex.exon_id AND ea.alt_exon_id=AEX.exon_id; + """) + op.execute(""" + CREATE VIEW tx_exon_aln_v AS + SELECT G.hgnc,G.gene_id,T.ac as tx_ac,AES.alt_ac,AES.alt_aln_method,AES.alt_strand, + TE.ord, TE.start_i as tx_start_i,TE.end_i as tx_end_i, + AE.start_i as alt_start_i, AE.end_i as alt_end_i, + EA.cigar, EA.tx_aseq, EA.alt_aseq, + TES.exon_set_id AS tx_exon_set_id,AES.exon_set_id as alt_exon_set_id, + TE.exon_id as tx_exon_id, AE.exon_id as alt_exon_id, + EA.exon_aln_id + FROM transcript T + JOIN gene G ON T.gene_id=G.gene_id + JOIN exon_set TES ON T.ac=TES.tx_ac AND TES.alt_aln_method ='transcript' + JOIN exon_set AES on T.ac=AES.tx_ac and AES.alt_aln_method!='transcript' + JOIN exon TE ON TES.exon_set_id=TE.exon_set_id + JOIN exon AE ON AES.exon_set_id=AE.exon_set_id AND TE.ord=AE.ord + LEFT JOIN exon_aln EA ON TE.exon_id=EA.tx_exon_id AND AE.exon_id=EA.alt_exon_id; + """) + op.execute(""" + CREATE VIEW tx_exon_set_summary_dv AS + SELECT G.hgnc,G.gene_id,cds_md5,es_fingerprint,tx_ac,alt_ac,alt_aln_method,alt_strand,exon_set_id,n_exons,se_i,starts_i,ends_i,lengths + FROM transcript T + JOIN gene G ON T.gene_id=G.gene_id + JOIN exon_set_exons_fp_mv ESE ON T.ac=ESE.tx_ac; + """) + op.execute(""" + CREATE MATERIALIZED VIEW tx_exon_set_summary_mv AS SELECT * FROM tx_exon_set_summary_dv WITH NO DATA; + CREATE INDEX tx_exon_set_summary_mv_cds_md5_ix ON tx_exon_set_summary_mv(cds_md5); + CREATE INDEX tx_exon_set_summary_mv_es_fingerprint_ix ON tx_exon_set_summary_mv(es_fingerprint); + CREATE INDEX tx_exon_set_summary_mv_tx_ac_ix ON tx_exon_set_summary_mv(tx_ac); + CREATE INDEX tx_exon_set_summary_mv_alt_ac_ix ON tx_exon_set_summary_mv(alt_ac); + CREATE INDEX tx_exon_set_summary_mv_alt_aln_method_ix ON tx_exon_set_summary_mv(alt_aln_method); + GRANT SELECT ON tx_exon_set_summary_mv TO public; + REFRESH MATERIALIZED VIEW tx_exon_set_summary_mv; + """) + op.execute(""" + CREATE VIEW tx_def_summary_dv AS + SELECT TESS.exon_set_id, TESS.tx_ac, TESS.alt_ac, TESS.alt_aln_method, TESS.alt_strand, + TESS.hgnc, TESS.gene_id, TESS.cds_md5, TESS.es_fingerprint, CEF.cds_es_fp, + CEF.cds_exon_lengths_fp, TESS.n_exons, TESS.se_i, CEF.cds_se_i, TESS.starts_i, + TESS.ends_i, TESS.lengths, T.cds_start_i, T.cds_end_i, CEF.cds_start_exon, CEF.cds_end_exon + FROM tx_exon_set_summary_mv TESS + JOIN transcript T ON TESS.tx_ac=T.ac + LEFT JOIN _cds_exons_fp_v CEF ON TESS.exon_set_id=CEF.exon_set_id + WHERE TESS.alt_aln_method = 'transcript'; + """) + op.execute(""" + CREATE MATERIALIZED VIEW tx_def_summary_mv AS SELECT * FROM tx_def_summary_dv WITH NO DATA; + CREATE INDEX tx_def_summary_mv_tx_ac ON tx_def_summary_mv (tx_ac); + CREATE INDEX tx_def_summary_mv_alt_ac ON tx_def_summary_mv (alt_ac); + CREATE INDEX tx_def_summary_mv_alt_aln_method ON tx_def_summary_mv (alt_aln_method); + CREATE INDEX tx_def_summary_mv_hgnc ON tx_def_summary_mv (hgnc); + CREATE INDEX tx_def_summary_mv_gene_id ON tx_def_summary_mv (gene_id); + REFRESH MATERIALIZED VIEW tx_def_summary_mv; + """) + + # ### end of updates to existing views ### + + # ### drop hgnc from transcript ### + op.drop_column('transcript', 'hgnc', schema='uta') + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### updates to views to add hgnc to transcript ### + op.add_column('transcript', sa.Column('hgnc', sa.Text(), nullable=True), schema='uta') + # ### end of updates to transcript ### + + # ### commands to downgrade views before adding hgnc to transcript ### + op.execute("""DROP MATERIALIZED VIEW IF EXISTS tx_def_summary_mv CASCADE;""") + op.execute("""DROP VIEW IF EXISTS tx_def_summary_dv CASCADE;""") + op.execute("""DROP MATERIALIZED VIEW IF EXISTS tx_exon_set_summary_mv CASCADE;""") + op.execute("""DROP VIEW IF EXISTS tx_exon_set_summary_dv CASCADE;""") + op.execute("""DROP VIEW IF EXISTS tx_exon_aln_v CASCADE;""") + op.execute("""DROP VIEW IF EXISTS tx_alt_exon_pairs_v CASCADE;""") + op.execute("""DROP VIEW IF EXISTS _discontiguous_tx CASCADE;""") + op.execute(""" + CREATE VIEW _discontiguous_tx AS + SELECT t.hgnc, + es.exon_set_id, + es.tx_ac, + format('[%s-%s]'::text, e1.end_i, e2.start_i) AS gap, + e1.exon_id AS e1_exon_id, + e1.ord AS e1_ord, + e1.start_i AS e1_start_i, + e1.end_i AS e1_end_i, + e2.exon_id AS e2_exon_id, + e2.ord AS e2_ord, + e2.start_i AS e2_start_i, + e2.end_i AS e2_end_i + FROM exon_set es + JOIN transcript t ON es.tx_ac = t.ac + JOIN exon e1 ON es.exon_set_id = e1.exon_set_id + JOIN exon e2 ON es.exon_set_id = e2.exon_set_id AND e2.ord = (e1.ord + 1) AND e1.end_i <> e2.start_i + WHERE es.alt_aln_method = 'transcript'::text; + """) + op.execute(""" + CREATE VIEW tx_alt_exon_pairs_v AS + SELECT t.hgnc,TES.exon_SET_id AS tes_exon_SET_id,AES.exon_SET_id AS aes_exon_SET_id, + TES.tx_ac AS tx_ac,AES.alt_ac AS alt_ac,AES.alt_strand,AES.alt_aln_method, + TEX.ORD AS ORD,TEX.exon_id AS tx_exon_id,AEX.exon_id AS alt_exon_id, + TEX.start_i AS tx_start_i,TEX.END_i AS tx_END_i, AEX.start_i AS alt_start_i,AEX.END_i AS alt_END_i, + EA.exon_aln_id,EA.cigar + FROM exon_SET tes + JOIN transcript t ON tes.tx_ac=t.ac + JOIN exon_set aes ON tes.tx_ac=aes.tx_ac AND tes.alt_aln_method='transcript' AND aes.alt_aln_method!='transcript' + JOIN exon tex ON tes.exon_SET_id=tex.exon_SET_id + JOIN exon aex ON aes.exon_SET_id=aex.exon_SET_id AND tex.ORD=aex.ORD + LEFT JOIN exon_aln ea ON ea.tx_exon_id=tex.exon_id AND ea.alt_exon_id=AEX.exon_id; + """) + op.execute(""" + CREATE VIEW tx_exon_aln_v AS + SELECT T.hgnc,T.ac as tx_ac,AES.alt_ac,AES.alt_aln_method,AES.alt_strand, + TE.ord, TE.start_i as tx_start_i,TE.end_i as tx_end_i, + AE.start_i as alt_start_i, AE.end_i as alt_end_i, + EA.cigar, EA.tx_aseq, EA.alt_aseq, + TES.exon_set_id AS tx_exon_set_id,AES.exon_set_id as alt_exon_set_id, + TE.exon_id as tx_exon_id, AE.exon_id as alt_exon_id, + EA.exon_aln_id + FROM transcript T + JOIN exon_set TES ON T.ac=TES.tx_ac AND TES.alt_aln_method ='transcript' + JOIN exon_set AES on T.ac=AES.tx_ac and AES.alt_aln_method!='transcript' + JOIN exon TE ON TES.exon_set_id=TE.exon_set_id + JOIN exon AE ON AES.exon_set_id=AE.exon_set_id AND TE.ord=AE.ord + LEFT JOIN exon_aln EA ON TE.exon_id=EA.tx_exon_id AND AE.exon_id=EA.alt_exon_id; + """) + op.execute(""" + CREATE VIEW tx_exon_set_summary_dv AS + SELECT T.hgnc,cds_md5,es_fingerprint,tx_ac,alt_ac,alt_aln_method,alt_strand,exon_set_id,n_exons,se_i,starts_i,ends_i,lengths + FROM transcript T + JOIN exon_set_exons_fp_mv ESE ON T.ac=ESE.tx_ac; + """) + op.execute(""" + CREATE MATERIALIZED VIEW tx_exon_set_summary_mv AS SELECT * FROM tx_exon_set_summary_dv WITH NO DATA; + CREATE INDEX tx_exon_set_summary_mv_cds_md5_ix ON tx_exon_set_summary_mv(cds_md5); + CREATE INDEX tx_exon_set_summary_mv_es_fingerprint_ix ON tx_exon_set_summary_mv(es_fingerprint); + CREATE INDEX tx_exon_set_summary_mv_tx_ac_ix ON tx_exon_set_summary_mv(tx_ac); + CREATE INDEX tx_exon_set_summary_mv_alt_ac_ix ON tx_exon_set_summary_mv(alt_ac); + CREATE INDEX tx_exon_set_summary_mv_alt_aln_method_ix ON tx_exon_set_summary_mv(alt_aln_method); + GRANT SELECT ON tx_exon_set_summary_mv TO public; + REFRESH MATERIALIZED VIEW tx_exon_set_summary_mv; + """) + op.execute(""" + CREATE VIEW tx_def_summary_dv AS + SELECT TESS.exon_set_id, TESS.tx_ac, TESS.alt_ac, TESS.alt_aln_method, TESS.alt_strand, + TESS.hgnc, TESS.cds_md5, TESS.es_fingerprint, CEF.cds_es_fp, + CEF.cds_exon_lengths_fp, TESS.n_exons, TESS.se_i, CEF.cds_se_i, TESS.starts_i, + TESS.ends_i, TESS.lengths, T.cds_start_i, T.cds_end_i, CEF.cds_start_exon, CEF.cds_end_exon + FROM tx_exon_set_summary_mv TESS + JOIN transcript T ON TESS.tx_ac=T.ac + LEFT JOIN _cds_exons_fp_v CEF ON TESS.exon_set_id=CEF.exon_set_id + WHERE TESS.alt_aln_method = 'transcript'; + """) + op.execute(""" + CREATE MATERIALIZED VIEW tx_def_summary_mv AS SELECT * FROM tx_def_summary_dv WITH NO DATA; + CREATE INDEX tx_def_summary_mv_tx_ac ON tx_def_summary_mv (tx_ac); + CREATE INDEX tx_def_summary_mv_alt_ac ON tx_def_summary_mv (alt_ac); + CREATE INDEX tx_def_summary_mv_alt_aln_method ON tx_def_summary_mv (alt_aln_method); + CREATE INDEX tx_def_summary_mv_hgnc ON tx_def_summary_mv (hgnc); + REFRESH MATERIALIZED VIEW tx_def_summary_mv; + """) + # ### end of updates to views ### + + # ### commands auto generated by Alembic - please adjust! ### + op.drop_constraint('fk_uta_transcript_gene_gene_id', 'transcript', schema='uta', type_='foreignkey') + op.drop_index(op.f('ix_uta_transcript_gene_id'), table_name='transcript', schema='uta') + op.alter_column('transcript', 'gene_id', + existing_type=sa.TEXT(), + nullable=True, + schema='uta') + op.drop_index(op.f('ix_uta_gene_hgnc'), table_name='gene', schema='uta') + op.drop_constraint('gene_pkey', 'gene', schema='uta') + op.alter_column('gene', 'gene_id', + existing_type=sa.TEXT(), + nullable=True, + schema='uta') + # ### end Alembic commands ### diff --git a/src/uta/models.py b/src/uta/models.py index 31a5392..2b64741 100644 --- a/src/uta/models.py +++ b/src/uta/models.py @@ -98,7 +98,8 @@ class Gene(Base): __tablename__ = "gene" # columns: - hgnc = sa.Column(sa.Text, primary_key=True) + gene_id = sa.Column(sa.Text, primary_key=True) + hgnc = sa.Column(sa.Text, nullable=False, index=True) maploc = sa.Column(sa.Text) descr = sa.Column(sa.Text) summary = sa.Column(sa.Text) @@ -124,9 +125,9 @@ class Transcript(Base): ac = sa.Column(sa.Text, primary_key=True) origin_id = sa.Column( sa.Integer, sa.ForeignKey("origin.origin_id", onupdate="CASCADE", ondelete="CASCADE"), nullable=False, index=True) - hgnc = sa.Column(sa.Text) # , sa.ForeignKey("gene.hgnc")) - cds_start_i = sa.Column(sa.Integer) #, nullable=False) - cds_end_i = sa.Column(sa.Integer) #, nullable=False) + gene_id = sa.Column(sa.Text, sa.ForeignKey("gene.gene_id"), nullable=False, index=True) + cds_start_i = sa.Column(sa.Integer) + cds_end_i = sa.Column(sa.Integer) cds_md5 = sa.Column(sa.Text, index=True) added = sa.Column( sa.DateTime, default=datetime.datetime.now(), nullable=False) From 0d4d30884376418b5cc797a89bd828f39c5b5868 Mon Sep 17 00:00:00 2001 From: Shane Giles Date: Fri, 12 Apr 2024 15:57:57 -0600 Subject: [PATCH 02/10] feat(IPVC-2264): add in database update script, update loading methods --- misc/gene-update/backfill_gene_id.py | 116 +++++++++++++++++++++++++ misc/gene-update/upgrade-uta-schema.sh | 61 +++++++++++++ src/uta/loading.py | 5 +- 3 files changed, 180 insertions(+), 2 deletions(-) create mode 100644 misc/gene-update/backfill_gene_id.py create mode 100644 misc/gene-update/upgrade-uta-schema.sh diff --git a/misc/gene-update/backfill_gene_id.py b/misc/gene-update/backfill_gene_id.py new file mode 100644 index 0000000..5961cfd --- /dev/null +++ b/misc/gene-update/backfill_gene_id.py @@ -0,0 +1,116 @@ +import argparse +import logging + +from datetime import datetime +from sqlalchemy.orm import Session +from sqlalchemy import text + +import uta +from uta.models import Gene, Transcript + + +logger = None +n = 50000 + + +def backfill_gene(uta_session: Session, gene_update_file: str) -> None: + logger.info("Dropping gene table contents") + uta_session.execute(text("DELETE FROM uta.gene;")) + uta_session.commit() + + logger.info(f"Back filling gene table from {gene_update_file}") + now_ts = datetime.now() + i = 0 + new_genes = [] + with open(gene_update_file) as f: + for line in f: + if line.startswith("gene_id"): + continue + + if i % n == 0: + if i > 0: + logger.info(f"Bulk inserting {len(new_genes)} genes") + uta_session.bulk_save_objects(new_genes) + uta_session.commit() + logger.info(f"Processing chunk {int(i/n) + 1}") + new_genes = [] + + gene_id, hgnc, maploc, desc, summary, aliases, added = line.rstrip("\r\n").split("\t") + # set timestamp from file string, if empty set to now. + if added == "": + added_ts = now_ts + else: + added_ts = datetime.strptime(added, "%Y-%m-%d %H:%M:%S.%f") + + # clean up aliases + aliases = aliases.replace("{", "").replace("}", "") + if aliases == "-": + aliases = None + + gene = Gene( + gene_id=gene_id, + hgnc=hgnc, + maploc=maploc if maploc else None, + descr=desc if desc else None, + summary=summary if desc else None, + aliases=aliases if aliases else None, + added=added_ts, + ) + i += 1 + new_genes.append(gene) + + logger.info(f"Bulk inserting {len(new_genes)} genes") + uta_session.bulk_save_objects(new_genes) + uta_session.commit() + logger.info(f"Inserted {i} total genes") + + +def backfill_transcript(uta_session: Session, transcript_update_file: str) -> None: + logger.info("Backfilling gene_id in transcript table") + tx_ac_to_gene_id = {} + + logger.info(f"Reading transcript to gene id mappings from {transcript_update_file}") + with open(transcript_update_file) as f: + for line in f: + if line.startswith("origin"): + continue + _, tx_ac, gene_id, _ = line.rstrip("\r\n").split("\t") + tx_ac_to_gene_id[tx_ac] = gene_id + logger.info(f" - {len(tx_ac_to_gene_id)} mappings read") + + i = 0 + txs = [] + for tx_ac, gene_id in tx_ac_to_gene_id.items(): + if i % n == 0: + if i > 0: + logger.info(f"Updating {len(txs)} transcripts") + uta_session.flush() + + logger.info(f"Processing chunk {int(i/n) + 1}") + txs = [] + + tx = uta_session.query(Transcript).filter(Transcript.ac == tx_ac).one() + tx.gene_id = gene_id + txs.append(tx) + i += 1 + + logger.info(f"Updating {len(txs)} transcripts") + uta_session.flush() + uta_session.commit() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Backfill gene_id in gene and transcript tables") + parser.add_argument("db_url", help="URL of the UTA database") + parser.add_argument("gene_update_file", type=str, help="File containing gene_id updates for gene table") + parser.add_argument("transcript_update_file", type=str, help="File containing gene_id updates for transcript table") + args = parser.parse_args() + + logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') + logger = logging.getLogger("backfill_gene_id") + + session = uta.connect(args.db_url) + + backfill_gene(session, args.gene_update_file) + backfill_transcript(session, args.transcript_update_file) + session.close() diff --git a/misc/gene-update/upgrade-uta-schema.sh b/misc/gene-update/upgrade-uta-schema.sh new file mode 100644 index 0000000..9472fd9 --- /dev/null +++ b/misc/gene-update/upgrade-uta-schema.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# This script is used to upgrade older UTA schemas (specifically uta_20210129b) to a newer version. +# Part of this upgrade is to introduce gene_id to the gene and transcript tables. The columns are +# added with a Alembic migration. Then a data migration to back fill the new columns. Then a second +# Alembic migration to add the constraints to the columns and update primary and foreign keys. + +if [ "$#" -lt 1 ] +then + echo "error: too few arguments, you provided $#, 1 required" + echo "usage: upgrade-uta-schema.sh " + exit 1 +fi + +set -euxo pipefail + +source_uta_v="uta_20210129b" +working_uta_v="uta" +dest_uta_v=$1 +dumps_dir="/workdir/dumps" +mkdir -p $dumps_dir + +## setup working uta schema +# delete destination schema if exists +psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $working_uta_v CASCADE;" + +# dump source version +pg_dump -U uta_admin -h localhost -d uta -n "$source_uta_v" | \ + gzip -c > $dumps_dir/"$source_uta_v".pgd.gz + +# create new schema +gzip -cdq $dumps_dir/"$source_uta_v".pgd.gz | \ + sbin/pg-dump-schema-rename "$source_uta_v" "$working_uta_v" | \ + sbin/pg-dump-schema-rename "uta_1_1" "$working_uta_v" | \ + psql -U uta_admin -h localhost -d uta -aeE + +## upgrade working uta schema +# set initial Alembic migration so it is not ran. +alembic -c etc/alembic.ini stamp edadb97f6502 + +# run Alembic migration to add gene_id to gene and transcript tables +alembic -c etc/alembic.ini upgrade 595a586e6de7 + +# run data migration to back fill gene_id +python misc/gene-update/backfill_gene_id.py \ + postgresql://uta_admin:@localhost/uta \ + /workdir/backfill/gene_update.tsv \ + /workdir/backfill/transcript_update.tsv + +# run Alembic migrations to add constraints and update existing views +alembic -c etc/alembic.ini upgrade head + +## Copy data into destination schema +# dump working schema +pg_dump -U uta_admin -h localhost -d uta -n "$working_uta_v" | \ + gzip -c > "$dumps_dir/$working_uta_v".pgd.gz + +# copy data into destination schema +gzip -cdq "$dumps_dir/$working_uta_v".pgd.gz | \ + sbin/pg-dump-schema-rename "$working_uta_v" "$dest_uta_v" | \ + psql -U uta_admin -h localhost -d uta -aeE diff --git a/src/uta/loading.py b/src/uta/loading.py index 7ddd354..2465607 100644 --- a/src/uta/loading.py +++ b/src/uta/loading.py @@ -336,13 +336,14 @@ def load_geneinfo(session, opts, cf): for i_gi, gi in enumerate(gir): session.merge( usam.Gene( + gene_id=gi.gene_id, hgnc=gi.hgnc, maploc=gi.maploc, descr=gi.descr, summary=gi.summary, aliases=gi.aliases, )) - logger.info("Added {gi.hgnc} ({gi.summary})".format(gi=gi)) + logger.info("Added {gi.hgnc}: {gi.gene_id} ({gi.summary})".format(gi=gi)) session.commit() @@ -698,7 +699,7 @@ def _fetch_origin_by_name(name): u_tx = usam.Transcript( ac=ti.ac, origin=ori, - hgnc=ti.gene_symbol, + gene_id=ti.gene_id, cds_start_i=cds_start_i, cds_end_i=cds_end_i, cds_md5=cds_md5, From d5833c4b9efe5e9f45b3e626561209f6ca186c1b Mon Sep 17 00:00:00 2001 From: Shane Giles Date: Fri, 12 Apr 2024 16:38:40 -0600 Subject: [PATCH 03/10] feat(IPVC-2264): rename schema rather than export and re-import --- etc/scripts/backfill_gene_id.py | 116 ------------------------- etc/scripts/create-new-schema.sh | 3 +- misc/gene-update/upgrade-uta-schema.sh | 11 +-- 3 files changed, 3 insertions(+), 127 deletions(-) delete mode 100644 etc/scripts/backfill_gene_id.py diff --git a/etc/scripts/backfill_gene_id.py b/etc/scripts/backfill_gene_id.py deleted file mode 100644 index 5961cfd..0000000 --- a/etc/scripts/backfill_gene_id.py +++ /dev/null @@ -1,116 +0,0 @@ -import argparse -import logging - -from datetime import datetime -from sqlalchemy.orm import Session -from sqlalchemy import text - -import uta -from uta.models import Gene, Transcript - - -logger = None -n = 50000 - - -def backfill_gene(uta_session: Session, gene_update_file: str) -> None: - logger.info("Dropping gene table contents") - uta_session.execute(text("DELETE FROM uta.gene;")) - uta_session.commit() - - logger.info(f"Back filling gene table from {gene_update_file}") - now_ts = datetime.now() - i = 0 - new_genes = [] - with open(gene_update_file) as f: - for line in f: - if line.startswith("gene_id"): - continue - - if i % n == 0: - if i > 0: - logger.info(f"Bulk inserting {len(new_genes)} genes") - uta_session.bulk_save_objects(new_genes) - uta_session.commit() - logger.info(f"Processing chunk {int(i/n) + 1}") - new_genes = [] - - gene_id, hgnc, maploc, desc, summary, aliases, added = line.rstrip("\r\n").split("\t") - # set timestamp from file string, if empty set to now. - if added == "": - added_ts = now_ts - else: - added_ts = datetime.strptime(added, "%Y-%m-%d %H:%M:%S.%f") - - # clean up aliases - aliases = aliases.replace("{", "").replace("}", "") - if aliases == "-": - aliases = None - - gene = Gene( - gene_id=gene_id, - hgnc=hgnc, - maploc=maploc if maploc else None, - descr=desc if desc else None, - summary=summary if desc else None, - aliases=aliases if aliases else None, - added=added_ts, - ) - i += 1 - new_genes.append(gene) - - logger.info(f"Bulk inserting {len(new_genes)} genes") - uta_session.bulk_save_objects(new_genes) - uta_session.commit() - logger.info(f"Inserted {i} total genes") - - -def backfill_transcript(uta_session: Session, transcript_update_file: str) -> None: - logger.info("Backfilling gene_id in transcript table") - tx_ac_to_gene_id = {} - - logger.info(f"Reading transcript to gene id mappings from {transcript_update_file}") - with open(transcript_update_file) as f: - for line in f: - if line.startswith("origin"): - continue - _, tx_ac, gene_id, _ = line.rstrip("\r\n").split("\t") - tx_ac_to_gene_id[tx_ac] = gene_id - logger.info(f" - {len(tx_ac_to_gene_id)} mappings read") - - i = 0 - txs = [] - for tx_ac, gene_id in tx_ac_to_gene_id.items(): - if i % n == 0: - if i > 0: - logger.info(f"Updating {len(txs)} transcripts") - uta_session.flush() - - logger.info(f"Processing chunk {int(i/n) + 1}") - txs = [] - - tx = uta_session.query(Transcript).filter(Transcript.ac == tx_ac).one() - tx.gene_id = gene_id - txs.append(tx) - i += 1 - - logger.info(f"Updating {len(txs)} transcripts") - uta_session.flush() - uta_session.commit() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Backfill gene_id in gene and transcript tables") - parser.add_argument("db_url", help="URL of the UTA database") - parser.add_argument("gene_update_file", type=str, help="File containing gene_id updates for gene table") - parser.add_argument("transcript_update_file", type=str, help="File containing gene_id updates for transcript table") - args = parser.parse_args() - - logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') - logger = logging.getLogger("backfill_gene_id") - - session = uta.connect(args.db_url) - - backfill_gene(session, args.gene_update_file) - backfill_transcript(session, args.transcript_update_file) - session.close() diff --git a/etc/scripts/create-new-schema.sh b/etc/scripts/create-new-schema.sh index 5200e8d..d13100a 100755 --- a/etc/scripts/create-new-schema.sh +++ b/etc/scripts/create-new-schema.sh @@ -21,5 +21,4 @@ pg_dump -U uta_admin -h localhost -d uta -n "$source_uta_v" | \ # create new schema gzip -cdq $dumps_dir/"$source_uta_v".pgd.gz | \ sbin/pg-dump-schema-rename "$source_uta_v" "$dest_uta_v" | \ - sbin/pg-dump-schema-rename "uta_1_1" "$dest_uta_v" | \ - psql -U uta_admin -h localhost -d uta -aeE \ No newline at end of file + psql -U uta_admin -h localhost -d uta -aeE diff --git a/misc/gene-update/upgrade-uta-schema.sh b/misc/gene-update/upgrade-uta-schema.sh index 9472fd9..d443141 100644 --- a/misc/gene-update/upgrade-uta-schema.sh +++ b/misc/gene-update/upgrade-uta-schema.sh @@ -50,12 +50,5 @@ python misc/gene-update/backfill_gene_id.py \ # run Alembic migrations to add constraints and update existing views alembic -c etc/alembic.ini upgrade head -## Copy data into destination schema -# dump working schema -pg_dump -U uta_admin -h localhost -d uta -n "$working_uta_v" | \ - gzip -c > "$dumps_dir/$working_uta_v".pgd.gz - -# copy data into destination schema -gzip -cdq "$dumps_dir/$working_uta_v".pgd.gz | \ - sbin/pg-dump-schema-rename "$working_uta_v" "$dest_uta_v" | \ - psql -U uta_admin -h localhost -d uta -aeE +## Rename schema to destination schema name +psql -h localhost -U uta_admin -d uta -c "ALTER SCHEMA uta RENAME TO $dest_uta_v"; From 6c4b3787aec62240c338c25f1bc738faf30a12bc Mon Sep 17 00:00:00 2001 From: Shane Giles Date: Tue, 16 Apr 2024 14:53:39 -0600 Subject: [PATCH 04/10] feat(IPVC-2264): address hgnc to gene_id updates --- .gitignore | 4 ++++ sbin/ncbi-parse-geneinfo | 2 -- sbin/uta-diff | 2 +- sbin/uta-load | 7 ++++--- src/uta/loading.py | 5 ++--- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index b0a2906..e83d7e6 100644 --- a/.gitignore +++ b/.gitignore @@ -58,6 +58,10 @@ misc/partial-alignments sdist tmp update/ncbi/ +seqrepo-data/ +ncbi-data/ +uta-update/ +output/ venv nosetests.xml .vscode diff --git a/sbin/ncbi-parse-geneinfo b/sbin/ncbi-parse-geneinfo index 3dd2534..bc33aad 100755 --- a/sbin/ncbi-parse-geneinfo +++ b/sbin/ncbi-parse-geneinfo @@ -30,8 +30,6 @@ if __name__ == "__main__": giw = GeneInfoWriter(sys.stdout) for rec in gi_in: - if rec["symbol_from_nomenclature_authority"] == "-": - continue gi = GeneInfo( tax_id=rec["tax_id"], gene_symbol=rec["symbol"], diff --git a/sbin/uta-diff b/sbin/uta-diff index 4caa50e..ffca1f5 100755 --- a/sbin/uta-diff +++ b/sbin/uta-diff @@ -14,7 +14,7 @@ cmp_cols = collections.defaultdict(lambda: ['*']) cmp_cols.update({ "associated_accessions": "tx_ac pro_ac origin".split(), "exon_aln": "exon_aln_id tx_exon_id alt_exon_id cigar added".split(), - "gene": "hgnc".split(), + "gene": "gene_id".split(), "transcript": "ac".split(), }) diff --git a/sbin/uta-load b/sbin/uta-load index c3da837..df0f3db 100755 --- a/sbin/uta-load +++ b/sbin/uta-load @@ -32,7 +32,6 @@ etc/scripts/delete-schema.sh "$loading_uta_v" etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v" ## for now set up Alembic for schema migrations -alembic -c etc/alembic.ini stamp edadb97f6502 alembic -c etc/alembic.ini upgrade head # generate seqinfo files from exonsets (this step requires seqrepo) @@ -64,12 +63,14 @@ uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$worki uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$working_dir/exonsets.gz" 2>&1 | \ tee "$log_dir/load-exonsets.log" +# Load seqinfo into the seq and seqanno tables. +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-seqinfo "$working_dir/seqinfo.gz" 2>&1 | \ + tee "$log_dir/load-seqinfo.log" + # Create cigar strings for all rows in tx_alt_exon_pairs_v view and update exon_aln table. uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 | \ tee "$log_dir/align-exons.log" -# Load seqinfo? - ### run diff sbin/uta-diff "$source_uta_v" "$loading_uta_v" diff --git a/src/uta/loading.py b/src/uta/loading.py index 2465607..1962ccd 100644 --- a/src/uta/loading.py +++ b/src/uta/loading.py @@ -705,10 +705,9 @@ def _fetch_origin_by_name(name): cds_md5=cds_md5, ) session.add(u_tx) - if u_tx.hgnc != ti.gene_symbol: - logger.warn("{ti.ac}: HGNC symbol changed from {u_tx.hgnc} to {ti.gene_symbol}".format( + if u_tx.gene_id != ti.gene_id: + logger.warn("{ti.ac}: GeneID changed from {u_tx.gene_id} to {ti.gene_id}".format( u_tx=u_tx, ti=ti)) - u_tx.hgnc = ti.gene_symbol # state: transcript now exists, either existing or freshly-created From 4ed5101118edb8039c3b5617c59246f80aa74c8e Mon Sep 17 00:00:00 2001 From: Shane Giles Date: Tue, 16 Apr 2024 19:50:43 -0600 Subject: [PATCH 05/10] feat(IPVC-2264): update shell script, don't drop hgnc, add type and xrefs to gene --- misc/gene-update/upgrade-uta-schema.sh | 3 +- sbin/uta-extract | 6 +- ...6de7_add_gene_id_to_gene_and_transcript.py | 4 + ...et_gene_id_and_primary_and_foreign_keys.py | 123 ++++++++++++------ src/uta/loading.py | 34 +---- src/uta/models.py | 3 + 6 files changed, 102 insertions(+), 71 deletions(-) diff --git a/misc/gene-update/upgrade-uta-schema.sh b/misc/gene-update/upgrade-uta-schema.sh index d443141..0671780 100644 --- a/misc/gene-update/upgrade-uta-schema.sh +++ b/misc/gene-update/upgrade-uta-schema.sh @@ -21,7 +21,7 @@ dumps_dir="/workdir/dumps" mkdir -p $dumps_dir ## setup working uta schema -# delete destination schema if exists +# delete schema if exists psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $working_uta_v CASCADE;" # dump source version @@ -51,4 +51,5 @@ python misc/gene-update/backfill_gene_id.py \ alembic -c etc/alembic.ini upgrade head ## Rename schema to destination schema name +psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $dest_uta_v CASCADE;" psql -h localhost -U uta_admin -d uta -c "ALTER SCHEMA uta RENAME TO $dest_uta_v"; diff --git a/sbin/uta-extract b/sbin/uta-extract index 2a60f3f..a8d3d60 100755 --- a/sbin/uta-extract +++ b/sbin/uta-extract @@ -38,6 +38,6 @@ sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets tee "$log_dir/filter_exonset_transcripts.log" # move fasta files into same dir -ln $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz $working_dir/ -ln $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz $working_dir/ -ln $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz $working_dir/ +cp -f $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz $working_dir/ +cp -f $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz $working_dir/ +cp -f $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz $working_dir/ diff --git a/src/alembic/versions/595a586e6de7_add_gene_id_to_gene_and_transcript.py b/src/alembic/versions/595a586e6de7_add_gene_id_to_gene_and_transcript.py index 1f6573a..06156bb 100644 --- a/src/alembic/versions/595a586e6de7_add_gene_id_to_gene_and_transcript.py +++ b/src/alembic/versions/595a586e6de7_add_gene_id_to_gene_and_transcript.py @@ -21,6 +21,8 @@ def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.add_column('gene', sa.Column('gene_id', sa.Text(), nullable=True), schema='uta') + op.add_column('gene', sa.Column('type', sa.Text(), nullable=True), schema='uta') + op.add_column('gene', sa.Column('xrefs', sa.Text(), nullable=True), schema='uta') op.add_column('transcript', sa.Column('gene_id', sa.Text(), nullable=True), schema='uta') # ### end Alembic commands ### @@ -32,6 +34,8 @@ def upgrade() -> None: def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.drop_column('transcript', 'gene_id', schema='uta') + op.drop_column('gene', 'xrefs', schema='uta') + op.drop_column('gene', 'type', schema='uta') op.drop_column('gene', 'gene_id', schema='uta') # ### end Alembic commands ### diff --git a/src/alembic/versions/f85dd97bd9f5_set_gene_id_and_primary_and_foreign_keys.py b/src/alembic/versions/f85dd97bd9f5_set_gene_id_and_primary_and_foreign_keys.py index 4669b8b..094d05a 100644 --- a/src/alembic/versions/f85dd97bd9f5_set_gene_id_and_primary_and_foreign_keys.py +++ b/src/alembic/versions/f85dd97bd9f5_set_gene_id_and_primary_and_foreign_keys.py @@ -48,17 +48,25 @@ def upgrade() -> None: ) # ### end Alembic commands ### + # ### handle first part of hgnc -> gene_symbol column rename ### + op.add_column("gene", sa.Column("symbol", sa.Text(), nullable=True), schema="uta") + op.create_index(op.f("ix_uta_gene_symbol"), "gene", ["symbol"], unique=False, schema="uta") + op.execute("UPDATE gene SET symbol = hgnc;") + # ### end of hgnc -> gene_symbol column rename ### + # ### updates required to existing views needed to drop hgnc from transcript. ### - op.execute("""DROP MATERIALIZED VIEW IF EXISTS tx_def_summary_mv CASCADE;""") - op.execute("""DROP VIEW IF EXISTS tx_def_summary_dv CASCADE;""") - op.execute("""DROP MATERIALIZED VIEW IF EXISTS tx_exon_set_summary_mv CASCADE;""") - op.execute("""DROP VIEW IF EXISTS tx_exon_set_summary_dv CASCADE;""") - op.execute("""DROP VIEW IF EXISTS tx_exon_aln_v CASCADE;""") - op.execute("""DROP VIEW IF EXISTS tx_alt_exon_pairs_v CASCADE;""") - op.execute("""DROP VIEW IF EXISTS _discontiguous_tx CASCADE;""") + op.execute("DROP VIEW IF EXISTS tx_similarity_v CASCADE;") + op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_def_summary_mv CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_def_summary_dv CASCADE;") + op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_exon_set_summary_mv CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_exon_set_summary_dv CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_exon_aln_v CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_alt_exon_pairs_v CASCADE;") + op.execute("DROP VIEW IF EXISTS _discontiguous_tx CASCADE;") op.execute(""" CREATE VIEW _discontiguous_tx AS - SELECT g.hgnc, + SELECT g.symbol, + g.symbol as hgnc, g.gene_id, es.exon_set_id, es.tx_ac, @@ -80,11 +88,11 @@ def upgrade() -> None: """) op.execute(""" CREATE VIEW tx_alt_exon_pairs_v AS - SELECT g.hgnc, g.gene_id,TES.exon_SET_id AS tes_exon_SET_id,AES.exon_SET_id AS aes_exon_SET_id, - TES.tx_ac AS tx_ac,AES.alt_ac AS alt_ac,AES.alt_strand,AES.alt_aln_method, - TEX.ORD AS ORD,TEX.exon_id AS tx_exon_id,AEX.exon_id AS alt_exon_id, - TEX.start_i AS tx_start_i,TEX.END_i AS tx_END_i, AEX.start_i AS alt_start_i,AEX.END_i AS alt_END_i, - EA.exon_aln_id,EA.cigar + SELECT g.symbol, g.symbol as hgnc, g.gene_id,TES.exon_SET_id AS tes_exon_SET_id, + AES.exon_SET_id AS aes_exon_SET_id, TES.tx_ac AS tx_ac,AES.alt_ac AS alt_ac, + AES.alt_strand,AES.alt_aln_method, TEX.ORD AS ORD,TEX.exon_id AS tx_exon_id, + AEX.exon_id AS alt_exon_id, TEX.start_i AS tx_start_i,TEX.END_i AS tx_END_i, + AEX.start_i AS alt_start_i, AEX.END_i AS alt_END_i, EA.exon_aln_id,EA.cigar FROM exon_SET tes JOIN transcript t ON tes.tx_ac=t.ac JOIN gene g ON t.gene_id=g.gene_id @@ -95,13 +103,12 @@ def upgrade() -> None: """) op.execute(""" CREATE VIEW tx_exon_aln_v AS - SELECT G.hgnc,G.gene_id,T.ac as tx_ac,AES.alt_ac,AES.alt_aln_method,AES.alt_strand, - TE.ord, TE.start_i as tx_start_i,TE.end_i as tx_end_i, - AE.start_i as alt_start_i, AE.end_i as alt_end_i, - EA.cigar, EA.tx_aseq, EA.alt_aseq, - TES.exon_set_id AS tx_exon_set_id,AES.exon_set_id as alt_exon_set_id, - TE.exon_id as tx_exon_id, AE.exon_id as alt_exon_id, - EA.exon_aln_id + SELECT G.symbol, G.symbol AS hgnc, G.gene_id, T.ac as tx_ac, AES.alt_ac, + AES.alt_aln_method,AES.alt_strand, TE.ord, TE.start_i as tx_start_i, + TE.end_i as tx_end_i, AE.start_i as alt_start_i, AE.end_i as alt_end_i, + EA.cigar, EA.tx_aseq, EA.alt_aseq, TES.exon_set_id AS tx_exon_set_id, + AES.exon_set_id as alt_exon_set_id, TE.exon_id as tx_exon_id, + AE.exon_id as alt_exon_id, EA.exon_aln_id FROM transcript T JOIN gene G ON T.gene_id=G.gene_id JOIN exon_set TES ON T.ac=TES.tx_ac AND TES.alt_aln_method ='transcript' @@ -112,7 +119,8 @@ def upgrade() -> None: """) op.execute(""" CREATE VIEW tx_exon_set_summary_dv AS - SELECT G.hgnc,G.gene_id,cds_md5,es_fingerprint,tx_ac,alt_ac,alt_aln_method,alt_strand,exon_set_id,n_exons,se_i,starts_i,ends_i,lengths + SELECT G.symbol, G.symbol as hgnc, G.gene_id, cds_md5, es_fingerprint, tx_ac, alt_ac, + alt_aln_method, alt_strand, exon_set_id, n_exons, se_i, starts_i, ends_i, lengths FROM transcript T JOIN gene G ON T.gene_id=G.gene_id JOIN exon_set_exons_fp_mv ESE ON T.ac=ESE.tx_ac; @@ -130,7 +138,7 @@ def upgrade() -> None: op.execute(""" CREATE VIEW tx_def_summary_dv AS SELECT TESS.exon_set_id, TESS.tx_ac, TESS.alt_ac, TESS.alt_aln_method, TESS.alt_strand, - TESS.hgnc, TESS.gene_id, TESS.cds_md5, TESS.es_fingerprint, CEF.cds_es_fp, + TESS.symbol, TESS.hgnc, TESS.gene_id, TESS.cds_md5, TESS.es_fingerprint, CEF.cds_es_fp, CEF.cds_exon_lengths_fp, TESS.n_exons, TESS.se_i, CEF.cds_se_i, TESS.starts_i, TESS.ends_i, TESS.lengths, T.cds_start_i, T.cds_end_i, CEF.cds_start_exon, CEF.cds_end_exon FROM tx_exon_set_summary_mv TESS @@ -144,10 +152,28 @@ def upgrade() -> None: CREATE INDEX tx_def_summary_mv_alt_ac ON tx_def_summary_mv (alt_ac); CREATE INDEX tx_def_summary_mv_alt_aln_method ON tx_def_summary_mv (alt_aln_method); CREATE INDEX tx_def_summary_mv_hgnc ON tx_def_summary_mv (hgnc); + CREATE INDEX tx_def_summary_mv_symbol ON tx_def_summary_mv (symbol); CREATE INDEX tx_def_summary_mv_gene_id ON tx_def_summary_mv (gene_id); REFRESH MATERIALIZED VIEW tx_def_summary_mv; """) - + op.execute(""" + CREATE VIEW tx_similarity_v AS + SELECT DISTINCT + D1.tx_ac as tx_ac1, D2.tx_ac as tx_ac2, + D1.symbol = D2.symbol as symbol_eq, + D1.cds_md5=D2.cds_md5 as cds_eq, + D1.es_fingerprint=D2.es_fingerprint as es_fp_eq, + D1.cds_es_fp=D2.cds_es_fp as cds_es_fp_eq, + D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp as cds_exon_lengths_fp_eq + FROM tx_def_summary_mv D1 + JOIN tx_def_summary_mv D2 on (D1.tx_ac != D2.tx_ac + and (D1.symbol=D2.symbol + or D1.cds_md5=D2.cds_md5 + or D1.es_fingerprint=D2.es_fingerprint + or D1.cds_es_fp=D2.cds_es_fp + or D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp + )); + """) # ### end of updates to existing views ### # ### drop hgnc from transcript ### @@ -161,13 +187,14 @@ def downgrade() -> None: # ### end of updates to transcript ### # ### commands to downgrade views before adding hgnc to transcript ### - op.execute("""DROP MATERIALIZED VIEW IF EXISTS tx_def_summary_mv CASCADE;""") - op.execute("""DROP VIEW IF EXISTS tx_def_summary_dv CASCADE;""") - op.execute("""DROP MATERIALIZED VIEW IF EXISTS tx_exon_set_summary_mv CASCADE;""") - op.execute("""DROP VIEW IF EXISTS tx_exon_set_summary_dv CASCADE;""") - op.execute("""DROP VIEW IF EXISTS tx_exon_aln_v CASCADE;""") - op.execute("""DROP VIEW IF EXISTS tx_alt_exon_pairs_v CASCADE;""") - op.execute("""DROP VIEW IF EXISTS _discontiguous_tx CASCADE;""") + op.execute("DROP VIEW IF EXISTS tx_similarity_v CASCADE;") + op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_def_summary_mv CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_def_summary_dv CASCADE;") + op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_exon_set_summary_mv CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_exon_set_summary_dv CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_exon_aln_v CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_alt_exon_pairs_v CASCADE;") + op.execute("DROP VIEW IF EXISTS _discontiguous_tx CASCADE;") op.execute(""" CREATE VIEW _discontiguous_tx AS SELECT t.hgnc, @@ -253,19 +280,39 @@ def downgrade() -> None: CREATE INDEX tx_def_summary_mv_hgnc ON tx_def_summary_mv (hgnc); REFRESH MATERIALIZED VIEW tx_def_summary_mv; """) + op.execute(""" + CREATE VIEW tx_similarity_v AS + SELECT DISTINCT + D1.tx_ac as tx_ac1, D2.tx_ac as tx_ac2, + D1.hgnc = D2.hgnc as hgnc_eq, + D1.cds_md5=D2.cds_md5 as cds_eq, + D1.es_fingerprint=D2.es_fingerprint as es_fp_eq, + D1.cds_es_fp=D2.cds_es_fp as cds_es_fp_eq, + D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp as cds_exon_lengths_fp_eq + FROM tx_def_summary_mv D1 + JOIN tx_def_summary_mv D2 on (D1.tx_ac != D2.tx_ac + and (D1.hgnc=D2.hgnc + or D1.cds_md5=D2.cds_md5 + or D1.es_fingerprint=D2.es_fingerprint + or D1.cds_es_fp=D2.cds_es_fp + or D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp + )); + """) # ### end of updates to views ### # ### commands auto generated by Alembic - please adjust! ### - op.drop_constraint('fk_uta_transcript_gene_gene_id', 'transcript', schema='uta', type_='foreignkey') - op.drop_index(op.f('ix_uta_transcript_gene_id'), table_name='transcript', schema='uta') - op.alter_column('transcript', 'gene_id', + op.drop_constraint("fk_uta_transcript_gene_gene_id", "transcript", schema="uta", type_="foreignkey") + op.drop_index(op.f("ix_uta_transcript_gene_id"), table_name="transcript", schema="uta") + op.alter_column("transcript", "gene_id", existing_type=sa.TEXT(), nullable=True, - schema='uta') - op.drop_index(op.f('ix_uta_gene_hgnc'), table_name='gene', schema='uta') - op.drop_constraint('gene_pkey', 'gene', schema='uta') - op.alter_column('gene', 'gene_id', + schema="uta") + op.drop_index(op.f("ix_uta_gene_hgnc"), table_name="gene", schema="uta") + op.drop_constraint("gene_pkey", "gene", schema="uta") + op.alter_column("gene", "gene_id", existing_type=sa.TEXT(), nullable=True, - schema='uta') + schema="uta") + op.drop_index(op.f("ix_uta_gene_symbol"), table_name="gene", schema="uta") + op.drop_column("gene", "symbol", schema="uta") # ### end Alembic commands ### diff --git a/src/uta/loading.py b/src/uta/loading.py index 1962ccd..582c200 100644 --- a/src/uta/loading.py +++ b/src/uta/loading.py @@ -337,40 +337,16 @@ def load_geneinfo(session, opts, cf): session.merge( usam.Gene( gene_id=gi.gene_id, - hgnc=gi.hgnc, + hgnc=gi.gene_symbol, + symbol=gi.gene_symbol, maploc=gi.maploc, descr=gi.descr, summary=gi.summary, aliases=gi.aliases, + type=gi.type, + xrefs=gi.xrefs, )) - logger.info("Added {gi.hgnc}: {gi.gene_id} ({gi.summary})".format(gi=gi)) - session.commit() - - -def load_ncbi_geneinfo(session, opts, cf): - """ - import data as downloaded (by you) from - ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz - """ - - session.execute(text("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role")))) - session.execute(text("set search_path = " + usam.schema_name)) - - gip = uta.parsers.geneinfo.GeneInfoParser(gzip.open(opts["FILE"], 'rt')) - for gi in gip: - if gi["tax_id"] != "9606" or gi["Symbol_from_nomenclature_authority"] == "-": - continue - g = usam.Gene( - gene_id=gi["GeneID"], - hgnc=gi["Symbol_from_nomenclature_authority"], - maploc=gi["map_location"], - descr=gi["Full_name_from_nomenclature_authority"], - aliases=gi["Synonyms"], - strand=gi[""], - ) - session.add(g) - logger.info("loaded gene {g.hgnc} ({g.descr})".format(g=g)) + logger.info("Added {gi.gene_symbol}: {gi.gene_id} ({gi.summary})".format(gi=gi)) session.commit() diff --git a/src/uta/models.py b/src/uta/models.py index 2b64741..58476d9 100644 --- a/src/uta/models.py +++ b/src/uta/models.py @@ -100,12 +100,15 @@ class Gene(Base): # columns: gene_id = sa.Column(sa.Text, primary_key=True) hgnc = sa.Column(sa.Text, nullable=False, index=True) + symbol = sa.Column(sa.Text, nullable=False, index=True) maploc = sa.Column(sa.Text) descr = sa.Column(sa.Text) summary = sa.Column(sa.Text) aliases = sa.Column(sa.Text) added = sa.Column( sa.DateTime, nullable=False, default=datetime.datetime.now()) + type = sa.Column(sa.Text) + xrefs = sa.Column(sa.Text) # methods: From 807b8e673e7b72dd66738f37fdeff0f8a4d72fcf Mon Sep 17 00:00:00 2001 From: Shane Giles Date: Tue, 16 Apr 2024 21:27:52 -0600 Subject: [PATCH 06/10] feat(IPVC-2264): update .gitignore --- .gitignore | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.gitignore b/.gitignore index e83d7e6..d7fefb9 100644 --- a/.gitignore +++ b/.gitignore @@ -57,11 +57,6 @@ misc/ncbi-geneinfo/ncbi.geneinfo.tsv.gz misc/partial-alignments sdist tmp -update/ncbi/ -seqrepo-data/ -ncbi-data/ -uta-update/ -output/ venv nosetests.xml .vscode From 6a18461d6f4c7e6a1765f2000bae261773c6262f Mon Sep 17 00:00:00 2001 From: Shane Giles Date: Wed, 17 Apr 2024 11:12:19 -0600 Subject: [PATCH 07/10] feat(IPVC-2264): fix broken unit tests --- tests/test_uta_loading.py | 20 ++++++++++++++++++-- tests/test_uta_models.py | 4 +++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/tests/test_uta_loading.py b/tests/test_uta_loading.py index 1fbd029..1500863 100644 --- a/tests/test_uta_loading.py +++ b/tests/test_uta_loading.py @@ -48,11 +48,27 @@ def test_load_assoc_ac(self): ) self.session.add(o1) + # insert genes required for transcripts + g1 = usam.Gene( + gene_id='49', + hgnc='ACR', + symbol='ACR', + descr='acrosin', + ) + g2 = usam.Gene( + gene_id=50, + hgnc='ACO2', + symbol='ACO2', + descr='aconitase 2', + ) + self.session.add(g1) + self.session.add(g2) + # insert transcripts referenced in data file t1 = usam.Transcript( ac='NM_001097.3', origin=o1, - hgnc='ACR', + gene_id=g1.gene_id, cds_start_i=0, cds_end_i=1, cds_md5='a', @@ -60,7 +76,7 @@ def test_load_assoc_ac(self): t2 = usam.Transcript( ac='NM_001098.3', origin=o1, - hgnc='ACO2', + gene_id=g2.gene_id, cds_start_i=2, cds_end_i=3, cds_md5='b', diff --git a/tests/test_uta_models.py b/tests/test_uta_models.py index 3d81cb4..d221d16 100644 --- a/tests/test_uta_models.py +++ b/tests/test_uta_models.py @@ -77,7 +77,9 @@ def setUpClass(cls): cls.session.add(o) g = usam.Gene( + gene_id='148', hgnc='ADRA1A', + symbol='ADRA1A', maploc='8p21.2', descr='adrenoceptor alpha 1A', summary='''Alpha-1-adrenergic receptors (alpha-1-ARs) are @@ -121,7 +123,7 @@ def setUpClass(cls): t = usam.Transcript( ac=ac, origin=o, - hgnc=g.hgnc, + gene_id=g.gene_id, cds_start_i=tx_info['t_cds_start_i'], cds_end_i=tx_info['t_cds_end_i'], cds_md5='d41d8cd98f00b204e9800998ecf8427e', From 708844f66eec0a771c5eebc9b61722d9d411071c Mon Sep 17 00:00:00 2001 From: Shane Giles Date: Wed, 17 Apr 2024 12:58:07 -0600 Subject: [PATCH 08/10] feat(IPVC-2264): update unittest for genes with new fields --- tests/test_uta_loading.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_uta_loading.py b/tests/test_uta_loading.py index 1500863..711d714 100644 --- a/tests/test_uta_loading.py +++ b/tests/test_uta_loading.py @@ -53,13 +53,23 @@ def test_load_assoc_ac(self): gene_id='49', hgnc='ACR', symbol='ACR', + maploc='22q13.33', descr='acrosin', + summary='acrosin', + aliases='SPGF87', + type='protein-coding', + xrefs='MIM:102480,HGNC:HGNC:126,Ensembl:ENSG00000100312,AllianceGenome:HGNC:126', ) g2 = usam.Gene( gene_id=50, hgnc='ACO2', symbol='ACO2', + maploc='22q13.2', descr='aconitase 2', + summary='aconitase 2', + aliases='ACONM,HEL-S-284,ICRD,OCA8,OPA9', + type='protein-coding', + xrefs='MIM:100850,HGNC:HGNC:118,Ensembl:ENSG00000100412,AllianceGenome:HGNC:118', ) self.session.add(g1) self.session.add(g2) From c92075975ac10f6e4dbd19251699c242f5ba52f1 Mon Sep 17 00:00:00 2001 From: Shane Giles Date: Thu, 18 Apr 2024 12:13:06 -0600 Subject: [PATCH 09/10] feat(IPVC-2264): address PR comments. Gene.hgnc should come from HGNC value in intermediate file, and transcript to gene id changes should raise exception --- .gitignore | 4 ++++ src/uta/loading.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index d7fefb9..5f05187 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,7 @@ tmp venv nosetests.xml .vscode +output/ +uta-update/ +seqrepo-data/ +ncbi-data/ \ No newline at end of file diff --git a/src/uta/loading.py b/src/uta/loading.py index 582c200..97d3306 100644 --- a/src/uta/loading.py +++ b/src/uta/loading.py @@ -337,7 +337,7 @@ def load_geneinfo(session, opts, cf): session.merge( usam.Gene( gene_id=gi.gene_id, - hgnc=gi.gene_symbol, + hgnc=gi.hgnc, symbol=gi.gene_symbol, maploc=gi.maploc, descr=gi.descr, @@ -682,7 +682,7 @@ def _fetch_origin_by_name(name): ) session.add(u_tx) if u_tx.gene_id != ti.gene_id: - logger.warn("{ti.ac}: GeneID changed from {u_tx.gene_id} to {ti.gene_id}".format( + raise Exception("{ti.ac}: GeneID changed from {u_tx.gene_id} to {ti.gene_id}".format( u_tx=u_tx, ti=ti)) # state: transcript now exists, either existing or freshly-created From 06d60d627a19690523de5f54ac1730cd01d6d52e Mon Sep 17 00:00:00 2001 From: Shane Giles Date: Thu, 18 Apr 2024 12:14:56 -0600 Subject: [PATCH 10/10] feat(IPVC-2264): fix .gitignore --- .gitignore | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.gitignore b/.gitignore index 5f05187..d7fefb9 100644 --- a/.gitignore +++ b/.gitignore @@ -60,7 +60,3 @@ tmp venv nosetests.xml .vscode -output/ -uta-update/ -seqrepo-data/ -ncbi-data/ \ No newline at end of file