Skip to content

Commit

Permalink
feat(IPVC-2264): update shell script, don't drop hgnc, add type and xrefs to gene
Browse files Browse the repository at this point in the history
  • Loading branch information
bsgiles73 committed Apr 17, 2024
1 parent 6c4b378 commit 4ed5101
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 71 deletions.
3 changes: 2 additions & 1 deletion misc/gene-update/upgrade-uta-schema.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ dumps_dir="/workdir/dumps"
mkdir -p $dumps_dir

## setup working uta schema
# delete destination schema if exists
# delete schema if exists
psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $working_uta_v CASCADE;"

# dump source version
Expand Down Expand Up @@ -51,4 +51,5 @@ python misc/gene-update/backfill_gene_id.py \
alembic -c etc/alembic.ini upgrade head

## Rename schema to destination schema name
# Drop any previous destination schema and rename the freshly upgraded one in
# a single transaction: if the rename fails, the drop is rolled back instead
# of leaving the database without a $dest_uta_v schema.
psql -h localhost -U uta_admin -d uta --single-transaction \
    -c "DROP SCHEMA IF EXISTS $dest_uta_v CASCADE;" \
    -c "ALTER SCHEMA uta RENAME TO $dest_uta_v;"
6 changes: 3 additions & 3 deletions sbin/uta-extract
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,6 @@ sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets
tee "$log_dir/filter_exonset_transcripts.log"

# move fasta files into same dir
ln $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz $working_dir/
ln $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz $working_dir/
ln $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz $working_dir/
cp -f $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz $working_dir/
cp -f $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz $working_dir/
cp -f $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz $working_dir/
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
def upgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.add_column('gene', sa.Column('gene_id', sa.Text(), nullable=True), schema='uta')
op.add_column('gene', sa.Column('type', sa.Text(), nullable=True), schema='uta')
op.add_column('gene', sa.Column('xrefs', sa.Text(), nullable=True), schema='uta')
op.add_column('transcript', sa.Column('gene_id', sa.Text(), nullable=True), schema='uta')
# ### end Alembic commands ###

Expand All @@ -32,6 +34,8 @@ def upgrade() -> None:
def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.drop_column('transcript', 'gene_id', schema='uta')
op.drop_column('gene', 'xrefs', schema='uta')
op.drop_column('gene', 'type', schema='uta')
op.drop_column('gene', 'gene_id', schema='uta')
# ### end Alembic commands ###

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,25 @@ def upgrade() -> None:
)
# ### end Alembic commands ###

# ### handle first part of hgnc -> gene_symbol column rename ###
op.add_column("gene", sa.Column("symbol", sa.Text(), nullable=True), schema="uta")
op.create_index(op.f("ix_uta_gene_symbol"), "gene", ["symbol"], unique=False, schema="uta")
op.execute("UPDATE gene SET symbol = hgnc;")
# ### end of hgnc -> gene_symbol column rename ###

# ### updates required to existing views needed to drop hgnc from transcript. ###
op.execute("""DROP MATERIALIZED VIEW IF EXISTS tx_def_summary_mv CASCADE;""")
op.execute("""DROP VIEW IF EXISTS tx_def_summary_dv CASCADE;""")
op.execute("""DROP MATERIALIZED VIEW IF EXISTS tx_exon_set_summary_mv CASCADE;""")
op.execute("""DROP VIEW IF EXISTS tx_exon_set_summary_dv CASCADE;""")
op.execute("""DROP VIEW IF EXISTS tx_exon_aln_v CASCADE;""")
op.execute("""DROP VIEW IF EXISTS tx_alt_exon_pairs_v CASCADE;""")
op.execute("""DROP VIEW IF EXISTS _discontiguous_tx CASCADE;""")
op.execute("DROP VIEW IF EXISTS tx_similarity_v CASCADE;")
op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_def_summary_mv CASCADE;")
op.execute("DROP VIEW IF EXISTS tx_def_summary_dv CASCADE;")
op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_exon_set_summary_mv CASCADE;")
op.execute("DROP VIEW IF EXISTS tx_exon_set_summary_dv CASCADE;")
op.execute("DROP VIEW IF EXISTS tx_exon_aln_v CASCADE;")
op.execute("DROP VIEW IF EXISTS tx_alt_exon_pairs_v CASCADE;")
op.execute("DROP VIEW IF EXISTS _discontiguous_tx CASCADE;")
op.execute("""
CREATE VIEW _discontiguous_tx AS
SELECT g.hgnc,
SELECT g.symbol,
g.symbol as hgnc,
g.gene_id,
es.exon_set_id,
es.tx_ac,
Expand All @@ -80,11 +88,11 @@ def upgrade() -> None:
""")
op.execute("""
CREATE VIEW tx_alt_exon_pairs_v AS
SELECT g.hgnc, g.gene_id,TES.exon_SET_id AS tes_exon_SET_id,AES.exon_SET_id AS aes_exon_SET_id,
TES.tx_ac AS tx_ac,AES.alt_ac AS alt_ac,AES.alt_strand,AES.alt_aln_method,
TEX.ORD AS ORD,TEX.exon_id AS tx_exon_id,AEX.exon_id AS alt_exon_id,
TEX.start_i AS tx_start_i,TEX.END_i AS tx_END_i, AEX.start_i AS alt_start_i,AEX.END_i AS alt_END_i,
EA.exon_aln_id,EA.cigar
SELECT g.symbol, g.symbol as hgnc, g.gene_id,TES.exon_SET_id AS tes_exon_SET_id,
AES.exon_SET_id AS aes_exon_SET_id, TES.tx_ac AS tx_ac,AES.alt_ac AS alt_ac,
AES.alt_strand,AES.alt_aln_method, TEX.ORD AS ORD,TEX.exon_id AS tx_exon_id,
AEX.exon_id AS alt_exon_id, TEX.start_i AS tx_start_i,TEX.END_i AS tx_END_i,
AEX.start_i AS alt_start_i, AEX.END_i AS alt_END_i, EA.exon_aln_id,EA.cigar
FROM exon_SET tes
JOIN transcript t ON tes.tx_ac=t.ac
JOIN gene g ON t.gene_id=g.gene_id
Expand All @@ -95,13 +103,12 @@ def upgrade() -> None:
""")
op.execute("""
CREATE VIEW tx_exon_aln_v AS
SELECT G.hgnc,G.gene_id,T.ac as tx_ac,AES.alt_ac,AES.alt_aln_method,AES.alt_strand,
TE.ord, TE.start_i as tx_start_i,TE.end_i as tx_end_i,
AE.start_i as alt_start_i, AE.end_i as alt_end_i,
EA.cigar, EA.tx_aseq, EA.alt_aseq,
TES.exon_set_id AS tx_exon_set_id,AES.exon_set_id as alt_exon_set_id,
TE.exon_id as tx_exon_id, AE.exon_id as alt_exon_id,
EA.exon_aln_id
SELECT G.symbol, G.symbol AS hgnc, G.gene_id, T.ac as tx_ac, AES.alt_ac,
AES.alt_aln_method,AES.alt_strand, TE.ord, TE.start_i as tx_start_i,
TE.end_i as tx_end_i, AE.start_i as alt_start_i, AE.end_i as alt_end_i,
EA.cigar, EA.tx_aseq, EA.alt_aseq, TES.exon_set_id AS tx_exon_set_id,
AES.exon_set_id as alt_exon_set_id, TE.exon_id as tx_exon_id,
AE.exon_id as alt_exon_id, EA.exon_aln_id
FROM transcript T
JOIN gene G ON T.gene_id=G.gene_id
JOIN exon_set TES ON T.ac=TES.tx_ac AND TES.alt_aln_method ='transcript'
Expand All @@ -112,7 +119,8 @@ def upgrade() -> None:
""")
op.execute("""
CREATE VIEW tx_exon_set_summary_dv AS
SELECT G.hgnc,G.gene_id,cds_md5,es_fingerprint,tx_ac,alt_ac,alt_aln_method,alt_strand,exon_set_id,n_exons,se_i,starts_i,ends_i,lengths
SELECT G.symbol, G.symbol as hgnc, G.gene_id, cds_md5, es_fingerprint, tx_ac, alt_ac,
alt_aln_method, alt_strand, exon_set_id, n_exons, se_i, starts_i, ends_i, lengths
FROM transcript T
JOIN gene G ON T.gene_id=G.gene_id
JOIN exon_set_exons_fp_mv ESE ON T.ac=ESE.tx_ac;
Expand All @@ -130,7 +138,7 @@ def upgrade() -> None:
op.execute("""
CREATE VIEW tx_def_summary_dv AS
SELECT TESS.exon_set_id, TESS.tx_ac, TESS.alt_ac, TESS.alt_aln_method, TESS.alt_strand,
TESS.hgnc, TESS.gene_id, TESS.cds_md5, TESS.es_fingerprint, CEF.cds_es_fp,
TESS.symbol, TESS.hgnc, TESS.gene_id, TESS.cds_md5, TESS.es_fingerprint, CEF.cds_es_fp,
CEF.cds_exon_lengths_fp, TESS.n_exons, TESS.se_i, CEF.cds_se_i, TESS.starts_i,
TESS.ends_i, TESS.lengths, T.cds_start_i, T.cds_end_i, CEF.cds_start_exon, CEF.cds_end_exon
FROM tx_exon_set_summary_mv TESS
Expand All @@ -144,10 +152,28 @@ def upgrade() -> None:
CREATE INDEX tx_def_summary_mv_alt_ac ON tx_def_summary_mv (alt_ac);
CREATE INDEX tx_def_summary_mv_alt_aln_method ON tx_def_summary_mv (alt_aln_method);
CREATE INDEX tx_def_summary_mv_hgnc ON tx_def_summary_mv (hgnc);
CREATE INDEX tx_def_summary_mv_symbol ON tx_def_summary_mv (symbol);
CREATE INDEX tx_def_summary_mv_gene_id ON tx_def_summary_mv (gene_id);
REFRESH MATERIALIZED VIEW tx_def_summary_mv;
""")

op.execute("""
CREATE VIEW tx_similarity_v AS
SELECT DISTINCT
D1.tx_ac as tx_ac1, D2.tx_ac as tx_ac2,
D1.symbol = D2.symbol as symbol_eq,
D1.cds_md5=D2.cds_md5 as cds_eq,
D1.es_fingerprint=D2.es_fingerprint as es_fp_eq,
D1.cds_es_fp=D2.cds_es_fp as cds_es_fp_eq,
D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp as cds_exon_lengths_fp_eq
FROM tx_def_summary_mv D1
JOIN tx_def_summary_mv D2 on (D1.tx_ac != D2.tx_ac
and (D1.symbol=D2.symbol
or D1.cds_md5=D2.cds_md5
or D1.es_fingerprint=D2.es_fingerprint
or D1.cds_es_fp=D2.cds_es_fp
or D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp
));
""")
# ### end of updates to existing views ###

# ### drop hgnc from transcript ###
Expand All @@ -161,13 +187,14 @@ def downgrade() -> None:
# ### end of updates to transcript ###

# ### commands to downgrade views before adding hgnc to transcript ###
op.execute("""DROP MATERIALIZED VIEW IF EXISTS tx_def_summary_mv CASCADE;""")
op.execute("""DROP VIEW IF EXISTS tx_def_summary_dv CASCADE;""")
op.execute("""DROP MATERIALIZED VIEW IF EXISTS tx_exon_set_summary_mv CASCADE;""")
op.execute("""DROP VIEW IF EXISTS tx_exon_set_summary_dv CASCADE;""")
op.execute("""DROP VIEW IF EXISTS tx_exon_aln_v CASCADE;""")
op.execute("""DROP VIEW IF EXISTS tx_alt_exon_pairs_v CASCADE;""")
op.execute("""DROP VIEW IF EXISTS _discontiguous_tx CASCADE;""")
op.execute("DROP VIEW IF EXISTS tx_similarity_v CASCADE;")
op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_def_summary_mv CASCADE;")
op.execute("DROP VIEW IF EXISTS tx_def_summary_dv CASCADE;")
op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_exon_set_summary_mv CASCADE;")
op.execute("DROP VIEW IF EXISTS tx_exon_set_summary_dv CASCADE;")
op.execute("DROP VIEW IF EXISTS tx_exon_aln_v CASCADE;")
op.execute("DROP VIEW IF EXISTS tx_alt_exon_pairs_v CASCADE;")
op.execute("DROP VIEW IF EXISTS _discontiguous_tx CASCADE;")
op.execute("""
CREATE VIEW _discontiguous_tx AS
SELECT t.hgnc,
Expand Down Expand Up @@ -253,19 +280,39 @@ def downgrade() -> None:
CREATE INDEX tx_def_summary_mv_hgnc ON tx_def_summary_mv (hgnc);
REFRESH MATERIALIZED VIEW tx_def_summary_mv;
""")
op.execute("""
CREATE VIEW tx_similarity_v AS
SELECT DISTINCT
D1.tx_ac as tx_ac1, D2.tx_ac as tx_ac2,
D1.hgnc = D2.hgnc as hgnc_eq,
D1.cds_md5=D2.cds_md5 as cds_eq,
D1.es_fingerprint=D2.es_fingerprint as es_fp_eq,
D1.cds_es_fp=D2.cds_es_fp as cds_es_fp_eq,
D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp as cds_exon_lengths_fp_eq
FROM tx_def_summary_mv D1
JOIN tx_def_summary_mv D2 on (D1.tx_ac != D2.tx_ac
and (D1.hgnc=D2.hgnc
or D1.cds_md5=D2.cds_md5
or D1.es_fingerprint=D2.es_fingerprint
or D1.cds_es_fp=D2.cds_es_fp
or D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp
));
""")
# ### end of updates to views ###

# ### commands auto generated by Alembic - please adjust! ###
op.drop_constraint('fk_uta_transcript_gene_gene_id', 'transcript', schema='uta', type_='foreignkey')
op.drop_index(op.f('ix_uta_transcript_gene_id'), table_name='transcript', schema='uta')
op.alter_column('transcript', 'gene_id',
op.drop_constraint("fk_uta_transcript_gene_gene_id", "transcript", schema="uta", type_="foreignkey")
op.drop_index(op.f("ix_uta_transcript_gene_id"), table_name="transcript", schema="uta")
op.alter_column("transcript", "gene_id",
existing_type=sa.TEXT(),
nullable=True,
schema='uta')
op.drop_index(op.f('ix_uta_gene_hgnc'), table_name='gene', schema='uta')
op.drop_constraint('gene_pkey', 'gene', schema='uta')
op.alter_column('gene', 'gene_id',
schema="uta")
op.drop_index(op.f("ix_uta_gene_hgnc"), table_name="gene", schema="uta")
op.drop_constraint("gene_pkey", "gene", schema="uta")
op.alter_column("gene", "gene_id",
existing_type=sa.TEXT(),
nullable=True,
schema='uta')
schema="uta")
op.drop_index(op.f("ix_uta_gene_symbol"), table_name="gene", schema="uta")
op.drop_column("gene", "symbol", schema="uta")
# ### end Alembic commands ###
34 changes: 5 additions & 29 deletions src/uta/loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,40 +337,16 @@ def load_geneinfo(session, opts, cf):
session.merge(
usam.Gene(
gene_id=gi.gene_id,
hgnc=gi.hgnc,
hgnc=gi.gene_symbol,
symbol=gi.gene_symbol,
maploc=gi.maploc,
descr=gi.descr,
summary=gi.summary,
aliases=gi.aliases,
type=gi.type,
xrefs=gi.xrefs,
))
logger.info("Added {gi.hgnc}: {gi.gene_id} ({gi.summary})".format(gi=gi))
session.commit()


def load_ncbi_geneinfo(session, opts, cf):
"""
import data as downloaded (by you) from
ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
"""

session.execute(text("set role {admin_role};".format(
admin_role=cf.get("uta", "admin_role"))))
session.execute(text("set search_path = " + usam.schema_name))

gip = uta.parsers.geneinfo.GeneInfoParser(gzip.open(opts["FILE"], 'rt'))
for gi in gip:
if gi["tax_id"] != "9606" or gi["Symbol_from_nomenclature_authority"] == "-":
continue
g = usam.Gene(
gene_id=gi["GeneID"],
hgnc=gi["Symbol_from_nomenclature_authority"],
maploc=gi["map_location"],
descr=gi["Full_name_from_nomenclature_authority"],
aliases=gi["Synonyms"],
strand=gi[""],
)
session.add(g)
logger.info("loaded gene {g.hgnc} ({g.descr})".format(g=g))
logger.info("Added {gi.gene_symbol}: {gi.gene_id} ({gi.summary})".format(gi=gi))
session.commit()


Expand Down
3 changes: 3 additions & 0 deletions src/uta/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,15 @@ class Gene(Base):
# columns:
gene_id = sa.Column(sa.Text, primary_key=True)
hgnc = sa.Column(sa.Text, nullable=False, index=True)
symbol = sa.Column(sa.Text, nullable=False, index=True)
maploc = sa.Column(sa.Text)
descr = sa.Column(sa.Text)
summary = sa.Column(sa.Text)
aliases = sa.Column(sa.Text)
# Pass the callable, not its result: `datetime.datetime.now()` would be
# evaluated once at import time and that single frozen timestamp reused as the
# default for every row; the bare callable is invoked by SQLAlchemy per INSERT.
added = sa.Column(
    sa.DateTime, nullable=False, default=datetime.datetime.now)
type = sa.Column(sa.Text)
xrefs = sa.Column(sa.Text)

# methods:

Expand Down

0 comments on commit 4ed5101

Please sign in to comment.