Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(IPVC-2264) add gene_id to UTA models #24

Merged
merged 11 commits into from
Apr 18, 2024
Merged
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ misc/ncbi-geneinfo/ncbi.geneinfo.tsv.gz
misc/partial-alignments
sdist
tmp
update/ncbi/
bsgiles73 marked this conversation as resolved.
Show resolved Hide resolved
venv
nosetests.xml
.vscode
3 changes: 1 addition & 2 deletions etc/scripts/create-new-schema.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,4 @@ pg_dump -U uta_admin -h localhost -d uta -n "$source_uta_v" | \
# create new schema
gzip -cdq $dumps_dir/"$source_uta_v".pgd.gz | \
sbin/pg-dump-schema-rename "$source_uta_v" "$dest_uta_v" | \
sbin/pg-dump-schema-rename "uta_1_1" "$dest_uta_v" | \
bsgiles73 marked this conversation as resolved.
Show resolved Hide resolved
psql -U uta_admin -h localhost -d uta -aeE
psql -U uta_admin -h localhost -d uta -aeE
116 changes: 116 additions & 0 deletions misc/gene-update/backfill_gene_id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import argparse
import logging

from datetime import datetime
from sqlalchemy.orm import Session
from sqlalchemy import text

import uta
from uta.models import Gene, Transcript


logger = None
n = 50000


def backfill_gene(uta_session: Session, gene_update_file: str) -> None:
logger.info("Dropping gene table contents")
uta_session.execute(text("DELETE FROM uta.gene;"))
uta_session.commit()

logger.info(f"Back filling gene table from {gene_update_file}")
now_ts = datetime.now()
i = 0
new_genes = []
with open(gene_update_file) as f:
for line in f:
if line.startswith("gene_id"):
continue

if i % n == 0:
if i > 0:
logger.info(f"Bulk inserting {len(new_genes)} genes")
uta_session.bulk_save_objects(new_genes)
uta_session.commit()
logger.info(f"Processing chunk {int(i/n) + 1}")
new_genes = []

gene_id, hgnc, maploc, desc, summary, aliases, added = line.rstrip("\r\n").split("\t")
# set timestamp from file string, if empty set to now.
if added == "":
added_ts = now_ts
else:
added_ts = datetime.strptime(added, "%Y-%m-%d %H:%M:%S.%f")

# clean up aliases
aliases = aliases.replace("{", "").replace("}", "")
if aliases == "-":
aliases = None

gene = Gene(
gene_id=gene_id,
hgnc=hgnc,
maploc=maploc if maploc else None,
descr=desc if desc else None,
summary=summary if desc else None,
aliases=aliases if aliases else None,
added=added_ts,
)
i += 1
new_genes.append(gene)

logger.info(f"Bulk inserting {len(new_genes)} genes")
uta_session.bulk_save_objects(new_genes)
uta_session.commit()
logger.info(f"Inserted {i} total genes")


def backfill_transcript(uta_session: Session, transcript_update_file: str) -> None:
logger.info("Backfilling gene_id in transcript table")
tx_ac_to_gene_id = {}

logger.info(f"Reading transcript to gene id mappings from {transcript_update_file}")
with open(transcript_update_file) as f:
for line in f:
if line.startswith("origin"):
continue
_, tx_ac, gene_id, _ = line.rstrip("\r\n").split("\t")
tx_ac_to_gene_id[tx_ac] = gene_id
logger.info(f" - {len(tx_ac_to_gene_id)} mappings read")

i = 0
txs = []
for tx_ac, gene_id in tx_ac_to_gene_id.items():
if i % n == 0:
if i > 0:
logger.info(f"Updating {len(txs)} transcripts")
uta_session.flush()

logger.info(f"Processing chunk {int(i/n) + 1}")
txs = []

tx = uta_session.query(Transcript).filter(Transcript.ac == tx_ac).one()
tx.gene_id = gene_id
txs.append(tx)
i += 1

logger.info(f"Updating {len(txs)} transcripts")
uta_session.flush()
uta_session.commit()


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Backfill gene_id in gene and transcript tables")
parser.add_argument("db_url", help="URL of the UTA database")
parser.add_argument("gene_update_file", type=str, help="File containing gene_id updates for gene table")
parser.add_argument("transcript_update_file", type=str, help="File containing gene_id updates for transcript table")
args = parser.parse_args()

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
logger = logging.getLogger("backfill_gene_id")

session = uta.connect(args.db_url)

backfill_gene(session, args.gene_update_file)
backfill_transcript(session, args.transcript_update_file)
session.close()
55 changes: 55 additions & 0 deletions misc/gene-update/upgrade-uta-schema.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/bin/bash

# This script is used to upgrade older UTA schemas (specifically uta_20210129b) to a newer version.
# Part of this upgrade is to introduce gene_id to the gene and transcript tables. The columns are
# added with a Alembic migration. Then a data migration to back fill the new columns. Then a second
# Alembic migration to add the constraints to the columns and update primary and foreign keys.

if [ "$#" -lt 1 ]
then
echo "error: too few arguments, you provided $#, 1 required"
echo "usage: upgrade-uta-schema.sh <dest_uta_v>"
exit 1
fi

set -euxo pipefail

source_uta_v="uta_20210129b"
working_uta_v="uta"
dest_uta_v=$1
dumps_dir="/workdir/dumps"
mkdir -p $dumps_dir

## setup working uta schema
# delete schema if exists
psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $working_uta_v CASCADE;"

# dump source version
pg_dump -U uta_admin -h localhost -d uta -n "$source_uta_v" | \
gzip -c > $dumps_dir/"$source_uta_v".pgd.gz

# create new schema
gzip -cdq $dumps_dir/"$source_uta_v".pgd.gz | \
sbin/pg-dump-schema-rename "$source_uta_v" "$working_uta_v" | \
sbin/pg-dump-schema-rename "uta_1_1" "$working_uta_v" | \
psql -U uta_admin -h localhost -d uta -aeE

## upgrade working uta schema
# set initial Alembic migration so it is not ran.
alembic -c etc/alembic.ini stamp edadb97f6502

# run Alembic migration to add gene_id to gene and transcript tables
alembic -c etc/alembic.ini upgrade 595a586e6de7

# run data migration to back fill gene_id
python misc/gene-update/backfill_gene_id.py \
postgresql://uta_admin:@localhost/uta \
/workdir/backfill/gene_update.tsv \
/workdir/backfill/transcript_update.tsv

# run Alembic migrations to add constraints and update existing views
alembic -c etc/alembic.ini upgrade head

## Rename schema to destination schema name
psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $dest_uta_v CASCADE;"
psql -h localhost -U uta_admin -d uta -c "ALTER SCHEMA uta RENAME TO $dest_uta_v";
2 changes: 0 additions & 2 deletions sbin/ncbi-parse-geneinfo
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ if __name__ == "__main__":
giw = GeneInfoWriter(sys.stdout)

for rec in gi_in:
if rec["symbol_from_nomenclature_authority"] == "-":
bsgiles73 marked this conversation as resolved.
Show resolved Hide resolved
continue
gi = GeneInfo(
tax_id=rec["tax_id"],
gene_symbol=rec["symbol"],
Expand Down
2 changes: 1 addition & 1 deletion sbin/uta-diff
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ cmp_cols = collections.defaultdict(lambda: ['*'])
cmp_cols.update({
"associated_accessions": "tx_ac pro_ac origin".split(),
"exon_aln": "exon_aln_id tx_exon_id alt_exon_id cigar added".split(),
"gene": "hgnc".split(),
"gene": "gene_id".split(),
"transcript": "ac".split(),
})

Expand Down
6 changes: 3 additions & 3 deletions sbin/uta-extract
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,6 @@ sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets
tee "$log_dir/filter_exonset_transcripts.log"

# move fasta files into same dir
ln $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz $working_dir/
ln $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz $working_dir/
ln $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz $working_dir/
cp -f $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz $working_dir/
bsgiles73 marked this conversation as resolved.
Show resolved Hide resolved
cp -f $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz $working_dir/
cp -f $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz $working_dir/
5 changes: 2 additions & 3 deletions sbin/uta-load
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ etc/scripts/delete-schema.sh "$loading_uta_v"
etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v"

## for now set up Alembic for schema migrations
alembic -c etc/alembic.ini stamp edadb97f6502
bsgiles73 marked this conversation as resolved.
Show resolved Hide resolved
alembic -c etc/alembic.ini upgrade head

# generate seqinfo files from exonsets (this step requires seqrepo)
Expand Down Expand Up @@ -65,8 +64,8 @@ uta --conf=etc/global.conf --conf=etc/[email protected] load-exonset "$work
tee "$log_dir/load-exonsets.log"

# Load seqinfo into the seq and seqanno tables.
uta --conf=etc/global.conf --conf=etc/[email protected] load-seqinfo "$loading_dir/seqinfo.gz" 2>&1 | \
tee "$logs_dir/load-seqinfo.log"
uta --conf=etc/global.conf --conf=etc/[email protected] load-seqinfo "$working_dir/seqinfo.gz" 2>&1 | \
bsgiles73 marked this conversation as resolved.
Show resolved Hide resolved
tee "$log_dir/load-seqinfo.log"

# Create cigar strings for all rows in tx_alt_exon_pairs_v view and update exon_aln table.
uta --conf=etc/global.conf --conf=etc/[email protected] align-exons 2>&1 | \
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""add gene_id to gene and transcript

Revision ID: 595a586e6de7
Revises: a697b584f699
Create Date: 2024-04-10 19:47:43.685672

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = '595a586e6de7'
down_revision: Union[str, None] = 'a697b584f699'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.add_column('gene', sa.Column('gene_id', sa.Text(), nullable=True), schema='uta')
op.add_column('gene', sa.Column('type', sa.Text(), nullable=True), schema='uta')
op.add_column('gene', sa.Column('xrefs', sa.Text(), nullable=True), schema='uta')
op.add_column('transcript', sa.Column('gene_id', sa.Text(), nullable=True), schema='uta')
# ### end Alembic commands ###

# ### commands to drop existing primary key on gene table ###
op.drop_constraint('gene_pkey', 'gene', schema='uta')
# ### end of commands ###


def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.drop_column('transcript', 'gene_id', schema='uta')
op.drop_column('gene', 'xrefs', schema='uta')
op.drop_column('gene', 'type', schema='uta')
op.drop_column('gene', 'gene_id', schema='uta')
# ### end Alembic commands ###

# ### commands to add primary key on gene table ###
op.create_primary_key('gene_pkey', 'gene', ['hgnc'], schema='uta')
# ### end of commands ###
Loading
Loading