Skip to content

Commit

Permalink
feat(IPVC-2441): various workflow fixes (#39)
Browse files Browse the repository at this point in the history
  • Loading branch information
bsgiles73 authored May 15, 2024
1 parent 59fd1b2 commit c0d5a37
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 26 deletions.
2 changes: 1 addition & 1 deletion etc/ncbi-files.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# │ ├── GENE_INFO
# │ │ └── Mammalia
# │ │ └── Homo_sapiens.gene_info.gz
# │ └── gene2accession.gz
# │ └── gene2refseq.gz
# ├── genomes
# │ └── refseq
# │ └── vertebrate_mammalian
Expand Down
4 changes: 2 additions & 2 deletions sbin/seqrepo-load
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env bash

set -e
set -euxo pipefail

seqrepo_root=$1
seqrepo_version=$2
Expand All @@ -19,5 +19,5 @@ mapfile -t FASTA_FILES < <(find "$sequence_dir" -type f -name "*.f[an]a*")
# Load SeqRepo with new sequences
seqrepo --root-directory "$seqrepo_root" \
load -n NCBI --instance-name "$seqrepo_version" \
"${FASTA_FILES[@]}" 2>& 1 | \
"${FASTA_FILES[@]}" 2>&1 | \
tee "$log_dir/seqrepo-load.log"
8 changes: 4 additions & 4 deletions sbin/uta-extract
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Extract data from NCBI files into intermediate files.

set -e
set -euxo pipefail

ncbi_dir=$1
working_dir=$2
Expand All @@ -19,12 +19,12 @@ sbin/ncbi-parse-geneinfo $ncbi_dir/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gen
gzip -c > "$working_dir/geneinfo.gz" 2>&1 | tee "$log_dir/ncbi-parse-geneinfo.log"

# transcript protein associations
sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2accession.gz | gzip -c > "$working_dir/assocacs.gz" 2>&1 | \
sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2refseq.gz | gzip -c > "$working_dir/assocacs.gz" 2>&1 | \
tee "$log_dir/ncbi-fetch-assoc-acs.log"

# parse transcript info from GBFF input files
GBFF_files=$(ls $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz)
sbin/ncbi-parse-gbff "$GBFF_files" | gzip -c > "$working_dir/txinfo.gz" 2>&1 | \
mapfile -t GBFF_FILES < <(find "$ncbi_dir/refseq" -type f -name "human.*.rna.gbff.gz")
sbin/ncbi-parse-gbff "${GBFF_FILES[@]}" | gzip -c > "$working_dir/txinfo.gz" 2>&1 | \
tee "$log_dir/ncbi-parse-gbff.log"

# parse alignments from GFF input files
Expand Down
4 changes: 4 additions & 0 deletions src/uta/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ class EutilsDownloadError(Exception):
pass


class ExonStructureMismatchError(UTAError):
pass


# <LICENSE>
# Copyright 2014 UTA Contributors (https://bitbucket.org/biocommons/uta)
##
Expand Down
43 changes: 24 additions & 19 deletions src/uta/loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import uta.formats.txinfo as ufti
import uta.parsers.geneinfo
import uta.parsers.seqgene
from uta.exceptions import ExonStructureMismatchError

usam = uta.models

Expand Down Expand Up @@ -317,37 +318,41 @@ def load_exonset(session, opts, cf):
)
tx_exon_count = len(tx_es.exons_se_i())
aln_exon_count = len(es.exons_se_i.split(";"))
if tx_exon_count != aln_exon_count:
logger.warning(
if tx_exon_count == aln_exon_count:
n, o = _upsert_exon_set_record(
session, es.tx_ac, es.alt_ac, es.strand, es.method, es.exons_se_i
)
session.commit()
else:
raise ExonStructureMismatchError(
"Exon structure mismatch: {tx_exon_count} exons in transcript {es.tx_ac}; {aln_exon_count} in alignment {es.alt_ac}".format(
tx_exon_count=tx_exon_count,
aln_exon_count=aln_exon_count,
es=es,
)
)
skipped = True
continue

n, o = _upsert_exon_set_record(
session, es.tx_ac, es.alt_ac, es.strand, es.method, es.exons_se_i
)
session.commit()
except IntegrityError as e:
logger.exception(e)
session.rollback()
n_errors += 1
except NoResultFound as e:
logger.exception(e)
logger.warning("NoResultFound for transcript ExonSet: {es.tx_ac}".format(es=es))
skipped = True
except ExonStructureMismatchError as e:
logger.exception(e)
skipped = True
else:
(no) = (n is not None, o is not None)
if no == (True, False):
n_new += 1
elif no == (True, True):
n_deprecated += 1
elif no == (False, True):
n_unchanged += 1
finally:
if skipped:
n_skipped += 1
else:
(no) = (n is not None, o is not None)
if no == (True, False):
n_new += 1
elif no == (True, True):
n_deprecated += 1
elif no == (False, True):
n_unchanged += 1
finally:
if i_es % update_period == 0 or i_es + 1 == n_rows:
logger.info(
"{i_es}/{n_rows} {p:.1f}%; {n_new} new, {n_unchanged} unchanged, {n_deprecated} deprecated, {n_skipped} skipped, {n_errors} n_errors".format(
Expand Down Expand Up @@ -756,7 +761,7 @@ def _fetch_origin_by_name(name):
session.add(usam.TranslationException(**te))

if u_tx.gene_id != ti.gene_id:
raise Exception("{ti.ac}: GeneID changed from {u_tx.gene_id} to {ti.gene_id}".format(u_tx=u_tx, ti=ti))
logger.warning("{ti.ac}: GeneID changed from {u_tx.gene_id} to {ti.gene_id}".format(u_tx=u_tx, ti=ti))

# state: transcript now exists, either existing or freshly-created

Expand Down

0 comments on commit c0d5a37

Please sign in to comment.