From d65d764c3e54f974511f95b9dfe55ecfae7c3bb4 Mon Sep 17 00:00:00 2001 From: nvta1209 <162694616+nvta1209@users.noreply.github.com> Date: Tue, 9 Apr 2024 08:07:55 -0700 Subject: [PATCH] feat(IPVC-2283): mitochondrial transcript workflow (#20) --- README.md | 95 +++++++++++++++++++------------- docker-compose.yml | 50 ++++++++++++++--- etc/scripts/run-uta-build.sh | 98 --------------------------------- sbin/ncbi-download | 24 ++++++++ sbin/ncbi-download-docker | 24 -------- sbin/ncbi_process_mito.py | 23 ++++---- sbin/seqrepo-download | 53 ------------------ sbin/seqrepo-load | 21 +++++++ sbin/uta-extract | 43 +++++++++++++++ sbin/uta-load | 77 ++++++++++++++++++++++++++ sbin/uta-update | 67 ---------------------- tests/test_ncbi_process_mito.py | 8 +-- 12 files changed, 280 insertions(+), 303 deletions(-) delete mode 100755 etc/scripts/run-uta-build.sh delete mode 100755 sbin/ncbi-download-docker mode change 100644 => 100755 sbin/ncbi_process_mito.py delete mode 100755 sbin/seqrepo-download create mode 100755 sbin/seqrepo-load create mode 100755 sbin/uta-extract create mode 100755 sbin/uta-load delete mode 100755 sbin/uta-update diff --git a/README.md b/README.md index 84e606c..2ade8c5 100644 --- a/README.md +++ b/README.md @@ -289,60 +289,77 @@ To develop UTA, follow these steps. 4. Testing $ docker build --target uta-test -t uta-test . - $ docker run -it --rm uta-test python -m unittest + $ docker run --rm uta-test python -m unittest ## UTA update procedure -### 1. Download files from NCBI +Requires docker. -Run `sbin/ncbi-download-docker`. Requires bash and docker. +### 0. Setup -Example: +Make directories: ``` -sbin/ncbi-download-docker $(pwd)/ncbi-data +mkdir -p $(pwd)/ncbi-data +mkdir -p $(pwd)/output/artifacts +mkdir -p $(pwd)/output/logs ``` -The specified directory will have the following structure: - - ├── gene - │ └── DATA - │ ├── GENE_INFO - │ │ └── Mammalia - │ │ └── Homo_sapiens.gene_info.gz - │ └── gene2accession.gz - ├── genomes - │ └── refseq - │ └── vertebrate_mammalian - │ └── Homo_sapiens - │ └── all_assembly_versions - │ └── GCF_000001405.25_GRCh37.p13 - │ ├── GCF_000001405.25_GRCh37.p13_genomic.fna.gz - │ └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz - └── refseq - └── H_sapiens - └── mRNA_Prot - ├── human.1.protein.faa.gz - ├── human.1.rna.fna.gz - └── human.1.rna.gbff.gz - -### 2. Download SeqRepo data - -Run `sbin/seqrepo-download`. Requires bash and docker. - -Example: +Set variables: ``` -sbin/seqrepo-download 2024-02-20 $(pwd)/seqrepo-data +export UTA_ETL_OLD_SEQREPO_VERSION=2024-02-20 +export UTA_ETL_OLD_UTA_VERSION=uta_20210129b +export UTA_ETL_NCBI_DIR=./ncbi-data +export UTA_ETL_SEQREPO_DIR=./seqrepo-data +export UTA_ETL_WORK_DIR=./output/artifacts +export UTA_ETL_LOG_DIR=./output/logs ``` -### 3. Update UTA and SeqRepo +Build the UTA image: +``` +docker build --target uta -t uta-update . +``` + +### 1. Download SeqRepo data +``` +docker pull biocommons/seqrepo:$UTA_ETL_OLD_SEQREPO_VERSION + +# download seqrepo. can skip if container already exists. +docker run --name seqrepo biocommons/seqrepo:$UTA_ETL_OLD_SEQREPO_VERSION + +# copy seqrepo data into a local directory +docker run -v $UTA_ETL_SEQREPO_DIR:/output-dir --volumes-from seqrepo ubuntu bash -c 'cp -R /usr/local/share/seqrepo/* /output-dir' + +# allow seqrepo to be modified +docker run -it -v $UTA_ETL_SEQREPO_DIR:/output-dir ubuntu bash -c 'chmod -R +w /output-dir' +``` -Run `sbin/uta-update`. Requires bash and docker. +Note: pulling data takes ~30 minutes and requires ~13 GB. 
+Note: a container called seqrepo will be left behind. -Example: +### 2. Extract and transform data from NCBI + +Download files from NCBI, extract into intermediate files, and load into UTA and SeqRepo. + +See 2A for nuclear transcripts and 2B for mitochondrial transcripts. + +#### 2A. Nuclear transcripts +``` +docker compose run ncbi-download +docker compose run uta-extract +docker compose run seqrepo-load +UTA_ETL_SKIP_GENE_LOAD=false docker compose run uta-load ``` -sbin/uta-update $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/uta-build uta_20210129b 2024-02-20 + +#### 2B. Mitochondrial transcripts +``` +docker compose run mito-extract +docker compose run seqrepo-load +UTA_ETL_SKIP_GENE_LOAD=true docker compose run uta-load ``` +UTA has updated and the database has been dumped into a pgd file in `UTA_ETL_WORK_DIR`. SeqRepo has been updated in place. + + ## Migrations UTA uses alembic to manage database migrations. To auto-generate a migration: ``` @@ -353,7 +370,7 @@ Adjust the upgrade and downgrade function definitions. To apply the migration: ``` alembic -c etc/alembic.ini upgrade head ``` -To reverse a migration, use `downgrade` with the number of steps to reverse. For example, to reverse the last: +To reverse a migration, use `downgrade` with the number of steps to reverse. For example, to reverse the last: ``` alembic -c etc/alembic.ini downgrade -1 ``` diff --git a/docker-compose.yml b/docker-compose.yml index 856a5ae..1247c09 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,24 +3,60 @@ version: '3' services: + ncbi-download: + image: uta-update + command: sbin/ncbi-download /ncbi-dir + volumes: + - .:/opt/repos/uta + - ${UTA_ETL_NCBI_DIR}:/ncbi-dir + working_dir: /opt/repos/uta + network_mode: host + uta-extract: + image: uta-update + command: sbin/uta-extract /ncbi-dir /uta-extract/work /uta-extract/logs + volumes: + - ${UTA_ETL_NCBI_DIR}:/ncbi-dir + - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo + - ${UTA_ETL_WORK_DIR}:/uta-extract/work + - ${UTA_ETL_LOG_DIR}:/uta-extract/logs + working_dir: /opt/repos/uta + network_mode: host + seqrepo-load: + image: uta-update + command: sbin/seqrepo-load /usr/local/share/seqrepo 2024-02-20 /seqrepo-load/work /seqrepo-load/logs + volumes: + - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo + - ${UTA_ETL_WORK_DIR}:/seqrepo-load/work + - ${UTA_ETL_LOG_DIR}:/seqrepo-load/logs + working_dir: /opt/repos/uta + network_mode: host uta: container_name: uta - image: biocommons/uta:${UTA_VERSION} + image: biocommons/uta:${UTA_ETL_OLD_UTA_VERSION} environment: - POSTGRES_HOST_AUTH_METHOD=trust healthcheck: - test: psql -h localhost -U anonymous -d uta -c "select * from ${UTA_VERSION}.meta" + test: psql -h localhost -U anonymous -d uta -c "select * from ${UTA_ETL_OLD_UTA_VERSION}.meta" interval: 10s retries: 60 network_mode: host - uta-update: + uta-load: image: uta-update - command: etc/scripts/run-uta-build.sh ${UTA_VERSION} ${SEQREPO_VERSION} /ncbi-dir /workdir + command: sbin/uta-load ${UTA_ETL_OLD_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs ${UTA_ETL_SKIP_GENE_LOAD} depends_on: uta: condition: service_healthy volumes: - - ${NCBI_DIR}:/ncbi-dir - - ${SEQREPO_DIR}:/usr/local/share/seqrepo - - ${WORKING_DIR}:/workdir + - ${UTA_ETL_NCBI_DIR}:/ncbi-dir + - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo + - ${UTA_ETL_WORK_DIR}:/uta-load/work + - ${UTA_ETL_LOG_DIR}:/uta-load/logs + network_mode: host + mito-extract: + image: uta-update + command: sbin/ncbi_process_mito.py NC_012920.1 --output-dir /mito-extract/work | tee 
/mito-extract/logs/mito.log + volumes: + - ${UTA_ETL_WORK_DIR}:/mito-extract/work + - ${UTA_ETL_LOG_DIR}:/mito-extract/logs + working_dir: /opt/repos/uta network_mode: host diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh deleted file mode 100755 index 613d6ce..0000000 --- a/etc/scripts/run-uta-build.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/bin/bash - -# source_uta_v is the UTA version before the update. -# seqrepo_data_release is the SeqRepo version before the update. -# ncbi_dir is where the script looks for NCBI data files. -# working_dir stores log files, intermediate data files, and the final database dump. - -set -euxo pipefail - -source_uta_v=$1 -seqrepo_data_release=$2 -ncbi_dir=$3 -working_dir=$4 - -if [ -z "$source_uta_v" ] || [ -z "$seqrepo_data_release" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] -then - echo 'Usage: run-uta-build.sh ' - exit 1 -fi - -# set local variables and create working directories -loading_uta_v="uta" -loading_dir="$working_dir/loading" -dumps_dir="$working_dir/dumps" -logs_dir="$working_dir/logs" -for d in "$loading_dir" "$dumps_dir" "$logs_dir"; - do mkdir -p "$d" -done - -## Drop loading schema, and recreate -etc/scripts/delete-schema.sh "$loading_uta_v" -etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v" - -## for now set up Alembic for schema migrations -alembic -c etc/alembic.ini stamp edadb97f6502 -alembic -c etc/alembic.ini upgrade head - -## Load SeqRepo with new sequences -seqrepo load -n NCBI -i "$seqrepo_data_release" \ - $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz \ - $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz \ - $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz 2>& 1 | \ - tee "$logs_dir/seqrepo-load.log" - -### extract meta data -# genes -sbin/ncbi-parse-geneinfo $ncbi_dir/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz | \ - gzip -c > "$loading_dir/genes.geneinfo.gz" 2>&1 | tee "$logs_dir/ncbi-parse-geneinfo.log" - -# transcript protein associations -sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2accession.gz | gzip -c > "$loading_dir/assocacs.gz" 2>&1 | \ - tee "$logs_dir/ncbi-fetch-assoc-acs" - -sbin/assoc-acs-merge "$loading_dir/assocacs.gz" | gzip -c > "$loading_dir/assocacs.cleaned.gz" 2>&1 | \ - tee "$logs_dir/assoc-acs-merge" - -# parse transcript info from GBFF input files -GBFF_files=$(ls $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz) -sbin/ncbi-parse-gbff "$GBFF_files" | gzip -c > "$loading_dir/gbff.txinfo.gz" 2>&1 | \ - tee "$logs_dir/ncbi-parse-gbff.log" - -# parse alignments from GFF input files -GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.gff.gz) -sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$loading_dir/gff.exonsets.gz" 2>&1 | \ - tee "$logs_dir/ncbi-parse-genomic-gff.log" - -sbin/filter_exonset_transcripts.py --tx-info "$loading_dir/gbff.txinfo.gz" --exonsets "$loading_dir/gff.exonsets.gz" \ - --missing-ids "$loading_dir/filtered_tx_acs.txt" | gzip -c > "$loading_dir/gff.filtered_exonsets.gz" 2>&1 | \ - tee "$logs_dir/filter_exonset_transcripts.log" - -# generate seqinfo files from exonsets -sbin/exonset-to-seqinfo -o NCBI "$loading_dir/gff.filtered_exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \ - tee "$logs_dir/exonset-to-seqinfo.log" - -### update the uta database -# genes -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo 
"$loading_dir/genes.geneinfo.gz" 2>&1 | \ - tee "$logs_dir/load-geneinfo.log" - -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$loading_dir/assocacs.cleaned.gz" 2>&1 | \ - tee "$logs_dir/load-assoc-ac.log" - -# transcript info -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$loading_dir/gbff.txinfo.gz" 2>&1 | \ - tee "$logs_dir/load-txinfo.log" - -# gff exon sets -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$loading_dir/gff.filtered_exonsets.gz" 2>&1 | \ - tee "$logs_dir/load-exonsets.log" - -# align exons -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 | tee "$logs_dir/align-exons.log" - -### run diff -sbin/uta-diff "$source_uta_v" "$loading_uta_v" - -### psql_dump -pg_dump -U uta_admin -h localhost -d uta -t "$loading_uta_v.gene" | gzip -c > "$dumps_dir/uta.pgd.gz" diff --git a/sbin/ncbi-download b/sbin/ncbi-download index 61a6874..dc445cd 100755 --- a/sbin/ncbi-download +++ b/sbin/ncbi-download @@ -1,6 +1,29 @@ #!/usr/bin/env bash # This script downloads the files needed for a UTA+SeqRepo update into to the given directory. +# +# DONWLOAD_DIR will have the following structure: +# +# ├── gene +# │ └── DATA +# │ ├── GENE_INFO +# │ │ └── Mammalia +# │ │ └── Homo_sapiens.gene_info.gz +# │ └── gene2accession.gz +# ├── genomes +# │ └── refseq +# │ └── vertebrate_mammalian +# │ └── Homo_sapiens +# │ └── all_assembly_versions +# │ └── GCF_000001405.25_GRCh37.p13 +# │ ├── GCF_000001405.25_GRCh37.p13_genomic.fna.gz +# │ └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz +# └── refseq +# └── H_sapiens +# └── mRNA_Prot +# ├── human.1.protein.faa.gz +# ├── human.1.rna.fna.gz +# └── human.1.rna.gbff.gz set -e @@ -26,6 +49,7 @@ do DOWNLOAD_MODULE="${DOWNLOAD_PATH%%/*}" DOWNLOAD_SRC="ftp.ncbi.nlm.nih.gov::$DOWNLOAD_PATH" DOWNLOAD_DST="$DOWNLOAD_DIR/$DOWNLOAD_MODULE" + mkdir -p $DOWNLOAD_DST echo "Downloading $DOWNLOAD_SRC to $DOWNLOAD_DST" rsync --no-motd -DHPRprtv "$DOWNLOAD_SRC" "$DOWNLOAD_DST" done diff --git a/sbin/ncbi-download-docker b/sbin/ncbi-download-docker deleted file mode 100755 index 30d4f63..0000000 --- a/sbin/ncbi-download-docker +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env bash - -# This script runs ncbi-download in a docker container. - -set -e - -DOWNLOAD_DIR=$1 - -if [ -z "$DOWNLOAD_DIR" ] -then - echo 'Usage: sbin/ncbi-download-docker ' - exit 1 -else - echo "Downloading files to $DOWNLOAD_DIR" -fi - -if [[ $DOWNLOAD_DIR != /* ]] -then - echo 'Download directory must be an absolute path' - exit 1 -fi - -docker build --target uta -t ncbi-download --progress plain . -docker run -it -v $(pwd)/sbin/ncbi-download:/dl-script -v $DOWNLOAD_DIR:/output-dir ncbi-download /dl-script /output-dir diff --git a/sbin/ncbi_process_mito.py b/sbin/ncbi_process_mito.py old mode 100644 new mode 100755 index 2afe071..c65984b --- a/sbin/ncbi_process_mito.py +++ b/sbin/ncbi_process_mito.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + """ Download mito fasta and gbff file. Use BioPython to parse the features in the Mitochondrial genbank file to get the attributes of a region of the genome that correspond to genes along with their attributes. 
Output gene/tx/alignment @@ -52,6 +54,7 @@ """ import argparse import dataclasses +import gzip import importlib_resources import logging import logging.config @@ -83,7 +86,7 @@ class MitoGeneData: alt_ac: str alt_start: int alt_end: int - strand: str + strand: int origin: str = "NCBI" alignment_method: str = "splign" transl_table: Optional[str] = None @@ -166,6 +169,7 @@ def parse_nomenclature_value(gb_feature: SeqFeature) -> Dict[str, str]: def get_mito_genes(gbff_filepath: str): logger.info(f"processing NCBI GBFF file from {gbff_filepath}") with open(gbff_filepath) as fh: + # Bio.SeqIO.parse(fh, "gb") returns an empty iterator for .fna files and does not fail for record in Bio.SeqIO.parse(fh, "gb"): for feature in record.features: xrefs = parse_db_xrefs(feature) @@ -200,9 +204,7 @@ def get_mito_genes(gbff_filepath: str): # retrieve sequence, and reverse compliment if on reverse strand ac = f"{record.id}_{feature.location.start:05}_{feature.location.end:05}" feature_seq = record.seq[feature_start:feature_end] - strand = "+" if feature.location.strand == -1: - strand = "-" feature_seq = feature_seq.reverse_complement() if feature.type == "CDS": @@ -225,7 +227,7 @@ def get_mito_genes(gbff_filepath: str): alt_ac=record.id, alt_start=feature_start, alt_end=feature_end, - strand=strand, + strand=feature.location.strand, transl_table=transl_table, transl_except=transl_except, pro_ac=pro_ac, @@ -242,7 +244,7 @@ def main(ncbi_accession: str, output_dir: str): logger.info(f"found {len(mito_genes)} genes from parsing {input_files['gbff']}") # write gene accessions - with open(f"{output_dir}/{ncbi_accession}.assocacs", "w") as o_file: + with gzip.open(f"{output_dir}/assocacs.gz", "wt") as o_file: gaw = GeneAccessionsWriter(o_file) for mg in mito_genes: if mg.pro_ac is not None: @@ -253,7 +255,7 @@ def main(ncbi_accession: str, output_dir: str): ) # write sequence information - with open(f"{output_dir}/{ncbi_accession}.seqinfo", "w") as o_file: + with gzip.open(f"{output_dir}/seqinfo.gz", "wt") as o_file: siw = SeqInfoWriter(o_file) for mg in mito_genes: siw.write( @@ -279,7 +281,7 @@ def main(ncbi_accession: str, output_dir: str): ) # write out transcript sequence fasta files. - with open(f"{output_dir}/{ncbi_accession}.rna.fna", "w") as o_file: + with gzip.open(f"{output_dir}/{ncbi_accession}.rna.fna.gz", "wt") as o_file: for mg in mito_genes: record = SeqRecord( Seq(mg.tx_seq), @@ -289,7 +291,7 @@ def main(ncbi_accession: str, output_dir: str): o_file.write(record.format("fasta")) # write out protein sequence fasta files. 
- with open(f"{output_dir}/{ncbi_accession}.protein.faa", "w") as o_file: + with gzip.open(f"{output_dir}/{ncbi_accession}.protein.faa.gz", "wt") as o_file: for mg in mito_genes: if mg.pro_ac is not None: record = SeqRecord( @@ -300,7 +302,7 @@ def main(ncbi_accession: str, output_dir: str): o_file.write(record.format("fasta")) # write transcript information - with open(f"{output_dir}/{ncbi_accession}.txinfo", "w") as o_file: + with gzip.open(f"{output_dir}/txinfo.gz", "wt") as o_file: tiw = TxInfoWriter(o_file) for mg in mito_genes: tiw.write( @@ -315,7 +317,7 @@ def main(ncbi_accession: str, output_dir: str): ) # write exonset - with open(f"{output_dir}/{ncbi_accession}.exonset", "w") as o_file: + with gzip.open(f"{output_dir}/exonsets.gz", "wt") as o_file: esw = ExonSetWriter(o_file) for mg in mito_genes: esw.write( @@ -331,5 +333,4 @@ def main(ncbi_accession: str, output_dir: str): if __name__ == "__main__": args = parse_args() - main(args.accession, args.output_dir) diff --git a/sbin/seqrepo-download b/sbin/seqrepo-download deleted file mode 100755 index 0773030..0000000 --- a/sbin/seqrepo-download +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env bash - -# This script downloads SeqRepo into the given directory. -# Note: pulling data takes ~30 minutes and requires ~13 GB. -# Note: a container called seqrepo will be left behind. -# The name of the container can be changed by providing a third argument. - -set -e - -SEQREPO_VERSION=$1 -OUTPUT_DIR=$2 -# optional: -SEQREPO_CONTAINER_NAME=$3 - -if [ -z "$SEQREPO_VERSION" ] || [ -z "$OUTPUT_DIR" ] -then - echo 'Usage: sbin/seqrepo-download ' - exit 1 -else - echo "SeqRepo data for version $SEQREPO_VERSION will be available in $OUTPUT_DIR" -fi - -# Name of seqrepo container -if [ -z "$SEQREPO_CONTAINER_NAME" ] -then - SEQREPO_CONTAINER_NAME=seqrepo -fi - -if [[ $OUTPUT_DIR != /* ]] -then - echo 'Output directory must be an absolute path' - exit 1 -fi - -if [ ! -d "$OUTPUT_DIR" ]; then - echo "Directory $OUTPUT_DIR does not exist." - exit 1 -fi - -# Pull seqrepo image -docker pull biocommons/seqrepo:$SEQREPO_VERSION - -# Download seqrepo data using seqrepo image -if docker ps -aq -f name=$SEQREPO_CONTAINER_NAME -then - echo "Container called $SEQREPO_CONTAINER_NAME already exists. Skipping seqrepo data download." -else - docker run --name seqrepo biocommons/seqrepo:$SEQREPO_VERSION -fi - -# Copy seqrepo data into a local directory -echo "Copying seqrepo data into $OUTPUT_DIR ..." -docker run -it -v $OUTPUT_DIR:/output-dir --volumes-from $SEQREPO_CONTAINER_NAME:ro ubuntu bash -c 'cp -R /usr/local/share/seqrepo/* /output-dir' diff --git a/sbin/seqrepo-load b/sbin/seqrepo-load new file mode 100755 index 0000000..93942cf --- /dev/null +++ b/sbin/seqrepo-load @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +set -e + +seqrepo_root=$1 +seqrepo_version=$2 +sequence_dir=$3 +log_dir=$4 + +if [ -z "$seqrepo_root" ] || [ -z "$seqrepo_version" ] || [ -z "$sequence_dir" ] || [ -z "$log_dir" ] +then + echo 'Usage: sbin/seqrepo-load ' + exit 1 +fi + +## Load SeqRepo with new sequences +seqrepo --root-directory "$seqrepo_root" \ + load -n NCBI --instance-name "$seqrepo_version" \ + $sequence_dir/*.fna.gz \ + $sequence_dir/*.faa.gz 2>& 1 | \ + tee "$log_dir/seqrepo-load.log" diff --git a/sbin/uta-extract b/sbin/uta-extract new file mode 100755 index 0000000..2a60f3f --- /dev/null +++ b/sbin/uta-extract @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +# Extract data from NCBI files into intermediate files. 
+ +set -e + +ncbi_dir=$1 +working_dir=$2 +log_dir=$3 + +if [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ] +then + echo 'Usage: sbin/uta-extract ' + exit 1 +fi + +# genes +sbin/ncbi-parse-geneinfo $ncbi_dir/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz | \ + gzip -c > "$working_dir/geneinfo.gz" 2>&1 | tee "$log_dir/ncbi-parse-geneinfo.log" + +# transcript protein associations +sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2accession.gz | gzip -c > "$working_dir/assocacs.gz" 2>&1 | \ + tee "$log_dir/ncbi-fetch-assoc-acs.log" + +# parse transcript info from GBFF input files +GBFF_files=$(ls $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz) +sbin/ncbi-parse-gbff "$GBFF_files" | gzip -c > "$working_dir/txinfo.gz" 2>&1 | \ + tee "$log_dir/ncbi-parse-gbff.log" + +# parse alignments from GFF input files +GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.gff.gz) +sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$working_dir/unfiltered_exonsets.gz" 2>&1 | \ + tee "$log_dir/ncbi-parse-genomic-gff.log" + +# filter transcripts +sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets "$working_dir/unfiltered_exonsets.gz" \ + --missing-ids "$working_dir/filtered_tx_acs.txt" | gzip -c > "$working_dir/exonsets.gz" 2>&1 | \ + tee "$log_dir/filter_exonset_transcripts.log" + +# move fasta files into same dir +ln $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz $working_dir/ +ln $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz $working_dir/ +ln $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz $working_dir/ diff --git a/sbin/uta-load b/sbin/uta-load new file mode 100755 index 0000000..c3da837 --- /dev/null +++ b/sbin/uta-load @@ -0,0 +1,77 @@ +#!/usr/bin/env bash + +# This script updates UTA and SeqRepo using NCBI files. +# source_uta_v is the UTA version before the update. +# ncbi_dir is where the script looks for NCBI data files. +# working_dir stores intermediate data files and the final database dump. +# log_dir stores log files. +# skip_load_genes, if truthy, will skip the gene loading step + +# Note that the uta loading code uses the seqrepo location defined in the conf files, under [sequences].seqrepo. + +set -euxo pipefail + +source_uta_v=$1 +ncbi_dir=$2 +working_dir=$3 +log_dir=$4 +skip_load_genes=$5 + +if [ -z "$source_uta_v" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ] +then + echo 'Usage: uta-load ' + exit 1 +fi + +# set local variables and create working directories +loading_uta_v="uta" +mkdir -p "$log_dir" + +## Drop loading schema, and recreate +etc/scripts/delete-schema.sh "$loading_uta_v" +etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v" + +## for now set up Alembic for schema migrations +alembic -c etc/alembic.ini stamp edadb97f6502 +alembic -c etc/alembic.ini upgrade head + +# generate seqinfo files from exonsets (this step requires seqrepo) +sbin/exonset-to-seqinfo -o NCBI "$working_dir/exonsets.gz" | gzip -c > "$working_dir/seqinfo.gz" 2>&1 | \ + tee "$log_dir/exonset-to-seqinfo.log" + +# Filter out columns from assocacs file. +sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc-ac.gz" 2>&1 | \ + tee "$log_dir/assoc-acs-merge.log" + +# Load genes into gene table. 
+if [ "$skip_load_genes" = "true" ] +then + echo "Skipping load-geneinfo" +else + uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \ + tee "$log_dir/load-geneinfo.log" +fi + +# Load accessions into associated_accessions table. +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$working_dir/assoc-ac.gz" 2>&1 | \ + tee "$log_dir/load-assoc-ac.log" + +# Load transcript info into transcript and exon_set tables. +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$working_dir/txinfo.gz" 2>&1 | \ + tee "$log_dir/load-txinfo.log" + +# Load exon sets into into exon_set and exon tables. +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$working_dir/exonsets.gz" 2>&1 | \ + tee "$log_dir/load-exonsets.log" + +# Create cigar strings for all rows in tx_alt_exon_pairs_v view and update exon_aln table. +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 | \ + tee "$log_dir/align-exons.log" + +# Load seqinfo? + +### run diff +sbin/uta-diff "$source_uta_v" "$loading_uta_v" + +### psql_dump +pg_dump -U uta_admin -h localhost -d uta -t "$loading_uta_v.gene" | gzip -c > "$working_dir/uta.pgd.gz" diff --git a/sbin/uta-update b/sbin/uta-update deleted file mode 100755 index 23d93af..0000000 --- a/sbin/uta-update +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env bash - -# This script runs the UTA update procedure. -# It updates the specified UTA and SeqRepo using the given NCBI files. -# It produces a postgres dump of the updated UTA database and an updated SeqRepo (updated in place). -# It expects to be run from the root of the uta repository. - -set -e - -# export environment variables for docker compose file -export NCBI_DIR=$1 -export SEQREPO_DIR=$2 -export WORKING_DIR=$3 -export UTA_VERSION=$4 -export SEQREPO_VERSION=$5 - -if [ -z "$NCBI_DIR" ] || [ -z "$SEQREPO_DIR" ] || [ -z "$WORKING_DIR" ] || [ -z "$UTA_VERSION" ] || [ -z "$SEQREPO_VERSION" ] -then - echo 'Usage: sbin/uta-update ' - exit 1 -else - echo "Updating UTA and SeqRepo using files in $NCBI_DIR and SeqRepo data in $SEQREPO_DIR" - echo "Starting from UTA version $UTA_VERSION and SeqRepo version $SEQREPO_VERSION" - echo "Logs and intermediate files will be available in $WORKING_DIR" -fi - -# Ensure directories are compatible with docker volume usage -if [[ $NCBI_DIR != /* ]] && [[ $NCBI_DIR != .* ]] -then - echo 'NCBI file directory must start with / or .' - exit 1 -fi - -if [[ $SEQREPO_DIR != /* ]] && [[ $SEQREPO_DIR != .* ]] -then - echo 'SeqRepo data directory must start with / or .' - exit 1 -fi - -if [[ $WORKING_DIR != /* ]] && [[ $WORKING_DIR != .* ]] -then - echo 'Working directory must start with / or .' - exit 1 -fi - -# Ensure directories exist. -if [ ! -d "$NCBI_DIR" ]; then - echo "Directory $NCBI_DIR does not exist." - exit 1 -fi - -if [ ! -d "$SEQREPO_DIR" ]; then - echo "Directory $SEQREPO_DIR does not exist." - exit 1 -fi - -if [ ! -d "$WORKING_DIR" ]; then - echo "Directory $WORKING_DIR does not exist." - exit 1 -fi - -# Build the UTA image. -docker build --target uta -t uta-update . - -# Bring up a UTA database and run the UTA update procedure. 
-# docker compose doesn't respect the container name specified in the compose file, so container name is specified here -docker compose run --rm --name uta-update uta-update diff --git a/tests/test_ncbi_process_mito.py b/tests/test_ncbi_process_mito.py index d0a1982..8beb535 100644 --- a/tests/test_ncbi_process_mito.py +++ b/tests/test_ncbi_process_mito.py @@ -193,7 +193,7 @@ def test_get_mito_genes(self): "alt_ac": "NC_012920.1", "alt_start": 1601, "alt_end": 1670, - "strand": "+", + "strand": 1, "transl_table": None, "transl_except": None, "pro_ac": None, @@ -212,7 +212,7 @@ def test_get_mito_genes(self): "alt_ac": "NC_012920.1", "alt_start": 4328, "alt_end": 4400, - "strand": "-", + "strand": -1, "transl_table": None, "transl_except": None, "pro_ac": None, @@ -237,7 +237,7 @@ def test_get_mito_genes(self): "alt_ac": "NC_012920.1", "alt_start": 7585, "alt_end": 8269, - "strand": "+", + "strand": 1, "transl_table": "2", "transl_except": None, "pro_ac": "YP_003024029.1", @@ -267,7 +267,7 @@ def test_get_mito_genes(self): "alt_ac": "NC_012920.1", "alt_start": 3306, "alt_end": 4262, - "strand": "+", + "strand": 1, "transl_table": "2", "transl_except": "(pos:4261..4262,aa:TERM)", "pro_ac": "YP_003024026.1",
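
For context on the `strand` changes in the `sbin/ncbi_process_mito.py` and `tests/test_ncbi_process_mito.py` hunks above: `MitoGeneData.strand` now stores Biopython's integer strand convention (1 for forward, -1 for reverse) taken directly from `SeqFeature.location.strand`, instead of mapping it to "+"/"-" strings. Below is a minimal sketch of that convention, assuming Biopython is installed and a local copy of the NC_012920.1 GenBank record is available (the filename is illustrative, not part of this patch):

```python
# Illustrative only: shows the SeqFeature.location.strand convention (1 / -1)
# that MitoGeneData.strand now stores directly.
import Bio.SeqIO

with open("NC_012920.1.gbff") as fh:          # assumed local download of the mito GenBank file
    for record in Bio.SeqIO.parse(fh, "gb"):  # "gb" parser yields SeqRecord objects
        for feature in record.features:
            if feature.type in ("CDS", "tRNA", "rRNA"):
                seq = record.seq[feature.location.start:feature.location.end]
                if feature.location.strand == -1:
                    # reverse-strand features are reverse-complemented, as the script does
                    seq = seq.reverse_complement()
                print(record.id, feature.type, feature.location.strand, len(seq))
```

Keeping the integer avoids a lossy string round-trip and matches what the updated test expectations assert.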