From 33638c4cedbed6c4c035117e46c65cb5aa771303 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Wed, 3 Apr 2024 21:20:10 -0700
Subject: [PATCH 01/36] add shebang and comment to mito script, and make it
 executable

---
 sbin/ncbi_process_mito.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 sbin/ncbi_process_mito.py

diff --git a/sbin/ncbi_process_mito.py b/sbin/ncbi_process_mito.py
old mode 100644
new mode 100755
index 2afe071..d51b207
--- a/sbin/ncbi_process_mito.py
+++ b/sbin/ncbi_process_mito.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 """
 Download mito fasta and gbff file. Use BioPython to parse the features in the Mitochondrial genbank file to get
 the attributes of a region of the genome that correspond to genes along with their attributes. Output gene/tx/alignment
@@ -166,6 +168,7 @@ def parse_nomenclature_value(gb_feature: SeqFeature) -> Dict[str, str]:
 def get_mito_genes(gbff_filepath: str):
     logger.info(f"processing NCBI GBFF file from {gbff_filepath}")
     with open(gbff_filepath) as fh:
+        # Bio.SeqIO.parse(fh, "gb") returns an empty iterator for .fna files and does not fail
         for record in Bio.SeqIO.parse(fh, "gb"):
             for feature in record.features:
                 xrefs = parse_db_xrefs(feature)
@@ -331,5 +334,4 @@ def main(ncbi_accession: str, output_dir: str):
 
 if __name__ == "__main__":
     args = parse_args()
-
     main(args.accession, args.output_dir)

From 59323c7044b219fe701500fff7d2b2beaa08d06c Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Thu, 4 Apr 2024 17:50:41 -0700
Subject: [PATCH 02/36] move ncbi parsing scripts into uta-extract

---
 README.md                    | 32 +++++++++++++++--
 etc/scripts/run-uta-build.sh | 69 +++++++++++++-----------------------
 sbin/uta-extract             | 35 ++++++++++++++++++
 3 files changed, 90 insertions(+), 46 deletions(-)
 create mode 100755 sbin/uta-extract

diff --git a/README.md b/README.md
index 78da019..d490965 100644
--- a/README.md
+++ b/README.md
@@ -289,7 +289,7 @@ To develop UTA, follow these steps.
 4. Testing
 
         $ docker build --target uta-test -t uta-test .
-        $ docker run -it --rm uta-test python -m unittest
+        $ docker run --rm uta-test python -m unittest
 
 ## UTA update procedure
 
@@ -334,7 +334,35 @@ Example:
 sbin/seqrepo-download 2024-02-20 $(pwd)/seqrepo-data
 ```
 
-### 3. Update UTA and SeqRepo
+### 3. Extract data from NCBI files into intermediate files
+
+See 3A for nuclear transcript updates and 3B for mitochondrial transcript updates.
+
+In either case, first create directories:
+```
+mkdir -p $(pwd)/uta-build/loading
+mkdir -p $(pwd)/uta-build/logs
+```
+
+#### 3A. Nuclear transcripts
+
+Run `sbin/uta-extract`. Requires bash and docker.
+
+Example:
+```
+sbin/uta-extract $(pwd)/ncbi-data $(pwd)/uta-build/loading $(pwd)/uta-build/logs
+```
+
+#### 3B. Mitochondrial transcripts
+
+Run `sbin/ncbi_process_mito.py`. Requires bash and docker.
+
+Example:
+```
+sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/uta-build/loading | tee $(pwd)/uta-build/logs/mito.log
+```
+
+### 4. Update UTA and SeqRepo
 
 Run `sbin/uta-update`. Requires bash and docker.
 
diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh
index 166c6cb..9160cc2 100755
--- a/etc/scripts/run-uta-build.sh
+++ b/etc/scripts/run-uta-build.sh
@@ -38,50 +38,31 @@ seqrepo load -n NCBI -i "$seqrepo_data_release" \
   $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz 2>& 1 | \
   tee "$logs_dir/seqrepo-load.log"
 
-### extract meta data
-# genes
-sbin/ncbi-parse-geneinfo $ncbi_dir/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz | \
-  gzip -c > "$loading_dir/genes.geneinfo.gz" 2>&1 | tee "$logs_dir/ncbi-parse-geneinfo.log"
-
-# transcript protein associations
-sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2accession.gz | gzip -c > "$loading_dir/assocacs.gz" 2>&1 | \
-  tee "$logs_dir/ncbi-fetch-assoc-acs"
-
-sbin/assoc-acs-merge "$loading_dir/assocacs.gz" | gzip -c > "$loading_dir/assocacs.cleaned.gz" 2>&1 | \
-  tee "$logs_dir/assoc-acs-merge"
-
-# parse transcript info from GBFF input files
-GBFF_files=$(ls $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz)
-sbin/ncbi-parse-gbff "$GBFF_files" | gzip -c > "$loading_dir/gbff.txinfo.gz" 2>&1 | \
-  tee "$logs_dir/ncbi-parse-gbff.log"
-
-# parse alignments from GFF input files
-GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.gff.gz)
-sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$loading_dir/gff.exonsets.gz" 2>&1 | \
-  tee "$logs_dir/ncbi-parse-genomic-gff.log"
-
-# generate seqinfo files from exonsets
-sbin/exonset-to-seqinfo -o NCBI "$loading_dir/gff.exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \
-  tee "$logs_dir/exonset-to-seqinfo.log"
-
-### update the uta database
-# genes
-uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$loading_dir/genes.geneinfo.gz" 2>&1 | \
-  tee "$logs_dir/load-geneinfo.log"
-
-uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$loading_dir/assocacs.cleaned.gz" 2>&1 | \
-  tee "$logs_dir/load-assoc-ac.log"
-
-# transcript info
-uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$loading_dir/gbff.txinfo.gz" 2>&1 | \
-  tee "$logs_dir/load-txinfo.log"
-
-# gff exon sets
-uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$loading_dir/gff.exonsets.gz" 2>&1 | \
-  tee "$logs_dir/load-exonsets.log"
-
-# align exons
-uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 | tee "$logs_dir/align-exons.log"
+# Filter out columns from assocacs file.
+sbin/assoc-acs-merge "$loading_dir/assocacs.gz" | gzip -c > "$loading_dir/assoc-ac.gz" 2>&1 | \
+    tee "$logs_dir/assoc-acs-merge.log"
+
+# Load genes into gene table.
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$loading_dir/geneinfo.gz" 2>&1 | \
+    tee "$logs_dir/load-geneinfo.log"
+
+# Load accessions into associated_accessions table.
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$loading_dir/assoc-ac.gz" 2>&1 | \
+    tee "$logs_dir/load-assoc-ac.log"
+
+# Load transcript info into transcript and exon_set tables.
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$loading_dir/txinfo.gz" 2>&1 | \
+    tee "$logs_dir/load-txinfo.log"
+
+# Load exon sets into into exon_set and exon tables.
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$loading_dir/exonsets.gz" 2>&1 | \
+    tee "$logs_dir/load-exonsets.log"
+
+# Create cigar strings for all rows in tx_alt_exon_pairs_v view and update exon_aln table.
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 | \
+    tee "$logs_dir/align-exons.log"
+
+# Load seqinfo?
 
 ### run diff
 sbin/uta-diff "$source_uta_v" "$loading_uta_v"
diff --git a/sbin/uta-extract b/sbin/uta-extract
new file mode 100755
index 0000000..6864de4
--- /dev/null
+++ b/sbin/uta-extract
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+# Extract data from NCBI files into intermediate files.
+
+ncbi_dir=$1
+loading_dir=$2
+logs_dir=$3
+
+if [ -z "$ncbi_dir" ] || [ -z "$loading_dir" ] || [ -z "$logs_dir" ]
+then
+    echo 'Usage: sbin/uta-extract <ncbi_dir> <loading_dir> <logs_dir>'
+    exit 1
+fi
+
+# genes
+sbin/ncbi-parse-geneinfo $ncbi_dir/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz | \
+    gzip -c > "$loading_dir/geneinfo.gz" 2>&1 | tee "$logs_dir/ncbi-parse-geneinfo.log"
+
+# transcript protein associations
+sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2accession.gz | gzip -c > "$loading_dir/assocacs.gz" 2>&1 | \
+    tee "$logs_dir/ncbi-fetch-assoc-acs.log"
+
+# parse transcript info from GBFF input files
+GBFF_files=$(ls $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz)
+sbin/ncbi-parse-gbff "$GBFF_files" | gzip -c > "$loading_dir/txinfo.gz" 2>&1 | \
+    tee "$logs_dir/ncbi-parse-gbff.log"
+
+# parse alignments from GFF input files
+GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.gff.gz)
+sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$loading_dir/exonsets.gz" 2>&1 | \
+    tee "$logs_dir/ncbi-parse-genomic-gff.log"
+
+# generate seqinfo files from exonsets
+sbin/exonset-to-seqinfo -o NCBI "$loading_dir/exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \
+    tee "$logs_dir/exonset-to-seqinfo.log"

From 6297bac365bd95499ed4fa65b8ae32609750c8b4 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Thu, 4 Apr 2024 18:09:55 -0700
Subject: [PATCH 03/36] produce gzip files from mito script

---
 sbin/ncbi_process_mito.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/sbin/ncbi_process_mito.py b/sbin/ncbi_process_mito.py
index d51b207..f8cbdc3 100755
--- a/sbin/ncbi_process_mito.py
+++ b/sbin/ncbi_process_mito.py
@@ -54,6 +54,7 @@
 """
 import argparse
 import dataclasses
+import gzip
 import importlib_resources
 import logging
 import logging.config
@@ -245,7 +246,7 @@ def main(ncbi_accession: str, output_dir: str):
     logger.info(f"found {len(mito_genes)} genes from parsing {input_files['gbff']}")
 
     # write gene accessions
-    with open(f"{output_dir}/{ncbi_accession}.assocacs", "w") as o_file:
+    with gzip.open(f"{output_dir}/assocacs.gz", "wt") as o_file:
         gaw = GeneAccessionsWriter(o_file)
         for mg in mito_genes:
             if mg.pro_ac is not None:
@@ -256,7 +257,7 @@ def main(ncbi_accession: str, output_dir: str):
                 )
 
     # write sequence information
-    with open(f"{output_dir}/{ncbi_accession}.seqinfo", "w") as o_file:
+    with gzip.open(f"{output_dir}/seqinfo.gz", "wt") as o_file:
         siw = SeqInfoWriter(o_file)
         for mg in mito_genes:
             siw.write(
@@ -303,7 +304,7 @@ def main(ncbi_accession: str, output_dir: str):
                 o_file.write(record.format("fasta"))
 
     # write transcript information
-    with open(f"{output_dir}/{ncbi_accession}.txinfo", "w") as o_file:
+    with gzip.open(f"{output_dir}/txinfo.gz", "wt") as o_file:
         tiw = TxInfoWriter(o_file)
         for mg in mito_genes:
             tiw.write(
@@ -318,7 +319,7 @@ def main(ncbi_accession: str, output_dir: str):
             )
 
     # write exonset
-    with open(f"{output_dir}/{ncbi_accession}.exonset", "w") as o_file:
+    with gzip.open(f"{output_dir}/exonsets.gz", "wt") as o_file:
         esw = ExonSetWriter(o_file)
         for mg in mito_genes:
             esw.write(

From aa97fe6ceb90e345241754106e63db198f51ebbc Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Thu, 4 Apr 2024 18:45:10 -0700
Subject: [PATCH 04/36] move seqrepo load into its own script

---
 README.md                    | 23 +++++++++++++++++------
 etc/scripts/run-uta-build.sh |  7 -------
 sbin/seqrepo-load            | 22 ++++++++++++++++++++++
 3 files changed, 39 insertions(+), 13 deletions(-)
 create mode 100755 sbin/seqrepo-load

diff --git a/README.md b/README.md
index d490965..9e0bfe8 100644
--- a/README.md
+++ b/README.md
@@ -293,9 +293,11 @@ To develop UTA, follow these steps.
 
 ## UTA update procedure
 
+Requires bash and docker.
+
 ### 1. Download files from NCBI
 
-Run `sbin/ncbi-download-docker`. Requires bash and docker.
+Run `sbin/ncbi-download-docker`.
 
 Example:
 ```
@@ -327,7 +329,7 @@ The specified directory will have the following structure:
 
 ### 2. Download SeqRepo data
 
-Run `sbin/seqrepo-download`. Requires bash and docker.
+Run `sbin/seqrepo-download`.
 
 Example:
 ```
@@ -346,7 +348,7 @@ mkdir -p $(pwd)/uta-build/logs
 
 #### 3A. Nuclear transcripts
 
-Run `sbin/uta-extract`. Requires bash and docker.
+Run `sbin/uta-extract`.
 
 Example:
 ```
@@ -355,16 +357,25 @@ sbin/uta-extract $(pwd)/ncbi-data $(pwd)/uta-build/loading $(pwd)/uta-build/logs
 
 #### 3B. Mitochondrial transcripts
 
-Run `sbin/ncbi_process_mito.py`. Requires bash and docker.
+Run `sbin/ncbi_process_mito.py`.
 
 Example:
 ```
 sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/uta-build/loading | tee $(pwd)/uta-build/logs/mito.log
 ```
 
-### 4. Update UTA and SeqRepo
+### 4. Load data into SeqRepo
+
+Run `sbin/seqrepo-load`.
+
+Example:
+```
+sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/logs
+```
+
+### 5. Load data into UTA
 
-Run `sbin/uta-update`. Requires bash and docker.
+Run `sbin/uta-update`.
 
 Example:
 ```
diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh
index 9160cc2..ccba691 100755
--- a/etc/scripts/run-uta-build.sh
+++ b/etc/scripts/run-uta-build.sh
@@ -31,13 +31,6 @@ done
 etc/scripts/delete-schema.sh "$loading_uta_v"
 etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v"
 
-## Load SeqRepo with new sequences
-seqrepo load -n NCBI -i "$seqrepo_data_release" \
-  $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz \
-  $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz \
-  $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz 2>& 1 | \
-  tee "$logs_dir/seqrepo-load.log"
-
 # Filter out columns from assocacs file.
 sbin/assoc-acs-merge "$loading_dir/assocacs.gz" | gzip -c > "$loading_dir/assoc-ac.gz" 2>&1 | \
     tee "$logs_dir/assoc-acs-merge.log"
diff --git a/sbin/seqrepo-load b/sbin/seqrepo-load
new file mode 100755
index 0000000..9b830ce
--- /dev/null
+++ b/sbin/seqrepo-load
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+set -e
+
+seqrepo_root=$1
+seqrepo_version=$2
+ncbi_dir=$3
+logs_dir=$4
+
+if [ -z "$seqrepo_root" ] || [ -z "$seqrepo_version" ] || [ -z "$ncbi_dir" ] || [ -z "$logs_dir" ]
+then
+    echo 'Usage: sbin/seqrepo-load <seqrepo_root> <seqrepo_version> <ncbi_dir> <logs_dir>'
+    exit 1
+fi
+
+## Load SeqRepo with new sequences
+seqrepo --root-directory "$seqrepo_root" \
+    load -n NCBI --instance-name "$seqrepo_version" \
+    $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz \
+    $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz \
+    $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz 2>& 1 | \
+    tee "$logs_dir/seqrepo-load.log"

From 46d8ac3cda3fa73c5db8ed14ebe6e9b60d262369 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 00:31:55 -0700
Subject: [PATCH 05/36] copy fasta files into the loading dir

---
 README.md                 | 10 +++++-----
 sbin/ncbi_process_mito.py |  4 ++--
 sbin/seqrepo-load         | 12 ++++++------
 sbin/uta-extract          |  5 +++++
 4 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 9e0bfe8..1792c6b 100644
--- a/README.md
+++ b/README.md
@@ -342,8 +342,8 @@ See 3A for nuclear transcript updates and 3B for mitochondrial transcript update
 
 In either case, first create directories:
 ```
-mkdir -p $(pwd)/uta-build/loading
-mkdir -p $(pwd)/uta-build/logs
+mkdir -p $(pwd)/uta-build
+mkdir -p $(pwd)/uta-logs
 ```
 
 #### 3A. Nuclear transcripts
@@ -352,7 +352,7 @@ Run `sbin/uta-extract`.
 
 Example:
 ```
-sbin/uta-extract $(pwd)/ncbi-data $(pwd)/uta-build/loading $(pwd)/uta-build/logs
+sbin/uta-extract $(pwd)/ncbi-data $(pwd)/uta-build $(pwd)/uta-logs
 ```
 
 #### 3B. Mitochondrial transcripts
@@ -361,7 +361,7 @@ Run `sbin/ncbi_process_mito.py`.
 
 Example:
 ```
-sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/uta-build/loading | tee $(pwd)/uta-build/logs/mito.log
+sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/uta-build | tee $(pwd)/uta-logs/mito.log
 ```
 
 ### 4. Load data into SeqRepo
@@ -370,7 +370,7 @@ Run `sbin/seqrepo-load`.
 
 Example:
 ```
-sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/logs
+sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/uta-build $(pwd)/uta-logs
 ```
 
 ### 5. Load data into UTA
diff --git a/sbin/ncbi_process_mito.py b/sbin/ncbi_process_mito.py
index f8cbdc3..b8d5480 100755
--- a/sbin/ncbi_process_mito.py
+++ b/sbin/ncbi_process_mito.py
@@ -283,7 +283,7 @@ def main(ncbi_accession: str, output_dir: str):
                 )
 
     # write out transcript sequence fasta files.
-    with open(f"{output_dir}/{ncbi_accession}.rna.fna", "w") as o_file:
+    with gzip.open(f"{output_dir}/{ncbi_accession}.rna.fna.gz", "wt") as o_file:
         for mg in mito_genes:
             record = SeqRecord(
                 Seq(mg.tx_seq),
@@ -293,7 +293,7 @@ def main(ncbi_accession: str, output_dir: str):
             o_file.write(record.format("fasta"))
 
     # write out protein sequence fasta files.
-    with open(f"{output_dir}/{ncbi_accession}.protein.faa", "w") as o_file:
+    with gzip.open(f"{output_dir}/{ncbi_accession}.protein.faa.gz", "wt") as o_file:
         for mg in mito_genes:
             if mg.pro_ac is not None:
                 record = SeqRecord(
diff --git a/sbin/seqrepo-load b/sbin/seqrepo-load
index 9b830ce..9aa1453 100755
--- a/sbin/seqrepo-load
+++ b/sbin/seqrepo-load
@@ -4,19 +4,19 @@ set -e
 
 seqrepo_root=$1
 seqrepo_version=$2
-ncbi_dir=$3
+sequence_dir=$3
 logs_dir=$4
 
-if [ -z "$seqrepo_root" ] || [ -z "$seqrepo_version" ] || [ -z "$ncbi_dir" ] || [ -z "$logs_dir" ]
+if [ -z "$seqrepo_root" ] || [ -z "$seqrepo_version" ] || [ -z "$sequence_dir" ] || [ -z "$logs_dir" ]
 then
-    echo 'Usage: sbin/seqrepo-load <seqrepo_root> <seqrepo_version> <ncbi_dir> <logs_dir>'
+    echo 'Usage: sbin/seqrepo-load <seqrepo_root> <seqrepo_version> <sequence_dir> <logs_dir>'
     exit 1
 fi
 
 ## Load SeqRepo with new sequences
 seqrepo --root-directory "$seqrepo_root" \
     load -n NCBI --instance-name "$seqrepo_version" \
-    $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz \
-    $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz \
-    $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz 2>& 1 | \
+    $sequence_dir/*.rna.fna.gz
+    $sequence_dir/*.protein.faa.gz
+    $sequence_dir/*_genomic.fna.gz
     tee "$logs_dir/seqrepo-load.log"
diff --git a/sbin/uta-extract b/sbin/uta-extract
index 6864de4..84c3940 100755
--- a/sbin/uta-extract
+++ b/sbin/uta-extract
@@ -33,3 +33,8 @@ sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$loading_dir/exonsets.g
 # generate seqinfo files from exonsets
 sbin/exonset-to-seqinfo -o NCBI "$loading_dir/exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \
     tee "$logs_dir/exonset-to-seqinfo.log"
+
+# move fasta files into same dir
+cp $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz uta-build/
+cp $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz uta-build/
+cp $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz uta-build/

From 17168b992082de55248c05b8b6c5bfa24df96bd5 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 02:33:30 -0700
Subject: [PATCH 06/36] remove unneeded seqrepo version input

---
 docker-compose.yml           |  2 +-
 etc/scripts/run-uta-build.sh | 12 ++++++------
 sbin/seqrepo-download        |  2 +-
 sbin/seqrepo-load            |  5 ++---
 sbin/uta-update              |  7 +++----
 5 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 856a5ae..0d4923a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -15,7 +15,7 @@ services:
     network_mode: host
   uta-update:
     image: uta-update
-    command: etc/scripts/run-uta-build.sh ${UTA_VERSION} ${SEQREPO_VERSION} /ncbi-dir /workdir
+    command: etc/scripts/run-uta-build.sh ${UTA_VERSION} /ncbi-dir /workdir
     depends_on:
       uta:
         condition: service_healthy
diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh
index ccba691..ab98a16 100755
--- a/etc/scripts/run-uta-build.sh
+++ b/etc/scripts/run-uta-build.sh
@@ -1,20 +1,20 @@
 #!/bin/bash
 
 # source_uta_v is the UTA version before the update.
-# seqrepo_data_release is the SeqRepo version before the update.
 # ncbi_dir is where the script looks for NCBI data files.
 # working_dir stores log files, intermediate data files, and the final database dump.
 
+# Note that the uta loading code uses the seqrepo location defined in the conf files, under [sequences].seqrepo.
+
 set -euxo pipefail
 
 source_uta_v=$1
-seqrepo_data_release=$2
-ncbi_dir=$3
-working_dir=$4
+ncbi_dir=$2
+working_dir=$3
 
-if [ -z "$source_uta_v" ] || [ -z "$seqrepo_data_release" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ]
+if [ -z "$source_uta_v" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ]
 then
-    echo 'Usage: run-uta-build.sh <source_uta_v> <seqrepo_data_release> <ncbi_dir> <working_dir>'
+    echo 'Usage: run-uta-build.sh <source_uta_v> <ncbi_dir> <working_dir>'
     exit 1
 fi
 
diff --git a/sbin/seqrepo-download b/sbin/seqrepo-download
index 0773030..6f628f1 100755
--- a/sbin/seqrepo-download
+++ b/sbin/seqrepo-download
@@ -50,4 +50,4 @@ fi
 
 # Copy seqrepo data into a local directory
 echo "Copying seqrepo data into $OUTPUT_DIR ..."
-docker run -it -v $OUTPUT_DIR:/output-dir --volumes-from $SEQREPO_CONTAINER_NAME:ro ubuntu bash -c 'cp -R /usr/local/share/seqrepo/* /output-dir'
+docker run -it -v $OUTPUT_DIR:/output-dir --volumes-from $SEQREPO_CONTAINER_NAME ubuntu bash -c 'cp -R /usr/local/share/seqrepo/* /output-dir'
diff --git a/sbin/seqrepo-load b/sbin/seqrepo-load
index 9aa1453..3f05842 100755
--- a/sbin/seqrepo-load
+++ b/sbin/seqrepo-load
@@ -16,7 +16,6 @@ fi
 ## Load SeqRepo with new sequences
 seqrepo --root-directory "$seqrepo_root" \
     load -n NCBI --instance-name "$seqrepo_version" \
-    $sequence_dir/*.rna.fna.gz
-    $sequence_dir/*.protein.faa.gz
-    $sequence_dir/*_genomic.fna.gz
+    $sequence_dir/*.fna.gz \
+    $sequence_dir/*.faa.gz 2>& 1 | \
     tee "$logs_dir/seqrepo-load.log"
diff --git a/sbin/uta-update b/sbin/uta-update
index 23d93af..81578bc 100755
--- a/sbin/uta-update
+++ b/sbin/uta-update
@@ -12,15 +12,14 @@ export NCBI_DIR=$1
 export SEQREPO_DIR=$2
 export WORKING_DIR=$3
 export UTA_VERSION=$4
-export SEQREPO_VERSION=$5
 
-if [ -z "$NCBI_DIR" ] || [ -z "$SEQREPO_DIR" ] || [ -z "$WORKING_DIR" ] || [ -z "$UTA_VERSION" ] || [ -z "$SEQREPO_VERSION" ]
+if [ -z "$NCBI_DIR" ] || [ -z "$SEQREPO_DIR" ] || [ -z "$WORKING_DIR" ] || [ -z "$UTA_VERSION" ]
 then
-    echo 'Usage: sbin/uta-update <ncbi_file_dir> <seqrepo_dir> <working_dir> <uta_version> <seqrepo_version>'
+    echo 'Usage: sbin/uta-update <ncbi_file_dir> <seqrepo_dir> <working_dir> <uta_version>'
     exit 1
 else
     echo "Updating UTA and SeqRepo using files in $NCBI_DIR and SeqRepo data in $SEQREPO_DIR"
-    echo "Starting from UTA version $UTA_VERSION and SeqRepo version $SEQREPO_VERSION"
+    echo "Starting from UTA version $UTA_VERSION"
     echo "Logs and intermediate files will be available in $WORKING_DIR"
 fi
 

From 894ef4d0f67b13080f2a0d15d64dc901eda65a98 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 02:36:08 -0700
Subject: [PATCH 07/36] rename uta-update to uta-load, in line with
 seqrepo-load and extract-transform-load

---
 README.md                     | 4 ++--
 docker-compose.yml            | 6 +++---
 sbin/{uta-update => uta-load} | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)
 rename sbin/{uta-update => uta-load} (87%)

diff --git a/README.md b/README.md
index 1792c6b..c68d3e3 100644
--- a/README.md
+++ b/README.md
@@ -375,9 +375,9 @@ sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/uta-build $(pwd)/uta-log
 
 ### 5. Load data into UTA
 
-Run `sbin/uta-update`.
+Run `sbin/uta-load`.
 
 Example:
 ```
-sbin/uta-update $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/uta-build uta_20210129b 2024-02-20
+sbin/uta-load $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/uta-build uta_20210129b 2024-02-20
 ```
diff --git a/docker-compose.yml b/docker-compose.yml
index 0d4923a..39c8069 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,4 +1,4 @@
-# docker compose file for the UTA update procedure
+# docker compose file for the UTA load procedure
 
 version: '3'
 
@@ -13,8 +13,8 @@ services:
       interval: 10s
       retries: 60
     network_mode: host
-  uta-update:
-    image: uta-update
+  uta-load:
+    image: uta-load
     command: etc/scripts/run-uta-build.sh ${UTA_VERSION} /ncbi-dir /workdir
     depends_on:
       uta:
diff --git a/sbin/uta-update b/sbin/uta-load
similarity index 87%
rename from sbin/uta-update
rename to sbin/uta-load
index 81578bc..1a22a80 100755
--- a/sbin/uta-update
+++ b/sbin/uta-load
@@ -15,7 +15,7 @@ export UTA_VERSION=$4
 
 if [ -z "$NCBI_DIR" ] || [ -z "$SEQREPO_DIR" ] || [ -z "$WORKING_DIR" ] || [ -z "$UTA_VERSION" ]
 then
-    echo 'Usage: sbin/uta-update <ncbi_file_dir> <seqrepo_dir> <working_dir> <uta_version>'
+    echo 'Usage: sbin/uta-load <ncbi_file_dir> <seqrepo_dir> <working_dir> <uta_version>'
     exit 1
 else
     echo "Updating UTA and SeqRepo using files in $NCBI_DIR and SeqRepo data in $SEQREPO_DIR"
@@ -59,8 +59,8 @@ if [ ! -d "$WORKING_DIR" ]; then
 fi
 
 # Build the UTA image.
-docker build --target uta -t uta-update .
+docker build --target uta -t uta-load .
 
-# Bring up a UTA database and run the UTA update procedure.
+# Bring up a UTA database and run the UTA load procedure.
 # docker compose doesn't respect the container name specified in the compose file, so container name is specified here
-docker compose run --rm --name uta-update uta-update
+docker compose run --rm --name uta-load uta-load

From 89f8b3b7dca0c78fd61d99ca2a6bad50709c2ea6 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 02:37:27 -0700
Subject: [PATCH 08/36] simplify readme

---
 README.md | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/README.md b/README.md
index c68d3e3..de90910 100644
--- a/README.md
+++ b/README.md
@@ -296,10 +296,6 @@ To develop UTA, follow these steps.
 Requires bash and docker.
 
 ### 1. Download files from NCBI
-
-Run `sbin/ncbi-download-docker`.
-
-Example:
 ```
 sbin/ncbi-download-docker $(pwd)/ncbi-data
 ```
@@ -328,10 +324,6 @@ The specified directory will have the following structure:
                 └── human.1.rna.gbff.gz
 
 ### 2. Download SeqRepo data
-
-Run `sbin/seqrepo-download`.
-
-Example:
 ```
 sbin/seqrepo-download 2024-02-20 $(pwd)/seqrepo-data
 ```
@@ -347,37 +339,21 @@ mkdir -p $(pwd)/uta-logs
 ```
 
 #### 3A. Nuclear transcripts
-
-Run `sbin/uta-extract`.
-
-Example:
 ```
 sbin/uta-extract $(pwd)/ncbi-data $(pwd)/uta-build $(pwd)/uta-logs
 ```
 
 #### 3B. Mitochondrial transcripts
-
-Run `sbin/ncbi_process_mito.py`.
-
-Example:
 ```
 sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/uta-build | tee $(pwd)/uta-logs/mito.log
 ```
 
 ### 4. Load data into SeqRepo
-
-Run `sbin/seqrepo-load`.
-
-Example:
 ```
 sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/uta-build $(pwd)/uta-logs
 ```
 
 ### 5. Load data into UTA
-
-Run `sbin/uta-load`.
-
-Example:
 ```
 sbin/uta-load $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/uta-build uta_20210129b 2024-02-20
 ```

From 6bd387ad115658e2021294c4389b4c0f03e9bee5 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 02:44:10 -0700
Subject: [PATCH 09/36] allow seqrepo to be modified

---
 sbin/seqrepo-download | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sbin/seqrepo-download b/sbin/seqrepo-download
index 6f628f1..5c581b0 100755
--- a/sbin/seqrepo-download
+++ b/sbin/seqrepo-download
@@ -51,3 +51,6 @@ fi
 # Copy seqrepo data into a local directory
 echo "Copying seqrepo data into $OUTPUT_DIR ..."
 docker run -it -v $OUTPUT_DIR:/output-dir --volumes-from $SEQREPO_CONTAINER_NAME ubuntu bash -c 'cp -R /usr/local/share/seqrepo/* /output-dir'
+
+# Allow seqrepo to be modified
+docker run -it -v $OUTPUT_DIR:/output-dir ubuntu bash -c 'chmod -R +w /output-dir'

From ede145e2cba2104fe3f669a694c4b5ec4bcb6f32 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 02:50:49 -0700
Subject: [PATCH 10/36] restructure readme

---
 README.md | 47 ++++++++++++++++++++++-------------------------
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index de90910..57896f6 100644
--- a/README.md
+++ b/README.md
@@ -295,12 +295,28 @@ To develop UTA, follow these steps.
 
 Requires bash and docker.
 
-### 1. Download files from NCBI
+### 1. Download SeqRepo data
 ```
+sbin/seqrepo-download 2024-02-20 $(pwd)/seqrepo-data
+```
+
+### 2. Extract and transform data from NCBI
+
+Download files from NCBI, and extract into intermediate files.
+
+See 2A for nuclear transcripts and 2B for mitochondrial transcripts.
+
+#### 2A. Nuclear transcripts
+```
+mkdir -p $(pwd)/ncbi-data
+mkdir -p $(pwd)/uta-build
+mkdir -p $(pwd)/uta-logs
+
 sbin/ncbi-download-docker $(pwd)/ncbi-data
+sbin/uta-extract $(pwd)/ncbi-data $(pwd)/uta-build $(pwd)/uta-logs
 ```
 
-The specified directory will have the following structure:
+The `ncbi-data` directory will have the following structure:
 
     ├── gene
     │   └── DATA
@@ -323,37 +339,18 @@ The specified directory will have the following structure:
                 ├── human.1.rna.fna.gz
                 └── human.1.rna.gbff.gz
 
-### 2. Download SeqRepo data
-```
-sbin/seqrepo-download 2024-02-20 $(pwd)/seqrepo-data
-```
-
-### 3. Extract data from NCBI files into intermediate files
-
-See 3A for nuclear transcript updates and 3B for mitochondrial transcript updates.
-
-In either case, first create directories:
-```
-mkdir -p $(pwd)/uta-build
-mkdir -p $(pwd)/uta-logs
-```
-
-#### 3A. Nuclear transcripts
-```
-sbin/uta-extract $(pwd)/ncbi-data $(pwd)/uta-build $(pwd)/uta-logs
-```
 
-#### 3B. Mitochondrial transcripts
+#### 2B. Mitochondrial transcripts
 ```
 sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/uta-build | tee $(pwd)/uta-logs/mito.log
 ```
 
-### 4. Load data into SeqRepo
+### 3. Load data into SeqRepo
 ```
 sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/uta-build $(pwd)/uta-logs
 ```
 
-### 5. Load data into UTA
+### 4. Load data into UTA
 ```
-sbin/uta-load $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/uta-build uta_20210129b 2024-02-20
+sbin/uta-load $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/uta-build uta_20210129b
 ```

From 44054fa1599e048d647d1f21274af51471895d66 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 02:55:57 -0700
Subject: [PATCH 11/36] change dirs in readme

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 57896f6..a1c6407 100644
--- a/README.md
+++ b/README.md
@@ -309,11 +309,11 @@ See 2A for nuclear transcripts and 2B for mitochondrial transcripts.
 #### 2A. Nuclear transcripts
 ```
 mkdir -p $(pwd)/ncbi-data
-mkdir -p $(pwd)/uta-build
-mkdir -p $(pwd)/uta-logs
+mkdir -p $(pwd)/output/artifacts
+mkdir -p $(pwd)/output/logs
 
 sbin/ncbi-download-docker $(pwd)/ncbi-data
-sbin/uta-extract $(pwd)/ncbi-data $(pwd)/uta-build $(pwd)/uta-logs
+sbin/uta-extract $(pwd)/ncbi-data $(pwd)/output/artifacts $(pwd)/output/logs
 ```
 
 The `ncbi-data` directory will have the following structure:
@@ -342,15 +342,15 @@ The `ncbi-data` directory will have the following structure:
 
 #### 2B. Mitochondrial transcripts
 ```
-sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/uta-build | tee $(pwd)/uta-logs/mito.log
+sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/output/artifacts | tee $(pwd)/output/logs/mito.log
 ```
 
 ### 3. Load data into SeqRepo
 ```
-sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/uta-build $(pwd)/uta-logs
+sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/output/artifacts $(pwd)/output/logs
 ```
 
 ### 4. Load data into UTA
 ```
-sbin/uta-load $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/uta-build uta_20210129b
+sbin/uta-load $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/output/artifacts uta_20210129b
 ```

From cc329156ca46f7353c3f97f581e6fc33267f35d1 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 03:11:25 -0700
Subject: [PATCH 12/36] be explicit about all dirs

---
 README.md                    |  2 +-
 docker-compose.yml           |  5 +++--
 etc/scripts/run-uta-build.sh | 27 ++++++++++++---------------
 sbin/uta-load                | 29 +++++++++++++++++++++--------
 4 files changed, 37 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index a1c6407..1fd195b 100644
--- a/README.md
+++ b/README.md
@@ -352,5 +352,5 @@ sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/output/artifacts $(pwd)/
 
 ### 4. Load data into UTA
 ```
-sbin/uta-load $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/output/artifacts uta_20210129b
+sbin/uta-load $(pwd)/ncbi-data $(pwd)/seqrepo-data uta_20210129b $(pwd)/output/artifacts $(pwd)/output/logs
 ```
diff --git a/docker-compose.yml b/docker-compose.yml
index 39c8069..29a592d 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -15,12 +15,13 @@ services:
     network_mode: host
   uta-load:
     image: uta-load
-    command: etc/scripts/run-uta-build.sh ${UTA_VERSION} /ncbi-dir /workdir
+    command: etc/scripts/run-uta-build.sh ${UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs
     depends_on:
       uta:
         condition: service_healthy
     volumes:
       - ${NCBI_DIR}:/ncbi-dir
       - ${SEQREPO_DIR}:/usr/local/share/seqrepo
-      - ${WORKING_DIR}:/workdir
+      - ${WORKING_DIR}:/uta-load/work
+      - ${LOG_DIR}:/uta-load/logs
     network_mode: host
diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh
index ab98a16..a23c9b6 100755
--- a/etc/scripts/run-uta-build.sh
+++ b/etc/scripts/run-uta-build.sh
@@ -2,7 +2,8 @@
 
 # source_uta_v is the UTA version before the update.
 # ncbi_dir is where the script looks for NCBI data files.
-# working_dir stores log files, intermediate data files, and the final database dump.
+# working_dir stores intermediate data files and the final database dump.
+# log_dir stores log files.
 
 # Note that the uta loading code uses the seqrepo location defined in the conf files, under [sequences].seqrepo.
 
@@ -11,44 +12,40 @@ set -euxo pipefail
 source_uta_v=$1
 ncbi_dir=$2
 working_dir=$3
+log_dir=$4
 
-if [ -z "$source_uta_v" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ]
+if [ -z "$source_uta_v" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ]
 then
-    echo 'Usage: run-uta-build.sh <source_uta_v> <ncbi_dir> <working_dir>'
+    echo 'Usage: run-uta-build.sh <source_uta_v> <ncbi_dir> <working_dir> <log_dir>'
     exit 1
 fi
 
 # set local variables and create working directories
 loading_uta_v="uta_1_1"
-loading_dir="$working_dir/loading"
-dumps_dir="$working_dir/dumps"
-logs_dir="$working_dir/logs"
-for d in "$loading_dir" "$dumps_dir" "$logs_dir";
-  do mkdir -p "$d"
-done
+mkdir -p "$logs_dir"
 
 ## Drop loading schema, and recreate
 etc/scripts/delete-schema.sh "$loading_uta_v"
 etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v"
 
 # Filter out columns from assocacs file.
-sbin/assoc-acs-merge "$loading_dir/assocacs.gz" | gzip -c > "$loading_dir/assoc-ac.gz" 2>&1 | \
+sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc-ac.gz" 2>&1 | \
     tee "$logs_dir/assoc-acs-merge.log"
 
 # Load genes into gene table.
-uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$loading_dir/geneinfo.gz" 2>&1 | \
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \
     tee "$logs_dir/load-geneinfo.log"
 
 # Load accessions into associated_accessions table.
-uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$loading_dir/assoc-ac.gz" 2>&1 | \
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$working_dir/assoc-ac.gz" 2>&1 | \
     tee "$logs_dir/load-assoc-ac.log"
 
 # Load transcript info into transcript and exon_set tables.
-uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$loading_dir/txinfo.gz" 2>&1 | \
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$working_dir/txinfo.gz" 2>&1 | \
     tee "$logs_dir/load-txinfo.log"
 
 # Load exon sets into into exon_set and exon tables.
-uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$loading_dir/exonsets.gz" 2>&1 | \
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$working_dir/exonsets.gz" 2>&1 | \
     tee "$logs_dir/load-exonsets.log"
 
 # Create cigar strings for all rows in tx_alt_exon_pairs_v view and update exon_aln table.
@@ -61,4 +58,4 @@ uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 |
 sbin/uta-diff "$source_uta_v" "$loading_uta_v"
 
 ### psql_dump
-pg_dump -U uta_admin -h localhost -d uta -t "$loading_uta_v.gene" | gzip -c > "$dumps_dir/uta.pgd.gz"
+pg_dump -U uta_admin -h localhost -d uta -t "$loading_uta_v.gene" | gzip -c > "$working_dir/uta.pgd.gz"
diff --git a/sbin/uta-load b/sbin/uta-load
index 1a22a80..6de36ab 100755
--- a/sbin/uta-load
+++ b/sbin/uta-load
@@ -1,26 +1,28 @@
 #!/usr/bin/env bash
 
 # This script runs the UTA update procedure.
-# It updates the specified UTA and SeqRepo using the given NCBI files.
-# It produces a postgres dump of the updated UTA database and an updated SeqRepo (updated in place).
 # It expects to be run from the root of the uta repository.
+# It updates the specified UTA and SeqRepo using the given NCBI files.
+# The UTA update is provided as a postgres database dump, and SeqRepo is updated in place.
 
 set -e
 
 # export environment variables for docker compose file
 export NCBI_DIR=$1
 export SEQREPO_DIR=$2
-export WORKING_DIR=$3
-export UTA_VERSION=$4
+export UTA_VERSION=$3
+export WORKING_DIR=$4
+export LOG_DIR=$4
 
-if [ -z "$NCBI_DIR" ] || [ -z "$SEQREPO_DIR" ] || [ -z "$WORKING_DIR" ] || [ -z "$UTA_VERSION" ]
+if [ -z "$NCBI_DIR" ] || [ -z "$SEQREPO_DIR" ] || [ -z "$UTA_VERSION" ] || [ -z "$WORKING_DIR" ] || [ -z "$LOG_DIR" ]
 then
-    echo 'Usage: sbin/uta-load <ncbi_file_dir> <seqrepo_dir> <working_dir> <uta_version>'
+    echo 'Usage: sbin/uta-load <ncbi_file_dir> <seqrepo_dir> <uta_version> <output_dir> <log_dir>'
     exit 1
 else
     echo "Updating UTA and SeqRepo using files in $NCBI_DIR and SeqRepo data in $SEQREPO_DIR"
     echo "Starting from UTA version $UTA_VERSION"
-    echo "Logs and intermediate files will be available in $WORKING_DIR"
+    echo "Final dump and any intermediate files will be available in $WORKING_DIR"
+    echo "Logs will be available in $LOG_DIR"
 fi
 
 # Ensure directories are compatible with docker volume usage
@@ -38,7 +40,13 @@ fi
 
 if [[ $WORKING_DIR != /* ]] && [[ $WORKING_DIR != .* ]]
 then
-    echo 'Working directory must start with / or .'
+    echo 'Output directory must start with / or .'
+    exit 1
+fi
+
+if [[ $LOG_DIR != /* ]] && [[ $LOG_DIR != .* ]]
+then
+    echo 'Log directory must start with / or .'
     exit 1
 fi
 
@@ -58,6 +66,11 @@ if [ ! -d "$WORKING_DIR" ]; then
     exit 1
 fi
 
+if [ ! -d "$LOG_DIR" ]; then
+    echo "Directory $LOG_DIR does not exist."
+    exit 1
+fi
+
 # Build the UTA image.
 docker build --target uta -t uta-load .
 

From df62d46228e830b7de29baaa932709b2e7d37636 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 03:13:01 -0700
Subject: [PATCH 13/36] mkdir needs to happen in both nuclear and mito paths

---
 README.md | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 1fd195b..4f4b450 100644
--- a/README.md
+++ b/README.md
@@ -301,6 +301,11 @@ sbin/seqrepo-download 2024-02-20 $(pwd)/seqrepo-data
 ```
 
 ### 2. Extract and transform data from NCBI
+```
+mkdir -p $(pwd)/ncbi-data
+mkdir -p $(pwd)/output/artifacts
+mkdir -p $(pwd)/output/logs
+```
 
 Download files from NCBI, and extract into intermediate files.
 
@@ -308,10 +313,6 @@ See 2A for nuclear transcripts and 2B for mitochondrial transcripts.
 
 #### 2A. Nuclear transcripts
 ```
-mkdir -p $(pwd)/ncbi-data
-mkdir -p $(pwd)/output/artifacts
-mkdir -p $(pwd)/output/logs
-
 sbin/ncbi-download-docker $(pwd)/ncbi-data
 sbin/uta-extract $(pwd)/ncbi-data $(pwd)/output/artifacts $(pwd)/output/logs
 ```

From f6e5032a941da686bd11d085f0e192d6ff253ae3 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 03:17:29 -0700
Subject: [PATCH 14/36] consistent dir name

---
 README.md                    |  4 ++++
 etc/scripts/run-uta-build.sh | 14 +++++++-------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 4f4b450..eff1f88 100644
--- a/README.md
+++ b/README.md
@@ -314,6 +314,8 @@ See 2A for nuclear transcripts and 2B for mitochondrial transcripts.
 #### 2A. Nuclear transcripts
 ```
 sbin/ncbi-download-docker $(pwd)/ncbi-data
+
+# todo: move into docker
 sbin/uta-extract $(pwd)/ncbi-data $(pwd)/output/artifacts $(pwd)/output/logs
 ```
 
@@ -343,11 +345,13 @@ The `ncbi-data` directory will have the following structure:
 
 #### 2B. Mitochondrial transcripts
 ```
+# todo: move into docker
 sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/output/artifacts | tee $(pwd)/output/logs/mito.log
 ```
 
 ### 3. Load data into SeqRepo
 ```
+# todo: move into docker
 sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/output/artifacts $(pwd)/output/logs
 ```
 
diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh
index a23c9b6..69c4750 100755
--- a/etc/scripts/run-uta-build.sh
+++ b/etc/scripts/run-uta-build.sh
@@ -22,7 +22,7 @@ fi
 
 # set local variables and create working directories
 loading_uta_v="uta_1_1"
-mkdir -p "$logs_dir"
+mkdir -p "$log_dir"
 
 ## Drop loading schema, and recreate
 etc/scripts/delete-schema.sh "$loading_uta_v"
@@ -30,27 +30,27 @@ etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v"
 
 # Filter out columns from assocacs file.
 sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc-ac.gz" 2>&1 | \
-    tee "$logs_dir/assoc-acs-merge.log"
+    tee "$log_dir/assoc-acs-merge.log"
 
 # Load genes into gene table.
 uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \
-    tee "$logs_dir/load-geneinfo.log"
+    tee "$log_dir/load-geneinfo.log"
 
 # Load accessions into associated_accessions table.
 uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$working_dir/assoc-ac.gz" 2>&1 | \
-    tee "$logs_dir/load-assoc-ac.log"
+    tee "$log_dir/load-assoc-ac.log"
 
 # Load transcript info into transcript and exon_set tables.
 uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$working_dir/txinfo.gz" 2>&1 | \
-    tee "$logs_dir/load-txinfo.log"
+    tee "$log_dir/load-txinfo.log"
 
 # Load exon sets into into exon_set and exon tables.
 uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$working_dir/exonsets.gz" 2>&1 | \
-    tee "$logs_dir/load-exonsets.log"
+    tee "$log_dir/load-exonsets.log"
 
 # Create cigar strings for all rows in tx_alt_exon_pairs_v view and update exon_aln table.
 uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 | \
-    tee "$logs_dir/align-exons.log"
+    tee "$log_dir/align-exons.log"
 
 # Load seqinfo?
 

From 23617261e5dfae16f3c4e529b3ac531dd4668068 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 03:44:15 -0700
Subject: [PATCH 15/36] mito: strand should be an int

---
 etc/scripts/run-uta-build.sh | 6 +++---
 sbin/ncbi_process_mito.py    | 6 ++----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh
index 69c4750..0aab2f5 100755
--- a/etc/scripts/run-uta-build.sh
+++ b/etc/scripts/run-uta-build.sh
@@ -32,9 +32,9 @@ etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v"
 sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc-ac.gz" 2>&1 | \
     tee "$log_dir/assoc-acs-merge.log"
 
-# Load genes into gene table.
-uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \
-    tee "$log_dir/load-geneinfo.log"
+# # Load genes into gene table.
+# uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \
+#     tee "$log_dir/load-geneinfo.log"
 
 # Load accessions into associated_accessions table.
 uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$working_dir/assoc-ac.gz" 2>&1 | \
diff --git a/sbin/ncbi_process_mito.py b/sbin/ncbi_process_mito.py
index b8d5480..c65984b 100755
--- a/sbin/ncbi_process_mito.py
+++ b/sbin/ncbi_process_mito.py
@@ -86,7 +86,7 @@ class MitoGeneData:
     alt_ac: str
     alt_start: int
     alt_end: int
-    strand: str
+    strand: int
     origin: str = "NCBI"
     alignment_method: str = "splign"
     transl_table: Optional[str] = None
@@ -204,9 +204,7 @@ def get_mito_genes(gbff_filepath: str):
                     # retrieve sequence, and reverse compliment if on reverse strand
                     ac = f"{record.id}_{feature.location.start:05}_{feature.location.end:05}"
                     feature_seq = record.seq[feature_start:feature_end]
-                    strand = "+"
                     if feature.location.strand == -1:
-                        strand = "-"
                         feature_seq = feature_seq.reverse_complement()
 
                     if feature.type == "CDS":
@@ -229,7 +227,7 @@ def get_mito_genes(gbff_filepath: str):
                         alt_ac=record.id,
                         alt_start=feature_start,
                         alt_end=feature_end,
-                        strand=strand,
+                        strand=feature.location.strand,
                         transl_table=transl_table,
                         transl_except=transl_except,
                         pro_ac=pro_ac,

From cf46cc0538ef0bf35a3b7cbc16341a3f4483d7c2 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 10:53:39 -0700
Subject: [PATCH 16/36] rename uta loading script

---
 docker-compose.yml           | 2 +-
 etc/scripts/run-uta-build.sh | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 29a592d..65611fa 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -15,7 +15,7 @@ services:
     network_mode: host
   uta-load:
     image: uta-load
-    command: etc/scripts/run-uta-build.sh ${UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs
+    command: etc/scripts/uta-load ${UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs
     depends_on:
       uta:
         condition: service_healthy
diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh
index 0aab2f5..b541bb7 100755
--- a/etc/scripts/run-uta-build.sh
+++ b/etc/scripts/run-uta-build.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 # source_uta_v is the UTA version before the update.
 # ncbi_dir is where the script looks for NCBI data files.
@@ -16,7 +16,7 @@ log_dir=$4
 
 if [ -z "$source_uta_v" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ]
 then
-    echo 'Usage: run-uta-build.sh <source_uta_v> <ncbi_dir> <working_dir> <log_dir>'
+    echo 'Usage: uta-load <source_uta_v> <ncbi_dir> <working_dir> <log_dir>'
     exit 1
 fi
 

From 329fdc4ee69f9294e8d3086c899a46e81a4dc8a6 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 11:12:08 -0700
Subject: [PATCH 17/36] remove docker wrapper script for download

---
 README.md                 |  2 +-
 docker-compose.yml        |  8 ++++++++
 sbin/ncbi-download        |  1 +
 sbin/ncbi-download-docker | 24 ------------------------
 4 files changed, 10 insertions(+), 25 deletions(-)
 delete mode 100755 sbin/ncbi-download-docker

diff --git a/README.md b/README.md
index eff1f88..a432682 100644
--- a/README.md
+++ b/README.md
@@ -313,7 +313,7 @@ See 2A for nuclear transcripts and 2B for mitochondrial transcripts.
 
 #### 2A. Nuclear transcripts
 ```
-sbin/ncbi-download-docker $(pwd)/ncbi-data
+NCBI_DIR=./ncbi-data2 docker compose run ncbi-download
 
 # todo: move into docker
 sbin/uta-extract $(pwd)/ncbi-data $(pwd)/output/artifacts $(pwd)/output/logs
diff --git a/docker-compose.yml b/docker-compose.yml
index 65611fa..807fc49 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -3,6 +3,14 @@
 version: '3'
 
 services:
+  ncbi-download:
+    image: uta-load
+    command: sbin/ncbi-download /ncbi-dir
+    volumes:
+      - .:/opt/repos/uta
+      - ${NCBI_DIR}:/ncbi-dir
+    working_dir: /opt/repos/uta
+    network_mode: host
   uta:
     container_name: uta
     image: biocommons/uta:${UTA_VERSION}
diff --git a/sbin/ncbi-download b/sbin/ncbi-download
index 61a6874..500c25f 100755
--- a/sbin/ncbi-download
+++ b/sbin/ncbi-download
@@ -26,6 +26,7 @@ do
     DOWNLOAD_MODULE="${DOWNLOAD_PATH%%/*}"
     DOWNLOAD_SRC="ftp.ncbi.nlm.nih.gov::$DOWNLOAD_PATH"
     DOWNLOAD_DST="$DOWNLOAD_DIR/$DOWNLOAD_MODULE"
+    mkdir -p $DOWNLOAD_DST
     echo "Downloading $DOWNLOAD_SRC to $DOWNLOAD_DST"
     rsync --no-motd -DHPRprtv "$DOWNLOAD_SRC" "$DOWNLOAD_DST"
 done
diff --git a/sbin/ncbi-download-docker b/sbin/ncbi-download-docker
deleted file mode 100755
index 30d4f63..0000000
--- a/sbin/ncbi-download-docker
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/env bash
-
-# This script runs ncbi-download in a docker container.
-
-set -e
-
-DOWNLOAD_DIR=$1
-
-if [ -z "$DOWNLOAD_DIR" ]
-then
-    echo 'Usage: sbin/ncbi-download-docker <download_dir>'
-    exit 1
-else
-    echo "Downloading files to $DOWNLOAD_DIR"
-fi
-
-if [[ $DOWNLOAD_DIR != /* ]]
-then
-    echo 'Download directory must be an absolute path'
-    exit 1
-fi
-
-docker build --target uta -t ncbi-download --progress plain .
-docker run -it -v $(pwd)/sbin/ncbi-download:/dl-script -v $DOWNLOAD_DIR:/output-dir ncbi-download /dl-script /output-dir

From bcfbf3816e9ef43d3d96e1c372ffee597f9a619f Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 11:28:28 -0700
Subject: [PATCH 18/36] create compose service for uta-extract

---
 README.md          | 56 ++++++++++++++++++----------------------------
 docker-compose.yml | 21 ++++++++++++-----
 sbin/ncbi-download | 23 +++++++++++++++++++
 sbin/uta-extract   |  2 +-
 4 files changed, 62 insertions(+), 40 deletions(-)

diff --git a/README.md b/README.md
index a432682..b2a397a 100644
--- a/README.md
+++ b/README.md
@@ -293,56 +293,44 @@ To develop UTA, follow these steps.
 
 ## UTA update procedure
 
-Requires bash and docker.
+Requires docker.
 
-### 1. Download SeqRepo data
-```
-sbin/seqrepo-download 2024-02-20 $(pwd)/seqrepo-data
-```
+### 0. Setup
 
-### 2. Extract and transform data from NCBI
+Make directories:
 ```
 mkdir -p $(pwd)/ncbi-data
 mkdir -p $(pwd)/output/artifacts
 mkdir -p $(pwd)/output/logs
 ```
 
+Set variables:
+```
+export UTA_ETL_NCBI_DIR=./ncbi-data
+export UTA_ETL_SEQREPO_DIR=./seqrepo-data
+export UTA_ETL_UTA_VERSION=uta_20210129b
+export UTA_ETL_WORK_DIR=./output/artifacts
+export UTA_ETL_LOG_DIR=./output/logs
+```
+
+### 1. Download SeqRepo data
+```
+tbd
+sbin/seqrepo-download 2024-02-20 $(pwd)/seqrepo-data
+```
+
+### 2. Extract and transform data from NCBI
+
 Download files from NCBI, and extract into intermediate files.
 
 See 2A for nuclear transcripts and 2B for mitochondrial transcripts.
 
 #### 2A. Nuclear transcripts
 ```
-NCBI_DIR=./ncbi-data2 docker compose run ncbi-download
-
-# todo: move into docker
-sbin/uta-extract $(pwd)/ncbi-data $(pwd)/output/artifacts $(pwd)/output/logs
+docker compose run ncbi-download
+docker compose run uta-extract
 ```
 
-The `ncbi-data` directory will have the following structure:
-
-    ├── gene
-    │   └── DATA
-    │       ├── GENE_INFO
-    │       │   └── Mammalia
-    │       │       └── Homo_sapiens.gene_info.gz
-    │       └── gene2accession.gz
-    ├── genomes
-    │   └── refseq
-    │       └── vertebrate_mammalian
-    │           └── Homo_sapiens
-    │               └── all_assembly_versions
-    │                   └── GCF_000001405.25_GRCh37.p13
-    │                       ├── GCF_000001405.25_GRCh37.p13_genomic.fna.gz
-    │                       └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz
-    └── refseq
-        └── H_sapiens
-            └── mRNA_Prot
-                ├── human.1.protein.faa.gz
-                ├── human.1.rna.fna.gz
-                └── human.1.rna.gbff.gz
-
-
 #### 2B. Mitochondrial transcripts
 ```
 # todo: move into docker
diff --git a/docker-compose.yml b/docker-compose.yml
index 807fc49..f048104 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -8,7 +8,18 @@ services:
     command: sbin/ncbi-download /ncbi-dir
     volumes:
       - .:/opt/repos/uta
-      - ${NCBI_DIR}:/ncbi-dir
+      - ${UTA_ETL_NCBI_DIR}:/ncbi-dir
+    working_dir: /opt/repos/uta
+    network_mode: host
+  uta-extract:
+    image: uta-load
+    command: sbin/uta-extract /ncbi-dir /uta-extract/work /uta-extract/logs
+    volumes:
+      - .:/opt/repos/uta
+      - ${UTA_ETL_NCBI_DIR}:/ncbi-dir
+      - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo
+      - ${UTA_ETL_WORK_DIR}:/uta-extract/work
+      - ${UTA_ETL_LOG_DIR}:/uta-extract/logs
     working_dir: /opt/repos/uta
     network_mode: host
   uta:
@@ -28,8 +39,8 @@ services:
       uta:
         condition: service_healthy
     volumes:
-      - ${NCBI_DIR}:/ncbi-dir
-      - ${SEQREPO_DIR}:/usr/local/share/seqrepo
-      - ${WORKING_DIR}:/uta-load/work
-      - ${LOG_DIR}:/uta-load/logs
+      - ${UTA_ETL_NCBI_DIR}:/ncbi-dir
+      - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo
+      - ${UTA_ETL_WORK_DIR}:/uta-load/work
+      - ${UTA_ETL_LOG_DIR}:/uta-load/logs
     network_mode: host
diff --git a/sbin/ncbi-download b/sbin/ncbi-download
index 500c25f..dc445cd 100755
--- a/sbin/ncbi-download
+++ b/sbin/ncbi-download
@@ -1,6 +1,29 @@
 #!/usr/bin/env bash
 
 # This script downloads the files needed for a UTA+SeqRepo update into to the given directory.
+#
+# DOWNLOAD_DIR will have the following structure:
+#
+#    ├── gene
+#    │   └── DATA
+#    │       ├── GENE_INFO
+#    │       │   └── Mammalia
+#    │       │       └── Homo_sapiens.gene_info.gz
+#    │       └── gene2accession.gz
+#    ├── genomes
+#    │   └── refseq
+#    │       └── vertebrate_mammalian
+#    │           └── Homo_sapiens
+#    │               └── all_assembly_versions
+#    │                   └── GCF_000001405.25_GRCh37.p13
+#    │                       ├── GCF_000001405.25_GRCh37.p13_genomic.fna.gz
+#    │                       └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz
+#    └── refseq
+#        └── H_sapiens
+#            └── mRNA_Prot
+#                ├── human.1.protein.faa.gz
+#                ├── human.1.rna.fna.gz
+#                └── human.1.rna.gbff.gz
 
 set -e
 
diff --git a/sbin/uta-extract b/sbin/uta-extract
index 84c3940..4d89f86 100755
--- a/sbin/uta-extract
+++ b/sbin/uta-extract
@@ -30,7 +30,7 @@ GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_as
 sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$loading_dir/exonsets.gz" 2>&1 | \
     tee "$logs_dir/ncbi-parse-genomic-gff.log"
 
-# generate seqinfo files from exonsets
+# generate seqinfo files from exonsets (this step requires seqrepo)
 sbin/exonset-to-seqinfo -o NCBI "$loading_dir/exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \
     tee "$logs_dir/exonset-to-seqinfo.log"
 

From a3dcbcf705dfcd0d02f2519bf7476892b355afee Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 11:38:46 -0700
Subject: [PATCH 19/36] create compose service for mito-extract

---
 README.md          |  3 +--
 docker-compose.yml |  9 +++++++++
 sbin/uta-extract   | 22 +++++++++++-----------
 3 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index b2a397a..332980f 100644
--- a/README.md
+++ b/README.md
@@ -333,8 +333,7 @@ docker compose run uta-extract
 
 #### 2B. Mitochondrial transcripts
 ```
-# todo: move into docker
-sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/output/artifacts | tee $(pwd)/output/logs/mito.log
+docker compose run mito-extract
 ```
 
 ### 3. Load data into SeqRepo
diff --git a/docker-compose.yml b/docker-compose.yml
index f048104..8ba1345 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -44,3 +44,12 @@ services:
       - ${UTA_ETL_WORK_DIR}:/uta-load/work
       - ${UTA_ETL_LOG_DIR}:/uta-load/logs
     network_mode: host
+  mito-extract:
+    image: uta-load
+    command: sbin/ncbi_process_mito.py NC_012920.1 --output-dir /mito-extract/work | tee /mito-extract/logs/mito.log
+    volumes:
+      - .:/opt/repos/uta
+      - ${UTA_ETL_WORK_DIR}:/mito-extract/work
+      - ${UTA_ETL_LOG_DIR}:/mito-extract/logs
+    working_dir: /opt/repos/uta
+    network_mode: host
diff --git a/sbin/uta-extract b/sbin/uta-extract
index 4d89f86..b34938f 100755
--- a/sbin/uta-extract
+++ b/sbin/uta-extract
@@ -3,38 +3,38 @@
 # Extract data from NCBI files into intermediate files.
 
 ncbi_dir=$1
-loading_dir=$2
+working_dir=$2
 logs_dir=$3
 
-if [ -z "$ncbi_dir" ] || [ -z "$loading_dir" ] || [ -z "$logs_dir" ]
+if [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$logs_dir" ]
 then
-    echo 'Usage: sbin/uta-extract <ncbi_dir> <loading_dir> <logs_dir>'
+    echo 'Usage: sbin/uta-extract <ncbi_dir> <working_dir> <logs_dir>'
     exit 1
 fi
 
 # genes
 sbin/ncbi-parse-geneinfo $ncbi_dir/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz | \
-    gzip -c > "$loading_dir/geneinfo.gz" 2>&1 | tee "$logs_dir/ncbi-parse-geneinfo.log"
+    gzip -c > "$working_dir/geneinfo.gz" 2>&1 | tee "$logs_dir/ncbi-parse-geneinfo.log"
 
 # transcript protein associations
-sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2accession.gz | gzip -c > "$loading_dir/assocacs.gz" 2>&1 | \
+sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2accession.gz | gzip -c > "$working_dir/assocacs.gz" 2>&1 | \
     tee "$logs_dir/ncbi-fetch-assoc-acs.log"
 
 # parse transcript info from GBFF input files
 GBFF_files=$(ls $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz)
-sbin/ncbi-parse-gbff "$GBFF_files" | gzip -c > "$loading_dir/txinfo.gz" 2>&1 | \
+sbin/ncbi-parse-gbff "$GBFF_files" | gzip -c > "$working_dir/txinfo.gz" 2>&1 | \
     tee "$logs_dir/ncbi-parse-gbff.log"
 
 # parse alignments from GFF input files
 GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.gff.gz)
-sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$loading_dir/exonsets.gz" 2>&1 | \
+sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$working_dir/exonsets.gz" 2>&1 | \
     tee "$logs_dir/ncbi-parse-genomic-gff.log"
 
 # generate seqinfo files from exonsets (this step requires seqrepo)
-sbin/exonset-to-seqinfo -o NCBI "$loading_dir/exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \
+sbin/exonset-to-seqinfo -o NCBI "$working_dir/exonsets.gz" | gzip -c > "$working_dir/seqinfo.gz" 2>&1 | \
     tee "$logs_dir/exonset-to-seqinfo.log"
 
 # move fasta files into same dir
-cp $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz uta-build/
-cp $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz uta-build/
-cp $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz uta-build/
+cp $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz $working_dir/
+cp $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz $working_dir/
+cp $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz $working_dir/

From d327c0a253ee1c00b2eb0da01db6ffba54179f1e Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 11:41:44 -0700
Subject: [PATCH 20/36] create compose service for seqrepo-load

---
 README.md          |  3 +--
 docker-compose.yml | 10 ++++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 332980f..3985719 100644
--- a/README.md
+++ b/README.md
@@ -338,8 +338,7 @@ docker compose run mito-extract
 
 ### 3. Load data into SeqRepo
 ```
-# todo: move into docker
-sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/output/artifacts $(pwd)/output/logs
+docker compose run seqrepo-load
 ```
 
 ### 4. Load data into UTA
diff --git a/docker-compose.yml b/docker-compose.yml
index 8ba1345..652f3ff 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -22,6 +22,16 @@ services:
       - ${UTA_ETL_LOG_DIR}:/uta-extract/logs
     working_dir: /opt/repos/uta
     network_mode: host
+  seqrepo-load:
+    image: uta-load
+    command: sbin/seqrepo-load /usr/local/share/seqrepo 2024-02-20 /seqrepo-load/work /seqrepo-load/logs
+    volumes:
+      - .:/opt/repos/uta
+      - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo
+      - ${UTA_ETL_WORK_DIR}:/seqrepo-load/work
+      - ${UTA_ETL_LOG_DIR}:/seqrepo-load/logs
+    working_dir: /opt/repos/uta
+    network_mode: host
   uta:
     container_name: uta
     image: biocommons/uta:${UTA_VERSION}

From 540b4ddeb93adf4f01ecf3b5511d385aed800d39 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 11:57:01 -0700
Subject: [PATCH 21/36] clean up uta-load command

---
 README.md                                  | 15 ++++++++++++++-
 docker-compose.yml                         | 19 ++++++++++---------
 etc/scripts/{run-uta-build.sh => uta-load} |  1 +
 sbin/uta-load                              |  9 +--------
 4 files changed, 26 insertions(+), 18 deletions(-)
 rename etc/scripts/{run-uta-build.sh => uta-load} (97%)

diff --git a/README.md b/README.md
index 3985719..f5a28d7 100644
--- a/README.md
+++ b/README.md
@@ -313,6 +313,14 @@ export UTA_ETL_WORK_DIR=./output/artifacts
 export UTA_ETL_LOG_DIR=./output/logs
 ```
 
+Build the UTA image:
+```
+docker build --target uta -t uta-update .
+```
+
+Note: docker compose does not respect the container name specified in the compose file,
+so you may want to specify it with `--name=<container_name>`
+
 ### 1. Download SeqRepo data
 ```
 tbd
@@ -342,6 +350,11 @@ docker compose run seqrepo-load
 ```
 
 ### 4. Load data into UTA
+
+Bring up a UTA database and run the UTA load procedure.
+
+UTA is updated and the database is dumped into a pgd file. SeqRepo is updated in place.
+
 ```
-sbin/uta-load $(pwd)/ncbi-data $(pwd)/seqrepo-data uta_20210129b $(pwd)/output/artifacts $(pwd)/output/logs
+docker compose run uta-load
 ```
diff --git a/docker-compose.yml b/docker-compose.yml
index 652f3ff..bdfc033 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,10 +1,10 @@
-# docker compose file for the UTA load procedure
+# docker compose file for the UTA update procedure
 
 version: '3'
 
 services:
   ncbi-download:
-    image: uta-load
+    image: uta-update
     command: sbin/ncbi-download /ncbi-dir
     volumes:
       - .:/opt/repos/uta
@@ -12,7 +12,7 @@ services:
     working_dir: /opt/repos/uta
     network_mode: host
   uta-extract:
-    image: uta-load
+    image: uta-update
     command: sbin/uta-extract /ncbi-dir /uta-extract/work /uta-extract/logs
     volumes:
       - .:/opt/repos/uta
@@ -23,7 +23,7 @@ services:
     working_dir: /opt/repos/uta
     network_mode: host
   seqrepo-load:
-    image: uta-load
+    image: uta-update
     command: sbin/seqrepo-load /usr/local/share/seqrepo 2024-02-20 /seqrepo-load/work /seqrepo-load/logs
     volumes:
       - .:/opt/repos/uta
@@ -34,28 +34,29 @@ services:
     network_mode: host
   uta:
     container_name: uta
-    image: biocommons/uta:${UTA_VERSION}
+    image: biocommons/uta:${UTA_ETL_UTA_VERSION}
     environment:
       - POSTGRES_HOST_AUTH_METHOD=trust
     healthcheck:
-      test: psql -h localhost -U anonymous -d uta -c "select * from ${UTA_VERSION}.meta"
+      test: psql -h localhost -U anonymous -d uta -c "select * from ${UTA_ETL_UTA_VERSION}.meta"
       interval: 10s
       retries: 60
     network_mode: host
   uta-load:
-    image: uta-load
-    command: etc/scripts/uta-load ${UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs
+    image: uta-update
+    command: etc/scripts/uta-load ${UTA_ETL_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs
     depends_on:
       uta:
         condition: service_healthy
     volumes:
+      - .:/opt/repos/uta
       - ${UTA_ETL_NCBI_DIR}:/ncbi-dir
       - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo
       - ${UTA_ETL_WORK_DIR}:/uta-load/work
       - ${UTA_ETL_LOG_DIR}:/uta-load/logs
     network_mode: host
   mito-extract:
-    image: uta-load
+    image: uta-update
     command: sbin/ncbi_process_mito.py NC_012920.1 --output-dir /mito-extract/work | tee /mito-extract/logs/mito.log
     volumes:
       - .:/opt/repos/uta
diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/uta-load
similarity index 97%
rename from etc/scripts/run-uta-build.sh
rename to etc/scripts/uta-load
index b541bb7..ebc3e36 100755
--- a/etc/scripts/run-uta-build.sh
+++ b/etc/scripts/uta-load
@@ -1,5 +1,6 @@
 #!/usr/bin/env bash
 
+# This script updates UTA and SeqRepo using NCBI files.
 # source_uta_v is the UTA version before the update.
 # ncbi_dir is where the script looks for NCBI data files.
 # working_dir stores intermediate data files and the final database dump.
diff --git a/sbin/uta-load b/sbin/uta-load
index 6de36ab..340a82f 100755
--- a/sbin/uta-load
+++ b/sbin/uta-load
@@ -1,9 +1,5 @@
 #!/usr/bin/env bash
 
-# This script runs the UTA update procedure.
-# It expects to be run from the root of the uta repository.
-# It updates the specified UTA and SeqRepo using the given NCBI files.
-# The UTA update is provided as a postgres database dump, and SeqRepo is updated in place.
 
 set -e
 
@@ -19,10 +15,7 @@ then
     echo 'Usage: sbin/uta-load <ncbi_file_dir> <seqrepo_dir> <uta_version> <output_dir> <log_dir>'
     exit 1
 else
-    echo "Updating UTA and SeqRepo using files in $NCBI_DIR and SeqRepo data in $SEQREPO_DIR"
-    echo "Starting from UTA version $UTA_VERSION"
-    echo "Final dump and any intermediate files will be available in $WORKING_DIR"
-    echo "Logs will be available in $LOG_DIR"
+
 fi
 
 # Ensure directories are compatible with docker volume usage

From edd2a094c71651299bb25c3e8f0e786d223a64d7 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 12:05:00 -0700
Subject: [PATCH 22/36] delete docker wrapper script for uta-load

---
 sbin/uta-load | 72 ---------------------------------------------------
 1 file changed, 72 deletions(-)
 delete mode 100755 sbin/uta-load

diff --git a/sbin/uta-load b/sbin/uta-load
deleted file mode 100755
index 340a82f..0000000
--- a/sbin/uta-load
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/env bash
-
-
-set -e
-
-# export environment variables for docker compose file
-export NCBI_DIR=$1
-export SEQREPO_DIR=$2
-export UTA_VERSION=$3
-export WORKING_DIR=$4
-export LOG_DIR=$4
-
-if [ -z "$NCBI_DIR" ] || [ -z "$SEQREPO_DIR" ] || [ -z "$UTA_VERSION" ] || [ -z "$WORKING_DIR" ] || [ -z "$LOG_DIR" ]
-then
-    echo 'Usage: sbin/uta-load <ncbi_file_dir> <seqrepo_dir> <uta_version> <output_dir> <log_dir>'
-    exit 1
-else
-
-fi
-
-# Ensure directories are compatible with docker volume usage
-if [[ $NCBI_DIR != /* ]] && [[ $NCBI_DIR != .* ]]
-then
-    echo 'NCBI file directory must start with / or .'
-    exit 1
-fi
-
-if [[ $SEQREPO_DIR != /* ]] && [[ $SEQREPO_DIR != .* ]]
-then
-    echo 'SeqRepo data directory must start with / or .'
-    exit 1
-fi
-
-if [[ $WORKING_DIR != /* ]] && [[ $WORKING_DIR != .* ]]
-then
-    echo 'Output directory must start with / or .'
-    exit 1
-fi
-
-if [[ $LOG_DIR != /* ]] && [[ $LOG_DIR != .* ]]
-then
-    echo 'Log directory must start with / or .'
-    exit 1
-fi
-
-# Ensure directories exist.
-if [ ! -d "$NCBI_DIR" ]; then
-    echo "Directory $NCBI_DIR does not exist."
-    exit 1
-fi
-
-if [ ! -d "$SEQREPO_DIR" ]; then
-    echo "Directory $SEQREPO_DIR does not exist."
-    exit 1
-fi
-
-if [ ! -d "$WORKING_DIR" ]; then
-    echo "Directory $WORKING_DIR does not exist."
-    exit 1
-fi
-
-if [ ! -d "$LOG_DIR" ]; then
-    echo "Directory $LOG_DIR does not exist."
-    exit 1
-fi
-
-# Build the UTA image.
-docker build --target uta -t uta-load .
-
-# Bring up a UTA database and run the UTA load procedure.
-# docker compose doesn't respect the container name specified in the compose file, so container name is specified here
-docker compose run --rm --name uta-load uta-load

From 844b1276432f51dedaaa8d92c6adc000f20eaead Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 12:14:31 -0700
Subject: [PATCH 23/36] remove docker wrapper script for sr download

---
 README.md             | 18 +++++++++++---
 docker-compose.yml    |  6 ++---
 sbin/seqrepo-download | 56 -------------------------------------------
 3 files changed, 18 insertions(+), 62 deletions(-)
 delete mode 100755 sbin/seqrepo-download

diff --git a/README.md b/README.md
index f5a28d7..6ae9e75 100644
--- a/README.md
+++ b/README.md
@@ -306,9 +306,10 @@ mkdir -p $(pwd)/output/logs
 
 Set variables:
 ```
+export UTA_ETL_OLD_SEQREPO_VERSION=2024-02-20
+export UTA_ETL_OLD_UTA_VERSION=uta_20210129b
 export UTA_ETL_NCBI_DIR=./ncbi-data
 export UTA_ETL_SEQREPO_DIR=./seqrepo-data
-export UTA_ETL_UTA_VERSION=uta_20210129b
 export UTA_ETL_WORK_DIR=./output/artifacts
 export UTA_ETL_LOG_DIR=./output/logs
 ```
@@ -323,10 +324,21 @@ so you may want to specify it with `--name=<container_name>`
 
 ### 1. Download SeqRepo data
 ```
-tbd
-sbin/seqrepo-download 2024-02-20 $(pwd)/seqrepo-data
+docker pull biocommons/seqrepo:$UTA_ETL_OLD_SEQREPO_VERSION
+
+# download seqrepo. can skip if container already exists.
+docker run --name seqrepo biocommons/seqrepo:$UTA_ETL_OLD_SEQREPO_VERSION
+
+# copy seqrepo data into a local directory
+docker run -v $UTA_ETL_SEQREPO_DIR:/output-dir --volumes-from seqrepo ubuntu bash -c 'cp -R /usr/local/share/seqrepo/* /output-dir'
+
+# allow seqrepo to be modified
+docker run -it -v $UTA_ETL_SEQREPO_DIR:/output-dir ubuntu bash -c 'chmod -R +w /output-dir'
 ```
 
+Note: pulling data takes ~30 minutes and requires ~13 GB.
+Note: a container called seqrepo will be left behind.
+
 ### 2. Extract and transform data from NCBI
 
 Download files from NCBI, and extract into intermediate files.
diff --git a/docker-compose.yml b/docker-compose.yml
index bdfc033..55ea6fd 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -34,17 +34,17 @@ services:
     network_mode: host
   uta:
     container_name: uta
-    image: biocommons/uta:${UTA_ETL_UTA_VERSION}
+    image: biocommons/uta:${UTA_ETL_OLD_UTA_VERSION}
     environment:
       - POSTGRES_HOST_AUTH_METHOD=trust
     healthcheck:
-      test: psql -h localhost -U anonymous -d uta -c "select * from ${UTA_ETL_UTA_VERSION}.meta"
+      test: psql -h localhost -U anonymous -d uta -c "select * from ${UTA_ETL_OLD_UTA_VERSION}.meta"
       interval: 10s
       retries: 60
     network_mode: host
   uta-load:
     image: uta-update
-    command: etc/scripts/uta-load ${UTA_ETL_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs
+    command: etc/scripts/uta-load ${UTA_ETL_OLD_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs
     depends_on:
       uta:
         condition: service_healthy
diff --git a/sbin/seqrepo-download b/sbin/seqrepo-download
deleted file mode 100755
index 5c581b0..0000000
--- a/sbin/seqrepo-download
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/usr/bin/env bash
-
-# This script downloads SeqRepo into the given directory.
-# Note: pulling data takes ~30 minutes and requires ~13 GB.
-# Note: a container called seqrepo will be left behind.
-# The name of the container can be changed by providing a third argument.
-
-set -e
-
-SEQREPO_VERSION=$1
-OUTPUT_DIR=$2
-# optional:
-SEQREPO_CONTAINER_NAME=$3
-
-if [ -z "$SEQREPO_VERSION" ] || [ -z "$OUTPUT_DIR" ]
-then
-    echo 'Usage: sbin/seqrepo-download <seqrepo_version> <output_dir>'
-    exit 1
-else
-    echo "SeqRepo data for version $SEQREPO_VERSION will be available in $OUTPUT_DIR"
-fi
-
-# Name of seqrepo container
-if [ -z "$SEQREPO_CONTAINER_NAME" ]
-then
-    SEQREPO_CONTAINER_NAME=seqrepo
-fi
-
-if [[ $OUTPUT_DIR != /* ]]
-then
-    echo 'Output directory must be an absolute path'
-    exit 1
-fi
-
-if [ ! -d "$OUTPUT_DIR" ]; then
-    echo "Directory $OUTPUT_DIR does not exist."
-    exit 1
-fi
-
-# Pull seqrepo image
-docker pull biocommons/seqrepo:$SEQREPO_VERSION
-
-# Download seqrepo data using seqrepo image
-if docker ps -aq -f name=$SEQREPO_CONTAINER_NAME
-then
-    echo "Container called $SEQREPO_CONTAINER_NAME already exists. Skipping seqrepo data download."
-else
-    docker run --name seqrepo biocommons/seqrepo:$SEQREPO_VERSION
-fi
-
-# Copy seqrepo data into a local directory
-echo "Copying seqrepo data into $OUTPUT_DIR ..."
-docker run -it -v $OUTPUT_DIR:/output-dir --volumes-from $SEQREPO_CONTAINER_NAME ubuntu bash -c 'cp -R /usr/local/share/seqrepo/* /output-dir'
-
-# Allow seqrepo to be modified
-docker run -it -v $OUTPUT_DIR:/output-dir ubuntu bash -c 'chmod -R +w /output-dir'

From d63f20f0f7ac3dca56e7932f4ef6cb6de82ad350 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 12:19:23 -0700
Subject: [PATCH 24/36] remove current dir mount

---
 README.md          | 3 ---
 docker-compose.yml | 4 ----
 2 files changed, 7 deletions(-)

diff --git a/README.md b/README.md
index 6ae9e75..c15aa48 100644
--- a/README.md
+++ b/README.md
@@ -319,9 +319,6 @@ Build the UTA image:
 docker build --target uta -t uta-update .
 ```
 
-Note: docker compose does not respect the container name specified in the compose file,
-so you may want to specify it with `--name=<container_name>`
-
 ### 1. Download SeqRepo data
 ```
 docker pull biocommons/seqrepo:$UTA_ETL_OLD_SEQREPO_VERSION
diff --git a/docker-compose.yml b/docker-compose.yml
index 55ea6fd..404fe09 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -15,7 +15,6 @@ services:
     image: uta-update
     command: sbin/uta-extract /ncbi-dir /uta-extract/work /uta-extract/logs
     volumes:
-      - .:/opt/repos/uta
       - ${UTA_ETL_NCBI_DIR}:/ncbi-dir
       - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo
       - ${UTA_ETL_WORK_DIR}:/uta-extract/work
@@ -26,7 +25,6 @@ services:
     image: uta-update
     command: sbin/seqrepo-load /usr/local/share/seqrepo 2024-02-20 /seqrepo-load/work /seqrepo-load/logs
     volumes:
-      - .:/opt/repos/uta
       - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo
       - ${UTA_ETL_WORK_DIR}:/seqrepo-load/work
       - ${UTA_ETL_LOG_DIR}:/seqrepo-load/logs
@@ -49,7 +47,6 @@ services:
       uta:
         condition: service_healthy
     volumes:
-      - .:/opt/repos/uta
       - ${UTA_ETL_NCBI_DIR}:/ncbi-dir
       - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo
       - ${UTA_ETL_WORK_DIR}:/uta-load/work
@@ -59,7 +56,6 @@ services:
     image: uta-update
     command: sbin/ncbi_process_mito.py NC_012920.1 --output-dir /mito-extract/work | tee /mito-extract/logs/mito.log
     volumes:
-      - .:/opt/repos/uta
       - ${UTA_ETL_WORK_DIR}:/mito-extract/work
       - ${UTA_ETL_LOG_DIR}:/mito-extract/logs
     working_dir: /opt/repos/uta

From f096f4204a329b81762189097ddbdf55854cc6bf Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 12:23:52 -0700
Subject: [PATCH 25/36] clean up readme

---
 README.md | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index c15aa48..4721aac 100644
--- a/README.md
+++ b/README.md
@@ -359,11 +359,8 @@ docker compose run seqrepo-load
 ```
 
 ### 4. Load data into UTA
-
-Bring up a UTA database and run the UTA load procedure.
-
-UTA is updated and the database is dumped into a pgd file. SeqRepo is updated in place.
-
 ```
 docker compose run uta-load
 ```
+
+UTA has updated and the database has been dumped into a pgd file in `UTA_ETL_WORK_DIR`. SeqRepo has been updated in place.

From 66ac44f12f13a2e796fc0a5ee8e568a4ee4b02c7 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 12:45:59 -0700
Subject: [PATCH 26/36] skip gene load for mito

---
 README.md            | 14 ++++----------
 docker-compose.yml   |  2 +-
 etc/scripts/uta-load | 12 +++++++++---
 sbin/uta-extract     |  1 +
 4 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 4721aac..6f8d4ac 100644
--- a/README.md
+++ b/README.md
@@ -338,29 +338,23 @@ Note: a container called seqrepo will be left behind.
 
 ### 2. Extract and transform data from NCBI
 
-Download files from NCBI, and extract into intermediate files.
+Download files from NCBI, extract into intermediate files, and load into UTA and SeqRepo.
 
 See 2A for nuclear transcripts and 2B for mitochondrial transcripts.
 
 #### 2A. Nuclear transcripts
 ```
 docker compose run ncbi-download
+docker compose run seqrepo-load
 docker compose run uta-extract
+docker compose run uta-load
 ```
 
 #### 2B. Mitochondrial transcripts
 ```
 docker compose run mito-extract
-```
-
-### 3. Load data into SeqRepo
-```
 docker compose run seqrepo-load
-```
-
-### 4. Load data into UTA
-```
-docker compose run uta-load
+UTA_ETL_SKIP_GENE_LOAD=true docker compose run uta-load
 ```
 
 UTA has updated and the database has been dumped into a pgd file in `UTA_ETL_WORK_DIR`. SeqRepo has been updated in place.
diff --git a/docker-compose.yml b/docker-compose.yml
index 404fe09..3e988fa 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -42,7 +42,7 @@ services:
     network_mode: host
   uta-load:
     image: uta-update
-    command: etc/scripts/uta-load ${UTA_ETL_OLD_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs
+    command: etc/scripts/uta-load ${UTA_ETL_OLD_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs ${UTA_ETL_SKIP_GENE_LOAD}
     depends_on:
       uta:
         condition: service_healthy
diff --git a/etc/scripts/uta-load b/etc/scripts/uta-load
index ebc3e36..10844f2 100755
--- a/etc/scripts/uta-load
+++ b/etc/scripts/uta-load
@@ -5,6 +5,7 @@
 # ncbi_dir is where the script looks for NCBI data files.
 # working_dir stores intermediate data files and the final database dump.
 # log_dir stores log files.
+# skip_load_genes, if truthy, will skip the gene loading step
 
 # Note that the uta loading code uses the seqrepo location defined in the conf files, under [sequences].seqrepo.
 
@@ -14,6 +15,8 @@ source_uta_v=$1
 ncbi_dir=$2
 working_dir=$3
 log_dir=$4
+# optionally skip loading geneinfo
+skip_load_genes=$5
 
 if [ -z "$source_uta_v" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ]
 then
@@ -33,9 +36,12 @@ etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v"
 sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc-ac.gz" 2>&1 | \
     tee "$log_dir/assoc-acs-merge.log"
 
-# # Load genes into gene table.
-# uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \
-#     tee "$log_dir/load-geneinfo.log"
+# Load genes into gene table.
+if [ -z "$skip_load_genes" ]
+then
+    uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \
+        tee "$log_dir/load-geneinfo.log"
+fi
 
 # Load accessions into associated_accessions table.
 uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$working_dir/assoc-ac.gz" 2>&1 | \
diff --git a/sbin/uta-extract b/sbin/uta-extract
index 432e8a6..e708e46 100755
--- a/sbin/uta-extract
+++ b/sbin/uta-extract
@@ -30,6 +30,7 @@ GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_as
 sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$working_dir/unfiltered_exonsets.gz" 2>&1 | \
     tee "$logs_dir/ncbi-parse-genomic-gff.log"
 
+# filter transcripts
 sbin/filter_exonset_transcripts.py --tx-info "$working_dir/gbff.txinfo.gz" --exonsets "$working_dir/unfiltered_exonsets.gz" \
     --missing-ids "$working_dir/filtered_tx_acs.txt" | gzip -c > "$working_dir/exonsets.gz" 2>&1 | \
     tee "$logs_dir/filter_exonset_transcripts.log"

From 49a2ce8a4ffa9b1c5dfa2094689a848024b5bece Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 12:51:12 -0700
Subject: [PATCH 27/36] set -e on uta-extract

---
 sbin/uta-extract | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sbin/uta-extract b/sbin/uta-extract
index e708e46..0583b9a 100755
--- a/sbin/uta-extract
+++ b/sbin/uta-extract
@@ -2,6 +2,8 @@
 
 # Extract data from NCBI files into intermediate files.
 
+set -e
+
 ncbi_dir=$1
 working_dir=$2
 logs_dir=$3

From f6e9d9d38dd467cc2f8c4dccf3958ba4049aefdd Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 12:57:43 -0700
Subject: [PATCH 28/36] fix naming

---
 sbin/uta-extract | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sbin/uta-extract b/sbin/uta-extract
index 0583b9a..84376c3 100755
--- a/sbin/uta-extract
+++ b/sbin/uta-extract
@@ -33,7 +33,7 @@ sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$working_dir/unfiltered
     tee "$logs_dir/ncbi-parse-genomic-gff.log"
 
 # filter transcripts
-sbin/filter_exonset_transcripts.py --tx-info "$working_dir/gbff.txinfo.gz" --exonsets "$working_dir/unfiltered_exonsets.gz" \
+sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets "$working_dir/unfiltered_exonsets.gz" \
     --missing-ids "$working_dir/filtered_tx_acs.txt" | gzip -c > "$working_dir/exonsets.gz" 2>&1 | \
     tee "$logs_dir/filter_exonset_transcripts.log"
 

From 3780859ad9030494a7402904b7a686184f34a971 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 13:01:01 -0700
Subject: [PATCH 29/36] move step that requires seqrepo out of extract script,
 so that seqrepo step can come after extract

---
 README.md            |  2 +-
 etc/scripts/uta-load |  4 ++++
 sbin/uta-extract     | 20 ++++++++------------
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 6f8d4ac..c24ffdb 100644
--- a/README.md
+++ b/README.md
@@ -345,8 +345,8 @@ See 2A for nuclear transcripts and 2B for mitochondrial transcripts.
 #### 2A. Nuclear transcripts
 ```
 docker compose run ncbi-download
-docker compose run seqrepo-load
 docker compose run uta-extract
+docker compose run seqrepo-load
 docker compose run uta-load
 ```
 
diff --git a/etc/scripts/uta-load b/etc/scripts/uta-load
index 10844f2..1d8084a 100755
--- a/etc/scripts/uta-load
+++ b/etc/scripts/uta-load
@@ -32,6 +32,10 @@ mkdir -p "$log_dir"
 etc/scripts/delete-schema.sh "$loading_uta_v"
 etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v"
 
+# generate seqinfo files from exonsets (this step requires seqrepo)
+sbin/exonset-to-seqinfo -o NCBI "$working_dir/exonsets.gz" | gzip -c > "$working_dir/seqinfo.gz" 2>&1 | \
+    tee "$log_dir/exonset-to-seqinfo.log"
+
 # Filter out columns from assocacs file.
 sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc-ac.gz" 2>&1 | \
     tee "$log_dir/assoc-acs-merge.log"
diff --git a/sbin/uta-extract b/sbin/uta-extract
index 84376c3..7f2a8e9 100755
--- a/sbin/uta-extract
+++ b/sbin/uta-extract
@@ -6,40 +6,36 @@ set -e
 
 ncbi_dir=$1
 working_dir=$2
-logs_dir=$3
+log_dir=$3
 
-if [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$logs_dir" ]
+if [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ]
 then
-    echo 'Usage: sbin/uta-extract <ncbi_dir> <working_dir> <logs_dir>'
+    echo 'Usage: sbin/uta-extract <ncbi_dir> <working_dir> <log_dir>'
     exit 1
 fi
 
 # genes
 sbin/ncbi-parse-geneinfo $ncbi_dir/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz | \
-    gzip -c > "$working_dir/geneinfo.gz" 2>&1 | tee "$logs_dir/ncbi-parse-geneinfo.log"
+    gzip -c > "$working_dir/geneinfo.gz" 2>&1 | tee "$log_dir/ncbi-parse-geneinfo.log"
 
 # transcript protein associations
 sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2accession.gz | gzip -c > "$working_dir/assocacs.gz" 2>&1 | \
-    tee "$logs_dir/ncbi-fetch-assoc-acs.log"
+    tee "$log_dir/ncbi-fetch-assoc-acs.log"
 
 # parse transcript info from GBFF input files
 GBFF_files=$(ls $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz)
 sbin/ncbi-parse-gbff "$GBFF_files" | gzip -c > "$working_dir/txinfo.gz" 2>&1 | \
-    tee "$logs_dir/ncbi-parse-gbff.log"
+    tee "$log_dir/ncbi-parse-gbff.log"
 
 # parse alignments from GFF input files
 GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.gff.gz)
 sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$working_dir/unfiltered_exonsets.gz" 2>&1 | \
-    tee "$logs_dir/ncbi-parse-genomic-gff.log"
+    tee "$log_dir/ncbi-parse-genomic-gff.log"
 
 # filter transcripts
 sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets "$working_dir/unfiltered_exonsets.gz" \
     --missing-ids "$working_dir/filtered_tx_acs.txt" | gzip -c > "$working_dir/exonsets.gz" 2>&1 | \
-    tee "$logs_dir/filter_exonset_transcripts.log"
-
-# generate seqinfo files from exonsets (this step requires seqrepo)
-sbin/exonset-to-seqinfo -o NCBI "$working_dir/exonsets.gz" | gzip -c > "$working_dir/seqinfo.gz" 2>&1 | \
-    tee "$logs_dir/exonset-to-seqinfo.log"
+    tee "$log_dir/filter_exonset_transcripts.log"
 
 # move fasta files into same dir
 cp $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz $working_dir/

From e4bebf5c28a0a162c4b1a004647395c5c23bb2a3 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 13:01:56 -0700
Subject: [PATCH 30/36] move uta-load script into sbin

---
 docker-compose.yml             | 2 +-
 {etc/scripts => sbin}/uta-load | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename {etc/scripts => sbin}/uta-load (100%)

diff --git a/docker-compose.yml b/docker-compose.yml
index 3e988fa..1247c09 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -42,7 +42,7 @@ services:
     network_mode: host
   uta-load:
     image: uta-update
-    command: etc/scripts/uta-load ${UTA_ETL_OLD_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs ${UTA_ETL_SKIP_GENE_LOAD}
+    command: sbin/uta-load ${UTA_ETL_OLD_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs ${UTA_ETL_SKIP_GENE_LOAD}
     depends_on:
       uta:
         condition: service_healthy
diff --git a/etc/scripts/uta-load b/sbin/uta-load
similarity index 100%
rename from etc/scripts/uta-load
rename to sbin/uta-load

From 50dbeb806993a1d3246f78d8144eda462bc0b609 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 13:42:11 -0700
Subject: [PATCH 31/36] always set skip_load_genes

---
 README.md     | 2 +-
 sbin/uta-load | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index c24ffdb..11517d0 100644
--- a/README.md
+++ b/README.md
@@ -347,7 +347,7 @@ See 2A for nuclear transcripts and 2B for mitochondrial transcripts.
 docker compose run ncbi-download
 docker compose run uta-extract
 docker compose run seqrepo-load
-docker compose run uta-load
+UTA_ETL_SKIP_GENE_LOAD=false docker compose run uta-load
 ```
 
 #### 2B. Mitochondrial transcripts
diff --git a/sbin/uta-load b/sbin/uta-load
index 1d8084a..48d7219 100755
--- a/sbin/uta-load
+++ b/sbin/uta-load
@@ -15,7 +15,6 @@ source_uta_v=$1
 ncbi_dir=$2
 working_dir=$3
 log_dir=$4
-# optionally skip loading geneinfo
 skip_load_genes=$5
 
 if [ -z "$source_uta_v" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ]
@@ -41,7 +40,7 @@ sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc-
     tee "$log_dir/assoc-acs-merge.log"
 
 # Load genes into gene table.
-if [ -z "$skip_load_genes" ]
+if [ "$skip_load_genes" = "true" ]
 then
     uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \
         tee "$log_dir/load-geneinfo.log"

From b7e0f4279ca9d7bf85435b476e28c5447a3f73d2 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 13:54:16 -0700
Subject: [PATCH 32/36] restore missed changes from alembic pr

---
 sbin/uta-load | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sbin/uta-load b/sbin/uta-load
index 48d7219..62ea293 100755
--- a/sbin/uta-load
+++ b/sbin/uta-load
@@ -24,13 +24,17 @@ then
 fi
 
 # set local variables and create working directories
-loading_uta_v="uta_1_1"
+loading_uta_v="uta"
 mkdir -p "$log_dir"
 
 ## Drop loading schema, and recreate
 etc/scripts/delete-schema.sh "$loading_uta_v"
 etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v"
 
+## for now set up Alembic for schema migrations
+alembic -c etc/alembic.ini stamp edadb97f6502
+alembic -c etc/alembic.ini upgrade head
+
 # generate seqinfo files from exonsets (this step requires seqrepo)
 sbin/exonset-to-seqinfo -o NCBI "$working_dir/exonsets.gz" | gzip -c > "$working_dir/seqinfo.gz" 2>&1 | \
     tee "$log_dir/exonset-to-seqinfo.log"

From 608deb97b2decc91323e6ea7914840522e219f9a Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 14:18:50 -0700
Subject: [PATCH 33/36] consistent naming of log_dir var

---
 sbin/seqrepo-load | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sbin/seqrepo-load b/sbin/seqrepo-load
index 3f05842..93942cf 100755
--- a/sbin/seqrepo-load
+++ b/sbin/seqrepo-load
@@ -5,11 +5,11 @@ set -e
 seqrepo_root=$1
 seqrepo_version=$2
 sequence_dir=$3
-logs_dir=$4
+log_dir=$4
 
-if [ -z "$seqrepo_root" ] || [ -z "$seqrepo_version" ] || [ -z "$sequence_dir" ] || [ -z "$logs_dir" ]
+if [ -z "$seqrepo_root" ] || [ -z "$seqrepo_version" ] || [ -z "$sequence_dir" ] || [ -z "$log_dir" ]
 then
-    echo 'Usage: sbin/seqrepo-load <seqrepo_root> <seqrepo_version> <sequence_dir> <logs_dir>'
+    echo 'Usage: sbin/seqrepo-load <seqrepo_root> <seqrepo_version> <sequence_dir> <log_dir>'
     exit 1
 fi
 
@@ -18,4 +18,4 @@ seqrepo --root-directory "$seqrepo_root" \
     load -n NCBI --instance-name "$seqrepo_version" \
     $sequence_dir/*.fna.gz \
     $sequence_dir/*.faa.gz 2>& 1 | \
-    tee "$logs_dir/seqrepo-load.log"
+    tee "$log_dir/seqrepo-load.log"

From e32add99f4e7e9c5dc1ed79a70cd231cd73661fb Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 14:22:40 -0700
Subject: [PATCH 34/36] invert condition

---
 sbin/uta-load | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sbin/uta-load b/sbin/uta-load
index 62ea293..c3da837 100755
--- a/sbin/uta-load
+++ b/sbin/uta-load
@@ -46,6 +46,8 @@ sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc-
 # Load genes into gene table.
 if [ "$skip_load_genes" = "true" ]
 then
+    echo "Skipping load-geneinfo"
+else
     uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \
         tee "$log_dir/load-geneinfo.log"
 fi

From 10f4e0efd4e03912acf176749942cb58a6d52757 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Fri, 5 Apr 2024 14:34:24 -0700
Subject: [PATCH 35/36] fix tests for mito strand change

---
 tests/test_ncbi_process_mito.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_ncbi_process_mito.py b/tests/test_ncbi_process_mito.py
index d0a1982..8beb535 100644
--- a/tests/test_ncbi_process_mito.py
+++ b/tests/test_ncbi_process_mito.py
@@ -193,7 +193,7 @@ def test_get_mito_genes(self):
             "alt_ac": "NC_012920.1",
             "alt_start": 1601,
             "alt_end": 1670,
-            "strand": "+",
+            "strand": 1,
             "transl_table": None,
             "transl_except": None,
             "pro_ac": None,
@@ -212,7 +212,7 @@ def test_get_mito_genes(self):
             "alt_ac": "NC_012920.1",
             "alt_start": 4328,
             "alt_end": 4400,
-            "strand": "-",
+            "strand": -1,
             "transl_table": None,
             "transl_except": None,
             "pro_ac": None,
@@ -237,7 +237,7 @@ def test_get_mito_genes(self):
             "alt_ac": "NC_012920.1",
             "alt_start": 7585,
             "alt_end": 8269,
-            "strand": "+",
+            "strand": 1,
             "transl_table": "2",
             "transl_except": None,
             "pro_ac": "YP_003024029.1",
@@ -267,7 +267,7 @@ def test_get_mito_genes(self):
             "alt_ac": "NC_012920.1",
             "alt_start": 3306,
             "alt_end": 4262,
-            "strand": "+",
+            "strand": 1,
             "transl_table": "2",
             "transl_except": "(pos:4261..4262,aa:TERM)",
             "pro_ac": "YP_003024026.1",

From 858b143e0f3586002b3dbb6dc86b93e9ad7c77a8 Mon Sep 17 00:00:00 2001
From: NVTA <162694616+nvta1209@users.noreply.github.com>
Date: Mon, 8 Apr 2024 23:33:20 -0700
Subject: [PATCH 36/36] hard links

---
 sbin/uta-extract | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sbin/uta-extract b/sbin/uta-extract
index 7f2a8e9..2a60f3f 100755
--- a/sbin/uta-extract
+++ b/sbin/uta-extract
@@ -38,6 +38,6 @@ sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets
     tee "$log_dir/filter_exonset_transcripts.log"
 
 # move fasta files into same dir
-cp $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz $working_dir/
-cp $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz $working_dir/
-cp $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz $working_dir/
+ln $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz $working_dir/
+ln $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz $working_dir/
+ln $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz $working_dir/