From 33638c4cedbed6c4c035117e46c65cb5aa771303 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Wed, 3 Apr 2024 21:20:10 -0700 Subject: [PATCH 01/36] add shebang and comment to mito script, and make it executable --- sbin/ncbi_process_mito.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) mode change 100644 => 100755 sbin/ncbi_process_mito.py diff --git a/sbin/ncbi_process_mito.py b/sbin/ncbi_process_mito.py old mode 100644 new mode 100755 index 2afe071..d51b207 --- a/sbin/ncbi_process_mito.py +++ b/sbin/ncbi_process_mito.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + """ Download mito fasta and gbff file. Use BioPython to parse the features in the Mitochondrial genbank file to get the attributes of a region of the genome that correspond to genes along with their attributes. Output gene/tx/alignment @@ -166,6 +168,7 @@ def parse_nomenclature_value(gb_feature: SeqFeature) -> Dict[str, str]: def get_mito_genes(gbff_filepath: str): logger.info(f"processing NCBI GBFF file from {gbff_filepath}") with open(gbff_filepath) as fh: + # Bio.SeqIO.parse(fh, "gb") returns an empty iterator for .fna files and does not fail for record in Bio.SeqIO.parse(fh, "gb"): for feature in record.features: xrefs = parse_db_xrefs(feature) @@ -331,5 +334,4 @@ def main(ncbi_accession: str, output_dir: str): if __name__ == "__main__": args = parse_args() - main(args.accession, args.output_dir) From 59323c7044b219fe701500fff7d2b2beaa08d06c Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Thu, 4 Apr 2024 17:50:41 -0700 Subject: [PATCH 02/36] move ncbi parsing scripts into uta-extract --- README.md | 32 +++++++++++++++-- etc/scripts/run-uta-build.sh | 69 +++++++++++++----------------------- sbin/uta-extract | 35 ++++++++++++++++++ 3 files changed, 90 insertions(+), 46 deletions(-) create mode 100755 sbin/uta-extract diff --git a/README.md b/README.md index 78da019..d490965 100644 --- a/README.md +++ b/README.md @@ -289,7 +289,7 @@ To develop UTA, follow these steps. 4. Testing $ docker build --target uta-test -t uta-test . - $ docker run -it --rm uta-test python -m unittest + $ docker run --rm uta-test python -m unittest ## UTA update procedure @@ -334,7 +334,35 @@ Example: sbin/seqrepo-download 2024-02-20 $(pwd)/seqrepo-data ``` -### 3. Update UTA and SeqRepo +### 3. Extract data from NCBI files into intermediate files + +See 3A for nuclear transcript updates and 3B for mitochondrial transcript updates. + +In either case, first create directories: +``` +mkdir -p $(pwd)/uta-build/loading +mkdir -p $(pwd)/uta-build/logs +``` + +#### 3A. Nuclear transcripts + +Run `sbin/uta-extract`. Requires bash and docker. + +Example: +``` +sbin/uta-extract $(pwd)/ncbi-data $(pwd)/uta-build/loading $(pwd)/uta-build/logs +``` + +#### 3B. Mitochondrial transcripts + +Run `sbin/ncbi_process_mito.py`. Requires bash and docker. + +Example: +``` +sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/uta-build/loading | tee $(pwd)/uta-build/logs/mito.log +``` + +### 4. Update UTA and SeqRepo Run `sbin/uta-update`. Requires bash and docker. diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh index 166c6cb..9160cc2 100755 --- a/etc/scripts/run-uta-build.sh +++ b/etc/scripts/run-uta-build.sh @@ -38,50 +38,31 @@ seqrepo load -n NCBI -i "$seqrepo_data_release" \ $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz 2>& 1 | \ tee "$logs_dir/seqrepo-load.log" -### extract meta data -# genes -sbin/ncbi-parse-geneinfo $ncbi_dir/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz | \ - gzip -c > "$loading_dir/genes.geneinfo.gz" 2>&1 | tee "$logs_dir/ncbi-parse-geneinfo.log" - -# transcript protein associations -sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2accession.gz | gzip -c > "$loading_dir/assocacs.gz" 2>&1 | \ - tee "$logs_dir/ncbi-fetch-assoc-acs" - -sbin/assoc-acs-merge "$loading_dir/assocacs.gz" | gzip -c > "$loading_dir/assocacs.cleaned.gz" 2>&1 | \ - tee "$logs_dir/assoc-acs-merge" - -# parse transcript info from GBFF input files -GBFF_files=$(ls $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz) -sbin/ncbi-parse-gbff "$GBFF_files" | gzip -c > "$loading_dir/gbff.txinfo.gz" 2>&1 | \ - tee "$logs_dir/ncbi-parse-gbff.log" - -# parse alignments from GFF input files -GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.gff.gz) -sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$loading_dir/gff.exonsets.gz" 2>&1 | \ - tee "$logs_dir/ncbi-parse-genomic-gff.log" - -# generate seqinfo files from exonsets -sbin/exonset-to-seqinfo -o NCBI "$loading_dir/gff.exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \ - tee "$logs_dir/exonset-to-seqinfo.log" - -### update the uta database -# genes -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$loading_dir/genes.geneinfo.gz" 2>&1 | \ - tee "$logs_dir/load-geneinfo.log" - -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$loading_dir/assocacs.cleaned.gz" 2>&1 | \ - tee "$logs_dir/load-assoc-ac.log" - -# transcript info -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$loading_dir/gbff.txinfo.gz" 2>&1 | \ - tee "$logs_dir/load-txinfo.log" - -# gff exon sets -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$loading_dir/gff.exonsets.gz" 2>&1 | \ - tee "$logs_dir/load-exonsets.log" - -# align exons -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 | tee "$logs_dir/align-exons.log" +# Filter out columns from assocacs file. +sbin/assoc-acs-merge "$loading_dir/assocacs.gz" | gzip -c > "$loading_dir/assoc-ac.gz" 2>&1 | \ + tee "$logs_dir/assoc-acs-merge.log" + +# Load genes into gene table. +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$loading_dir/geneinfo.gz" 2>&1 | \ + tee "$logs_dir/load-geneinfo.log" + +# Load accessions into associated_accessions table. +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$loading_dir/assoc-ac.gz" 2>&1 | \ + tee "$logs_dir/load-assoc-ac.log" + +# Load transcript info into transcript and exon_set tables. +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$loading_dir/txinfo.gz" 2>&1 | \ + tee "$logs_dir/load-txinfo.log" + +# Load exon sets into into exon_set and exon tables. +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$loading_dir/exonsets.gz" 2>&1 | \ + tee "$logs_dir/load-exonsets.log" + +# Create cigar strings for all rows in tx_alt_exon_pairs_v view and update exon_aln table. +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 | \ + tee "$logs_dir/align-exons.log" + +# Load seqinfo? ### run diff sbin/uta-diff "$source_uta_v" "$loading_uta_v" diff --git a/sbin/uta-extract b/sbin/uta-extract new file mode 100755 index 0000000..6864de4 --- /dev/null +++ b/sbin/uta-extract @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +# Extract data from NCBI files into intermediate files. + +ncbi_dir=$1 +loading_dir=$2 +logs_dir=$3 + +if [ -z "$ncbi_dir" ] || [ -z "$loading_dir" ] || [ -z "$logs_dir" ] +then + echo 'Usage: sbin/uta-extract ' + exit 1 +fi + +# genes +sbin/ncbi-parse-geneinfo $ncbi_dir/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz | \ + gzip -c > "$loading_dir/geneinfo.gz" 2>&1 | tee "$logs_dir/ncbi-parse-geneinfo.log" + +# transcript protein associations +sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2accession.gz | gzip -c > "$loading_dir/assocacs.gz" 2>&1 | \ + tee "$logs_dir/ncbi-fetch-assoc-acs.log" + +# parse transcript info from GBFF input files +GBFF_files=$(ls $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz) +sbin/ncbi-parse-gbff "$GBFF_files" | gzip -c > "$loading_dir/txinfo.gz" 2>&1 | \ + tee "$logs_dir/ncbi-parse-gbff.log" + +# parse alignments from GFF input files +GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.gff.gz) +sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$loading_dir/exonsets.gz" 2>&1 | \ + tee "$logs_dir/ncbi-parse-genomic-gff.log" + +# generate seqinfo files from exonsets +sbin/exonset-to-seqinfo -o NCBI "$loading_dir/exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \ + tee "$logs_dir/exonset-to-seqinfo.log" From 6297bac365bd95499ed4fa65b8ae32609750c8b4 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Thu, 4 Apr 2024 18:09:55 -0700 Subject: [PATCH 03/36] produce gzip files from mito script --- sbin/ncbi_process_mito.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sbin/ncbi_process_mito.py b/sbin/ncbi_process_mito.py index d51b207..f8cbdc3 100755 --- a/sbin/ncbi_process_mito.py +++ b/sbin/ncbi_process_mito.py @@ -54,6 +54,7 @@ """ import argparse import dataclasses +import gzip import importlib_resources import logging import logging.config @@ -245,7 +246,7 @@ def main(ncbi_accession: str, output_dir: str): logger.info(f"found {len(mito_genes)} genes from parsing {input_files['gbff']}") # write gene accessions - with open(f"{output_dir}/{ncbi_accession}.assocacs", "w") as o_file: + with gzip.open(f"{output_dir}/assocacs.gz", "wt") as o_file: gaw = GeneAccessionsWriter(o_file) for mg in mito_genes: if mg.pro_ac is not None: @@ -256,7 +257,7 @@ def main(ncbi_accession: str, output_dir: str): ) # write sequence information - with open(f"{output_dir}/{ncbi_accession}.seqinfo", "w") as o_file: + with gzip.open(f"{output_dir}/seqinfo.gz", "wt") as o_file: siw = SeqInfoWriter(o_file) for mg in mito_genes: siw.write( @@ -303,7 +304,7 @@ def main(ncbi_accession: str, output_dir: str): o_file.write(record.format("fasta")) # write transcript information - with open(f"{output_dir}/{ncbi_accession}.txinfo", "w") as o_file: + with gzip.open(f"{output_dir}/txinfo.gz", "wt") as o_file: tiw = TxInfoWriter(o_file) for mg in mito_genes: tiw.write( @@ -318,7 +319,7 @@ def main(ncbi_accession: str, output_dir: str): ) # write exonset - with open(f"{output_dir}/{ncbi_accession}.exonset", "w") as o_file: + with gzip.open(f"{output_dir}/exonsets.gz", "wt") as o_file: esw = ExonSetWriter(o_file) for mg in mito_genes: esw.write( From aa97fe6ceb90e345241754106e63db198f51ebbc Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Thu, 4 Apr 2024 18:45:10 -0700 Subject: [PATCH 04/36] move seqrepo load into its own script --- README.md | 23 +++++++++++++++++------ etc/scripts/run-uta-build.sh | 7 ------- sbin/seqrepo-load | 22 ++++++++++++++++++++++ 3 files changed, 39 insertions(+), 13 deletions(-) create mode 100755 sbin/seqrepo-load diff --git a/README.md b/README.md index d490965..9e0bfe8 100644 --- a/README.md +++ b/README.md @@ -293,9 +293,11 @@ To develop UTA, follow these steps. ## UTA update procedure +Requires bash and docker. + ### 1. Download files from NCBI -Run `sbin/ncbi-download-docker`. Requires bash and docker. +Run `sbin/ncbi-download-docker`. Example: ``` @@ -327,7 +329,7 @@ The specified directory will have the following structure: ### 2. Download SeqRepo data -Run `sbin/seqrepo-download`. Requires bash and docker. +Run `sbin/seqrepo-download`. Example: ``` @@ -346,7 +348,7 @@ mkdir -p $(pwd)/uta-build/logs #### 3A. Nuclear transcripts -Run `sbin/uta-extract`. Requires bash and docker. +Run `sbin/uta-extract`. Example: ``` @@ -355,16 +357,25 @@ sbin/uta-extract $(pwd)/ncbi-data $(pwd)/uta-build/loading $(pwd)/uta-build/logs #### 3B. Mitochondrial transcripts -Run `sbin/ncbi_process_mito.py`. Requires bash and docker. +Run `sbin/ncbi_process_mito.py`. Example: ``` sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/uta-build/loading | tee $(pwd)/uta-build/logs/mito.log ``` -### 4. Update UTA and SeqRepo +### 4. Load data into SeqRepo + +Run `sbin/seqrepo-load`. + +Example: +``` +sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/logs +``` + +### 5. Load data into UTA -Run `sbin/uta-update`. Requires bash and docker. +Run `sbin/uta-update`. Example: ``` diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh index 9160cc2..ccba691 100755 --- a/etc/scripts/run-uta-build.sh +++ b/etc/scripts/run-uta-build.sh @@ -31,13 +31,6 @@ done etc/scripts/delete-schema.sh "$loading_uta_v" etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v" -## Load SeqRepo with new sequences -seqrepo load -n NCBI -i "$seqrepo_data_release" \ - $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz \ - $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz \ - $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz 2>& 1 | \ - tee "$logs_dir/seqrepo-load.log" - # Filter out columns from assocacs file. sbin/assoc-acs-merge "$loading_dir/assocacs.gz" | gzip -c > "$loading_dir/assoc-ac.gz" 2>&1 | \ tee "$logs_dir/assoc-acs-merge.log" diff --git a/sbin/seqrepo-load b/sbin/seqrepo-load new file mode 100755 index 0000000..9b830ce --- /dev/null +++ b/sbin/seqrepo-load @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +set -e + +seqrepo_root=$1 +seqrepo_version=$2 +ncbi_dir=$3 +logs_dir=$4 + +if [ -z "$seqrepo_root" ] || [ -z "$seqrepo_version" ] || [ -z "$ncbi_dir" ] || [ -z "$logs_dir" ] +then + echo 'Usage: sbin/seqrepo-load ' + exit 1 +fi + +## Load SeqRepo with new sequences +seqrepo --root-directory "$seqrepo_root" \ + load -n NCBI --instance-name "$seqrepo_version" \ + $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz \ + $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz \ + $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz 2>& 1 | \ + tee "$logs_dir/seqrepo-load.log" From 46d8ac3cda3fa73c5db8ed14ebe6e9b60d262369 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 00:31:55 -0700 Subject: [PATCH 05/36] copy fasta files into the loading dir --- README.md | 10 +++++----- sbin/ncbi_process_mito.py | 4 ++-- sbin/seqrepo-load | 12 ++++++------ sbin/uta-extract | 5 +++++ 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 9e0bfe8..1792c6b 100644 --- a/README.md +++ b/README.md @@ -342,8 +342,8 @@ See 3A for nuclear transcript updates and 3B for mitochondrial transcript update In either case, first create directories: ``` -mkdir -p $(pwd)/uta-build/loading -mkdir -p $(pwd)/uta-build/logs +mkdir -p $(pwd)/uta-build +mkdir -p $(pwd)/uta-logs ``` #### 3A. Nuclear transcripts @@ -352,7 +352,7 @@ Run `sbin/uta-extract`. Example: ``` -sbin/uta-extract $(pwd)/ncbi-data $(pwd)/uta-build/loading $(pwd)/uta-build/logs +sbin/uta-extract $(pwd)/ncbi-data $(pwd)/uta-build $(pwd)/uta-logs ``` #### 3B. Mitochondrial transcripts @@ -361,7 +361,7 @@ Run `sbin/ncbi_process_mito.py`. Example: ``` -sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/uta-build/loading | tee $(pwd)/uta-build/logs/mito.log +sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/uta-build | tee $(pwd)/uta-logs/mito.log ``` ### 4. Load data into SeqRepo @@ -370,7 +370,7 @@ Run `sbin/seqrepo-load`. Example: ``` -sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/logs +sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/uta-build $(pwd)/uta-logs ``` ### 5. Load data into UTA diff --git a/sbin/ncbi_process_mito.py b/sbin/ncbi_process_mito.py index f8cbdc3..b8d5480 100755 --- a/sbin/ncbi_process_mito.py +++ b/sbin/ncbi_process_mito.py @@ -283,7 +283,7 @@ def main(ncbi_accession: str, output_dir: str): ) # write out transcript sequence fasta files. - with open(f"{output_dir}/{ncbi_accession}.rna.fna", "w") as o_file: + with gzip.open(f"{output_dir}/{ncbi_accession}.rna.fna.gz", "wt") as o_file: for mg in mito_genes: record = SeqRecord( Seq(mg.tx_seq), @@ -293,7 +293,7 @@ def main(ncbi_accession: str, output_dir: str): o_file.write(record.format("fasta")) # write out protein sequence fasta files. - with open(f"{output_dir}/{ncbi_accession}.protein.faa", "w") as o_file: + with gzip.open(f"{output_dir}/{ncbi_accession}.protein.faa.gz", "wt") as o_file: for mg in mito_genes: if mg.pro_ac is not None: record = SeqRecord( diff --git a/sbin/seqrepo-load b/sbin/seqrepo-load index 9b830ce..9aa1453 100755 --- a/sbin/seqrepo-load +++ b/sbin/seqrepo-load @@ -4,19 +4,19 @@ set -e seqrepo_root=$1 seqrepo_version=$2 -ncbi_dir=$3 +sequence_dir=$3 logs_dir=$4 -if [ -z "$seqrepo_root" ] || [ -z "$seqrepo_version" ] || [ -z "$ncbi_dir" ] || [ -z "$logs_dir" ] +if [ -z "$seqrepo_root" ] || [ -z "$seqrepo_version" ] || [ -z "$sequence_dir" ] || [ -z "$logs_dir" ] then - echo 'Usage: sbin/seqrepo-load ' + echo 'Usage: sbin/seqrepo-load ' exit 1 fi ## Load SeqRepo with new sequences seqrepo --root-directory "$seqrepo_root" \ load -n NCBI --instance-name "$seqrepo_version" \ - $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz \ - $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz \ - $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz 2>& 1 | \ + $sequence_dir/*.rna.fna.gz + $sequence_dir/*.protein.faa.gz + $sequence_dir/*_genomic.fna.gz tee "$logs_dir/seqrepo-load.log" diff --git a/sbin/uta-extract b/sbin/uta-extract index 6864de4..84c3940 100755 --- a/sbin/uta-extract +++ b/sbin/uta-extract @@ -33,3 +33,8 @@ sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$loading_dir/exonsets.g # generate seqinfo files from exonsets sbin/exonset-to-seqinfo -o NCBI "$loading_dir/exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \ tee "$logs_dir/exonset-to-seqinfo.log" + +# move fasta files into same dir +cp $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz uta-build/ +cp $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz uta-build/ +cp $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz uta-build/ From 17168b992082de55248c05b8b6c5bfa24df96bd5 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 02:33:30 -0700 Subject: [PATCH 06/36] remove unneeded seqrepo version input --- docker-compose.yml | 2 +- etc/scripts/run-uta-build.sh | 12 ++++++------ sbin/seqrepo-download | 2 +- sbin/seqrepo-load | 5 ++--- sbin/uta-update | 7 +++---- 5 files changed, 13 insertions(+), 15 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 856a5ae..0d4923a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,7 +15,7 @@ services: network_mode: host uta-update: image: uta-update - command: etc/scripts/run-uta-build.sh ${UTA_VERSION} ${SEQREPO_VERSION} /ncbi-dir /workdir + command: etc/scripts/run-uta-build.sh ${UTA_VERSION} /ncbi-dir /workdir depends_on: uta: condition: service_healthy diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh index ccba691..ab98a16 100755 --- a/etc/scripts/run-uta-build.sh +++ b/etc/scripts/run-uta-build.sh @@ -1,20 +1,20 @@ #!/bin/bash # source_uta_v is the UTA version before the update. -# seqrepo_data_release is the SeqRepo version before the update. # ncbi_dir is where the script looks for NCBI data files. # working_dir stores log files, intermediate data files, and the final database dump. +# Note that the uta loading code uses the seqrepo location defined in the conf files, under [sequences].seqrepo. + set -euxo pipefail source_uta_v=$1 -seqrepo_data_release=$2 -ncbi_dir=$3 -working_dir=$4 +ncbi_dir=$2 +working_dir=$3 -if [ -z "$source_uta_v" ] || [ -z "$seqrepo_data_release" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] +if [ -z "$source_uta_v" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] then - echo 'Usage: run-uta-build.sh ' + echo 'Usage: run-uta-build.sh ' exit 1 fi diff --git a/sbin/seqrepo-download b/sbin/seqrepo-download index 0773030..6f628f1 100755 --- a/sbin/seqrepo-download +++ b/sbin/seqrepo-download @@ -50,4 +50,4 @@ fi # Copy seqrepo data into a local directory echo "Copying seqrepo data into $OUTPUT_DIR ..." -docker run -it -v $OUTPUT_DIR:/output-dir --volumes-from $SEQREPO_CONTAINER_NAME:ro ubuntu bash -c 'cp -R /usr/local/share/seqrepo/* /output-dir' +docker run -it -v $OUTPUT_DIR:/output-dir --volumes-from $SEQREPO_CONTAINER_NAME ubuntu bash -c 'cp -R /usr/local/share/seqrepo/* /output-dir' diff --git a/sbin/seqrepo-load b/sbin/seqrepo-load index 9aa1453..3f05842 100755 --- a/sbin/seqrepo-load +++ b/sbin/seqrepo-load @@ -16,7 +16,6 @@ fi ## Load SeqRepo with new sequences seqrepo --root-directory "$seqrepo_root" \ load -n NCBI --instance-name "$seqrepo_version" \ - $sequence_dir/*.rna.fna.gz - $sequence_dir/*.protein.faa.gz - $sequence_dir/*_genomic.fna.gz + $sequence_dir/*.fna.gz \ + $sequence_dir/*.faa.gz 2>& 1 | \ tee "$logs_dir/seqrepo-load.log" diff --git a/sbin/uta-update b/sbin/uta-update index 23d93af..81578bc 100755 --- a/sbin/uta-update +++ b/sbin/uta-update @@ -12,15 +12,14 @@ export NCBI_DIR=$1 export SEQREPO_DIR=$2 export WORKING_DIR=$3 export UTA_VERSION=$4 -export SEQREPO_VERSION=$5 -if [ -z "$NCBI_DIR" ] || [ -z "$SEQREPO_DIR" ] || [ -z "$WORKING_DIR" ] || [ -z "$UTA_VERSION" ] || [ -z "$SEQREPO_VERSION" ] +if [ -z "$NCBI_DIR" ] || [ -z "$SEQREPO_DIR" ] || [ -z "$WORKING_DIR" ] || [ -z "$UTA_VERSION" ] then - echo 'Usage: sbin/uta-update ' + echo 'Usage: sbin/uta-update ' exit 1 else echo "Updating UTA and SeqRepo using files in $NCBI_DIR and SeqRepo data in $SEQREPO_DIR" - echo "Starting from UTA version $UTA_VERSION and SeqRepo version $SEQREPO_VERSION" + echo "Starting from UTA version $UTA_VERSION" echo "Logs and intermediate files will be available in $WORKING_DIR" fi From 894ef4d0f67b13080f2a0d15d64dc901eda65a98 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 02:36:08 -0700 Subject: [PATCH 07/36] rename uta-update uta-load, in line with seqrepo-load and extract-transform-load --- README.md | 4 ++-- docker-compose.yml | 6 +++--- sbin/{uta-update => uta-load} | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) rename sbin/{uta-update => uta-load} (87%) diff --git a/README.md b/README.md index 1792c6b..c68d3e3 100644 --- a/README.md +++ b/README.md @@ -375,9 +375,9 @@ sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/uta-build $(pwd)/uta-log ### 5. Load data into UTA -Run `sbin/uta-update`. +Run `sbin/uta-load`. Example: ``` -sbin/uta-update $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/uta-build uta_20210129b 2024-02-20 +sbin/uta-load $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/uta-build uta_20210129b 2024-02-20 ``` diff --git a/docker-compose.yml b/docker-compose.yml index 0d4923a..39c8069 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,4 +1,4 @@ -# docker compose file for the UTA update procedure +# docker compose file for the UTA load procedure version: '3' @@ -13,8 +13,8 @@ services: interval: 10s retries: 60 network_mode: host - uta-update: - image: uta-update + uta-load: + image: uta-load command: etc/scripts/run-uta-build.sh ${UTA_VERSION} /ncbi-dir /workdir depends_on: uta: diff --git a/sbin/uta-update b/sbin/uta-load similarity index 87% rename from sbin/uta-update rename to sbin/uta-load index 81578bc..1a22a80 100755 --- a/sbin/uta-update +++ b/sbin/uta-load @@ -15,7 +15,7 @@ export UTA_VERSION=$4 if [ -z "$NCBI_DIR" ] || [ -z "$SEQREPO_DIR" ] || [ -z "$WORKING_DIR" ] || [ -z "$UTA_VERSION" ] then - echo 'Usage: sbin/uta-update ' + echo 'Usage: sbin/uta-load ' exit 1 else echo "Updating UTA and SeqRepo using files in $NCBI_DIR and SeqRepo data in $SEQREPO_DIR" @@ -59,8 +59,8 @@ if [ ! -d "$WORKING_DIR" ]; then fi # Build the UTA image. -docker build --target uta -t uta-update . +docker build --target uta -t uta-load . -# Bring up a UTA database and run the UTA update procedure. +# Bring up a UTA database and run the UTA load procedure. # docker compose doesn't respect the container name specified in the compose file, so container name is specified here -docker compose run --rm --name uta-update uta-update +docker compose run --rm --name uta-load uta-load From 89f8b3b7dca0c78fd61d99ca2a6bad50709c2ea6 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 02:37:27 -0700 Subject: [PATCH 08/36] simplify readme --- README.md | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/README.md b/README.md index c68d3e3..de90910 100644 --- a/README.md +++ b/README.md @@ -296,10 +296,6 @@ To develop UTA, follow these steps. Requires bash and docker. ### 1. Download files from NCBI - -Run `sbin/ncbi-download-docker`. - -Example: ``` sbin/ncbi-download-docker $(pwd)/ncbi-data ``` @@ -328,10 +324,6 @@ The specified directory will have the following structure: └── human.1.rna.gbff.gz ### 2. Download SeqRepo data - -Run `sbin/seqrepo-download`. - -Example: ``` sbin/seqrepo-download 2024-02-20 $(pwd)/seqrepo-data ``` @@ -347,37 +339,21 @@ mkdir -p $(pwd)/uta-logs ``` #### 3A. Nuclear transcripts - -Run `sbin/uta-extract`. - -Example: ``` sbin/uta-extract $(pwd)/ncbi-data $(pwd)/uta-build $(pwd)/uta-logs ``` #### 3B. Mitochondrial transcripts - -Run `sbin/ncbi_process_mito.py`. - -Example: ``` sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/uta-build | tee $(pwd)/uta-logs/mito.log ``` ### 4. Load data into SeqRepo - -Run `sbin/seqrepo-load`. - -Example: ``` sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/uta-build $(pwd)/uta-logs ``` ### 5. Load data into UTA - -Run `sbin/uta-load`. - -Example: ``` sbin/uta-load $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/uta-build uta_20210129b 2024-02-20 ``` From 6bd387ad115658e2021294c4389b4c0f03e9bee5 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 02:44:10 -0700 Subject: [PATCH 09/36] allow seqrepo to be modified --- sbin/seqrepo-download | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sbin/seqrepo-download b/sbin/seqrepo-download index 6f628f1..5c581b0 100755 --- a/sbin/seqrepo-download +++ b/sbin/seqrepo-download @@ -51,3 +51,6 @@ fi # Copy seqrepo data into a local directory echo "Copying seqrepo data into $OUTPUT_DIR ..." docker run -it -v $OUTPUT_DIR:/output-dir --volumes-from $SEQREPO_CONTAINER_NAME ubuntu bash -c 'cp -R /usr/local/share/seqrepo/* /output-dir' + +# Allow seqrepo to be modified +docker run -it -v $OUTPUT_DIR:/output-dir ubuntu bash -c 'chmod -R +w /output-dir' From ede145e2cba2104fe3f669a694c4b5ec4bcb6f32 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 02:50:49 -0700 Subject: [PATCH 10/36] restructure readme --- README.md | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index de90910..57896f6 100644 --- a/README.md +++ b/README.md @@ -295,12 +295,28 @@ To develop UTA, follow these steps. Requires bash and docker. -### 1. Download files from NCBI +### 1. Download SeqRepo data ``` +sbin/seqrepo-download 2024-02-20 $(pwd)/seqrepo-data +``` + +### 2. Extract and transform data from NCBI + +Download files from NCBI, and extract into intermediate files. + +See 2A for nuclear transcripts and 2B for mitochondrial transcripts. + +#### 2A. Nuclear transcripts +``` +mkdir -p $(pwd)/ncbi-data +mkdir -p $(pwd)/uta-build +mkdir -p $(pwd)/uta-logs + sbin/ncbi-download-docker $(pwd)/ncbi-data +sbin/uta-extract $(pwd)/ncbi-data $(pwd)/uta-build $(pwd)/uta-logs ``` -The specified directory will have the following structure: +The `ncbi-data` directory will have the following structure: ├── gene │ └── DATA @@ -323,37 +339,18 @@ The specified directory will have the following structure: ├── human.1.rna.fna.gz └── human.1.rna.gbff.gz -### 2. Download SeqRepo data -``` -sbin/seqrepo-download 2024-02-20 $(pwd)/seqrepo-data -``` - -### 3. Extract data from NCBI files into intermediate files - -See 3A for nuclear transcript updates and 3B for mitochondrial transcript updates. - -In either case, first create directories: -``` -mkdir -p $(pwd)/uta-build -mkdir -p $(pwd)/uta-logs -``` - -#### 3A. Nuclear transcripts -``` -sbin/uta-extract $(pwd)/ncbi-data $(pwd)/uta-build $(pwd)/uta-logs -``` -#### 3B. Mitochondrial transcripts +#### 2B. Mitochondrial transcripts ``` sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/uta-build | tee $(pwd)/uta-logs/mito.log ``` -### 4. Load data into SeqRepo +### 3. Load data into SeqRepo ``` sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/uta-build $(pwd)/uta-logs ``` -### 5. Load data into UTA +### 4. Load data into UTA ``` -sbin/uta-load $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/uta-build uta_20210129b 2024-02-20 +sbin/uta-load $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/uta-build uta_20210129b ``` From 44054fa1599e048d647d1f21274af51471895d66 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 02:55:57 -0700 Subject: [PATCH 11/36] change dirs in readme --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 57896f6..a1c6407 100644 --- a/README.md +++ b/README.md @@ -309,11 +309,11 @@ See 2A for nuclear transcripts and 2B for mitochondrial transcripts. #### 2A. Nuclear transcripts ``` mkdir -p $(pwd)/ncbi-data -mkdir -p $(pwd)/uta-build -mkdir -p $(pwd)/uta-logs +mkdir -p $(pwd)/output/artifacts +mkdir -p $(pwd)/output/logs sbin/ncbi-download-docker $(pwd)/ncbi-data -sbin/uta-extract $(pwd)/ncbi-data $(pwd)/uta-build $(pwd)/uta-logs +sbin/uta-extract $(pwd)/ncbi-data $(pwd)/output/artifacts $(pwd)/output/logs ``` The `ncbi-data` directory will have the following structure: @@ -342,15 +342,15 @@ The `ncbi-data` directory will have the following structure: #### 2B. Mitochondrial transcripts ``` -sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/uta-build | tee $(pwd)/uta-logs/mito.log +sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/output/artifacts | tee $(pwd)/output/logs/mito.log ``` ### 3. Load data into SeqRepo ``` -sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/uta-build $(pwd)/uta-logs +sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/output/artifacts $(pwd)/output/logs ``` ### 4. Load data into UTA ``` -sbin/uta-load $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/uta-build uta_20210129b +sbin/uta-load $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/output/artifacts uta_20210129b ``` From cc329156ca46f7353c3f97f581e6fc33267f35d1 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 03:11:25 -0700 Subject: [PATCH 12/36] be explicit about all dirs --- README.md | 2 +- docker-compose.yml | 5 +++-- etc/scripts/run-uta-build.sh | 27 ++++++++++++--------------- sbin/uta-load | 29 +++++++++++++++++++++-------- 4 files changed, 37 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index a1c6407..1fd195b 100644 --- a/README.md +++ b/README.md @@ -352,5 +352,5 @@ sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/output/artifacts $(pwd)/ ### 4. Load data into UTA ``` -sbin/uta-load $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/output/artifacts uta_20210129b +sbin/uta-load $(pwd)/ncbi-data $(pwd)/seqrepo-data uta_20210129b $(pwd)/output/artifacts $(pwd)/output/logs ``` diff --git a/docker-compose.yml b/docker-compose.yml index 39c8069..29a592d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,12 +15,13 @@ services: network_mode: host uta-load: image: uta-load - command: etc/scripts/run-uta-build.sh ${UTA_VERSION} /ncbi-dir /workdir + command: etc/scripts/run-uta-build.sh ${UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs depends_on: uta: condition: service_healthy volumes: - ${NCBI_DIR}:/ncbi-dir - ${SEQREPO_DIR}:/usr/local/share/seqrepo - - ${WORKING_DIR}:/workdir + - ${WORKING_DIR}:/uta-load/work + - ${LOG_DIR}:/uta-load/logs network_mode: host diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh index ab98a16..a23c9b6 100755 --- a/etc/scripts/run-uta-build.sh +++ b/etc/scripts/run-uta-build.sh @@ -2,7 +2,8 @@ # source_uta_v is the UTA version before the update. # ncbi_dir is where the script looks for NCBI data files. -# working_dir stores log files, intermediate data files, and the final database dump. +# working_dir stores intermediate data files and the final database dump. +# log_dir stores log files. # Note that the uta loading code uses the seqrepo location defined in the conf files, under [sequences].seqrepo. @@ -11,44 +12,40 @@ set -euxo pipefail source_uta_v=$1 ncbi_dir=$2 working_dir=$3 +log_dir=$4 -if [ -z "$source_uta_v" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] +if [ -z "$source_uta_v" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ] then - echo 'Usage: run-uta-build.sh ' + echo 'Usage: run-uta-build.sh ' exit 1 fi # set local variables and create working directories loading_uta_v="uta_1_1" -loading_dir="$working_dir/loading" -dumps_dir="$working_dir/dumps" -logs_dir="$working_dir/logs" -for d in "$loading_dir" "$dumps_dir" "$logs_dir"; - do mkdir -p "$d" -done +mkdir -p "$logs_dir" ## Drop loading schema, and recreate etc/scripts/delete-schema.sh "$loading_uta_v" etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v" # Filter out columns from assocacs file. -sbin/assoc-acs-merge "$loading_dir/assocacs.gz" | gzip -c > "$loading_dir/assoc-ac.gz" 2>&1 | \ +sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc-ac.gz" 2>&1 | \ tee "$logs_dir/assoc-acs-merge.log" # Load genes into gene table. -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$loading_dir/geneinfo.gz" 2>&1 | \ +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \ tee "$logs_dir/load-geneinfo.log" # Load accessions into associated_accessions table. -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$loading_dir/assoc-ac.gz" 2>&1 | \ +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$working_dir/assoc-ac.gz" 2>&1 | \ tee "$logs_dir/load-assoc-ac.log" # Load transcript info into transcript and exon_set tables. -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$loading_dir/txinfo.gz" 2>&1 | \ +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$working_dir/txinfo.gz" 2>&1 | \ tee "$logs_dir/load-txinfo.log" # Load exon sets into into exon_set and exon tables. -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$loading_dir/exonsets.gz" 2>&1 | \ +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$working_dir/exonsets.gz" 2>&1 | \ tee "$logs_dir/load-exonsets.log" # Create cigar strings for all rows in tx_alt_exon_pairs_v view and update exon_aln table. @@ -61,4 +58,4 @@ uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 | sbin/uta-diff "$source_uta_v" "$loading_uta_v" ### psql_dump -pg_dump -U uta_admin -h localhost -d uta -t "$loading_uta_v.gene" | gzip -c > "$dumps_dir/uta.pgd.gz" +pg_dump -U uta_admin -h localhost -d uta -t "$loading_uta_v.gene" | gzip -c > "$working_dir/uta.pgd.gz" diff --git a/sbin/uta-load b/sbin/uta-load index 1a22a80..6de36ab 100755 --- a/sbin/uta-load +++ b/sbin/uta-load @@ -1,26 +1,28 @@ #!/usr/bin/env bash # This script runs the UTA update procedure. -# It updates the specified UTA and SeqRepo using the given NCBI files. -# It produces a postgres dump of the updated UTA database and an updated SeqRepo (updated in place). # It expects to be run from the root of the uta repository. +# It updates the specified UTA and SeqRepo using the given NCBI files. +# The UTA update is provided as a postgres database dump, and SeqRepo is updated in place. set -e # export environment variables for docker compose file export NCBI_DIR=$1 export SEQREPO_DIR=$2 -export WORKING_DIR=$3 -export UTA_VERSION=$4 +export UTA_VERSION=$3 +export WORKING_DIR=$4 +export LOG_DIR=$4 -if [ -z "$NCBI_DIR" ] || [ -z "$SEQREPO_DIR" ] || [ -z "$WORKING_DIR" ] || [ -z "$UTA_VERSION" ] +if [ -z "$NCBI_DIR" ] || [ -z "$SEQREPO_DIR" ] || [ -z "$UTA_VERSION" ] || [ -z "$WORKING_DIR" ] || [ -z "$LOG_DIR" ] then - echo 'Usage: sbin/uta-load ' + echo 'Usage: sbin/uta-load ' exit 1 else echo "Updating UTA and SeqRepo using files in $NCBI_DIR and SeqRepo data in $SEQREPO_DIR" echo "Starting from UTA version $UTA_VERSION" - echo "Logs and intermediate files will be available in $WORKING_DIR" + echo "Final dump and any intermediate files will be available in $WORKING_DIR" + echo "Logs will be available in $LOG_DIR" fi # Ensure directories are compatible with docker volume usage @@ -38,7 +40,13 @@ fi if [[ $WORKING_DIR != /* ]] && [[ $WORKING_DIR != .* ]] then - echo 'Working directory must start with / or .' + echo 'Output directory must start with / or .' + exit 1 +fi + +if [[ $LOG_DIR != /* ]] && [[ $LOG_DIR != .* ]] +then + echo 'Log directory must start with / or .' exit 1 fi @@ -58,6 +66,11 @@ if [ ! -d "$WORKING_DIR" ]; then exit 1 fi +if [ ! -d "$LOG_DIR" ]; then + echo "Directory $LOG_DIR does not exist." + exit 1 +fi + # Build the UTA image. docker build --target uta -t uta-load . From df62d46228e830b7de29baaa932709b2e7d37636 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 03:13:01 -0700 Subject: [PATCH 13/36] mkdir needs to happen in both nuclear and mito paths --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 1fd195b..4f4b450 100644 --- a/README.md +++ b/README.md @@ -301,6 +301,11 @@ sbin/seqrepo-download 2024-02-20 $(pwd)/seqrepo-data ``` ### 2. Extract and transform data from NCBI +``` +mkdir -p $(pwd)/ncbi-data +mkdir -p $(pwd)/output/artifacts +mkdir -p $(pwd)/output/logs +``` Download files from NCBI, and extract into intermediate files. @@ -308,10 +313,6 @@ See 2A for nuclear transcripts and 2B for mitochondrial transcripts. #### 2A. Nuclear transcripts ``` -mkdir -p $(pwd)/ncbi-data -mkdir -p $(pwd)/output/artifacts -mkdir -p $(pwd)/output/logs - sbin/ncbi-download-docker $(pwd)/ncbi-data sbin/uta-extract $(pwd)/ncbi-data $(pwd)/output/artifacts $(pwd)/output/logs ``` From f6e5032a941da686bd11d085f0e192d6ff253ae3 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 03:17:29 -0700 Subject: [PATCH 14/36] consistent dir name --- README.md | 4 ++++ etc/scripts/run-uta-build.sh | 14 +++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 4f4b450..eff1f88 100644 --- a/README.md +++ b/README.md @@ -314,6 +314,8 @@ See 2A for nuclear transcripts and 2B for mitochondrial transcripts. #### 2A. Nuclear transcripts ``` sbin/ncbi-download-docker $(pwd)/ncbi-data + +# todo: move into docker sbin/uta-extract $(pwd)/ncbi-data $(pwd)/output/artifacts $(pwd)/output/logs ``` @@ -343,11 +345,13 @@ The `ncbi-data` directory will have the following structure: #### 2B. Mitochondrial transcripts ``` +# todo: move into docker sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/output/artifacts | tee $(pwd)/output/logs/mito.log ``` ### 3. Load data into SeqRepo ``` +# todo: move into docker sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/output/artifacts $(pwd)/output/logs ``` diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh index a23c9b6..69c4750 100755 --- a/etc/scripts/run-uta-build.sh +++ b/etc/scripts/run-uta-build.sh @@ -22,7 +22,7 @@ fi # set local variables and create working directories loading_uta_v="uta_1_1" -mkdir -p "$logs_dir" +mkdir -p "$log_dir" ## Drop loading schema, and recreate etc/scripts/delete-schema.sh "$loading_uta_v" @@ -30,27 +30,27 @@ etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v" # Filter out columns from assocacs file. sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc-ac.gz" 2>&1 | \ - tee "$logs_dir/assoc-acs-merge.log" + tee "$log_dir/assoc-acs-merge.log" # Load genes into gene table. uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \ - tee "$logs_dir/load-geneinfo.log" + tee "$log_dir/load-geneinfo.log" # Load accessions into associated_accessions table. uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$working_dir/assoc-ac.gz" 2>&1 | \ - tee "$logs_dir/load-assoc-ac.log" + tee "$log_dir/load-assoc-ac.log" # Load transcript info into transcript and exon_set tables. uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$working_dir/txinfo.gz" 2>&1 | \ - tee "$logs_dir/load-txinfo.log" + tee "$log_dir/load-txinfo.log" # Load exon sets into into exon_set and exon tables. uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$working_dir/exonsets.gz" 2>&1 | \ - tee "$logs_dir/load-exonsets.log" + tee "$log_dir/load-exonsets.log" # Create cigar strings for all rows in tx_alt_exon_pairs_v view and update exon_aln table. uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 | \ - tee "$logs_dir/align-exons.log" + tee "$log_dir/align-exons.log" # Load seqinfo? From 23617261e5dfae16f3c4e529b3ac531dd4668068 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 03:44:15 -0700 Subject: [PATCH 15/36] mito: strand should be an int --- etc/scripts/run-uta-build.sh | 6 +++--- sbin/ncbi_process_mito.py | 6 ++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh index 69c4750..0aab2f5 100755 --- a/etc/scripts/run-uta-build.sh +++ b/etc/scripts/run-uta-build.sh @@ -32,9 +32,9 @@ etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v" sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc-ac.gz" 2>&1 | \ tee "$log_dir/assoc-acs-merge.log" -# Load genes into gene table. -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \ - tee "$log_dir/load-geneinfo.log" +# # Load genes into gene table. +# uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \ +# tee "$log_dir/load-geneinfo.log" # Load accessions into associated_accessions table. uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$working_dir/assoc-ac.gz" 2>&1 | \ diff --git a/sbin/ncbi_process_mito.py b/sbin/ncbi_process_mito.py index b8d5480..c65984b 100755 --- a/sbin/ncbi_process_mito.py +++ b/sbin/ncbi_process_mito.py @@ -86,7 +86,7 @@ class MitoGeneData: alt_ac: str alt_start: int alt_end: int - strand: str + strand: int origin: str = "NCBI" alignment_method: str = "splign" transl_table: Optional[str] = None @@ -204,9 +204,7 @@ def get_mito_genes(gbff_filepath: str): # retrieve sequence, and reverse compliment if on reverse strand ac = f"{record.id}_{feature.location.start:05}_{feature.location.end:05}" feature_seq = record.seq[feature_start:feature_end] - strand = "+" if feature.location.strand == -1: - strand = "-" feature_seq = feature_seq.reverse_complement() if feature.type == "CDS": @@ -229,7 +227,7 @@ def get_mito_genes(gbff_filepath: str): alt_ac=record.id, alt_start=feature_start, alt_end=feature_end, - strand=strand, + strand=feature.location.strand, transl_table=transl_table, transl_except=transl_except, pro_ac=pro_ac, From cf46cc0538ef0bf35a3b7cbc16341a3f4483d7c2 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 10:53:39 -0700 Subject: [PATCH 16/36] reanme uta loading script --- docker-compose.yml | 2 +- etc/scripts/run-uta-build.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 29a592d..65611fa 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,7 +15,7 @@ services: network_mode: host uta-load: image: uta-load - command: etc/scripts/run-uta-build.sh ${UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs + command: etc/scripts/uta-load ${UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs depends_on: uta: condition: service_healthy diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh index 0aab2f5..b541bb7 100755 --- a/etc/scripts/run-uta-build.sh +++ b/etc/scripts/run-uta-build.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # source_uta_v is the UTA version before the update. # ncbi_dir is where the script looks for NCBI data files. @@ -16,7 +16,7 @@ log_dir=$4 if [ -z "$source_uta_v" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ] then - echo 'Usage: run-uta-build.sh ' + echo 'Usage: uta-load ' exit 1 fi From 329fdc4ee69f9294e8d3086c899a46e81a4dc8a6 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 11:12:08 -0700 Subject: [PATCH 17/36] remove docker wrapper script for download --- README.md | 2 +- docker-compose.yml | 8 ++++++++ sbin/ncbi-download | 1 + sbin/ncbi-download-docker | 24 ------------------------ 4 files changed, 10 insertions(+), 25 deletions(-) delete mode 100755 sbin/ncbi-download-docker diff --git a/README.md b/README.md index eff1f88..a432682 100644 --- a/README.md +++ b/README.md @@ -313,7 +313,7 @@ See 2A for nuclear transcripts and 2B for mitochondrial transcripts. #### 2A. Nuclear transcripts ``` -sbin/ncbi-download-docker $(pwd)/ncbi-data +NCBI_DIR=./ncbi-data2 docker compose run ncbi-download # todo: move into docker sbin/uta-extract $(pwd)/ncbi-data $(pwd)/output/artifacts $(pwd)/output/logs diff --git a/docker-compose.yml b/docker-compose.yml index 65611fa..807fc49 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,6 +3,14 @@ version: '3' services: + ncbi-download: + image: uta-load + command: sbin/ncbi-download /ncbi-dir + volumes: + - .:/opt/repos/uta + - ${NCBI_DIR}:/ncbi-dir + working_dir: /opt/repos/uta + network_mode: host uta: container_name: uta image: biocommons/uta:${UTA_VERSION} diff --git a/sbin/ncbi-download b/sbin/ncbi-download index 61a6874..500c25f 100755 --- a/sbin/ncbi-download +++ b/sbin/ncbi-download @@ -26,6 +26,7 @@ do DOWNLOAD_MODULE="${DOWNLOAD_PATH%%/*}" DOWNLOAD_SRC="ftp.ncbi.nlm.nih.gov::$DOWNLOAD_PATH" DOWNLOAD_DST="$DOWNLOAD_DIR/$DOWNLOAD_MODULE" + mkdir -p $DOWNLOAD_DST echo "Downloading $DOWNLOAD_SRC to $DOWNLOAD_DST" rsync --no-motd -DHPRprtv "$DOWNLOAD_SRC" "$DOWNLOAD_DST" done diff --git a/sbin/ncbi-download-docker b/sbin/ncbi-download-docker deleted file mode 100755 index 30d4f63..0000000 --- a/sbin/ncbi-download-docker +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env bash - -# This script runs ncbi-download in a docker container. - -set -e - -DOWNLOAD_DIR=$1 - -if [ -z "$DOWNLOAD_DIR" ] -then - echo 'Usage: sbin/ncbi-download-docker ' - exit 1 -else - echo "Downloading files to $DOWNLOAD_DIR" -fi - -if [[ $DOWNLOAD_DIR != /* ]] -then - echo 'Download directory must be an absolute path' - exit 1 -fi - -docker build --target uta -t ncbi-download --progress plain . -docker run -it -v $(pwd)/sbin/ncbi-download:/dl-script -v $DOWNLOAD_DIR:/output-dir ncbi-download /dl-script /output-dir From bcfbf3816e9ef43d3d96e1c372ffee597f9a619f Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 11:28:28 -0700 Subject: [PATCH 18/36] create compose service for uta-extract --- README.md | 56 ++++++++++++++++++---------------------------- docker-compose.yml | 21 ++++++++++++----- sbin/ncbi-download | 23 +++++++++++++++++++ sbin/uta-extract | 2 +- 4 files changed, 62 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index a432682..b2a397a 100644 --- a/README.md +++ b/README.md @@ -293,56 +293,44 @@ To develop UTA, follow these steps. ## UTA update procedure -Requires bash and docker. +Requires docker. -### 1. Download SeqRepo data -``` -sbin/seqrepo-download 2024-02-20 $(pwd)/seqrepo-data -``` +### 0. Setup -### 2. Extract and transform data from NCBI +Make directories: ``` mkdir -p $(pwd)/ncbi-data mkdir -p $(pwd)/output/artifacts mkdir -p $(pwd)/output/logs ``` +Set variables: +``` +export UTA_ETL_NCBI_DIR=./ncbi-data +export UTA_ETL_SEQREPO_DIR=./seqrepo-data +export UTA_ETL_UTA_VERSION=uta_20210129b +export UTA_ETL_WORK_DIR=./output/artifacts +export UTA_ETL_LOG_DIR=./output/logs +``` + +### 1. Download SeqRepo data +``` +tbd +sbin/seqrepo-download 2024-02-20 $(pwd)/seqrepo-data +``` + +### 2. Extract and transform data from NCBI + Download files from NCBI, and extract into intermediate files. See 2A for nuclear transcripts and 2B for mitochondrial transcripts. #### 2A. Nuclear transcripts ``` -NCBI_DIR=./ncbi-data2 docker compose run ncbi-download - -# todo: move into docker -sbin/uta-extract $(pwd)/ncbi-data $(pwd)/output/artifacts $(pwd)/output/logs +docker compose run ncbi-download +docker compose run uta-extract ``` -The `ncbi-data` directory will have the following structure: - - ├── gene - │ └── DATA - │ ├── GENE_INFO - │ │ └── Mammalia - │ │ └── Homo_sapiens.gene_info.gz - │ └── gene2accession.gz - ├── genomes - │ └── refseq - │ └── vertebrate_mammalian - │ └── Homo_sapiens - │ └── all_assembly_versions - │ └── GCF_000001405.25_GRCh37.p13 - │ ├── GCF_000001405.25_GRCh37.p13_genomic.fna.gz - │ └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz - └── refseq - └── H_sapiens - └── mRNA_Prot - ├── human.1.protein.faa.gz - ├── human.1.rna.fna.gz - └── human.1.rna.gbff.gz - - #### 2B. Mitochondrial transcripts ``` # todo: move into docker diff --git a/docker-compose.yml b/docker-compose.yml index 807fc49..f048104 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,7 +8,18 @@ services: command: sbin/ncbi-download /ncbi-dir volumes: - .:/opt/repos/uta - - ${NCBI_DIR}:/ncbi-dir + - ${UTA_ETL_NCBI_DIR}:/ncbi-dir + working_dir: /opt/repos/uta + network_mode: host + uta-extract: + image: uta-load + command: sbin/uta-extract /ncbi-dir /uta-extract/work /uta-extract/logs + volumes: + - .:/opt/repos/uta + - ${UTA_ETL_NCBI_DIR}:/ncbi-dir + - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo + - ${UTA_ETL_WORK_DIR}:/uta-extract/work + - ${UTA_ETL_LOG_DIR}:/uta-extract/logs working_dir: /opt/repos/uta network_mode: host uta: @@ -28,8 +39,8 @@ services: uta: condition: service_healthy volumes: - - ${NCBI_DIR}:/ncbi-dir - - ${SEQREPO_DIR}:/usr/local/share/seqrepo - - ${WORKING_DIR}:/uta-load/work - - ${LOG_DIR}:/uta-load/logs + - ${UTA_ETL_NCBI_DIR}:/ncbi-dir + - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo + - ${UTA_ETL_WORK_DIR}:/uta-load/work + - ${UTA_ETL_LOG_DIR}:/uta-load/logs network_mode: host diff --git a/sbin/ncbi-download b/sbin/ncbi-download index 500c25f..dc445cd 100755 --- a/sbin/ncbi-download +++ b/sbin/ncbi-download @@ -1,6 +1,29 @@ #!/usr/bin/env bash # This script downloads the files needed for a UTA+SeqRepo update into to the given directory. +# +# DONWLOAD_DIR will have the following structure: +# +# ├── gene +# │ └── DATA +# │ ├── GENE_INFO +# │ │ └── Mammalia +# │ │ └── Homo_sapiens.gene_info.gz +# │ └── gene2accession.gz +# ├── genomes +# │ └── refseq +# │ └── vertebrate_mammalian +# │ └── Homo_sapiens +# │ └── all_assembly_versions +# │ └── GCF_000001405.25_GRCh37.p13 +# │ ├── GCF_000001405.25_GRCh37.p13_genomic.fna.gz +# │ └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz +# └── refseq +# └── H_sapiens +# └── mRNA_Prot +# ├── human.1.protein.faa.gz +# ├── human.1.rna.fna.gz +# └── human.1.rna.gbff.gz set -e diff --git a/sbin/uta-extract b/sbin/uta-extract index 84c3940..4d89f86 100755 --- a/sbin/uta-extract +++ b/sbin/uta-extract @@ -30,7 +30,7 @@ GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_as sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$loading_dir/exonsets.gz" 2>&1 | \ tee "$logs_dir/ncbi-parse-genomic-gff.log" -# generate seqinfo files from exonsets +# generate seqinfo files from exonsets (this step requires seqrepo) sbin/exonset-to-seqinfo -o NCBI "$loading_dir/exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \ tee "$logs_dir/exonset-to-seqinfo.log" From a3dcbcf705dfcd0d02f2519bf7476892b355afee Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 11:38:46 -0700 Subject: [PATCH 19/36] create compose service for mito-extract --- README.md | 3 +-- docker-compose.yml | 9 +++++++++ sbin/uta-extract | 22 +++++++++++----------- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index b2a397a..332980f 100644 --- a/README.md +++ b/README.md @@ -333,8 +333,7 @@ docker compose run uta-extract #### 2B. Mitochondrial transcripts ``` -# todo: move into docker -sbin/ncbi_process_mito.py NC_012920.1 --output-dir $(pwd)/output/artifacts | tee $(pwd)/output/logs/mito.log +docker compose run mito-extract ``` ### 3. Load data into SeqRepo diff --git a/docker-compose.yml b/docker-compose.yml index f048104..8ba1345 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -44,3 +44,12 @@ services: - ${UTA_ETL_WORK_DIR}:/uta-load/work - ${UTA_ETL_LOG_DIR}:/uta-load/logs network_mode: host + mito-extract: + image: uta-load + command: sbin/ncbi_process_mito.py NC_012920.1 --output-dir /mito-extract/work | tee /mito-extract/logs/mito.log + volumes: + - .:/opt/repos/uta + - ${UTA_ETL_WORK_DIR}:/mito-extract/work + - ${UTA_ETL_LOG_DIR}:/mito-extract/logs + working_dir: /opt/repos/uta + network_mode: host diff --git a/sbin/uta-extract b/sbin/uta-extract index 4d89f86..b34938f 100755 --- a/sbin/uta-extract +++ b/sbin/uta-extract @@ -3,38 +3,38 @@ # Extract data from NCBI files into intermediate files. ncbi_dir=$1 -loading_dir=$2 +working_dir=$2 logs_dir=$3 -if [ -z "$ncbi_dir" ] || [ -z "$loading_dir" ] || [ -z "$logs_dir" ] +if [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$logs_dir" ] then - echo 'Usage: sbin/uta-extract ' + echo 'Usage: sbin/uta-extract ' exit 1 fi # genes sbin/ncbi-parse-geneinfo $ncbi_dir/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz | \ - gzip -c > "$loading_dir/geneinfo.gz" 2>&1 | tee "$logs_dir/ncbi-parse-geneinfo.log" + gzip -c > "$working_dir/geneinfo.gz" 2>&1 | tee "$logs_dir/ncbi-parse-geneinfo.log" # transcript protein associations -sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2accession.gz | gzip -c > "$loading_dir/assocacs.gz" 2>&1 | \ +sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2accession.gz | gzip -c > "$working_dir/assocacs.gz" 2>&1 | \ tee "$logs_dir/ncbi-fetch-assoc-acs.log" # parse transcript info from GBFF input files GBFF_files=$(ls $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz) -sbin/ncbi-parse-gbff "$GBFF_files" | gzip -c > "$loading_dir/txinfo.gz" 2>&1 | \ +sbin/ncbi-parse-gbff "$GBFF_files" | gzip -c > "$working_dir/txinfo.gz" 2>&1 | \ tee "$logs_dir/ncbi-parse-gbff.log" # parse alignments from GFF input files GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.gff.gz) -sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$loading_dir/exonsets.gz" 2>&1 | \ +sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$working_dir/exonsets.gz" 2>&1 | \ tee "$logs_dir/ncbi-parse-genomic-gff.log" # generate seqinfo files from exonsets (this step requires seqrepo) -sbin/exonset-to-seqinfo -o NCBI "$loading_dir/exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \ +sbin/exonset-to-seqinfo -o NCBI "$working_dir/exonsets.gz" | gzip -c > "$working_dir/seqinfo.gz" 2>&1 | \ tee "$logs_dir/exonset-to-seqinfo.log" # move fasta files into same dir -cp $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz uta-build/ -cp $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz uta-build/ -cp $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz uta-build/ +cp $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz $working_dir/ +cp $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz $working_dir/ +cp $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz $working_dir/ From d327c0a253ee1c00b2eb0da01db6ffba54179f1e Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 11:41:44 -0700 Subject: [PATCH 20/36] create compose service for seqrepo-load --- README.md | 3 +-- docker-compose.yml | 10 ++++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 332980f..3985719 100644 --- a/README.md +++ b/README.md @@ -338,8 +338,7 @@ docker compose run mito-extract ### 3. Load data into SeqRepo ``` -# todo: move into docker -sbin/seqrepo-load $(pwd)/seqrepo-data 2024-02-20 $(pwd)/output/artifacts $(pwd)/output/logs +docker compose run seqrepo-load ``` ### 4. Load data into UTA diff --git a/docker-compose.yml b/docker-compose.yml index 8ba1345..652f3ff 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -22,6 +22,16 @@ services: - ${UTA_ETL_LOG_DIR}:/uta-extract/logs working_dir: /opt/repos/uta network_mode: host + seqrepo-load: + image: uta-load + command: sbin/seqrepo-load /usr/local/share/seqrepo 2024-02-20 /seqrepo-load/work /seqrepo-load/logs + volumes: + - .:/opt/repos/uta + - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo + - ${UTA_ETL_WORK_DIR}:/seqrepo-load/work + - ${UTA_ETL_LOG_DIR}:/seqrepo-load/logs + working_dir: /opt/repos/uta + network_mode: host uta: container_name: uta image: biocommons/uta:${UTA_VERSION} From 540b4ddeb93adf4f01ecf3b5511d385aed800d39 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 11:57:01 -0700 Subject: [PATCH 21/36] clean up uta-load command --- README.md | 15 ++++++++++++++- docker-compose.yml | 19 ++++++++++--------- etc/scripts/{run-uta-build.sh => uta-load} | 1 + sbin/uta-load | 9 +-------- 4 files changed, 26 insertions(+), 18 deletions(-) rename etc/scripts/{run-uta-build.sh => uta-load} (97%) diff --git a/README.md b/README.md index 3985719..f5a28d7 100644 --- a/README.md +++ b/README.md @@ -313,6 +313,14 @@ export UTA_ETL_WORK_DIR=./output/artifacts export UTA_ETL_LOG_DIR=./output/logs ``` +Build the UTA image: +``` +docker build --target uta -t uta-update . +``` + +Note: docker compose does not respect the container name specified in the compose file, +so you may want to specify it with `--name=` + ### 1. Download SeqRepo data ``` tbd @@ -342,6 +350,11 @@ docker compose run seqrepo-load ``` ### 4. Load data into UTA + +Bring up a UTA database and run the UTA load procedure. + +UTA is updated and the database is dumped into a pgd file. SeqRepo is updated in place. + ``` -sbin/uta-load $(pwd)/ncbi-data $(pwd)/seqrepo-data uta_20210129b $(pwd)/output/artifacts $(pwd)/output/logs +docker compose run uta-load ``` diff --git a/docker-compose.yml b/docker-compose.yml index 652f3ff..bdfc033 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,10 +1,10 @@ -# docker compose file for the UTA load procedure +# docker compose file for the UTA update procedure version: '3' services: ncbi-download: - image: uta-load + image: uta-update command: sbin/ncbi-download /ncbi-dir volumes: - .:/opt/repos/uta @@ -12,7 +12,7 @@ services: working_dir: /opt/repos/uta network_mode: host uta-extract: - image: uta-load + image: uta-update command: sbin/uta-extract /ncbi-dir /uta-extract/work /uta-extract/logs volumes: - .:/opt/repos/uta @@ -23,7 +23,7 @@ services: working_dir: /opt/repos/uta network_mode: host seqrepo-load: - image: uta-load + image: uta-update command: sbin/seqrepo-load /usr/local/share/seqrepo 2024-02-20 /seqrepo-load/work /seqrepo-load/logs volumes: - .:/opt/repos/uta @@ -34,28 +34,29 @@ services: network_mode: host uta: container_name: uta - image: biocommons/uta:${UTA_VERSION} + image: biocommons/uta:${UTA_ETL_UTA_VERSION} environment: - POSTGRES_HOST_AUTH_METHOD=trust healthcheck: - test: psql -h localhost -U anonymous -d uta -c "select * from ${UTA_VERSION}.meta" + test: psql -h localhost -U anonymous -d uta -c "select * from ${UTA_ETL_UTA_VERSION}.meta" interval: 10s retries: 60 network_mode: host uta-load: - image: uta-load - command: etc/scripts/uta-load ${UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs + image: uta-update + command: etc/scripts/uta-load ${UTA_ETL_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs depends_on: uta: condition: service_healthy volumes: + - .:/opt/repos/uta - ${UTA_ETL_NCBI_DIR}:/ncbi-dir - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo - ${UTA_ETL_WORK_DIR}:/uta-load/work - ${UTA_ETL_LOG_DIR}:/uta-load/logs network_mode: host mito-extract: - image: uta-load + image: uta-update command: sbin/ncbi_process_mito.py NC_012920.1 --output-dir /mito-extract/work | tee /mito-extract/logs/mito.log volumes: - .:/opt/repos/uta diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/uta-load similarity index 97% rename from etc/scripts/run-uta-build.sh rename to etc/scripts/uta-load index b541bb7..ebc3e36 100755 --- a/etc/scripts/run-uta-build.sh +++ b/etc/scripts/uta-load @@ -1,5 +1,6 @@ #!/usr/bin/env bash +# This script updates UTA and SeqRepo using NCBI files. # source_uta_v is the UTA version before the update. # ncbi_dir is where the script looks for NCBI data files. # working_dir stores intermediate data files and the final database dump. diff --git a/sbin/uta-load b/sbin/uta-load index 6de36ab..340a82f 100755 --- a/sbin/uta-load +++ b/sbin/uta-load @@ -1,9 +1,5 @@ #!/usr/bin/env bash -# This script runs the UTA update procedure. -# It expects to be run from the root of the uta repository. -# It updates the specified UTA and SeqRepo using the given NCBI files. -# The UTA update is provided as a postgres database dump, and SeqRepo is updated in place. set -e @@ -19,10 +15,7 @@ then echo 'Usage: sbin/uta-load ' exit 1 else - echo "Updating UTA and SeqRepo using files in $NCBI_DIR and SeqRepo data in $SEQREPO_DIR" - echo "Starting from UTA version $UTA_VERSION" - echo "Final dump and any intermediate files will be available in $WORKING_DIR" - echo "Logs will be available in $LOG_DIR" + fi # Ensure directories are compatible with docker volume usage From edd2a094c71651299bb25c3e8f0e786d223a64d7 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:05:00 -0700 Subject: [PATCH 22/36] delete docker wrapper script for uta-load --- sbin/uta-load | 72 --------------------------------------------------- 1 file changed, 72 deletions(-) delete mode 100755 sbin/uta-load diff --git a/sbin/uta-load b/sbin/uta-load deleted file mode 100755 index 340a82f..0000000 --- a/sbin/uta-load +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env bash - - -set -e - -# export environment variables for docker compose file -export NCBI_DIR=$1 -export SEQREPO_DIR=$2 -export UTA_VERSION=$3 -export WORKING_DIR=$4 -export LOG_DIR=$4 - -if [ -z "$NCBI_DIR" ] || [ -z "$SEQREPO_DIR" ] || [ -z "$UTA_VERSION" ] || [ -z "$WORKING_DIR" ] || [ -z "$LOG_DIR" ] -then - echo 'Usage: sbin/uta-load ' - exit 1 -else - -fi - -# Ensure directories are compatible with docker volume usage -if [[ $NCBI_DIR != /* ]] && [[ $NCBI_DIR != .* ]] -then - echo 'NCBI file directory must start with / or .' - exit 1 -fi - -if [[ $SEQREPO_DIR != /* ]] && [[ $SEQREPO_DIR != .* ]] -then - echo 'SeqRepo data directory must start with / or .' - exit 1 -fi - -if [[ $WORKING_DIR != /* ]] && [[ $WORKING_DIR != .* ]] -then - echo 'Output directory must start with / or .' - exit 1 -fi - -if [[ $LOG_DIR != /* ]] && [[ $LOG_DIR != .* ]] -then - echo 'Log directory must start with / or .' - exit 1 -fi - -# Ensure directories exist. -if [ ! -d "$NCBI_DIR" ]; then - echo "Directory $NCBI_DIR does not exist." - exit 1 -fi - -if [ ! -d "$SEQREPO_DIR" ]; then - echo "Directory $SEQREPO_DIR does not exist." - exit 1 -fi - -if [ ! -d "$WORKING_DIR" ]; then - echo "Directory $WORKING_DIR does not exist." - exit 1 -fi - -if [ ! -d "$LOG_DIR" ]; then - echo "Directory $LOG_DIR does not exist." - exit 1 -fi - -# Build the UTA image. -docker build --target uta -t uta-load . - -# Bring up a UTA database and run the UTA load procedure. -# docker compose doesn't respect the container name specified in the compose file, so container name is specified here -docker compose run --rm --name uta-load uta-load From 844b1276432f51dedaaa8d92c6adc000f20eaead Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:14:31 -0700 Subject: [PATCH 23/36] remove docker wraper script for sr download --- README.md | 18 +++++++++++--- docker-compose.yml | 6 ++--- sbin/seqrepo-download | 56 ------------------------------------------- 3 files changed, 18 insertions(+), 62 deletions(-) delete mode 100755 sbin/seqrepo-download diff --git a/README.md b/README.md index f5a28d7..6ae9e75 100644 --- a/README.md +++ b/README.md @@ -306,9 +306,10 @@ mkdir -p $(pwd)/output/logs Set variables: ``` +export UTA_ETL_OLD_SEQREPO_VERSION=2024-02-20 +export UTA_ETL_OLD_UTA_VERSION=uta_20210129b export UTA_ETL_NCBI_DIR=./ncbi-data export UTA_ETL_SEQREPO_DIR=./seqrepo-data -export UTA_ETL_UTA_VERSION=uta_20210129b export UTA_ETL_WORK_DIR=./output/artifacts export UTA_ETL_LOG_DIR=./output/logs ``` @@ -323,10 +324,21 @@ so you may want to specify it with `--name=` ### 1. Download SeqRepo data ``` -tbd -sbin/seqrepo-download 2024-02-20 $(pwd)/seqrepo-data +docker pull biocommons/seqrepo:$UTA_ETL_OLD_SEQREPO_VERSION + +# download seqrepo. can skip if container already exists. +docker run --name seqrepo biocommons/seqrepo:$UTA_ETL_OLD_SEQREPO_VERSION + +# copy seqrepo data into a local directory +docker run -v $UTA_ETL_SEQREPO_DIR:/output-dir --volumes-from seqrepo ubuntu bash -c 'cp -R /usr/local/share/seqrepo/* /output-dir' + +# allow seqrepo to be modified +docker run -it -v $UTA_ETL_SEQREPO_DIR:/output-dir ubuntu bash -c 'chmod -R +w /output-dir' ``` +Note: pulling data takes ~30 minutes and requires ~13 GB. +Note: a container called seqrepo will be left behind. + ### 2. Extract and transform data from NCBI Download files from NCBI, and extract into intermediate files. diff --git a/docker-compose.yml b/docker-compose.yml index bdfc033..55ea6fd 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -34,17 +34,17 @@ services: network_mode: host uta: container_name: uta - image: biocommons/uta:${UTA_ETL_UTA_VERSION} + image: biocommons/uta:${UTA_ETL_OLD_UTA_VERSION} environment: - POSTGRES_HOST_AUTH_METHOD=trust healthcheck: - test: psql -h localhost -U anonymous -d uta -c "select * from ${UTA_ETL_UTA_VERSION}.meta" + test: psql -h localhost -U anonymous -d uta -c "select * from ${UTA_ETL_OLD_UTA_VERSION}.meta" interval: 10s retries: 60 network_mode: host uta-load: image: uta-update - command: etc/scripts/uta-load ${UTA_ETL_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs + command: etc/scripts/uta-load ${UTA_ETL_OLD_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs depends_on: uta: condition: service_healthy diff --git a/sbin/seqrepo-download b/sbin/seqrepo-download deleted file mode 100755 index 5c581b0..0000000 --- a/sbin/seqrepo-download +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash - -# This script downloads SeqRepo into the given directory. -# Note: pulling data takes ~30 minutes and requires ~13 GB. -# Note: a container called seqrepo will be left behind. -# The name of the container can be changed by providing a third argument. - -set -e - -SEQREPO_VERSION=$1 -OUTPUT_DIR=$2 -# optional: -SEQREPO_CONTAINER_NAME=$3 - -if [ -z "$SEQREPO_VERSION" ] || [ -z "$OUTPUT_DIR" ] -then - echo 'Usage: sbin/seqrepo-download ' - exit 1 -else - echo "SeqRepo data for version $SEQREPO_VERSION will be available in $OUTPUT_DIR" -fi - -# Name of seqrepo container -if [ -z "$SEQREPO_CONTAINER_NAME" ] -then - SEQREPO_CONTAINER_NAME=seqrepo -fi - -if [[ $OUTPUT_DIR != /* ]] -then - echo 'Output directory must be an absolute path' - exit 1 -fi - -if [ ! -d "$OUTPUT_DIR" ]; then - echo "Directory $OUTPUT_DIR does not exist." - exit 1 -fi - -# Pull seqrepo image -docker pull biocommons/seqrepo:$SEQREPO_VERSION - -# Download seqrepo data using seqrepo image -if docker ps -aq -f name=$SEQREPO_CONTAINER_NAME -then - echo "Container called $SEQREPO_CONTAINER_NAME already exists. Skipping seqrepo data download." -else - docker run --name seqrepo biocommons/seqrepo:$SEQREPO_VERSION -fi - -# Copy seqrepo data into a local directory -echo "Copying seqrepo data into $OUTPUT_DIR ..." -docker run -it -v $OUTPUT_DIR:/output-dir --volumes-from $SEQREPO_CONTAINER_NAME ubuntu bash -c 'cp -R /usr/local/share/seqrepo/* /output-dir' - -# Allow seqrepo to be modified -docker run -it -v $OUTPUT_DIR:/output-dir ubuntu bash -c 'chmod -R +w /output-dir' From d63f20f0f7ac3dca56e7932f4ef6cb6de82ad350 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:19:23 -0700 Subject: [PATCH 24/36] remove current dir mount --- README.md | 3 --- docker-compose.yml | 4 ---- 2 files changed, 7 deletions(-) diff --git a/README.md b/README.md index 6ae9e75..c15aa48 100644 --- a/README.md +++ b/README.md @@ -319,9 +319,6 @@ Build the UTA image: docker build --target uta -t uta-update . ``` -Note: docker compose does not respect the container name specified in the compose file, -so you may want to specify it with `--name=` - ### 1. Download SeqRepo data ``` docker pull biocommons/seqrepo:$UTA_ETL_OLD_SEQREPO_VERSION diff --git a/docker-compose.yml b/docker-compose.yml index 55ea6fd..404fe09 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,7 +15,6 @@ services: image: uta-update command: sbin/uta-extract /ncbi-dir /uta-extract/work /uta-extract/logs volumes: - - .:/opt/repos/uta - ${UTA_ETL_NCBI_DIR}:/ncbi-dir - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo - ${UTA_ETL_WORK_DIR}:/uta-extract/work @@ -26,7 +25,6 @@ services: image: uta-update command: sbin/seqrepo-load /usr/local/share/seqrepo 2024-02-20 /seqrepo-load/work /seqrepo-load/logs volumes: - - .:/opt/repos/uta - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo - ${UTA_ETL_WORK_DIR}:/seqrepo-load/work - ${UTA_ETL_LOG_DIR}:/seqrepo-load/logs @@ -49,7 +47,6 @@ services: uta: condition: service_healthy volumes: - - .:/opt/repos/uta - ${UTA_ETL_NCBI_DIR}:/ncbi-dir - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo - ${UTA_ETL_WORK_DIR}:/uta-load/work @@ -59,7 +56,6 @@ services: image: uta-update command: sbin/ncbi_process_mito.py NC_012920.1 --output-dir /mito-extract/work | tee /mito-extract/logs/mito.log volumes: - - .:/opt/repos/uta - ${UTA_ETL_WORK_DIR}:/mito-extract/work - ${UTA_ETL_LOG_DIR}:/mito-extract/logs working_dir: /opt/repos/uta From f096f4204a329b81762189097ddbdf55854cc6bf Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:23:52 -0700 Subject: [PATCH 25/36] clean up readme --- README.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c15aa48..4721aac 100644 --- a/README.md +++ b/README.md @@ -359,11 +359,8 @@ docker compose run seqrepo-load ``` ### 4. Load data into UTA - -Bring up a UTA database and run the UTA load procedure. - -UTA is updated and the database is dumped into a pgd file. SeqRepo is updated in place. - ``` docker compose run uta-load ``` + +UTA has updated and the database has been dumped into a pgd file in `UTA_ETL_WORK_DIR`. SeqRepo has been updated in place. From 66ac44f12f13a2e796fc0a5ee8e568a4ee4b02c7 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:45:59 -0700 Subject: [PATCH 26/36] skip gene load for mito --- README.md | 14 ++++---------- docker-compose.yml | 2 +- etc/scripts/uta-load | 12 +++++++++--- sbin/uta-extract | 1 + 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 4721aac..6f8d4ac 100644 --- a/README.md +++ b/README.md @@ -338,29 +338,23 @@ Note: a container called seqrepo will be left behind. ### 2. Extract and transform data from NCBI -Download files from NCBI, and extract into intermediate files. +Download files from NCBI, extract into intermediate files, and load into UTA and SeqRepo. See 2A for nuclear transcripts and 2B for mitochondrial transcripts. #### 2A. Nuclear transcripts ``` docker compose run ncbi-download +docker compose run seqrepo-load docker compose run uta-extract +docker compose run uta-load ``` #### 2B. Mitochondrial transcripts ``` docker compose run mito-extract -``` - -### 3. Load data into SeqRepo -``` docker compose run seqrepo-load -``` - -### 4. Load data into UTA -``` -docker compose run uta-load +UTA_ETL_SKIP_GENE_LOAD=true docker compose run uta-load ``` UTA has updated and the database has been dumped into a pgd file in `UTA_ETL_WORK_DIR`. SeqRepo has been updated in place. diff --git a/docker-compose.yml b/docker-compose.yml index 404fe09..3e988fa 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -42,7 +42,7 @@ services: network_mode: host uta-load: image: uta-update - command: etc/scripts/uta-load ${UTA_ETL_OLD_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs + command: etc/scripts/uta-load ${UTA_ETL_OLD_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs ${UTA_ETL_SKIP_GENE_LOAD} depends_on: uta: condition: service_healthy diff --git a/etc/scripts/uta-load b/etc/scripts/uta-load index ebc3e36..10844f2 100755 --- a/etc/scripts/uta-load +++ b/etc/scripts/uta-load @@ -5,6 +5,7 @@ # ncbi_dir is where the script looks for NCBI data files. # working_dir stores intermediate data files and the final database dump. # log_dir stores log files. +# skip_load_genes, if truthy, will skip the gene loading step # Note that the uta loading code uses the seqrepo location defined in the conf files, under [sequences].seqrepo. @@ -14,6 +15,8 @@ source_uta_v=$1 ncbi_dir=$2 working_dir=$3 log_dir=$4 +# optionally skip loading geneinfo +skip_load_genes=$5 if [ -z "$source_uta_v" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ] then @@ -33,9 +36,12 @@ etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v" sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc-ac.gz" 2>&1 | \ tee "$log_dir/assoc-acs-merge.log" -# # Load genes into gene table. -# uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \ -# tee "$log_dir/load-geneinfo.log" +# Load genes into gene table. +if [ -z "$skip_load_genes" ] +then + uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \ + tee "$log_dir/load-geneinfo.log" +fi # Load accessions into associated_accessions table. uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$working_dir/assoc-ac.gz" 2>&1 | \ diff --git a/sbin/uta-extract b/sbin/uta-extract index 432e8a6..e708e46 100755 --- a/sbin/uta-extract +++ b/sbin/uta-extract @@ -30,6 +30,7 @@ GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_as sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$working_dir/unfiltered_exonsets.gz" 2>&1 | \ tee "$logs_dir/ncbi-parse-genomic-gff.log" +# filter transcripts sbin/filter_exonset_transcripts.py --tx-info "$working_dir/gbff.txinfo.gz" --exonsets "$working_dir/unfiltered_exonsets.gz" \ --missing-ids "$working_dir/filtered_tx_acs.txt" | gzip -c > "$working_dir/exonsets.gz" 2>&1 | \ tee "$logs_dir/filter_exonset_transcripts.log" From 49a2ce8a4ffa9b1c5dfa2094689a848024b5bece Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:51:12 -0700 Subject: [PATCH 27/36] set -e on uta-load --- sbin/uta-extract | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sbin/uta-extract b/sbin/uta-extract index e708e46..0583b9a 100755 --- a/sbin/uta-extract +++ b/sbin/uta-extract @@ -2,6 +2,8 @@ # Extract data from NCBI files into intermediate files. +set -e + ncbi_dir=$1 working_dir=$2 logs_dir=$3 From f6e9d9d38dd467cc2f8c4dccf3958ba4049aefdd Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:57:43 -0700 Subject: [PATCH 28/36] fix naming --- sbin/uta-extract | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sbin/uta-extract b/sbin/uta-extract index 0583b9a..84376c3 100755 --- a/sbin/uta-extract +++ b/sbin/uta-extract @@ -33,7 +33,7 @@ sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$working_dir/unfiltered tee "$logs_dir/ncbi-parse-genomic-gff.log" # filter transcripts -sbin/filter_exonset_transcripts.py --tx-info "$working_dir/gbff.txinfo.gz" --exonsets "$working_dir/unfiltered_exonsets.gz" \ +sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets "$working_dir/unfiltered_exonsets.gz" \ --missing-ids "$working_dir/filtered_tx_acs.txt" | gzip -c > "$working_dir/exonsets.gz" 2>&1 | \ tee "$logs_dir/filter_exonset_transcripts.log" From 3780859ad9030494a7402904b7a686184f34a971 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 13:01:01 -0700 Subject: [PATCH 29/36] move step that requires seqrepo out of extract script, so that seqrepo step can come after extract --- README.md | 2 +- etc/scripts/uta-load | 4 ++++ sbin/uta-extract | 20 ++++++++------------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 6f8d4ac..c24ffdb 100644 --- a/README.md +++ b/README.md @@ -345,8 +345,8 @@ See 2A for nuclear transcripts and 2B for mitochondrial transcripts. #### 2A. Nuclear transcripts ``` docker compose run ncbi-download -docker compose run seqrepo-load docker compose run uta-extract +docker compose run seqrepo-load docker compose run uta-load ``` diff --git a/etc/scripts/uta-load b/etc/scripts/uta-load index 10844f2..1d8084a 100755 --- a/etc/scripts/uta-load +++ b/etc/scripts/uta-load @@ -32,6 +32,10 @@ mkdir -p "$log_dir" etc/scripts/delete-schema.sh "$loading_uta_v" etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v" +# generate seqinfo files from exonsets (this step requires seqrepo) +sbin/exonset-to-seqinfo -o NCBI "$working_dir/exonsets.gz" | gzip -c > "$working_dir/seqinfo.gz" 2>&1 | \ + tee "$log_dir/exonset-to-seqinfo.log" + # Filter out columns from assocacs file. sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc-ac.gz" 2>&1 | \ tee "$log_dir/assoc-acs-merge.log" diff --git a/sbin/uta-extract b/sbin/uta-extract index 84376c3..7f2a8e9 100755 --- a/sbin/uta-extract +++ b/sbin/uta-extract @@ -6,40 +6,36 @@ set -e ncbi_dir=$1 working_dir=$2 -logs_dir=$3 +log_dir=$3 -if [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$logs_dir" ] +if [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ] then - echo 'Usage: sbin/uta-extract ' + echo 'Usage: sbin/uta-extract ' exit 1 fi # genes sbin/ncbi-parse-geneinfo $ncbi_dir/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz | \ - gzip -c > "$working_dir/geneinfo.gz" 2>&1 | tee "$logs_dir/ncbi-parse-geneinfo.log" + gzip -c > "$working_dir/geneinfo.gz" 2>&1 | tee "$log_dir/ncbi-parse-geneinfo.log" # transcript protein associations sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2accession.gz | gzip -c > "$working_dir/assocacs.gz" 2>&1 | \ - tee "$logs_dir/ncbi-fetch-assoc-acs.log" + tee "$log_dir/ncbi-fetch-assoc-acs.log" # parse transcript info from GBFF input files GBFF_files=$(ls $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz) sbin/ncbi-parse-gbff "$GBFF_files" | gzip -c > "$working_dir/txinfo.gz" 2>&1 | \ - tee "$logs_dir/ncbi-parse-gbff.log" + tee "$log_dir/ncbi-parse-gbff.log" # parse alignments from GFF input files GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.gff.gz) sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$working_dir/unfiltered_exonsets.gz" 2>&1 | \ - tee "$logs_dir/ncbi-parse-genomic-gff.log" + tee "$log_dir/ncbi-parse-genomic-gff.log" # filter transcripts sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets "$working_dir/unfiltered_exonsets.gz" \ --missing-ids "$working_dir/filtered_tx_acs.txt" | gzip -c > "$working_dir/exonsets.gz" 2>&1 | \ - tee "$logs_dir/filter_exonset_transcripts.log" - -# generate seqinfo files from exonsets (this step requires seqrepo) -sbin/exonset-to-seqinfo -o NCBI "$working_dir/exonsets.gz" | gzip -c > "$working_dir/seqinfo.gz" 2>&1 | \ - tee "$logs_dir/exonset-to-seqinfo.log" + tee "$log_dir/filter_exonset_transcripts.log" # move fasta files into same dir cp $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz $working_dir/ From e4bebf5c28a0a162c4b1a004647395c5c23bb2a3 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 13:01:56 -0700 Subject: [PATCH 30/36] move uta-load script into sbin --- docker-compose.yml | 2 +- {etc/scripts => sbin}/uta-load | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename {etc/scripts => sbin}/uta-load (100%) diff --git a/docker-compose.yml b/docker-compose.yml index 3e988fa..1247c09 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -42,7 +42,7 @@ services: network_mode: host uta-load: image: uta-update - command: etc/scripts/uta-load ${UTA_ETL_OLD_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs ${UTA_ETL_SKIP_GENE_LOAD} + command: sbin/uta-load ${UTA_ETL_OLD_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs ${UTA_ETL_SKIP_GENE_LOAD} depends_on: uta: condition: service_healthy diff --git a/etc/scripts/uta-load b/sbin/uta-load similarity index 100% rename from etc/scripts/uta-load rename to sbin/uta-load From 50dbeb806993a1d3246f78d8144eda462bc0b609 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 13:42:11 -0700 Subject: [PATCH 31/36] always set skip_load_genes --- README.md | 2 +- sbin/uta-load | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c24ffdb..11517d0 100644 --- a/README.md +++ b/README.md @@ -347,7 +347,7 @@ See 2A for nuclear transcripts and 2B for mitochondrial transcripts. docker compose run ncbi-download docker compose run uta-extract docker compose run seqrepo-load -docker compose run uta-load +UTA_ETL_SKIP_GENE_LOAD=false docker compose run uta-load ``` #### 2B. Mitochondrial transcripts diff --git a/sbin/uta-load b/sbin/uta-load index 1d8084a..48d7219 100755 --- a/sbin/uta-load +++ b/sbin/uta-load @@ -15,7 +15,6 @@ source_uta_v=$1 ncbi_dir=$2 working_dir=$3 log_dir=$4 -# optionally skip loading geneinfo skip_load_genes=$5 if [ -z "$source_uta_v" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ] @@ -41,7 +40,7 @@ sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc- tee "$log_dir/assoc-acs-merge.log" # Load genes into gene table. -if [ -z "$skip_load_genes" ] +if [ "$skip_load_genes" = "true" ] then uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \ tee "$log_dir/load-geneinfo.log" From b7e0f4279ca9d7bf85435b476e28c5447a3f73d2 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 13:54:16 -0700 Subject: [PATCH 32/36] restore missed changes from alembic pr --- sbin/uta-load | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sbin/uta-load b/sbin/uta-load index 48d7219..62ea293 100755 --- a/sbin/uta-load +++ b/sbin/uta-load @@ -24,13 +24,17 @@ then fi # set local variables and create working directories -loading_uta_v="uta_1_1" +loading_uta_v="uta" mkdir -p "$log_dir" ## Drop loading schema, and recreate etc/scripts/delete-schema.sh "$loading_uta_v" etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v" +## for now set up Alembic for schema migrations +alembic -c etc/alembic.ini stamp edadb97f6502 +alembic -c etc/alembic.ini upgrade head + # generate seqinfo files from exonsets (this step requires seqrepo) sbin/exonset-to-seqinfo -o NCBI "$working_dir/exonsets.gz" | gzip -c > "$working_dir/seqinfo.gz" 2>&1 | \ tee "$log_dir/exonset-to-seqinfo.log" From 608deb97b2decc91323e6ea7914840522e219f9a Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 14:18:50 -0700 Subject: [PATCH 33/36] consistent naming of log_dir var --- sbin/seqrepo-load | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sbin/seqrepo-load b/sbin/seqrepo-load index 3f05842..93942cf 100755 --- a/sbin/seqrepo-load +++ b/sbin/seqrepo-load @@ -5,11 +5,11 @@ set -e seqrepo_root=$1 seqrepo_version=$2 sequence_dir=$3 -logs_dir=$4 +log_dir=$4 -if [ -z "$seqrepo_root" ] || [ -z "$seqrepo_version" ] || [ -z "$sequence_dir" ] || [ -z "$logs_dir" ] +if [ -z "$seqrepo_root" ] || [ -z "$seqrepo_version" ] || [ -z "$sequence_dir" ] || [ -z "$log_dir" ] then - echo 'Usage: sbin/seqrepo-load ' + echo 'Usage: sbin/seqrepo-load ' exit 1 fi @@ -18,4 +18,4 @@ seqrepo --root-directory "$seqrepo_root" \ load -n NCBI --instance-name "$seqrepo_version" \ $sequence_dir/*.fna.gz \ $sequence_dir/*.faa.gz 2>& 1 | \ - tee "$logs_dir/seqrepo-load.log" + tee "$log_dir/seqrepo-load.log" From e32add99f4e7e9c5dc1ed79a70cd231cd73661fb Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 14:22:40 -0700 Subject: [PATCH 34/36] invert condition --- sbin/uta-load | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sbin/uta-load b/sbin/uta-load index 62ea293..c3da837 100755 --- a/sbin/uta-load +++ b/sbin/uta-load @@ -46,6 +46,8 @@ sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc- # Load genes into gene table. if [ "$skip_load_genes" = "true" ] then + echo "Skipping load-geneinfo" +else uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \ tee "$log_dir/load-geneinfo.log" fi From 10f4e0efd4e03912acf176749942cb58a6d52757 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Fri, 5 Apr 2024 14:34:24 -0700 Subject: [PATCH 35/36] fix tests for mito strand change --- tests/test_ncbi_process_mito.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_ncbi_process_mito.py b/tests/test_ncbi_process_mito.py index d0a1982..8beb535 100644 --- a/tests/test_ncbi_process_mito.py +++ b/tests/test_ncbi_process_mito.py @@ -193,7 +193,7 @@ def test_get_mito_genes(self): "alt_ac": "NC_012920.1", "alt_start": 1601, "alt_end": 1670, - "strand": "+", + "strand": 1, "transl_table": None, "transl_except": None, "pro_ac": None, @@ -212,7 +212,7 @@ def test_get_mito_genes(self): "alt_ac": "NC_012920.1", "alt_start": 4328, "alt_end": 4400, - "strand": "-", + "strand": -1, "transl_table": None, "transl_except": None, "pro_ac": None, @@ -237,7 +237,7 @@ def test_get_mito_genes(self): "alt_ac": "NC_012920.1", "alt_start": 7585, "alt_end": 8269, - "strand": "+", + "strand": 1, "transl_table": "2", "transl_except": None, "pro_ac": "YP_003024029.1", @@ -267,7 +267,7 @@ def test_get_mito_genes(self): "alt_ac": "NC_012920.1", "alt_start": 3306, "alt_end": 4262, - "strand": "+", + "strand": 1, "transl_table": "2", "transl_except": "(pos:4261..4262,aa:TERM)", "pro_ac": "YP_003024026.1", From 858b143e0f3586002b3dbb6dc86b93e9ad7c77a8 Mon Sep 17 00:00:00 2001 From: NVTA <162694616+nvta1209@users.noreply.github.com> Date: Mon, 8 Apr 2024 23:33:20 -0700 Subject: [PATCH 36/36] hard links --- sbin/uta-extract | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sbin/uta-extract b/sbin/uta-extract index 7f2a8e9..2a60f3f 100755 --- a/sbin/uta-extract +++ b/sbin/uta-extract @@ -38,6 +38,6 @@ sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets tee "$log_dir/filter_exonset_transcripts.log" # move fasta files into same dir -cp $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz $working_dir/ -cp $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz $working_dir/ -cp $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz $working_dir/ +ln $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz $working_dir/ +ln $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz $working_dir/ +ln $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz $working_dir/