From d65d764c3e54f974511f95b9dfe55ecfae7c3bb4 Mon Sep 17 00:00:00 2001 From: nvta1209 <162694616+nvta1209@users.noreply.github.com> Date: Tue, 9 Apr 2024 08:07:55 -0700 Subject: [PATCH] feat(IPVC-2283): mitochondrial transcript workflow (#20) --- README.md | 95 +++++++++++++++++++------------- docker-compose.yml | 50 ++++++++++++++--- etc/scripts/run-uta-build.sh | 98 --------------------------------- sbin/ncbi-download | 24 ++++++++ sbin/ncbi-download-docker | 24 -------- sbin/ncbi_process_mito.py | 23 ++++---- sbin/seqrepo-download | 53 ------------------ sbin/seqrepo-load | 21 +++++++ sbin/uta-extract | 43 +++++++++++++++ sbin/uta-load | 77 ++++++++++++++++++++++++++ sbin/uta-update | 67 ---------------------- tests/test_ncbi_process_mito.py | 8 +-- 12 files changed, 280 insertions(+), 303 deletions(-) delete mode 100755 etc/scripts/run-uta-build.sh delete mode 100755 sbin/ncbi-download-docker mode change 100644 => 100755 sbin/ncbi_process_mito.py delete mode 100755 sbin/seqrepo-download create mode 100755 sbin/seqrepo-load create mode 100755 sbin/uta-extract create mode 100755 sbin/uta-load delete mode 100755 sbin/uta-update diff --git a/README.md b/README.md index 84e606c..2ade8c5 100644 --- a/README.md +++ b/README.md @@ -289,60 +289,77 @@ To develop UTA, follow these steps. 4. Testing $ docker build --target uta-test -t uta-test . - $ docker run -it --rm uta-test python -m unittest + $ docker run --rm uta-test python -m unittest ## UTA update procedure -### 1. Download files from NCBI +Requires docker. -Run `sbin/ncbi-download-docker`. Requires bash and docker. +### 0. Setup -Example: +Make directories: ``` -sbin/ncbi-download-docker $(pwd)/ncbi-data +mkdir -p $(pwd)/ncbi-data +mkdir -p $(pwd)/output/artifacts +mkdir -p $(pwd)/output/logs ``` -The specified directory will have the following structure: - - ├── gene - │ └── DATA - │ ├── GENE_INFO - │ │ └── Mammalia - │ │ └── Homo_sapiens.gene_info.gz - │ └── gene2accession.gz - ├── genomes - │ └── refseq - │ └── vertebrate_mammalian - │ └── Homo_sapiens - │ └── all_assembly_versions - │ └── GCF_000001405.25_GRCh37.p13 - │ ├── GCF_000001405.25_GRCh37.p13_genomic.fna.gz - │ └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz - └── refseq - └── H_sapiens - └── mRNA_Prot - ├── human.1.protein.faa.gz - ├── human.1.rna.fna.gz - └── human.1.rna.gbff.gz - -### 2. Download SeqRepo data - -Run `sbin/seqrepo-download`. Requires bash and docker. - -Example: +Set variables: ``` -sbin/seqrepo-download 2024-02-20 $(pwd)/seqrepo-data +export UTA_ETL_OLD_SEQREPO_VERSION=2024-02-20 +export UTA_ETL_OLD_UTA_VERSION=uta_20210129b +export UTA_ETL_NCBI_DIR=./ncbi-data +export UTA_ETL_SEQREPO_DIR=./seqrepo-data +export UTA_ETL_WORK_DIR=./output/artifacts +export UTA_ETL_LOG_DIR=./output/logs ``` -### 3. Update UTA and SeqRepo +Build the UTA image: +``` +docker build --target uta -t uta-update . +``` + +### 1. Download SeqRepo data +``` +docker pull biocommons/seqrepo:$UTA_ETL_OLD_SEQREPO_VERSION + +# download seqrepo. can skip if container already exists. +docker run --name seqrepo biocommons/seqrepo:$UTA_ETL_OLD_SEQREPO_VERSION + +# copy seqrepo data into a local directory +docker run -v $UTA_ETL_SEQREPO_DIR:/output-dir --volumes-from seqrepo ubuntu bash -c 'cp -R /usr/local/share/seqrepo/* /output-dir' + +# allow seqrepo to be modified +docker run -it -v $UTA_ETL_SEQREPO_DIR:/output-dir ubuntu bash -c 'chmod -R +w /output-dir' +``` -Run `sbin/uta-update`. Requires bash and docker. +Note: pulling data takes ~30 minutes and requires ~13 GB. 
+Note: a container called seqrepo will be left behind. -Example: +### 2. Extract and transform data from NCBI + +Download files from NCBI, extract into intermediate files, and load into UTA and SeqRepo. + +See 2A for nuclear transcripts and 2B for mitochondrial transcripts. + +#### 2A. Nuclear transcripts +``` +docker compose run ncbi-download +docker compose run uta-extract +docker compose run seqrepo-load +UTA_ETL_SKIP_GENE_LOAD=false docker compose run uta-load ``` -sbin/uta-update $(pwd)/ncbi-data $(pwd)/seqrepo-data $(pwd)/uta-build uta_20210129b 2024-02-20 + +#### 2B. Mitochondrial transcripts +``` +docker compose run mito-extract +docker compose run seqrepo-load +UTA_ETL_SKIP_GENE_LOAD=true docker compose run uta-load ``` +UTA has updated and the database has been dumped into a pgd file in `UTA_ETL_WORK_DIR`. SeqRepo has been updated in place. + + ## Migrations UTA uses alembic to manage database migrations. To auto-generate a migration: ``` @@ -353,7 +370,7 @@ Adjust the upgrade and downgrade function definitions. To apply the migration: ``` alembic -c etc/alembic.ini upgrade head ``` -To reverse a migration, use `downgrade` with the number of steps to reverse. For example, to reverse the last: +To reverse a migration, use `downgrade` with the number of steps to reverse. For example, to reverse the last: ``` alembic -c etc/alembic.ini downgrade -1 ``` diff --git a/docker-compose.yml b/docker-compose.yml index 856a5ae..1247c09 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,24 +3,60 @@ version: '3' services: + ncbi-download: + image: uta-update + command: sbin/ncbi-download /ncbi-dir + volumes: + - .:/opt/repos/uta + - ${UTA_ETL_NCBI_DIR}:/ncbi-dir + working_dir: /opt/repos/uta + network_mode: host + uta-extract: + image: uta-update + command: sbin/uta-extract /ncbi-dir /uta-extract/work /uta-extract/logs + volumes: + - ${UTA_ETL_NCBI_DIR}:/ncbi-dir + - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo + - ${UTA_ETL_WORK_DIR}:/uta-extract/work + - ${UTA_ETL_LOG_DIR}:/uta-extract/logs + working_dir: /opt/repos/uta + network_mode: host + seqrepo-load: + image: uta-update + command: sbin/seqrepo-load /usr/local/share/seqrepo 2024-02-20 /seqrepo-load/work /seqrepo-load/logs + volumes: + - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo + - ${UTA_ETL_WORK_DIR}:/seqrepo-load/work + - ${UTA_ETL_LOG_DIR}:/seqrepo-load/logs + working_dir: /opt/repos/uta + network_mode: host uta: container_name: uta - image: biocommons/uta:${UTA_VERSION} + image: biocommons/uta:${UTA_ETL_OLD_UTA_VERSION} environment: - POSTGRES_HOST_AUTH_METHOD=trust healthcheck: - test: psql -h localhost -U anonymous -d uta -c "select * from ${UTA_VERSION}.meta" + test: psql -h localhost -U anonymous -d uta -c "select * from ${UTA_ETL_OLD_UTA_VERSION}.meta" interval: 10s retries: 60 network_mode: host - uta-update: + uta-load: image: uta-update - command: etc/scripts/run-uta-build.sh ${UTA_VERSION} ${SEQREPO_VERSION} /ncbi-dir /workdir + command: sbin/uta-load ${UTA_ETL_OLD_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs ${UTA_ETL_SKIP_GENE_LOAD} depends_on: uta: condition: service_healthy volumes: - - ${NCBI_DIR}:/ncbi-dir - - ${SEQREPO_DIR}:/usr/local/share/seqrepo - - ${WORKING_DIR}:/workdir + - ${UTA_ETL_NCBI_DIR}:/ncbi-dir + - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo + - ${UTA_ETL_WORK_DIR}:/uta-load/work + - ${UTA_ETL_LOG_DIR}:/uta-load/logs + network_mode: host + mito-extract: + image: uta-update + command: sbin/ncbi_process_mito.py NC_012920.1 --output-dir /mito-extract/work | tee 
/mito-extract/logs/mito.log + volumes: + - ${UTA_ETL_WORK_DIR}:/mito-extract/work + - ${UTA_ETL_LOG_DIR}:/mito-extract/logs + working_dir: /opt/repos/uta network_mode: host diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh deleted file mode 100755 index 613d6ce..0000000 --- a/etc/scripts/run-uta-build.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/bin/bash - -# source_uta_v is the UTA version before the update. -# seqrepo_data_release is the SeqRepo version before the update. -# ncbi_dir is where the script looks for NCBI data files. -# working_dir stores log files, intermediate data files, and the final database dump. - -set -euxo pipefail - -source_uta_v=$1 -seqrepo_data_release=$2 -ncbi_dir=$3 -working_dir=$4 - -if [ -z "$source_uta_v" ] || [ -z "$seqrepo_data_release" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] -then - echo 'Usage: run-uta-build.sh ' - exit 1 -fi - -# set local variables and create working directories -loading_uta_v="uta" -loading_dir="$working_dir/loading" -dumps_dir="$working_dir/dumps" -logs_dir="$working_dir/logs" -for d in "$loading_dir" "$dumps_dir" "$logs_dir"; - do mkdir -p "$d" -done - -## Drop loading schema, and recreate -etc/scripts/delete-schema.sh "$loading_uta_v" -etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v" - -## for now set up Alembic for schema migrations -alembic -c etc/alembic.ini stamp edadb97f6502 -alembic -c etc/alembic.ini upgrade head - -## Load SeqRepo with new sequences -seqrepo load -n NCBI -i "$seqrepo_data_release" \ - $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz \ - $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz \ - $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz 2>& 1 | \ - tee "$logs_dir/seqrepo-load.log" - -### extract meta data -# genes -sbin/ncbi-parse-geneinfo $ncbi_dir/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz | \ - gzip -c > "$loading_dir/genes.geneinfo.gz" 2>&1 | tee "$logs_dir/ncbi-parse-geneinfo.log" - -# transcript protein associations -sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2accession.gz | gzip -c > "$loading_dir/assocacs.gz" 2>&1 | \ - tee "$logs_dir/ncbi-fetch-assoc-acs" - -sbin/assoc-acs-merge "$loading_dir/assocacs.gz" | gzip -c > "$loading_dir/assocacs.cleaned.gz" 2>&1 | \ - tee "$logs_dir/assoc-acs-merge" - -# parse transcript info from GBFF input files -GBFF_files=$(ls $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz) -sbin/ncbi-parse-gbff "$GBFF_files" | gzip -c > "$loading_dir/gbff.txinfo.gz" 2>&1 | \ - tee "$logs_dir/ncbi-parse-gbff.log" - -# parse alignments from GFF input files -GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.gff.gz) -sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$loading_dir/gff.exonsets.gz" 2>&1 | \ - tee "$logs_dir/ncbi-parse-genomic-gff.log" - -sbin/filter_exonset_transcripts.py --tx-info "$loading_dir/gbff.txinfo.gz" --exonsets "$loading_dir/gff.exonsets.gz" \ - --missing-ids "$loading_dir/filtered_tx_acs.txt" | gzip -c > "$loading_dir/gff.filtered_exonsets.gz" 2>&1 | \ - tee "$logs_dir/filter_exonset_transcripts.log" - -# generate seqinfo files from exonsets -sbin/exonset-to-seqinfo -o NCBI "$loading_dir/gff.filtered_exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \ - tee "$logs_dir/exonset-to-seqinfo.log" - -### update the uta database -# genes -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo 
"$loading_dir/genes.geneinfo.gz" 2>&1 | \ - tee "$logs_dir/load-geneinfo.log" - -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$loading_dir/assocacs.cleaned.gz" 2>&1 | \ - tee "$logs_dir/load-assoc-ac.log" - -# transcript info -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$loading_dir/gbff.txinfo.gz" 2>&1 | \ - tee "$logs_dir/load-txinfo.log" - -# gff exon sets -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$loading_dir/gff.filtered_exonsets.gz" 2>&1 | \ - tee "$logs_dir/load-exonsets.log" - -# align exons -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 | tee "$logs_dir/align-exons.log" - -### run diff -sbin/uta-diff "$source_uta_v" "$loading_uta_v" - -### psql_dump -pg_dump -U uta_admin -h localhost -d uta -t "$loading_uta_v.gene" | gzip -c > "$dumps_dir/uta.pgd.gz" diff --git a/sbin/ncbi-download b/sbin/ncbi-download index 61a6874..dc445cd 100755 --- a/sbin/ncbi-download +++ b/sbin/ncbi-download @@ -1,6 +1,29 @@ #!/usr/bin/env bash # This script downloads the files needed for a UTA+SeqRepo update into to the given directory. +# +# DONWLOAD_DIR will have the following structure: +# +# ├── gene +# │ └── DATA +# │ ├── GENE_INFO +# │ │ └── Mammalia +# │ │ └── Homo_sapiens.gene_info.gz +# │ └── gene2accession.gz +# ├── genomes +# │ └── refseq +# │ └── vertebrate_mammalian +# │ └── Homo_sapiens +# │ └── all_assembly_versions +# │ └── GCF_000001405.25_GRCh37.p13 +# │ ├── GCF_000001405.25_GRCh37.p13_genomic.fna.gz +# │ └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz +# └── refseq +# └── H_sapiens +# └── mRNA_Prot +# ├── human.1.protein.faa.gz +# ├── human.1.rna.fna.gz +# └── human.1.rna.gbff.gz set -e @@ -26,6 +49,7 @@ do DOWNLOAD_MODULE="${DOWNLOAD_PATH%%/*}" DOWNLOAD_SRC="ftp.ncbi.nlm.nih.gov::$DOWNLOAD_PATH" DOWNLOAD_DST="$DOWNLOAD_DIR/$DOWNLOAD_MODULE" + mkdir -p $DOWNLOAD_DST echo "Downloading $DOWNLOAD_SRC to $DOWNLOAD_DST" rsync --no-motd -DHPRprtv "$DOWNLOAD_SRC" "$DOWNLOAD_DST" done diff --git a/sbin/ncbi-download-docker b/sbin/ncbi-download-docker deleted file mode 100755 index 30d4f63..0000000 --- a/sbin/ncbi-download-docker +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env bash - -# This script runs ncbi-download in a docker container. - -set -e - -DOWNLOAD_DIR=$1 - -if [ -z "$DOWNLOAD_DIR" ] -then - echo 'Usage: sbin/ncbi-download-docker ' - exit 1 -else - echo "Downloading files to $DOWNLOAD_DIR" -fi - -if [[ $DOWNLOAD_DIR != /* ]] -then - echo 'Download directory must be an absolute path' - exit 1 -fi - -docker build --target uta -t ncbi-download --progress plain . -docker run -it -v $(pwd)/sbin/ncbi-download:/dl-script -v $DOWNLOAD_DIR:/output-dir ncbi-download /dl-script /output-dir diff --git a/sbin/ncbi_process_mito.py b/sbin/ncbi_process_mito.py old mode 100644 new mode 100755 index 2afe071..c65984b --- a/sbin/ncbi_process_mito.py +++ b/sbin/ncbi_process_mito.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + """ Download mito fasta and gbff file. Use BioPython to parse the features in the Mitochondrial genbank file to get the attributes of a region of the genome that correspond to genes along with their attributes. 
Output gene/tx/alignment @@ -52,6 +54,7 @@ """ import argparse import dataclasses +import gzip import importlib_resources import logging import logging.config @@ -83,7 +86,7 @@ class MitoGeneData: alt_ac: str alt_start: int alt_end: int - strand: str + strand: int origin: str = "NCBI" alignment_method: str = "splign" transl_table: Optional[str] = None @@ -166,6 +169,7 @@ def parse_nomenclature_value(gb_feature: SeqFeature) -> Dict[str, str]: def get_mito_genes(gbff_filepath: str): logger.info(f"processing NCBI GBFF file from {gbff_filepath}") with open(gbff_filepath) as fh: + # Bio.SeqIO.parse(fh, "gb") returns an empty iterator for .fna files and does not fail for record in Bio.SeqIO.parse(fh, "gb"): for feature in record.features: xrefs = parse_db_xrefs(feature) @@ -200,9 +204,7 @@ def get_mito_genes(gbff_filepath: str): # retrieve sequence, and reverse compliment if on reverse strand ac = f"{record.id}_{feature.location.start:05}_{feature.location.end:05}" feature_seq = record.seq[feature_start:feature_end] - strand = "+" if feature.location.strand == -1: - strand = "-" feature_seq = feature_seq.reverse_complement() if feature.type == "CDS": @@ -225,7 +227,7 @@ def get_mito_genes(gbff_filepath: str): alt_ac=record.id, alt_start=feature_start, alt_end=feature_end, - strand=strand, + strand=feature.location.strand, transl_table=transl_table, transl_except=transl_except, pro_ac=pro_ac, @@ -242,7 +244,7 @@ def main(ncbi_accession: str, output_dir: str): logger.info(f"found {len(mito_genes)} genes from parsing {input_files['gbff']}") # write gene accessions - with open(f"{output_dir}/{ncbi_accession}.assocacs", "w") as o_file: + with gzip.open(f"{output_dir}/assocacs.gz", "wt") as o_file: gaw = GeneAccessionsWriter(o_file) for mg in mito_genes: if mg.pro_ac is not None: @@ -253,7 +255,7 @@ def main(ncbi_accession: str, output_dir: str): ) # write sequence information - with open(f"{output_dir}/{ncbi_accession}.seqinfo", "w") as o_file: + with gzip.open(f"{output_dir}/seqinfo.gz", "wt") as o_file: siw = SeqInfoWriter(o_file) for mg in mito_genes: siw.write( @@ -279,7 +281,7 @@ def main(ncbi_accession: str, output_dir: str): ) # write out transcript sequence fasta files. - with open(f"{output_dir}/{ncbi_accession}.rna.fna", "w") as o_file: + with gzip.open(f"{output_dir}/{ncbi_accession}.rna.fna.gz", "wt") as o_file: for mg in mito_genes: record = SeqRecord( Seq(mg.tx_seq), @@ -289,7 +291,7 @@ def main(ncbi_accession: str, output_dir: str): o_file.write(record.format("fasta")) # write out protein sequence fasta files. 
- with open(f"{output_dir}/{ncbi_accession}.protein.faa", "w") as o_file: + with gzip.open(f"{output_dir}/{ncbi_accession}.protein.faa.gz", "wt") as o_file: for mg in mito_genes: if mg.pro_ac is not None: record = SeqRecord( @@ -300,7 +302,7 @@ def main(ncbi_accession: str, output_dir: str): o_file.write(record.format("fasta")) # write transcript information - with open(f"{output_dir}/{ncbi_accession}.txinfo", "w") as o_file: + with gzip.open(f"{output_dir}/txinfo.gz", "wt") as o_file: tiw = TxInfoWriter(o_file) for mg in mito_genes: tiw.write( @@ -315,7 +317,7 @@ def main(ncbi_accession: str, output_dir: str): ) # write exonset - with open(f"{output_dir}/{ncbi_accession}.exonset", "w") as o_file: + with gzip.open(f"{output_dir}/exonsets.gz", "wt") as o_file: esw = ExonSetWriter(o_file) for mg in mito_genes: esw.write( @@ -331,5 +333,4 @@ def main(ncbi_accession: str, output_dir: str): if __name__ == "__main__": args = parse_args() - main(args.accession, args.output_dir) diff --git a/sbin/seqrepo-download b/sbin/seqrepo-download deleted file mode 100755 index 0773030..0000000 --- a/sbin/seqrepo-download +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env bash - -# This script downloads SeqRepo into the given directory. -# Note: pulling data takes ~30 minutes and requires ~13 GB. -# Note: a container called seqrepo will be left behind. -# The name of the container can be changed by providing a third argument. - -set -e - -SEQREPO_VERSION=$1 -OUTPUT_DIR=$2 -# optional: -SEQREPO_CONTAINER_NAME=$3 - -if [ -z "$SEQREPO_VERSION" ] || [ -z "$OUTPUT_DIR" ] -then - echo 'Usage: sbin/seqrepo-download ' - exit 1 -else - echo "SeqRepo data for version $SEQREPO_VERSION will be available in $OUTPUT_DIR" -fi - -# Name of seqrepo container -if [ -z "$SEQREPO_CONTAINER_NAME" ] -then - SEQREPO_CONTAINER_NAME=seqrepo -fi - -if [[ $OUTPUT_DIR != /* ]] -then - echo 'Output directory must be an absolute path' - exit 1 -fi - -if [ ! -d "$OUTPUT_DIR" ]; then - echo "Directory $OUTPUT_DIR does not exist." - exit 1 -fi - -# Pull seqrepo image -docker pull biocommons/seqrepo:$SEQREPO_VERSION - -# Download seqrepo data using seqrepo image -if docker ps -aq -f name=$SEQREPO_CONTAINER_NAME -then - echo "Container called $SEQREPO_CONTAINER_NAME already exists. Skipping seqrepo data download." -else - docker run --name seqrepo biocommons/seqrepo:$SEQREPO_VERSION -fi - -# Copy seqrepo data into a local directory -echo "Copying seqrepo data into $OUTPUT_DIR ..." -docker run -it -v $OUTPUT_DIR:/output-dir --volumes-from $SEQREPO_CONTAINER_NAME:ro ubuntu bash -c 'cp -R /usr/local/share/seqrepo/* /output-dir' diff --git a/sbin/seqrepo-load b/sbin/seqrepo-load new file mode 100755 index 0000000..93942cf --- /dev/null +++ b/sbin/seqrepo-load @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +set -e + +seqrepo_root=$1 +seqrepo_version=$2 +sequence_dir=$3 +log_dir=$4 + +if [ -z "$seqrepo_root" ] || [ -z "$seqrepo_version" ] || [ -z "$sequence_dir" ] || [ -z "$log_dir" ] +then + echo 'Usage: sbin/seqrepo-load ' + exit 1 +fi + +## Load SeqRepo with new sequences +seqrepo --root-directory "$seqrepo_root" \ + load -n NCBI --instance-name "$seqrepo_version" \ + $sequence_dir/*.fna.gz \ + $sequence_dir/*.faa.gz 2>& 1 | \ + tee "$log_dir/seqrepo-load.log" diff --git a/sbin/uta-extract b/sbin/uta-extract new file mode 100755 index 0000000..2a60f3f --- /dev/null +++ b/sbin/uta-extract @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +# Extract data from NCBI files into intermediate files. 
+ +set -e + +ncbi_dir=$1 +working_dir=$2 +log_dir=$3 + +if [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ] +then + echo 'Usage: sbin/uta-extract ' + exit 1 +fi + +# genes +sbin/ncbi-parse-geneinfo $ncbi_dir/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz | \ + gzip -c > "$working_dir/geneinfo.gz" 2>&1 | tee "$log_dir/ncbi-parse-geneinfo.log" + +# transcript protein associations +sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2accession.gz | gzip -c > "$working_dir/assocacs.gz" 2>&1 | \ + tee "$log_dir/ncbi-fetch-assoc-acs.log" + +# parse transcript info from GBFF input files +GBFF_files=$(ls $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz) +sbin/ncbi-parse-gbff "$GBFF_files" | gzip -c > "$working_dir/txinfo.gz" 2>&1 | \ + tee "$log_dir/ncbi-parse-gbff.log" + +# parse alignments from GFF input files +GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.gff.gz) +sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$working_dir/unfiltered_exonsets.gz" 2>&1 | \ + tee "$log_dir/ncbi-parse-genomic-gff.log" + +# filter transcripts +sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets "$working_dir/unfiltered_exonsets.gz" \ + --missing-ids "$working_dir/filtered_tx_acs.txt" | gzip -c > "$working_dir/exonsets.gz" 2>&1 | \ + tee "$log_dir/filter_exonset_transcripts.log" + +# move fasta files into same dir +ln $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz $working_dir/ +ln $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz $working_dir/ +ln $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405*/GCF_*_genomic.fna.gz $working_dir/ diff --git a/sbin/uta-load b/sbin/uta-load new file mode 100755 index 0000000..c3da837 --- /dev/null +++ b/sbin/uta-load @@ -0,0 +1,77 @@ +#!/usr/bin/env bash + +# This script updates UTA and SeqRepo using NCBI files. +# source_uta_v is the UTA version before the update. +# ncbi_dir is where the script looks for NCBI data files. +# working_dir stores intermediate data files and the final database dump. +# log_dir stores log files. +# skip_load_genes, if truthy, will skip the gene loading step + +# Note that the uta loading code uses the seqrepo location defined in the conf files, under [sequences].seqrepo. + +set -euxo pipefail + +source_uta_v=$1 +ncbi_dir=$2 +working_dir=$3 +log_dir=$4 +skip_load_genes=$5 + +if [ -z "$source_uta_v" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ] +then + echo 'Usage: uta-load ' + exit 1 +fi + +# set local variables and create working directories +loading_uta_v="uta" +mkdir -p "$log_dir" + +## Drop loading schema, and recreate +etc/scripts/delete-schema.sh "$loading_uta_v" +etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v" + +## for now set up Alembic for schema migrations +alembic -c etc/alembic.ini stamp edadb97f6502 +alembic -c etc/alembic.ini upgrade head + +# generate seqinfo files from exonsets (this step requires seqrepo) +sbin/exonset-to-seqinfo -o NCBI "$working_dir/exonsets.gz" | gzip -c > "$working_dir/seqinfo.gz" 2>&1 | \ + tee "$log_dir/exonset-to-seqinfo.log" + +# Filter out columns from assocacs file. +sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc-ac.gz" 2>&1 | \ + tee "$log_dir/assoc-acs-merge.log" + +# Load genes into gene table. 
+if [ "$skip_load_genes" = "true" ] +then + echo "Skipping load-geneinfo" +else + uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \ + tee "$log_dir/load-geneinfo.log" +fi + +# Load accessions into associated_accessions table. +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$working_dir/assoc-ac.gz" 2>&1 | \ + tee "$log_dir/load-assoc-ac.log" + +# Load transcript info into transcript and exon_set tables. +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$working_dir/txinfo.gz" 2>&1 | \ + tee "$log_dir/load-txinfo.log" + +# Load exon sets into into exon_set and exon tables. +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$working_dir/exonsets.gz" 2>&1 | \ + tee "$log_dir/load-exonsets.log" + +# Create cigar strings for all rows in tx_alt_exon_pairs_v view and update exon_aln table. +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 | \ + tee "$log_dir/align-exons.log" + +# Load seqinfo? + +### run diff +sbin/uta-diff "$source_uta_v" "$loading_uta_v" + +### psql_dump +pg_dump -U uta_admin -h localhost -d uta -t "$loading_uta_v.gene" | gzip -c > "$working_dir/uta.pgd.gz" diff --git a/sbin/uta-update b/sbin/uta-update deleted file mode 100755 index 23d93af..0000000 --- a/sbin/uta-update +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env bash - -# This script runs the UTA update procedure. -# It updates the specified UTA and SeqRepo using the given NCBI files. -# It produces a postgres dump of the updated UTA database and an updated SeqRepo (updated in place). -# It expects to be run from the root of the uta repository. - -set -e - -# export environment variables for docker compose file -export NCBI_DIR=$1 -export SEQREPO_DIR=$2 -export WORKING_DIR=$3 -export UTA_VERSION=$4 -export SEQREPO_VERSION=$5 - -if [ -z "$NCBI_DIR" ] || [ -z "$SEQREPO_DIR" ] || [ -z "$WORKING_DIR" ] || [ -z "$UTA_VERSION" ] || [ -z "$SEQREPO_VERSION" ] -then - echo 'Usage: sbin/uta-update ' - exit 1 -else - echo "Updating UTA and SeqRepo using files in $NCBI_DIR and SeqRepo data in $SEQREPO_DIR" - echo "Starting from UTA version $UTA_VERSION and SeqRepo version $SEQREPO_VERSION" - echo "Logs and intermediate files will be available in $WORKING_DIR" -fi - -# Ensure directories are compatible with docker volume usage -if [[ $NCBI_DIR != /* ]] && [[ $NCBI_DIR != .* ]] -then - echo 'NCBI file directory must start with / or .' - exit 1 -fi - -if [[ $SEQREPO_DIR != /* ]] && [[ $SEQREPO_DIR != .* ]] -then - echo 'SeqRepo data directory must start with / or .' - exit 1 -fi - -if [[ $WORKING_DIR != /* ]] && [[ $WORKING_DIR != .* ]] -then - echo 'Working directory must start with / or .' - exit 1 -fi - -# Ensure directories exist. -if [ ! -d "$NCBI_DIR" ]; then - echo "Directory $NCBI_DIR does not exist." - exit 1 -fi - -if [ ! -d "$SEQREPO_DIR" ]; then - echo "Directory $SEQREPO_DIR does not exist." - exit 1 -fi - -if [ ! -d "$WORKING_DIR" ]; then - echo "Directory $WORKING_DIR does not exist." - exit 1 -fi - -# Build the UTA image. -docker build --target uta -t uta-update . - -# Bring up a UTA database and run the UTA update procedure. 
-# docker compose doesn't respect the container name specified in the compose file, so container name is specified here -docker compose run --rm --name uta-update uta-update diff --git a/tests/test_ncbi_process_mito.py b/tests/test_ncbi_process_mito.py index d0a1982..8beb535 100644 --- a/tests/test_ncbi_process_mito.py +++ b/tests/test_ncbi_process_mito.py @@ -193,7 +193,7 @@ def test_get_mito_genes(self): "alt_ac": "NC_012920.1", "alt_start": 1601, "alt_end": 1670, - "strand": "+", + "strand": 1, "transl_table": None, "transl_except": None, "pro_ac": None, @@ -212,7 +212,7 @@ def test_get_mito_genes(self): "alt_ac": "NC_012920.1", "alt_start": 4328, "alt_end": 4400, - "strand": "-", + "strand": -1, "transl_table": None, "transl_except": None, "pro_ac": None, @@ -237,7 +237,7 @@ def test_get_mito_genes(self): "alt_ac": "NC_012920.1", "alt_start": 7585, "alt_end": 8269, - "strand": "+", + "strand": 1, "transl_table": "2", "transl_except": None, "pro_ac": "YP_003024029.1", @@ -267,7 +267,7 @@ def test_get_mito_genes(self): "alt_ac": "NC_012920.1", "alt_start": 3306, "alt_end": 4262, - "strand": "+", + "strand": 1, "transl_table": "2", "transl_except": "(pos:4261..4262,aa:TERM)", "pro_ac": "YP_003024026.1",
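
For context on the `strand` changes in the `sbin/ncbi_process_mito.py` and `tests/test_ncbi_process_mito.py` hunks above: `MitoGeneData.strand` now stores Biopython's integer strand convention (1 for forward, -1 for reverse) taken directly from `SeqFeature.location.strand`, instead of mapping it to "+"/"-" strings. Below is a minimal sketch of that convention, assuming Biopython is installed and a local copy of the NC_012920.1 GenBank record is available (the filename is illustrative, not part of this patch):

```python
# Illustrative only: shows the SeqFeature.location.strand convention (1 / -1)
# that MitoGeneData.strand now stores directly.
import Bio.SeqIO

with open("NC_012920.1.gbff") as fh:          # assumed local download of the mito GenBank file
    for record in Bio.SeqIO.parse(fh, "gb"):  # "gb" parser yields SeqRecord objects
        for feature in record.features:
            if feature.type in ("CDS", "tRNA", "rRNA"):
                seq = record.seq[feature.location.start:feature.location.end]
                if feature.location.strand == -1:
                    # reverse-strand features are reverse-complemented, as the script does
                    seq = seq.reverse_complement()
                print(record.id, feature.type, feature.location.strand, len(seq))
```

Keeping the integer avoids a lossy string round-trip and matches what the updated test expectations assert.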