Skip to content

Commit

Permalink
IPVC-2379: add necessary NCBI input files to download config (#25)
Browse files Browse the repository at this point in the history
  • Loading branch information
bsgiles73 authored Apr 24, 2024
1 parent 5432818 commit c2af389
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 33 deletions.
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ version: '3'
services:
ncbi-download:
image: uta-update
command: sbin/ncbi-download /ncbi-dir
command: sbin/ncbi-download etc/ncbi-files.txt /ncbi-dir
volumes:
- .:/opt/repos/uta
- ${UTA_ETL_NCBI_DIR}:/ncbi-dir
Expand Down
57 changes: 57 additions & 0 deletions etc/ncbi-files.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# This configuration file contains the paths to the NCBI data files needed by the SeqRepo/UTA load pipelines.
#
# ├── gene
# │ └── DATA
# │ ├── GENE_INFO
# │ │ └── Mammalia
# │ │ └── Homo_sapiens.gene_info.gz
# │ └── gene2refseq.gz
# ├── genomes
# │ └── refseq
# │ └── vertebrate_mammalian
# │ └── Homo_sapiens
# │ └── all_assembly_versions
# │ └── GCF_000001405.25_GRCh37.p13
# │ ├── GCF_000001405.25_GRCh37.p13_genomic.fna.gz
# │ └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz
# └── refseq
# └── H_sapiens
# ├── historical
# │ └── GRCh38
# │ └── GCF_000001405.40-RS_2023_03_historical
# │ ├── GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz
# │ └── GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz
# └── mRNA_Prot
# ├── human.1.protein.faa.gz
# ├── human.1.rna.fna.gz
# ├── human.1.rna.gbff.gz
# ├── ...

## Gene Data
gene/DATA/gene2refseq.gz
gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz

## RefSeq Data
refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz
refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz
refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz

## Historical RefSeq alignments
refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz
refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz

## Genome build and alignment data
# Build 37
genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_assembly_report.txt
genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.fna.gz
genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz

# Build 38
genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_assembly_report.txt
genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.fna.gz
genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz

# T2Tv2.0
genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_assembly_report.txt
genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.fna.gz
genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz
38 changes: 6 additions & 32 deletions sbin/ncbi-download
Original file line number Diff line number Diff line change
Expand Up @@ -2,48 +2,22 @@

# This script downloads the files needed for a UTA+SeqRepo update into the given directory.
#
# DOWNLOAD_DIR will have the following structure:
#
# ├── gene
# │ └── DATA
# │ ├── GENE_INFO
# │ │ └── Mammalia
# │ │ └── Homo_sapiens.gene_info.gz
# │ └── gene2accession.gz
# ├── genomes
# │ └── refseq
# │ └── vertebrate_mammalian
# │ └── Homo_sapiens
# │ └── all_assembly_versions
# │ └── GCF_000001405.25_GRCh37.p13
# │ ├── GCF_000001405.25_GRCh37.p13_genomic.fna.gz
# │ └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz
# └── refseq
# └── H_sapiens
# └── mRNA_Prot
# ├── human.1.protein.faa.gz
# ├── human.1.rna.fna.gz
# └── human.1.rna.gbff.gz
# DOWNLOAD_DIR will have a directory structure matching the source.

set -e

DOWNLOAD_DIR=$1
DOWNLOAD_PATHS=(
'gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz'
'gene/DATA/gene2accession.gz'
'refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz'
)
FILE_PATH_CONFIG=$1
DOWNLOAD_DIR=$2

if [ -z "$DOWNLOAD_DIR" ]
if [ -z "$FILE_PATH_CONFIG" ] || [ -z "$DOWNLOAD_DIR" ]
then
echo 'Usage: sbin/ncbi-download <download_dir>'
echo 'Usage: sbin/ncbi-download <file path config> <download_dir> '
exit 1
else
echo "Downloading files to $DOWNLOAD_DIR"
fi

for DOWNLOAD_PATH in "${DOWNLOAD_PATHS[@]}"
do
grep -v -e '^#' -e '^$' "$FILE_PATH_CONFIG" | while read -r DOWNLOAD_PATH; do
# each top-level directory in NCBI is an rsync module.
# bash parameter expansion removes all content after first slash.
DOWNLOAD_MODULE="${DOWNLOAD_PATH%%/*}"
Expand Down

0 comments on commit c2af389

Please sign in to comment.