diff --git a/docker-compose.yml b/docker-compose.yml index b689645..cad902f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,7 +5,7 @@ version: '3' services: ncbi-download: image: uta-update - command: sbin/ncbi-download /ncbi-dir + command: sbin/ncbi-download etc/ncbi-files.txt /ncbi-dir volumes: - .:/opt/repos/uta - ${UTA_ETL_NCBI_DIR}:/ncbi-dir diff --git a/etc/ncbi-files.txt b/etc/ncbi-files.txt new file mode 100644 index 0000000..6cb31ef --- /dev/null +++ b/etc/ncbi-files.txt @@ -0,0 +1,57 @@ +# This configuration file contains the paths to the NCBI data files needed by the SeqRepo/UTA load pipelines. +# +# ├── gene +# │ └── DATA +# │ ├── GENE_INFO +# │ │ └── Mammalia +# │ │ └── Homo_sapiens.gene_info.gz +# │ └── gene2refseq.gz +# ├── genomes +# │ └── refseq +# │ └── vertebrate_mammalian +# │ └── Homo_sapiens +# │ └── all_assembly_versions +# │ └── GCF_000001405.25_GRCh37.p13 +# │ ├── GCF_000001405.25_GRCh37.p13_genomic.fna.gz +# │ └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz +# └── refseq +# └── H_sapiens +# ├── historical +# │ └── GRCh38 +# │ └── GCF_000001405.40-RS_2023_03_historical +# │ ├── GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz +# │ └── GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz +# └── mRNA_Prot +# ├── human.1.protein.faa.gz +# ├── human.1.rna.fna.gz +# ├── human.1.rna.gbff.gz +# ├── ... 
+ +## Gene Data +gene/DATA/gene2refseq.gz +gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz + +## RefSeq Data +refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz +refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz +refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz + +## Historical RefSeq alignments +refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz +refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz + +## Genome build and alignment data +# Build 37 +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_assembly_report.txt +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.fna.gz +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz + +# Build 38 +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_assembly_report.txt +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.fna.gz +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz + +# T2Tv2.0 +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_assembly_report.txt +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.fna.gz +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz diff --git a/sbin/ncbi-download b/sbin/ncbi-download 
index dc445cd..09ad4c2 100755 --- a/sbin/ncbi-download +++ b/sbin/ncbi-download @@ -2,48 +2,22 @@ # This script downloads the files needed for a UTA+SeqRepo update into to the given directory. # -# DONWLOAD_DIR will have the following structure: -# -# ├── gene -# │ └── DATA -# │ ├── GENE_INFO -# │ │ └── Mammalia -# │ │ └── Homo_sapiens.gene_info.gz -# │ └── gene2accession.gz -# ├── genomes -# │ └── refseq -# │ └── vertebrate_mammalian -# │ └── Homo_sapiens -# │ └── all_assembly_versions -# │ └── GCF_000001405.25_GRCh37.p13 -# │ ├── GCF_000001405.25_GRCh37.p13_genomic.fna.gz -# │ └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz -# └── refseq -# └── H_sapiens -# └── mRNA_Prot -# ├── human.1.protein.faa.gz -# ├── human.1.rna.fna.gz -# └── human.1.rna.gbff.gz +# DOWNLOAD_DIR will have a directory structure matching the source. set -e -DOWNLOAD_DIR=$1 -DOWNLOAD_PATHS=( - 'gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz' - 'gene/DATA/gene2accession.gz' - 'refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz' -) +FILE_PATH_CONFIG=$1 +DOWNLOAD_DIR=$2 -if [ -z "$DOWNLOAD_DIR" ] +if [ -z "$FILE_PATH_CONFIG" ] || [ -z "$DOWNLOAD_DIR" ] then - echo 'Usage: sbin/ncbi-download ' + echo 'Usage: sbin/ncbi-download <file_path_config> <download_dir>' exit 1 else echo "Downloading files to $DOWNLOAD_DIR" fi -for DOWNLOAD_PATH in "${DOWNLOAD_PATHS[@]}" -do +grep -Ev '^[[:space:]]*(#|$)' "$FILE_PATH_CONFIG" | while read -r DOWNLOAD_PATH; do # each top-level directory in NCBI is an rsync module. # bash parameter expansion removes all content after first slash. DOWNLOAD_MODULE="${DOWNLOAD_PATH%%/*}"