From 2331ed515d9d9da2afcd18274d6bae791f1410fb Mon Sep 17 00:00:00 2001 From: Shane Giles Date: Thu, 18 Apr 2024 12:04:26 -0600 Subject: [PATCH 1/5] feat(IPVC-2379): wip --- etc/ncbi-files.txt | 22 ++++++++++++++++++++++ sbin/ncbi-download | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 etc/ncbi-files.txt diff --git a/etc/ncbi-files.txt b/etc/ncbi-files.txt new file mode 100644 index 0000000..3cbbe92 --- /dev/null +++ b/etc/ncbi-files.txt @@ -0,0 +1,22 @@ +# This configuration file contains the paths to the data files to be downloaded from NCBI to use in the pipeline. +# +# ├── gene +# │ └── DATA +# │ ├── GENE_INFO +# │ │ └── Mammalia +# │ │ └── Homo_sapiens.gene_info.gz +# │ └── gene2accession.gz +# ├── genomes +# │ └── refseq +# │ └── vertebrate_mammalian +# │ └── Homo_sapiens +# │ └── all_assembly_versions +# │ └── GCF_000001405.25_GRCh37.p13 +# │ ├── GCF_000001405.25_GRCh37.p13_genomic.fna.gz +# │ └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz +# └── refseq +# └── H_sapiens +# └── mRNA_Prot +# ├── human.1.protein.faa.gz +# ├── human.1.rna.fna.gz +# └── human.1.rna.gbff.gz diff --git a/sbin/ncbi-download b/sbin/ncbi-download index dc445cd..6d20aed 100755 --- a/sbin/ncbi-download +++ b/sbin/ncbi-download @@ -2,7 +2,7 @@ # This script downloads the files needed for a UTA+SeqRepo update into to the given directory. 
# -# DONWLOAD_DIR will have the following structure: +# DOWNLOAD_DIR will have the following structure: # # ├── gene # │ └── DATA From 2217142334765d63a457e8c6b2c580437f391a56 Mon Sep 17 00:00:00 2001 From: Shane Giles Date: Thu, 18 Apr 2024 14:22:58 -0600 Subject: [PATCH 2/5] feat(IPVC-2379): move NCBI file paths to config file, update download script and docker compose command for ncbi-download --- docker-compose.yml | 2 +- etc/ncbi-files.txt | 27 ++++++++++++++++++++++++++- sbin/ncbi-download | 38 ++++++-------------------------------- 3 files changed, 33 insertions(+), 34 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index b689645..cad902f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,7 +5,7 @@ version: '3' services: ncbi-download: image: uta-update - command: sbin/ncbi-download /ncbi-dir + command: sbin/ncbi-download etc/ncbi-files.txt /ncbi-dir volumes: - .:/opt/repos/uta - ${UTA_ETL_NCBI_DIR}:/ncbi-dir diff --git a/etc/ncbi-files.txt b/etc/ncbi-files.txt index 3cbbe92..3601e32 100644 --- a/etc/ncbi-files.txt +++ b/etc/ncbi-files.txt @@ -1,4 +1,4 @@ -# This configuration file contains the paths to the data files to be downloaded from NCBI to use in the pipeline. +# This configuration file contains the paths to the NCBI data files needed by the SeqRepo/UTA load pipelines. 
# # ├── gene # │ └── DATA @@ -20,3 +20,28 @@ # ├── human.1.protein.faa.gz # ├── human.1.rna.fna.gz # └── human.1.rna.gbff.gz + +## Gene Data +gene/DATA/gene2refseq.gz +gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz + +## RefSeq Data +refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz +refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz +refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz + +## Genome build and alignment data +# Build 37 +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_assembly_report.txt +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.fna.gz +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz + +# Build 38 +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_assembly_report.txt +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.fna.gz +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz + +# T2Tv2.0 +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_assembly_report.txt +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.fna.gz +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz diff --git a/sbin/ncbi-download b/sbin/ncbi-download index 6d20aed..09ad4c2 100755 --- a/sbin/ncbi-download +++ b/sbin/ncbi-download @@ -2,48 +2,22 @@ # This script downloads the files needed for a UTA+SeqRepo update into 
to the given directory. # -# DOWNLOAD_DIR will have the following structure: -# -# ├── gene -# │ └── DATA -# │ ├── GENE_INFO -# │ │ └── Mammalia -# │ │ └── Homo_sapiens.gene_info.gz -# │ └── gene2accession.gz -# ├── genomes -# │ └── refseq -# │ └── vertebrate_mammalian -# │ └── Homo_sapiens -# │ └── all_assembly_versions -# │ └── GCF_000001405.25_GRCh37.p13 -# │ ├── GCF_000001405.25_GRCh37.p13_genomic.fna.gz -# │ └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz -# └── refseq -# └── H_sapiens -# └── mRNA_Prot -# ├── human.1.protein.faa.gz -# ├── human.1.rna.fna.gz -# └── human.1.rna.gbff.gz +# DOWNLOAD_DIR will have a directory structure matching the source. set -e -DOWNLOAD_DIR=$1 -DOWNLOAD_PATHS=( - 'gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz' - 'gene/DATA/gene2accession.gz' - 'refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz' -) +FILE_PATH_CONFIG=$1 +DOWNLOAD_DIR=$2 -if [ -z "$DOWNLOAD_DIR" ] +if [ -z "$FILE_PATH_CONFIG" ] || [ -z "$DOWNLOAD_DIR" ] then - echo 'Usage: sbin/ncbi-download <download_dir>' + echo 'Usage: sbin/ncbi-download <file_path_config> <download_dir>' exit 1 else echo "Downloading files to $DOWNLOAD_DIR" fi -for DOWNLOAD_PATH in "${DOWNLOAD_PATHS[@]}" -do +grep -v -e '^#' -e '^$' "$FILE_PATH_CONFIG" | while read -r DOWNLOAD_PATH; do # each top-level directory in NCBI is an rsync module. # bash parameter expansion removes all content after first slash. 
DOWNLOAD_MODULE="${DOWNLOAD_PATH%%/*}" From ee395d91cb344b2bb8621c7f444fff9ea2edd54e Mon Sep 17 00:00:00 2001 From: Shane Giles Date: Thu, 18 Apr 2024 14:28:47 -0600 Subject: [PATCH 3/5] feat(IPVC-2379): adding historical alignments to ncbi file config --- etc/ncbi-files.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/etc/ncbi-files.txt b/etc/ncbi-files.txt index 3601e32..aba103f 100644 --- a/etc/ncbi-files.txt +++ b/etc/ncbi-files.txt @@ -41,6 +41,10 @@ genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_00000 genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.fna.gz genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz +# RefSeq historical alignments +# genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz +# genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz + # T2Tv2.0 genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_assembly_report.txt genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.fna.gz From e335e1d24f75bfc6a1e935458996ca24f26e51e5 Mon Sep 17 00:00:00 2001 From: Shane Giles Date: Wed, 24 Apr 2024 11:00:17 -0600 Subject: [PATCH 4/5] feat(IPVC-2379): updates paths in ncbi-files.txt --- etc/ncbi-files.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/etc/ncbi-files.txt b/etc/ncbi-files.txt index aba103f..5ab95c1 100644 --- a/etc/ncbi-files.txt +++ b/etc/ncbi-files.txt @@ -30,6 +30,10 @@ refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz 
refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz +## Historical RefSeq alignments +refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz +refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz + ## Genome build and alignment data # Build 37 genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_assembly_report.txt @@ -41,10 +45,6 @@ genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_00000 genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.fna.gz genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz -# RefSeq historical alignments -# genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz -# genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz - # T2Tv2.0 genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_assembly_report.txt genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.fna.gz From e483044c8fd1d9285cebcdfd7f48ed0f3d0a7525 Mon Sep 17 00:00:00 2001 From: Shane Giles Date: Wed, 24 Apr 2024 11:07:38 -0600 Subject: [PATCH 5/5] feat(IPVC-2379): update doc string --- etc/ncbi-files.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/etc/ncbi-files.txt b/etc/ncbi-files.txt index 5ab95c1..6cb31ef 100644 --- a/etc/ncbi-files.txt +++ 
b/etc/ncbi-files.txt @@ -16,10 +16,16 @@ # │ └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz # └── refseq # └── H_sapiens +# ├── historical +# │ └── GRCh38 +# │ └── GCF_000001405.40-RS_2023_03_historical +# │ ├── GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz +# │ └── GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz # └── mRNA_Prot # ├── human.1.protein.faa.gz # ├── human.1.rna.fna.gz -# └── human.1.rna.gbff.gz +# ├── human.1.rna.gbff.gz +# └── ... ## Gene Data gene/DATA/gene2refseq.gz