diff --git a/README.md b/README.md index 6c9d5d10..9aabf227 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ # Mehari - +a camel Mehari is a software package for annotating VCF files with variant effect/consequence. The program uses [hgvs-rs](https://crates.io/crates/hgvs) for projecting genomic variants to transcripts and proteins and thus has high prediction quality. @@ -17,201 +17,22 @@ Other popular tools offering variant effect/consequence prediction include: - [SnpEff](http://pcingola.github.io/SnpEff/) - [VEP (Variant Effect Predictor)](https://www.ensembl.org/info/docs/tools/vep/index.html) -Mehari offers predictions that aim to mirror VariantValidator, the gold standard for HGVS variant descriptions. +Mehari offers HGVS predictions that aim to mirror VariantValidator, the gold standard for HGVS variant descriptions, and consequence predictions compatible with VEP. Further, it is written in the Rust programming language and can be used as a library for users' Rust software. -## Supported Sequence Variant Frequency Databases - -Mehari can import public sequence variant frequency databases. -The supported set slightly differs between import for GRCh37 and GRCh38. 
- -**GRCh37** - -- gnomAD r2.1.1 Exomes [`gnomad.exomes.r2.1.1.sites.vcf.bgz`](https://gnomad.broadinstitute.org/downloads#v2) -- gnomAD r2.1.1 Genomes [`gnomad.genomes.r2.1.1.sites.vcf.bgz`](https://gnomad.broadinstitute.org/downloads#v2) -- gnomAD v3.1 mtDNA [`gnomad.genomes.v3.1.sites.chrM.vcf.bgz`](https://gnomad.broadinstitute.org/downloads#v3-mitochondrial-dna) -- HelixMTdb `HelixMTdb_20200327.tsv` - -**GRCh38** - -- gnomAD r2.1.1 lift-over Exomes [`gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz`](https://gnomad.broadinstitute.org/downloads#v2) -- gnomAD v3.1 Genomes [`gnomad.genomes.v3.1.2.sites.$CHROM.vcf.bgz`](https://gnomad.broadinstitute.org/downloads#v3) -- gnomAD v3.1 mtDNA [`gnomad.genomes.v3.1.sites.chrM.vcf.bgz`](https://gnomad.broadinstitute.org/downloads#v3-mitochondrial-dna) -- HelixMTdb `HelixMTdb_20200327.tsv` - -## Building from scratch -To reduce compile times, we recommend using a pre-built version of `rocksdb`, either from the system package manager or e.g. via `conda`: - -```bash -# Ubuntu -sudo apt-get install librocksdb-dev - -# Conda -conda install -c conda-forge rocksdb -``` - -In either case, either add -```toml -[env] -ROCKSDB_LIB_DIR = "/usr/lib/" # in case of the system package manager, adjust the path accordingly for conda -SNAPPY_LIB_DIR = "/usr/lib/" # same as above -``` -to `.cargo/config.toml` or set the environment variables `ROCKSDB_LIB_DIR` and `SNAPPY_LIB_DIR` to the appropriate paths: - -```bash -export ROCKSDB_LIB_DIR=/usr/lib/ -export SNAPPY_LIB_DIR=/usr/lib/ -``` - -By default, the environment variables are defined in the `.cargo/config.toml` as described above, i.e. may need adjustments if not using the system package manager. - -To build the project, run: -```bash -cargo build --release -``` - -To install the project locally, run: -```bash -cargo install --path . 
-``` -## Internal Notes - -``` -rm -rf /tmp/out ; cargo run -- db create seqvar-freqs --path-output-db /tmp/out --genome-release grch38 --path-helix-mtdb ~/Downloads/HelixMTdb_20200327.vcf.gz --path-gnomad-mtdna ~/Downloads/gnomad.genomes.v3.1.sites.chrM.vcf.bgz --path-gnomad-exomes-xy tests/data/db/create/seqvar_freqs/xy-38/gnomad.exomes.r2.1.1.sites.chrX.vcf --path-gnomad-exomes-xy tests/data/db/create/seqvar_freqs/xy-38/gnomad.exomes.r2.1.1.sites.chrY.vcf --path-gnomad-genomes-xy tests/data/db/create/seqvar_freqs/xy-38/gnomad.genomes.r3.1.1.sites.chrX.vcf --path-gnomad-genomes-xy tests/data/db/create/seqvar_freqs/xy-38/gnomad.genomes.r3.1.1.sites.chrY.vcf --path-gnomad-exomes-auto tests/data/db/create/seqvar_freqs/12-38/gnomad.exomes.r2.1.1.sites.chr1.vcf --path-gnomad-exomes-auto tests/data/db/create/seqvar_freqs/12-38/gnomad.exomes.r2.1.1.sites.chr2.vcf --path-gnomad-genomes-auto tests/data/db/create/seqvar_freqs/12-38/gnomad.genomes.r3.1.1.sites.chr1.vcf --path-gnomad-genomes-auto tests/data/db/create/seqvar_freqs/12-38/gnomad.genomes.r3.1.1.sites.chr2.vcf - -rm -rf /tmp/out ; cargo run -- db create seqvar-freqs --path-output-db /tmp/out --genome-release grch37 --path-gnomad-mtdna ~/Downloads/gnomad.genomes.v3.1.sites.chrM.vcf.bgz --path-gnomad-exomes-xy tests/data/db/create/seqvar_freqs/xy-37/gnomad.exomes.r2.1.1.sites.chrX.vcf --path-gnomad-exomes-xy tests/data/db/create/seqvar_freqs/xy-37/gnomad.exomes.r2.1.1.sites.chrY.vcf --path-gnomad-genomes-xy tests/data/db/create/seqvar_freqs/xy-37/gnomad.genomes.r2.1.1.sites.chrX.vcf --path-gnomad-exomes-auto tests/data/db/create/seqvar_freqs/12-37/gnomad.exomes.r2.1.1.sites.chr1.vcf --path-gnomad-exomes-auto tests/data/db/create/seqvar_freqs/12-37/gnomad.exomes.r2.1.1.sites.chr2.vcf --path-gnomad-genomes-auto tests/data/db/create/seqvar_freqs/12-37/gnomad.genomes.r2.1.1.sites.chr1.vcf --path-gnomad-genomes-auto tests/data/db/create/seqvar_freqs/12-37/gnomad.genomes.r2.1.1.sites.chr2 -``` - -``` -prepare() -{ - 
in=$1 - out=$2 - - zcat $in \ - | head -n 5000 \ - | grep ^# \ - > $out - - zcat $in \ - | grep -v ^# \ - | head -n 3 \ - >> $out -} - -base=/data/sshfs/data/gpfs-1/groups/cubi/work/projects/2021-07-20_varfish-db-downloader-holtgrewe/varfish-db-downloader/ - -mkdir -p tests/data/db/create/seqvar_freqs/{12,xy}-{37,38} - -## 37 exomes - -prepare \ - $base/GRCh37/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chr1.vcf.bgz \ - tests/data/db/create/seqvar_freqs/12-37/gnomad.exomes.r2.1.1.sites.chr1.vcf -prepare \ - $base/GRCh37/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chr2.vcf.bgz \ - tests/data/db/create/seqvar_freqs/12-37/gnomad.exomes.r2.1.1.sites.chr2.vcf -prepare \ - $base/GRCh37/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chrX.vcf.bgz \ - tests/data/db/create/seqvar_freqs/xy-37/gnomad.exomes.r2.1.1.sites.chrX.vcf -prepare \ - $base/GRCh37/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chrY.vcf.bgz \ - tests/data/db/create/seqvar_freqs/xy-37/gnomad.exomes.r2.1.1.sites.chrY.vcf - -## 37 genomes - -prepare \ - $base/GRCh37/gnomAD_genomes/r2.1.1/download/gnomad.genomes.r2.1.1.sites.chr1.vcf.bgz \ - tests/data/db/create/seqvar_freqs/12-37/gnomad.genomes.r2.1.1.sites.chr1.vcf -prepare \ - $base/GRCh37/gnomAD_genomes/r2.1.1/download/gnomad.genomes.r2.1.1.sites.chr2.vcf.bgz \ - tests/data/db/create/seqvar_freqs/12-37/gnomad.genomes.r2.1.1.sites.chr2.vcf -prepare \ - $base/GRCh37/gnomAD_genomes/r2.1.1/download/gnomad.genomes.r2.1.1.sites.chrX.vcf.bgz \ - tests/data/db/create/seqvar_freqs/xy-37/gnomad.genomes.r2.1.1.sites.chrX.vcf - -## 38 exomes - -prepare \ - $base/GRCh38/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chr1.vcf.bgz \ - tests/data/db/create/seqvar_freqs/12-38/gnomad.exomes.r2.1.1.sites.chr1.vcf -prepare \ - $base/GRCh38/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chr2.vcf.bgz \ - tests/data/db/create/seqvar_freqs/12-38/gnomad.exomes.r2.1.1.sites.chr2.vcf -prepare \ - 
$base/GRCh38/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chrX.vcf.bgz \ - tests/data/db/create/seqvar_freqs/xy-38/gnomad.exomes.r2.1.1.sites.chrX.vcf -prepare \ - $base/GRCh38/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chrY.vcf.bgz \ - tests/data/db/create/seqvar_freqs/xy-38/gnomad.exomes.r2.1.1.sites.chrY.vcf - -## 38 genomes - -prepare \ - $base/GRCh38/gnomAD_genomes/r3.1.1/download/gnomad.genomes.r3.1.1.sites.chr1.vcf.bgz \ - tests/data/db/create/seqvar_freqs/12-38/gnomad.genomes.r3.1.1.sites.chr1.vcf -prepare \ - $base/GRCh38/gnomAD_genomes/r3.1.1/download/gnomad.genomes.r3.1.1.sites.chr2.vcf.bgz \ - tests/data/db/create/seqvar_freqs/12-38/gnomad.genomes.r3.1.1.sites.chr2.vcf -prepare \ - $base/GRCh38/gnomAD_genomes/r3.1.1/download/gnomad.genomes.r3.1.1.sites.chrX.vcf.bgz \ - tests/data/db/create/seqvar_freqs/xy-38/gnomad.genomes.r3.1.1.sites.chrX.vcf -prepare \ - $base/GRCh38/gnomAD_genomes/r3.1.1/download/gnomad.genomes.r3.1.1.sites.chrY.vcf.bgz \ - tests/data/db/create/seqvar_freqs/xy-38/gnomad.genomes.r3.1.1.sites.chrY.vcf -``` - -Building tx database - - -``` -cd hgvs-rs-data - -seqrepo --root-directory seqrepo-data/master init - -mkdir -p mirror/ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot -cd !$ -wget https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.files.installed -parallel -j 16 'wget https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/{}' ::: $(cut -f 2 human.files.installed | grep fna) -cd - - -mkdir -p mirror/ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/cdna -cd !$ -wget https://ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz -cd - -mkdir -p mirror/ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/ncrna -cd !$ -wget https://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz -cd - -mkdir -p mirror/ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/cdna/ -cd !$ -wget 
https://ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh37.cdna.all.fa.gz -cd - -mkdir -p mirror/ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/ncrna/ -cd !$ -wget https://ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/ncrna/Homo_sapiens.GRCh37.ncrna.fa.gz -cd - - -seqrepo --root-directory seqrepo-data/master load -n NCBI $(find mirror/ftp.ncbi.nih.gov -name '*.fna.gz' | sort) -seqrepo --root-directory seqrepo-data/master load -n ENSEMBL $(find mirror/ftp.ensembl.org -name '*.fa.gz' | sort) - -cd ../mehari - -cargo run --release -- \ - -v \ - db create txs \ - --path-out /tmp/txs-out.bin.zst \ - --path-lable-tsv PATH_TO_MANE_LABEL.tsv \ - --path-cdot-json ../cdot-0.2.21.ensembl.grch37_grch38.json.gz \ - --path-cdot-json ../cdot-0.2.21.refseq.grch37_grch38.json.gz \ - --path-seqrepo-instance ../hgvs-rs-data/seqrepo-data/master/master -``` - -## Development Setup - -You will need a recent version of protoc, e.g.: - -``` -# bash utils/install-protoc.sh -# export PATH=$PATH:$HOME/.local/share/protoc/bin -``` +## Usage +To annotate variant consequences, gnomAD frequencies and ClinVar information for sequence variants: +```sh + mehari annotate seqvars \ + --transcripts resources/transcript_db \ + --frequencies resources/gnomad_db \ + --clinvar resources/clinvar_db \ + --path-input-vcf input.vcf \ + --path-output-vcf output.vcf +``` +The corresponding database builds can be obtained from: + - transcripts: [github.com/varfish-org/mehari-data-tx/releases](https://github.com/varfish-org/mehari-data-tx/releases) + - gnomAD frequencies: TODO + - ClinVar: [github.com/varfish-org/annonars-data-clinvar/releases](https://github.com/varfish-org/annonars-data-clinvar/releases) + +See [Getting Started](docs/getting_started.md) for more information on usage, and [Development Setup](docs/development.md) for more information on how to build Mehari and its databases from scratch. 
diff --git a/docs/anno_seqvars.md b/docs/anno_seqvars.md index 7bb42a5c..2e7cdbfa 100644 --- a/docs/anno_seqvars.md +++ b/docs/anno_seqvars.md @@ -22,16 +22,17 @@ Currently, Mehari will annotate variants using: - The predicted impact on gene transcripts and the corresponding protein sequence (in the case of coding genes). - Their frequency in the gnomAD exomes and genomes databases as well as the HelixMtDb database in the case of mitochondrial databases. +- Variant information from ClinVar, if any ## Command Line Invocation -You can invoke Mehari like this to annotate a VCF file `IN.vcf` to an output file `OUT.vcf` using the built (or downloaded) database as `path/to/db`. +You can invoke Mehari to annotate a VCF file `IN.vcf` creating an output file `OUT.vcf` using the built (or downloaded) databases – for example the transcript database – as follows: ```text $ mehari annotate seqvars \ - --path-db path/to/db \ - --input-vcf IN.vcf \ - --output-vcf OUT.vcf + --transcripts path/to/transcripts-db \ + --path-input-vcf IN.vcf \ + --path-output-vcf OUT.vcf ``` Note that the input and output files can optionally be gzip/bgzip compressed VCF files with suffixes (`.gz` or `.bgz`) or BCF files with suffix `.bcf`. diff --git a/docs/development.md b/docs/development.md new file mode 100644 index 00000000..41c2df13 --- /dev/null +++ b/docs/development.md @@ -0,0 +1,42 @@ +## Building from scratch +To reduce compile times, we recommend using a pre-built version of `rocksdb`, either from the system package manager or e.g. 
via `conda`: + +```bash +# Ubuntu +sudo apt-get install librocksdb-dev + +# Conda +conda install -c conda-forge rocksdb +``` + +In either case, either add +```toml +[env] +ROCKSDB_LIB_DIR = "/usr/lib/" # in case of the system package manager, adjust the path accordingly for conda +SNAPPY_LIB_DIR = "/usr/lib/" # same as above +``` +to `.cargo/config.toml` or set the environment variables `ROCKSDB_LIB_DIR` and `SNAPPY_LIB_DIR` to the appropriate paths: + +```bash +export ROCKSDB_LIB_DIR=/usr/lib/ +export SNAPPY_LIB_DIR=/usr/lib/ +``` + +By default, the environment variables are defined in the `.cargo/config.toml` as described above, i.e. may need adjustments if not using the system package manager. + +You will need a recent version of protoc, e.g.: + +```bash +bash utils/install-protoc.sh +export PATH=$PATH:$HOME/.local/share/protoc/bin +``` + +To build the project, run: +```bash +cargo build --release +``` + +To install the project locally, run: +```bash +cargo install --path . +``` \ No newline at end of file diff --git a/docs/getting_started.md b/docs/getting_started.md index df2c7476..af314e8d 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -2,42 +2,47 @@ Getting Started. # Installation -You most likely want to install via bioconda. +## via bioconda As a prerequisite, [follow the bioconda getting started guide](http://bioconda.github.io/#usage). -Then, create a new environment (use the `mamba` if you are as impatient as us). +Then, create a new environment; -```text -$ mamba create -y mehari mehari -$ conda activate mehari +```sh +conda create -n mehari -y mehari +conda activate mehari ``` -The `mehari` executable is now available: +The `mehari` executable is now available from within the activated `mehari` conda environment: +```sh +mehari --help ``` -$ mehari --help -``` + +## via docker +Docker images of mehari are available from ghcr.io, see [ghcr.io/varfish-org/mehari](https://github.com/varfish-org/mehari/pkgs/container/mehari). 
+ # Downloading Prebuilt Databases -TODO: not yet available +- transcript database releases: https://github.com/varfish-org/mehari-data-tx/releases +- gnomAD frequency database releases: TODO +- ClinVar database releases: https://github.com/varfish-org/annonars-data-clinvar/releases # Annotating Example VCF Files You can obtain an example file like this: -```text -$ wget https://raw.githubusercontent.com/varfish-org/mehari/main/tests/data/db/create/seqvar_freqs/db-rs1263393206/input.vcf \ - -O example.vcf +```sh +wget https://raw.githubusercontent.com/varfish-org/mehari/main/tests/data/db/create/seqvar_freqs/db-rs1263393206/input.vcf -O example.vcf ``` Now, annotate it using Mehari: -```text -$ mehari annotate seqvars \ - --path-db path/to/mehari-db/b37 \ +```sh +mehari annotate seqvars \ + --transcripts path/to/mehari-transcript-db \ + --frequencies path/to/mehari-frequency-db \ + --clinvar path/to/mehari-clinvar-db \ --path-input-vcf example.vcf \ --path-output-vcf example.out.vcf -$ grep -v ^# example.out.vcf -TODO: output line ``` diff --git a/docs/implementation_notes.md b/docs/implementation_notes.md index 01362482..027aabad 100644 --- a/docs/implementation_notes.md +++ b/docs/implementation_notes.md @@ -1,4 +1,4 @@ -Implementation notes. +# Implementation notes ## Frequency Databases diff --git a/docs/index.md b/docs/index.md index 085d0f38..c6eda9c7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,7 +10,8 @@ Why another software package? library. The latter serves as the basis for [VariantValidator.org](https://variantvalidator.org/) which is the gold standard for HGVS variant description generation and validation. - Mehari is written in the Rust programming language which allows it to work fast, with low memory consumption (as a C++ program would) and being memory safe at the same time (as a Java/Python/Perl program would). 
-- As a Rust program, it can be embedded into the backend of the [VarFish](https://github.com/varfish-org/varfish-server) variant analysis platform. +- It can be used as a Rust library, as is the case for e.g. the backend of the [VarFish](https://github.com/varfish-org/varfish-server) variant analysis platform. +- It provides a REST API for sequence variant annotation (see `mehari server run --help`). ## What's Next? @@ -27,4 +28,4 @@ We recommend to read the Mehari end-user documentation in the following order: Since Mehari is written in the Rust programming language, we host the documentation on `docs.rs` written as Rust online documentation. This has the advantage that the documentation is bundle with the program source code (and thus always up to date) and the latest documentation is always available at . -The drawback is that the formatting of this may not be as end-user friendly as it could be but you will manage. +The drawback is that the formatting of this may not be as end-user friendly as it could be, but you will manage. 
diff --git a/docs/internal_notes.md b/docs/internal_notes.md new file mode 100644 index 00000000..f21f8f5b --- /dev/null +++ b/docs/internal_notes.md @@ -0,0 +1,151 @@ +## Internal Notes + +``` +rm -rf /tmp/out ; cargo run -- db create seqvar-freqs --path-output-db /tmp/out --genome-release grch38 --path-helix-mtdb ~/Downloads/HelixMTdb_20200327.vcf.gz --path-gnomad-mtdna ~/Downloads/gnomad.genomes.v3.1.sites.chrM.vcf.bgz --path-gnomad-exomes-xy tests/data/db/create/seqvar_freqs/xy-38/gnomad.exomes.r2.1.1.sites.chrX.vcf --path-gnomad-exomes-xy tests/data/db/create/seqvar_freqs/xy-38/gnomad.exomes.r2.1.1.sites.chrY.vcf --path-gnomad-genomes-xy tests/data/db/create/seqvar_freqs/xy-38/gnomad.genomes.r3.1.1.sites.chrX.vcf --path-gnomad-genomes-xy tests/data/db/create/seqvar_freqs/xy-38/gnomad.genomes.r3.1.1.sites.chrY.vcf --path-gnomad-exomes-auto tests/data/db/create/seqvar_freqs/12-38/gnomad.exomes.r2.1.1.sites.chr1.vcf --path-gnomad-exomes-auto tests/data/db/create/seqvar_freqs/12-38/gnomad.exomes.r2.1.1.sites.chr2.vcf --path-gnomad-genomes-auto tests/data/db/create/seqvar_freqs/12-38/gnomad.genomes.r3.1.1.sites.chr1.vcf --path-gnomad-genomes-auto tests/data/db/create/seqvar_freqs/12-38/gnomad.genomes.r3.1.1.sites.chr2.vcf + +rm -rf /tmp/out ; cargo run -- db create seqvar-freqs --path-output-db /tmp/out --genome-release grch37 --path-gnomad-mtdna ~/Downloads/gnomad.genomes.v3.1.sites.chrM.vcf.bgz --path-gnomad-exomes-xy tests/data/db/create/seqvar_freqs/xy-37/gnomad.exomes.r2.1.1.sites.chrX.vcf --path-gnomad-exomes-xy tests/data/db/create/seqvar_freqs/xy-37/gnomad.exomes.r2.1.1.sites.chrY.vcf --path-gnomad-genomes-xy tests/data/db/create/seqvar_freqs/xy-37/gnomad.genomes.r2.1.1.sites.chrX.vcf --path-gnomad-exomes-auto tests/data/db/create/seqvar_freqs/12-37/gnomad.exomes.r2.1.1.sites.chr1.vcf --path-gnomad-exomes-auto tests/data/db/create/seqvar_freqs/12-37/gnomad.exomes.r2.1.1.sites.chr2.vcf --path-gnomad-genomes-auto 
tests/data/db/create/seqvar_freqs/12-37/gnomad.genomes.r2.1.1.sites.chr1.vcf --path-gnomad-genomes-auto tests/data/db/create/seqvar_freqs/12-37/gnomad.genomes.r2.1.1.sites.chr2 +``` + +``` +prepare() +{ + in=$1 + out=$2 + + zcat $in \ + | head -n 5000 \ + | grep ^# \ + > $out + + zcat $in \ + | grep -v ^# \ + | head -n 3 \ + >> $out +} + +base=/data/sshfs/data/gpfs-1/groups/cubi/work/projects/2021-07-20_varfish-db-downloader-holtgrewe/varfish-db-downloader/ + +mkdir -p tests/data/db/create/seqvar_freqs/{12,xy}-{37,38} + +## 37 exomes + +prepare \ + $base/GRCh37/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chr1.vcf.bgz \ + tests/data/db/create/seqvar_freqs/12-37/gnomad.exomes.r2.1.1.sites.chr1.vcf +prepare \ + $base/GRCh37/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chr2.vcf.bgz \ + tests/data/db/create/seqvar_freqs/12-37/gnomad.exomes.r2.1.1.sites.chr2.vcf +prepare \ + $base/GRCh37/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chrX.vcf.bgz \ + tests/data/db/create/seqvar_freqs/xy-37/gnomad.exomes.r2.1.1.sites.chrX.vcf +prepare \ + $base/GRCh37/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chrY.vcf.bgz \ + tests/data/db/create/seqvar_freqs/xy-37/gnomad.exomes.r2.1.1.sites.chrY.vcf + +## 37 genomes + +prepare \ + $base/GRCh37/gnomAD_genomes/r2.1.1/download/gnomad.genomes.r2.1.1.sites.chr1.vcf.bgz \ + tests/data/db/create/seqvar_freqs/12-37/gnomad.genomes.r2.1.1.sites.chr1.vcf +prepare \ + $base/GRCh37/gnomAD_genomes/r2.1.1/download/gnomad.genomes.r2.1.1.sites.chr2.vcf.bgz \ + tests/data/db/create/seqvar_freqs/12-37/gnomad.genomes.r2.1.1.sites.chr2.vcf +prepare \ + $base/GRCh37/gnomAD_genomes/r2.1.1/download/gnomad.genomes.r2.1.1.sites.chrX.vcf.bgz \ + tests/data/db/create/seqvar_freqs/xy-37/gnomad.genomes.r2.1.1.sites.chrX.vcf + +## 38 exomes + +prepare \ + $base/GRCh38/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chr1.vcf.bgz \ + tests/data/db/create/seqvar_freqs/12-38/gnomad.exomes.r2.1.1.sites.chr1.vcf 
+prepare \ + $base/GRCh38/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chr2.vcf.bgz \ + tests/data/db/create/seqvar_freqs/12-38/gnomad.exomes.r2.1.1.sites.chr2.vcf +prepare \ + $base/GRCh38/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chrX.vcf.bgz \ + tests/data/db/create/seqvar_freqs/xy-38/gnomad.exomes.r2.1.1.sites.chrX.vcf +prepare \ + $base/GRCh38/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chrY.vcf.bgz \ + tests/data/db/create/seqvar_freqs/xy-38/gnomad.exomes.r2.1.1.sites.chrY.vcf + +## 38 genomes + +prepare \ + $base/GRCh38/gnomAD_genomes/r3.1.1/download/gnomad.genomes.r3.1.1.sites.chr1.vcf.bgz \ + tests/data/db/create/seqvar_freqs/12-38/gnomad.genomes.r3.1.1.sites.chr1.vcf +prepare \ + $base/GRCh38/gnomAD_genomes/r3.1.1/download/gnomad.genomes.r3.1.1.sites.chr2.vcf.bgz \ + tests/data/db/create/seqvar_freqs/12-38/gnomad.genomes.r3.1.1.sites.chr2.vcf +prepare \ + $base/GRCh38/gnomAD_genomes/r3.1.1/download/gnomad.genomes.r3.1.1.sites.chrX.vcf.bgz \ + tests/data/db/create/seqvar_freqs/xy-38/gnomad.genomes.r3.1.1.sites.chrX.vcf +prepare \ + $base/GRCh38/gnomAD_genomes/r3.1.1/download/gnomad.genomes.r3.1.1.sites.chrY.vcf.bgz \ + tests/data/db/create/seqvar_freqs/xy-38/gnomad.genomes.r3.1.1.sites.chrY.vcf +``` + +Building tx database + + +``` +cd hgvs-rs-data + +seqrepo --root-directory seqrepo-data/master init + +mkdir -p mirror/ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot +cd !$ +wget https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.files.installed +parallel -j 16 'wget https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/{}' ::: $(cut -f 2 human.files.installed | grep fna) +cd - + +mkdir -p mirror/ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/cdna +cd !$ +wget https://ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz +cd - +mkdir -p mirror/ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/ncrna +cd !$ +wget 
https://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz +cd - +mkdir -p mirror/ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/cdna/ +cd !$ +wget https://ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh37.cdna.all.fa.gz +cd - +mkdir -p mirror/ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/ncrna/ +cd !$ +wget https://ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/ncrna/Homo_sapiens.GRCh37.ncrna.fa.gz +cd - + +seqrepo --root-directory seqrepo-data/master load -n NCBI $(find mirror/ftp.ncbi.nih.gov -name '*.fna.gz' | sort) +seqrepo --root-directory seqrepo-data/master load -n ENSEMBL $(find mirror/ftp.ensembl.org -name '*.fa.gz' | sort) + +cd ../mehari + +cargo run --release -- \ + -v \ + db create txs \ + --path-out /tmp/txs-out.bin.zst \ + --path-lable-tsv PATH_TO_MANE_LABEL.tsv \ + --path-cdot-json ../cdot-0.2.21.ensembl.grch37_grch38.json.gz \ + --path-cdot-json ../cdot-0.2.21.refseq.grch37_grch38.json.gz \ + --path-seqrepo-instance ../hgvs-rs-data/seqrepo-data/master/master +``` + +## Supported Sequence Variant Frequency Databases + +Mehari can import public sequence variant frequency databases. +The supported set slightly differs between import for GRCh37 and GRCh38. 
+ +**GRCh37** + +- gnomAD r2.1.1 Exomes [`gnomad.exomes.r2.1.1.sites.vcf.bgz`](https://gnomad.broadinstitute.org/downloads#v2) +- gnomAD r2.1.1 Genomes [`gnomad.genomes.r2.1.1.sites.vcf.bgz`](https://gnomad.broadinstitute.org/downloads#v2) +- gnomAD v3.1 mtDNA [`gnomad.genomes.v3.1.sites.chrM.vcf.bgz`](https://gnomad.broadinstitute.org/downloads#v3-mitochondrial-dna) +- HelixMTdb `HelixMTdb_20200327.tsv` + +**GRCh38** + +- gnomAD r2.1.1 lift-over Exomes [`gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz`](https://gnomad.broadinstitute.org/downloads#v2) +- gnomAD v3.1 Genomes [`gnomad.genomes.v3.1.2.sites.$CHROM.vcf.bgz`](https://gnomad.broadinstitute.org/downloads#v3) +- gnomAD v3.1 mtDNA [`gnomad.genomes.v3.1.sites.chrM.vcf.bgz`](https://gnomad.broadinstitute.org/downloads#v3-mitochondrial-dna) +- HelixMTdb `HelixMTdb_20200327.tsv` \ No newline at end of file diff --git a/src/server/run/mod.rs b/src/server/run/mod.rs index 7f71c05a..dc5ccbd0 100644 --- a/src/server/run/mod.rs +++ b/src/server/run/mod.rs @@ -72,7 +72,7 @@ pub mod openapi { pub struct ApiDoc; } -/// Command line arguments for "run-server` command. +/// Command line arguments for `server run` command. #[derive(clap::Parser, Debug)] #[command(about = "Run Mehari REST API server", long_about = None)] pub struct Args { @@ -142,7 +142,7 @@ pub fn print_hints(args: &Args) { ); } -/// Main entry point for `run-server` sub command. +/// Main entry point for `server run` sub command. /// /// # Errors /// diff --git a/src/server/schema.rs b/src/server/schema.rs index 68f9689c..1ab8cf24 100644 --- a/src/server/schema.rs +++ b/src/server/schema.rs @@ -28,7 +28,7 @@ impl Args { } } -/// Main entry point for `run-server` sub command. +/// Main entry point for `server run` sub command. /// /// # Errors ///