Skip to content

Commit

Permalink
feat: transcript database building (#1) (#8)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Mar 14, 2023
1 parent 6400ed1 commit bcff954
Show file tree
Hide file tree
Showing 20 changed files with 8,278 additions and 10 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tests/data/db/create/txs/latest/** filter=lfs diff=lfs merge=lfs -text
38 changes: 38 additions & 0 deletions .github/actions/install-flatbuffers/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Composite action that makes the flatbuffers compiler (`flatc`) available to
# the calling job. The installation lives in ~/.local/share/flatbuffers, is
# cached between runs, and is built from source (tag v22.12.06) on a cache miss.
name: install-flatbuffers
description: Install flatbuffers

runs:
using: "composite"
steps:
# Try to restore a previous installation. The key depends only on the runner
# OS and the cache name below.
- name: Cache flatbuffers installation
id: cache-flatbuffers-installation
uses: actions/cache@v3
env:
cache-name: cache-install-flatbuffers
with:
path: ~/.local/share/flatbuffers
key: ${{ runner.os }}-build-${{ env.cache-name }}
restore-keys: |
${{ runner.os }}-build-
${{ runner.os }}-
# Build and install from source only when the cache step above did not report
# an exact hit.
- if: ${{ steps. cache-flatbuffers-installation.outputs.cache-hit != 'true' }}
name: Install flatbuffers
shell: bash
run: |
mkdir -p utils/var
cd utils/var
git clone https://github.com/google/flatbuffers.git
cd flatbuffers
git checkout v22.12.06
cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=$HOME/.local/share/flatbuffers
make
./flattests
sudo make install
export PATH=$PATH:$HOME/.local/share/flatbuffers/bin
flatc --version
# This step has no `if:` condition, so it runs whether the installation came
# from the cache or from a fresh build. It appends the bin directory to
# $GITHUB_PATH.
- name: Make flatc available in PATH
shell: bash
run: |
echo "$HOME/.local/share/flatbuffers/bin" >> $GITHUB_PATH
11 changes: 11 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,13 @@ jobs:
override: true
components: rustfmt

- name: Install flatbuffers
uses: ./.github/actions/install-flatbuffers

- name: Check format
run: |
flatc -o target/flatbuffers --rust src/world.fbs
rustfmt target/flatbuffers/world_generated.rs
cargo fmt -- --check
Linting:
Expand All @@ -41,6 +46,9 @@ jobs:
override: true
components: clippy

- name: Install flatbuffers
uses: ./.github/actions/install-flatbuffers

# - uses: Swatinem/[email protected]
# Enable caching of the 'librocksdb-sys' crate by additionally caching the
# 'librocksdb-sys' src directory which is managed by cargo
Expand Down Expand Up @@ -86,6 +94,9 @@ jobs:
toolchain: stable
override: true

- name: Install flatbuffers
uses: ./.github/actions/install-flatbuffers

# - uses: Swatinem/[email protected]
# Enable caching of the 'librocksdb-sys' crate by additionally caching the
# 'librocksdb-sys' src directory which is managed by cargo
Expand Down
7 changes: 2 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,6 @@

*.lock

## Flatbuffers

# Added by cargo
#
# already existing elements were commented out

#/target
utils/var
13 changes: 12 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,29 @@ name = "mehari"
[dependencies]
anyhow = "1.0.69"
byteorder = "1.4.3"
byte-unit = "4.0.18"
clap = { version = "4.1.8", features = ["derive"] }
clap-verbosity-flag = "2.0.0"
csv = "1.2.0"
hgvs = "0.2.0"
flatbuffers = "23.1.21"
flate2 = "1.0.25"
hgvs = "0.3.1"
lazy_static = "1.4.0"
log = "0.4.17"
noodles = { version = "0.33.0", features = ["vcf", "bcf", "csi", "fasta", "bgzf", "tabix"] }
noodles-util = { version = "0.5.0", features = ["noodles-bcf", "noodles-bgzf", "noodles-vcf", "variant"] }
procfs = "0.15.1"
rocksdb = "0.20.1"
seqrepo = "0.2.3"
serde = { version = "1.0.152", features = ["derive"] }
serde_json = "1.0.94"
tracing = { version = "0.1.37", features = ["log"] }
tracing-subscriber = "0.3.16"
indicatif = "0.17.3"
thousands = "0.2.0"

[build-dependencies]
flatc-rust = "0.2.0"

[dev-dependencies]
pretty_assertions = "1.3.0"
Expand Down
54 changes: 54 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,57 @@ prepare \
$base/GRCh38/gnomAD_genomes/r3.1.1/download/gnomad.genomes.r3.1.1.sites.chrY.vcf.bgz \
tests/data/db/create/seqvar_freqs/xy-38/gnomad.genomes.r3.1.1.sites.chrY.vcf
```

## Building the Transcript (tx) Database


```
cd hgvs-rs-data
seqrepo --root-directory seqrepo-data/master init
mkdir -p mirror/ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot
cd !$
wget https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.files.installed
parallel -j 16 'wget https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/{}' ::: $(cut -f 2 human.files.installed | grep fna)
cd -
mkdir -p mirror/ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/cdna
cd !$
wget https://ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
cd -
mkdir -p mirror/ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/ncrna
cd !$
wget https://ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz
cd -
mkdir -p mirror/ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/cdna/
cd !$
wget https://ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh37.cdna.all.fa.gz
cd -
mkdir -p mirror/ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/ncrna/
cd !$
wget https://ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/ncrna/Homo_sapiens.GRCh37.ncrna.fa.gz
cd -
seqrepo --root-directory seqrepo-data/master load -n NCBI $(find mirror/ftp.ncbi.nih.gov -name '*.fna.gz' | sort)
seqrepo --root-directory seqrepo-data/master load -n ENSEMBL $(find mirror/ftp.ensembl.org -name '*.fa.gz' | sort)
cd ../mehari
cargo run --release -- \
-v \
db create txs \
--path-out /tmp/txs-out.bin \
--path-cdot-json ../cdot-0.2.12.ensembl.grch37_grch38.json.gz \
--path-cdot-json ../cdot-0.2.12.refseq.grch37_grch38.json.gz \
--path-seqrepo-instance ../hgvs-rs-data/seqrepo-data/master/master
```

## Development Setup

You will need a recent version of flatbuffers, e.g.:

```
# bash utils/install-flatbuffers.sh
# export PATH=$PATH:$HOME/.local/share/flatbuffers/bin
```
13 changes: 13 additions & 0 deletions build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// The custom build script, needed as we use flatbuffers.

use std::path::Path;

/// Build script entry point.
///
/// Tells cargo to re-run this script whenever the flatbuffers schema changes
/// and invokes `flatc` (through the `flatc_rust` crate) to generate code for
/// that schema into `target/flatbuffers/`.
///
/// # Panics
///
/// Panics with the message `flatc` if running the compiler fails.
fn main() {
    let schema = Path::new("src/world.fbs");
    let generated_dir = Path::new("target/flatbuffers/");

    println!("cargo:rerun-if-changed=src/world.fbs");

    // All remaining `flatc` options stay at their defaults.
    let schemas = [schema];
    let flatc_args = flatc_rust::Args {
        inputs: &schemas,
        out_dir: generated_dir,
        ..Default::default()
    };
    flatc_rust::run(flatc_args).expect("flatc");
}
11 changes: 11 additions & 0 deletions src/common.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! Commonly used code.
use byte_unit::Byte;
use clap::Parser;
use clap_verbosity_flag::{InfoLevel, Verbosity};

Expand All @@ -10,3 +11,13 @@ pub struct Args {
#[clap(flatten)]
pub verbose: Verbosity<InfoLevel>,
}

/// Helper to print the current memory resident set size via `tracing`.
///
/// Reads the resident page count of the current process through `procfs`,
/// converts it to bytes using the system page size, and emits the value
/// (scaled to an appropriate unit) as a `debug` event.
///
/// This is a diagnostics helper only: if the process information cannot be
/// read, a `warn` event is emitted and the function returns instead of
/// panicking.
pub fn trace_rss_now() {
    let stat = procfs::process::Process::myself().and_then(|me| me.stat());
    match stat {
        Ok(stat) => {
            // `rss` is counted in pages; widen both operands before multiplying
            // so the conversion to bytes cannot overflow the narrower type.
            let rss_bytes = u128::from(stat.rss) * u128::from(procfs::page_size());
            tracing::debug!(
                "RSS now: {}",
                Byte::from_bytes(rss_bytes).get_appropriate_unit(true)
            );
        }
        Err(e) => tracing::warn!("could not determine RSS: {}", e),
    }
}
Loading

0 comments on commit bcff954

Please sign in to comment.