Skip to content

Commit

Permalink
feat: switch from flatbuffers to protobuf (#15) (#57)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Apr 24, 2023
1 parent 02f42f7 commit b75e5e7
Show file tree
Hide file tree
Showing 23 changed files with 459 additions and 622 deletions.
38 changes: 0 additions & 38 deletions .github/actions/install-flatbuffers/action.yml

This file was deleted.

7 changes: 3 additions & 4 deletions .github/workflows/release-please.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,8 @@ jobs:
- uses: actions/checkout@v2
if: ${{ steps.release.outputs.release_created }}

- name: Install flatbuffers
uses: ./.github/actions/install-flatbuffers
if: ${{ steps.release.outputs.release_created }}
- name: Setup protoc
uses: arduino/setup-protoc@v1

- name: Install stable toolchain
uses: actions-rs/toolchain@v1
Expand All @@ -30,7 +29,7 @@ jobs:
toolchain: stable
override: true

- uses: Swatinem/rust-cache@v1.3.0
- uses: Swatinem/rust-cache@v2
if: ${{ steps.release.outputs.release_created }}

- name: Publish crate
Expand Down
15 changes: 5 additions & 10 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,8 @@ jobs:
override: true
components: rustfmt

- name: Install flatbuffers
uses: ./.github/actions/install-flatbuffers

- name: Check format
run: |
flatc -o target/flatbuffers --rust src/world.fbs
rustfmt target/flatbuffers/world_generated.rs
run:
cargo fmt -- --check

Linting:
Expand All @@ -46,8 +41,8 @@ jobs:
override: true
components: clippy

- name: Install flatbuffers
uses: ./.github/actions/install-flatbuffers
- name: Setup protoc
uses: arduino/setup-protoc@v1

- uses: Swatinem/rust-cache@v2

Expand Down Expand Up @@ -95,8 +90,8 @@ jobs:
toolchain: stable
override: true

- name: Install flatbuffers
uses: ./.github/actions/install-flatbuffers
- name: Setup protoc
uses: arduino/setup-protoc@v1

- uses: Swatinem/rust-cache@v2

Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ perf.*

*.lock

## Flatbuffers
## Protocolbuffers library

utils/var
5 changes: 3 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ byte-unit = "4.0.18"
clap = { version = "4.1.8", features = ["derive"] }
clap-verbosity-flag = "2.0.0"
csv = "1.2.0"
flatbuffers = "23.1.21"
flate2 = "1.0.25"
#hgvs = "0.6.2"
hgvs = { path = "/home/holtgrem_c/Development/hgvs-rs" }
Expand Down Expand Up @@ -61,9 +60,11 @@ jsonl = "4.0.1"
chrono = "0.4.24"
rand_core = "0.6.4"
rand = "0.8.5"
prost = "0.11.9"
zstd = "0.12.3"

[build-dependencies]
flatc-rust = "0.2.0"
prost-build = "0.11.9"

[dev-dependencies]
csv = "1.2.0"
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -173,9 +173,9 @@ cargo run --release -- \

## Development Setup

You will need a recent version of flatbuffers, e.g.:
You will need a recent version of protoc, e.g.:

```
# bash utils/install-flatbuffers.sh
# export PATH=$PATH:$HOME/.local/share/flatbuffers/bin
# bash utils/install-protoc.sh
# export PATH=$PATH:$HOME/.local/share/protoc/bin
```
13 changes: 3 additions & 10 deletions build.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,6 @@
// The custom build script, needed as we use flatbuffers.

use std::path::Path;
// The custom build script, needed as we use prost.

fn main() {
println!("cargo:rerun-if-changed=src/world.fbs");
flatc_rust::run(flatc_rust::Args {
inputs: &[Path::new("src/world.fbs")],
out_dir: Path::new("target/flatbuffers/"),
..Default::default()
})
.expect("flatc");
println!("cargo:rerun-if-changed=src/db/create/txs/data.proto3");
prost_build::compile_protos(&["src/db/create/txs/data.proto3"], &["src/"]).unwrap();
}
4 changes: 3 additions & 1 deletion docs/db_build.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ This will be OK as there will be a more recent version available.

## Building Transcript Database

You can build the transcript database flatbuffers binary using the following command:
You can build the transcript database Protocol Buffers binary using the following command:

```text
$ mehari db create txs \
Expand All @@ -158,6 +158,8 @@ $ mehari db create txs \
You will have to build the transcript database for each genome release that you want and manually specify the release to `--genome-release`.
For GRCh38, simply use `--genome-release grch38`.

You can enable compression by using the suffix `.gz` for gzip compression and `.zstd` for zstandard compression.

# Building ClinVar Database

This assumes that you have converted a recent ClinVar XML file to TSV using [clinvar-tsv](https://github.com/bihealth/clinvar-tsv).
Expand Down
2 changes: 1 addition & 1 deletion docs/implementation_notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,6 @@ See the data structures in [`crate::db::create::seqvar_freqs::serialized`] for t

## Transcript Databases

* Transcript databases are stored as [Flatbuffers](https://github.com/google/flatbuffers).
* Transcript databases are stored as [Protocol Buffers](https://protobuf.dev/).
* Array-backed interval trees from [rust-bio](https://github.com/rust-bio/rust-bio) are used for fast lookup from chromosomal coordinate to transcript.
* Transcripts are taken from [cdot](https://github.com/SACGF/cdot).
48 changes: 31 additions & 17 deletions src/annotate/seqvars/csq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,10 @@ impl ConsequencePredictor {

// Skip transcripts that are protein coding but do not have a CDS.
// TODO: do not include such transcripts when building the database.
if tx.biotype == TranscriptBiotype::Coding && tx.start_codon.is_none() {
if TranscriptBiotype::from_i32(tx.biotype).expect("invalid tx biotype")
== TranscriptBiotype::Coding
&& tx.start_codon.is_none()
{
return Ok(None);
}

Expand Down Expand Up @@ -213,16 +216,26 @@ impl ConsequencePredictor {
if var_start <= exon_start && var_end >= exon_end {
consequences.push(Consequence::ExonLossVariant);
if var_start < exon_start {
if alignment.strand == Strand::Plus && rank.ord != 1 {
if Strand::from_i32(alignment.strand).expect("invalid strand") == Strand::Plus
&& rank.ord != 1
{
consequences.push(Consequence::SpliceAcceptorVariant);
} else if alignment.strand == Strand::Minus && rank.ord != rank.total {
} else if Strand::from_i32(alignment.strand).expect("invalid strand")
== Strand::Minus
&& rank.ord != rank.total
{
consequences.push(Consequence::SpliceDonorVariant);
}
}
if var_end > exon_end {
if alignment.strand == Strand::Plus && rank.ord != rank.total {
if Strand::from_i32(alignment.strand).expect("invalid strand") == Strand::Plus
&& rank.ord != rank.total
{
consequences.push(Consequence::SpliceDonorVariant);
} else if alignment.strand == Strand::Minus && rank.ord != rank.total {
} else if Strand::from_i32(alignment.strand).expect("invalid strand")
== Strand::Minus
&& rank.ord != rank.total
{
consequences.push(Consequence::SpliceAcceptorVariant);
}
}
Expand All @@ -236,15 +249,15 @@ impl ConsequencePredictor {
// Check the cases where the variant overlaps with the splice acceptor/donor site.
if var_start < intron_start + 2 && var_end > intron_start - ins_shift {
// Left side, is acceptor/donor depending on transcript's strand.
match alignment.strand {
match Strand::from_i32(alignment.strand).expect("invalid strand") {
Strand::Plus => consequences.push(Consequence::SpliceDonorVariant),
Strand::Minus => consequences.push(Consequence::SpliceAcceptorVariant),
}
}
// Check the case where the variant overlaps with the splice acceptor/donor site at the intron end.
if var_start < intron_end + ins_shift && var_end > intron_end - 2 {
// Right side, is acceptor/donor depending on transcript's strand.
match alignment.strand {
match Strand::from_i32(alignment.strand).expect("invalid strand") {
Strand::Plus => consequences.push(Consequence::SpliceAcceptorVariant),
Strand::Minus => consequences.push(Consequence::SpliceDonorVariant),
}
Expand All @@ -260,7 +273,7 @@ impl ConsequencePredictor {
consequences.push(Consequence::SpliceRegionVariant);
}
if var_start < exon_end && var_end > exon_end - 3 {
if alignment.strand == Strand::Plus {
if Strand::from_i32(alignment.strand).expect("invalid strand") == Strand::Plus {
if rank.ord != rank.total {
consequences.push(Consequence::SpliceRegionVariant);
}
Expand All @@ -272,7 +285,7 @@ impl ConsequencePredictor {
}
}
if var_start < exon_start + 3 && var_end > exon_start {
if alignment.strand == Strand::Plus {
if Strand::from_i32(alignment.strand).expect("invalid strand") == Strand::Plus {
if rank.ord != 1 {
consequences.push(Consequence::SpliceRegionVariant);
}
Expand All @@ -294,10 +307,11 @@ impl ConsequencePredictor {
let min_start = min_start.expect("must have seen exon");
let max_end = max_end.expect("must have seen exon");

let feature_biotype = match tx.biotype {
TranscriptBiotype::Coding => FeatureBiotype::Coding,
TranscriptBiotype::NonCoding => FeatureBiotype::Noncoding,
};
let feature_biotype =
match TranscriptBiotype::from_i32(tx.biotype).expect("invalid transcript biotype") {
TranscriptBiotype::Coding => FeatureBiotype::Coding,
TranscriptBiotype::NonCoding => FeatureBiotype::Noncoding,
};

let is_upstream = var_end <= min_start;
let is_downstream = var_start >= max_end;
Expand All @@ -314,7 +328,7 @@ impl ConsequencePredictor {
} else if is_upstream {
let val = -(min_start - var_end);
if val.abs() <= 5_000 {
match alignment.strand {
match Strand::from_i32(alignment.strand).expect("invalid strand") {
Strand::Plus => consequences.push(Consequence::UpstreamGeneVariant),
Strand::Minus => consequences.push(Consequence::DownstreamGeneVariant),
}
Expand All @@ -323,7 +337,7 @@ impl ConsequencePredictor {
} else if is_downstream {
let val = var_start - max_end;
if val.abs() <= 5_000 {
match alignment.strand {
match Strand::from_i32(alignment.strand).expect("invalid strand") {
Strand::Plus => consequences.push(Consequence::DownstreamGeneVariant),
Strand::Minus => consequences.push(Consequence::UpstreamGeneVariant),
}
Expand Down Expand Up @@ -689,7 +703,7 @@ mod test {
#[test]
fn annotate_snv_brca1_one_variant() -> Result<(), anyhow::Error> {
let tx_path = "tests/data/annotate/db/seqvars/grch37/txs.bin";
let tx_db = load_tx_db(tx_path, 5_000_000)?;
let tx_db = load_tx_db(tx_path)?;
let provider = Rc::new(MehariProvider::new(tx_db, Assembly::Grch37p10));

let predictor = ConsequencePredictor::new(provider, Assembly::Grch37p10);
Expand Down Expand Up @@ -816,7 +830,7 @@ mod test {

fn annotate_vars(path_tsv: &str, txs: &[String]) -> Result<(), anyhow::Error> {
let tx_path = "tests/data/annotate/db/seqvars/grch37/txs.bin";
let tx_db = load_tx_db(tx_path, 5_000_000)?;
let tx_db = load_tx_db(tx_path)?;
let provider = Rc::new(MehariProvider::new(tx_db, Assembly::Grch37p10));
let predictor = ConsequencePredictor::new(provider, Assembly::Grch37p10);

Expand Down
Loading

0 comments on commit b75e5e7

Please sign in to comment.