diff --git a/.github/actions/install-flatbuffers/action.yml b/.github/actions/install-flatbuffers/action.yml deleted file mode 100644 index ccfba173..00000000 --- a/.github/actions/install-flatbuffers/action.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: install-flatbuffers -description: Install flatbuffers - -runs: - using: "composite" - steps: - - name: Cache flatbuffers installation - id: cache-flatbuffers-installation - uses: actions/cache@v3 - env: - cache-name: cache-install-flatbuffers - with: - path: ~/.local/share/flatbuffers - key: ${{ runner.os }}-build-${{ env.cache-name }} - restore-keys: | - ${{ runner.os }}-build- - ${{ runner.os }}- - - - if: ${{ steps. cache-flatbuffers-installation.outputs.cache-hit != 'true' }} - name: Install flatbuffers - shell: bash - run: | - mkdir -p utils/var - cd utils/var - git clone https://github.com/google/flatbuffers.git - cd flatbuffers - git checkout v22.12.06 - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=$HOME/.local/share/flatbuffers - make - ./flattests - sudo make install - export PATH=$PATH:$HOME/.local/share/flatbuffers/bin - flatc --version - - - name: Make flatc available in PATH - shell: bash - run: | - echo "$HOME/.local/share/flatbuffers/bin" >> $GITHUB_PATH diff --git a/.github/workflows/release-please.yml b/.github/workflows/release-please.yml index 4d7d8e2c..42a46248 100644 --- a/.github/workflows/release-please.yml +++ b/.github/workflows/release-please.yml @@ -19,9 +19,8 @@ jobs: - uses: actions/checkout@v2 if: ${{ steps.release.outputs.release_created }} - - name: Install flatbuffers - uses: ./.github/actions/install-flatbuffers - if: ${{ steps.release.outputs.release_created }} + - name: Setup protoc + uses: arduino/setup-protoc@v1.1.2 - name: Install stable toolchain uses: actions-rs/toolchain@v1 @@ -30,7 +29,7 @@ jobs: toolchain: stable override: true - - uses: Swatinem/rust-cache@v1.3.0 + - uses: Swatinem/rust-cache@v2 if: ${{ steps.release.outputs.release_created }} - name: Publish crate diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index e8a5c3ac..a3875717 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -22,13 +22,8 @@ jobs: override: true components: rustfmt - - name: Install flatbuffers - uses: ./.github/actions/install-flatbuffers - - name: Check format - run: | - flatc -o target/flatbuffers --rust src/world.fbs - rustfmt target/flatbuffers/world_generated.rs + run: cargo fmt -- --check Linting: @@ -46,8 +41,8 @@ jobs: override: true components: clippy - - name: Install flatbuffers - uses: ./.github/actions/install-flatbuffers + - name: Setup protoc + uses: arduino/setup-protoc@v1.1.2 - uses: Swatinem/rust-cache@v2 @@ -95,8 +90,8 @@ jobs: toolchain: stable override: true - - name: Install flatbuffers - uses: ./.github/actions/install-flatbuffers + - name: Setup protoc + uses: arduino/setup-protoc@v1.1.2 - uses: Swatinem/rust-cache@v2 diff --git a/.gitignore b/.gitignore index df61dff1..66e06f22 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,6 @@ perf.* *.lock -## Flatbuffers +## Protocolbuffers library utils/var diff --git a/Cargo.toml b/Cargo.toml index 45be6071..73a0b6fb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,7 +27,6 @@ byte-unit = "4.0.18" clap = { version = "4.1.8", features = ["derive"] } clap-verbosity-flag = "2.0.0" csv = "1.2.0" -flatbuffers = "23.1.21" flate2 = "1.0.25" hgvs = "0.6.2" lazy_static = "1.4.0" @@ -60,9 +59,11 @@ jsonl = "4.0.1" chrono = "0.4.24" rand_core = "0.6.4" rand = "0.8.5" +prost = "0.11.9" +zstd = "0.12.3" [build-dependencies] -flatc-rust = "0.2.0" +prost-build = "0.11.9" [dev-dependencies] csv = "1.2.0" diff --git a/README.md b/README.md index f9243fc6..1790bf18 100644 --- a/README.md +++ b/README.md @@ -173,9 +173,9 @@ cargo run --release -- \ ## Development Setup -You will need a recent version of flatbuffers, e.g.: +You will need a recent version of protoc, e.g.: ``` -# bash utils/install-flatbuffers.sh -# export PATH=$PATH:$HOME/.local/share/flatbuffers/bin +# bash utils/install-protoc.sh +# export PATH=$PATH:$HOME/.local/share/protoc/bin ``` diff --git a/build.rs b/build.rs index d7bed186..7e96650c 100644 --- a/build.rs +++ b/build.rs @@ -1,13 +1,6 @@ -// The custom build script, needed as we use flatbuffers. - -use std::path::Path; +// The custom build script, needed as we use prost. fn main() { - println!("cargo:rerun-if-changed=src/world.fbs"); - flatc_rust::run(flatc_rust::Args { - inputs: &[Path::new("src/world.fbs")], - out_dir: Path::new("target/flatbuffers/"), - ..Default::default() - }) - .expect("flatc"); + println!("cargo:rerun-if-changed=src/db/create/txs/data.proto3"); + prost_build::compile_protos(&["src/db/create/txs/data.proto3"], &["src/"]).unwrap(); } diff --git a/docs/db_build.md b/docs/db_build.md index 442828a7..01153dc6 100644 --- a/docs/db_build.md +++ b/docs/db_build.md @@ -139,7 +139,7 @@ This will be OK as there will be a more recent version available. ## Building Transcript Database -You can build the transcript database flatbuffers binary using the following command: +You can build the transcript database protocolbuffers binary using the following command: ```text $ mehari db create txs \ diff --git a/docs/implementation_notes.md b/docs/implementation_notes.md index daa8684e..01362482 100644 --- a/docs/implementation_notes.md +++ b/docs/implementation_notes.md @@ -19,6 +19,6 @@ See the data structures in [`crate::db::create::seqvar_freqs::serialized`] for t ## Transcript Databases -* Transcript databases are stored as [Flatbuffers](https://github.com/google/flatbuffers). +* Transcript databases are stored as [ProtocolBuffers](https://protobuf.dev/). * Array-backed interval trees from [rust-bio](https://github.com/rust-bio/rust-bio) are used for fast lookup from chromosomal coordinate to transcript. * Transcripts are taken from [cdot](https://github.com/SACGF/cdot). diff --git a/src/annotate/seqvars/csq.rs b/src/annotate/seqvars/csq.rs index 677c2b75..cebb2cc2 100644 --- a/src/annotate/seqvars/csq.rs +++ b/src/annotate/seqvars/csq.rs @@ -138,7 +138,10 @@ impl ConsequencePredictor { // Skip transcripts that are protein coding but do not have a CDS. // TODO: do not include such transcripts when building the database. - if tx.biotype == TranscriptBiotype::Coding && tx.start_codon.is_none() { + if TranscriptBiotype::from_i32(tx.biotype).expect("invalid tx biotype") + == TranscriptBiotype::Coding + && tx.start_codon.is_none() + { return Ok(None); } @@ -213,16 +216,26 @@ impl ConsequencePredictor { if var_start <= exon_start && var_end >= exon_end { consequences.push(Consequence::ExonLossVariant); if var_start < exon_start { - if alignment.strand == Strand::Plus && rank.ord != 1 { + if Strand::from_i32(alignment.strand).expect("invalid strand") == Strand::Plus + && rank.ord != 1 + { consequences.push(Consequence::SpliceAcceptorVariant); - } else if alignment.strand == Strand::Minus && rank.ord != rank.total { + } else if Strand::from_i32(alignment.strand).expect("invalid strand") + == Strand::Minus + && rank.ord != rank.total + { consequences.push(Consequence::SpliceDonorVariant); } } if var_end > exon_end { - if alignment.strand == Strand::Plus && rank.ord != rank.total { + if Strand::from_i32(alignment.strand).expect("invalid strand") == Strand::Plus + && rank.ord != rank.total + { consequences.push(Consequence::SpliceDonorVariant); - } else if alignment.strand == Strand::Minus && rank.ord != rank.total { + } else if Strand::from_i32(alignment.strand).expect("invalid strand") + == Strand::Minus + && rank.ord != rank.total + { consequences.push(Consequence::SpliceAcceptorVariant); } } @@ -236,7 +249,7 @@ impl ConsequencePredictor { // Check the cases where the variant overlaps with the splice acceptor/donor site. if var_start < intron_start + 2 && var_end > intron_start - ins_shift { // Left side, is acceptor/donor depending on transcript's strand. - match alignment.strand { + match Strand::from_i32(alignment.strand).expect("invalid strand") { Strand::Plus => consequences.push(Consequence::SpliceDonorVariant), Strand::Minus => consequences.push(Consequence::SpliceAcceptorVariant), } @@ -244,7 +257,7 @@ impl ConsequencePredictor { // Check the case where the variant overlaps with the splice donor site. if var_start < intron_end + ins_shift && var_end > intron_end - 2 { // Left side, is acceptor/donor depending on transcript's strand. - match alignment.strand { + match Strand::from_i32(alignment.strand).expect("invalid strand") { Strand::Plus => consequences.push(Consequence::SpliceAcceptorVariant), Strand::Minus => consequences.push(Consequence::SpliceDonorVariant), } @@ -260,7 +273,7 @@ impl ConsequencePredictor { consequences.push(Consequence::SpliceRegionVariant); } if var_start < exon_end && var_end > exon_end - 3 { - if alignment.strand == Strand::Plus { + if Strand::from_i32(alignment.strand).expect("invalid strand") == Strand::Plus { if rank.ord != rank.total { consequences.push(Consequence::SpliceRegionVariant); } @@ -272,7 +285,7 @@ impl ConsequencePredictor { } } if var_start < exon_start + 3 && var_end > exon_start { - if alignment.strand == Strand::Plus { + if Strand::from_i32(alignment.strand).expect("invalid strand") == Strand::Plus { if rank.ord != 1 { consequences.push(Consequence::SpliceRegionVariant); } @@ -294,10 +307,11 @@ impl ConsequencePredictor { let min_start = min_start.expect("must have seen exon"); let max_end = max_end.expect("must have seen exon"); - let feature_biotype = match tx.biotype { - TranscriptBiotype::Coding => FeatureBiotype::Coding, - TranscriptBiotype::NonCoding => FeatureBiotype::Noncoding, - }; + let feature_biotype = + match TranscriptBiotype::from_i32(tx.biotype).expect("invalid transcript biotype") { + TranscriptBiotype::Coding => FeatureBiotype::Coding, + TranscriptBiotype::NonCoding => FeatureBiotype::Noncoding, + }; let is_upstream = var_end <= min_start; let is_downstream = var_start >= max_end; @@ -314,7 +328,7 @@ impl ConsequencePredictor { } else if is_upstream { let val = -(min_start - var_end); if val.abs() <= 5_000 { - match alignment.strand { + match Strand::from_i32(alignment.strand).expect("invalid strand") { Strand::Plus => consequences.push(Consequence::UpstreamGeneVariant), Strand::Minus => consequences.push(Consequence::DownstreamGeneVariant), } @@ -323,7 +337,7 @@ impl ConsequencePredictor { } else if is_downstream { let val = var_start - max_end; if val.abs() <= 5_000 { - match alignment.strand { + match Strand::from_i32(alignment.strand).expect("invalid strand") { Strand::Plus => consequences.push(Consequence::DownstreamGeneVariant), Strand::Minus => consequences.push(Consequence::UpstreamGeneVariant), } @@ -689,7 +703,7 @@ mod test { #[test] fn annotate_snv_brca1_one_variant() -> Result<(), anyhow::Error> { let tx_path = "tests/data/annotate/db/seqvars/grch37/txs.bin"; - let tx_db = load_tx_db(tx_path, 5_000_000)?; + let tx_db = load_tx_db(tx_path)?; let provider = Rc::new(MehariProvider::new(tx_db, Assembly::Grch37p10)); let predictor = ConsequencePredictor::new(provider, Assembly::Grch37p10); @@ -816,7 +830,7 @@ mod test { fn annotate_vars(path_tsv: &str, txs: &[String]) -> Result<(), anyhow::Error> { let tx_path = "tests/data/annotate/db/seqvars/grch37/txs.bin"; - let tx_db = load_tx_db(tx_path, 5_000_000)?; + let tx_db = load_tx_db(tx_path)?; let provider = Rc::new(MehariProvider::new(tx_db, Assembly::Grch37p10)); let predictor = ConsequencePredictor::new(provider, Assembly::Grch37p10); diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index a0e12419..972da3d0 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -5,9 +5,11 @@ pub mod binning; pub mod csq; pub mod provider; +use anyhow::anyhow; +use prost::Message; use std::collections::{HashMap, HashSet}; use std::fs::File; -use std::io::{BufWriter, Write}; +use std::io::{BufWriter, Cursor, Read, Write}; use std::ops::Deref; use std::path::Path; use std::rc::Rc; @@ -15,12 +17,9 @@ use std::str::FromStr; use std::time::Instant; use clap::{Args as ClapArgs, Parser}; -use enumset::EnumSet; -use flatbuffers::VerifierOptions; use flate2::write::GzEncoder; use flate2::Compression; use hgvs::static_data::Assembly; -use memmap2::Mmap; use noodles::bgzf::Writer as BgzfWriter; use noodles::vcf::header::format::key::{ CONDITIONAL_GENOTYPE_QUALITY, GENOTYPE, READ_DEPTH, READ_DEPTHS, @@ -54,15 +53,8 @@ use crate::db::create::seqvar_freqs::serialized::{ auto::Record as AutoRecord, mt::Record as MtRecord, xy::Record as XyRecord, }; -use crate::db::create::txs::data::{ - ExonAlignment, GeneToTxId, GenomeAlignment, GenomeBuild, SequenceDb, Strand, Transcript, - TranscriptBiotype, TranscriptDb, TranscriptTag, TxSeqDatabase, -}; +use crate::db::create::txs::data::TxSeqDatabase; use crate::ped::{PedigreeByName, Sex}; -use crate::world_flatbuffers::mehari::{ - GenomeBuild as FlatGenomeBuild, Strand as FlatStrand, - TranscriptBiotype as FlatTranscriptBiotype, TxSeqDatabase as FlatTxSeqDatabase, -}; use self::ann::{AnnField, Consequence, FeatureBiotype}; @@ -102,10 +94,6 @@ pub struct Args { /// For debug purposes, maximal number of variants to annotate. #[arg(long)] pub max_var_count: Option, - /// Maximal number of flatbuffers tables, should not need tweaking. However, if you see a - /// "too many tables" error output, increase this value. - #[arg(long, default_value_t = 5_000_000)] - pub max_fb_tables: usize, } /// Command line arguments to enforce either `--path-output-vcf` or `--path-output-tsv`. @@ -494,174 +482,33 @@ pub fn path_component(assembly: Assembly) -> &'static str { } } -/// Load flatbuffers transcripts. -pub fn load_tx_db(tx_path: &str, max_fb_tables: usize) -> Result { - let tx_file = File::open(tx_path)?; - let tx_mmap = unsafe { Mmap::map(&tx_file)? }; - let fb_opts = VerifierOptions { - max_tables: max_fb_tables, - ..Default::default() - }; - let fb_tx_db = flatbuffers::root_with_opts::(&fb_opts, &tx_mmap)?; - - let transcripts = fb_tx_db - .tx_db() - .unwrap() - .transcripts() - .unwrap() - .into_iter() - .map(|rec| { - let mut tags = EnumSet::new(); - let flat_tags = rec.biotype().0; - if flat_tags & 1 != 0 { - tags.insert(TranscriptTag::Basic); - } - if flat_tags & 2 != 0 { - tags.insert(TranscriptTag::EnsemblCanonical); - } - if flat_tags & 4 != 0 { - tags.insert(TranscriptTag::ManeSelect); - } - if flat_tags & 8 != 0 { - tags.insert(TranscriptTag::ManePlusClinical); - } - if flat_tags & 16 != 0 { - tags.insert(TranscriptTag::RefSeqSelect); - } - - let genome_alignments = rec - .genome_alignments() - .unwrap() - .iter() - .map(|rec| { - let exons = rec - .exons() - .unwrap() - .iter() - .map(|rec| ExonAlignment { - alt_start_i: rec.alt_start_i(), - alt_end_i: rec.alt_end_i(), - ord: rec.ord(), - alt_cds_start_i: if rec.alt_cds_start_i() >= 0 { - Some(rec.alt_cds_start_i()) - } else { - None - }, - alt_cds_end_i: if rec.alt_cds_end_i() >= 0 { - Some(rec.alt_cds_end_i()) - } else { - None - }, - cigar: rec.cigar().unwrap().to_string(), - }) - .collect(); - - GenomeAlignment { - genome_build: match rec.genome_build() { - FlatGenomeBuild::Grch37 => GenomeBuild::Grch37, - FlatGenomeBuild::Grch38 => GenomeBuild::Grch38, - _ => panic!("Invalid genome build: {:?}", rec.genome_build()), - }, - contig: rec.contig().unwrap().to_string(), - cds_start: if rec.cds_start() >= 0 { - Some(rec.cds_start()) - } else { - None - }, - cds_end: if rec.cds_end() >= 0 { - Some(rec.cds_end()) - } else { - None - }, - strand: match rec.strand() { - FlatStrand::Plus => Strand::Plus, - FlatStrand::Minus => Strand::Minus, - _ => panic!("invalid strand: {:?}", rec.strand()), - }, - exons, - } - }) - .collect(); - - Transcript { - id: rec.id().unwrap().to_string(), - gene_name: rec.gene_name().unwrap().to_string(), - gene_id: rec.gene_id().unwrap().to_string(), - biotype: match rec.biotype() { - FlatTranscriptBiotype::Coding => TranscriptBiotype::Coding, - FlatTranscriptBiotype::NonCoding => TranscriptBiotype::NonCoding, - _ => panic!("Invalid biotype: {:?}", rec.biotype()), - }, - tags, - protein: if rec.protein().is_none() || rec.protein().unwrap().is_empty() { - None - } else { - Some(rec.protein().unwrap().to_string()) - }, - start_codon: if rec.start_codon() >= 0 { - Some(rec.start_codon()) - } else { - None - }, - stop_codon: if rec.stop_codon() >= 0 { - Some(rec.stop_codon()) - } else { - None - }, - genome_alignments, - } - }) - .collect(); - - let gene_to_tx = fb_tx_db - .tx_db() - .unwrap() - .gene_to_tx() - .unwrap() - .into_iter() - .map(|rec| GeneToTxId { - gene_name: rec.gene_name().unwrap().to_string(), - tx_ids: rec - .tx_ids() - .unwrap() - .into_iter() - .map(|s| s.to_string()) - .collect(), - }) - .collect(); - - let tx_db = TranscriptDb { - transcripts, - gene_to_tx, - }; - - let seq_db = SequenceDb { - aliases: fb_tx_db - .seq_db() - .unwrap() - .aliases() - .unwrap() - .into_iter() - .map(|alias| alias.to_string()) - .collect(), - aliases_idx: fb_tx_db - .seq_db() - .unwrap() - .aliases_idx() - .unwrap() - .into_iter() - .collect(), - seqs: fb_tx_db - .seq_db() - .unwrap() - .seqs() - .unwrap() - .into_iter() - .map(|seq| seq.to_string()) - .collect(), +/// Load protobuf transcripts. +pub fn load_tx_db(tx_path: &str) -> Result { + // Open file and if necessary, wrap in a decompressor. + let file = std::fs::File::open(tx_path) + .map_err(|e| anyhow!("failed to open file {}: {}", tx_path, e))?; + let mut reader: Box = if tx_path.ends_with(".gz") { + Box::new(flate2::read::MultiGzDecoder::new(file)) + } else if tx_path.ends_with(".zstd") { + Box::new( + zstd::Decoder::new(file) + .map_err(|e| anyhow!("failed to open zstd decoder for {}: {}", tx_path, e))?, + ) + } else { + Box::new(file) }; - Ok(TxSeqDatabase { tx_db, seq_db }) + // Now read the whole file into a byte buffer. + let metadata = std::fs::metadata(&tx_path) + .map_err(|e| anyhow!("failed to get metadata for {}: {}", tx_path, e))?; + let mut buffer = vec![0; metadata.len() as usize]; + reader + .read(&mut buffer) + .map_err(|e| anyhow!("failed to read file {}: {}", tx_path, e))?; + + // Deserialize the buffer with prost. + TxSeqDatabase::decode(&mut Cursor::new(buffer)) + .map_err(|e| anyhow!("failed to decode protobuf file {}: {}", tx_path, e)) } /// Mehari-local trait for writing out annotated VCF records as VCF or VarFish TSV. @@ -1577,14 +1424,11 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<( // Open the serialized transcripts. tracing::info!("Opening transcript database"); - let tx_db = load_tx_db( - &format!( - "{}/seqvars/{}/txs.bin", - &args.path_db, - path_component(assembly) - ), - args.max_fb_tables, - )?; + let tx_db = load_tx_db(&format!( + "{}/seqvars/{}/txs.bin", + &args.path_db, + path_component(assembly) + ))?; tracing::info!("Building transcript interval trees ..."); let provider = Rc::new(MehariProvider::new(tx_db, assembly)); let predictor = ConsequencePredictor::new(provider, assembly); @@ -1771,7 +1615,6 @@ mod test { path_output_tsv: None, }, max_var_count: None, - max_fb_tables: 5_000_000, path_input_ped: Some(String::from( "tests/data/db/create/seqvar_freqs/db-rs1263393206/input.ped", )), @@ -1807,7 +1650,6 @@ mod test { path_output_tsv: Some(path_out.into_os_string().into_string().unwrap()), }, max_var_count: None, - max_fb_tables: 5_000_000, path_input_ped: Some(String::from( "tests/data/db/create/seqvar_freqs/db-rs1263393206/input.ped", )), diff --git a/src/annotate/seqvars/provider.rs b/src/annotate/seqvars/provider.rs index 6cd1674e..37981dd0 100644 --- a/src/annotate/seqvars/provider.rs +++ b/src/annotate/seqvars/provider.rs @@ -1,4 +1,4 @@ -//! Implementation of `hgvs` Provider interface based on flatbuffers. +//! Implementation of `hgvs` Provider interface based on protobuf. use std::collections::HashMap; @@ -63,7 +63,7 @@ impl TxIntervalTrees { } }); - for (tx_id, tx) in db.tx_db.transcripts.iter().enumerate() { + for (tx_id, tx) in db.tx_db.expect("no tx_db?").transcripts.iter().enumerate() { for genome_alignment in &tx.genome_alignments { let contig = &genome_alignment.contig; if let Some(contig_idx) = contig_to_idx.get(contig) { @@ -100,6 +100,7 @@ impl MehariProvider { let tx_map = HashMap::from_iter( tx_seq_db .tx_db + .expect("no tx_db?") .transcripts .iter() .enumerate() @@ -108,9 +109,10 @@ impl MehariProvider { let seq_map = HashMap::from_iter( tx_seq_db .seq_db + .expect("no seq_db?") .aliases .iter() - .zip(tx_seq_db.seq_db.aliases_idx.iter()) + .zip(tx_seq_db.seq_db.expect("no seq_db?").aliases_idx.iter()) .map(|(alias, idx)| (alias.clone(), *idx)), ); @@ -125,7 +127,7 @@ impl MehariProvider { pub fn get_tx(&self, tx_id: &str) -> Option { self.tx_map .get(tx_id) - .map(|idx| self.tx_seq_db.tx_db.transcripts[*idx as usize].clone()) + .map(|idx| self.tx_seq_db.tx_db.expect("no tx_db?").transcripts[*idx as usize].clone()) } } @@ -163,7 +165,7 @@ impl ProviderInterface for MehariProvider { .get(tx_ac) .ok_or(anyhow::anyhow!("Could not find transcript {}", tx_ac))?; let tx_idx = tx_idx as usize; - let tx = &self.tx_seq_db.tx_db.transcripts[tx_idx]; + let tx = &self.tx_seq_db.tx_db.expect("no tx_db?").transcripts[tx_idx]; Ok(tx.protein.clone()) } @@ -179,7 +181,7 @@ impl ProviderInterface for MehariProvider { .ok_or(anyhow::anyhow!("Sequence for {:?} not found", ac))?; let seq_idx = seq_idx as usize; - let seq = &self.tx_seq_db.seq_db.seqs[seq_idx]; + let seq = &self.tx_seq_db.seq_db.expect("no seq_db?").seqs[seq_idx]; match (begin, end) { (Some(begin), Some(end)) => { let begin = std::cmp::min(begin, seq.len()); @@ -218,7 +220,7 @@ impl ProviderInterface for MehariProvider { .ok_or(anyhow::anyhow!("Could not find transcript {}", tx_ac))?; let tx_idx = tx_idx as usize; - let tx = &self.tx_seq_db.tx_db.transcripts[tx_idx]; + let tx = &self.tx_seq_db.tx_db.expect("no tx_db?").transcripts[tx_idx]; for genome_alignment in &tx.genome_alignments { if genome_alignment.contig == alt_ac { return Ok(genome_alignment @@ -229,7 +231,7 @@ impl ProviderInterface for MehariProvider { tx_ac: tx_ac.to_string(), alt_ac: alt_ac.to_string(), alt_aln_method: ALT_ALN_METHOD.to_string(), - alt_strand: match genome_alignment.strand { + alt_strand: match Strand::from_i32(genome_alignment.strand).expect("invalid strand") { Strand::Plus => 1, Strand::Minus => -1, }, @@ -280,7 +282,8 @@ impl ProviderInterface for MehariProvider { Ok(tx_idxs .iter() .map(|entry| { - let tx = &self.tx_seq_db.tx_db.transcripts[*entry.data() as usize]; + let tx = &self.tx_seq_db.tx_db .expect("no tx_db?") + .transcripts[*entry.data() as usize]; assert_eq!( tx.genome_alignments.len(), 1, @@ -290,7 +293,7 @@ impl ProviderInterface for MehariProvider { TxForRegionRecord { tx_ac: tx.id.clone(), alt_ac: alt_ac.to_string(), - alt_strand: match alt_strand { + alt_strand: match Strand::from_i32(alt_strand).expect("invalid strand") { Strand::Plus => 1, Strand::Minus => -1, }, @@ -308,7 +311,7 @@ impl ProviderInterface for MehariProvider { .get(tx_ac) .ok_or(anyhow::anyhow!("Could not find transcript {}", tx_ac))?; let tx_idx = tx_idx as usize; - let tx = &self.tx_seq_db.tx_db.transcripts[tx_idx]; + let tx = &self.tx_seq_db.tx_db.expect("no tx_db?").transcripts[tx_idx]; let hgnc = tx.gene_name.clone(); @@ -352,7 +355,7 @@ impl ProviderInterface for MehariProvider { .get(tx_ac) .ok_or(anyhow::anyhow!("Could not find transcript {}", tx_ac))?; let tx_idx = tx_idx as usize; - let tx = &self.tx_seq_db.tx_db.transcripts[tx_idx]; + let tx = &self.tx_seq_db.tx_db.expect("no tx_db?").transcripts[tx_idx]; for genome_alignment in &tx.genome_alignments { if genome_alignment.contig == alt_ac { @@ -384,7 +387,7 @@ impl ProviderInterface for MehariProvider { .ok_or(anyhow::anyhow!("Could not find transcript {}", tx_ac))?; let tx_idx = tx_idx as usize; - let tx = &self.tx_seq_db.tx_db.transcripts[tx_idx]; + let tx = &self.tx_seq_db.tx_db.expect("no tx_db?").transcripts[tx_idx]; let genome_alignment = tx.genome_alignments.first().unwrap(); Ok(vec![TxMappingOptionsRecord { diff --git a/src/annotate/strucvars/mod.rs b/src/annotate/strucvars/mod.rs index 37585bad..61faaf22 100644 --- a/src/annotate/strucvars/mod.rs +++ b/src/annotate/strucvars/mod.rs @@ -69,10 +69,6 @@ pub struct Args { /// For debug purposes, maximal number of variants to annotate. #[arg(long)] pub max_var_count: Option, - /// Maximal number of flatbuffers tables, should not need tweaking. However, if you see a - /// "too many tables" error output, increase this value. - #[arg(long, default_value_t = 5_000_000)] - pub max_fb_tables: usize, /// Paths to the per-sample VCF files with coverage and mapping quality information as /// generated by maelstrom-core. @@ -3493,7 +3489,6 @@ mod test { } }, max_var_count: None, - max_fb_tables: 5_000_000, path_cov_vcf: vec![String::from( "tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz", )], diff --git a/src/db/create/seqvar_clinvar.rs b/src/db/create/seqvar_clinvar.rs index 6424d510..51c35eeb 100644 --- a/src/db/create/seqvar_clinvar.rs +++ b/src/db/create/seqvar_clinvar.rs @@ -9,7 +9,7 @@ use std::{ use bgzip::BGZFReader; use clap::Parser; use hgvs::static_data::Assembly; -use rocksdb::{DBWithThreadMode, SingleThreaded, UniversalCompactOptions}; +use rocksdb::{DBWithThreadMode, SingleThreaded}; use serde::{Deserialize, Serialize}; use thousands::Separable; diff --git a/src/db/create/txs/data.proto3 b/src/db/create/txs/data.proto3 new file mode 100644 index 00000000..13aa2ed1 --- /dev/null +++ b/src/db/create/txs/data.proto3 @@ -0,0 +1,123 @@ +syntax = "proto3"; + +package mehari.db.create.txs.data; + +// Stores long array of sequences with an "index" of sequence names to their +// index. +// +// The fields `aliases` and `aliases_idx` have the same length and `aliases_idx[i]` +// stores the index into `seqs` for the sequence `aliases[i]`. In other words. +// `seqs[aliases_idx[i]]` stores the sequence for `aliases[i]`. +message SequenceDb { + // The sequence aliases, cf. `aliases_idx`. + repeated string aliases = 1; + // The corresponding index in `seqs`, cf. `aliases`. + repeated uint32 aliases_idx = 2; + // The corresponding sequences. + repeated string seqs = 3; +} + +// Mapping from gene to transcript ID. +message GeneToTxId { + // Gene HGNC symbol; serves as gene identifier. + string gene_name = 1; + // Vector of all transcript IDs. + repeated string tx_ids = 2; +} + +// Container for the transcript-related database. +message TranscriptDb { + // Vector of all transcripts. + repeated Transcript transcripts = 1; + // Mapping from gene ID to vector of all transcript IDs. + repeated GeneToTxId gene_to_tx = 2; +} + +// Enumeration for `Transcript::biotype`. +enum TranscriptBiotype { + CODING = 0; + NON_CODING = 1; +} + +// Bit values for the transcript tags. +enum TranscriptTag{ + BASIC = 0; + ENSEMBL_CANONICAL = 1; + MANE_SELECT = 2; + MANE_PLUS_CLINICAL = 3; + REF_SEQ_SEQLECT = 4; +} + +// Store information about a transcript. +message Transcript { + // Transcript accession with version, e.g., `"NM_007294.3"` or `"ENST00000461574.1"` for BRCA1. + string id = 1; + // HGNC symbol, e.g., `"BRCA1"` + string gene_name = 2; + // HGNC gene identifier, e.g., `"1100"` for BRCA1. + string gene_id = 3; + // Transcript biotype. + TranscriptBiotype biotype = 4; + // Transcript flags. + repeated TranscriptTag tags = 5; + // Identifier of the corresponding protein. + optional string protein = 6; + // CDS start codon. + optional int32 start_codon = 7; + // CDS stop codon. + optional int32 stop_codon = 8; + // Alignments on the different genome builds. + repeated GenomeAlignment genome_alignments = 9; +} + +// Enumeration for the known genome builds. +enum GenomeBuild{ + GRCH37 = 0; + GRCH38 = 1; +} + +// Enumeration for the two strands of the genome. +enum Strand{ + PLUS = 0; + MINUS = 1; +} + +// Store information about a transcript aligning to a genome. +message GenomeAlignment { + // The genome build identifier. + GenomeBuild genome_build = 1; + // Accession of the contig sequence. + string contig = 2; + // CDS end position, `-1` to indicate `None`. + optional int32 cds_start = 3; + // CDS end position, `-1` to indicate `None`. + optional int32 cds_end = 4; + // The strand. + Strand strand = 5; + // Exons of the alignment. + repeated ExonAlignment exons = 6; +} + +// Store the alignment of one exon to the reference. +message ExonAlignment { + // Start position on reference. + int32 alt_start_i = 1; + // End position on reference. + int32 alt_end_i = 2; + // Exon number. + int32 ord = 3; + // CDS start coordinate. + optional int32 alt_cds_start_i = 4; + // CDS end coordinate. + optional int32 alt_cds_end_i = 5; + // CIGAR string of alignment, empty indicates full matches. + string cigar = 6; +} + +// Database of transcripts with sequences. +message TxSeqDatabase { + // Store transcripts with their aliases. + TranscriptDb tx_db = 1; + // Store sequence with their aliases. + SequenceDb seq_db = 2; +} diff --git a/src/db/create/txs/data.rs b/src/db/create/txs/data.rs deleted file mode 100644 index e1b524cb..00000000 --- a/src/db/create/txs/data.rs +++ /dev/null @@ -1,135 +0,0 @@ -//! Data structures for storing transcripts after deserializing from flatbuffers. -//! -//! The structure is taken 1:1 from the flatbuffers IDL so in the future it may become more -//! Rust-y if we switch to bincode. - -use enumset::{EnumSet, EnumSetType}; -use serde::{Deserialize, Serialize}; - -// Enumeration for `Transcript::biotype`. -#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] -pub enum TranscriptBiotype { - Coding, - NonCoding, -} - -/// Transcript tag enumeration. -#[derive(EnumSetType, Debug, Serialize, Deserialize)] -pub enum TranscriptTag { - Basic, - EnsemblCanonical, - ManeSelect, - ManePlusClinical, - RefSeqSelect, -} - -/// Enumeration for the known genome builds. -#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] -pub enum GenomeBuild { - Grch37, - Grch38, -} - -/// Enumeration for the two strands of the genome. -#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] -pub enum Strand { - Plus, - Minus, -} - -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct ExonAlignment { - /// Start position on reference. - pub alt_start_i: i32, - /// End position on reference. - pub alt_end_i: i32, - /// Exon number. - pub ord: i32, - /// CDS start coordinate. - pub alt_cds_start_i: Option, - /// CDS end coordinate. - pub alt_cds_end_i: Option, - /// CIGAR string of alignment, empty indicates full matches. - pub cigar: String, -} - -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct GenomeAlignment { - /// The genome build identifier. - pub genome_build: GenomeBuild, - /// Accession of the contig sequence. - pub contig: String, - /// CDS end position, `-1` to indicate `None`. - pub cds_start: Option, - /// CDS end position, `-1` to indicate `None`. - pub cds_end: Option, - /// The strand. - pub strand: Strand, - /// Exons of the alignment. - pub exons: Vec, -} -// Store information about a transcript. -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct Transcript { - // Transcript accession with version, e.g., `"NM_007294.3"` or `"ENST00000461574.1"` for BRCA1. - pub id: String, - // HGNC symbol, e.g., `"BRCA1"` - pub gene_name: String, - // HGNC gene identifier, e.g., `"1100"` for BRCA1. - pub gene_id: String, - // Transcript biotype. - pub biotype: TranscriptBiotype, - // Transcript flags, values from `TranscriptTag`, stored as OR-ed ubyte values. - pub tags: EnumSet, - // Identifier of the corresponding protein. - pub protein: Option, - // CDS start codon. - pub start_codon: Option, - // CDS stop codon. - pub stop_codon: Option, - // Alignments on the different genome builds. - pub genome_alignments: Vec, -} - -// Mapping from gene to transcript ID. -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct GeneToTxId { - /// Gene HGNC symbol, serves as gene identifier. - pub gene_name: String, - /// Vector of all transcript IDs. - pub tx_ids: Vec, -} - -/// Container for the transcript-related database. -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct TranscriptDb { - /// Vector of all transcripts. - pub transcripts: Vec, - /// Mapping from gene ID to vector of all transcript IDs. - pub gene_to_tx: Vec, -} - -/// Stores long array of sequences with an "index" of sequence names to their -/// index. -/// -/// The fields `aliases` and `aliases_idx` have the same length and `aliases_idx[i]` -/// stores the index into `seqs` for the sequence `aliases[i]`. In other words. -/// `seqs[aliases_idx[i]]` stores the sequence for `aliases[i]`. -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct SequenceDb { - /// The sequence aliases, cf. `aliases_idx`. - pub aliases: Vec, - /// The corresponding index in `seqs`, cf. `aliases`. - pub aliases_idx: Vec, - /// The corresponding sequences. - pub seqs: Vec, -} - -//// Database of transcripts with sequences. -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct TxSeqDatabase { - /// Store transcripts with their aliases. - pub tx_db: TranscriptDb, - /// Store sequence with their aliases. - pub seq_db: SequenceDb, -} diff --git a/src/db/create/txs/mod.rs b/src/db/create/txs/mod.rs index b3e8b343..6db65d3b 100644 --- a/src/db/create/txs/mod.rs +++ b/src/db/create/txs/mod.rs @@ -1,27 +1,17 @@ //! Transcript database. -pub mod data; - use std::collections::HashSet; use std::fs::File; use std::path::Path; use std::{collections::HashMap, io::Write, path::PathBuf, time::Instant}; use clap::Parser; -use flatbuffers::FlatBufferBuilder; -use hgvs::data::cdot::json::models::{self, BioType}; -use hgvs::sequences::{translate_cds, TranslationTable}; -use indicatif::{ProgressBar, ProgressStyle}; -use seqrepo::{AliasOrSeqId, Interface, SeqRepo}; +use hgvs::data::cdot::json::models; +use indicatif::ProgressStyle; +use seqrepo::SeqRepo; use thousands::Separable; -use crate::common::{trace_rss_now, GenomeRelease}; -use crate::world_flatbuffers::mehari::{ - ExonAlignment, ExonAlignmentArgs, GeneToTxId, GeneToTxIdArgs, GenomeAlignment, - GenomeAlignmentArgs, GenomeBuild, SequenceDb, SequenceDbArgs, Strand, Transcript, - TranscriptArgs, TranscriptBiotype, TranscriptDb, TranscriptDbArgs, TranscriptTag, - TxSeqDatabase, TxSeqDatabaseArgs, -}; +use crate::common::GenomeRelease; lazy_static::lazy_static! { /// Progress bar style to use. @@ -31,6 +21,11 @@ lazy_static::lazy_static! { .unwrap(); } +/// Data structures for (de-)serialization as generated by `prost-build`. +pub mod data { + include!(concat!(env!("OUT_DIR"), "/mehari.db.create.txs.data.rs")); +} + /// Command line arguments for `db create txs` sub command. #[derive(Parser, Debug)] #[command(about = "Construct mehari transcripts and sequence database", long_about = None)] @@ -38,7 +33,7 @@ pub struct Args { /// Genome release to extract transcripts for. #[arg(long)] pub genome_release: GenomeRelease, - /// Path to output flatbuffers file to write to. + /// Path to output protobuf file to write to. #[arg(long)] pub path_out: PathBuf, /// Paths to the cdot JSON transcripts to import. @@ -754,8 +749,8 @@ pub fn run(common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Erro let tx_data = load_cdot_files(args, &mut report_file)?; // then remove redundant onces, and let tx_data = filter_transcripts(tx_data, args.max_txs, &args.gene_symbols, &mut report_file)?; - // finally build flatbuffers file. - build_flatbuffers( + // finally build protobuf file. + build_protobuf( &args.path_out, seqrepo, tx_data, diff --git a/src/main.rs b/src/main.rs index 18678695..779beb72 100644 --- a/src/main.rs +++ b/src/main.rs @@ -97,17 +97,6 @@ pub mod db; pub mod ped; pub mod verify; -#[allow( - non_snake_case, - unused_imports, - clippy::extra_unused_lifetimes, - clippy::missing_safety_doc, - clippy::derivable_impls, - clippy::size_of_in_element_count -)] -#[path = "../target/flatbuffers/world_generated.rs"] -pub mod world_flatbuffers; - use clap::{command, Args, Parser, Subcommand}; #[derive(Debug, Parser)] diff --git a/src/verify/seqvars.rs b/src/verify/seqvars.rs index dc54c096..923c24e6 100644 --- a/src/verify/seqvars.rs +++ b/src/verify/seqvars.rs @@ -39,10 +39,6 @@ pub struct Args { /// For debug purposes, maximal number of variants to annotate. #[arg(long)] pub max_var_count: Option, - /// Maximal number of flatbuffers tables, should not need tweaking. However, if you see a - /// "too many tables" error output, increase this value. - #[arg(long, default_value_t = 5_000_000)] - pub max_fb_tables: usize, } /// Guess genome release from VEP TSV file. @@ -129,7 +125,6 @@ pub fn run(_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Err &args.path_db, path_component(assembly) ), - args.max_fb_tables, )?; tracing::info!("Building transcript interval trees ..."); let provider = Rc::new(MehariProvider::new(tx_db, assembly)); diff --git a/utils/install-flatbuffers.sh b/utils/install-flatbuffers.sh deleted file mode 100644 index d03010bf..00000000 --- a/utils/install-flatbuffers.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/bash - -# Will install into ~/.local/share/flatbuffers, so make sure to add the following -# to your PATH: ~/.local/share/flatbuffers/bin -# -# Will go into ./utils/var for cloning/building. - -mkdir -p utils/var -cd utils/var - -git clone https://github.com/google/flatbuffers.git -cd flatbuffers -git checkout v22.12.06 -cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=$HOME/.local/share/flatbuffers -make -./flattests -make install -flatc --version diff --git a/utils/install-protoc.sh b/utils/install-protoc.sh new file mode 100644 index 00000000..9df6e14c --- /dev/null +++ b/utils/install-protoc.sh @@ -0,0 +1,23 @@ +#!/usr/bin/bash + +# Will install into ~/.local/share/flatbuffers, so make sure to add the following +# to your PATH: ~/.local/share/flatbuffers/bin +# +# Will go into ./utils/var for cloning/building. + +mkdir -p utils/var +cd utils/var + +sudo apt-get install g++ git + +if [[ ! -e protobuf ]]; then + git clone https://github.com/protocolbuffers/protobuf.git +fi +cd protobuf +git submodule update --init --recursive + +cmake . CMAKE_INSTALL_PREFIX=$HOME/.local/share/protoc +make -j 8 install + +mkdir -p ~/.local/share/protoc/bin +cp bazel-bin/protoc ~/.local/share/protoc/bin