Skip to content

Commit

Permalink
feat: implement file identifier mappings (#364)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Jun 6, 2024
1 parent 06334e6 commit 7d22cfc
Show file tree
Hide file tree
Showing 20 changed files with 162 additions and 100 deletions.
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ noodles-core = "0.14.0"
noodles-csi = { version = "0.30.0", features = ["async"] }
noodles-tabix = { version = "0.36.0", features = ["async"] }
noodles-vcf = { version = "0.49", features = ["async"] }
pbjson = "0.6"
pbjson-types = "0.6"
procfs = "0.16"
prost = "0.12"
rand = "0.8"
Expand All @@ -69,7 +71,9 @@ tracing-subscriber = "0.3"
uuid = { version = "1.4", features = ["v4", "fast-rng", "serde"] }

[build-dependencies]
anyhow = "1.0"
prost-build = "0.12"
pbjson-build = "0.6.2"

[dev-dependencies]
async-std = { version = "1.12", features = ["attributes"] }
Expand Down
53 changes: 36 additions & 17 deletions build.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,40 @@
// The custom build script, needed as we use protocolbuffers.
// The custom build script, used to (1) generate the Rust classes for the
// protobuf implementation and (2) use pbjson for proto3 JSON serialization.

use std::{env, path::PathBuf};

fn main() -> Result<(), anyhow::Error> {
let root = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("protos");
let proto_files = [
"varfish/v1/clinvar.proto",
"varfish/v1/sv.proto",
"varfish/v1/worker.proto",
]
.iter()
.map(|f| root.join(f))
.collect::<Vec<_>>();

// Tell cargo to recompile if any of these proto files are changed
for proto_file in &proto_files {
println!("cargo:rerun-if-changed={}", proto_file.display());
}

let descriptor_path: PathBuf =
PathBuf::from(env::var("OUT_DIR").unwrap()).join("proto_descriptor.bin");

fn main() {
println!("cargo:rerun-if-changed=src/proto/varfish/v1/clinvar.proto");
println!("cargo:rerun-if-changed=src/proto/varfish/v1/sv.proto");
prost_build::Config::new()
.protoc_arg("-Isrc/proto")
// Add serde serialization and deserialization to the generated code.
.type_attribute(".", "#[derive(serde::Serialize, serde::Deserialize)]")
// Skip serializing `None` values.
.type_attribute(".", "#[serde_with::skip_serializing_none]")
// Save descriptors to file
.file_descriptor_set_path(&descriptor_path)
// Override prost-types with pbjson-types
.compile_well_known_types()
.extern_path(".google.protobuf", "::pbjson_types")
// Define the protobuf files to compile.
.compile_protos(
&[
"src/proto/varfish/v1/clinvar.proto",
"src/proto/varfish/v1/sv.proto",
],
&["src/"],
)
.unwrap();
.compile_protos(&proto_files, &[root])?;

let descriptor_set = std::fs::read(descriptor_path).unwrap();
pbjson_build::Builder::new()
.register_descriptors(&descriptor_set)?
.build(&[".varfish"])?;

Ok(())
}
File renamed without changes.
File renamed without changes.
27 changes: 27 additions & 0 deletions protos/varfish/v1/worker.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Protocol buffers related to the worker.

syntax = "proto3";

package varfish.v1.worker;

// Stores the file identifier mappings for a set of input files
// (one `Mapping` per file).
message FileIdentifierMappings {
// Identifier mapping for a single input file.
message Mapping {
// One mapping entry: source identifier to destination identifier.
message Entry {
// Identifier as given in the input file.
string src = 1;
// Identifier to use in the output file.
string dst = 2;
}

// Path to the file to obtain the mapping for, as given on the command line.
string path = 1;
// List of identifier mappings for this file.
repeated Entry entries = 2;
}

// One `Mapping` per input file.
repeated Mapping mappings = 1;
}
1 change: 1 addition & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
//! VarFish Server Worker main executable
pub mod common;
pub mod pbs;
pub mod seqvars;
pub mod strucvars;

Expand Down
19 changes: 19 additions & 0 deletions src/pbs.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
//! Data structures for (de-)serialization as generated by `prost-build`.
/// Code generated by `prost-build` for the `varfish.v1.clinvar` protobufs;
/// the `.serde.rs` include adds the proto3 JSON (de-)serialization impls
/// emitted by `pbjson-build`.
pub mod clinvar {
include!(concat!(env!("OUT_DIR"), "/varfish.v1.clinvar.rs"));
include!(concat!(env!("OUT_DIR"), "/varfish.v1.clinvar.serde.rs"));
}

/// Code generated by `prost-build` for the `varfish.v1.svs` protobufs
/// (from `sv.proto`; the module name follows the proto package, not the
/// file name), plus the `pbjson-build` JSON impls.
pub mod svs {
include!(concat!(env!("OUT_DIR"), "/varfish.v1.svs.rs"));
include!(concat!(env!("OUT_DIR"), "/varfish.v1.svs.serde.rs"));
}

/// Code generated by `prost-build` for the `varfish.v1.worker` protobufs
/// (file identifier mappings), plus the `pbjson-build` JSON impls.
pub mod worker {
include!(concat!(env!("OUT_DIR"), "/varfish.v1.worker.rs"));
include!(concat!(env!("OUT_DIR"), "/varfish.v1.worker.serde.rs"));
}
1 change: 0 additions & 1 deletion src/strucvars/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
pub mod aggregate;
pub mod ingest;
pub mod pbs;
pub mod query;
pub mod txt_to_bin;
3 changes: 0 additions & 3 deletions src/strucvars/pbs.rs

This file was deleted.

37 changes: 19 additions & 18 deletions src/strucvars/query/bgdbs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,7 @@ use serde::{Deserialize, Serialize};
use strum_macros::{Display, EnumString};
use tracing::info;

use crate::{
common::{trace_rss_now, GenomeRelease, CHROMS},
strucvars::pbs,
};
use crate::common::{trace_rss_now, GenomeRelease, CHROMS};

use super::{
schema::ChromRange,
Expand Down Expand Up @@ -147,18 +144,20 @@ pub fn load_bg_db_records(path: &Path) -> Result<BgDb, anyhow::Error> {

let fcontents =
std::fs::read(path).map_err(|e| anyhow::anyhow!("error reading {:?}: {}", &path, e))?;
let bg_db = pbs::BackgroundDatabase::decode(std::io::Cursor::new(fcontents))
let bg_db = crate::pbs::svs::BackgroundDatabase::decode(std::io::Cursor::new(fcontents))
.map_err(|e| anyhow::anyhow!("error decoding {:?}: {}", &path, e))?;
let record_count = bg_db.records.len();

for record in bg_db.records.into_iter() {
let chrom_no = record.chrom_no as usize;
let begin = match pbs::SvType::try_from(record.sv_type).expect("invalid sv_type") {
pbs::SvType::Bnd | pbs::SvType::Ins => record.start - 2,
_ => record.start - 1,
};
let end = match pbs::SvType::try_from(record.sv_type).expect("invalid sv_type") {
pbs::SvType::Bnd | pbs::SvType::Ins => record.start - 1,
let begin =
match crate::pbs::svs::SvType::try_from(record.sv_type).expect("invalid sv_type") {
crate::pbs::svs::SvType::Bnd | crate::pbs::svs::SvType::Ins => record.start - 2,
_ => record.start - 1,
};
let end = match crate::pbs::svs::SvType::try_from(record.sv_type).expect("invalid sv_type")
{
crate::pbs::svs::SvType::Bnd | crate::pbs::svs::SvType::Ins => record.start - 1,
_ => record.stop,
};
let key = begin..end;
Expand All @@ -167,13 +166,15 @@ pub fn load_bg_db_records(path: &Path) -> Result<BgDb, anyhow::Error> {
result.records[chrom_no].push(BgDbRecord {
begin: record.start - 1,
end: record.stop,
sv_type: match pbs::SvType::try_from(record.sv_type).expect("invalid sv_type") {
pbs::SvType::Del => SvType::Del,
pbs::SvType::Dup => SvType::Dup,
pbs::SvType::Inv => SvType::Inv,
pbs::SvType::Ins => SvType::Ins,
pbs::SvType::Bnd => SvType::Bnd,
pbs::SvType::Cnv => SvType::Cnv,
sv_type: match crate::pbs::svs::SvType::try_from(record.sv_type)
.expect("invalid sv_type")
{
crate::pbs::svs::SvType::Del => SvType::Del,
crate::pbs::svs::SvType::Dup => SvType::Dup,
crate::pbs::svs::SvType::Inv => SvType::Inv,
crate::pbs::svs::SvType::Ins => SvType::Ins,
crate::pbs::svs::SvType::Bnd => SvType::Bnd,
crate::pbs::svs::SvType::Cnv => SvType::Cnv,
},
count: record.count,
});
Expand Down
11 changes: 3 additions & 8 deletions src/strucvars/query/clinvar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,6 @@ use super::{
schema::{Pathogenicity, StructuralVariant, SvType},
};

/// Data structures for (de-)serialization as generated by `prost-build`.
pub mod pbs {
include!(concat!(env!("OUT_DIR"), "/varfish.v1.clinvar.rs"));
}

/// Alias for the interval tree that we use.
type IntervalTree = ArrayBackedIntervalTree<i32, u32>;

Expand All @@ -27,7 +22,7 @@ type IntervalTree = ArrayBackedIntervalTree<i32, u32>;
#[derive(Default, Debug)]
pub struct ClinvarSv {
/// Records, stored by chromosome.
pub records: Vec<Vec<pbs::SvRecord>>,
pub records: Vec<Vec<crate::pbs::clinvar::SvRecord>>,
/// Interval trees, stored by chromosome.
pub trees: Vec<IntervalTree>,
}
Expand All @@ -39,7 +34,7 @@ impl ClinvarSv {
chrom_range: &ChromRange,
chrom_map: &IndexMap<String, usize>,
min_patho: Option<Pathogenicity>,
) -> Vec<pbs::SvRecord> {
) -> Vec<crate::pbs::clinvar::SvRecord> {
let chrom_idx = *chrom_map
.get(&chrom_range.chromosome)
.expect("invalid chromosome");
Expand Down Expand Up @@ -107,7 +102,7 @@ pub fn load_clinvar_sv(
std::path::Path::new(path_db).join(format!("{}/strucvars/clinvar.bin", genome_release));
let fcontents =
std::fs::read(&path).map_err(|e| anyhow::anyhow!("error reading {:?}: {}", &path, e))?;
let bg_db = pbs::SvDatabase::decode(std::io::Cursor::new(fcontents))
let bg_db = crate::pbs::clinvar::SvDatabase::decode(std::io::Cursor::new(fcontents))
.map_err(|e| anyhow::anyhow!("error decoding {:?}: {}", &path, e))?;

let mut total_count = 0;
Expand Down
4 changes: 2 additions & 2 deletions src/strucvars/query/genes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use prost::Message;
use serde::Deserialize;
use tracing::info;

use crate::{common::GenomeRelease, strucvars::pbs};
use crate::{common::GenomeRelease, pbs};

/// Information to store for the interlink table.
#[derive(Default, Debug)]
Expand Down Expand Up @@ -40,7 +40,7 @@ fn load_xlink_db(path: &Path) -> Result<XlinkDb, anyhow::Error> {

let fcontents =
std::fs::read(path).map_err(|e| anyhow::anyhow!("error reading {:?}: {}", &path, e))?;
let xlink_db = pbs::XlinkDatabase::decode(std::io::Cursor::new(fcontents))
let xlink_db = pbs::svs::XlinkDatabase::decode(std::io::Cursor::new(fcontents))
.map_err(|e| anyhow::anyhow!("error decoding {:?}: {}", &path, e))?;

let mut total_count = 0;
Expand Down
8 changes: 4 additions & 4 deletions src/strucvars/query/masked.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use tracing::info;

use crate::{
common::{trace_rss_now, GenomeRelease, CHROMS},
strucvars::pbs,
pbs,
};

use super::{
Expand Down Expand Up @@ -141,7 +141,7 @@ pub fn load_masked_db_records(path: &Path) -> Result<MaskedDb, anyhow::Error> {

let fcontents =
std::fs::read(path).map_err(|e| anyhow::anyhow!("error reading {:?}: {}", &path, e))?;
let masked_db = pbs::MaskedDatabase::decode(std::io::Cursor::new(fcontents))
let masked_db = pbs::svs::MaskedDatabase::decode(std::io::Cursor::new(fcontents))
.map_err(|e| anyhow::anyhow!("error decoding {:?}: {}", &path, e))?;

for record in masked_db.records.into_iter() {
Expand Down Expand Up @@ -316,8 +316,8 @@ mod test {
let tmpdir = temp_testdir::TempDir::default();
let path_bin = tmpdir.join("masked_db.bin");

let data = super::pbs::MaskedDatabase {
records: vec![super::pbs::MaskedDbRecord {
let data = super::pbs::svs::MaskedDatabase {
records: vec![super::pbs::svs::MaskedDbRecord {
chrom_no: 0,
start: 1,
stop: 2,
Expand Down
Loading

0 comments on commit 7d22cfc

Please sign in to comment.