feat: implement file identifier mappings (#364) (#365)
holtgrewe authored Jun 6, 2024
1 parent 06334e6 commit 57db49a
Showing 45 changed files with 8,000 additions and 9,760 deletions.
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

4 changes: 4 additions & 0 deletions Cargo.toml
@@ -46,6 +46,8 @@ noodles-core = "0.14.0"
noodles-csi = { version = "0.30.0", features = ["async"] }
noodles-tabix = { version = "0.36.0", features = ["async"] }
noodles-vcf = { version = "0.49", features = ["async"] }
pbjson = "0.6"
pbjson-types = "0.6"
procfs = "0.16"
prost = "0.12"
rand = "0.8"
@@ -69,7 +71,9 @@ tracing-subscriber = "0.3"
uuid = { version = "1.4", features = ["v4", "fast-rng", "serde"] }

[build-dependencies]
anyhow = "1.0"
prost-build = "0.12"
pbjson-build = "0.6.2"

[dev-dependencies]
async-std = { version = "1.12", features = ["attributes"] }
53 changes: 36 additions & 17 deletions build.rs
@@ -1,21 +1,40 @@
// The custom build script, needed as we use protocolbuffers.
// The custom build script, used to (1) generate the Rust classes for the
// protobuf implementation and (2) use pbjson for proto3 JSON serialization.

use std::{env, path::PathBuf};

fn main() -> Result<(), anyhow::Error> {
let root = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("protos");
let proto_files = [
"varfish/v1/clinvar.proto",
"varfish/v1/sv.proto",
"varfish/v1/worker.proto",
]
.iter()
.map(|f| root.join(f))
.collect::<Vec<_>>();

// Tell cargo to recompile if any of these proto files are changed
for proto_file in &proto_files {
println!("cargo:rerun-if-changed={}", proto_file.display());
}

let descriptor_path: PathBuf =
PathBuf::from(env::var("OUT_DIR").unwrap()).join("proto_descriptor.bin");

fn main() {
println!("cargo:rerun-if-changed=src/proto/varfish/v1/clinvar.proto");
println!("cargo:rerun-if-changed=src/proto/varfish/v1/sv.proto");
prost_build::Config::new()
.protoc_arg("-Isrc/proto")
// Add serde serialization and deserialization to the generated code.
.type_attribute(".", "#[derive(serde::Serialize, serde::Deserialize)]")
// Skip serializing `None` values.
.type_attribute(".", "#[serde_with::skip_serializing_none]")
// Save descriptors to file
.file_descriptor_set_path(&descriptor_path)
// Override prost-types with pbjson-types
.compile_well_known_types()
.extern_path(".google.protobuf", "::pbjson_types")
// Define the protobuf files to compile.
.compile_protos(
&[
"src/proto/varfish/v1/clinvar.proto",
"src/proto/varfish/v1/sv.proto",
],
&["src/"],
)
.unwrap();
.compile_protos(&proto_files, &[root])?;

let descriptor_set = std::fs::read(descriptor_path).unwrap();
pbjson_build::Builder::new()
.register_descriptors(&descriptor_set)?
.build(&[".varfish"])?;

Ok(())
}
File renamed without changes.
File renamed without changes.
27 changes: 27 additions & 0 deletions protos/varfish/v1/worker.proto
@@ -0,0 +1,27 @@
// Protocol buffers related to the worker.

syntax = "proto3";

package varfish.v1.worker;

// Protocol buffer for storing a list of file identifier mappings.
message FileIdentifierMappings {
// Protocol buffer for storing file identifier mapping for one file.
message Mapping {
// Protocol buffer to store one mapping entry.
message Entry {
// Identifier as given in input file.
string src = 1;
// Identifier to use in output file.
string dst = 2;
}

// Path to the file to obtain mapping for, as given on the command line.
string path = 1;
// List of identifier mappings.
repeated Entry entries = 2;
}

// One file per mapping.
repeated Mapping mappings = 1;
}
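
The generated Rust types for this message live in the new `crate::pbs::worker` module (see `src/pbs.rs` below). A minimal construction sketch, assuming prost's usual snake_case module nesting for the inner `Mapping` and `Entry` messages; the `path`/`src`/`dst` values are illustrative:

use crate::pbs::worker::{
    file_identifier_mappings::{mapping::Entry, Mapping},
    FileIdentifierMappings,
};

fn build_example_mappings() -> FileIdentifierMappings {
    // One file with a single src -> dst identifier entry.
    FileIdentifierMappings {
        mappings: vec![Mapping {
            path: "path/to/file".to_string(),
            entries: vec![Entry {
                src: "foo".to_string(),
                dst: "bar".to_string(),
            }],
        }],
    }
}

The same shape appears as proto3 JSON in the test fixture further down.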
108 changes: 108 additions & 0 deletions src/common/mod.rs
@@ -525,6 +525,89 @@ macro_rules! flush_and_shutdown {
};
}

// Per-file identifier mapping.
pub mod id_mapping {
// Actual implementation.
#[derive(serde::Serialize, serde::Deserialize, PartialEq, Eq, Clone, Debug, Default)]
pub struct FileIdentifierMappings {
// Per-file and per-identifier mapping.
pub mappings: indexmap::IndexMap<String, indexmap::IndexMap<String, String>>,
}

impl FileIdentifierMappings {
// Load from JSON - deserialize via serde_json.
pub fn load_from_json(json: &str) -> Result<Self, anyhow::Error> {
let buf: crate::pbs::worker::FileIdentifierMappings = serde_json::from_str(json)
.map_err(|e| {
anyhow::anyhow!("could not deserialize FileIdentifierMappings: {}", e)
})?;
let mut mappings: indexmap::IndexMap<_, _> = Default::default();
for mapping in &buf.mappings {
let mut map: indexmap::IndexMap<_, _> = Default::default();
for entry in &mapping.entries {
map.insert(entry.src.to_string(), entry.dst.to_string());
}
mappings.insert(mapping.path.to_string(), map);
}
Ok(FileIdentifierMappings { mappings })
}

// Load from path.
pub fn load_from_path<P: AsRef<std::path::Path> + ToString>(
path: P,
) -> Result<Self, anyhow::Error> {
Self::load_from_json(
std::fs::read_to_string(path.as_ref())
.map_err(|e| anyhow::anyhow!("could not read {}: {}", path.to_string(), e))?
.as_str(),
)
}

// Obtain mapping.
pub fn mapping_for_file<'a>(
&'a self,
file_name: &str,
) -> Result<&'a indexmap::IndexMap<String, String>, anyhow::Error> {
self.mappings
.get(file_name)
.ok_or_else(|| anyhow::anyhow!("file name not in mapping: {}", &file_name))
}

// Map from file name and identifier.
pub fn map_identifier(
&self,
file_name: &str,
identifier: &str,
) -> Result<String, anyhow::Error> {
self.mapping_for_file(file_name)?
.get(identifier)
.map(|dst| dst.to_string())
.ok_or_else(|| {
anyhow::anyhow!("identifier {} not in file {}", &identifier, &file_name)
})
}

// Get all file names in map.
pub fn file_names(&self) -> Vec<String> {
self.mappings.keys().cloned().collect()
}

// Get all source identifiers in file.
pub fn src_identifiers(&self, file_name: &str) -> Result<Vec<String>, anyhow::Error> {
Ok(self.mapping_for_file(file_name)?.keys().cloned().collect())
}

// Get all destination identifiers in file.
pub fn dst_identifiers(&self, file_name: &str) -> Result<Vec<String>, anyhow::Error> {
Ok(self
.mapping_for_file(file_name)?
.values()
.cloned()
.collect())
}
}
}

#[cfg(test)]
mod test {
use noodles_vcf as vcf;
@@ -641,4 +724,29 @@ mod test {
insta::assert_debug_snapshot!(pedigree);
insta::assert_debug_snapshot!(case_uuid);
}

#[test]
fn file_identifier_mappings() -> Result<(), anyhow::Error> {
let mapping = super::id_mapping::FileIdentifierMappings::load_from_json(
r#"
{
"mappings": [
{
"path": "path/to/file",
"entries": [
{
"src": "foo",
"dst": "bar"
}
]
}
]
}
"#,
)?;

insta::assert_yaml_snapshot!(&mapping);

Ok(())
}
}
@@ -0,0 +1,7 @@
---
source: src/common/mod.rs
expression: "&mapping"
---
mappings:
path/to/file:
foo: bar
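
Beyond the snapshot test, the helper is meant to be used roughly like this; a sketch with made-up file and identifier names:

fn translate_identifiers() -> Result<(), anyhow::Error> {
    let mappings = crate::common::id_mapping::FileIdentifierMappings::load_from_path(
        "id-mappings.json", // hypothetical path to a proto3-JSON mappings file
    )?;

    // Translate a single identifier for one input file.
    let dst = mappings.map_identifier("path/to/file", "foo")?;
    assert_eq!(dst, "bar");

    // Enumerate everything the mapping covers.
    for file in mappings.file_names() {
        for src in mappings.src_identifiers(&file)? {
            let _dst = mappings.map_identifier(&file, &src)?;
        }
    }
    Ok(())
}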
1 change: 1 addition & 0 deletions src/main.rs
@@ -1,6 +1,7 @@
//! VarFish Server Worker main executable
pub mod common;
pub mod pbs;
pub mod seqvars;
pub mod strucvars;

19 changes: 19 additions & 0 deletions src/pbs.rs
@@ -0,0 +1,19 @@
//! Data structures for (de-)serialization as generated by `prost-build`.
/// Code generated for protobufs by `prost-build`.
pub mod clinvar {
include!(concat!(env!("OUT_DIR"), "/varfish.v1.clinvar.rs"));
include!(concat!(env!("OUT_DIR"), "/varfish.v1.clinvar.serde.rs"));
}

/// Code generated for protobufs by `prost-build`.
pub mod svs {
include!(concat!(env!("OUT_DIR"), "/varfish.v1.svs.rs"));
include!(concat!(env!("OUT_DIR"), "/varfish.v1.svs.serde.rs"));
}

/// Code generated for protobufs by `prost-build`.
pub mod worker {
include!(concat!(env!("OUT_DIR"), "/varfish.v1.worker.rs"));
include!(concat!(env!("OUT_DIR"), "/varfish.v1.worker.serde.rs"));
}
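
Because the `.serde.rs` includes carry the pbjson-generated `Serialize`/`Deserialize` impls, the generated types round-trip through proto3 JSON with plain `serde_json`. A small sketch, not part of the commit:

fn worker_json_roundtrip() -> Result<(), anyhow::Error> {
    let msg = crate::pbs::worker::FileIdentifierMappings::default();
    // Serialization and deserialization go through the included pbjson impls.
    let json = serde_json::to_string(&msg)?;
    let back: crate::pbs::worker::FileIdentifierMappings = serde_json::from_str(&json)?;
    assert_eq!(msg, back);
    Ok(())
}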
49 changes: 40 additions & 9 deletions src/seqvars/ingest/header.rs
@@ -212,6 +212,7 @@ fn add_contigs_38(builder: vcf::header::Builder) -> Result<vcf::header::Builder,
pub fn build_output_header(
input_header: &vcf::Header,
pedigree: &Option<mehari::ped::PedigreeByName>,
id_mapping: &Option<indexmap::IndexMap<String, String>>,
genomebuild: GenomeRelease,
file_date: &str,
case_uuid: &uuid::Uuid,
@@ -355,7 +356,24 @@ pub fn build_output_header(
.iter()
.cloned()
.collect::<HashSet<_>>();
if !ped_idv.eq(&input_idv) {
if let Some(id_mapping) = id_mapping {
let src_idv = id_mapping.keys().cloned().collect::<HashSet<_>>();
if !src_idv.eq(&input_idv) {
anyhow::bail!(
"mapping source individuals = {:?} != input individuals: {:?}",
&src_idv,
&input_idv
)
}
let dst_idv = id_mapping.values().cloned().collect::<HashSet<_>>();
if !dst_idv.eq(&ped_idv) {
anyhow::bail!(
"mapping destination individuals = {:?} != pedigree individuals: {:?}",
&dst_idv,
&ped_idv
)
}
} else if !ped_idv.eq(&input_idv) {
anyhow::bail!(
"pedigree individuals = {:?} != input individuals: {:?}",
&ped_idv,
@@ -364,13 +382,16 @@
}

let mut sample_names = Vec::new();
for name in input_header.sample_names() {
let i = pedigree
.individuals
.get(name)
.expect("checked equality above");
if input_header.sample_names().contains(&i.name) {
sample_names.push(i.name.clone());
for src_name in input_header.sample_names() {
let dst_name = if let Some(id_mapping) = id_mapping {
id_mapping.get(src_name).expect("checked above")
} else {
src_name
};

let i = pedigree.individuals.get(dst_name).expect("checked above");
if input_header.sample_names().contains(src_name) {
sample_names.push(dst_name.clone());
}

// Add SAMPLE entry.
@@ -409,7 +430,15 @@
builder = builder.set_sample_names(sample_names.into_iter().collect());
} else {
for name in input_header.sample_names() {
builder = builder.add_sample_name(name.clone());
let name = if let Some(id_mapping) = id_mapping {
id_mapping
.get(name)
.ok_or_else(|| anyhow::anyhow!("mapping given but sample {} not found", name))?
.clone()
} else {
name.clone()
};
builder = builder.add_sample_name(name);
}
}

@@ -521,6 +550,7 @@ mod test {
let output_vcf_header = super::build_output_header(
&input_vcf_header,
&Some(pedigree),
&None,
crate::common::GenomeRelease::Grch37,
"20230421",
&uuid::Uuid::parse_str("00000000-0000-0000-0000-000000000000").unwrap(),
@@ -563,6 +593,7 @@
let output_vcf_header = super::build_output_header(
&input_vcf_header,
&Some(pedigree),
&None,
crate::common::GenomeRelease::Grch38,
"20230421",
&uuid::Uuid::parse_str("00000000-0000-0000-0000-000000000000").unwrap(),

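The new `id_mapping` argument to `build_output_header` is an optional per-file map from input sample names to output sample names. A sketch of how a caller could derive it from the mappings file, under the assumption that the input VCF path given on the command line is the lookup key (the actual CLI wiring is outside this excerpt):

fn id_mapping_for_input(
    mappings_path: Option<&str>, // e.g. value of a hypothetical --id-mappings option
    input_vcf_path: &str,
) -> Result<Option<indexmap::IndexMap<String, String>>, anyhow::Error> {
    mappings_path
        .map(|path| {
            let mappings =
                crate::common::id_mapping::FileIdentifierMappings::load_from_path(path)?;
            // The key must match the input file path as given on the command line.
            Ok(mappings.mapping_for_file(input_vcf_path)?.clone())
        })
        .transpose()
}

The resulting `Option<IndexMap<String, String>>` is what `build_output_header` checks against both the input header's sample names and the pedigree before renaming samples.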