Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow importing gnomAD SV/CNV v4 as background db (#295) #310

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 64 additions & 8 deletions src/strucvars/txt_to_bin/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,12 @@ pub enum InputType {
StrucvarExacCnv,
/// Convert Thousand Genomes to binary.
StrucvarG1k,
/// Convert gnomAD SV to binary.
StrucvarGnomadSv,
/// Convert gnomAD SV v2 to binary.
StrucvarGnomadSv2,
/// Convert gnomAD CNV v4 to binary.
StrucvarGnomadCnv4,
/// Convert gnomAD SV v4 to binary.
StrucvarGnomadSv4,
/// Convert masked region to binary.
MaskedRegion,
/// Convert cross-link to binary.
Expand All @@ -100,7 +104,7 @@ pub struct Args {

/// Main entry point for the `strucvars txt-to-bin` command.
pub fn run(common_args: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> {
tracing::info!("Starting `db to-bin`");
tracing::info!("Starting `strucvars txt-to-bin`");
tracing::info!(" common_args = {:?}", &common_args);
tracing::info!(" args = {:?}", &args);

Expand Down Expand Up @@ -135,9 +139,21 @@ pub fn run(common_args: &crate::common::Args, args: &Args) -> Result<(), anyhow:
InputType::StrucvarG1k => {
vardbs::convert_to_bin(&args.path_input, &args.path_output, InputFileType::G1k)?
}
InputType::StrucvarGnomadSv => {
vardbs::convert_to_bin(&args.path_input, &args.path_output, InputFileType::Gnomad)?
}
InputType::StrucvarGnomadSv2 => vardbs::convert_to_bin(
&args.path_input,
&args.path_output,
InputFileType::GnomadSv2,
)?,
InputType::StrucvarGnomadCnv4 => vardbs::convert_to_bin(
&args.path_input,
&args.path_output,
InputFileType::GnomadCnv4,
)?,
InputType::StrucvarGnomadSv4 => vardbs::convert_to_bin(
&args.path_input,
&args.path_output,
InputFileType::GnomadSv4,
)?,
InputType::MaskedRegion => masked::convert_to_bin(&args.path_input, &args.path_output)?,
InputType::Xlink => xlink::convert_to_bin(&args.path_input, &args.path_output)?,
}
Expand Down Expand Up @@ -301,14 +317,14 @@ mod test {
}

#[test]
fn run_strucvar_gnomad_smoke() -> Result<(), anyhow::Error> {
fn run_strucvar_gnomad_sv2_smoke() -> Result<(), anyhow::Error> {
let tmp_dir = temp_testdir::TempDir::default();
let common_args = common::Args {
verbose: Verbosity::new(0, 0),
};
let args = Args {
assembly: None,
input_type: InputType::StrucvarGnomadSv,
input_type: InputType::StrucvarGnomadSv2,
path_input: String::from(
"tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/gnomad_sv.bed.gz",
),
Expand All @@ -320,6 +336,46 @@ mod test {
Ok(())
}

// Smoke test: converting the gnomAD CNV v4 fixture through `super::run`
// must complete without returning an error.
#[test]
fn run_strucvar_gnomad_cnv4_smoke() -> Result<(), anyhow::Error> {
    // Output goes into a scratch directory owned by `tmp_dir`.
    let tmp_dir = temp_testdir::TempDir::default();
    let common_args = common::Args {
        verbose: Verbosity::new(0, 0),
    };
    let path_input =
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch38/strucvar/gnomad-cnv.bed.gz"
            .to_string();
    let args = Args {
        assembly: None,
        input_type: InputType::StrucvarGnomadCnv4,
        path_input,
        path_output: tmp_dir.join("gnomad-cnv.bin"),
    };

    // The result of `run` is the result of the test.
    super::run(&common_args, &args)
}

// Smoke test: converting the gnomAD SV v4 fixture through `super::run`
// must complete without returning an error.
#[test]
fn run_strucvar_gnomad_sv4_smoke() -> Result<(), anyhow::Error> {
    // Output goes into a scratch directory owned by `tmp_dir`.
    let tmp_dir = temp_testdir::TempDir::default();
    let common_args = common::Args {
        verbose: Verbosity::new(0, 0),
    };
    let path_input =
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch38/strucvar/gnomad-sv.bed.gz"
            .to_string();
    let args = Args {
        assembly: None,
        input_type: InputType::StrucvarGnomadSv4,
        path_input,
        path_output: tmp_dir.join("gnomad-sv.bin"),
    };

    // The result of `run` is the result of the test.
    super::run(&common_args, &args)
}

#[test]
fn run_masked_region_smoke() -> Result<(), anyhow::Error> {
let tmp_dir = temp_testdir::TempDir::default();
Expand Down
109 changes: 106 additions & 3 deletions src/strucvars/txt_to_bin/vardbs/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,9 @@ pub struct G1kRecord {
pub n_het: u32,
}

/// gnomAD SV database record as read from TSV file.
/// gnomAD SV v2 database record as read from TSV file.
#[derive(Debug, Deserialize)]
pub struct GnomadRecord {
pub struct GnomadSv2Record {
/// chromosome name
pub chromosome: String,
/// begin position, 0-based
Expand All @@ -102,6 +102,56 @@ pub struct GnomadRecord {
pub n_het: u32,
}

/// gnomAD SV v4 database record as read from TSV file.
///
/// NOTE(review): the field order presumably mirrors the column order of the
/// input file (the record is deserialized via serde) -- confirm before
/// reordering fields.
#[derive(Debug, Deserialize)]
pub struct GnomadSv4Record {
/// chromosome name
pub chromosome: String,
/// begin position, 0-based
pub begin: i32,
/// end position, 0-based
pub end: i32,
/// The structural variant type (the conversion to `InputRecord` accepts
/// `BND`, `CNV`, `DEL`, `DUP`, `INS`, and `INV`).
pub svtype: String,
/// Number of male homozygous reference allele carriers.
pub male_n_homref: u32,
/// Number of male heterozygous alternate allele carriers.
pub male_n_het: u32,
/// Number of male homozygous alternate allele carriers.
pub male_n_homalt: u32,
/// Number of male hemizygous reference allele carriers.
pub male_n_hemiref: u32,
/// Number of male hemizygous alternate allele carriers.
pub male_n_hemialt: u32,
/// Number of female homozygous reference allele carriers.
pub female_n_homref: u32,
/// Number of female heterozygous alternate allele carriers.
pub female_n_het: u32,
/// Number of female homozygous alternate allele carriers.
pub female_n_homalt: u32,
/// Number of samples at this site (CNV only).
pub cnv_n_total: u32,
/// Number of samples with a CNV at this site (CNV only).
pub cnv_n_var: u32,
}

/// gnomAD CNV v4 database record as read from TSV file.
///
/// NOTE(review): the field order presumably mirrors the column order of the
/// input file (the record is deserialized via serde) -- confirm before
/// reordering fields.
#[derive(Debug, Deserialize)]
pub struct GnomadCnv4Record {
/// chromosome name
pub chromosome: String,
/// begin position, 0-based
pub begin: i32,
/// end position, 0-based
pub end: i32,
/// The structural variant type (the conversion to `InputRecord` accepts
/// `DEL` and `DUP` only).
pub svtype: String,
/// Number of samples at this site (passing QC).
pub n_total: u32,
/// Number of samples with a CNV at this site (passing QC).
pub n_var: u32,
}

/// Common type to convert input data to.
pub struct InputRecord {
/// Chromosome of start position.
Expand Down Expand Up @@ -251,7 +301,7 @@ impl TryInto<Option<InputRecord>> for ExacRecord {
}
}

impl TryInto<Option<InputRecord>> for GnomadRecord {
impl TryInto<Option<InputRecord>> for GnomadSv2Record {
type Error = &'static str;

fn try_into(self) -> Result<Option<InputRecord>, Self::Error> {
Expand Down Expand Up @@ -279,6 +329,59 @@ impl TryInto<Option<InputRecord>> for GnomadRecord {
}
}

impl TryInto<Option<InputRecord>> for GnomadCnv4Record {
    type Error = &'static str;

    /// Convert a gnomAD CNV v4 row into the common `InputRecord`.
    ///
    /// Both chromosome fields of the result are set to this record's
    /// chromosome, and `n_var` becomes the carrier count.  Always yields
    /// `Some(..)` on success.
    ///
    /// # Errors
    ///
    /// Returns `"unknown SV type"` (after logging the offending value) when
    /// `svtype` is neither `DEL` nor `DUP`.
    fn try_into(self) -> Result<Option<InputRecord>, Self::Error> {
        // Resolve the SV type up front so that unsupported rows bail out early.
        let sv_type = match self.svtype.as_str() {
            "DEL" => SvType::Del,
            "DUP" => SvType::Dup,
            _ => {
                error!("sv_type = {}", &self.svtype);
                return Err("unknown SV type");
            }
        };
        let chromosome2 = self.chromosome.clone();

        Ok(Some(InputRecord {
            chromosome: self.chromosome,
            chromosome2,
            begin: self.begin,
            end: self.end,
            sv_type,
            count: self.n_var,
        }))
    }
}

impl TryInto<Option<InputRecord>> for GnomadSv4Record {
    type Error = &'static str;

    /// Convert a gnomAD SV v4 row into the common `InputRecord`.
    ///
    /// Both chromosome fields of the result are set to this record's
    /// chromosome.  The carrier count is the sum of the male het / hom-alt /
    /// hemi-alt, the female het / hom-alt, and the CNV variant counters.
    /// Always yields `Some(..)` on success.
    ///
    /// # Errors
    ///
    /// Returns `"unknown SV type"` (after logging the offending value) when
    /// `svtype` is not one of `BND`, `CNV`, `DEL`, `DUP`, `INS`, `INV`.
    fn try_into(self) -> Result<Option<InputRecord>, Self::Error> {
        // Resolve the SV type first; unsupported rows return before any counting.
        let sv_type = match self.svtype.as_str() {
            "BND" => SvType::Bnd,
            "CNV" => SvType::Cnv,
            "DEL" => SvType::Del,
            "DUP" => SvType::Dup,
            "INS" => SvType::Ins,
            "INV" => SvType::Inv,
            _ => {
                error!("sv_type = {}", &self.svtype);
                return Err("unknown SV type");
            }
        };

        // All counters that contribute to the number of variant carriers.
        let carrier_counts = [
            self.male_n_het,
            self.male_n_homalt,
            self.male_n_hemialt,
            self.female_n_het,
            self.female_n_homalt,
            self.cnv_n_var,
        ];
        let count = carrier_counts.iter().sum::<u32>();
        let chromosome2 = self.chromosome.clone();

        Ok(Some(InputRecord {
            chromosome: self.chromosome,
            chromosome2,
            begin: self.begin,
            end: self.end,
            sv_type,
            count,
        }))
    }
}

impl TryInto<Option<InputRecord>> for G1kRecord {
type Error = &'static str;

Expand Down
102 changes: 92 additions & 10 deletions src/strucvars/txt_to_bin/vardbs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ pub enum InputFileType {
DgvGs,
Exac,
G1k,
Gnomad,
GnomadSv2,
GnomadCnv4,
GnomadSv4,
InhouseDb,
}
/// Deserialize from CSV reader to an `Option<records::InputRecord>`
Expand Down Expand Up @@ -76,6 +78,24 @@ where
Ok(result)
}

/// Branch around `deserialize_loop`.
pub fn deserialize_branch(
input_type: InputFileType,
reader: &mut csv::Reader<Box<dyn std::io::BufRead>>,
) -> Result<Vec<BgDbRecord>, anyhow::Error> {
match input_type {
InputFileType::Dbvar => deserialize_loop::<input::DbVarRecord>(reader),
InputFileType::Dgv => deserialize_loop::<input::DgvRecord>(reader),
InputFileType::DgvGs => deserialize_loop::<input::DgvGsRecord>(reader),
InputFileType::Exac => deserialize_loop::<input::ExacRecord>(reader),
InputFileType::G1k => deserialize_loop::<input::G1kRecord>(reader),
InputFileType::InhouseDb => deserialize_loop::<InhouseDbRecord>(reader),
InputFileType::GnomadSv2 => deserialize_loop::<input::GnomadSv2Record>(reader),
InputFileType::GnomadCnv4 => deserialize_loop::<input::GnomadCnv4Record>(reader),
InputFileType::GnomadSv4 => deserialize_loop::<input::GnomadSv4Record>(reader),
}
}

/// Perform conversion to protobuf `.bin` file.
pub fn convert_to_bin<P, Q>(
path_input_tsv: P,
Expand All @@ -97,15 +117,7 @@ where
)?);
let before_parsing = Instant::now();

let records = match input_type {
InputFileType::Dbvar => deserialize_loop::<input::DbVarRecord>(&mut reader)?,
InputFileType::Dgv => deserialize_loop::<input::DgvRecord>(&mut reader)?,
InputFileType::DgvGs => deserialize_loop::<input::DgvGsRecord>(&mut reader)?,
InputFileType::Exac => deserialize_loop::<input::ExacRecord>(&mut reader)?,
InputFileType::G1k => deserialize_loop::<input::G1kRecord>(&mut reader)?,
InputFileType::Gnomad => deserialize_loop::<input::GnomadRecord>(&mut reader)?,
InputFileType::InhouseDb => deserialize_loop::<InhouseDbRecord>(&mut reader)?,
};
let records = deserialize_branch(input_type, &mut reader)?;
let bg_db = BackgroundDatabase { records };

tracing::debug!(
Expand All @@ -127,3 +139,73 @@ where

Ok(())
}

#[cfg(test)]
mod test {
    use super::InputFileType;

    // Snapshot test for `deserialize_branch`: each case reads one fixture file
    // with the matching input type and compares the resulting records against
    // a stored YAML snapshot.
    #[rstest::rstest]
    #[case::dbvar(
        InputFileType::Dbvar,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/dbvar.bed.gz"
    )]
    #[case::dgv(
        InputFileType::Dgv,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/dgv.bed.gz"
    )]
    #[case::dgv_gs(
        InputFileType::DgvGs,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/dgv_gs.bed.gz"
    )]
    #[case::exac(
        InputFileType::Exac,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/exac.bed.gz"
    )]
    #[case::g1k(
        InputFileType::G1k,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/g1k.bed.gz"
    )]
    #[case::gnomad_sv2(
        InputFileType::GnomadSv2,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/gnomad_sv.bed.gz"
    )]
    #[case::gnomad_cnv4(
        InputFileType::GnomadCnv4,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch38/strucvar/gnomad-cnv.bed.gz"
    )]
    #[case::gnomad_sv4(
        InputFileType::GnomadSv4,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch38/strucvar/gnomad-sv.bed.gz"
    )]
    #[case::inhouse_db(
        InputFileType::InhouseDb,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/inhouse.tsv"
    )]
    fn test_deserialize_branch(
        #[case] input_type: InputFileType,
        #[case] path_input: &str,
    ) -> Result<(), anyhow::Error> {
        // The snapshot suffix is "<variant>-<file stem>", where the stem is the
        // file name up to (not including) its first '.'.
        let file_name = path_input.rsplit('/').next().unwrap();
        let file_stem = file_name.split('.').next().unwrap();
        mehari::common::set_snapshot_suffix!("{:?}-{}", input_type, file_stem);

        // Fixtures are tab-separated, header-less, with '#' comment lines.
        let input = mehari::common::io::std::open_read_maybe_gz(path_input)?;
        let mut reader = csv::ReaderBuilder::new()
            .delimiter(b'\t')
            .comment(Some(b'#'))
            .has_headers(false)
            .from_reader(input);

        let records = super::deserialize_branch(input_type, &mut reader)?;
        insta::assert_yaml_snapshot!(records);

        Ok(())
    }
}
Loading
Loading