Skip to content

Commit

Permalink
feat: implement parsing of clinvar-data-jsonl data (#198)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Oct 5, 2023
1 parent 5a3a936 commit 2f77a75
Show file tree
Hide file tree
Showing 17 changed files with 1,188 additions and 215 deletions.
10 changes: 5 additions & 5 deletions src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
use std::{
fs::File,
io::{BufRead, BufReader, BufWriter, Read, Write},
io::{BufRead, BufReader, BufWriter, Write},
ops::Range,
path::Path,
};
Expand Down Expand Up @@ -60,7 +60,7 @@ pub fn build_chrom_map() -> IndexMap<String, usize> {
}

/// Transparently open a file with gzip decoder.
pub fn open_read_maybe_gz<P>(path: P) -> Result<Box<dyn Read>, anyhow::Error>
pub fn open_read_maybe_gz<P>(path: P) -> Result<Box<dyn BufRead>, anyhow::Error>
where
P: AsRef<Path>,
{
Expand All @@ -69,11 +69,11 @@ where
let file = File::open(path)?;
let bufreader = BufReader::new(file);
let decoder = MultiGzDecoder::new(bufreader);
Ok(Box::new(decoder))
Ok(Box::new(BufReader::new(decoder)))
} else {
tracing::trace!("Opening {:?} as plain text for reading", path.as_ref());
let file = File::open(path)?;
Ok(Box::new(file))
let file = File::open(path).map(BufReader::new)?;
Ok(Box::new(BufReader::new(file)))
}
}

Expand Down
62 changes: 58 additions & 4 deletions src/db/to_bin/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,36 @@ use crate::{
},
};

/// Select the assembly
//
// Command-line level choice of genome assembly. The `clap::ValueEnum` derive
// is what allows this type to be used with `#[arg(long, value_enum)]`, as done
// for the optional `assembly` field of `Args` in this file.
//
// A `From<Assembly>` impl in this file maps each variant onto the variant of
// the same name in `crate::db::to_bin::clinvar::input::Assembly`.
//
// NOTE(review): the `///` comments on the variants are presumably picked up by
// clap as per-value help text -- confirm before rewording them.
#[derive(
clap::ValueEnum,
Clone,
Copy,
Debug,
strum::Display,
PartialEq,
Eq,
enum_map::Enum,
PartialOrd,
Ord,
Hash,
)]
pub enum Assembly {
/// GRCh37
Grch37,
/// GRCh38
Grch38,
}

impl From<Assembly> for crate::db::to_bin::clinvar::input::Assembly {
    /// Translate the command-line `Assembly` selection into the variant of the
    /// same name on the ClinVar input `Assembly` type.
    ///
    /// The match is kept exhaustive (no catch-all arm) so that adding a variant
    /// to the command-line enum produces a compile error here.
    fn from(val: Assembly) -> Self {
        match val {
            Assembly::Grch37 => Self::Grch37,
            Assembly::Grch38 => Self::Grch38,
        }
    }
}

/// Select input/conversion type.
#[derive(
clap::ValueEnum,
Expand Down Expand Up @@ -56,6 +86,9 @@ pub enum InputType {
#[derive(Parser, Debug)]
#[command(about = "Convert to binary protobuf files", long_about = None)]
pub struct Args {
/// Optionally the assembly (required for ClinvarSv)
#[arg(long, value_enum)]
pub assembly: Option<Assembly>,
/// Input type to convert to binary.
#[arg(long, value_enum)]
pub input_type: InputType,
Expand All @@ -77,7 +110,13 @@ pub fn run(common_args: &crate::common::Args, args: &Args) -> Result<(), anyhow:

tracing::info!("Starting conversion...");
match args.input_type {
InputType::ClinvarSv => clinvar::convert_to_bin(&args.path_input, &args.path_output_bin)?,
InputType::ClinvarSv => {
let assembly = args
.assembly
.expect("assembly required for ClinvarSv conversion");
let assembly: crate::db::to_bin::clinvar::input::Assembly = assembly.into();
clinvar::convert_to_bin(&args.path_input, &args.path_output_bin, assembly)?
}
InputType::StrucvarInhouse => vardbs::convert_to_bin(
&args.path_input,
&args.path_output_bin,
Expand Down Expand Up @@ -128,16 +167,21 @@ mod test {

use super::{Args, InputType};

#[test]
fn run_clinvar_sv_smoke() -> Result<(), anyhow::Error> {
#[rstest::rstest]
#[case(crate::db::to_bin::cli::Assembly::Grch37)]
#[case(crate::db::to_bin::cli::Assembly::Grch38)]
fn run_clinvar_sv_smoke(
#[case] assembly: crate::db::to_bin::cli::Assembly,
) -> Result<(), anyhow::Error> {
let tmp_dir = temp_testdir::TempDir::default();
let common_args = common::Args {
verbose: Verbosity::new(0, 0),
};
let args = Args {
assembly: Some(assembly),
input_type: InputType::ClinvarSv,
path_input: String::from(
"tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/clinvar.bed.gz",
"tests/db/to-bin/varfish-db-downloader/vardbs/clinvar/clinvar-svs.jsonl.gz",
),
path_output_bin: tmp_dir.join("clinvar.bin"),
};
Expand All @@ -154,6 +198,7 @@ mod test {
verbose: Verbosity::new(0, 0),
};
let args = Args {
assembly: None,
input_type: InputType::StrucvarInhouse,
path_input: String::from(
"tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/inhouse.tsv",
Expand All @@ -173,6 +218,7 @@ mod test {
verbose: Verbosity::new(0, 0),
};
let args = Args {
assembly: None,
input_type: InputType::StrucvarDbVar,
path_input: String::from(
"tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/dbvar.bed.gz",
Expand All @@ -192,6 +238,7 @@ mod test {
verbose: Verbosity::new(0, 0),
};
let args = Args {
assembly: None,
input_type: InputType::StrucvarDgv,
path_input: String::from(
"tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/dgv.bed.gz",
Expand All @@ -211,6 +258,7 @@ mod test {
verbose: Verbosity::new(0, 0),
};
let args = Args {
assembly: None,
input_type: InputType::StrucvarDgvGs,
path_input: String::from(
"tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/dgv_gs.bed.gz",
Expand All @@ -230,6 +278,7 @@ mod test {
verbose: Verbosity::new(0, 0),
};
let args = Args {
assembly: None,
input_type: InputType::StrucvarExacCnv,
path_input: String::from(
"tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/exac.bed.gz",
Expand All @@ -249,6 +298,7 @@ mod test {
verbose: Verbosity::new(0, 0),
};
let args = Args {
assembly: None,
input_type: InputType::StrucvarG1k,
path_input: String::from(
"tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/g1k.bed.gz",
Expand All @@ -268,6 +318,7 @@ mod test {
verbose: Verbosity::new(0, 0),
};
let args = Args {
assembly: None,
input_type: InputType::StrucvarGnomadSv,
path_input: String::from(
"tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/gnomad_sv.bed.gz",
Expand All @@ -287,6 +338,7 @@ mod test {
verbose: Verbosity::new(0, 0),
};
let args = Args {
assembly: None,
input_type: InputType::GeneRegion,
path_input: String::from(
"tests/db/to-bin/varfish-db-downloader/features/grch37/gene_regions/refseq.bed.gz",
Expand All @@ -306,6 +358,7 @@ mod test {
verbose: Verbosity::new(0, 0),
};
let args = Args {
assembly: None,
input_type: InputType::MaskedRegion,
path_input: String::from(
"tests/db/to-bin/varfish-db-downloader/features/grch37/masked/repeat.bed.gz",
Expand All @@ -325,6 +378,7 @@ mod test {
verbose: Verbosity::new(0, 0),
};
let args = Args {
assembly: None,
input_type: InputType::Xlink,
path_input: String::from("tests/db/to-bin/varfish-db-downloader/genes/xlink/hgnc.tsv"),
path_output_bin: tmp_dir.join("xlink.bin"),
Expand Down
Loading

0 comments on commit 2f77a75

Please sign in to comment.