Skip to content

Commit

Permalink
feat: adjust "strucvars query" to read ingested VCFs (#202) (#220)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Oct 11, 2023
1 parent ea6e387 commit 1fb3c58
Show file tree
Hide file tree
Showing 71 changed files with 1,317 additions and 336 deletions.
5 changes: 5 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,8 @@ tests/db/compile/** filter=lfs diff=lfs merge=lfs -text
tests/db/hpo/** filter=lfs diff=lfs merge=lfs -text
tests/seqvars/ingest/db/grch37/** filter=lfs diff=lfs merge=lfs -text
tests/seqvars/ingest/db/grch38/** filter=lfs diff=lfs merge=lfs -text
tests/strucvars/query/*.vcf filter=lfs diff=lfs merge=lfs -text
tests/strucvars/query/grch37/** filter=lfs diff=lfs merge=lfs -text
tests/strucvars/query/noref/** filter=lfs diff=lfs merge=lfs -text
tests/strucvars/query/db/** filter=lfs diff=lfs merge=lfs -text
src/strucvars/query/snapshots/*smoke_test*.snap filter=lfs diff=lfs merge=lfs -text
44 changes: 43 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,49 @@ Convert output of [varfish-db-downloader](https://github.com/bihealth/varfish-db
$ varfish-server-worker strucvars txt-to-bin \
--input-type {ClinvarSv,StrucvarInhouse,...} \
--path-input IN.txt \
--path-output-bin DST.bin
--path-output DST.bin
```

## The `strucvars query` Command

Run a query on a VCF file with structural variants as created by `strucvars ingest` using a varfish worker database.

```
$ varfish-server-worker strucvars query \
--genome-release grch37 \
--path-db path/to/worker-db \
--path-input IN.vcf.gz \
--path-output OUT.jsonl
```

The worker database has the following structure:

```
$ROOT/
noref/
genes/
acmg.tsv -- ACMG SF list genes
mim2gene.tsv -- OMIM to NCBI mapping from clingen
xlink.bin -- gene crosslinks
{genome_release}/ -- one per genome release
mehari/
txs.bin.zstd -- mehari transcripts
features/ -- features important for annotation
masked_repeat.bin -- masked repeats
masked_seqdup.bin -- masked segmental duplications
strucvars/ -- structural variant specific
bgdbs/ -- background databases
dbvar.bin -- dbVar
dgv.bin -- DGV
dgv-gs.bin -- DGV gold standard
exac.bin -- ExAC CNVs
g1k.bin -- 1000 genomes CNVs
gnomad.bin -- gnomAD-SVs
clinvar.bin -- ClinVar SVs
inhouse.bin -- inhouse SV database
patho-mms.bed -- well-known pathogenic DELs/DUPs
tads/
hesc.bed -- hESC TAD definitions
```

# Developer Information
Expand Down
7 changes: 6 additions & 1 deletion build.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
// The custom build script, needed as we use protocolbuffers.

fn main() {
println!("cargo:rerun-if-changed=src/proto/varfish/v1/clinvar.proto");
println!("cargo:rerun-if-changed=src/proto/varfish/v1/sv.proto");
prost_build::Config::new()
.protoc_arg("-Isrc/proto")
// Add serde serialization and deserialization to the generated code.
Expand All @@ -9,7 +11,10 @@ fn main() {
.type_attribute(".", "#[serde_with::skip_serializing_none]")
// Define the protobuf files to compile.
.compile_protos(
&["varfish/v1/clinvar.proto", "varfish/v1/sv.proto"],
&[
"src/proto/varfish/v1/clinvar.proto",
"src/proto/varfish/v1/sv.proto",
],
&["src/"],
)
.unwrap();
Expand Down
5 changes: 2 additions & 3 deletions misc/ok-query-trio.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,13 @@
"CNV"
],
"tx_effects": [
"transcript_ablation",
"transcript_translocation",
"transcript_variant",
"exon_variant",
"splice_region_variant",
"intron_variant",
"upstream_variant",
"downstream_variant",
"intergenic_variant",
"intergenic_variant"
],
"gene_allowlist": null,
"genomic_region": null,
Expand Down
7 changes: 5 additions & 2 deletions src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,10 +149,10 @@ where
)]
pub enum GenomeRelease {
// GRCh37 / hg19
#[strum(serialize = "GRCh37")]
#[strum(serialize = "grch37")]
Grch37,
/// GRCh38 / hg38
#[strum(serialize = "GRCh38")]
#[strum(serialize = "grch38")]
Grch38,
}

Expand Down Expand Up @@ -207,6 +207,8 @@ pub enum Genotype {
Het,
/// hom. alt.
HomAlt,
/// other, includes no-call
WithNoCall,
}

impl std::str::FromStr for Genotype {
Expand All @@ -217,6 +219,7 @@ impl std::str::FromStr for Genotype {
"0/0" | "0|0" | "0" => Genotype::HomRef,
"0/1" | "1/0" | "0|1" | "1|0" => Genotype::Het,
"1/1" | "1|1" | "1" => Genotype::HomAlt,
"./." | "./0" | "./1" | "0/." | "1/." => Genotype::WithNoCall,
_ => anyhow::bail!("invalid genotype value: {:?}", s),
})
}
Expand Down
1 change: 1 addition & 0 deletions src/seqvars/aggregate/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ fn handle_record(
};

let carrier_genotype = match (chrom, individual.sex, genotype) {
(_, _, Genotype::WithNoCall) => continue,
// on the autosomes, male/female count the same
(Chrom::Auto, _, Genotype::HomRef) => {
res_counts.count_an += 2;
Expand Down
1 change: 0 additions & 1 deletion src/seqvars/prefilter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,6 @@ mod test {
);

let params_file = tmpdir.to_path_buf().join("params.json");
eprintln!("params_json = {}", &params_json);
std::fs::write(&params_file, &params_json)?;

let args = super::Args {
Expand Down
1 change: 1 addition & 0 deletions src/strucvars/aggregate/output.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ impl Record {
};

match (chrom, individual.sex, genotype) {
(_, _, Genotype::WithNoCall) => continue,
// on the autosomes, male/female count the same
(Chrom::Auto, _, Genotype::HomRef) => (),
(Chrom::Auto, _, Genotype::Het) => {
Expand Down
1 change: 0 additions & 1 deletion src/strucvars/ingest/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,6 @@ impl mehari::annotate::seqvars::AnnotatedVcfWriter for WriterWrapper {
}

let key_callers: vcf::record::info::field::Key = "callers".parse()?;
eprintln!("callers = {:?}", &input_record.info().get(&key_callers));
if let Some(Some(callers)) = input_record.info().get(&key_callers) {
if let vcf::record::info::field::Value::Array(
vcf::record::info::field::value::Array::String(callers),
Expand Down
2 changes: 1 addition & 1 deletion src/strucvars/query/bgdbs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use crate::{
};

use super::{
records::ChromRange,
schema::ChromRange,
schema::{CaseQuery, StructuralVariant, SvType},
};

Expand Down
2 changes: 1 addition & 1 deletion src/strucvars/query/clinvar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use tracing::{info, warn};
use crate::common::{reciprocal_overlap, GenomeRelease, CHROMS};

use super::{
records::ChromRange,
schema::ChromRange,
schema::{Pathogenicity, StructuralVariant, SvType},
};

Expand Down
10 changes: 7 additions & 3 deletions src/strucvars/query/genes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ impl OmimDb {
}

#[tracing::instrument]
fn load_omim_db(path: &Path) -> Result<OmimDb, anyhow::Error> {
fn load_mim2gene_db(path: &Path) -> Result<OmimDb, anyhow::Error> {
tracing::debug!("loading OMIM TSV records from {:?}...", path);

let before_loading = Instant::now();
Expand Down Expand Up @@ -291,7 +291,7 @@ pub struct GeneDb {
pub ensembl: GeneRegionDb,
pub xlink: XlinkDb,
pub acmg: AcmgDb,
pub omim: OmimDb,
pub mim2gene: OmimDb,
}

impl GeneDb {
Expand Down Expand Up @@ -327,7 +327,11 @@ pub fn load_gene_db(path_db: &str, genome_release: GenomeRelease) -> Result<Gene
)?,
xlink: load_xlink_db(Path::new(path_db).join("noref/genes/xlink.bin").as_path())?,
acmg: load_acmg_db(Path::new(path_db).join("noref/genes/acmg.tsv").as_path())?,
omim: load_omim_db(Path::new(path_db).join("noref/genes/omim.tsv").as_path())?,
mim2gene: load_mim2gene_db(
Path::new(path_db)
.join("noref/genes/mim2gene.tsv")
.as_path(),
)?,
};

Ok(result)
Expand Down
Loading

0 comments on commit 1fb3c58

Please sign in to comment.