Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Oct 6, 2023
1 parent f7d1547 commit 4501696
Show file tree
Hide file tree
Showing 65 changed files with 2,419 additions and 51 deletions.
33 changes: 19 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -193,28 +193,33 @@ The following `FORMAT` fields are written:
Overall, the command will emit the following header rows in addition to the `##contig=<ID=.,length=.>` lines.

```
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##fileformat=VCFv4.4
##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">
##INFO=<ID=chr2,Number=1,Type=String,Description="Second chromosome, if not equal to CHROM">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the longest variant described in this record">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##INFO=<ID=SVLEN,Number=A,Type=Integer,Description="Length of structural variant">
##INFO=<ID=SVCLAIM,Number=A,Type=String,Description="Claim made by the structural variant call. Valid values are D, J, DJ for abunda
##INFO=<ID=annsv,Number=.,Type=String,Description="Effect annotations: 'Allele | Annotation | Gene_Name | Gene_ID'">
##INFO=<ID=SVCLAIM,Number=A,Type=String,Description="Claim made by the structural variant call. Valid values are D, J, DJ for abundance, adjacency and both respectively">
##INFO=<ID=chr2,Number=1,Type=String,Description="Second chromosome, if not equal to CHROM">
##INFO=<ID=annsv,Number=1,Type=String,Description="Effect annotations: 'Allele | Annotation | Gene_Name | Gene_ID'">
##FILTER=<ID=PASS,Description="All filters passed">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=pec,Number=1,Type=Integer,Description="Total coverage with paired-end reads">
##FORMAT=<ID=pev,Number=1,Type=Integer,Description="Variant paired-end read support">
##FORMAT=<ID=pev,Number=1,Type=Integer,Description="Paired-end reads supporting the variant">
##FORMAT=<ID=src,Number=1,Type=Integer,Description="Total coverage with split reads">
##FORMAT=<ID=srv,Number=1,Type=Integer,Description="Variant split reads support">
##FORMAT=<ID=amq,Number=1,Type=Integer,Description="Average mapping quality over SV">
##FORMAT=<ID=srv,Number=1,Type=Integer,Description="Split reads supporting the variant">
##FORMAT=<ID=amq,Number=1,Type=Float,Description="Average mapping quality over the variant">
##FORMAT=<ID=cn,Number=1,Type=Integer,Description="Copy number of the variant in the sample">
##FORMAT=<ID=anc,Number=1,Type=Float,Description="Average normalized coverage of the variant in the sample">
##FORMAT=<ID=pc,Number=1,Type=Integer,Description="Point count for CNV call (windows/targets/probes)">
##x-varfish-version=<ID=varfish-server-worker,Version=x.y.z>
##x-varfish-version=<ID=orig-caller,Name=DragenSV,Version=07.021.624.3.10.4>
##x-varfish-version=<ID=orig-caller,Name=Delly2,Version=1.1.7>
##FORMAT=<ID=anc,Number=1,Type=Float,Description="Average normalized coverage over the variant in the sample">
##FORMAT=<ID=pc,Number=1,Type=Integer,Description="Point count (windows/targets/probes)">
##SAMPLE=<ID=index,Sex="Male",Disease="Affected">
##SAMPLE=<ID=father,Sex="Male",Disease="Unaffected">
##SAMPLE=<ID=mother,Sex="Female",Disease="Unaffected">
##PEDIGREE=<ID=index,Father="father",Mother="mother">
##PEDIGREE=<ID=father>
##PEDIGREE=<ID=mother>
##x-varfish-version=<ID=varfish-server-worker,Version="x.y.z">
##x-varfish-version=<ID=Delly,Name="Delly",Version="1.1.3">
```

# Developer Information
Expand Down
139 changes: 137 additions & 2 deletions src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@ use std::{
};

use byte_unit::Byte;
use clap_verbosity_flag::{InfoLevel, Verbosity};

use clap::Parser;
use clap_verbosity_flag::{InfoLevel, Verbosity};
use flate2::{bufread::MultiGzDecoder, write::GzEncoder, Compression};
use hgvs::static_data::Assembly;
use indexmap::IndexMap;
use noodles_vcf as vcf;

/// Commonly used command line arguments.
#[derive(Parser, Debug)]
Expand Down Expand Up @@ -386,3 +386,138 @@ mod test {
Ok(())
}
}

/// Version string reported for the `varfish-server-worker` crate.
///
/// Test builds always yield the fixed placeholder `x.y.z`; every other build yields the
/// `CARGO_PKG_VERSION` value captured at compile time.
pub fn worker_version() -> &'static str {
    // Placeholder used under `cfg(test)` -- presumably so that expected test output does not
    // change with every release (TODO confirm).
    const TEST_VERSION: &str = "x.y.z";
    // Expanded at compile time from the build environment.
    const CRATE_VERSION: &str = env!("CARGO_PKG_VERSION");

    if !cfg!(test) {
        return CRATE_VERSION;
    }
    TEST_VERSION
}

/// Add contigs for GRCh37.
pub fn add_contigs_37(
builder: vcf::header::Builder,
) -> Result<vcf::header::Builder, anyhow::Error> {
use vcf::header::record::value::map::Contig;
use vcf::header::record::value::Map;

let mut builder = builder;

let specs: &[(&str, usize); 25] = &[
("1", 249250621),
("2", 243199373),
("3", 198022430),
("4", 191154276),
("5", 180915260),
("6", 171115067),
("7", 159138663),
("8", 146364022),
("9", 141213431),
("10", 135534747),
("11", 135006516),
("12", 133851895),
("13", 115169878),
("14", 107349540),
("15", 102531392),
("16", 90354753),
("17", 81195210),
("18", 78077248),
("19", 59128983),
("20", 63025520),
("21", 48129895),
("22", 51304566),
("X", 155270560),
("Y", 59373566),
("MT", 16569),
];

for (contig, length) in specs {
builder = builder.add_contig(
contig
.parse()
.map_err(|_| anyhow::anyhow!("invalid contig: {}", contig))?,
Map::<Contig>::builder()
.set_length(*length)
.insert(
"assembly"
.parse()
.map_err(|_| anyhow::anyhow!("invalid key: assembly"))?,
"GRCh37",
)
.insert(
"species"
.parse()
.map_err(|_| anyhow::anyhow!("invalid key: species"))?,
"Homo sapiens",
)
.build()?,
);
}

Ok(builder)
}

/// Add contigs for GRCh38.
pub fn add_contigs_38(
builder: vcf::header::Builder,
) -> Result<vcf::header::Builder, anyhow::Error> {
use vcf::header::record::value::map::Contig;
use vcf::header::record::value::Map;

let mut builder = builder;

let specs: &[(&str, usize); 25] = &[
("chr1", 248956422),
("chr2", 242193529),
("chr3", 198295559),
("chr4", 190214555),
("chr5", 181538259),
("chr6", 170805979),
("chr7", 159345973),
("chr8", 145138636),
("chr9", 138394717),
("chr10", 133797422),
("chr11", 135086622),
("chr12", 133275309),
("chr13", 114364328),
("chr14", 107043718),
("chr15", 101991189),
("chr16", 90338345),
("chr17", 83257441),
("chr18", 80373285),
("chr19", 58617616),
("chr20", 64444167),
("chr21", 46709983),
("chr22", 50818468),
("chrX", 156040895),
("chrY", 57227415),
("chrM", 16569),
];

for (contig, length) in specs {
builder = builder.add_contig(
contig
.parse()
.map_err(|_| anyhow::anyhow!("invalid contig: {}", contig))?,
Map::<Contig>::builder()
.set_length(*length)
.insert(
"assembly"
.parse()
.map_err(|_| anyhow::anyhow!("invalid key: assembly"))?,
"GRCh38",
)
.insert(
"species"
.parse()
.map_err(|_| anyhow::anyhow!("invalid key: species"))?,
"Homo sapiens",
)
.build()?,
);
}

Ok(builder)
}
2 changes: 1 addition & 1 deletion src/db/mk_inhouse/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ use crate::{
build_chrom_map, open_read_maybe_gz, open_write_maybe_gz, read_lines, trace_rss_now,
GenomeRelease, CHROMS,
},
sv::query::schema::SvType,
strucvars::query::schema::SvType,
};

/// Create one file with records for each chromosome and SV type.
Expand Down
2 changes: 1 addition & 1 deletion src/db/mk_inhouse/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
use serde::{de::IntoDeserializer, Deserialize, Deserializer, Serialize};

use crate::sv::query::schema::{StrandOrientation, SvType};
use crate::strucvars::query::schema::{StrandOrientation, SvType};

/// Representation of the fields from the `StructuralVariant` table from
/// VarFish Server that we need for building the background records.
Expand Down
2 changes: 1 addition & 1 deletion src/db/mk_inhouse/output.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
use serde::{Deserialize, Serialize};

use crate::sv::query::schema::{StrandOrientation, SvType};
use crate::strucvars::query::schema::{StrandOrientation, SvType};

use super::input::Record as InputRecord;

Expand Down
2 changes: 1 addition & 1 deletion src/db/to_bin/clinvar/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
//! Note that not the full model is implemented, only the parts that are needed for the
//! conversion of the ClinVar structural variants.
use crate::sv::query::clinvar::pbs::{Pathogenicity, VariationType};
use crate::strucvars::query::clinvar::pbs::{Pathogenicity, VariationType};

/// Accession of a ClinVar record.
#[derive(Debug, serde::Deserialize, serde::Serialize)]
Expand Down
4 changes: 2 additions & 2 deletions src/db/to_bin/clinvar/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use thousands::Separable;

use crate::{
common::{build_chrom_map, open_read_maybe_gz, trace_rss_now},
sv::query::clinvar::pbs::{Pathogenicity, SvDatabase, SvRecord},
strucvars::query::clinvar::pbs::{Pathogenicity, SvDatabase, SvRecord},
};

pub mod input;
Expand Down Expand Up @@ -71,7 +71,7 @@ fn convert_jsonl_to_protobuf(
for measure in &record.reference_clinvar_assertion.measures.measures {
// convert from JSONL to protocolbuffers: variation type
let variation_type: Result<
crate::sv::query::clinvar::pbs::VariationType,
crate::strucvars::query::clinvar::pbs::VariationType,
anyhow::Error,
> = measure.r#type.try_into();
let variation_type = if let Ok(variation_type) = variation_type {
Expand Down
2 changes: 1 addition & 1 deletion src/db/to_bin/vardbs/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use serde::Deserialize;
use tracing::error;

use crate::db::mk_inhouse::output::Record as InhouseDbRecord;
use crate::sv::query::schema::SvType;
use crate::strucvars::query::schema::SvType;

/// dbVar database record as read from TSV file.
#[derive(Debug, Deserialize)]
Expand Down
2 changes: 1 addition & 1 deletion src/db/to_bin/vardbs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use crate::common::{build_chrom_map, open_read_maybe_gz, trace_rss_now};
use crate::db;
use crate::db::mk_inhouse::output::Record as InhouseDbRecord;
use crate::db::pbs::{BackgroundDatabase, BgDbRecord};
use crate::sv::query::schema::SvType;
use crate::strucvars::query::schema::SvType;

use self::input::InputRecord;

Expand Down
24 changes: 14 additions & 10 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
pub mod common;
pub mod db;
pub mod seqvars;
pub mod sv;
pub mod strucvars;

use clap::{Args, Parser, Subcommand};
use console::{Emoji, Term};
Expand Down Expand Up @@ -32,8 +32,8 @@ struct Cli {
enum Commands {
/// Database building related commands.
Db(Db),
/// SV filtration related commands.
Sv(Sv),
/// Structural variant related commands.
Strucvars(Strucvars),
/// Sequence variant related commands.
Seqvars(Seqvars),
}
Expand All @@ -58,16 +58,17 @@ enum DbCommands {
/// Parsing of "strucvars *" sub commands.
#[derive(Debug, Args)]
#[command(args_conflicts_with_subcommands = true)]
struct Sv {
struct Strucvars {
/// The sub command to run
#[command(subcommand)]
command: SvCommands,
command: StrucvarsCommands,
}

/// Enum supporting the parsing of "strucvars *" sub commands.
#[derive(Debug, Subcommand)]
enum SvCommands {
Query(sv::query::Args),
enum StrucvarsCommands {
Ingest(strucvars::ingest::Args),
Query(strucvars::query::Args),
}

/// Parsing of "seqvars *" sub commands.
Expand Down Expand Up @@ -121,9 +122,12 @@ fn main() -> Result<(), anyhow::Error> {
seqvars::ingest::run(&cli.common, args)?;
}
},
Commands::Sv(sv) => match &sv.command {
SvCommands::Query(args) => {
sv::query::run(&cli.common, args)?;
Commands::Strucvars(strucvars) => match &strucvars.command {
StrucvarsCommands::Ingest(args) => {
strucvars::ingest::run(&cli.common, args)?;
}
StrucvarsCommands::Query(args) => {
strucvars::query::run(&cli.common, args)?;
}
},
}
Expand Down
11 changes: 1 addition & 10 deletions src/seqvars/ingest/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
use std::sync::{Arc, OnceLock};

use crate::common::{self, open_read_maybe_gz, open_write_maybe_gz, GenomeRelease};
use crate::common::{self, open_read_maybe_gz, open_write_maybe_gz, worker_version, GenomeRelease};
use mehari::annotate::seqvars::provider::MehariProvider;
use noodles_vcf as vcf;
use thousands::Separable;
Expand Down Expand Up @@ -33,15 +33,6 @@ pub struct Args {
pub path_out: String,
}

/// Return the version of the `varfish-server-worker` crate and `x.y.z` in tests.
fn worker_version() -> &'static str {
if cfg!(test) {
"x.y.z"
} else {
env!("CARGO_PKG_VERSION")
}
}

/// Return path component for the given assembly.
pub fn path_component(genomebuild: GenomeRelease) -> &'static str {
match genomebuild {
Expand Down
Loading

0 comments on commit 4501696

Please sign in to comment.