Skip to content

Commit

Permalink
feat: add multi-cohort support for gnomAD CNV v4 (#322)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Nov 21, 2023
1 parent cca54b4 commit 1d0cd26
Show file tree
Hide file tree
Showing 18 changed files with 509 additions and 41 deletions.
25 changes: 18 additions & 7 deletions protos/annonars/gnomad/gnomad_cnv4.proto
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ message CarrierCounts {

// Store the carrier counts for the given sub cohort and sub cohort factored by sex.
message CarrierCountsBySex {
// Overall allele counts in the sub cohort.
// Overall carrier counts in the sub cohort.
CarrierCounts overall = 1;
// Allele counts in female/XX karyotype individuals of sub cohort.
// Carrier counts in female/XX karyotype individuals of sub cohort.
optional CarrierCounts xx = 2;
// Allele counts in male/XY karyotype individuals of sub cohort.
// Carrier counts in male/XY karyotype individuals of sub cohort.
optional CarrierCounts xy = 3;
}

Expand All @@ -49,13 +49,23 @@ enum Population {
}

// Store the carrier counts for a population.
message PopulationAlleleCounts {
message PopulationCarrierCounts {
// The population.
Population population = 1;
// The overall allele counts and the one by sex.
CarrierCountsBySex counts = 2;
}

// Store the allele counts for the given cohort.
message CohortCarrierCounts {
// Name of the cohort, empty for global.
optional string cohort = 1;
// The overall carrier counts and the one by sex.
CarrierCountsBySex by_sex = 2;
// Carrier counts for each population.
repeated PopulationCarrierCounts by_population = 3;
}

// One record in the gnomAD-CNV v4 dataset.
message Record {
// Chromosome name.
Expand Down Expand Up @@ -91,8 +101,9 @@ message Record {
// Symbols of genes predicted to be impacted by variant
repeated string genes = 13;

// Global carrier counts.
CarrierCountsBySex counts = 14;
// Carrier counts in the different population.
repeated PopulationAlleleCounts by_population = 15;
//
// The populations in gnomAD CNV v4 are: empty for global, "non_neuro",
// and "non_neuro_coontrols".
repeated CohortCarrierCounts carrier_counts = 14;
}
69 changes: 61 additions & 8 deletions src/gnomad_sv/cli/import/gnomad_cnv4.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ use crate::{
common::noodles::{get_f32, get_i32, get_string, get_vec_str},
pbs::gnomad::exac_cnv::CnvType,
pbs::gnomad::gnomad_cnv4::{
CarrierCounts, CarrierCountsBySex, Population, PopulationAlleleCounts, Record,
CarrierCounts, CarrierCountsBySex, CohortCarrierCounts, Population,
PopulationCarrierCounts, Record,
},
};

Expand Down Expand Up @@ -63,6 +64,7 @@ impl Record {
/// # Arguments
///
/// * `record` - VCF record to create new record from.
/// * `cohort_name` - Name of the cohort.
///
/// # Returns
///
Expand All @@ -71,7 +73,10 @@ impl Record {
/// # Errors
///
/// * Any error encountered during the creation.
pub fn from_vcf_record(record: &noodles_vcf::Record) -> Result<Self, anyhow::Error> {
pub fn from_vcf_record(
record: &noodles_vcf::Record,
cohort_name: &str,
) -> Result<Self, anyhow::Error> {
let chrom = record.chromosome().to_string();
let start: usize = record.position().into();
let stop = get_i32(record, "END").expect("no END?");
Expand All @@ -92,7 +97,7 @@ impl Record {
let n_exn_var = get_i32(record, "N_EXN_VAR,").unwrap_or_default();
let n_int_var = get_i32(record, "N_INT_VAR").unwrap_or_default();
let genes = get_vec_str(record, "GENES").unwrap_or_default();
let counts = Some(Self::carrier_counts_by_sex_from_vcf_record(record, None)?);
let by_sex = Some(Self::carrier_counts_by_sex_from_vcf_record(record, None)?);
let mut by_population = Vec::new();
for population in [
Population::Afr,
Expand All @@ -104,14 +109,23 @@ impl Record {
Population::Nfe,
Population::Sas,
] {
by_population.push(PopulationAlleleCounts {
by_population.push(PopulationCarrierCounts {
population: population as i32,
counts: Some(Self::carrier_counts_by_sex_from_vcf_record(
record,
Some(population),
)?),
})
}
let carrier_counts = vec![CohortCarrierCounts {
cohort: if cohort_name.is_empty() {
None
} else {
Some(cohort_name.to_string())
},
by_sex,
by_population,
}];
Ok(Self {
chrom,
start: start as i32,
Expand All @@ -126,8 +140,7 @@ impl Record {
n_exn_var,
n_int_var,
genes,
counts,
by_population,
carrier_counts,
})
}

Expand Down Expand Up @@ -158,6 +171,24 @@ impl Record {

Ok(CarrierCounts { sc, sf, sn })
}

/// Merge with another record.
///
/// We assume that the record IDs are the same and just concatenate the allele counts.
///
/// # Arguments
///
/// * `other` - Other record to merge with.
///
/// # Returns
///
/// * `self` - Merged record.
pub fn merge_with(mut self, other: Self) -> Self {
let mut other = other;
self.carrier_counts.append(&mut other.carrier_counts);
self.carrier_counts.sort_by(|a, b| a.cohort.cmp(&b.cohort));
self
}
}

/// Perform import of gnomAD-CNV v4 data.
Expand All @@ -176,16 +207,38 @@ pub fn import(
cf_data: &Arc<rocksdb::BoundColumnFamily>,
path_in_vcf: &str,
) -> Result<(), anyhow::Error> {
let cohort_name = if path_in_vcf.contains("non_neuro_controls") {
"non_neuro_controls"
} else if path_in_vcf.contains("non_neuro") {
"non_neuro"
} else {
"all"
};
tracing::info!("importing gnomAD-CNV v4 {} cohort", cohort_name);

let mut reader = noodles_vcf::reader::Builder::default().build_from_path(path_in_vcf)?;
let header = reader.read_header()?;

for result in reader.records(&header) {
let vcf_record = result?;
let key = format!("{}", vcf_record.ids()).into_bytes();

// Build record for VCF record and write it to database.
let record = Record::from_vcf_record(&vcf_record)
// Build record for VCF record.
let record = Record::from_vcf_record(&vcf_record, cohort_name)
.map_err(|e| anyhow::anyhow!("problem building record from VCF: {}", e))?;

// Attempt to read existing record from the database.
let data = db
.get_cf(cf_data, key.clone())
.map_err(|e| anyhow::anyhow!("problem querying database: {}", e))?;
let record = if let Some(data) = data {
let db_record = Record::decode(&data[..])?;
db_record.merge_with(record)
} else {
record
};

// Write back new or merged records.
db.put_cf(cf_data, key, record.encode_to_vec())?;
}

Expand Down
6 changes: 3 additions & 3 deletions src/gnomad_sv/cli/import/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,10 @@ pub fn run(common: &common::cli::Args, args: &Args) -> Result<(), anyhow::Error>
}
(GnomadVersion::Four, GnomadKind::Exomes, common::cli::GenomeRelease::Grch38) => {
tracing::info!("- selected gnomAD CNV v4 import for GRCh38");
if args.path_in_vcf.len() != 1 {
anyhow::bail!("gnomAD CNV v4 import requires exactly one input file");
for path_in_vcf in &args.path_in_vcf {
tracing::info!(" - file {}", &path_in_vcf);
gnomad_cnv4::import(&db, &cf_data, path_in_vcf)?;
}
gnomad_cnv4::import(&db, &cf_data, &args.path_in_vcf[0])?;
}
(GnomadVersion::Four, GnomadKind::Genomes, common::cli::GenomeRelease::Grch38) => {
tracing::info!("- selected gnomAD SV v4 import for GRCh38");
Expand Down
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
22 changes: 13 additions & 9 deletions tests/gnomad-sv/gnomad-cnv4/bootstrap.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,26 @@ trap "rm -rf $TMPDIR" EXIT

rm -f $SCRIPT_DIR/*.vcf*

curl \
https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/exome_cnv/gnomad.v4.0.cnv.all.vcf.gz \
| zcat \
| head -n 200 \
> $SCRIPT_DIR/gnomad.v4.0.cnv.all.vcf
for token in all non_neuro non_neuro_controls; do
curl \
https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/exome_cnv/gnomad.v4.0.cnv.$token.vcf.gz \
| zcat \
| head -n 200 \
> $SCRIPT_DIR/gnomad.v4.0.cnv.$token.vcf

bgzip -c $SCRIPT_DIR/gnomad.v4.0.cnv.all.vcf \
> $SCRIPT_DIR/gnomad.v4.0.cnv.all.vcf.gz
bgzip -c $SCRIPT_DIR/gnomad.v4.0.cnv.$token.vcf \
> $SCRIPT_DIR/gnomad.v4.0.cnv.$token.vcf.gz

tabix -f $SCRIPT_DIR/gnomad.v4.0.cnv.all.vcf.gz
tabix -f $SCRIPT_DIR/gnomad.v4.0.cnv.$token.vcf.gz

rm -rf $SCRIPT_DIR/rocksdb
rm -rf $SCRIPT_DIR/rocksdb
done

cargo run -- \
gnomad-sv import \
--path-in-vcf $SCRIPT_DIR/gnomad.v4.0.cnv.all.vcf.gz \
--path-in-vcf $SCRIPT_DIR/gnomad.v4.0.cnv.non_neuro.vcf.gz \
--path-in-vcf $SCRIPT_DIR/gnomad.v4.0.cnv.non_neuro_controls.vcf.gz \
--path-out-rocksdb $SCRIPT_DIR/rocksdb \
--gnomad-kind exomes \
--genome-release grch38 \
Expand Down
Loading

0 comments on commit 1d0cd26

Please sign in to comment.