Skip to content

Commit

Permalink
reducing time complexity by skipping imputation of the last allele pe…
Browse files Browse the repository at this point in the history
…r locus
  • Loading branch information
jeffersonfparil committed Feb 13, 2024
1 parent e2efc0e commit 35496cb
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 15 deletions.
8 changes: 4 additions & 4 deletions man/aldknni.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

38 changes: 33 additions & 5 deletions src/rust/src/aldknni.rs
Original file line number Diff line number Diff line change
Expand Up @@ -274,13 +274,39 @@ impl GenotypesAndPhenotypes {
.intercept_and_allele_frequencies
.slice(s![.., idx_ini..idx_fin])
.to_owned();
// Local positions in the chunk (add loci_idx[*idx_loci_idx_ini] to get the global index)
let mut vec_chunk_loci_idx: Vec<usize> = vec![];
// for ix in *idx_loci_idx_ini..=*idx_loci_idx_fin {
// vec_chunk_loci_idx.push(loci_idx[ix] - idx_ini);
// }
for idx in loci_idx
.iter()
.take(*idx_loci_idx_fin + 1)
.skip(*idx_loci_idx_ini)
{
vec_chunk_loci_idx.push(idx - idx_ini);
}
// Define the 2D array used for storing the minimum mean absolute error (MAE) estimates across all missing data across pools and loci.
// We encode the MAE as u8 for memory efficiency, where we set 0: u8 as missing.
let mut mae_u8: Array2<u8> = Array::from_elem(allele_frequencies.dim(), 0);
Zip::indexed(&mut allele_frequencies)
.and(&mut mae_u8)
.par_for_each(|(i, j_local), q, mu8| {
if q.is_nan() {

// Find the locus index where the current allele is part of
let mut idx_of_vec_chunk_loci_idx = 0;
for (idx, j) in vec_chunk_loci_idx.iter().enumerate() {
if *j <= j_local {
idx_of_vec_chunk_loci_idx = idx;
} else {
break;
}
}
// Determine if the allele is the last allele in a allelic locus (>=2 alleles per locus being represented here)
let n_alleles = vec_chunk_loci_idx[idx_of_vec_chunk_loci_idx + 1] - vec_chunk_loci_idx[idx_of_vec_chunk_loci_idx];
let current_allele_is_the_nth_allele = (j_local - vec_chunk_loci_idx[idx_of_vec_chunk_loci_idx]) + 1;
// Impute if the allele is missing and if it is not the last allele in its locus
if q.is_nan() & (current_allele_is_the_nth_allele < n_alleles) {
// Define global locus index
let j = j_local + idx_ini;
// Define the current chromosome
Expand Down Expand Up @@ -441,10 +467,6 @@ impl GenotypesAndPhenotypes {
}
// Correct for allele frequency over- and under-flows, as we are assuming all loci are represented by all of its alleles (see assumptions above)
let n = allele_frequencies.nrows();
let mut vec_chunk_loci_idx: Vec<usize> = vec![]; // Local positions in the chunk (add loci_idx[*idx_loci_idx_ini] to get the global index)
for ix in *idx_loci_idx_ini..=*idx_loci_idx_fin {
vec_chunk_loci_idx.push(loci_idx[ix] - loci_idx[*idx_loci_idx_ini]);
}
for j in 0..(vec_chunk_loci_idx.len() - 1) {
let idx_locus_ini = vec_chunk_loci_idx[j];
let idx_locus_fin = vec_chunk_loci_idx[j + 1];
Expand All @@ -453,6 +475,12 @@ impl GenotypesAndPhenotypes {
[(i, (idx_locus_ini + loci_idx[*idx_loci_idx_ini]))]
.is_nan()
{
// Use the additive inverse to impute the skipped allele per locus
let sum = allele_frequencies
.slice(s![i, idx_locus_ini..(idx_locus_fin - 1)])
.sum();
allele_frequencies[(i, idx_locus_fin - 1)] = 1.00 - sum;
// Make sure the allele frequencies per locus sum up to one
let sum = allele_frequencies
.slice(s![i, idx_locus_ini..idx_locus_fin])
.sum();
Expand Down
12 changes: 6 additions & 6 deletions tests/tests.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ test_that(
test_that(
"aldknni_fixed", {
print("aldknni_fixed:")
vcf = fn_extract_missing(aldknni(fname=fname_vcf, min_loci_corr=0.9, max_pool_dist=0.1))
sync = fn_extract_missing(aldknni(fname=fname_sync, min_loci_corr=0.9, max_pool_dist=0.1))
csv = fn_extract_missing(aldknni(fname=fname_csv, min_loci_corr=0.9, max_pool_dist=0.1)) - c(0, 0, 0, 0, 1, 0)
vcf = fn_extract_missing(aldknni(fname=fname_vcf))
sync = fn_extract_missing(aldknni(fname=fname_sync))
csv = fn_extract_missing(aldknni(fname=fname_csv)) - c(0, 0, 0, 0, 1, 0)
expect_equal(vcf, sync, tolerance=0.1)
expect_equal(vcf, csv, tolerance=0.1)
}
Expand All @@ -39,9 +39,9 @@ test_that(
test_that(
"aldknni_optim", {
print("aldknni_optim:")
vcf = fn_extract_missing(aldknni(fname=fname_vcf))
sync = fn_extract_missing(aldknni(fname=fname_sync))
csv = fn_extract_missing(aldknni(fname=fname_csv)) - c(0, 0, 0, 0, 1, 0)
vcf = fn_extract_missing(aldknni(fname=fname_vcf, min_loci_corr=NA, max_pool_dist=NA))
sync = fn_extract_missing(aldknni(fname=fname_sync, min_loci_corr=NA, max_pool_dist=NA))
csv = fn_extract_missing(aldknni(fname=fname_csv, min_loci_corr=NA, max_pool_dist=NA)) - c(0, 0, 0, 0, 1, 0)
expect_equal(vcf, sync, tolerance=0.1)
expect_equal(vcf, csv, tolerance=0.1)
}
Expand Down

0 comments on commit 35496cb

Please sign in to comment.