diff --git a/man/aldknni.Rd b/man/aldknni.Rd index 026e6d8..d264672 100644 --- a/man/aldknni.Rd +++ b/man/aldknni.Rd @@ -41,13 +41,13 @@ aldknni(fname, \item{frac_top_missing_loci}{fraction of loci with the highest number of pools with missing data to be omitted. Set to zero if the input vcf has already been filtered and the loci beyond the depth thresholds have been set to missing, otherwise set to an decimal number between zero and one. \link{Default=0.0}} -\item{min_loci_corr}{Minimum correlation (Pearson's correlation) between the locus requiring imputation and other loci deemed to be in linkage with it. Ranges from 0.0 to 1.0. If using the default value with is NA, then this threshold will be optimised to find the best value minimising imputation error. \link{Default=NA}} +\item{min_loci_corr}{Minimum correlation (Pearson's correlation) between the locus requiring imputation and other loci deemed to be in linkage with it. Ranges from 0.0 to 1.0. If using the default value with is NA, then this threshold will be optimised to find the best value minimising imputation error. \link{Default=0.9}} -\item{max_pool_dist}{Maximum genetic distance (mean absolute difference in allele frequencies) between the pool or sample requiring imputation and pools or samples deemed to be the closest neighbours. Ranges from 0.0 to 1.0. If using the default value with is NA, then this threshold will be optimised to find the best value minimising imputation error. \link{Default=NA}} +\item{max_pool_dist}{Maximum genetic distance (mean absolute difference in allele frequencies) between the pool or sample requiring imputation and pools or samples deemed to be the closest neighbours. Ranges from 0.0 to 1.0. If using the default value with is NA, then this threshold will be optimised to find the best value minimising imputation error. \link{Default=0.1}} -\item{min_l_loci}{Minimum number of linked loci to be used in estimating genetic distances between the pool or sample requiring imputation and other pools or samples. Minimum value of 1. \link{Default=1}} +\item{min_l_loci}{Minimum number of linked loci to be used in estimating genetic distances between the pool or sample requiring imputation and other pools or samples. Minimum value of 1. \link{Default=20}} -\item{min_k_neighbours}{Minimum number of k-nearest neighbours of the pool or sample requiring imputation. Minimum value of 1. \link{Default=1}} +\item{min_k_neighbours}{Minimum number of k-nearest neighbours of the pool or sample requiring imputation. Minimum value of 1. \link{Default=5}} \item{restrict_linked_loci_per_chromosome}{Restrict the choice of linked loci to within the chromosome the locus requiring imputation belong to? \link{Default=TRUE}} diff --git a/src/rust/src/aldknni.rs b/src/rust/src/aldknni.rs index fdcc0d0..3796486 100644 --- a/src/rust/src/aldknni.rs +++ b/src/rust/src/aldknni.rs @@ -274,13 +274,39 @@ impl GenotypesAndPhenotypes { .intercept_and_allele_frequencies .slice(s![.., idx_ini..idx_fin]) .to_owned(); + // Local positions in the chunk (add loci_idx[*idx_loci_idx_ini] to get the global index) + let mut vec_chunk_loci_idx: Vec = vec![]; + // for ix in *idx_loci_idx_ini..=*idx_loci_idx_fin { + // vec_chunk_loci_idx.push(loci_idx[ix] - idx_ini); + // } + for idx in loci_idx + .iter() + .take(*idx_loci_idx_fin + 1) + .skip(*idx_loci_idx_ini) + { + vec_chunk_loci_idx.push(idx - idx_ini); + } // Define the 2D array used for storing the minimum mean absolute error (MAE) estimates across all missing data across pools and loci. // We encode the MAE as u8 for memory efficiency, where we set 0: u8 as missing. let mut mae_u8: Array2 = Array::from_elem(allele_frequencies.dim(), 0); Zip::indexed(&mut allele_frequencies) .and(&mut mae_u8) .par_for_each(|(i, j_local), q, mu8| { - if q.is_nan() { + + // Find the locus index where the current allele is part of + let mut idx_of_vec_chunk_loci_idx = 0; + for (idx, j) in vec_chunk_loci_idx.iter().enumerate() { + if *j <= j_local { + idx_of_vec_chunk_loci_idx = idx; + } else { + break; + } + } + // Determine if the allele is the last allele in a allelic locus (>=2 alleles per locus being represented here) + let n_alleles = vec_chunk_loci_idx[idx_of_vec_chunk_loci_idx + 1] - vec_chunk_loci_idx[idx_of_vec_chunk_loci_idx]; + let current_allele_is_the_nth_allele = (j_local - vec_chunk_loci_idx[idx_of_vec_chunk_loci_idx]) + 1; + // Impute if the allele is missing and if it is not the last allele in its locus + if q.is_nan() & (current_allele_is_the_nth_allele < n_alleles) { // Define global locus index let j = j_local + idx_ini; // Define the current chromosome @@ -441,10 +467,6 @@ impl GenotypesAndPhenotypes { } // Correct for allele frequency over- and under-flows, as we are assuming all loci are represented by all of its alleles (see assumptions above) let n = allele_frequencies.nrows(); - let mut vec_chunk_loci_idx: Vec = vec![]; // Local positions in the chunk (add loci_idx[*idx_loci_idx_ini] to get the global index) - for ix in *idx_loci_idx_ini..=*idx_loci_idx_fin { - vec_chunk_loci_idx.push(loci_idx[ix] - loci_idx[*idx_loci_idx_ini]); - } for j in 0..(vec_chunk_loci_idx.len() - 1) { let idx_locus_ini = vec_chunk_loci_idx[j]; let idx_locus_fin = vec_chunk_loci_idx[j + 1]; @@ -453,6 +475,12 @@ impl GenotypesAndPhenotypes { [(i, (idx_locus_ini + loci_idx[*idx_loci_idx_ini]))] .is_nan() { + // Use the additive inverse to impute the skipped allele per locus + let sum = allele_frequencies + .slice(s![i, idx_locus_ini..(idx_locus_fin - 1)]) + .sum(); + allele_frequencies[(i, idx_locus_fin - 1)] = 1.00 - sum; + // Make sure the allele frequencies per locus sum up to one let sum = allele_frequencies .slice(s![i, idx_locus_ini..idx_locus_fin]) .sum(); diff --git a/tests/tests.R b/tests/tests.R index da3bd10..df49638 100644 --- a/tests/tests.R +++ b/tests/tests.R @@ -28,9 +28,9 @@ test_that( test_that( "aldknni_fixed", { print("aldknni_fixed:") - vcf = fn_extract_missing(aldknni(fname=fname_vcf, min_loci_corr=0.9, max_pool_dist=0.1)) - sync = fn_extract_missing(aldknni(fname=fname_sync, min_loci_corr=0.9, max_pool_dist=0.1)) - csv = fn_extract_missing(aldknni(fname=fname_csv, min_loci_corr=0.9, max_pool_dist=0.1)) - c(0, 0, 0, 0, 1, 0) + vcf = fn_extract_missing(aldknni(fname=fname_vcf)) + sync = fn_extract_missing(aldknni(fname=fname_sync)) + csv = fn_extract_missing(aldknni(fname=fname_csv)) - c(0, 0, 0, 0, 1, 0) expect_equal(vcf, sync, tolerance=0.1) expect_equal(vcf, csv, tolerance=0.1) } @@ -39,9 +39,9 @@ test_that( test_that( "aldknni_optim", { print("aldknni_optim:") - vcf = fn_extract_missing(aldknni(fname=fname_vcf)) - sync = fn_extract_missing(aldknni(fname=fname_sync)) - csv = fn_extract_missing(aldknni(fname=fname_csv)) - c(0, 0, 0, 0, 1, 0) + vcf = fn_extract_missing(aldknni(fname=fname_vcf, min_loci_corr=NA, max_pool_dist=NA)) + sync = fn_extract_missing(aldknni(fname=fname_sync, min_loci_corr=NA, max_pool_dist=NA)) + csv = fn_extract_missing(aldknni(fname=fname_csv, min_loci_corr=NA, max_pool_dist=NA)) - c(0, 0, 0, 0, 1, 0) expect_equal(vcf, sync, tolerance=0.1) expect_equal(vcf, csv, tolerance=0.1) }