From b5da829cb5dfb2a2bed21f151711366c869ae5c8 Mon Sep 17 00:00:00 2001 From: jeffersonfparil Date: Wed, 7 Feb 2024 15:18:12 +1100 Subject: [PATCH 1/8] testing new optimisation algorithm where we optimise for corr and dist per locus --- src/rust/src/aldknni.rs | 305 +++++++++++++++++++++++++++++++++------- 1 file changed, 255 insertions(+), 50 deletions(-) diff --git a/src/rust/src/aldknni.rs b/src/rust/src/aldknni.rs index d35e90c..c2b8bbf 100644 --- a/src/rust/src/aldknni.rs +++ b/src/rust/src/aldknni.rs @@ -1,4 +1,5 @@ use ndarray::{prelude::*, Zip}; +use rand::prelude::IteratorRandom; use std::cmp::Ordering; use std::io; @@ -271,8 +272,8 @@ impl GenotypesAndPhenotypes { pub fn adaptive_ld_knn_imputation( &mut self, self_clone: &GenotypesAndPhenotypes, - min_loci_corr: &f64, - max_pool_dist: &f64, + _min_loci_corr: &f64, + _max_pool_dist: &f64, min_l_loci: &u64, min_k_neighbours: &u64, loci_idx: &Vec, @@ -282,12 +283,12 @@ impl GenotypesAndPhenotypes { self.check().expect("Error self.check() within adaptive_ld_knn_imputation() method of GenotypesAndPhenotypes struct."); // We are assuming that all non-zero alleles across pools are kept, i.e. biallelic loci have 2 columns, triallelic have 3, and so on. let (n, p) = self.intercept_and_allele_frequencies.dim(); - let min_l_loci = if *min_l_loci >= (p as u64) { + let _min_l_loci = if *min_l_loci >= (p as u64) { p - 1 } else { *min_l_loci as usize }; - let min_k_neighbours = if *min_k_neighbours >= (n as u64) { + let _min_k_neighbours = if *min_k_neighbours >= (n as u64) { n - 1 } else { *min_k_neighbours as usize @@ -295,10 +296,204 @@ impl GenotypesAndPhenotypes { Zip::indexed(&mut self.intercept_and_allele_frequencies) .par_for_each(|(i, j), q| { if q.is_nan() { + // let current_chromosome = self_clone.chromosome[j].to_owned(); + // // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) + // let (linked_loci_idx, _correlations) = + // find_l_linked_loci(j, corr, min_loci_corr, min_l_loci, + // restrict_linked_loci_per_chromosome, + // ¤t_chromosome, + // &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools + // let distances_all_loci = calculate_genetic_distances_between_pools( + // i, + // &linked_loci_idx, + // &self_clone.intercept_and_allele_frequencies) + // .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) + // let (distances, frequencies) = + // find_k_nearest_neighbours( + // &distances_all_loci, + // max_pool_dist, + // min_k_neighbours, + // j, + // &self_clone.intercept_and_allele_frequencies, + // ) + // .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Impute missing allele frequencies at the current locus + // *q = impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + + + + // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // // Testing locus-specific optimisation for l and k + // let current_chromosome = self_clone.chromosome[j].to_owned(); + // let vec_q: ArrayView1 = self_clone.intercept_and_allele_frequencies.column(j); + // let n_non_missing = vec_q.fold(0, |t, &x| if !x.is_nan() {t+1}else{t}); + // let min_loci_corr: f64 = 0.0; + // let max_pool_dist: f64 = 1.0; + // let vec_min_l_loci: Vec = (1..75).collect(); + // let vec_min_k_neighbours: Vec = if n < 75 { + // (1..n).collect() + // } else { + // (1..75).collect() + // }; + // let n_reps = 3; + // let n_reps = if n_reps <= n_non_missing { + // n_reps + // } else { + // n_non_missing + // }; + // let mut rng = rand::thread_rng(); + // let idx_random_pools: Vec = (0..vec_q.len()).filter(|&idx| !vec_q[idx].is_nan()).choose_multiple(&mut rng, n_reps); + // let mut optimum_mae = 1.0; + // let mut optimum_min_l_loci = 0; + // let mut optimum_min_k_neighbours = 0; + // // Across l + // for min_l_loci in vec_min_l_loci.iter() { + // // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) + // let (linked_loci_idx, _correlations) = + // find_l_linked_loci(j, corr, &min_loci_corr, *min_l_loci, + // restrict_linked_loci_per_chromosome, + // ¤t_chromosome, + // &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Across k + // for min_k_neighbours in vec_min_k_neighbours.iter() { + // // Across reps + // let mut mae = 0.0; + // for idx_i in idx_random_pools.iter() { + // // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools + // let distances_all_loci = calculate_genetic_distances_between_pools( + // *idx_i, + // &linked_loci_idx, + // &self_clone.intercept_and_allele_frequencies) + // .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) + // let (distances, frequencies) = + // find_k_nearest_neighbours( + // &distances_all_loci, + // &max_pool_dist, + // *min_k_neighbours, + // j, + // &self_clone.intercept_and_allele_frequencies, + // ) + // .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Impute and find the error + // mae += (vec_q[*idx_i] - impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait.") + // ).abs(); + // } + // mae /= n_reps as f64; + // if (mae <= f64::EPSILON) | (mae > optimum_mae) { + // break; + // } + // if mae < optimum_mae { + // optimum_mae = mae; + // optimum_min_l_loci = *min_l_loci; + // optimum_min_k_neighbours = *min_k_neighbours; + // } + // } + // } + // // Impute actual missing data point (ith pool and jth locus) + // // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) + // let (linked_loci_idx, _correlations) = + // find_l_linked_loci(j, corr, &min_loci_corr, optimum_min_l_loci, + // restrict_linked_loci_per_chromosome, + // ¤t_chromosome, + // &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools + // let distances_all_loci = calculate_genetic_distances_between_pools( + // i, + // &linked_loci_idx, + // &self_clone.intercept_and_allele_frequencies) + // .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) + // let (distances, frequencies) = + // find_k_nearest_neighbours( + // &distances_all_loci, + // &max_pool_dist, + // optimum_min_k_neighbours, + // j, + // &self_clone.intercept_and_allele_frequencies, + // ) + // .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Impute missing allele frequencies at the current locus + // *q = impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // println!("q={:?}; mae={:?}; l={}; k={}", q, optimum_mae, optimum_min_l_loci, optimum_min_k_neighbours); + // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Testing locus-specific optimisation for corr and dist let current_chromosome = self_clone.chromosome[j].to_owned(); + let vec_q: ArrayView1 = self_clone.intercept_and_allele_frequencies.column(j); + let n_non_missing = vec_q.fold(0, |t, &x| if !x.is_nan() {t+1}else{t}); + let vec_min_loci_corr: Vec = (0..=10).rev().map(|x| x as f64 / 10.0).collect(); + let vec_max_pool_dist: Vec = (0..=10).map(|x| x as f64 / 10.0).collect(); + let min_l_loci: usize = 1; + let min_k_neighbours: usize = 1; + let n_reps = 5; + let n_reps = if n_reps <= n_non_missing { + n_reps + } else { + n_non_missing + }; + let mut rng = rand::thread_rng(); + let idx_random_pools: Vec = (0..vec_q.len()).filter(|&idx| !vec_q[idx].is_nan()).choose_multiple(&mut rng, n_reps); + let mut optimum_mae = 1.0; + let mut optimum_min_loci_corr = 0.0; + let mut optimum_max_pool_dist = 1.0; + // Across l + for min_loci_corr in vec_min_loci_corr.iter() { + // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) + let (linked_loci_idx, _correlations) = + find_l_linked_loci(j, corr, min_loci_corr, min_l_loci, + restrict_linked_loci_per_chromosome, + ¤t_chromosome, + &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // Across k + for max_pool_dist in vec_max_pool_dist.iter() { + // Across reps + let mut mae = 0.0; + for idx_i in idx_random_pools.iter() { + // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools + let distances_all_loci = calculate_genetic_distances_between_pools( + *idx_i, + &linked_loci_idx, + &self_clone.intercept_and_allele_frequencies) + .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) + let (distances, frequencies) = + find_k_nearest_neighbours( + &distances_all_loci, + max_pool_dist, + min_k_neighbours, + j, + &self_clone.intercept_and_allele_frequencies, + ) + .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // Impute and find the error + mae += (vec_q[*idx_i] - impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait.") + ).abs(); + } + mae /= n_reps as f64; + if (mae <= f64::EPSILON) | (mae > optimum_mae) { + break; + } + if mae < optimum_mae { + optimum_mae = mae; + optimum_min_loci_corr = *min_loci_corr; + optimum_max_pool_dist = *max_pool_dist; + } + } + } + // Impute actual missing data point (ith pool and jth locus) // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) let (linked_loci_idx, _correlations) = - find_l_linked_loci(j, corr, min_loci_corr, min_l_loci, + find_l_linked_loci(j, corr, &optimum_min_loci_corr, min_l_loci, restrict_linked_loci_per_chromosome, ¤t_chromosome, &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); @@ -312,7 +507,7 @@ impl GenotypesAndPhenotypes { let (distances, frequencies) = find_k_nearest_neighbours( &distances_all_loci, - max_pool_dist, + &optimum_max_pool_dist, min_k_neighbours, j, &self_clone.intercept_and_allele_frequencies, @@ -320,6 +515,12 @@ impl GenotypesAndPhenotypes { .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); // Impute missing allele frequencies at the current locus *q = impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + println!("q={:?}; mae={:?}; corr={}; dist={}", q, optimum_mae, optimum_min_loci_corr, optimum_max_pool_dist); + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + } }); // Correct for allele frequency over- and under-flows if we have more than 1 allele representing each locus @@ -351,7 +552,7 @@ impl GenotypesAndPhenotypes { } pub fn impute_aldknni( - mut genotypes_and_phenotypes: GenotypesAndPhenotypes, + genotypes_and_phenotypes: GenotypesAndPhenotypes, filter_stats: &FilterStats, min_loci_corr: &f64, max_pool_dist: &f64, @@ -382,49 +583,53 @@ pub fn impute_aldknni( restrict_linked_loci_per_chromosome, ) .expect("Error estimating pairwise linkage between loci across the entire genome."); - println!("Optimising and/or estimating imputation accuracy."); - let ( - optimum_min_loci_corr, - optimum_max_pool_dist, - optimum_min_l_loci, - optimum_min_k_neighbours, - optimum_mae, - ) = optimise_params_and_estimate_accuracy( - &genotypes_and_phenotypes, - &self_clone, - min_loci_corr, - max_pool_dist, - min_l_loci, - min_k_neighbours, - &loci_idx, - &corr, - restrict_linked_loci_per_chromosome, - optimise_n_steps_min_loci_corr, - optimise_n_steps_max_pool_dist, - optimise_max_l_loci, - optimise_max_k_neighbours, - optimise_n_reps, - ) - .expect("Error calling optimise_params_and_estimate_accuracy() in impute_aldknni()."); - println!("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); - println!( - "Minimum loci correlation threshold: {}", - optimum_min_loci_corr - ); - println!( - "Maximum neighbour distance threshold: {}", - optimum_max_pool_dist - ); - println!("Minimum number of linked loci: {}", optimum_min_l_loci); - println!( - "Minimum number of k-nearest neighbours: {}", - optimum_min_k_neighbours - ); - println!( - "Estimated imputation accuracy in terms of mean absolute error: {}", - optimum_mae - ); - println!("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); + // println!("Optimising and/or estimating imputation accuracy."); + // let ( + // optimum_min_loci_corr, + // optimum_max_pool_dist, + // optimum_min_l_loci, + // optimum_min_k_neighbours, + // optimum_mae, + // ) = optimise_params_and_estimate_accuracy( + // &genotypes_and_phenotypes, + // &self_clone, + // min_loci_corr, + // max_pool_dist, + // min_l_loci, + // min_k_neighbours, + // &loci_idx, + // &corr, + // restrict_linked_loci_per_chromosome, + // optimise_n_steps_min_loci_corr, + // optimise_n_steps_max_pool_dist, + // optimise_max_l_loci, + // optimise_max_k_neighbours, + // optimise_n_reps, + // ) + // .expect("Error calling optimise_params_and_estimate_accuracy() in impute_aldknni()."); + // println!("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); + // println!( + // "Minimum loci correlation threshold: {}", + // optimum_min_loci_corr + // ); + // println!( + // "Maximum neighbour distance threshold: {}", + // optimum_max_pool_dist + // ); + // println!("Minimum number of linked loci: {}", optimum_min_l_loci); + // println!( + // "Minimum number of k-nearest neighbours: {}", + // optimum_min_k_neighbours + // ); + // println!( + // "Estimated imputation accuracy in terms of mean absolute error: {}", + // optimum_mae + // ); + // println!("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); + let optimum_min_loci_corr = 0.0; + let optimum_max_pool_dist = 1.0; + let optimum_min_l_loci = 1; + let optimum_min_k_neighbours = 1; let start = std::time::SystemTime::now(); self_clone .adaptive_ld_knn_imputation( From ebe7a10566386b87e83b3fd657f9a1cc827dac28 Mon Sep 17 00:00:00 2001 From: jeffersonfparil Date: Thu, 8 Feb 2024 10:29:55 +1100 Subject: [PATCH 2/8] testing per locus optimisation --- src/rust/src/aldknni.rs | 297 ++++++++++++---------------------------- src/rust/src/optim.rs | 17 ++- 2 files changed, 95 insertions(+), 219 deletions(-) diff --git a/src/rust/src/aldknni.rs b/src/rust/src/aldknni.rs index c2b8bbf..44c859e 100644 --- a/src/rust/src/aldknni.rs +++ b/src/rust/src/aldknni.rs @@ -4,7 +4,7 @@ use std::cmp::Ordering; use std::io; use crate::helpers::*; -use crate::optim::*; + use crate::structs_and_traits::*; pub fn calculate_genomewide_ld( @@ -293,149 +293,26 @@ impl GenotypesAndPhenotypes { } else { *min_k_neighbours as usize }; - Zip::indexed(&mut self.intercept_and_allele_frequencies) - .par_for_each(|(i, j), q| { - if q.is_nan() { - // let current_chromosome = self_clone.chromosome[j].to_owned(); - // // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) - // let (linked_loci_idx, _correlations) = - // find_l_linked_loci(j, corr, min_loci_corr, min_l_loci, - // restrict_linked_loci_per_chromosome, - // ¤t_chromosome, - // &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools - // let distances_all_loci = calculate_genetic_distances_between_pools( - // i, - // &linked_loci_idx, - // &self_clone.intercept_and_allele_frequencies) - // .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) - // let (distances, frequencies) = - // find_k_nearest_neighbours( - // &distances_all_loci, - // max_pool_dist, - // min_k_neighbours, - // j, - // &self_clone.intercept_and_allele_frequencies, - // ) - // .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Impute missing allele frequencies at the current locus - // *q = impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + let vec_min_loci_corr: Vec = (0..=10).rev().map(|x| x as f64 / 10.0).collect(); + let vec_max_pool_dist: Vec = (0..=10).map(|x| x as f64 / 10.0).collect(); + let min_l_loci: usize = 1; + let min_k_neighbours: usize = 1; + let n_reps = 5; - // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // // Testing locus-specific optimisation for l and k - // let current_chromosome = self_clone.chromosome[j].to_owned(); - // let vec_q: ArrayView1 = self_clone.intercept_and_allele_frequencies.column(j); - // let n_non_missing = vec_q.fold(0, |t, &x| if !x.is_nan() {t+1}else{t}); - // let min_loci_corr: f64 = 0.0; - // let max_pool_dist: f64 = 1.0; - // let vec_min_l_loci: Vec = (1..75).collect(); - // let vec_min_k_neighbours: Vec = if n < 75 { - // (1..n).collect() - // } else { - // (1..75).collect() - // }; - // let n_reps = 3; - // let n_reps = if n_reps <= n_non_missing { - // n_reps - // } else { - // n_non_missing - // }; - // let mut rng = rand::thread_rng(); - // let idx_random_pools: Vec = (0..vec_q.len()).filter(|&idx| !vec_q[idx].is_nan()).choose_multiple(&mut rng, n_reps); - // let mut optimum_mae = 1.0; - // let mut optimum_min_l_loci = 0; - // let mut optimum_min_k_neighbours = 0; - // // Across l - // for min_l_loci in vec_min_l_loci.iter() { - // // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) - // let (linked_loci_idx, _correlations) = - // find_l_linked_loci(j, corr, &min_loci_corr, *min_l_loci, - // restrict_linked_loci_per_chromosome, - // ¤t_chromosome, - // &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Across k - // for min_k_neighbours in vec_min_k_neighbours.iter() { - // // Across reps - // let mut mae = 0.0; - // for idx_i in idx_random_pools.iter() { - // // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools - // let distances_all_loci = calculate_genetic_distances_between_pools( - // *idx_i, - // &linked_loci_idx, - // &self_clone.intercept_and_allele_frequencies) - // .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) - // let (distances, frequencies) = - // find_k_nearest_neighbours( - // &distances_all_loci, - // &max_pool_dist, - // *min_k_neighbours, - // j, - // &self_clone.intercept_and_allele_frequencies, - // ) - // .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Impute and find the error - // mae += (vec_q[*idx_i] - impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait.") - // ).abs(); - // } - // mae /= n_reps as f64; - // if (mae <= f64::EPSILON) | (mae > optimum_mae) { - // break; - // } - // if mae < optimum_mae { - // optimum_mae = mae; - // optimum_min_l_loci = *min_l_loci; - // optimum_min_k_neighbours = *min_k_neighbours; - // } - // } - // } - // // Impute actual missing data point (ith pool and jth locus) - // // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) - // let (linked_loci_idx, _correlations) = - // find_l_linked_loci(j, corr, &min_loci_corr, optimum_min_l_loci, - // restrict_linked_loci_per_chromosome, - // ¤t_chromosome, - // &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools - // let distances_all_loci = calculate_genetic_distances_between_pools( - // i, - // &linked_loci_idx, - // &self_clone.intercept_and_allele_frequencies) - // .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) - // let (distances, frequencies) = - // find_k_nearest_neighbours( - // &distances_all_loci, - // &max_pool_dist, - // optimum_min_k_neighbours, - // j, - // &self_clone.intercept_and_allele_frequencies, - // ) - // .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Impute missing allele frequencies at the current locus - // *q = impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // println!("q={:?}; mae={:?}; l={}; k={}", q, optimum_mae, optimum_min_l_loci, optimum_min_k_neighbours); - // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Testing locus-specific optimisation for corr and dist + Zip::indexed(&mut self.intercept_and_allele_frequencies) + .par_for_each(|(i, j), q| { + if q.is_nan() { let current_chromosome = self_clone.chromosome[j].to_owned(); let vec_q: ArrayView1 = self_clone.intercept_and_allele_frequencies.column(j); let n_non_missing = vec_q.fold(0, |t, &x| if !x.is_nan() {t+1}else{t}); - let vec_min_loci_corr: Vec = (0..=10).rev().map(|x| x as f64 / 10.0).collect(); - let vec_max_pool_dist: Vec = (0..=10).map(|x| x as f64 / 10.0).collect(); - let min_l_loci: usize = 1; - let min_k_neighbours: usize = 1; - let n_reps = 5; + // let vec_min_loci_corr: Vec = (0..=10).rev().map(|x| x as f64 / 10.0).collect(); + // let vec_max_pool_dist: Vec = (0..=10).map(|x| x as f64 / 10.0).collect(); + // let min_l_loci: usize = 1; + // let min_k_neighbours: usize = 1; + // let n_reps = 5; let n_reps = if n_reps <= n_non_missing { n_reps } else { @@ -516,11 +393,6 @@ impl GenotypesAndPhenotypes { // Impute missing allele frequencies at the current locus *q = impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); println!("q={:?}; mae={:?}; corr={}; dist={}", q, optimum_mae, optimum_min_loci_corr, optimum_max_pool_dist); - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - } }); // Correct for allele frequency over- and under-flows if we have more than 1 allele representing each locus @@ -554,16 +426,16 @@ impl GenotypesAndPhenotypes { pub fn impute_aldknni( genotypes_and_phenotypes: GenotypesAndPhenotypes, filter_stats: &FilterStats, - min_loci_corr: &f64, - max_pool_dist: &f64, - min_l_loci: &u64, - min_k_neighbours: &u64, + _min_loci_corr: &f64, + _max_pool_dist: &f64, + _min_l_loci: &u64, + _min_k_neighbours: &u64, restrict_linked_loci_per_chromosome: bool, - optimise_n_steps_min_loci_corr: &usize, - optimise_n_steps_max_pool_dist: &usize, - optimise_max_l_loci: &u64, - optimise_max_k_neighbours: &u64, - optimise_n_reps: &usize, + _optimise_n_steps_min_loci_corr: &usize, + _optimise_n_steps_max_pool_dist: &usize, + _optimise_max_l_loci: &u64, + _optimise_max_k_neighbours: &u64, + _optimise_n_reps: &usize, n_threads: &usize, out: &String, ) -> io::Result { @@ -844,70 +716,71 @@ mod tests { "After imputation:\n{:?}", frequencies_and_phenotypes.intercept_and_allele_frequencies ); - let n_nan = frequencies_and_phenotypes - .intercept_and_allele_frequencies - .iter() - .fold(0, |n_nan, &x| if x.is_nan() { n_nan + 1 } else { n_nan }); - println!("n_nan={}", n_nan); - assert_eq!(n_nan, 1_915); // corresponds to the 1_915 alleles completely missing across all pools + assert_eq!(0, 1) + // let n_nan = frequencies_and_phenotypes + // .intercept_and_allele_frequencies + // .iter() + // .fold(0, |n_nan, &x| if x.is_nan() { n_nan + 1 } else { n_nan }); + // println!("n_nan={}", n_nan); + // assert_eq!(n_nan, 1_915); // corresponds to the 1_915 alleles completely missing across all pools - let keep_p_minus_1 = false; - let genotypes_and_phenotypes = file_sync_phen - .into_genotypes_and_phenotypes(&filter_stats, keep_p_minus_1, &n_threads) - .unwrap(); + // let keep_p_minus_1 = false; + // let genotypes_and_phenotypes = file_sync_phen + // .into_genotypes_and_phenotypes(&filter_stats, keep_p_minus_1, &n_threads) + // .unwrap(); - let outname = impute_aldknni( - genotypes_and_phenotypes, - &filter_stats, - &min_loci_corr, - &max_pool_dist, - &min_l_loci, - &min_k_neighbours, - restrict_linked_loci_per_chromosome, - &optimise_n_steps_min_loci_corr, - &optimise_n_steps_max_pool_dist, - &optimise_max_l_loci, - &optimise_max_k_neighbours, - &optimise_n_reps, - &n_threads, - &"test-impute_aldknni.csv".to_owned(), - ) - .unwrap(); - assert_eq!(outname, "test-impute_aldknni.csv".to_owned()); // Do better!!! Load data - thus working on improving load_table() + // let outname = impute_aldknni( + // genotypes_and_phenotypes, + // &filter_stats, + // &min_loci_corr, + // &max_pool_dist, + // &min_l_loci, + // &min_k_neighbours, + // restrict_linked_loci_per_chromosome, + // &optimise_n_steps_min_loci_corr, + // &optimise_n_steps_max_pool_dist, + // &optimise_max_l_loci, + // &optimise_max_k_neighbours, + // &optimise_n_reps, + // &n_threads, + // &"test-impute_aldknni.csv".to_owned(), + // ) + // .unwrap(); + // assert_eq!(outname, "test-impute_aldknni.csv".to_owned()); // Do better!!! Load data - thus working on improving load_table() - println!("frequencies_and_phenotypes.intercept_and_allele_frequencies.slice(s![0..5, 39..42])={:?}", frequencies_and_phenotypes.intercept_and_allele_frequencies.slice(s![0..5, 39..42])); - assert_eq!( - frequencies_and_phenotypes - .intercept_and_allele_frequencies - .slice(s![0..5, 1..3]) - .sum_axis(Axis(1)) - .map(|x| sensible_round(*x, 2)), - Array1::from_elem(5, 1.0) - ); - assert_eq!( - frequencies_and_phenotypes - .intercept_and_allele_frequencies - .slice(s![0..5, 39..42]) - .sum_axis(Axis(1)) - .map(|x| sensible_round(*x, 2)), - Array1::from_elem(5, 1.0) - ); - assert_eq!( - frequencies_and_phenotypes - .intercept_and_allele_frequencies - .slice(s![0..5, 119..121]) - .sum_axis(Axis(1)) - .map(|x| sensible_round(*x, 2)), - Array1::from_elem(5, 1.0) - ); - assert_eq!( - frequencies_and_phenotypes - .intercept_and_allele_frequencies - .slice(s![0..5, 400..402]) - .sum_axis(Axis(1)) - .map(|x| sensible_round(*x, 2)), - Array1::from_elem(5, 1.0) - ); + // println!("frequencies_and_phenotypes.intercept_and_allele_frequencies.slice(s![0..5, 39..42])={:?}", frequencies_and_phenotypes.intercept_and_allele_frequencies.slice(s![0..5, 39..42])); + // assert_eq!( + // frequencies_and_phenotypes + // .intercept_and_allele_frequencies + // .slice(s![0..5, 1..3]) + // .sum_axis(Axis(1)) + // .map(|x| sensible_round(*x, 2)), + // Array1::from_elem(5, 1.0) + // ); + // assert_eq!( + // frequencies_and_phenotypes + // .intercept_and_allele_frequencies + // .slice(s![0..5, 39..42]) + // .sum_axis(Axis(1)) + // .map(|x| sensible_round(*x, 2)), + // Array1::from_elem(5, 1.0) + // ); + // assert_eq!( + // frequencies_and_phenotypes + // .intercept_and_allele_frequencies + // .slice(s![0..5, 119..121]) + // .sum_axis(Axis(1)) + // .map(|x| sensible_round(*x, 2)), + // Array1::from_elem(5, 1.0) + // ); + // assert_eq!( + // frequencies_and_phenotypes + // .intercept_and_allele_frequencies + // .slice(s![0..5, 400..402]) + // .sum_axis(Axis(1)) + // .map(|x| sensible_round(*x, 2)), + // Array1::from_elem(5, 1.0) + // ); // assert_eq!(0, 1); } } diff --git a/src/rust/src/optim.rs b/src/rust/src/optim.rs index ac1de46..7894211 100644 --- a/src/rust/src/optim.rs +++ b/src/rust/src/optim.rs @@ -1,6 +1,6 @@ use rand::prelude::IteratorRandom; -use rand::{Rng,SeedableRng}; use rand::rngs::StdRng; +use rand::{SeedableRng}; use std::io; use crate::structs_and_traits::*; @@ -10,7 +10,7 @@ impl GenotypesAndPhenotypes { &mut self, loci_idx: &Vec, max_sparsity: &f64, - rep: &usize + rep: &usize, ) -> io::Result<(&mut Self, f64, Vec, Vec, Vec>)> { self.check().expect("Error calling check() method within simulate_missing() method for GenotypesAndPhenotypes struct."); let (n, l) = self.coverages.dim(); @@ -389,7 +389,7 @@ pub fn optimise_params_and_estimate_accuracy( vec_min_k_neighbours.len(), ]; // Step across all four parameter spaces twice, where within each parameter space we move forwards and backwards from end to end and from middle to both ends - for ix in vec![0,1,2,3,0,1,2,3].into_iter() { + for ix in vec![0, 1, 2, 3, 0, 1, 2, 3].into_iter() { let mut opt_idx = 0; let ini = 0; let mid = (vec_len_c_d_l_k[ix]) / 2; @@ -397,8 +397,8 @@ pub fn optimise_params_and_estimate_accuracy( let vec_vec_params_idx = vec![ (ini..fin).collect::>(), (ini..fin).rev().collect::>(), - (mid..fin).collect::>(), - (ini..mid).rev().collect::>(), + (mid..fin).collect::>(), + (ini..mid).rev().collect::>(), ]; for iy in 0..vec_vec_params_idx.len() { for l in vec_vec_params_idx[iy].clone().into_iter() { @@ -585,9 +585,12 @@ mod tests { ) .unwrap(); - println!("min_loci_corr={}, max_pool_dist={}, min_l_loci={}, min_k_neighbours={}, mae={}", min_loci_corr, max_pool_dist, min_l_loci, min_k_neighbours, mae); + println!( + "min_loci_corr={}, max_pool_dist={}, min_l_loci={}, min_k_neighbours={}, mae={}", + min_loci_corr, max_pool_dist, min_l_loci, min_k_neighbours, mae + ); - let vec_vec_imputed_mask = frequencies_and_phenotypes + let _vec_vec_imputed_mask = frequencies_and_phenotypes .extract_imputed_mask(&loci_idx, &vec_masked_loci_idx) .unwrap(); // println!("vec_vec_imputed_mask={:?}", vec_vec_imputed_mask); From 551f7cc7c988e109edfd5f0efe0faf5d981a1065 Mon Sep 17 00:00:00 2001 From: jeffersonfparil Date: Thu, 8 Feb 2024 11:30:33 +1100 Subject: [PATCH 3/8] testing per locus optimisation --- res/perf.R | 1 + res/perf_functions.R | 6 +++--- src/rust/src/aldknni.rs | 34 +++++++++++++++++++++++----------- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/res/perf.R b/res/perf.R index 9a1ccfb..3fb3415 100644 --- a/res/perf.R +++ b/res/perf.R @@ -1,6 +1,7 @@ ### Parse Rscript arguments args = commandArgs(trailingOnly=TRUE) # args = c("1", "/group/pasture/Jeff/imputef/res", "/group/pasture/Jeff/imputef/misc", "3", "32") +# args = c("1", "res", "res", "3", "32") i = as.numeric(args[1]) dir_src = args[2] dir_data = args[3] diff --git a/res/perf_functions.R b/res/perf_functions.R index 75cd9bf..720a5bd 100644 --- a/res/perf_functions.R +++ b/res/perf_functions.R @@ -1,8 +1,8 @@ ### Load imputef library -library(imputef) +# library(imputef) # system("conda activate rustenv") -# devtools::load_all() -# rextendr::document() +devtools::load_all() +rextendr::document() ### Extract allele frequencies into a pxn matrix where we have p loci and n entries ### Assumes all loci have a maximum of 2 alleles diff --git a/src/rust/src/aldknni.rs b/src/rust/src/aldknni.rs index 44c859e..212a98d 100644 --- a/src/rust/src/aldknni.rs +++ b/src/rust/src/aldknni.rs @@ -295,24 +295,22 @@ impl GenotypesAndPhenotypes { }; - let vec_min_loci_corr: Vec = (0..=10).rev().map(|x| x as f64 / 10.0).collect(); - let vec_max_pool_dist: Vec = (0..=10).map(|x| x as f64 / 10.0).collect(); + let vec_min_loci_corr: Vec = (0..=20).rev().map(|x| x as f64 / 20.0).collect(); + let vec_max_pool_dist: Vec = (0..=20).map(|x| x as f64 / 20.0).collect(); let min_l_loci: usize = 1; let min_k_neighbours: usize = 1; let n_reps = 5; + // Set 0: u8 as missing + let mut mae_u8: Array2 = Array::from_elem(self.intercept_and_allele_frequencies.dim(), 0); Zip::indexed(&mut self.intercept_and_allele_frequencies) - .par_for_each(|(i, j), q| { + .and(&mut mae_u8) + .par_for_each(|(i, j), q, mu8| { if q.is_nan() { let current_chromosome = self_clone.chromosome[j].to_owned(); let vec_q: ArrayView1 = self_clone.intercept_and_allele_frequencies.column(j); let n_non_missing = vec_q.fold(0, |t, &x| if !x.is_nan() {t+1}else{t}); - // let vec_min_loci_corr: Vec = (0..=10).rev().map(|x| x as f64 / 10.0).collect(); - // let vec_max_pool_dist: Vec = (0..=10).map(|x| x as f64 / 10.0).collect(); - // let min_l_loci: usize = 1; - // let min_k_neighbours: usize = 1; - // let n_reps = 5; let n_reps = if n_reps <= n_non_missing { n_reps } else { @@ -323,7 +321,7 @@ impl GenotypesAndPhenotypes { let mut optimum_mae = 1.0; let mut optimum_min_loci_corr = 0.0; let mut optimum_max_pool_dist = 1.0; - // Across l + // Across minimum loci correlation thresholds for min_loci_corr in vec_min_loci_corr.iter() { // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) let (linked_loci_idx, _correlations) = @@ -331,7 +329,7 @@ impl GenotypesAndPhenotypes { restrict_linked_loci_per_chromosome, ¤t_chromosome, &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // Across k + // Across maximum pool distance thresholds for max_pool_dist in vec_max_pool_dist.iter() { // Across reps let mut mae = 0.0; @@ -367,6 +365,8 @@ impl GenotypesAndPhenotypes { } } } + // We use 254 instead of 255 and add 1 because we set 0 as missing above + *mu8 = (optimum_mae * 254.0).round() as u8 + 1; // Impute actual missing data point (ith pool and jth locus) // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) let (linked_loci_idx, _correlations) = @@ -392,9 +392,21 @@ impl GenotypesAndPhenotypes { .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); // Impute missing allele frequencies at the current locus *q = impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - println!("q={:?}; mae={:?}; corr={}; dist={}", q, optimum_mae, optimum_min_loci_corr, optimum_max_pool_dist); + // println!("q={:?}; mae={:?}; corr={}; dist={}", q, optimum_mae, optimum_min_loci_corr, optimum_max_pool_dist); } }); + // Extract average MAE across loci and pools (Note that we used 0: u8 as missing) + let mut predicted_mae = 0.0; + let mut n_missing = 0.0; + for mu8 in mae_u8.into_iter() { + if mu8 > 0 { + predicted_mae += (mu8 as f64 - 1.0) / 255.0; + n_missing += 1.0; + } + } + predicted_mae = predicted_mae / n_missing; + println!("An over-estimated prediction of mean absolute error = {}", predicted_mae); + // Correct for allele frequency over- and under-flows if we have more than 1 allele representing each locus for j in 0..(loci_idx.len() - 1) { let idx_locus_ini = loci_idx[j]; From fb53a1f7183397b0bf5d242ce4cbe2b6e587d111 Mon Sep 17 00:00:00 2001 From: jeffersonfparil Date: Thu, 8 Feb 2024 16:17:29 +1100 Subject: [PATCH 4/8] drafting new parallel imputation algo --- R/extendr-wrappers.R | 2 +- R/imputef.R | 51 +-- man/aldknni.Rd | 20 +- res/perf.R | 2 +- res/perf.md | 507 +---------------------------- res/perf_functions.R | 93 ++---- res/perf_misc.R | 101 ------ res/perf_misc.slurm | 29 -- res/perf_plot.R | 19 +- src/rust/src/aldknni.rs | 703 +++++++++++++++++++++++++++------------- src/rust/src/lib.rs | 24 +- src/rust/src/mvi.rs | 166 +++++++++- src/rust/src/optim.rs | 599 ---------------------------------- tests/tests.R | 40 +-- 14 files changed, 718 insertions(+), 1638 deletions(-) delete mode 100644 res/perf_misc.R delete mode 100644 res/perf_misc.slurm delete mode 100644 src/rust/src/optim.rs diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index afca21f..a358a12 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -11,7 +11,7 @@ #' @useDynLib imputef, .registration = TRUE NULL -impute <- function(fname, imputation_method, min_coverage, min_allele_frequency, max_missingness_rate_per_locus, pool_sizes, min_depth_below_which_are_missing, max_depth_above_which_are_missing, frac_top_missing_pools, frac_top_missing_loci, min_loci_corr, max_pool_dist, min_l_loci, min_k_neighbours, restrict_linked_loci_per_chromosome, optimise_n_steps_min_loci_corr, optimise_n_steps_max_pool_dist, optimise_max_l_loci, optimise_max_k_neighbours, optimise_n_reps, n_threads, fname_out_prefix) .Call(wrap__impute, fname, imputation_method, min_coverage, min_allele_frequency, max_missingness_rate_per_locus, pool_sizes, min_depth_below_which_are_missing, max_depth_above_which_are_missing, frac_top_missing_pools, frac_top_missing_loci, min_loci_corr, max_pool_dist, min_l_loci, min_k_neighbours, restrict_linked_loci_per_chromosome, optimise_n_steps_min_loci_corr, optimise_n_steps_max_pool_dist, optimise_max_l_loci, optimise_max_k_neighbours, optimise_n_reps, n_threads, fname_out_prefix) +impute <- function(fname, imputation_method, min_coverage, min_allele_frequency, max_missingness_rate_per_locus, pool_sizes, min_depth_below_which_are_missing, max_depth_above_which_are_missing, frac_top_missing_pools, frac_top_missing_loci, n_reps, min_loci_corr, max_pool_dist, min_l_loci, min_k_neighbours, restrict_linked_loci_per_chromosome, n_threads, fname_out_prefix) .Call(wrap__impute, fname, imputation_method, min_coverage, min_allele_frequency, max_missingness_rate_per_locus, pool_sizes, min_depth_below_which_are_missing, max_depth_above_which_are_missing, frac_top_missing_pools, frac_top_missing_loci, n_reps, min_loci_corr, max_pool_dist, min_l_loci, min_k_neighbours, restrict_linked_loci_per_chromosome, n_threads, fname_out_prefix) # nolint end diff --git a/R/imputef.R b/R/imputef.R index eb25c06..fa67b47 100644 --- a/R/imputef.R +++ b/R/imputef.R @@ -86,11 +86,7 @@ mvi = function(fname, min_l_loci=0, min_k_neighbours=0, restrict_linked_loci_per_chromosome=FALSE, - optimise_n_steps_min_loci_corr=0, - optimise_n_steps_max_pool_dist=0, - optimise_max_l_loci=0, - optimise_max_k_neighbours=0, - optimise_n_reps=0, + n_reps=0, n_threads=n_threads, fname_out_prefix=fname_out_prefix) return(out) @@ -115,11 +111,7 @@ mvi = function(fname, #' min_l_loci=10, #' min_k_neighbours=5, #' restrict_linked_loci_per_chromosome=TRUE, -#' optimise_n_steps_min_loci_corr=1, -#' optimise_n_steps_max_pool_dist=1, -#' optimise_max_l_loci=100, -#' optimise_max_k_neighbours=50, -#' optimise_n_reps=1, +#' n_reps=1, #' n_threads=2, #' fname_out_prefix="") #' @param fname @@ -141,25 +133,17 @@ mvi = function(fname, #' @param frac_top_missing_loci #' fraction of loci with the highest number of pools with missing data to be omitted. Set to zero if the input vcf has already been filtered and the loci beyond the depth thresholds have been set to missing, otherwise set to an decimal number between zero and one. [Default=0.0] #' @param min_loci_corr -#' Minimum correlation (Pearson's correlation) between the locus requiring imputation and other loci deemed to be in linkage with it. Ranges from 0.0 to 1.0. [Default=0.9] +#' Minimum correlation (Pearson's correlation) between the locus requiring imputation and other loci deemed to be in linkage with it. Ranges from 0.0 to 1.0. If using the default value with is NA, then this threshold will be optimised to find the best value minimising imputation error. [Default=NA] #' @param max_pool_dist -#' Maximum genetic distance (mean absolute difference in allele frequencies) between the pool or sample requiring imputation and pools or samples deemed to be the closest neighbours. Ranges from 0.0 to 1.0. [Default=0.1] +#' Maximum genetic distance (mean absolute difference in allele frequencies) between the pool or sample requiring imputation and pools or samples deemed to be the closest neighbours. Ranges from 0.0 to 1.0. If using the default value with is NA, then this threshold will be optimised to find the best value minimising imputation error. [Default=NA] #' @param min_l_loci #' Minimum number of linked loci to be used in estimating genetic distances between the pool or sample requiring imputation and other pools or samples. Minimum value of 1. [Default=1] #' @param min_k_neighbours #' Minimum number of k-nearest neighbours of the pool or sample requiring imputation. Minimum value of 1. [Default=1] #' @param restrict_linked_loci_per_chromosome #' Restrict the choice of linked loci to within the chromosome the locus requiring imputation belong to? [Default=TRUE] -#' @param optimise_n_steps_min_loci_corr -#' Number of steps requested for the values of minimum linked loci correlation to be used in optimisation. Note that this is an approximate number of steps because it can be more or less, depending on how even the range of possible values can be divided. If set to the default of 1, then no optimisation will be performed. [Default=1] -#' @param optimise_n_steps_max_pool_dist -#' Number of steps requested for the values of maximum genetic distance to be used in optimisation. Note that this is an approximate number of steps because it can be more or less, depending on how even the range of possible values can be divided. If set to the default of 1, then no optimisation will be performed. [Default=1] -#' @param optimise_max_l_loci -#' Maximum number of linked loci to be tested, if optimising for the best number of linked loci to include in imputation. Minimum value of 2. [Default=100] -#' @param optimise_max_k_neighbours -#' Maximum number of k-nearest neighbours to be tested, if optimising for the best number of nearest neighbours to include in imputation. Minimum value of 2. [Default=50] -#' @param optimise_n_reps -#' Number of replications for the optimisation for the minimum loci correlation, and/or maximum genetic distance, and/or minimum number of linked loci, and/or minimum number of k-nearest neighbours. Minimum value of 1. [Default=1] +#' @param n_reps +#' Number of replications for the optimisation for the minimum loci correlation, and/or maximum genetic distance. Minimum value of 1. [Default=5] #' @param n_threads #' number of computing threads or processor cores to use in the computations. [Default=2] #' @param fname_out_prefix @@ -203,18 +187,21 @@ aldknni = function(fname, max_depth_above_which_are_missing=1000000, frac_top_missing_pools=0.0, frac_top_missing_loci=0.0, - min_loci_corr=0.9, - max_pool_dist=0.1, + min_loci_corr=NA, + max_pool_dist=NA, min_l_loci=10, min_k_neighbours=5, restrict_linked_loci_per_chromosome=TRUE, - optimise_n_steps_min_loci_corr=1, - optimise_n_steps_max_pool_dist=1, - optimise_max_l_loci=100, - optimise_max_k_neighbours=50, - optimise_n_reps=1, + n_reps=5, n_threads=2, fname_out_prefix="") { + ### Handling NA conversion into Rust's f64:NAN + if (is.na(min_loci_corr)) { + min_loci_corr = -1.0 + } + if (is.na(max_pool_dist)) { + max_pool_dist = -1.0 + } out = impute(fname=fname, imputation_method="aLDkNNi", min_coverage=min_coverage, @@ -230,11 +217,7 @@ aldknni = function(fname, min_l_loci=min_l_loci, min_k_neighbours=min_k_neighbours, restrict_linked_loci_per_chromosome=restrict_linked_loci_per_chromosome, - optimise_n_steps_min_loci_corr=optimise_n_steps_min_loci_corr, - optimise_n_steps_max_pool_dist=optimise_n_steps_max_pool_dist, - optimise_max_l_loci=optimise_max_l_loci, - optimise_max_k_neighbours=optimise_max_k_neighbours, - optimise_n_reps=optimise_n_reps, + n_reps=n_reps, n_threads=n_threads, fname_out_prefix=fname_out_prefix) return(out) diff --git a/man/aldknni.Rd b/man/aldknni.Rd index b30fa52..110ce46 100644 --- a/man/aldknni.Rd +++ b/man/aldknni.Rd @@ -18,11 +18,7 @@ aldknni(fname, min_l_loci=10, min_k_neighbours=5, restrict_linked_loci_per_chromosome=TRUE, - optimise_n_steps_min_loci_corr=1, - optimise_n_steps_max_pool_dist=1, - optimise_max_l_loci=100, - optimise_max_k_neighbours=50, - optimise_n_reps=1, + n_reps=1, n_threads=2, fname_out_prefix="") } @@ -45,9 +41,9 @@ aldknni(fname, \item{frac_top_missing_loci}{fraction of loci with the highest number of pools with missing data to be omitted. Set to zero if the input vcf has already been filtered and the loci beyond the depth thresholds have been set to missing, otherwise set to an decimal number between zero and one. \link{Default=0.0}} -\item{min_loci_corr}{Minimum correlation (Pearson's correlation) between the locus requiring imputation and other loci deemed to be in linkage with it. Ranges from 0.0 to 1.0. \link{Default=0.9}} +\item{min_loci_corr}{Minimum correlation (Pearson's correlation) between the locus requiring imputation and other loci deemed to be in linkage with it. Ranges from 0.0 to 1.0. If using the default value with is NA, then this threshold will be optimised to find the best value minimising imputation error. \link{Default=NA}} -\item{max_pool_dist}{Maximum genetic distance (mean absolute difference in allele frequencies) between the pool or sample requiring imputation and pools or samples deemed to be the closest neighbours. Ranges from 0.0 to 1.0. \link{Default=0.1}} +\item{max_pool_dist}{Maximum genetic distance (mean absolute difference in allele frequencies) between the pool or sample requiring imputation and pools or samples deemed to be the closest neighbours. Ranges from 0.0 to 1.0. If using the default value with is NA, then this threshold will be optimised to find the best value minimising imputation error. \link{Default=NA}} \item{min_l_loci}{Minimum number of linked loci to be used in estimating genetic distances between the pool or sample requiring imputation and other pools or samples. Minimum value of 1. \link{Default=1}} @@ -55,15 +51,7 @@ aldknni(fname, \item{restrict_linked_loci_per_chromosome}{Restrict the choice of linked loci to within the chromosome the locus requiring imputation belong to? \link{Default=TRUE}} -\item{optimise_n_steps_min_loci_corr}{Number of steps requested for the values of minimum linked loci correlation to be used in optimisation. Note that this is an approximate number of steps because it can be more or less, depending on how even the range of possible values can be divided. If set to the default of 1, then no optimisation will be performed. \link{Default=1}} - -\item{optimise_n_steps_max_pool_dist}{Number of steps requested for the values of maximum genetic distance to be used in optimisation. Note that this is an approximate number of steps because it can be more or less, depending on how even the range of possible values can be divided. If set to the default of 1, then no optimisation will be performed. \link{Default=1}} - -\item{optimise_max_l_loci}{Maximum number of linked loci to be tested, if optimising for the best number of linked loci to include in imputation. Minimum value of 2. \link{Default=100}} - -\item{optimise_max_k_neighbours}{Maximum number of k-nearest neighbours to be tested, if optimising for the best number of nearest neighbours to include in imputation. Minimum value of 2. \link{Default=50}} - -\item{optimise_n_reps}{Number of replications for the optimisation for the minimum loci correlation, and/or maximum genetic distance, and/or minimum number of linked loci, and/or minimum number of k-nearest neighbours. Minimum value of 1. \link{Default=1}} +\item{n_reps}{Number of replications for the optimisation for the minimum loci correlation, and/or maximum genetic distance. Minimum value of 1. \link{Default=5}} \item{n_threads}{number of computing threads or processor cores to use in the computations. \link{Default=2}} diff --git a/res/perf.R b/res/perf.R index 3fb3415..f693d04 100644 --- a/res/perf.R +++ b/res/perf.R @@ -1,7 +1,7 @@ ### Parse Rscript arguments args = commandArgs(trailingOnly=TRUE) # args = c("1", "/group/pasture/Jeff/imputef/res", "/group/pasture/Jeff/imputef/misc", "3", "32") -# args = c("1", "res", "res", "3", "32") +# args = c("1", "/home/jeff/imputef/res", "/home/jeff/imputef/res", "3", "32") i = as.numeric(args[1]) dir_src = args[2] dir_data = args[3] diff --git a/res/perf.md b/res/perf.md index b8f3020..db83249 100644 --- a/res/perf.md +++ b/res/perf.md @@ -341,7 +341,7 @@ time Rscript perf_plot.R ${DIR} ### After all jobs have finished, move the output and plot: mkdir output mv *-performance_assessment-maf_*-missing_rate_*.csv output/ -time Rscript summary_plot.R \ +time Rscript perf_plot.R \ ${DIR}/output ``` @@ -349,508 +349,7 @@ time Rscript summary_plot.R \ The adaptive LD-kNN imputation algorithm works reasonably well even across the entire range of sparsity levels (0.1% to 90% missing data). -Note that the discrepancy between our imputation algorithm and LinkImpute's algorithm in the context of imputing binary genotypes is attributed to 2 differences: +Note that the discrepancy between our imputation algorithm and LinkImpute's algorithm in the context of imputing binary genotypes is attributed to 3 differences: +- the main one is our optimisation per locus, i.e. per locus with at least one pool missing allele frequencies, we simulate pools to be missing data and perform a pseudo-grid search across combinations of the `min_loci_corr` and `max_pool_dist` threshold values which breaks out of the inner loop (`max_pool_dist`) if estimated MAE is larger than the current lowest MAE, then continues the outer loop to restart the inner looop and so on. - the use of mean weighted allele frequencies in our case and weighted mode genotype class in LinkImpute, and - the use of mean absolute error to measure accuracy in our case and concordance in LinkImpute. - -## Miscellaneous: sensitivity analysis - -Using a single small chromosome from the Lucerne dataset we will explore the entire parameter spaces across sparsity and marker density. We will test all possible combinations of the 4 parameters (minimum loci correlation, maximum pool distance, minimum number of linked loci, and minimum number of k-nearest neighbours). - -We'll start with subsetting `lucerne.vcf` so that we only arbitrarily include *chromome 1* loci (chromsome ID: chr1.4) and *population 2*: - -```shell -DIR=/group/pasture/Jeff/imputef/misc -VCF=${DIR}/lucerne.vcf -cd $DIR -echo "Looking at the number of loci covered per chromosome:" -for i in $(seq 1 8) -do -N=$(grep "chr${i}.4" ${VCF} | wc -l) -echo CHR${i}: ${N} -done -echo "Extracting chromosome 1 loci:" -head -n6 $VCF > ${DIR}/lucerne_chromosome1_population2.vcf.tmp ### header lines -grep -m1 "#CHR" $VCF >> ${DIR}/lucerne_chromosome1_population2.vcf.tmp ### column labels including sample names -grep "chr1.4" ${VCF} | grep -v "^##contig" >> ${DIR}/lucerne_chromosome1_population2.vcf.tmp -echo "Extracting populations 2 samples:" -for i in $(seq 1 7) -do -N=$(grep -m1 "^#CHR" ${DIR}/lucerne_chromosome1_population2.vcf.tmp | sed -z "s/\t/\n/g" | grep "DB-MS-31-22-00${i}" | wc -l) -echo POP_${i}: ${N} -done -IDX=$(echo 1-9,$(grep -m1 "^#CHR" ${DIR}/lucerne_chromosome1_population2.vcf.tmp | sed -z "s/\t/\n/g" | grep -n "DB-MS-31-22-002" | cut -d':' -f1 | sed -z 's/\n/,/g' | sed 's/,$//g')) -cut -f${IDX} ${DIR}/lucerne_chromosome1_population2.vcf.tmp > ${DIR}/lucerne_chromosome1_population2.vcf -rm ${DIR}/lucerne_chromosome1_population2.vcf.tmp -wc -l ${DIR}/lucerne_chromosome1_population2.vcf -bat -l tsv --wrap never ${DIR}/lucerne_chromosome1_population2.vcf -``` - -Now, we will assess imputation accuracy across various combinations of the 4 parameters, as well as across 10 sparsity levels, and 10 marker density levels: - -```shell -DIR=/group/pasture/Jeff/imputef/res -cd $DIR -sbatch perf_misc.slurm - -### Monitoring: -conda activate rustenv -DIR=/group/pasture/Jeff/imputef/res -cd $DIR -squeue -u jp3h | sort -SLURMOUT=slurm-24312170.out -grep -n -i "err" ${SLURMOUT} | grep -v "mean absolute" -wc -l ${DIR}/sensitivity_analysis_output.csv -bat --wrap never ${DIR}/sensitivity_analysis_output.csv -``` - -### Questions we wish to anwer: - -1. Is the 4D parameter space (*min_loci_corr*, *max_pool_dist*, *min_l_loci*, and *min_k_neighbours*) actually smooth and unimodal? -2. How does *marker_density* affect imputation accuracy? -3. How does *marker_density* affect the optimum parameters? -4. Is there a single combination of parameters which yields reasonably high imputation accuracy across marker density and sparsity levels? - -```R -dir = "/group/pasture/Jeff/imputef/res" -setwd(dir) -df = read.csv("sensitivity_analysis_output.csv") -str(df) - -### Evaluate the data -summary(df) -sum(is.na(df)) -sum(is.na(df[, grepl("mae_q", colnames(df))])) -sum(is.na(df[, !grepl("mae_q", colnames(df))])) - -### Linear modelling -mod = lm(mae_frequencies ~ marker_density + sparsity + min_loci_corr + max_pool_dist + min_l_loci + min_k_neighbours, data=df) -summary(mod) -mod_complete = lm(mae_frequencies ~ marker_density * sparsity * min_loci_corr * max_pool_dist * min_l_loci * min_k_neighbours, data=df) -summary(mod_complete) -mod_null = lm(mae_frequencies ~ 1, data=df) -summary(mod_null) - -anova(mod_null, mod, mod_complete) - -txtplot::txtplot(x=df$marker_density, y=df$mae_frequencies, xlab="marker_density", ylab="mae_frequencies") -txtplot::txtplot(x=df$sparsity, y=df$mae_frequencies, xlab="sparsity", ylab="mae_frequencies") -txtplot::txtplot(x=df$min_loci_corr, y=df$mae_frequencies, xlab="min_loci_corr", ylab="mae_frequencies") - -### Random forest -data = data.frame(mae_frequencies=df$mae_frequencies, - marker_density=df$marker_density, - sparsity=df$sparsity, - min_loci_corr=df$min_loci_corr, - max_pool_dist=df$max_pool_dist, - min_l_loci=df$min_l_loci, - min_k_neighbours=df$min_k_neighbours) -# data$marker_density_X_sparsity = as.factor(paste0(data$marker_density, "_X_", data$sparsity)) -data$marker_density_X_sparsity = data$marker_density * data$sparsity - -data$min_loci_corr_X_max_pool_dist = data$min_loci_corr * data$max_pool_dist -data$min_l_loci_X_min_k_neighbours = data$min_l_loci * data$min_k_neighbours - -data$min_loci_corr_X_min_k_neighbours = data$min_loci_corr * data$min_k_neighbours -data$min_l_loci_X_max_pool_dist = data$min_l_loci * data$max_pool_dist - -data$min_loci_corr_X_min_l_loci = data$min_loci_corr * data$min_l_loci -data$max_pool_dist_X_min_k_neighbours = data$max_pool_dist * data$min_k_neighbours - -data$min_loci_corr_X_max_pool_dist_X_min_l_loci_X_min_k_neighbours = data$min_loci_corr * data$max_pool_dist * data$min_l_loci * data$min_k_neighbours - -# rf = partykit::cforest(mae_frequencies ~ ., data=data, trace=TRUE, cores=32) -# summary(rf) -# VARIMP = partykit::varimp(rf, conditional=TRUE, cores=2) ### uses mclapply - hence will be cloning stuff and eating a lot of RAM - therefore reduces the number of cores as required - -# summary(VARIMP) -# str(VARIMP) -# print(VARIMP) -# txtplot::txtplot(VARIMP) - -# ### TESTING partykit::cforest -# # tmp_testing_cforest.R -# # tmp_testing_cforest.slurm -# # DIR=/group/pasture/Jeff/imputef/res -# # cd $DIR -# # cat slurm-24309707.out - -# # rf = randomForest::randomForest(mae_frequencies ~ ., data=train, proximity=TRUE) -# # print(rf) -# # p1 = predict(rf, train) -# # cor(train$mae_frequencies, p1) -# # txtplot::txtplot(x=train$mae_frequencies, y=p1) -# # p2 = predict(rf, test) -# # cor(test$mae_frequencies, p2) -# # txtplot::txtplot(x=test$mae_frequencies, y=p2) - -random_forest_output = randomForest::randomForest(mae_frequencies ~ ., data=data, proximity=TRUE) -cor(data$mae_frequencies, predict(random_forest_output, data)) -str(random_forest_output) - -IMPORTANCE = random_forest_output$importance[order(random_forest_output$importance, decreasing=TRUE), , drop=FALSE] -print(IMPORTANCE) - -print("The choice of the nearest neighbours appears to be the most important variables, i.e. max_pool_dist and min_k_neighbours.") -print("This is followed by the dataset marker density and sparsity combination.") -print("Then by the correlation-by-distance threshold combinations and minimum l-by-k combinations.") - -print("Question 2: How does marker density affect imputation accuracy?") -print("Answer 2: Marker density per se has the least effect on imputation accuracy. - However, the combination of marker density and sparsity has a magnitude greater effect on imputation accuracy than marker density alone.") - -### For each combination of marker density and sparsity levels ... -### assess imputation accuracy modality across the 4 parameter spaces, and ... -### identify the optimum parameter combination. -vec_dataset_combinations = sort(unique(paste0(df$marker_density, "_X_", df$sparsity))) -for (dataset in vec_dataset_combinations) { - # dataset = vec_dataset_combinations[1] - print("#######################################################") - print(dataset) - marker_density = as.numeric(unlist(strsplit(dataset, "_X_"))[1]) - sparsity = as.numeric(unlist(strsplit(dataset, "_X_"))[2]) - subdf = df[(df$marker_density==marker_density) & (df$sparsity==sparsity), ] - # txtplot::txtplot(x=df$min_loci_corr, y=df$mae_frequencies, xlab="min_loci_corr", ylab="mae") - # txtplot::txtplot(x=df$max_pool_dist, y=df$mae_frequencies, xlab="max_pool_dist", ylab="mae") - # txtplot::txtplot(x=df$min_l_loci, y=df$mae_frequencies, xlab="min_l_loci", ylab="mae") - # txtplot::txtplot(x=df$min_k_neighbours, y=df$mae_frequencies, xlab="min_k_neighbours", ylab="mae") - - ### Describe the parameter spaces - ### Are the trends in MAE of a parameter the same across levels of all other parameters? Not likely! - vec_min_loci_corr = sort(unique(subdf$min_loci_corr)) - vec_max_pool_dist = sort(unique(subdf$max_pool_dist)) - vec_min_l_loci = sort(unique(subdf$min_l_loci)) - vec_min_k_neighbours = sort(unique(subdf$min_k_neighbours)) - for (min_loci_corr in vec_min_loci_corr) { - for (max_pool_dist in vec_max_pool_dist) { - for (min_l_loci in vec_min_l_loci) { - # min_loci_corr=0.7; max_pool_dist=0.8; min_l_loci=3 - idx = which((subdf$min_loci_corr==min_loci_corr) & (subdf$max_pool_dist==max_pool_dist) & (subdf$min_l_loci==min_l_loci)) - if (length(idx)==0) { - next - } - dat = subdf[idx, ] - # txtplot::txtplot(x=dat$min_k_neighbours, y=dat$mae_frequencies, xlab="min_k_neighbours", ylab="MAE") - agg = aggregate(mae_frequencies ~ min_k_neighbours, data=dat, FUN=mean) - } - } - } - - ### Despite these, maybe the current output is too granular, and maybe as we get more info it all smooths out and the parameter spaces become more or less smooth and unimodal? - - - ### Find the very best parameter combination at the current dataset - agg = aggregate(mae_frequencies ~ min_loci_corr + max_pool_dist + min_l_loci + min_k_neighbours, data=subdf, FUN=mean) - agg = agg[order(agg$mae_frequencies, decreasing=FALSE), ] - optim = data.frame(marker_density=marker_density, sparsity=sparsity, agg[1, ]) - if (dataset == vec_dataset_combinations[1]) { - df_best_params = optim - } else { - df_best_params = rbind(df_best_params, optim) - } - agg_min_loci_corr = aggregate(mae_frequencies ~ min_loci_corr, data=subdf, FUN=mean) - txtplot::txtplot(x=agg_min_loci_corr$min_loci_corr, y=agg_min_loci_corr$mae_frequencies, xlab="min_loci_corr", ylab="mae") - agg_max_pool_dist = aggregate(mae_frequencies ~ max_pool_dist, data=subdf, FUN=mean) - txtplot::txtplot(x=agg_max_pool_dist$max_pool_dist, y=agg_max_pool_dist$mae_frequencies, xlab="max_pool_dist", ylab="mae") - agg_min_l_loci = aggregate(mae_frequencies ~ min_l_loci, data=subdf, FUN=mean) - txtplot::txtplot(x=agg_min_l_loci$min_l_loci, y=agg_min_l_loci$mae_frequencies, xlab="min_l_loci", ylab="mae") - agg_min_k_neighbours = aggregate(mae_frequencies ~ min_k_neighbours, data=subdf, FUN=mean) - txtplot::txtplot(x=agg_min_k_neighbours$min_k_neighbours, y=agg_min_k_neighbours$mae_frequencies, xlab="min_k_neighbours", ylab="mae") -} - -print("Question 1: Is MAE smooth and/or unimodal per parameter space?") -print("Answer 1: No. The parameter spaces are neither smooth nor unimodal. The parameter spaces are complex with some local minima far from the global minimum.") - -print(df_best_params) - -txtplot::txtplot(x=df_best_params$marker_density, y=df_best_params$mae_frequencies, xlab="marker_density", ylab="mae") -agg_marker_density = aggregate(mae_frequencies ~ marker_density, data=df_best_params, FUN=mean) -print(agg_marker_density[order(agg_marker_density$mae_frequencies, decreasing=FALSE), ]) - -print("Question 3: How does marker density affect the optimum parameter combinations?") -print("Answer 3: On average using the best imputation accuracy per marker-density-by-sparsity dataset, higher marker density improves imputation accuracy.") - -txtplot::txtdensity(df_best_params$min_loci_corr, xlab="min_loci_corr") -txtplot::txtdensity(df_best_params$max_pool_dist, xlab="max_pool_dist") -txtplot::txtdensity(df_best_params$min_l_loci, xlab="min_l_loci") -txtplot::txtdensity(df_best_params$min_k_neighbours, xlab="min_k_neighbours") - -mean_optim_params = as.data.frame(t(apply(df_best_params, MAR=2, FUN=mean))) -print(mean_optim_params) -pred_optim_min_loci_corr = round(mean_optim_params$min_loci_corr, 1) -pred_optim_max_pool_dist = round(mean_optim_params$max_pool_dist, 1) - -vec_min_l_loci = sort(unique(df$min_l_loci)) -delta_min_l_loci = abs(vec_min_l_loci - round(mean_optim_params$min_l_loci)) -pred_optim_min_l_loci = vec_min_l_loci[which(delta_min_l_loci == min(delta_min_l_loci))] - -vec_min_k_neighbours = sort(unique(df$min_k_neighbours)) -delta_min_k_neighbours = abs(vec_min_k_neighbours - round(mean_optim_params$min_k_neighbours)) -pred_optim_min_k_neighbours = vec_min_k_neighbours[which(delta_min_k_neighbours == min(delta_min_k_neighbours))] - -idx = which((df$min_loci_corr == pred_optim_min_loci_corr) & (df$max_pool_dist == pred_optim_max_pool_dist) & (df$min_l_loci == pred_optim_min_l_loci) & (df$min_k_neighbours == pred_optim_min_k_neighbours)) - -df[idx, ] - -print("Question 4: Is there a single combination of parameters which performs reasonably well across datasets?") -print("Answer 4: We are not certain yet as we are only capturing a single dataset for which the predicted optimum parameter combination is present. - However, given the complexity of the parameter spaces, this is unlikely. Again, blame the no-free-lunch theorem. - But wait! Some unimodality in min_loci_corr is starting to show centered on 0.5, and bimodalities in the rest, i.e., - max_pool_dist near both extremes, min_l_loci at <10 and ~17, and min_k_neighbours at ~5 and ~17. These are very curiosome indeed!") - -``` - -### Preliminary results (20240201) --> being addressed with the most recent changes/merge which should improve optimisation where we move forwards then backwards across the parameter spaces (2024/02/02) - -*Answers to [question](#questions-we-wish-to-anwer), possible consequences and follow-up questions:* - -1. No. The parameter spaces are neither smooth nor unimodal (*Figure 1*). The parameter spaces are complex with some local minima far from the global minimum. This means we have to improve our optimisation algorithm in [`optim.rs`](../src/rust/src/optim.rs). Also, note that the choice of the nearest neighbours (`max_pool_dist_X_min_k_neighbours`) appears to be the most important set of variables in terms imputation accuracy (*Table 1*). Will this mean that we can set `min_loci_corr` to some low value and likewise `min_l_loci` to an arbitrary value that is not too high as low minimum correlation should capture sufficiently numerous loci for genetic distance estimation; and then just optimise for `min_k_neighbours` and `max_pool_dist`? -2. Marker density per se has the least effect on imputation accuracy (*Table 1*). However, the combination of marker density and sparsity has a magnitude greater effect on imputation accuracy than marker density alone. But what does this mean? Are they antagonistic, i.e. low sparsity but high density means better accuracy and vice-versa as expected? What is the accuracy in the middle, i.e. medium sparsity and medium density? How about when both are high or both are low? -3. On average using the best imputation accuracy per marker-density-by-sparsity dataset, higher marker density improves imputation accuracy (*Table 2*). -4. We are not certain yet as we are only capturing a single dataset for which the predicted optimum parameter combination is present. However, given the complexity of the parameter spaces, this is unlikely. Again, blame the no-free-lunch theorem. For the current findings, please see *Table 3* for the mean optimum parameter values across datasets (marker-density-by-sparsity combinations). But wait, look at *Figure 2* and notice some unimodality in `min_loci_corr` centered on 0.5, and bimodalities in the rest, i.e., `max_pool_dist` near both extremes, `min_l_loci` at <10 and ~17, and `min_k_neighbours` at ~5 and ~17. These are very curiosome indeed! - - -*Table 1*. Variable importance based on random forest regression - -| Variable | Importance | -| :-------------------------------------------------------------- | ----------: | -| max_pool_dist_X_min_k_neighbours | 0.058613063 | -| min_k_neighbours | 0.038643309 | -| max_pool_dist | 0.035782306 | -| min_loci_corr_X_min_k_neighbours | 0.023709804 | -| marker_density_X_sparsity | 0.016431124 | -| min_loci_corr_X_max_pool_dist | 0.014462863 | -| min_l_loci_X_min_k_neighbours | 0.007870009 | -| min_loci_corr | 0.007014149 | -| min_loci_corr_X_min_l_loci | 0.006842891 | -| min_l_loci_X_max_pool_dist | 0.006793837 | -| sparsity | 0.004732896 | -| min_loci_corr_X_max_pool_dist_X_min_l_loci_X_min_k_neighbours | 0.004610137 | -| min_l_loci | 0.004595450 | -| marker_density | 0.001893720 | - - -*Table 2*. Effect of marker density on imputation accuracy (measured by mean absolute error, MAE) - -| marker_density | mae_frequencies | -| :------------: | --------------: | -| 1.0 | 0.06718490 | -| 0.8 | 0.06766566 | -| 0.2 | 0.06786764 | -| 0.6 | 0.06796051 | -| 0.4 | 0.06828012 | - - -*Table 3*. Mean optimum parameter values across datasets, i.e. marker-density-by-sparsity combinations - -| Variable | Optimum value | -| :---------------- | ------------: | -| marker_density | 0.4 | -| sparsity | 0.3 | -| min_loci_corr | 0.4 | -| max_pool_dist | 0.4 | -| min_l_loci | 17 | -| min_k_neighbours | 17 | - - -*Figure 1*. Representative parameter spaces across datasets (marker-density-by-sparsity combinations) - -``` - +-+---------+---------+---------+---------+--------+--+ | +-+---------+---------+---------+---------+---------+--+ - 0.071 + * + | 0.074 + * + - | | | | * | - | | | | | - 0.0705 + + | 0.073 + + - | * * | | | * | -m | * | | m | | -a 0.07 + * + | a | | -e | * * | | e 0.072 + * + - | * * | | | * * | - 0.0695 + * + | | * | - | | | 0.071 + * + - | * | | | * * * | - 0.069 +-+---------+---------+---------+---------+--------+--+ | +-+---------+---------+---------+---------+---------+--+ - 0 0.2 0.4 0.6 0.8 1 | 0 0.2 0.4 0.6 0.8 1 - min_loci_corr | min_loci_corr - +-+---------+---------+---------+---------+---------+--+ | +-+---------+---------+---------+---------+---------+--+ - | * * | | | * | - 0.073 + + | 0.076 + + - | | | | | - 0.072 + + | 0.075 + * + - | | | | | -m | | | m 0.074 + + -a 0.071 + + | a | | -e | | | e 0.073 + + - 0.07 + + | | | - | * * * * * * * | | 0.072 + + - 0.069 + * + | | * * * * * | - | * | | 0.071 + * * * * + - +-+---------+---------+---------+---------+---------+--+ | +-+---------+---------+---------+---------+---------+--+ - 0 0.2 0.4 0.6 0.8 1 | 0 0.2 0.4 0.6 0.8 1 - max_pool_dist | max_pool_dist - ++---------+---------+---------+---------+---------+--+ | ++---------+---------+---------+---------+---------+--+ - 0.071 + * + | | * | - | | | | * | - 0.0705 + + | 0.0725 + + - | * | | | | - | * * * | | | | -m 0.07 + + | m | * | -a | * * * | | a 0.072 + * * * + -e 0.0695 + + | e | * | - | * | | | * | - | | | 0.0715 + + - 0.069 + + | | | - | * | | | * * | - 0.0685 ++---------+---------+---------+---------+---------+--+ | ++---------+---------+---------+---------+---------+--+ - 0 10 20 30 40 50 | 0 10 20 30 40 50 - min_l_loci | min_l_loci - 0.0725 ++---------+---------+---------+---------+---------+--+ | ++----------+---------+---------+---------+---------+--+ - | * | | 0.075 + * + - 0.072 + + | | | - | | | | | - 0.0715 + + | 0.074 + + - | | | | | -m 0.071 + + | m 0.073 + + -a | * | | a | | -e 0.0705 + + | e | * | - | | | 0.072 + ** + - 0.07 + * * + | | * | - 0.0695 + ** * + | 0.071 + * * + - | * * * | | | * * * | - ++---------+---------+---------+---------+---------+--+ | ++----------+---------+---------+---------+---------+--+ - 0 10 20 30 40 50 | 0 10 20 30 40 50 - min_k_neighbours | min_k_neighbours -``` - -*Figure 2*. Distribution of the best parameters across datasets (marker-density-by-sparsity combinations) - -``` - +--+---------+----------+----------+---------+----------+--+ - | ******* | - | *** ** | -1.5 + *** *** + - | ** ** | - | ** *** | - | ** ** | - 1 + *** *** + - | ** *** | - | ** *** | -0.5 + ** ***** + - | *** ****** | - | ********* | - +--+---------+----------+----------+---------+----------+--+ - 0 0.2 0.4 0.6 0.8 1 - min_loci_corr - +--+---------+----------+----------+---------+----------+--+ - | ******* | - | ** ** | -1.5 + ** ** + - | ** | - | ** | - | ** | - 1 + ** + - | *** | - | *** | - | *** ***** | -0.5 + ***** ******* ** + - | ****************** | - +--+---------+----------+----------+---------+----------+--+ - 0 0.2 0.4 0.6 0.8 1 - max_pool_dist - +-+---------+----------+----------+----------+---------+--+ -0.15 + ** + - | *** | - | * | - | * | - 0.1 + * + - | ** | - | * | - | ** | -0.05 + * + - | ** * | - | * **** **** | - 0 + ********* ********* *********************** + - +-+---------+----------+----------+----------+---------+--+ - 0 10 20 30 40 50 - min_l_loci - +-+---------+----------+----------+----------+---------+--+ -0.04 + ***** + - | ** ** **** | - | * ** ** * | -0.03 + ** *** * + - | ***** ** | - | ** | -0.02 + ** + - | *** | - | ***** | -0.01 + **** + - | **** | - | ****************** | - +-+---------+----------+----------+----------+---------+--+ - 0 10 20 30 40 50 - min_k_neighbours -``` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/res/perf_functions.R b/res/perf_functions.R index 720a5bd..8a59492 100644 --- a/res/perf_functions.R +++ b/res/perf_functions.R @@ -1,8 +1,10 @@ ### Load imputef library -# library(imputef) +# dir_src = "/group/pasture/Jeff/imputef/res" +library(imputef) # system("conda activate rustenv") -devtools::load_all() -rextendr::document() +# devtools::load_all() +# rextendr::document() + ### Extract allele frequencies into a pxn matrix where we have p loci and n entries ### Assumes all loci have a maximum of 2 alleles @@ -324,66 +326,24 @@ fn_test_imputation = function(vcf, mat_genotypes, mat_idx_high_conf_data, ploidy fname_out_prefix=paste0("MVI-maf", maf, "-missing_rate", missing_rate, "-", rand_number_id), n_threads=n_threads) duration_mvi = difftime(Sys.time(), time_ini, units="mins") - ### (2) Adaptive LD-kNN imputation using fixed min_loci_corr, max_pool_dist, min_l_loci, and min_k_neighbours + ### (2) Adaptive LD-kNN imputation using fixed min_loci_corr and max_pool_dist time_ini = Sys.time() fname_out_aldknni_fixed = aldknni(fname=list_sim_missing$fname_vcf, fname_out_prefix=paste0("AFIXED-maf", maf, "-missing_rate", missing_rate, "-", rand_number_id), min_loci_corr=0.9, max_pool_dist=0.1, - min_l_loci=10, - min_k_neighbours=5, - optimise_n_steps_min_loci_corr=1, - optimise_n_steps_max_pool_dist=1, - optimise_max_l_loci=1, - optimise_max_k_neighbours=1, restrict_linked_loci_per_chromosome=restrict_linked_loci_per_chromosome, n_threads=n_threads) duration_aldknni_fixed = difftime(Sys.time(), time_ini, units="mins") - ### (3) Adaptive LD-kNN imputation with optimisation for min_loci_corr, and max_pool_dist + ### (3) Adaptive LD-kNN imputation using optimised min_loci_corr and max_pool_dist time_ini = Sys.time() - fname_out_aldknni_optim_cd = aldknni(fname=list_sim_missing$fname_vcf, - fname_out_prefix=paste0("AOPTIMCD-maf", maf, "-missing_rate", missing_rate, "-", rand_number_id), - min_loci_corr=0.0, - max_pool_dist=1.0, - min_l_loci=1, - min_k_neighbours=1, - optimise_n_steps_min_loci_corr=10, - optimise_n_steps_max_pool_dist=10, - optimise_max_l_loci=1, - optimise_max_k_neighbours=1, + fname_out_aldknni_optim = aldknni(fname=list_sim_missing$fname_vcf, + fname_out_prefix=paste0("AOPTIM-maf", maf, "-missing_rate", missing_rate, "-", rand_number_id), + min_loci_corr=NA, + max_pool_dist=NA, restrict_linked_loci_per_chromosome=restrict_linked_loci_per_chromosome, n_threads=n_threads) - duration_aldknni_optim_cd = difftime(Sys.time(), time_ini, units="mins") - ### (4) Adaptive LD-kNN imputation with optimisation for min_l_loci, and min_k_neighbours - time_ini = Sys.time() - fname_out_aldknni_optim_lk = aldknni(fname=list_sim_missing$fname_vcf, - fname_out_prefix=paste0("AOPTIMLK-maf", maf, "-missing_rate", missing_rate, "-", rand_number_id), - min_loci_corr=0.0, - max_pool_dist=1.0, - min_l_loci=1, - min_k_neighbours=1, - optimise_n_steps_min_loci_corr=1, - optimise_n_steps_max_pool_dist=1, - optimise_max_l_loci=100, - optimise_max_k_neighbours=100, - restrict_linked_loci_per_chromosome=restrict_linked_loci_per_chromosome, - n_threads=n_threads) - duration_aldknni_optim_lk = difftime(Sys.time(), time_ini, units="mins") - ### (5) Adaptive LD-kNN imputation with optimisation for min_l_loci, min_k_neighbours, min_l_loci, and min_k_neighbours - time_ini = Sys.time() - fname_out_aldknni_optim_all = aldknni(fname=list_sim_missing$fname_vcf, - fname_out_prefix=paste0("AOPTIMAL-maf", maf, "-missing_rate", missing_rate, "-", rand_number_id), - min_loci_corr=0.0, - max_pool_dist=1.0, - min_l_loci=1, - min_k_neighbours=1, - optimise_n_steps_min_loci_corr=10, - optimise_n_steps_max_pool_dist=10, - optimise_max_l_loci=100, - optimise_max_k_neighbours=100, - restrict_linked_loci_per_chromosome=restrict_linked_loci_per_chromosome, - n_threads=n_threads) - duration_aldknni_optim_all = difftime(Sys.time(), time_ini, units="mins") + duration_aldknni_optim = difftime(Sys.time(), time_ini, units="mins") ### LinkImpute's LD-kNN imputation algorithm for unordered genotype data (forcing all data to be diploids) fname_for_linkimpute = paste0("LINKIMPUTE_INPUT-maf", maf, "-missing_rate", missing_rate, "-", rand_number_id,".tsv") output_for_linkimpute = paste0("LINKIMPUTE_INPUT-maf", maf, "-missing_rate", missing_rate, "-", rand_number_id,"-IMPUTED.tsv") @@ -397,7 +357,7 @@ fn_test_imputation = function(vcf, mat_genotypes, mat_idx_high_conf_data, ploidy mat_genotypes_for_linkimpute[is.na(mat_genotypes_for_linkimpute)] = -1 write.table(mat_genotypes_for_linkimpute, file=fname_for_linkimpute, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE) time_ini = Sys.time() - system(paste0("java -jar linkimpute/LinkImpute.jar --verbose -a ", fname_for_linkimpute, " ", output_for_linkimpute)) + system(paste0("java -jar ", dir_src, "/linkimpute/LinkImpute.jar --verbose -a ", fname_for_linkimpute, " ", output_for_linkimpute)) duration_linkimpute = difftime(Sys.time(), time_ini, units="mins") mat_linkimputed = read.delim(output_for_linkimpute, header=FALSE) rownames(mat_linkimputed) = rownames(mat_genotypes_for_linkimpute) @@ -425,19 +385,7 @@ fn_test_imputation = function(vcf, mat_genotypes, mat_idx_high_conf_data, ploidy ploidy=ploidy, strict_boundaries=strict_boundaries, n_threads=n_threads) - metrics_aldknni_optim_cd = fn_imputation_accuracy(fname_imputed=fname_out_aldknni_optim_cd, - list_sim_missing=list_sim_missing, - mat_idx_high_conf_data=mat_idx_high_conf_data, - ploidy=ploidy, - strict_boundaries=strict_boundaries, - n_threads=n_threads) - metrics_aldknni_optim_lk = fn_imputation_accuracy(fname_imputed=fname_out_aldknni_optim_lk, - list_sim_missing=list_sim_missing, - mat_idx_high_conf_data=mat_idx_high_conf_data, - ploidy=ploidy, - strict_boundaries=strict_boundaries, - n_threads=n_threads) - metrics_aldknni_optim_all = fn_imputation_accuracy(fname_imputed=fname_out_aldknni_optim_all, + metrics_aldknni_optim = fn_imputation_accuracy(fname_imputed=fname_out_aldknni_optim, list_sim_missing=list_sim_missing, mat_idx_high_conf_data=mat_idx_high_conf_data, ploidy=ploidy, @@ -450,9 +398,10 @@ fn_test_imputation = function(vcf, mat_genotypes, mat_idx_high_conf_data, ploidy ploidy=2, strict_boundaries=strict_boundaries, n_threads=n_threads) + # print(paste0("metrics_mvi$concordance_classes=", round(metrics_mvi$concordance_classes, 4), "; metrics_aldknni_fixed$concordance_classes=", round(metrics_aldknni_fixed$concordance_classes, 4), "; metrics_aldknni_optim$concordance_classes=", round(metrics_aldknni_optim$concordance_classes, 4), "; metrics_linkimpute$concordance_classes=", round(metrics_linkimpute$concordance_classes, 4))) } else { - df_metrics_across_allele_freqs_frequencies = metrics_aldknni_optim_all$df_metrics_across_allele_freqs_frequencies - df_metrics_across_allele_freqs_classes = metrics_aldknni_optim_all$df_metrics_across_allele_freqs_classes + df_metrics_across_allele_freqs_frequencies = metrics_aldknni_optim$df_metrics_across_allele_freqs_frequencies + df_metrics_across_allele_freqs_classes = metrics_aldknni_optim$df_metrics_across_allele_freqs_classes df_metrics_across_allele_freqs_frequencies[,2:7] = NA df_metrics_across_allele_freqs_classes[,2:7] = NA metrics_linkimpute = list( @@ -479,7 +428,7 @@ fn_test_imputation = function(vcf, mat_genotypes, mat_idx_high_conf_data, ploidy duration_linkimpute = NA } ### Merge imputation accuracy metrics into the output data.frame - string_metric_lists = c("metrics_mvi", "metrics_aldknni_fixed", "metrics_aldknni_optim_cd", "metrics_aldknni_optim_lk", "metrics_aldknni_optim_all", "metrics_linkimpute") + string_metric_lists = c("metrics_mvi", "metrics_aldknni_fixed", "metrics_aldknni_optim", "metrics_linkimpute") for (m in string_metric_lists) { # m = string_metric_lists[1] algorithm = gsub("metrics_", "", m) @@ -534,12 +483,10 @@ fn_test_imputation = function(vcf, mat_genotypes, mat_idx_high_conf_data, ploidy system(paste0("rm ", list_sim_missing$fname_vcf)) system(paste0("rm ", gsub("-IMPUTED.csv$", "*", fname_out_mvi))) system(paste0("rm ", gsub("-IMPUTED.csv$", "*", fname_out_aldknni_fixed))) - system(paste0("rm ", gsub("-IMPUTED.csv$", "*", fname_out_aldknni_optim_all))) - system(paste0("rm ", gsub("-IMPUTED.csv$", "*", fname_out_aldknni_optim_cd))) - system(paste0("rm ", gsub("-IMPUTED.csv$", "*", fname_out_aldknni_optim_lk))) + system(paste0("rm ", gsub("-IMPUTED.csv$", "*", fname_out_aldknni_optim))) if (bool_enough_data_to_simulate_10k_missing == TRUE) { system(paste0("rm ", gsub("-IMPUTED.csv$", "*", fname_out_linkimpute))) } ### Output return(df_out) -} \ No newline at end of file +} diff --git a/res/perf_misc.R b/res/perf_misc.R deleted file mode 100644 index 79b333c..0000000 --- a/res/perf_misc.R +++ /dev/null @@ -1,101 +0,0 @@ -args = commandArgs(trailingOnly=TRUE) -dir_src = args[1] -fname_vcf = args[2] -n_reps = as.numeric(args[3]) -ploidy = as.numeric(args[4]) -strict_boundaries = as.logical(args[5]) -n_threads = as.numeric(args[6]) -fname_csv_out = args[7] -# dir_src = "/group/pasture/Jeff/imputef/res" -# fname_vcf = "/group/pasture/Jeff/imputef/misc/lucerne_chromosome1_population2.vcf" -# n_reps = 3 -# ploidy = 4 -# strict_boundaries = FALSE -# n_threads = 32 -# fname_csv_out = "/group/pasture/Jeff/imputef/res/sensitivity_analysis_output.csv" -### Load functions -source(paste0(dir_src, "/perf_functions.R")) -### Load genotype -vcf = vcfR::read.vcfR(fname_vcf) -list_genotypes = fn_extract_allele_frequencies(vcf) -mat_genotypes = list_genotypes$mat_genotypes -### Filter by 5% minor allele frequency: -maf = 0.05 -mean_allele_freqs = rowMeans(mat_genotypes, na.rm=TRUE) -idx = which((mean_allele_freqs>=maf) & ((1-mean_allele_freqs)>=maf)) -vcf = vcf[idx, , ] -mat_genotypes = mat_genotypes[idx, ] -### Define variable combinations -df_variables = expand.grid(marker_density = seq(from=0.2, to=1.0, by=0.2), - sparsity = c(0.01, seq(from=0.1, to=0.6, by=0.1)), - min_loci_corr = seq(from=0.0, to=1.0, by=0.1), - max_pool_dist = seq(from=0.0, to=1.0, by=0.1), - min_l_loci = c(1:5, round(seq(from=6, to=50, length=5))), - min_k_neighbours = c(1:5, round(seq(from=6, to=50, length=5)))) -idx_sort_rand = sample(1:nrow(df_variables), size=nrow(df_variables), replace=FALSE) -df_variables = df_variables[idx_sort_rand, ] -# df_variables = df_variables[1:10, ] -### Write header -header = c("rep", - "marker_density", - "sparsity", - "min_loci_corr", - "max_pool_dist", - "min_l_loci", - "min_k_neighbours", - "frac_imputed", - "mae_frequencies", - "rmse_frequencies", - "r2_frequencies", - "mae_classes", - "rmse_classes", - "r2_classes", - "concordance_classes", - paste0("mae_q", seq(0,1,by=0.1)) - ) -cat(paste0(paste(header, collapse=","), "\n"), file=fname_csv_out) -### Sensitivity analysis across variable combinations -pb = txtProgressBar(min=0, max=n_reps*nrow(df_variables), style=3) -for (i in 1:nrow(df_variables)) { - for (r in 1:n_reps) { - list_subsamp = fn_simulate_marker_density_reduction(vcf=vcf, mat_genotypes=mat_genotypes, reduction_rate=df_variables$marker_density[i]) - list_sim_missing = fn_simulate_missing_data(vcf=list_subsamp$vcf, - mat_genotypes=list_subsamp$mat_genotypes, - maf=maf, - missing_rate=df_variables$sparsity[i]) - fname_out = aldknni(fname=list_sim_missing$fname_vcf, - min_loci_corr=df_variables$min_loci_corr[i], - max_pool_dist=df_variables$max_pool_dist[i], - min_l_loci=df_variables$min_l_loci[i], - min_k_neighbours=df_variables$min_k_neighbours[i], - optimise_n_steps_min_loci_corr=1, - optimise_n_steps_max_pool_dist=1, - optimise_max_l_loci=1, - optimise_max_k_neighbours=1, - n_threads=n_threads) - metrics_out = fn_imputation_accuracy(fname_imputed=fname_out, - list_sim_missing=list_sim_missing, - mat_idx_high_conf_data=!is.na(list_subsamp$mat_genotypes), - ploidy=ploidy, - strict_boundaries=strict_boundaries, - n_threads=n_threads) - vec_metric_names = names(metrics_out)[(!grepl("df_", names(metrics_out))) & (!grepl("highConf_", names(metrics_out)))] - vec_variables_and_metrics = c(r, - df_variables$marker_density[i], - df_variables$sparsity[i], - df_variables$min_loci_corr[i], - df_variables$max_pool_dist[i], - df_variables$min_l_loci[i], - df_variables$min_k_neighbours[i], - eval(parse(text=paste0("c(", paste(paste0("metrics_out$", vec_metric_names), collapse=","), ")"))), - metrics_out$df_metrics_across_allele_freqs_frequencies$mae - ) - metrics_string = paste0(paste(vec_variables_and_metrics, collapse=","), "\n") - cat(metrics_string, file=fname_csv_out, append=TRUE) - system(paste0("rm ", list_sim_missing$fname_vcf)) - system(paste0("rm ", fname_out)) - system(paste0("rm ", gsub("vcf", "vcf-*sync", list_sim_missing$fname_vcf))) - setTxtProgressBar(pb, r*i) - } -} -close(pb) diff --git a/res/perf_misc.slurm b/res/perf_misc.slurm deleted file mode 100644 index f57c883..0000000 --- a/res/perf_misc.slurm +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -#SBATCH --job-name="sensimp" -#SBATCH --account="dbiopast2" -#SBATCH --ntasks=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=100G -#SBATCH --time=14-0:0:00 -### Load the conda environment -module load Miniconda3/22.11.1-1 -conda init bash -source ~/.bashrc -conda activate rustenv -DIR='/group/pasture/Jeff/imputef/res' -DIR_DATA='/group/pasture/Jeff/imputef/misc' -NREPS=3 -PLOIDY=4 -STRICTBOUNDARIES=FALSE -cd $DIR -time \ -Rscript perf_misc.R \ - ${DIR} \ - ${DIR_DATA}/lucerne_chromosome1_population2.vcf \ - ${NREPS} \ - ${PLOIDY} \ - ${STRICTBOUNDARIES} \ - ${SLURM_CPUS_PER_TASK} \ - ${DIR}/sensitivity_analysis_output.csv -### Execute: -# sbatch perf_misc.slurm diff --git a/res/perf_plot.R b/res/perf_plot.R index e997922..81d347e 100644 --- a/res/perf_plot.R +++ b/res/perf_plot.R @@ -11,17 +11,13 @@ plot_metrics = function(df, dataset, vec_2_metrics=c("mae_frequencies", "concord df$algorithm = as.character(df$algorithm) df$algorithm[df$algorithm=="mvi"] = "a" df$algorithm[df$algorithm=="aldknni_fixed"] = "b" - df$algorithm[df$algorithm=="aldknni_optim_cd"] = "c" - df$algorithm[df$algorithm=="aldknni_optim_lk"] = "d" - df$algorithm[df$algorithm=="aldknni_optim_all"] = "e" - df$algorithm[df$algorithm=="linkimpute"] = "f" + df$algorithm[df$algorithm=="aldknni_optim"] = "c" + df$algorithm[df$algorithm=="linkimpute"] = "d" df$algorithm = as.factor(df$algorithm) levels(df$algorithm)[levels(df$algorithm)=="a"] = "MVI" levels(df$algorithm)[levels(df$algorithm)=="b"] = "AFIXED" - levels(df$algorithm)[levels(df$algorithm)=="c"] = "AOPTIMCD" - levels(df$algorithm)[levels(df$algorithm)=="d"] = "AOPTIMLK" - levels(df$algorithm)[levels(df$algorithm)=="e"] = "AOPTIMAL" - levels(df$algorithm)[levels(df$algorithm)=="f"] = "LINKIMPUTE" + levels(df$algorithm)[levels(df$algorithm)=="c"] = "AOPTIM" + levels(df$algorithm)[levels(df$algorithm)=="d"] = "LINKIMPUTE" # agg_concordance = aggregate(concordance_classes ~ algorithm + maf + missing_rate, data=df, FUN=mean) agg_concordance = aggregate(concordance_classes ~ algorithm, data=df, FUN=mean) @@ -37,11 +33,10 @@ plot_metrics = function(df, dataset, vec_2_metrics=c("mae_frequencies", "concord print(paste(vec_algorithm, collapse=" | ")) vec_maf = sort(unique(df$maf)) vec_missing_rate = sort(unique(df$missing_rate)) - vec_colours = c("#88CCEE", "#44AA99", "#117733", "#CC6677", "#DDCC77", "#AA4499") + vec_colours = c("#b2df8a", "#33a02c", "#1f78b4", "#a6cee3") vec_colours = rep(vec_colours, times=ceiling(length(vec_algorithm)/length(vec_colours)))[1:length(vec_algorithm)] vec_fnames_svg = c() - # n_plots = 2*length(vec_maf) for (i in 1:length(vec_2_metrics)) { # i = 1 metric = vec_2_metrics[i] @@ -54,8 +49,6 @@ plot_metrics = function(df, dataset, vec_2_metrics=c("mae_frequencies", "concord fname_svg = paste0(dataset, "-", gsub(" ", "_", metric_label), ".svg") vec_fnames_svg = c(vec_fnames_svg, fname_svg) svg(fname_svg, width=15, height=7) - # mat_layout_base = matrix(1:n_plots, nrow=(n_plots/2), byrow=TRUE) - # layout(cbind(mat_layout_base[,1], mat_layout_base[,1], mat_layout_base)) layout(matrix(c(1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6, 7, 8, 9, 10), byrow=TRUE, ncol=6)) @@ -98,7 +91,7 @@ plot_metrics = function(df, dataset, vec_2_metrics=c("mae_frequencies", "concord text((bplot-0.25), text_pos, labels=round(mat_mu,4), cex=0.9, srt=90, pos=4) par(xpd=FALSE) ### Line plot - plot(x=range(agg_mu[,2], na.rm=TRUE), y=range(agg_mu[,3]+(2*agg_sd[,3]), na.rm=TRUE), + plot(x=range(agg_mu[,2], na.rm=TRUE), y=range(c(agg_mu[,3]-(2*agg_sd[,3]), agg_mu[,3]+(2*agg_sd[,3])), na.rm=TRUE), main=paste0("maf = ", maf), xlab="Sparsity (missing/total)", ylab=metric_label, las=1, type="n") grid() for (algo in vec_algorithm) { diff --git a/src/rust/src/aldknni.rs b/src/rust/src/aldknni.rs index 212a98d..7895cb2 100644 --- a/src/rust/src/aldknni.rs +++ b/src/rust/src/aldknni.rs @@ -2,6 +2,7 @@ use ndarray::{prelude::*, Zip}; use rand::prelude::IteratorRandom; use std::cmp::Ordering; use std::io; +use std::sync::{Arc, Mutex}; use crate::helpers::*; @@ -239,10 +240,8 @@ fn find_k_nearest_neighbours( fn impute_allele_frequencies( frequencies: &Array1, - // frequencies: &Array2, distances: &Vec, ) -> io::Result { - // ) -> io::Result> { let n = frequencies.len(); for i in 0..n { assert!( @@ -272,182 +271,483 @@ impl GenotypesAndPhenotypes { pub fn adaptive_ld_knn_imputation( &mut self, self_clone: &GenotypesAndPhenotypes, - _min_loci_corr: &f64, - _max_pool_dist: &f64, + min_loci_corr: &f64, + max_pool_dist: &f64, min_l_loci: &u64, min_k_neighbours: &u64, loci_idx: &Vec, corr: &Vec>, restrict_linked_loci_per_chromosome: bool, - ) -> io::Result<&mut Self> { + n_reps: &usize, + ) -> io::Result<(&mut Self, f64)> { self.check().expect("Error self.check() within adaptive_ld_knn_imputation() method of GenotypesAndPhenotypes struct."); // We are assuming that all non-zero alleles across pools are kept, i.e. biallelic loci have 2 columns, triallelic have 3, and so on. + // - Input vcf file will have all alleles per locus extracted. + // - Similarly, input sync file will have all alleles per locus extracted. + // - Finally, input allele frequency table file which can be represented by all alleles or one less allele per locus will be appended with the alternative allele if the sum of allele frequencies per locus do not add up to one. + // Define fixed linkage and distance thresholds, i.e. the minimum number of loci to use in estimating genetic distances, and the minimum number of nearest neighbours to include in imputation let (n, p) = self.intercept_and_allele_frequencies.dim(); - let _min_l_loci = if *min_l_loci >= (p as u64) { + let min_l_loci = if *min_l_loci >= (p as u64) { p - 1 } else { *min_l_loci as usize }; - let _min_k_neighbours = if *min_k_neighbours >= (n as u64) { + let min_k_neighbours = if *min_k_neighbours >= (n as u64) { n - 1 } else { *min_k_neighbours as usize }; + // Define the range of minimum loci correlation and maximum pool distance thresholds to be used for optimisation. + // However, if the user-supplied values are non-missing then that user-supplied fixed threshold will be used without optimisation + let vec_min_loci_corr: Vec = if min_loci_corr.is_nan() { + (0..=20).rev().map(|x| x as f64 / 20.0).collect() + } else { + vec![*min_loci_corr] + }; + let vec_max_pool_dist: Vec = if max_pool_dist.is_nan() { + (0..=20).map(|x| x as f64 / 20.0).collect() + } else { + vec![*max_pool_dist] + }; + // // Define the 2D array used for storing the minimum mean absolute error (MAE) estimates across all missing data across pools and loci. + // // We encode the MAE as u8 for memory efficiency, where we set 0: u8 as missing. + // let mut mae_u8: Array2 = + // Array::from_elem(self.intercept_and_allele_frequencies.dim(), 0); + // // Optimise for the minimum loci correlation and maximum pool distance which minimises MAE per locus, and use these optimum threshold to impute missing allele frequency across pools and loci + // Zip::indexed(&mut self.intercept_and_allele_frequencies) + // .and(&mut mae_u8) + // .par_for_each(|(i, j), q, mu8| { + // if q.is_nan() { + // let current_chromosome = self_clone.chromosome[j].to_owned(); + // let vec_q: ArrayView1 = self_clone.intercept_and_allele_frequencies.column(j); + // let n_non_missing = vec_q.fold(0, |t, &x| if !x.is_nan() {t+1}else{t}); + // let n_reps = if *n_reps <= n_non_missing { + // *n_reps + // } else { + // n_non_missing + // }; + // let mut rng = rand::thread_rng(); + // let idx_random_pools: Vec = (0..vec_q.len()).filter(|&idx| !vec_q[idx].is_nan()).choose_multiple(&mut rng, n_reps); + // let mut optimum_mae = 1.0; + // let mut optimum_min_loci_corr = 0.0; + // let mut optimum_max_pool_dist = 1.0; + // // Across minimum loci correlation thresholds + // for min_loci_corr in vec_min_loci_corr.iter() { + // // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) + // let (linked_loci_idx, _correlations) = + // find_l_linked_loci(j, corr, min_loci_corr, min_l_loci, + // restrict_linked_loci_per_chromosome, + // ¤t_chromosome, + // &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Across maximum pool distance thresholds + // for max_pool_dist in vec_max_pool_dist.iter() { + // // Across reps + // let mut mae = 0.0; + // for idx_i in idx_random_pools.iter() { + // // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools + // let distances_all_loci = calculate_genetic_distances_between_pools( + // *idx_i, + // &linked_loci_idx, + // &self_clone.intercept_and_allele_frequencies) + // .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) + // let (distances, frequencies) = + // find_k_nearest_neighbours( + // &distances_all_loci, + // max_pool_dist, + // min_k_neighbours, + // j, + // &self_clone.intercept_and_allele_frequencies, + // ) + // .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Impute and find the error + // mae += (vec_q[*idx_i] - impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait.") + // ).abs(); + // } + // mae /= n_reps as f64; + // if (mae <= f64::EPSILON) | (mae > optimum_mae) { + // break; + // } + // if mae < optimum_mae { + // optimum_mae = mae; + // optimum_min_loci_corr = *min_loci_corr; + // optimum_max_pool_dist = *max_pool_dist; + // } + // } + // } + // // Take note of the minimum MAE which we code are u8 for memory efficiency + // // We use 254 instead of 255 and add 1 because we set 0 as missing above + // *mu8 = (optimum_mae * 254.0).round() as u8 + 1; + // // Impute actual missing data point (ith pool and jth locus) + // // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) + // let (linked_loci_idx, _correlations) = + // find_l_linked_loci(j, corr, &optimum_min_loci_corr, min_l_loci, + // restrict_linked_loci_per_chromosome, + // ¤t_chromosome, + // &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools + // let distances_all_loci = calculate_genetic_distances_between_pools( + // i, + // &linked_loci_idx, + // &self_clone.intercept_and_allele_frequencies) + // .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) + // let (distances, frequencies) = + // find_k_nearest_neighbours( + // &distances_all_loci, + // &optimum_max_pool_dist, + // min_k_neighbours, + // j, + // &self_clone.intercept_and_allele_frequencies, + // ) + // .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Impute missing allele frequencies at the current locus + // *q = impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // println!("q={:?}; mae={:?}; corr={}; dist={}", q, optimum_mae, optimum_min_loci_corr, optimum_max_pool_dist); + // } + // }); - let vec_min_loci_corr: Vec = (0..=20).rev().map(|x| x as f64 / 20.0).collect(); - let vec_max_pool_dist: Vec = (0..=20).map(|x| x as f64 / 20.0).collect(); - let min_l_loci: usize = 1; - let min_k_neighbours: usize = 1; - let n_reps = 5; - // Set 0: u8 as missing - let mut mae_u8: Array2 = Array::from_elem(self.intercept_and_allele_frequencies.dim(), 0); - - Zip::indexed(&mut self.intercept_and_allele_frequencies) - .and(&mut mae_u8) - .par_for_each(|(i, j), q, mu8| { - if q.is_nan() { - let current_chromosome = self_clone.chromosome[j].to_owned(); - let vec_q: ArrayView1 = self_clone.intercept_and_allele_frequencies.column(j); - let n_non_missing = vec_q.fold(0, |t, &x| if !x.is_nan() {t+1}else{t}); - let n_reps = if n_reps <= n_non_missing { - n_reps - } else { - n_non_missing - }; - let mut rng = rand::thread_rng(); - let idx_random_pools: Vec = (0..vec_q.len()).filter(|&idx| !vec_q[idx].is_nan()).choose_multiple(&mut rng, n_reps); - let mut optimum_mae = 1.0; - let mut optimum_min_loci_corr = 0.0; - let mut optimum_max_pool_dist = 1.0; - // Across minimum loci correlation thresholds - for min_loci_corr in vec_min_loci_corr.iter() { - // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) - let (linked_loci_idx, _correlations) = - find_l_linked_loci(j, corr, min_loci_corr, min_l_loci, - restrict_linked_loci_per_chromosome, - ¤t_chromosome, - &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // Across maximum pool distance thresholds - for max_pool_dist in vec_max_pool_dist.iter() { - // Across reps - let mut mae = 0.0; - for idx_i in idx_random_pools.iter() { - // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools - let distances_all_loci = calculate_genetic_distances_between_pools( - *idx_i, - &linked_loci_idx, - &self_clone.intercept_and_allele_frequencies) - .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) - let (distances, frequencies) = - find_k_nearest_neighbours( - &distances_all_loci, - max_pool_dist, - min_k_neighbours, - j, - &self_clone.intercept_and_allele_frequencies, - ) - .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // Impute and find the error - mae += (vec_q[*idx_i] - impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait.") - ).abs(); - } - mae /= n_reps as f64; - if (mae <= f64::EPSILON) | (mae > optimum_mae) { - break; - } - if mae < optimum_mae { - optimum_mae = mae; - optimum_min_loci_corr = *min_loci_corr; - optimum_max_pool_dist = *max_pool_dist; - } - } - } - // We use 254 instead of 255 and add 1 because we set 0 as missing above - *mu8 = (optimum_mae * 254.0).round() as u8 + 1; - // Impute actual missing data point (ith pool and jth locus) - // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) - let (linked_loci_idx, _correlations) = - find_l_linked_loci(j, corr, &optimum_min_loci_corr, min_l_loci, - restrict_linked_loci_per_chromosome, - ¤t_chromosome, - &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools - let distances_all_loci = calculate_genetic_distances_between_pools( - i, - &linked_loci_idx, - &self_clone.intercept_and_allele_frequencies) - .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) - let (distances, frequencies) = - find_k_nearest_neighbours( - &distances_all_loci, - &optimum_max_pool_dist, - min_k_neighbours, - j, - &self_clone.intercept_and_allele_frequencies, - ) - .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // Impute missing allele frequencies at the current locus - *q = impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // println!("q={:?}; mae={:?}; corr={}; dist={}", q, optimum_mae, optimum_min_loci_corr, optimum_max_pool_dist); - } - }); - // Extract average MAE across loci and pools (Note that we used 0: u8 as missing) - let mut predicted_mae = 0.0; - let mut n_missing = 0.0; - for mu8 in mae_u8.into_iter() { - if mu8 > 0 { - predicted_mae += (mu8 as f64 - 1.0) / 255.0; - n_missing += 1.0; - } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// Testing a different way of parallelisation + + + // let p = 10_013; + let mut n_chunks = 10; // Should be equal to the number of threads because std::thread::scope will wait for other threads to finish before starting with another thread once it finishes + let chunk_size = p / n_chunks; + let vec_idx_ini: Vec = (0..(p-chunk_size)).step_by(chunk_size).collect(); + let mut vec_idx_fin: Vec = (chunk_size..p).step_by(chunk_size).collect(); + let idx_fin = vec_idx_fin[vec_idx_fin.len()-1]; + if idx_fin != p { + vec_idx_fin.pop(); + vec_idx_fin.push(p); } - predicted_mae = predicted_mae / n_missing; - println!("An over-estimated prediction of mean absolute error = {}", predicted_mae); - - // Correct for allele frequency over- and under-flows if we have more than 1 allele representing each locus - for j in 0..(loci_idx.len() - 1) { - let idx_locus_ini = loci_idx[j]; - let idx_locus_fin = loci_idx[j + 1]; - for i in 0..n { - if self_clone.intercept_and_allele_frequencies[(i, idx_locus_ini)].is_nan() { - let sum = self - .intercept_and_allele_frequencies - .slice(s![i, idx_locus_ini..idx_locus_fin]) - .sum(); - if sum != 1.0 { - if (idx_locus_fin - idx_locus_ini) == 2 { - self.intercept_and_allele_frequencies[(i, idx_locus_ini + 1)] = - 1.00 - self.intercept_and_allele_frequencies[(i, idx_locus_ini)] + assert_eq!(vec_idx_ini.len(), vec_idx_fin.len()); + n_chunks = vec_idx_ini.len(); + // Instantiate thread object for parallel execution + let mut thread_objects = Vec::new(); + // Vector holding all returns from pileup2sync_chunk() + let thread_ouputs_freq: Arc>> = + Arc::new(Mutex::new(Vec::new())); // Mutated within each thread worker + // Mutated within each thread worker + let thread_ouputs_cnts: Arc>> = Arc::new(Mutex::new(Vec::new())); + for i in 0..n_chunks { + let idx_ini = vec_idx_ini[i]; + let idx_fin = vec_idx_fin[i]; + let thread = std::thread::scope(|scope| { + + + let mut allele_frequencies: Array2 = self_clone.intercept_and_allele_frequencies.slice(s![.., idx_ini..idx_fin]).to_owned(); + + // let mut vec_mae: Vec = vec![]; + + // for j in idx_ini..idx_fin { + // let current_chromosome = self_clone.chromosome[j].to_owned(); + // let vec_q: ArrayView1 = self_clone.intercept_and_allele_frequencies.column(j); + // let vec_idx_missing = (0..vec_q.len()).filter(|&i| !vec_q[i].is_nan()).collect::>(); + // if vec_idx_missing.len() > 0 { + // let n_non_missing = vec_idx_missing.len(); + // let n_reps = if *n_reps <= n_non_missing { + // *n_reps + // } else { + // n_non_missing + // }; + // let mut rng = rand::thread_rng(); + // let idx_random_pools: Vec = (0..vec_q.len()).filter(|&idx| !vec_q[idx].is_nan()).choose_multiple(&mut rng, n_reps); + // let mut optimum_mae = 1.0; + // let mut optimum_min_loci_corr = 0.0; + // let mut optimum_max_pool_dist = 1.0; + // for i in vec_idx_missing.into_iter() { + // // Across minimum loci correlation thresholds + // for min_loci_corr in vec_min_loci_corr.iter() { + // // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) + // let (linked_loci_idx, _correlations) = + // find_l_linked_loci(j, corr, min_loci_corr, min_l_loci, + // restrict_linked_loci_per_chromosome, + // ¤t_chromosome, + // &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Across maximum pool distance thresholds + // for max_pool_dist in vec_max_pool_dist.iter() { + // // Across reps + // let mut mae = 0.0; + // for idx_i in idx_random_pools.iter() { + // // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools + // let distances_all_loci = calculate_genetic_distances_between_pools( + // *idx_i, + // &linked_loci_idx, + // &self_clone.intercept_and_allele_frequencies) + // .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) + // let (distances, frequencies) = + // find_k_nearest_neighbours( + // &distances_all_loci, + // max_pool_dist, + // min_k_neighbours, + // j, + // &self_clone.intercept_and_allele_frequencies, + // ) + // .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Impute and find the error + // mae += (vec_q[*idx_i] - impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait.") + // ).abs(); + // } + // mae /= n_reps as f64; + // if (mae <= f64::EPSILON) | (mae > optimum_mae) { + // break; + // } + // if mae < optimum_mae { + // optimum_mae = mae; + // optimum_min_loci_corr = *min_loci_corr; + // optimum_max_pool_dist = *max_pool_dist; + // } + // } + // } + // // Take note of the minimum MAE + // vec_mae.push(optimum_mae); + // // Impute actual missing data point (ith pool and jth locus) + // // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) + // let (linked_loci_idx, _correlations) = + // find_l_linked_loci(j, corr, &optimum_min_loci_corr, min_l_loci, + // restrict_linked_loci_per_chromosome, + // ¤t_chromosome, + // &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools + // let distances_all_loci = calculate_genetic_distances_between_pools( + // i, + // &linked_loci_idx, + // &self_clone.intercept_and_allele_frequencies) + // .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) + // let (distances, frequencies) = + // find_k_nearest_neighbours( + // &distances_all_loci, + // &optimum_max_pool_dist, + // min_k_neighbours, + // j, + // &self_clone.intercept_and_allele_frequencies, + // ) + // .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // Impute missing allele frequencies at the current locus + // allele_frequencies[(i, j-idx_ini)] = impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // // println!("q={:?}; mae={:?}; corr={}; dist={}", q, optimum_mae, optimum_min_loci_corr, optimum_max_pool_dist); + // } + // } + // } + + // println!("allele_frequencies={:?}", allele_frequencies); + // println!("vec_mae[0..10].to_owned()={:?}", vec_mae[0..10].to_owned()); + + // Define the 2D array used for storing the minimum mean absolute error (MAE) estimates across all missing data across pools and loci. + // We encode the MAE as u8 for memory efficiency, where we set 0: u8 as missing. + let mut mae_u8: Array2 = Array::from_elem(allele_frequencies.dim(), 0); + + Zip::indexed(&mut allele_frequencies) + .and(&mut mae_u8) + .par_for_each(|(i, j), q, mu8| { + if q.is_nan() { + let current_chromosome = self_clone.chromosome[j].to_owned(); + let vec_q: ArrayView1 = self_clone.intercept_and_allele_frequencies.column(j); + let n_non_missing = vec_q.fold(0, |t, &x| if !x.is_nan() {t+1}else{t}); + let n_reps = if *n_reps <= n_non_missing { + *n_reps } else { - for k in idx_locus_ini..idx_locus_fin { - self.intercept_and_allele_frequencies[(i, k)] /= sum; + n_non_missing + }; + let mut rng = rand::thread_rng(); + let idx_random_pools: Vec = (0..vec_q.len()).filter(|&idx| !vec_q[idx].is_nan()).choose_multiple(&mut rng, n_reps); + let mut optimum_mae = 1.0; + let mut optimum_min_loci_corr = 0.0; + let mut optimum_max_pool_dist = 1.0; + // Across minimum loci correlation thresholds + for min_loci_corr in vec_min_loci_corr.iter() { + // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) + let (linked_loci_idx, _correlations) = + find_l_linked_loci(j, corr, min_loci_corr, min_l_loci, + restrict_linked_loci_per_chromosome, + ¤t_chromosome, + &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // Across maximum pool distance thresholds + for max_pool_dist in vec_max_pool_dist.iter() { + // Across reps + let mut mae = 0.0; + for idx_i in idx_random_pools.iter() { + // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools + let distances_all_loci = calculate_genetic_distances_between_pools( + *idx_i, + &linked_loci_idx, + &self_clone.intercept_and_allele_frequencies) + .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) + let (distances, frequencies) = + find_k_nearest_neighbours( + &distances_all_loci, + max_pool_dist, + min_k_neighbours, + j, + &self_clone.intercept_and_allele_frequencies, + ) + .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // Impute and find the error + mae += (vec_q[*idx_i] - impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait.") + ).abs(); + } + mae /= n_reps as f64; + if (mae <= f64::EPSILON) | (mae > optimum_mae) { + break; + } + if mae < optimum_mae { + optimum_mae = mae; + optimum_min_loci_corr = *min_loci_corr; + optimum_max_pool_dist = *max_pool_dist; + } } } + // Take note of the minimum MAE which we code are u8 for memory efficiency + // We use 254 instead of 255 and add 1 because we set 0 as missing above + *mu8 = (optimum_mae * 254.0).round() as u8 + 1; + // Impute actual missing data point (ith pool and jth locus) + // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) + let (linked_loci_idx, _correlations) = + find_l_linked_loci(j, corr, &optimum_min_loci_corr, min_l_loci, + restrict_linked_loci_per_chromosome, + ¤t_chromosome, + &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools + let distances_all_loci = calculate_genetic_distances_between_pools( + i, + &linked_loci_idx, + &self_clone.intercept_and_allele_frequencies) + .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) + let (distances, frequencies) = + find_k_nearest_neighbours( + &distances_all_loci, + &optimum_max_pool_dist, + min_k_neighbours, + j, + &self_clone.intercept_and_allele_frequencies, + ) + .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // Impute missing allele frequencies at the current locus + *q = impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // println!("q={:?}; mae={:?}; corr={}; dist={}", q, optimum_mae, optimum_min_loci_corr, optimum_max_pool_dist); } - self.coverages[(i, j)] = f64::INFINITY; - } - } + }); + + println!("allele_frequencies={:?}", allele_frequencies); + + + // Write-out intermediate file + // let vec_chr: Vec = self_clone.chromosome[idx_ini..idx_fin].to_owned(); + // let vec_pos: Vec = self_clone.position[idx_ini..idx_fin].to_owned(); + // let vec_all: Vec = self_clone.allele[idx_ini..idx_fin].to_owned(); + let fname_intermediate_file: String = "intermediate_output-".to_owned() + &idx_ini.to_string() + &idx_fin.to_string() + ".csv"; + + + let output = GenotypesAndPhenotypes{ + chromosome: self_clone.chromosome[idx_ini..idx_fin].to_owned(), + position: self_clone.position[idx_ini..idx_fin].to_owned(), + allele: self_clone.allele[idx_ini..idx_fin].to_owned(), + intercept_and_allele_frequencies: allele_frequencies, + phenotypes: self_clone.phenotypes.clone(), + pool_names: self_clone.pool_names.clone(), + coverages: self_clone.coverages.slice(s![.., 0..10]).to_owned(), + }; + let dummy_filter_stats = FilterStats { + remove_ns: false, + min_quality: 0.0, + min_coverage: 0, + min_allele_frequency: 0.0, + max_missingness_rate: 0.0, + pool_sizes: vec![0.0], + }; + let dummy_keep_p_minus_1 = false; + let dummy_n_threads = 1; + output.write_csv(&dummy_filter_stats, dummy_keep_p_minus_1, &fname_intermediate_file, &dummy_n_threads).expect("Error: failed to save intermediate file."); + + + + }); + thread_objects.push(thread); } - Ok(self) + // // Waiting for all threads to finish + // for thread in thread_objects { + // thread.join().expect("Unknown thread error occured."); + // } + // // Extract output filenames from each thread into a vector and sort them + // let mut freq: Vec = Vec::new(); + // let mut cnts: Vec = Vec::new(); + // for x in thread_ouputs_freq.lock().expect("Error unlocking the threads after multi-threaded execution to extract allele frequencies within the load() method for FileSyncPhen struct.").iter() { + // freq.push(x.clone()); + // } + // for x in thread_ouputs_cnts.lock().expect("Error unlocking the threads after multi-threaded execution to extract allele counts within the load() method for FileSyncPhen struct.").iter() { + // cnts.push(x.clone()); + // } + + + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + + + + // Extract average minimum MAE across loci and pools (Note that we used 0: u8 as missing) + // let mut predicted_mae = 0.0; + // let mut n_missing = 0.0; + // for mu8 in mae_u8.into_iter() { + // if mu8 > 0 { + // predicted_mae += (mu8 as f64 - 1.0) / 255.0; + // n_missing += 1.0; + // } + // } + // predicted_mae = predicted_mae / n_missing; + // // println!( + // // "An over-estimated prediction of mean absolute error = {}", + // // predicted_mae + // // ); + // // Correct for allele frequency over- and under-flows, as we are assuming all loci are represented by all of its alleles (see assumptions above) + // for j in 0..(loci_idx.len() - 1) { + // let idx_locus_ini = loci_idx[j]; + // let idx_locus_fin = loci_idx[j + 1]; + // for i in 0..n { + // if self_clone.intercept_and_allele_frequencies[(i, idx_locus_ini)].is_nan() { + // let sum = self + // .intercept_and_allele_frequencies + // .slice(s![i, idx_locus_ini..idx_locus_fin]) + // .sum(); + // if sum != 1.0 { + // if (idx_locus_fin - idx_locus_ini) == 2 { + // self.intercept_and_allele_frequencies[(i, idx_locus_ini + 1)] = + // 1.00 - self.intercept_and_allele_frequencies[(i, idx_locus_ini)] + // } else { + // for k in idx_locus_ini..idx_locus_fin { + // self.intercept_and_allele_frequencies[(i, k)] /= sum; + // } + // } + // } + // self.coverages[(i, j)] = f64::INFINITY; + // } + // } + // } + // Ok((self, predicted_mae)) + Ok((self, 0.0)) } } pub fn impute_aldknni( genotypes_and_phenotypes: GenotypesAndPhenotypes, filter_stats: &FilterStats, - _min_loci_corr: &f64, - _max_pool_dist: &f64, - _min_l_loci: &u64, - _min_k_neighbours: &u64, + min_loci_corr: &f64, + max_pool_dist: &f64, + min_l_loci: &u64, + min_k_neighbours: &u64, restrict_linked_loci_per_chromosome: bool, - _optimise_n_steps_min_loci_corr: &usize, - _optimise_n_steps_max_pool_dist: &usize, - _optimise_max_l_loci: &u64, - _optimise_max_k_neighbours: &u64, - _optimise_n_reps: &usize, + n_reps: &usize, n_threads: &usize, out: &String, ) -> io::Result { @@ -467,68 +767,33 @@ pub fn impute_aldknni( restrict_linked_loci_per_chromosome, ) .expect("Error estimating pairwise linkage between loci across the entire genome."); - // println!("Optimising and/or estimating imputation accuracy."); - // let ( - // optimum_min_loci_corr, - // optimum_max_pool_dist, - // optimum_min_l_loci, - // optimum_min_k_neighbours, - // optimum_mae, - // ) = optimise_params_and_estimate_accuracy( - // &genotypes_and_phenotypes, - // &self_clone, - // min_loci_corr, - // max_pool_dist, - // min_l_loci, - // min_k_neighbours, - // &loci_idx, - // &corr, - // restrict_linked_loci_per_chromosome, - // optimise_n_steps_min_loci_corr, - // optimise_n_steps_max_pool_dist, - // optimise_max_l_loci, - // optimise_max_k_neighbours, - // optimise_n_reps, - // ) - // .expect("Error calling optimise_params_and_estimate_accuracy() in impute_aldknni()."); - // println!("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); - // println!( - // "Minimum loci correlation threshold: {}", - // optimum_min_loci_corr - // ); - // println!( - // "Maximum neighbour distance threshold: {}", - // optimum_max_pool_dist - // ); - // println!("Minimum number of linked loci: {}", optimum_min_l_loci); - // println!( - // "Minimum number of k-nearest neighbours: {}", - // optimum_min_k_neighbours - // ); - // println!( - // "Estimated imputation accuracy in terms of mean absolute error: {}", - // optimum_mae - // ); - // println!("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); - let optimum_min_loci_corr = 0.0; - let optimum_max_pool_dist = 1.0; - let optimum_min_l_loci = 1; - let optimum_min_k_neighbours = 1; + if min_loci_corr.is_nan() | max_pool_dist.is_nan() { + println!("Optimising and estimating imputation accuracy."); + } else { + println!("Estimating imputation accuracy."); + } let start = std::time::SystemTime::now(); - self_clone + let (_self, mae) = self_clone .adaptive_ld_knn_imputation( &genotypes_and_phenotypes, - &optimum_min_loci_corr, - &optimum_max_pool_dist, - &optimum_min_l_loci, - &optimum_min_k_neighbours, + min_loci_corr, + max_pool_dist, + min_l_loci, + min_k_neighbours, &loci_idx, &corr, restrict_linked_loci_per_chromosome, + n_reps, ) .expect("Error calling adaptive_ld_knn_imputation() within impute_aldknni()."); let end = std::time::SystemTime::now(); let duration = end.duration_since(start).expect("Error measuring the duration of running adaptive_ld_knn_imputation() within impute_aldknni()."); + println!("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); + println!( + "Expected imputation accuracy in terms of mean absolute error: {}", + mae + ); + println!("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); println!( "Adaptive LD-kNN imputation: {} pools x {} loci | Missingness: {}% | Duration: {} seconds", self_clone.coverages.nrows(), @@ -694,40 +959,34 @@ mod tests { println!("imputed_freq={:?}", imputed_freq); assert_eq!(imputed_freq, 0.2249532554929301); - let _frac_top_missing_pools = 0.0; - let _frac_top_missing_loci = 0.2; - let min_loci_corr = 0.9; - let max_pool_dist = 0.1; - let min_l_loci = 10; - let min_k_neighbours = 10; - let restrict_linked_loci_per_chromosome = true; - - let optimise_n_steps_min_loci_corr = 1; - let optimise_n_steps_max_pool_dist = 1; - let optimise_max_l_loci = 100; - let optimise_max_k_neighbours = 50; - - let optimise_n_reps = 1; - let (loci_idx, _loci_chr, _loci_pos) = frequencies_and_phenotypes.count_loci().unwrap(); let frequencies_and_phenotypes_clone = frequencies_and_phenotypes.clone(); + let n_reps = 5; + let min_loci_corr = f64::NAN; + let max_pool_dist = f64::NAN; + let min_l_loci = 10; + let min_k_neighbours = 10; + let restrict_linked_loci_per_chromosome = false; - let _ = frequencies_and_phenotypes + let (_self, mae) = frequencies_and_phenotypes .adaptive_ld_knn_imputation( - &frequencies_and_phenotypes_clone, - &min_loci_corr, - &max_pool_dist, - &min_l_loci, - &min_k_neighbours, - &loci_idx, - &corr, - false, + + &frequencies_and_phenotypes_clone, + &min_loci_corr, + &max_pool_dist, + &min_l_loci, + &min_k_neighbours, + &loci_idx, + &corr, + restrict_linked_loci_per_chromosome, + &n_reps, ) .unwrap(); println!( "After imputation:\n{:?}", frequencies_and_phenotypes.intercept_and_allele_frequencies ); + println!("Estimated MAE={}", mae); assert_eq!(0, 1) // let n_nan = frequencies_and_phenotypes // .intercept_and_allele_frequencies diff --git a/src/rust/src/lib.rs b/src/rust/src/lib.rs index dd082f9..f2dacb6 100644 --- a/src/rust/src/lib.rs +++ b/src/rust/src/lib.rs @@ -9,7 +9,6 @@ mod filter_missing; mod geno; mod helpers; mod mvi; -mod optim; mod phen; mod structs_and_traits; mod sync; @@ -31,16 +30,12 @@ fn impute( max_depth_above_which_are_missing: f64, frac_top_missing_pools: f64, frac_top_missing_loci: f64, + n_reps: u64, min_loci_corr: f64, max_pool_dist: f64, min_l_loci: u64, min_k_neighbours: u64, restrict_linked_loci_per_chromosome: bool, - optimise_n_steps_min_loci_corr: u64, - optimise_n_steps_max_pool_dist: u64, - optimise_max_l_loci: u64, - optimise_max_k_neighbours: u64, - optimise_n_reps: u64, n_threads: u64, fname_out_prefix: String, // ) -> String { @@ -321,6 +316,17 @@ fn impute( println!("###################################################################################################"); println!("aldknni: adaptive linkage disequilibrium (LD)-based k-nearest neighbour imputation of genotype data"); println!("###################################################################################################"); + // Handling NA conversion into Rust's f64:NAN + let min_loci_corr = if min_loci_corr < 0.0 { + f64::NAN + } else { + min_loci_corr + }; + let max_pool_dist = if max_pool_dist < 0.0 { + f64::NAN + } else { + max_pool_dist + }; impute_aldknni( genotypes_and_phenotypes, &filter_stats, @@ -329,11 +335,7 @@ fn impute( &min_l_loci, &min_k_neighbours, restrict_linked_loci_per_chromosome, - &(optimise_n_steps_min_loci_corr as usize), - &(optimise_n_steps_max_pool_dist as usize), - &optimise_max_l_loci, - &optimise_max_k_neighbours, - &(optimise_n_reps as usize), + &(n_reps as usize), &(n_threads as usize), &fname_out, ) diff --git a/src/rust/src/mvi.rs b/src/rust/src/mvi.rs index ef7a6f9..e38bc36 100644 --- a/src/rust/src/mvi.rs +++ b/src/rust/src/mvi.rs @@ -1,8 +1,10 @@ -use ndarray::prelude::*; -use std::io; - use crate::helpers::*; use crate::structs_and_traits::*; +use ndarray::prelude::*; +use rand::prelude::IteratorRandom; +use rand::rngs::StdRng; +use rand::SeedableRng; +use std::io; impl GenotypesAndPhenotypes { pub fn mean_imputation(&mut self) -> io::Result<&mut Self> { @@ -58,6 +60,164 @@ impl GenotypesAndPhenotypes { // println!("self.coverages = {:?}", self.coverages); Ok(self) } + + pub fn simulate_missing( + &mut self, + loci_idx: &Vec, + max_sparsity: &f64, + rep: &usize, + ) -> io::Result<(&mut Self, f64, Vec, Vec, Vec>)> { + self.check().expect("Error calling check() method within simulate_missing() method for GenotypesAndPhenotypes struct."); + let (n, l) = self.coverages.dim(); + // let (loci_idx, _loci_chr, _loci_pos) = self.count_loci().expect("Error defining loci indexes and identities via count_loci() method within simulate_missing() method for GenotypesAndPhenotypes struct."); + // Limit max_sparsity to 90% + let max_sparsity = if *max_sparsity > 0.9 { + 0.9 + } else { + *max_sparsity + }; + // Define the number of loci to mask + let n_masked = (((n * l) as f64) * max_sparsity).floor() as usize; + // Sample random loci + // let mut rng = rand::thread_rng(); + let mut rng = StdRng::seed_from_u64(*rep as u64); + let mut vec_masked_loci_idx_tmp = (0..(n * l)).choose_multiple(&mut rng, n_masked); + vec_masked_loci_idx_tmp.sort(); + // Mask and extract the masked information + let mut vec_masked_loci_idx: Vec = vec![]; + let mut vec_masked_coverages: Vec = vec![]; + let mut vec_vec_masked_alleles_freqs: Vec> = vec![]; + let mut idx_counter = 0; + let mut idx = 0; + 'outer: for i in 0..n { + for j in 0..l { + if vec_masked_loci_idx_tmp[idx_counter] == idx { + if !self.coverages[(i, j)].is_nan() { + // Extract index of the locus + vec_masked_loci_idx.push(idx); + // Extract mask the coverages + vec_masked_coverages.push(self.coverages[(i, j)]); + self.coverages[(i, j)] = f64::NAN; + // Use the indexes of the locus extract masked frequencies and to set missing values to all alleles in the locus + let idx_ini = loci_idx[j]; + let idx_fin = loci_idx[j + 1]; + let mut vec_freqs: Vec = vec![]; + for k in idx_ini..idx_fin { + vec_freqs.push(self.intercept_and_allele_frequencies[(i, k)]); + self.intercept_and_allele_frequencies[(i, k)] = f64::NAN; + } + vec_vec_masked_alleles_freqs.push(vec_freqs); + } + idx_counter += 1; + if idx_counter == vec_masked_loci_idx_tmp.len() { + break 'outer; + } + } + idx += 1; + } + } + self.check().expect("Error calling check() method within simulate_missing() method for GenotypesAndPhenotypes struct."); + Ok(( + self, + max_sparsity, + vec_masked_loci_idx, + vec_masked_coverages, + vec_vec_masked_alleles_freqs, + )) + } + + pub fn extract_imputed_mask( + &self, + loci_idx: &Vec, + vec_masked_loci_idx: &Vec, + ) -> io::Result>> { + let (n, _p) = self.intercept_and_allele_frequencies.dim(); + let (_n, l) = self.coverages.dim(); + let mut vec_vec_imputed_mask: Vec> = vec![]; + let mut idx_counter = 0; + let mut idx = 0; + 'outer: for i in 0..n { + for j in 0..l { + if vec_masked_loci_idx[idx_counter] == idx { + let idx_ini = loci_idx[j]; + let idx_fin = loci_idx[j + 1]; + let mut vec_freqs: Vec = vec![]; + for k in idx_ini..idx_fin { + vec_freqs.push(self.intercept_and_allele_frequencies[(i, k)]); + } + vec_vec_imputed_mask.push(vec_freqs); + idx_counter += 1; + if idx_counter == vec_masked_loci_idx.len() { + break 'outer; + } + } + idx += 1; + } + } + Ok(vec_vec_imputed_mask) + } + + pub fn estimate_expected_mae_in_mvi(&self) -> io::Result { + let mut genotype_data_for_optimisation = self.clone(); + let (n, p) = genotype_data_for_optimisation + .intercept_and_allele_frequencies + .dim(); + let missing_rate_sim = if (n * p) < 20_000 { + 0.01 + } else { + 10_000.0 / ((n * p) as f64) + }; + // Extract loci indices + let (loci_idx, _loci_chr, _loci_pos) = self.count_loci().expect("Error calling count_loci() method within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes struct."); + let ( + _, + max_sparsity, + vec_masked_loci_idx, + _vec_masked_coverages, + vec_vec_masked_alleles_freqs, + ) = genotype_data_for_optimisation + .simulate_missing(&loci_idx, &missing_rate_sim, &0) + .expect("Error calling simulate_missing() within estimate_expected_mae_in_mvi() method for GenotypesAndPhenotypes struct."); + assert_eq!( + max_sparsity, missing_rate_sim, + "Ooops! Missing unexpected simulated sparsity!" + ); + let _ = genotype_data_for_optimisation.mean_imputation().expect("Error calling mean_imputation() within estimate_expected_mae_in_mvi() method for GenotypesAndPhenotypes struct."); + let vec_vec_imputed_mask = genotype_data_for_optimisation + .extract_imputed_mask(&loci_idx, &vec_masked_loci_idx) + .expect("Error calling extract_imputed_mask() within estimate_expected_mae_in_mvi() method for GenotypesAndPhenotypes struct."); + // Compare imputed and expected frequencies + let m = vec_vec_masked_alleles_freqs.len(); // total number of loci + let mean_absolute_error = if m == 0 { + // If no loci were simulated to be missing + 1.00 + } else { + let mut sum_abs = 0.0; + let mut a = 0.0; // total number of alleles across all loci + for i in 0..m { + let w = vec_vec_masked_alleles_freqs[i].len(); + for j in 0..w { + if (vec_vec_masked_alleles_freqs[i][j].is_nan()) + | (vec_vec_imputed_mask[i][j].is_nan()) + { + // Skip if we if masked already missing loci or if we cannot impute because the whole locus is missing + continue; + } else { + sum_abs += + (vec_vec_masked_alleles_freqs[i][j] - vec_vec_imputed_mask[i][j]).abs(); + // (vec_vec_masked_alleles_freqs[i][j] - vec_vec_imputed_mask[i][j]).powf(2.0); // MSE seem to have the same behaviour as MAE + a += 1.00; + } + } + } + if a == 0.0 { + 1.00 + } else { + sum_abs / a + } + }; + Ok(mean_absolute_error) + } } // Impute using mean allele frequencies across pools diff --git a/src/rust/src/optim.rs b/src/rust/src/optim.rs deleted file mode 100644 index 7894211..0000000 --- a/src/rust/src/optim.rs +++ /dev/null @@ -1,599 +0,0 @@ -use rand::prelude::IteratorRandom; -use rand::rngs::StdRng; -use rand::{SeedableRng}; -use std::io; - -use crate::structs_and_traits::*; - -impl GenotypesAndPhenotypes { - pub fn simulate_missing( - &mut self, - loci_idx: &Vec, - max_sparsity: &f64, - rep: &usize, - ) -> io::Result<(&mut Self, f64, Vec, Vec, Vec>)> { - self.check().expect("Error calling check() method within simulate_missing() method for GenotypesAndPhenotypes struct."); - let (n, l) = self.coverages.dim(); - // let (loci_idx, _loci_chr, _loci_pos) = self.count_loci().expect("Error defining loci indexes and identities via count_loci() method within simulate_missing() method for GenotypesAndPhenotypes struct."); - // Limit max_sparsity to 90% - let max_sparsity = if *max_sparsity > 0.9 { - 0.9 - } else { - *max_sparsity - }; - // Define the number of loci to mask - let n_masked = (((n * l) as f64) * max_sparsity).floor() as usize; - // Sample random loci - // let mut rng = rand::thread_rng(); - let mut rng = StdRng::seed_from_u64(*rep as u64); - let mut vec_masked_loci_idx_tmp = (0..(n * l)).choose_multiple(&mut rng, n_masked); - vec_masked_loci_idx_tmp.sort(); - // Mask and extract the masked information - let mut vec_masked_loci_idx: Vec = vec![]; - let mut vec_masked_coverages: Vec = vec![]; - let mut vec_vec_masked_alleles_freqs: Vec> = vec![]; - let mut idx_counter = 0; - let mut idx = 0; - 'outer: for i in 0..n { - for j in 0..l { - if vec_masked_loci_idx_tmp[idx_counter] == idx { - if !self.coverages[(i, j)].is_nan() { - // Extract index of the locus - vec_masked_loci_idx.push(idx); - // Extract mask the coverages - vec_masked_coverages.push(self.coverages[(i, j)]); - self.coverages[(i, j)] = f64::NAN; - // Use the indexes of the locus extract masked frequencies and to set missing values to all alleles in the locus - let idx_ini = loci_idx[j]; - let idx_fin = loci_idx[j + 1]; - let mut vec_freqs: Vec = vec![]; - for k in idx_ini..idx_fin { - vec_freqs.push(self.intercept_and_allele_frequencies[(i, k)]); - self.intercept_and_allele_frequencies[(i, k)] = f64::NAN; - } - vec_vec_masked_alleles_freqs.push(vec_freqs); - } - idx_counter += 1; - if idx_counter == vec_masked_loci_idx_tmp.len() { - break 'outer; - } - } - idx += 1; - } - } - self.check().expect("Error calling check() method within simulate_missing() method for GenotypesAndPhenotypes struct."); - Ok(( - self, - max_sparsity, - vec_masked_loci_idx, - vec_masked_coverages, - vec_vec_masked_alleles_freqs, - )) - } - - pub fn extract_imputed_mask( - &self, - loci_idx: &Vec, - vec_masked_loci_idx: &Vec, - ) -> io::Result>> { - let (n, _p) = self.intercept_and_allele_frequencies.dim(); - let (_n, l) = self.coverages.dim(); - let mut vec_vec_imputed_mask: Vec> = vec![]; - let mut idx_counter = 0; - let mut idx = 0; - 'outer: for i in 0..n { - for j in 0..l { - if vec_masked_loci_idx[idx_counter] == idx { - let idx_ini = loci_idx[j]; - let idx_fin = loci_idx[j + 1]; - let mut vec_freqs: Vec = vec![]; - for k in idx_ini..idx_fin { - vec_freqs.push(self.intercept_and_allele_frequencies[(i, k)]); - } - vec_vec_imputed_mask.push(vec_freqs); - idx_counter += 1; - if idx_counter == vec_masked_loci_idx.len() { - break 'outer; - } - } - idx += 1; - } - } - Ok(vec_vec_imputed_mask) - } - - pub fn estimate_expected_mae_in_aldknni( - &mut self, - self_clone: &GenotypesAndPhenotypes, - min_loci_corr: &f64, - max_pool_dist: &f64, - min_l_loci: &u64, - min_k_neighbours: &u64, - loci_idx: &Vec, - corr: &Vec>, - restrict_linked_loci_per_chromosome: bool, - vec_masked_loci_idx: &Vec, - vec_vec_masked_alleles_freqs: &Vec>, - ) -> io::Result<(&mut Self, f64)> { - self.adaptive_ld_knn_imputation( - self_clone, - min_loci_corr, - max_pool_dist, - min_l_loci, - min_k_neighbours, - loci_idx, - corr, - restrict_linked_loci_per_chromosome, - ) - .expect("Error calling adaptive_ld_knn_imputation() within estimate_expected_mae_in_aldknni() method for GenotypesAndPhenotypes struct."); - let (n, _p) = self.intercept_and_allele_frequencies.dim(); - let (_n, l) = self.coverages.dim(); - - let m = vec_vec_masked_alleles_freqs.len(); // total number of loci - - let mean_absolute_error = if m == 0 { - // If no loci were simulated to be missing - 1.00 - } else { - let mut sum_abs = 0.0; - let mut a = 0.0; // total number of alleles across all loci - - let mut idx_counter = 0; - let mut idx = 0; - 'outer: for i in 0..n { - for j in 0..l { - if vec_masked_loci_idx[idx_counter] == idx { - let idx_ini = loci_idx[j]; - let idx_fin = loci_idx[j + 1]; - for k in idx_ini..idx_fin { - if (vec_vec_masked_alleles_freqs[idx_counter][k - idx_ini].is_nan()) - | (self.intercept_and_allele_frequencies[(i, k)].is_nan()) - { - // Skip if we if masked already missing loci or if we cannot impute because the whole locus is missing - continue; - } else { - sum_abs += (vec_vec_masked_alleles_freqs[idx_counter][k - idx_ini] - - self.intercept_and_allele_frequencies[(i, k)]) - .abs(); - a += 1.00; - // Reset self - self.intercept_and_allele_frequencies[(i, k)] = f64::NAN; - self.coverages[(i, j)] = f64::NAN; - } - } - idx_counter += 1; - if idx_counter == vec_masked_loci_idx.len() { - break 'outer; - } - } - idx += 1; - } - } - if a == 0.0 { - 1.00 - } else { - sum_abs / a - } - }; - - Ok((self, mean_absolute_error)) - } - - pub fn estimate_expected_mae_in_mvi(&self) -> io::Result { - let mut genotype_data_for_optimisation = self.clone(); - let (n, p) = genotype_data_for_optimisation - .intercept_and_allele_frequencies - .dim(); - let missing_rate_sim = if (n * p) < 20_000 { - 0.01 - } else { - 10_000.0 / ((n * p) as f64) - }; - // Extract loci indices - let (loci_idx, _loci_chr, _loci_pos) = self.count_loci().expect("Error calling count_loci() method within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes struct."); - let ( - _, - max_sparsity, - vec_masked_loci_idx, - _vec_masked_coverages, - vec_vec_masked_alleles_freqs, - ) = genotype_data_for_optimisation - .simulate_missing(&loci_idx, &missing_rate_sim, &0) - .expect("Error calling simulate_missing() within estimate_expected_mae_in_mvi() method for GenotypesAndPhenotypes struct."); - assert_eq!( - max_sparsity, missing_rate_sim, - "Ooops! Missing unexpected simulated sparsity!" - ); - let _ = genotype_data_for_optimisation.mean_imputation().expect("Error calling mean_imputation() within estimate_expected_mae_in_mvi() method for GenotypesAndPhenotypes struct."); - let vec_vec_imputed_mask = genotype_data_for_optimisation - .extract_imputed_mask(&loci_idx, &vec_masked_loci_idx) - .expect("Error calling extract_imputed_mask() within estimate_expected_mae_in_mvi() method for GenotypesAndPhenotypes struct."); - // Compare imputed and expected frequencies - let m = vec_vec_masked_alleles_freqs.len(); // total number of loci - let mean_absolute_error = if m == 0 { - // If no loci were simulated to be missing - 1.00 - } else { - let mut sum_abs = 0.0; - let mut a = 0.0; // total number of alleles across all loci - for i in 0..m { - let w = vec_vec_masked_alleles_freqs[i].len(); - for j in 0..w { - if (vec_vec_masked_alleles_freqs[i][j].is_nan()) - | (vec_vec_imputed_mask[i][j].is_nan()) - { - // Skip if we if masked already missing loci or if we cannot impute because the whole locus is missing - continue; - } else { - sum_abs += - (vec_vec_masked_alleles_freqs[i][j] - vec_vec_imputed_mask[i][j]).abs(); - // (vec_vec_masked_alleles_freqs[i][j] - vec_vec_imputed_mask[i][j]).powf(2.0); // MSE seem to have the same behaviour as MAE - a += 1.00; - } - } - } - if a == 0.0 { - 1.00 - } else { - sum_abs / a - } - }; - Ok(mean_absolute_error) - } -} - -pub fn optimise_params_and_estimate_accuracy( - genotypes_and_phenotypes: &GenotypesAndPhenotypes, - self_clone: &GenotypesAndPhenotypes, - min_loci_corr: &f64, - max_pool_dist: &f64, - min_l_loci: &u64, - min_k_neighbours: &u64, - loci_idx: &Vec, - corr: &Vec>, - restrict_linked_loci_per_chromosome: bool, - optimise_n_steps_min_loci_corr: &usize, - optimise_n_steps_max_pool_dist: &usize, - optimise_max_l_loci: &u64, - optimise_max_k_neighbours: &u64, - optimise_n_reps: &usize, -) -> io::Result<(f64, f64, u64, u64, f64)> { - // Defining the ranges of min_loci_corr, max_pool_dist , min_l_loci and min_k_neighbours to test - // Note: The number of steps may vary from the input depending on how we can evenly divide the range - let vec_min_loci_corr: Vec = if *optimise_n_steps_min_loci_corr > 1 { - let step_size = (101.0 / *optimise_n_steps_min_loci_corr as f64).round() as usize; - (0..=100) - .step_by(step_size) - .map(|x| x as f64 / 100.0) - .collect::>() - .into_iter() - .rev() - .collect() - } else { - vec![*min_loci_corr] - }; - let vec_max_pool_dist: Vec = if *optimise_n_steps_max_pool_dist > 1 { - let step_size = (101.0 / *optimise_n_steps_max_pool_dist as f64).round() as usize; - (0..=100) - .step_by(step_size) - .map(|x| x as f64 / 100.0) - .collect() - } else { - vec![*max_pool_dist] - }; - let vec_min_l_loci: Vec = if *optimise_max_l_loci > *min_l_loci { - let step_size = 1; - (1..=*optimise_max_l_loci).step_by(step_size).collect() - } else { - vec![*min_l_loci] - }; - let vec_min_k_neighbours: Vec = if *optimise_max_k_neighbours > *min_k_neighbours { - let k_max = if *optimise_max_k_neighbours as usize - > genotypes_and_phenotypes - .intercept_and_allele_frequencies - .nrows() - { - genotypes_and_phenotypes - .intercept_and_allele_frequencies - .nrows() - } else { - *optimise_max_k_neighbours as usize - }; - let step_size = 1; - (1..=k_max).step_by(step_size).map(|x| x as u64).collect() - } else { - vec![*min_k_neighbours] - }; - // Optimisation via coordinate-descent-like algorithm starting with the strictest parameters - // Parameter optimisation order: - // (1) minimum loci correlation (from high to low) - // (2) maximum pool distances (from low to high) - // (3) l loci (from low to high) - // (4) k-neighbours (from low to high) - // As we explore the parameter space one at a time, - // first at decreasing stringency then at increasing stringency, - // we replace the optimum parameter if mae decreases, and - // break if mae increases so that we are more computationally efficient. - let all_parameters_are_fixed = (vec_min_loci_corr.len() == 1) - & (vec_max_pool_dist.len() == 1) - & (vec_min_l_loci.len() == 1) - & (vec_min_k_neighbours.len() == 1); - let optimise_n_reps = if all_parameters_are_fixed & (*optimise_n_reps > 1) { - 1 - } else { - *optimise_n_reps - }; - let mut min_loci_corr = vec![]; - let mut max_pool_dist = vec![]; - let mut min_l_loci = vec![]; - let mut min_k_neighbours = vec![]; - let mut mae = vec![]; - if !all_parameters_are_fixed { - println!("-----------------------------------------------"); - if optimise_n_reps > 1 { - println!("rep\tmin_loci_corr\tmax_pool_dist\tmin_l_loci\tmin_k_neighbours\tmae"); - } else { - println!("min_loci_corr\tmax_pool_dist\tmin_l_loci\tmin_k_neighbours\tmae"); - } - } - // Assumes smooth unimodal parameter spaces (which is probably wrong!) - for r in 0..optimise_n_reps { - // Simulate 10,000 missing data (or 1% sparsity, if we have less than 20,000 data points) to determine accuracy (in terms of MAE) and to optimise for the best k and l, if applicable, i.e. n_loci_to_estimate_distance==0 or k_neighbours==0 - let mut genotype_data_for_optimisation = genotypes_and_phenotypes.clone(); - let (n, p) = genotype_data_for_optimisation - .intercept_and_allele_frequencies - .dim(); - let missing_rate_sim = if (n * p) < 20_000 { - 0.01 - } else { - 10_000.0 / ((n * p) as f64) - }; - let ( - _, - max_sparsity, - vec_masked_loci_idx, - _vec_masked_coverages, - vec_vec_masked_alleles_freqs, - ) = genotype_data_for_optimisation - .simulate_missing(loci_idx, &missing_rate_sim, &r) - .expect("Error calling simulate_missing() method within optimise_params_and_estimate_accuracy()."); - assert_eq!( - max_sparsity, missing_rate_sim, - "Ooops! Missing unexpected simulated sparsity!" - ); - // Coordinate-descent-like optimisation starting with the strictest parameters - let mut optimum_min_loci_corr = vec_min_loci_corr[0]; - let mut optimum_max_pool_dist = vec_max_pool_dist[0]; - let mut optimum_min_l_loci = vec_min_l_loci[0]; - let mut optimum_min_k_neighbours = vec_min_k_neighbours[0]; - let (_, mut optimum_mae) = genotype_data_for_optimisation - .estimate_expected_mae_in_aldknni( - self_clone, - &optimum_min_loci_corr, - &optimum_max_pool_dist, - &optimum_min_l_loci, - &optimum_min_k_neighbours, - loci_idx, - corr, - restrict_linked_loci_per_chromosome, - &vec_masked_loci_idx, - &vec_vec_masked_alleles_freqs, - ) - .expect("Error calling estimate_expected_mae_in_aldknni() method within optimise_params_and_estimate_accuracy()."); - if all_parameters_are_fixed == false { - let mut vec_idx_c_d_l_k = vec![0; 4]; - let vec_len_c_d_l_k = vec![ - vec_min_loci_corr.len(), - vec_max_pool_dist.len(), - vec_min_l_loci.len(), - vec_min_k_neighbours.len(), - ]; - // Step across all four parameter spaces twice, where within each parameter space we move forwards and backwards from end to end and from middle to both ends - for ix in vec![0, 1, 2, 3, 0, 1, 2, 3].into_iter() { - let mut opt_idx = 0; - let ini = 0; - let mid = (vec_len_c_d_l_k[ix]) / 2; - let fin = vec_len_c_d_l_k[ix]; - let vec_vec_params_idx = vec![ - (ini..fin).collect::>(), - (ini..fin).rev().collect::>(), - (mid..fin).collect::>(), - (ini..mid).rev().collect::>(), - ]; - for iy in 0..vec_vec_params_idx.len() { - for l in vec_vec_params_idx[iy].clone().into_iter() { - vec_idx_c_d_l_k[ix] = l; - let (_, mae) = genotype_data_for_optimisation - .estimate_expected_mae_in_aldknni( - self_clone, - &vec_min_loci_corr[vec_idx_c_d_l_k[0]], - &vec_max_pool_dist[vec_idx_c_d_l_k[1]], - &vec_min_l_loci[vec_idx_c_d_l_k[2]], - &vec_min_k_neighbours[vec_idx_c_d_l_k[3]], - loci_idx, - corr, - restrict_linked_loci_per_chromosome, - &vec_masked_loci_idx, - &vec_vec_masked_alleles_freqs, - ) - .expect("Error calling estimate_expected_mae_in_aldknni() method within optimise_params_and_estimate_accuracy()."); - if optimise_n_reps > 1 { - println!( - "{}\t{}\t{}\t{}\t{}\t{}", - r, - &vec_min_loci_corr[vec_idx_c_d_l_k[0]], - &vec_max_pool_dist[vec_idx_c_d_l_k[1]], - &vec_min_l_loci[vec_idx_c_d_l_k[2]], - &vec_min_k_neighbours[vec_idx_c_d_l_k[3]], - mae - ); - } else { - println!( - "{}\t{}\t{}\t{}\t{}", - &vec_min_loci_corr[vec_idx_c_d_l_k[0]], - &vec_max_pool_dist[vec_idx_c_d_l_k[1]], - &vec_min_l_loci[vec_idx_c_d_l_k[2]], - &vec_min_k_neighbours[vec_idx_c_d_l_k[3]], - mae - ); - } - if mae < optimum_mae { - optimum_mae = mae; - optimum_min_loci_corr = vec_min_loci_corr[vec_idx_c_d_l_k[0]]; - optimum_max_pool_dist = vec_max_pool_dist[vec_idx_c_d_l_k[1]]; - optimum_min_l_loci = vec_min_l_loci[vec_idx_c_d_l_k[2]]; - optimum_min_k_neighbours = vec_min_k_neighbours[vec_idx_c_d_l_k[3]]; - opt_idx = vec_idx_c_d_l_k[ix]; - } else if (mae - optimum_mae) > 0.0001 { - break; - } - } // Across the parameter space - vec_idx_c_d_l_k[ix] = opt_idx; - } // Forwards and backwards from end to end and from the middle to both ends of the parameter space - } // For each parameter - } - // Append parameters - mae.push(optimum_mae); - min_loci_corr.push(optimum_min_loci_corr); - max_pool_dist.push(optimum_max_pool_dist); - min_l_loci.push(optimum_min_l_loci); - min_k_neighbours.push(optimum_min_k_neighbours); - } - if !all_parameters_are_fixed { - println!("-----------------------------------------------"); - } - Ok(( - min_loci_corr.iter().fold(0.0, |sum, &x| sum + x) / (optimise_n_reps as f64), - max_pool_dist.iter().fold(0.0, |sum, &x| sum + x) / (optimise_n_reps as f64), - min_l_loci.iter().fold(0, |sum, &x| sum + x) / (optimise_n_reps as u64), - min_k_neighbours.iter().fold(0, |sum, &x| sum + x) / (optimise_n_reps as u64), - mae.iter().fold(0.0, |sum, &x| sum + x) / (optimise_n_reps as f64), - )) -} - -// Make tests -#[cfg(test)] -mod tests { - use super::*; - use crate::aldknni::*; - - #[test] - fn test_optim() { - let file_sync = FileSync { - filename: "./tests/test.sync".to_owned(), - test: "load".to_owned(), - }; - let file_phen = FilePhen { - filename: "./tests/test_pheno.csv".to_owned(), - delim: ",".to_owned(), - names_column_id: 0, - sizes_column_id: 1, - trait_values_column_ids: vec![2, 3], - format: "default".to_owned(), - }; - let file_sync_phen = *(file_sync, file_phen).lparse().unwrap(); - let filter_stats = FilterStats { - remove_ns: true, - min_quality: 0.005, - min_coverage: 1, - min_allele_frequency: 0.005, - max_missingness_rate: 0.0, - pool_sizes: vec![20., 20., 20., 20., 20.], - }; - println!("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"); - let n_threads = 2; - let mut frequencies_and_phenotypes = file_sync_phen - .into_genotypes_and_phenotypes(&filter_stats, false, &n_threads) - .unwrap(); - println!( - "Before simulating missing data:\n{:?}", - frequencies_and_phenotypes.intercept_and_allele_frequencies - ); - println!("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"); - let (loci_idx, _loci_chr, _loci_pos) = frequencies_and_phenotypes.count_loci().unwrap(); - let ( - _, - _max_sparsity, - vec_masked_loci_idx, - _vec_masked_coverages, - vec_vec_masked_alleles_freqs, - ) = frequencies_and_phenotypes - .simulate_missing(&loci_idx, &0.25, &42) - .unwrap(); - println!( - "After simulating missing data:\n{:?}", - frequencies_and_phenotypes.intercept_and_allele_frequencies - ); - println!("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"); - let vec_vec_imputed_mask = frequencies_and_phenotypes - .extract_imputed_mask(&loci_idx, &vec_masked_loci_idx) - .unwrap(); - - println!("loci_idx.len()={}", loci_idx.len()); - println!("vec_masked_loci_idx.len()={}", vec_masked_loci_idx.len()); - println!( - "vec_vec_masked_alleles_freqs.len()={}", - vec_vec_masked_alleles_freqs.len() - ); - println!("vec_vec_imputed_mask.len()={}", vec_vec_imputed_mask.len()); - - println!( - "vec_masked_loci_idx[vec_masked_loci_idx.len()-1]={}", - vec_masked_loci_idx[vec_masked_loci_idx.len() - 1] - ); - println!( - "vec_vec_masked_alleles_freqs[vec_vec_masked_alleles_freqs.len()-1]={:?}", - vec_vec_masked_alleles_freqs[vec_vec_masked_alleles_freqs.len() - 1] - ); - - println!("After simlating missing data:"); - println!( - "vec_vec_imputed_mask[0..10]={:?}", - &vec_vec_imputed_mask[0..10] - ); - println!( - "vec_vec_imputed_mask[vec_vec_imputed_mask.len()-1]={:?}", - vec_vec_imputed_mask[vec_vec_imputed_mask.len() - 1] - ); - assert!(vec_vec_imputed_mask[0][0].is_nan()); - assert!(vec_vec_imputed_mask[vec_vec_imputed_mask.len() - 1][0].is_nan()); - - let restrict_linked_loci_per_chromosome = false; - let (loci_idx, _loci_chr, _loci_pos) = frequencies_and_phenotypes.count_loci().unwrap(); - let corr = calculate_genomewide_ld( - &frequencies_and_phenotypes, - restrict_linked_loci_per_chromosome, - ) - .unwrap(); - let frequencies_and_phenotypes_clone = frequencies_and_phenotypes.clone(); - let (min_loci_corr, max_pool_dist, min_l_loci, min_k_neighbours, mae) = - optimise_params_and_estimate_accuracy( - &frequencies_and_phenotypes, - &frequencies_and_phenotypes_clone, - &0.0, - &1.0, - &1, - &1, - &loci_idx, - &corr, - restrict_linked_loci_per_chromosome, - &10, - &10, - &50, - &50, - &1, - ) - .unwrap(); - - println!( - "min_loci_corr={}, max_pool_dist={}, min_l_loci={}, min_k_neighbours={}, mae={}", - min_loci_corr, max_pool_dist, min_l_loci, min_k_neighbours, mae - ); - - let _vec_vec_imputed_mask = frequencies_and_phenotypes - .extract_imputed_mask(&loci_idx, &vec_masked_loci_idx) - .unwrap(); - // println!("vec_vec_imputed_mask={:?}", vec_vec_imputed_mask); - // assert_eq!(0, 1); - } -} diff --git a/tests/tests.R b/tests/tests.R index 937e716..24bd9ec 100644 --- a/tests/tests.R +++ b/tests/tests.R @@ -26,10 +26,10 @@ test_that( ) test_that( - "aldknni_fixed", { - print("aldknni_fixed:") - vcf = fn_extract_missing(aldknni(fname=fname_vcf)) - c(0, 0, 0, 0, 21, 0) - sync = fn_extract_missing(aldknni(fname=fname_sync)) - c(0, 0, 0, 0, 0, 0) + "aldknni_optim", { + print("aldknni_optim:") + vcf = fn_extract_missing(aldknni(fname=fname_vcf)) + sync = fn_extract_missing(aldknni(fname=fname_sync)) csv = fn_extract_missing(aldknni(fname=fname_csv)) - c(0, 0, 0, 0, 1, 0) expect_equal(vcf, sync, tolerance=0.05) expect_equal(vcf, csv, tolerance=0.05) @@ -37,33 +37,11 @@ test_that( ) test_that( - "aldknni_optim_cd", { - print("aldknni_optim_cd:") - vcf = fn_extract_missing(aldknni(fname=fname_vcf, optimise_n_steps_min_loci_corr=10, optimise_n_steps_max_pool_dist=10, min_l_loci=1, min_k_neighbours=1, optimise_max_l_loci=1, optimise_max_k_neighbours=1)) - c(0, 0, 0, 0, 21, 0) - sync = fn_extract_missing(aldknni(fname=fname_sync, optimise_n_steps_min_loci_corr=10, optimise_n_steps_max_pool_dist=10, min_l_loci=1, min_k_neighbours=1, optimise_max_l_loci=1, optimise_max_k_neighbours=1)) - c(0, 0, 0, 0, 0, 0) - csv = fn_extract_missing(aldknni(fname=fname_csv, optimise_n_steps_min_loci_corr=10, optimise_n_steps_max_pool_dist=10, min_l_loci=1, min_k_neighbours=1, optimise_max_l_loci=1, optimise_max_k_neighbours=1)) - c(0, 0, 0, 0, 1, 0) - expect_equal(vcf, sync, tolerance=0.05) - expect_equal(vcf, csv, tolerance=0.05) - } -) - -test_that( - "aldknni_optim_lk", { - print("aldknni_optim_lk:") - vcf = fn_extract_missing(aldknni(fname=fname_vcf, min_loci_corr=0.0, max_pool_dist=1.0, optimise_n_steps_min_loci_corr=1, optimise_n_steps_max_pool_dist=1, optimise_max_l_loci=100, optimise_max_k_neighbours=100)) - c(0, 0, 0, 0, 21, 0) - sync = fn_extract_missing(aldknni(fname=fname_sync, min_loci_corr=0.0, max_pool_dist=1.0, optimise_n_steps_min_loci_corr=1, optimise_n_steps_max_pool_dist=1, optimise_max_l_loci=100, optimise_max_k_neighbours=100)) - c(0, 0, 0, 0, 0, 0) - csv = fn_extract_missing(aldknni(fname=fname_csv, min_loci_corr=0.0, max_pool_dist=1.0, optimise_n_steps_min_loci_corr=1, optimise_n_steps_max_pool_dist=1, optimise_max_l_loci=100, optimise_max_k_neighbours=100)) - c(0, 0, 0, 0, 1, 0) - expect_equal(vcf, sync, tolerance=0.05) - expect_equal(vcf, csv, tolerance=0.05) - } -) - -test_that( - "aldknni_optim_all", { - print("aldknni_optim_all:") - vcf = fn_extract_missing(aldknni(fname=fname_vcf, optimise_n_steps_min_loci_corr=10, optimise_n_steps_max_pool_dist=10, optimise_max_l_loci=100, optimise_max_k_neighbours=100)) - c(0, 0, 0, 0, 21, 0) - sync = fn_extract_missing(aldknni(fname=fname_sync, optimise_n_steps_min_loci_corr=10, optimise_n_steps_max_pool_dist=10, optimise_max_l_loci=100, optimise_max_k_neighbours=100)) - c(0, 0, 0, 0, 0, 0) - csv = fn_extract_missing(aldknni(fname=fname_csv, optimise_n_steps_min_loci_corr=10, optimise_n_steps_max_pool_dist=10, optimise_max_l_loci=100, optimise_max_k_neighbours=100)) - c(0, 0, 0, 0, 1, 0) + "aldknni_fixed", { + print("aldknni_fixed:") + vcf = fn_extract_missing(aldknni(fname=fname_vcf, min_loci_corr=0.9, max_pool_dist=0.1)) + sync = fn_extract_missing(aldknni(fname=fname_sync, min_loci_corr=0.9, max_pool_dist=0.1)) + csv = fn_extract_missing(aldknni(fname=fname_csv, min_loci_corr=0.9, max_pool_dist=0.1)) - c(0, 0, 0, 0, 1, 0) expect_equal(vcf, sync, tolerance=0.05) expect_equal(vcf, csv, tolerance=0.05) } From fcf98e0404324dd9cc03f4162e84f860ae8e04e2 Mon Sep 17 00:00:00 2001 From: jeffersonfparil Date: Thu, 8 Feb 2024 16:37:09 +1100 Subject: [PATCH 5/8] draftin gthe new parallelisation with intermediate output files --- src/rust/src/aldknni.rs | 246 +++------------------------------------- 1 file changed, 14 insertions(+), 232 deletions(-) diff --git a/src/rust/src/aldknni.rs b/src/rust/src/aldknni.rs index 7895cb2..773df36 100644 --- a/src/rust/src/aldknni.rs +++ b/src/rust/src/aldknni.rs @@ -309,113 +309,7 @@ impl GenotypesAndPhenotypes { } else { vec![*max_pool_dist] }; - // // Define the 2D array used for storing the minimum mean absolute error (MAE) estimates across all missing data across pools and loci. - // // We encode the MAE as u8 for memory efficiency, where we set 0: u8 as missing. - // let mut mae_u8: Array2 = - // Array::from_elem(self.intercept_and_allele_frequencies.dim(), 0); - // // Optimise for the minimum loci correlation and maximum pool distance which minimises MAE per locus, and use these optimum threshold to impute missing allele frequency across pools and loci - // Zip::indexed(&mut self.intercept_and_allele_frequencies) - // .and(&mut mae_u8) - // .par_for_each(|(i, j), q, mu8| { - // if q.is_nan() { - // let current_chromosome = self_clone.chromosome[j].to_owned(); - // let vec_q: ArrayView1 = self_clone.intercept_and_allele_frequencies.column(j); - // let n_non_missing = vec_q.fold(0, |t, &x| if !x.is_nan() {t+1}else{t}); - // let n_reps = if *n_reps <= n_non_missing { - // *n_reps - // } else { - // n_non_missing - // }; - // let mut rng = rand::thread_rng(); - // let idx_random_pools: Vec = (0..vec_q.len()).filter(|&idx| !vec_q[idx].is_nan()).choose_multiple(&mut rng, n_reps); - // let mut optimum_mae = 1.0; - // let mut optimum_min_loci_corr = 0.0; - // let mut optimum_max_pool_dist = 1.0; - // // Across minimum loci correlation thresholds - // for min_loci_corr in vec_min_loci_corr.iter() { - // // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) - // let (linked_loci_idx, _correlations) = - // find_l_linked_loci(j, corr, min_loci_corr, min_l_loci, - // restrict_linked_loci_per_chromosome, - // ¤t_chromosome, - // &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Across maximum pool distance thresholds - // for max_pool_dist in vec_max_pool_dist.iter() { - // // Across reps - // let mut mae = 0.0; - // for idx_i in idx_random_pools.iter() { - // // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools - // let distances_all_loci = calculate_genetic_distances_between_pools( - // *idx_i, - // &linked_loci_idx, - // &self_clone.intercept_and_allele_frequencies) - // .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) - // let (distances, frequencies) = - // find_k_nearest_neighbours( - // &distances_all_loci, - // max_pool_dist, - // min_k_neighbours, - // j, - // &self_clone.intercept_and_allele_frequencies, - // ) - // .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Impute and find the error - // mae += (vec_q[*idx_i] - impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait.") - // ).abs(); - // } - // mae /= n_reps as f64; - // if (mae <= f64::EPSILON) | (mae > optimum_mae) { - // break; - // } - // if mae < optimum_mae { - // optimum_mae = mae; - // optimum_min_loci_corr = *min_loci_corr; - // optimum_max_pool_dist = *max_pool_dist; - // } - // } - // } - // // Take note of the minimum MAE which we code are u8 for memory efficiency - // // We use 254 instead of 255 and add 1 because we set 0 as missing above - // *mu8 = (optimum_mae * 254.0).round() as u8 + 1; - // // Impute actual missing data point (ith pool and jth locus) - // // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) - // let (linked_loci_idx, _correlations) = - // find_l_linked_loci(j, corr, &optimum_min_loci_corr, min_l_loci, - // restrict_linked_loci_per_chromosome, - // ¤t_chromosome, - // &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools - // let distances_all_loci = calculate_genetic_distances_between_pools( - // i, - // &linked_loci_idx, - // &self_clone.intercept_and_allele_frequencies) - // .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) - // let (distances, frequencies) = - // find_k_nearest_neighbours( - // &distances_all_loci, - // &optimum_max_pool_dist, - // min_k_neighbours, - // j, - // &self_clone.intercept_and_allele_frequencies, - // ) - // .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Impute missing allele frequencies at the current locus - // *q = impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // println!("q={:?}; mae={:?}; corr={}; dist={}", q, optimum_mae, optimum_min_loci_corr, optimum_max_pool_dist); - // } - // }); - - - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - /// Testing a different way of parallelisation - - - // let p = 10_013; + let mut n_chunks = 10; // Should be equal to the number of threads because std::thread::scope will wait for other threads to finish before starting with another thread once it finishes let chunk_size = p / n_chunks; let vec_idx_ini: Vec = (0..(p-chunk_size)).step_by(chunk_size).collect(); @@ -428,122 +322,21 @@ impl GenotypesAndPhenotypes { assert_eq!(vec_idx_ini.len(), vec_idx_fin.len()); n_chunks = vec_idx_ini.len(); // Instantiate thread object for parallel execution - let mut thread_objects = Vec::new(); - // Vector holding all returns from pileup2sync_chunk() - let thread_ouputs_freq: Arc>> = - Arc::new(Mutex::new(Vec::new())); // Mutated within each thread worker - // Mutated within each thread worker - let thread_ouputs_cnts: Arc>> = Arc::new(Mutex::new(Vec::new())); + let mut thread_objects: Vec = vec![]; for i in 0..n_chunks { let idx_ini = vec_idx_ini[i]; let idx_fin = vec_idx_fin[i]; let thread = std::thread::scope(|scope| { - - let mut allele_frequencies: Array2 = self_clone.intercept_and_allele_frequencies.slice(s![.., idx_ini..idx_fin]).to_owned(); - - // let mut vec_mae: Vec = vec![]; - - // for j in idx_ini..idx_fin { - // let current_chromosome = self_clone.chromosome[j].to_owned(); - // let vec_q: ArrayView1 = self_clone.intercept_and_allele_frequencies.column(j); - // let vec_idx_missing = (0..vec_q.len()).filter(|&i| !vec_q[i].is_nan()).collect::>(); - // if vec_idx_missing.len() > 0 { - // let n_non_missing = vec_idx_missing.len(); - // let n_reps = if *n_reps <= n_non_missing { - // *n_reps - // } else { - // n_non_missing - // }; - // let mut rng = rand::thread_rng(); - // let idx_random_pools: Vec = (0..vec_q.len()).filter(|&idx| !vec_q[idx].is_nan()).choose_multiple(&mut rng, n_reps); - // let mut optimum_mae = 1.0; - // let mut optimum_min_loci_corr = 0.0; - // let mut optimum_max_pool_dist = 1.0; - // for i in vec_idx_missing.into_iter() { - // // Across minimum loci correlation thresholds - // for min_loci_corr in vec_min_loci_corr.iter() { - // // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) - // let (linked_loci_idx, _correlations) = - // find_l_linked_loci(j, corr, min_loci_corr, min_l_loci, - // restrict_linked_loci_per_chromosome, - // ¤t_chromosome, - // &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Across maximum pool distance thresholds - // for max_pool_dist in vec_max_pool_dist.iter() { - // // Across reps - // let mut mae = 0.0; - // for idx_i in idx_random_pools.iter() { - // // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools - // let distances_all_loci = calculate_genetic_distances_between_pools( - // *idx_i, - // &linked_loci_idx, - // &self_clone.intercept_and_allele_frequencies) - // .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) - // let (distances, frequencies) = - // find_k_nearest_neighbours( - // &distances_all_loci, - // max_pool_dist, - // min_k_neighbours, - // j, - // &self_clone.intercept_and_allele_frequencies, - // ) - // .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Impute and find the error - // mae += (vec_q[*idx_i] - impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait.") - // ).abs(); - // } - // mae /= n_reps as f64; - // if (mae <= f64::EPSILON) | (mae > optimum_mae) { - // break; - // } - // if mae < optimum_mae { - // optimum_mae = mae; - // optimum_min_loci_corr = *min_loci_corr; - // optimum_max_pool_dist = *max_pool_dist; - // } - // } - // } - // // Take note of the minimum MAE - // vec_mae.push(optimum_mae); - // // Impute actual missing data point (ith pool and jth locus) - // // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) - // let (linked_loci_idx, _correlations) = - // find_l_linked_loci(j, corr, &optimum_min_loci_corr, min_l_loci, - // restrict_linked_loci_per_chromosome, - // ¤t_chromosome, - // &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools - // let distances_all_loci = calculate_genetic_distances_between_pools( - // i, - // &linked_loci_idx, - // &self_clone.intercept_and_allele_frequencies) - // .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) - // let (distances, frequencies) = - // find_k_nearest_neighbours( - // &distances_all_loci, - // &optimum_max_pool_dist, - // min_k_neighbours, - // j, - // &self_clone.intercept_and_allele_frequencies, - // ) - // .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // Impute missing allele frequencies at the current locus - // allele_frequencies[(i, j-idx_ini)] = impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // // println!("q={:?}; mae={:?}; corr={}; dist={}", q, optimum_mae, optimum_min_loci_corr, optimum_max_pool_dist); - // } - // } - // } - - // println!("allele_frequencies={:?}", allele_frequencies); - // println!("vec_mae[0..10].to_owned()={:?}", vec_mae[0..10].to_owned()); + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Make me the main imputation method of GenotypesAndPhenotypes struct here, and + // move the whole parallel imputation steps (..std::thread::scope.. with Zip...) into the self-standing function impute_aldknni() below + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + let mut allele_frequencies: Array2 = self_clone.intercept_and_allele_frequencies.slice(s![.., idx_ini..idx_fin]).to_owned(); // Define the 2D array used for storing the minimum mean absolute error (MAE) estimates across all missing data across pools and loci. // We encode the MAE as u8 for memory efficiency, where we set 0: u8 as missing. let mut mae_u8: Array2 = Array::from_elem(allele_frequencies.dim(), 0); - Zip::indexed(&mut allele_frequencies) .and(&mut mae_u8) .par_for_each(|(i, j), q, mu8| { @@ -667,27 +460,16 @@ impl GenotypesAndPhenotypes { let dummy_keep_p_minus_1 = false; let dummy_n_threads = 1; output.write_csv(&dummy_filter_stats, dummy_keep_p_minus_1, &fname_intermediate_file, &dummy_n_threads).expect("Error: failed to save intermediate file."); - - - + return fname_intermediate_file }); thread_objects.push(thread); } - // // Waiting for all threads to finish - // for thread in thread_objects { - // thread.join().expect("Unknown thread error occured."); - // } - // // Extract output filenames from each thread into a vector and sort them - // let mut freq: Vec = Vec::new(); - // let mut cnts: Vec = Vec::new(); - // for x in thread_ouputs_freq.lock().expect("Error unlocking the threads after multi-threaded execution to extract allele frequencies within the load() method for FileSyncPhen struct.").iter() { - // freq.push(x.clone()); - // } - // for x in thread_ouputs_cnts.lock().expect("Error unlocking the threads after multi-threaded execution to extract allele counts within the load() method for FileSyncPhen struct.").iter() { - // cnts.push(x.clone()); - // } - - + // Waiting for all threads to finish + let mut vec_fname_intermediate_files: Vec = vec![]; + for thread in thread_objects { + vec_fname_intermediate_files.push(thread); + } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// From 76897163b3191081fa9a77f37bb82ad0fe11846a Mon Sep 17 00:00:00 2001 From: jeffersonfparil Date: Fri, 9 Feb 2024 10:33:08 +1100 Subject: [PATCH 6/8] drafted multi-threaded impuitation nesting Zip multithreaded ndarray calculations --- src/rust/src/aldknni.rs | 614 ++++++++++++++++++++++------------------ src/rust/src/lib.rs | 4 +- 2 files changed, 345 insertions(+), 273 deletions(-) diff --git a/src/rust/src/aldknni.rs b/src/rust/src/aldknni.rs index 773df36..69a95f7 100644 --- a/src/rust/src/aldknni.rs +++ b/src/rust/src/aldknni.rs @@ -3,6 +3,9 @@ use rand::prelude::IteratorRandom; use std::cmp::Ordering; use std::io; use std::sync::{Arc, Mutex}; +use std::time::{SystemTime, UNIX_EPOCH}; +use std::fs::OpenOptions; +use std::io::Write; use crate::helpers::*; @@ -51,14 +54,14 @@ pub fn calculate_genomewide_ld( } fn calculate_genetic_distances_between_pools( - idx_row: usize, + idx_row: &usize, linked_loci_idx: &Vec, intercept_and_allele_frequencies: &Array2, ) -> io::Result> { // Errors and f64::NAN are all converted into the maximum possible distance of 1.00 for simplicity let (n, _p) = intercept_and_allele_frequencies.dim(); let q: Array1 = intercept_and_allele_frequencies - .row(idx_row) + .row(*idx_row) .select(Axis(0), linked_loci_idx); let mut distances: Array1 = Array1::from_elem(n, 1.0); // for i in 0..n { @@ -79,7 +82,7 @@ fn calculate_genetic_distances_between_pools( // } // } Zip::indexed(&mut distances).par_for_each(|i, d| { - if i != idx_row { + if i != *idx_row { let q1: Array1 = intercept_and_allele_frequencies .row(i) .select(Axis(0), linked_loci_idx); @@ -99,7 +102,7 @@ fn find_l_linked_loci( idx_col: usize, corr: &Vec>, min_loci_corr: &f64, - min_l_loci: usize, + min_l_loci: &usize, restrict_linked_loci_per_chromosome: bool, current_chromosome: &String, chromosomes: &Vec, @@ -108,7 +111,7 @@ fn find_l_linked_loci( // - to optimise for the thresholds, set min_l_loci to 0 // - to optimise for counts, set min_loci_corr to 0.0 assert!( - min_l_loci > 0, + *min_l_loci > 0, "Error: the minimum number of linked loci need to be greater than 0." ); let p = corr.len(); @@ -144,7 +147,7 @@ fn find_l_linked_loci( // If less than the minimum number of loci passed the threshold, // or if we are not filtering by minimum loci correlation (i.e. min_loci_corr = 0.0), // then we sort the correlations (decreasing) and pick the top-most correlated loci - let (vec_idx, vec_corr) = if (vec_idx.len() < min_l_loci) | (*min_loci_corr == 0.0) { + let (vec_idx, vec_corr) = if (vec_idx.len() < *min_l_loci) | (*min_loci_corr == 0.0) { // Extract indices to reach min_l_loci or if we do not have enough loci, then just vec_corr.len() let mut indices: Vec = (0..vec_corr.len()).collect(); indices.sort_by( @@ -155,10 +158,10 @@ fn find_l_linked_loci( (false, false) => vec_corr[b].partial_cmp(&vec_corr[a]).unwrap(), }, ); - let l_linked_loci = if vec_corr.len() < min_l_loci { + let l_linked_loci = if vec_corr.len() < *min_l_loci { vec_corr.len() } else { - min_l_loci + *min_l_loci }; let indices = indices[0..l_linked_loci].to_vec(); // Extract the correlations corresponding to the extracted indices @@ -176,15 +179,15 @@ fn find_l_linked_loci( fn find_k_nearest_neighbours( distances: &Array1, max_pool_dist: &f64, - min_k_neighbours: usize, - idx_col: usize, + min_k_neighbours: &usize, + idx_col: &usize, intercept_and_allele_frequencies: &Array2, ) -> io::Result<(Vec, Array1)> { // Find neighbours passing the max_pool_dist threshold, then sort and add more neighbours if it did not pass the min_k_neighbours threshold // - to optimise for the thresholds, set min_k_neighbours to 0 // - to optimise for counts, set max_pool_dist to 1.0 (maximum possible distance, i.e. mean absolute difference of 1.0 in allele frequency) assert!( - min_k_neighbours > 0, + *min_k_neighbours > 0, "Error: the minimum number of k-nearest neighbours need to be greater than 0." ); let (n, _p) = intercept_and_allele_frequencies.dim(); @@ -205,7 +208,7 @@ fn find_k_nearest_neighbours( for i in indices.clone().into_iter() { let bool_dist_nan = !distances[i].is_nan(); let bool_dist_thresh = distances[i] <= *max_pool_dist; - let bool_freq_nan = !intercept_and_allele_frequencies[(i, idx_col)].is_nan(); + let bool_freq_nan = !intercept_and_allele_frequencies[(i, *idx_col)].is_nan(); if bool_dist_nan & bool_dist_thresh & bool_freq_nan { idx.push(i); dist.push(distances[i]); @@ -215,13 +218,13 @@ fn find_k_nearest_neighbours( // If less than the minimum number of neighbours passed the threshold, // or if we are not filtering by maximum pool distance (i.e. max_pool_dist = 1.00), // then we sort the distances (increasing) and pick the nearest neighbours - if (idx.len() < min_k_neighbours) | (*max_pool_dist == 1.00) { + if (idx.len() < *min_k_neighbours) | (*max_pool_dist == 1.00) { idx = vec![]; dist = vec![]; for i in indices.clone().into_iter() { let bool_dist_nan = !distances[i].is_nan(); - let bool_freq_nan = !intercept_and_allele_frequencies[(i, idx_col)].is_nan(); - if idx.len() == min_k_neighbours { + let bool_freq_nan = !intercept_and_allele_frequencies[(i, *idx_col)].is_nan(); + if idx.len() == *min_k_neighbours { break; } if bool_dist_nan & bool_freq_nan { @@ -233,7 +236,7 @@ fn find_k_nearest_neighbours( // Extract non-missing allele frequencies at the locus requiring imputation of the k-nearest neighbours let mut freq: Array1 = Array1::from_elem(idx.len(), f64::NAN); for i in 0..idx.len() { - freq[i] = intercept_and_allele_frequencies[(idx[i], idx_col)]; + freq[i] = intercept_and_allele_frequencies[(idx[i], *idx_col)]; } Ok((dist, freq)) } @@ -268,31 +271,200 @@ fn impute_allele_frequencies( } impl GenotypesAndPhenotypes { + fn per_chunk_aldknni( + &self, + idx_loci_idx_ini: &usize, + idx_loci_idx_fin: &usize, + loci_idx: &Vec, + vec_min_loci_corr: &Vec, + vec_max_pool_dist: &Vec, + min_l_loci: &usize, + min_k_neighbours: &usize, + corr: &Vec>, + restrict_linked_loci_per_chromosome: bool, + n_reps: &usize, + ) -> io::Result<(Array2, f64)> { + + + let idx_ini = loci_idx[*idx_loci_idx_ini]; + let idx_fin = loci_idx[*idx_loci_idx_fin]; + let mut allele_frequencies: Array2 = self.intercept_and_allele_frequencies.slice(s![.., idx_ini..idx_fin]).to_owned(); + + + // Define the 2D array used for storing the minimum mean absolute error (MAE) estimates across all missing data across pools and loci. + // We encode the MAE as u8 for memory efficiency, where we set 0: u8 as missing. + let mut mae_u8: Array2 = Array::from_elem(allele_frequencies.dim(), 0); + Zip::indexed(&mut allele_frequencies) + .and(&mut mae_u8) + .par_for_each(|(i, j), q, mu8| { + if q.is_nan() { + let current_chromosome = self.chromosome[j].to_owned(); + let vec_q: ArrayView1 = self.intercept_and_allele_frequencies.column(j); + let n_non_missing = vec_q.fold(0, |t, &x| if !x.is_nan() {t+1}else{t}); + let n_reps = if *n_reps <= n_non_missing { + *n_reps + } else { + n_non_missing + }; + // Select randomly non-missing pools to impute and estimate imputation error from + let mut rng = rand::thread_rng(); + let idx_random_pools: Vec = (0..vec_q.len()).filter(|&idx| !vec_q[idx].is_nan()).choose_multiple(&mut rng, n_reps); + let mut optimum_mae = 1.0; + let mut optimum_min_loci_corr = 0.0; + let mut optimum_max_pool_dist = 1.0; + // Find the optimal min_loci_corr and max_pool_dist which minimise imputation error (MAE: mean absolute error) + // Across minimum loci correlation thresholds + for min_loci_corr in vec_min_loci_corr.iter() { + // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) + let (linked_loci_idx, _correlations) = + find_l_linked_loci(j, corr, min_loci_corr, min_l_loci, + restrict_linked_loci_per_chromosome, + ¤t_chromosome, + &self.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // Across maximum pool distance thresholds + for max_pool_dist in vec_max_pool_dist.iter() { + // Across reps + let mut mae = 0.0; + for idx_i in idx_random_pools.iter() { + // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools + let distances_all_loci = calculate_genetic_distances_between_pools( + idx_i, + &linked_loci_idx, + &self.intercept_and_allele_frequencies) + .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) + let (distances, frequencies) = + find_k_nearest_neighbours( + &distances_all_loci, + max_pool_dist, + min_k_neighbours, + &j, + &self.intercept_and_allele_frequencies, + ) + .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // Impute and find the error + mae += (vec_q[*idx_i] - impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait.") + ).abs(); + } + mae /= n_reps as f64; + if (mae <= f64::EPSILON) | (mae > optimum_mae) { + break; + } + if mae < optimum_mae { + optimum_mae = mae; + optimum_min_loci_corr = *min_loci_corr; + optimum_max_pool_dist = *max_pool_dist; + } + } + } + // Take note of the minimum MAE which we code are u8 for memory efficiency + // We use 254 instead of 255 and add 1 because we set 0 as missing above + *mu8 = (optimum_mae * 254.0).round() as u8 + 1; + // Impute actual missing data point (ith pool and jth locus) + // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) + let (linked_loci_idx, _correlations) = + find_l_linked_loci(j, corr, &optimum_min_loci_corr, min_l_loci, + restrict_linked_loci_per_chromosome, + ¤t_chromosome, + &self.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools + let distances_all_loci = calculate_genetic_distances_between_pools( + &i, + &linked_loci_idx, + &self.intercept_and_allele_frequencies) + .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) + let (distances, frequencies) = + find_k_nearest_neighbours( + &distances_all_loci, + &optimum_max_pool_dist, + min_k_neighbours, + &j, + &self.intercept_and_allele_frequencies, + ) + .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // Impute missing allele frequencies at the current locus + *q = impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); + // println!("q={:?}; mae={:?}; corr={}; dist={}", q, optimum_mae, optimum_min_loci_corr, optimum_max_pool_dist); + } + }); + + // Extract average minimum MAE across loci and pools (Note that we used 0: u8 as missing) + let mut predicted_mae = 0.0; + let mut n_missing = 0.0; + for mu8 in mae_u8.into_iter() { + if mu8 > 0 { + predicted_mae += (mu8 as f64 - 1.0) / 255.0; + n_missing += 1.0; + } + } + predicted_mae = predicted_mae / n_missing; + + + // Correct for allele frequency over- and under-flows, as we are assuming all loci are represented by all of its alleles (see assumptions above) + let n = allele_frequencies.nrows(); + + let mut vec_chunk_loci_idx: Vec = vec![]; // Local positions in the chunk (add loci_idx[*idx_loci_idx_ini] to get the global index) + for ix in *idx_loci_idx_ini..=*idx_loci_idx_fin { + vec_chunk_loci_idx.push(loci_idx[ix] - loci_idx[*idx_loci_idx_ini]); + } + + + for j in 0..(vec_chunk_loci_idx.len() - 1) { + let idx_locus_ini = vec_chunk_loci_idx[j]; + let idx_locus_fin = vec_chunk_loci_idx[j + 1]; + for i in 0..n { + if self.intercept_and_allele_frequencies[(i, (idx_locus_ini+loci_idx[*idx_loci_idx_ini]))].is_nan() { + let sum = allele_frequencies + .slice(s![i, idx_locus_ini..idx_locus_fin]) + .sum(); + if sum != 1.0 { + if (idx_locus_fin - idx_locus_ini) == 2 { + allele_frequencies[(i, idx_locus_ini + 1)] = + 1.00 - allele_frequencies[(i, idx_locus_ini)] + } else { + for k in idx_locus_ini..idx_locus_fin { + allele_frequencies[(i, k)] /= sum; + } + } + } + } + } + } + + println!("allele_frequencies={:?}", allele_frequencies); + Ok((allele_frequencies, predicted_mae)) + } + pub fn adaptive_ld_knn_imputation( - &mut self, - self_clone: &GenotypesAndPhenotypes, + &self, min_loci_corr: &f64, max_pool_dist: &f64, - min_l_loci: &u64, - min_k_neighbours: &u64, - loci_idx: &Vec, + min_l_loci: &usize, + min_k_neighbours: &usize, corr: &Vec>, restrict_linked_loci_per_chromosome: bool, n_reps: &usize, - ) -> io::Result<(&mut Self, f64)> { - self.check().expect("Error self.check() within adaptive_ld_knn_imputation() method of GenotypesAndPhenotypes struct."); + n_threads: &usize, + + ) -> () { // We are assuming that all non-zero alleles across pools are kept, i.e. biallelic loci have 2 columns, triallelic have 3, and so on. // - Input vcf file will have all alleles per locus extracted. // - Similarly, input sync file will have all alleles per locus extracted. // - Finally, input allele frequency table file which can be represented by all alleles or one less allele per locus will be appended with the alternative allele if the sum of allele frequencies per locus do not add up to one. + + self.check().expect("Error self.check() within adaptive_ld_knn_imputation() method of GenotypesAndPhenotypes struct."); + let (loci_idx, loci_chr, loci_pos) = self.count_loci().expect("Error counting loci of self within the adaptive_ld_knn_imputation method of GenotypesAndPhenotypes struct."); + + // Define fixed linkage and distance thresholds, i.e. the minimum number of loci to use in estimating genetic distances, and the minimum number of nearest neighbours to include in imputation let (n, p) = self.intercept_and_allele_frequencies.dim(); - let min_l_loci = if *min_l_loci >= (p as u64) { + let min_l_loci = if *min_l_loci >= p { p - 1 } else { *min_l_loci as usize }; - let min_k_neighbours = if *min_k_neighbours >= (n as u64) { + let min_k_neighbours = if *min_k_neighbours >= n { n - 1 } else { *min_k_neighbours as usize @@ -309,157 +481,104 @@ impl GenotypesAndPhenotypes { } else { vec![*max_pool_dist] }; - - let mut n_chunks = 10; // Should be equal to the number of threads because std::thread::scope will wait for other threads to finish before starting with another thread once it finishes - let chunk_size = p / n_chunks; - let vec_idx_ini: Vec = (0..(p-chunk_size)).step_by(chunk_size).collect(); - let mut vec_idx_fin: Vec = (chunk_size..p).step_by(chunk_size).collect(); - let idx_fin = vec_idx_fin[vec_idx_fin.len()-1]; - if idx_fin != p { - vec_idx_fin.pop(); - vec_idx_fin.push(p); + + // Define chunks which respect loci groupings + let mut n_chunks = *n_threads; // Should be equal to the number of threads because std::thread::scope will wait for other threads to finish before starting with another thread once it finishes + let l = loci_idx.len(); + let chunk_size = l / n_chunks; + // Define the indices of the indices of loci + let mut vec_idx_loci_idx_ini: Vec = (0..(l-chunk_size)).step_by(chunk_size).collect(); + let mut vec_idx_loci_idx_fin: Vec = (chunk_size..l).step_by(chunk_size).collect(); + let idx_fin = vec_idx_loci_idx_fin[vec_idx_loci_idx_fin.len()-1]; + if idx_fin != (l-1) { + vec_idx_loci_idx_fin.pop(); + vec_idx_loci_idx_fin.push(l-1); } - assert_eq!(vec_idx_ini.len(), vec_idx_fin.len()); - n_chunks = vec_idx_ini.len(); + assert_eq!(vec_idx_loci_idx_ini.len(), vec_idx_loci_idx_fin.len()); + n_chunks = vec_idx_loci_idx_ini.len(); + // Instantiate thread object for parallel execution let mut thread_objects: Vec = vec![]; for i in 0..n_chunks { - let idx_ini = vec_idx_ini[i]; - let idx_fin = vec_idx_fin[i]; + let idx_loci_idx_ini = vec_idx_loci_idx_ini[i]; + let idx_loci_idx_fin = vec_idx_loci_idx_fin[i]; let thread = std::thread::scope(|scope| { + + // let mut allele_frequencies: Array2 = self.intercept_and_allele_frequencies.slice(s![.., *idx_ini..*idx_fin]).to_owned(); - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Make me the main imputation method of GenotypesAndPhenotypes struct here, and - // move the whole parallel imputation steps (..std::thread::scope.. with Zip...) into the self-standing function impute_aldknni() below - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - let mut allele_frequencies: Array2 = self_clone.intercept_and_allele_frequencies.slice(s![.., idx_ini..idx_fin]).to_owned(); - // Define the 2D array used for storing the minimum mean absolute error (MAE) estimates across all missing data across pools and loci. - // We encode the MAE as u8 for memory efficiency, where we set 0: u8 as missing. - let mut mae_u8: Array2 = Array::from_elem(allele_frequencies.dim(), 0); - Zip::indexed(&mut allele_frequencies) - .and(&mut mae_u8) - .par_for_each(|(i, j), q, mu8| { - if q.is_nan() { - let current_chromosome = self_clone.chromosome[j].to_owned(); - let vec_q: ArrayView1 = self_clone.intercept_and_allele_frequencies.column(j); - let n_non_missing = vec_q.fold(0, |t, &x| if !x.is_nan() {t+1}else{t}); - let n_reps = if *n_reps <= n_non_missing { - *n_reps - } else { - n_non_missing - }; - let mut rng = rand::thread_rng(); - let idx_random_pools: Vec = (0..vec_q.len()).filter(|&idx| !vec_q[idx].is_nan()).choose_multiple(&mut rng, n_reps); - let mut optimum_mae = 1.0; - let mut optimum_min_loci_corr = 0.0; - let mut optimum_max_pool_dist = 1.0; - // Across minimum loci correlation thresholds - for min_loci_corr in vec_min_loci_corr.iter() { - // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) - let (linked_loci_idx, _correlations) = - find_l_linked_loci(j, corr, min_loci_corr, min_l_loci, - restrict_linked_loci_per_chromosome, - ¤t_chromosome, - &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // Across maximum pool distance thresholds - for max_pool_dist in vec_max_pool_dist.iter() { - // Across reps - let mut mae = 0.0; - for idx_i in idx_random_pools.iter() { - // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools - let distances_all_loci = calculate_genetic_distances_between_pools( - *idx_i, - &linked_loci_idx, - &self_clone.intercept_and_allele_frequencies) - .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) - let (distances, frequencies) = - find_k_nearest_neighbours( - &distances_all_loci, - max_pool_dist, - min_k_neighbours, - j, - &self_clone.intercept_and_allele_frequencies, - ) - .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // Impute and find the error - mae += (vec_q[*idx_i] - impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait.") - ).abs(); - } - mae /= n_reps as f64; - if (mae <= f64::EPSILON) | (mae > optimum_mae) { - break; - } - if mae < optimum_mae { - optimum_mae = mae; - optimum_min_loci_corr = *min_loci_corr; - optimum_max_pool_dist = *max_pool_dist; - } - } - } - // Take note of the minimum MAE which we code are u8 for memory efficiency - // We use 254 instead of 255 and add 1 because we set 0 as missing above - *mu8 = (optimum_mae * 254.0).round() as u8 + 1; - // Impute actual missing data point (ith pool and jth locus) - // Find loci most correlated to the major allele of the current locus, i.e. the first allele of the locus as they were sorted by decreasing allele frequency (see Sort trait) - let (linked_loci_idx, _correlations) = - find_l_linked_loci(j, corr, &optimum_min_loci_corr, min_l_loci, - restrict_linked_loci_per_chromosome, - ¤t_chromosome, - &self_clone.chromosome).expect("Error calling find_l_linked_loci() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // Using the linked loci, estimate the pairwise genetic distance between the current pool and the other pools - let distances_all_loci = calculate_genetic_distances_between_pools( - i, - &linked_loci_idx, - &self_clone.intercept_and_allele_frequencies) - .expect("Error calling calculate_genetic_distances_between_pools() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // Find the k-nearest neighbours given the maximum distance and/or minimum k-neighbours (shadowing the distances across all pools with distances across k-nearest neighbours) - let (distances, frequencies) = - find_k_nearest_neighbours( - &distances_all_loci, - &optimum_max_pool_dist, - min_k_neighbours, - j, - &self_clone.intercept_and_allele_frequencies, - ) - .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // Impute missing allele frequencies at the current locus - *q = impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // println!("q={:?}; mae={:?}; corr={}; dist={}", q, optimum_mae, optimum_min_loci_corr, optimum_max_pool_dist); - } - }); - - println!("allele_frequencies={:?}", allele_frequencies); + let (allele_frequencies, mae) = self.per_chunk_aldknni( + &idx_loci_idx_ini, + &idx_loci_idx_fin, + &loci_idx, + &vec_min_loci_corr, + &vec_max_pool_dist, + &min_l_loci, + &min_k_neighbours, + &corr, + restrict_linked_loci_per_chromosome, + &n_reps, + ).expect("Error executing per_chunk_aldknni() method on self_clone."); // Write-out intermediate file - // let vec_chr: Vec = self_clone.chromosome[idx_ini..idx_fin].to_owned(); - // let vec_pos: Vec = self_clone.position[idx_ini..idx_fin].to_owned(); - // let vec_all: Vec = self_clone.allele[idx_ini..idx_fin].to_owned(); - let fname_intermediate_file: String = "intermediate_output-".to_owned() + &idx_ini.to_string() + &idx_fin.to_string() + ".csv"; + let idx_ini = loci_idx[idx_loci_idx_ini]; + let idx_fin = loci_idx[idx_loci_idx_fin]; + + let time = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("Error extracting time in UNIX_EPOCH within write_csv() method for GenotypesAndPhenotypes struct.") + .as_secs_f64(); + let n_digits: usize = loci_idx.last().expect("Error extracting the last element of loci_idx. Probably empty.").to_owned().to_string().len(); + let mut start_index = idx_ini.to_string(); + let mut end_index = idx_fin.to_string(); + for _ in 0..(n_digits - start_index.len()) { + start_index = "0".to_owned() + &start_index; + } + for _ in 0..(n_digits - end_index.len()) { + end_index = "0".to_owned() + &end_index; + } + let fname_intermediate_file: String = "intermediate_output-".to_owned() + &start_index + "-" + &end_index + "-" + &time.to_string() + ".csv"; + let chromosome = self.chromosome[idx_ini..idx_fin].to_owned(); + let position = self.position[idx_ini..idx_fin].to_owned(); + let allele = self.allele[idx_ini..idx_fin].to_owned(); + let p = allele_frequencies.ncols(); + assert_eq!(chromosome.len(), p, "Error, the number of chromosome names and the total number of loci are not equal."); - let output = GenotypesAndPhenotypes{ - chromosome: self_clone.chromosome[idx_ini..idx_fin].to_owned(), - position: self_clone.position[idx_ini..idx_fin].to_owned(), - allele: self_clone.allele[idx_ini..idx_fin].to_owned(), - intercept_and_allele_frequencies: allele_frequencies, - phenotypes: self_clone.phenotypes.clone(), - pool_names: self_clone.pool_names.clone(), - coverages: self_clone.coverages.slice(s![.., 0..10]).to_owned(), - }; - let dummy_filter_stats = FilterStats { - remove_ns: false, - min_quality: 0.0, - min_coverage: 0, - min_allele_frequency: 0.0, - max_missingness_rate: 0.0, - pool_sizes: vec![0.0], - }; - let dummy_keep_p_minus_1 = false; - let dummy_n_threads = 1; - output.write_csv(&dummy_filter_stats, dummy_keep_p_minus_1, &fname_intermediate_file, &dummy_n_threads).expect("Error: failed to save intermediate file."); + + // Instantiate output file + let error_writing_file = "Unable to create file: ".to_owned() + &fname_intermediate_file; + let mut file_out = OpenOptions::new() + .create_new(true) + .write(true) + .append(false) + .open(&fname_intermediate_file) + .expect(&error_writing_file); + // Write the header + file_out + .write_all( + ("#chr,pos,allele,".to_owned() + &self.pool_names.join(",") + "\n").as_bytes(), + ) + .expect("Error calling write_all() within the write_csv() method for GenotypesAndPhenotypes struct."); + // Write allele frequencies line by line + for j in 0..p { + let line = [ + chromosome[j].to_owned(), + position[j].to_string(), + allele[j].to_owned(), + allele_frequencies + .column(j) + .iter() + .map(|&x| parse_f64_roundup_and_own(x, 6)) + .collect::>() + .join(","), + ] + .join(",") + + "\n"; + file_out.write_all(line.as_bytes()).expect("Error calling write_all() per line of the output file within the write_csv() method for GenotypesAndPhenotypes struct."); + } + + return fname_intermediate_file }); thread_objects.push(thread); @@ -469,55 +588,9 @@ impl GenotypesAndPhenotypes { for thread in thread_objects { vec_fname_intermediate_files.push(thread); } - - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - // Extract average minimum MAE across loci and pools (Note that we used 0: u8 as missing) - // let mut predicted_mae = 0.0; - // let mut n_missing = 0.0; - // for mu8 in mae_u8.into_iter() { - // if mu8 > 0 { - // predicted_mae += (mu8 as f64 - 1.0) / 255.0; - // n_missing += 1.0; - // } - // } - // predicted_mae = predicted_mae / n_missing; - // // println!( - // // "An over-estimated prediction of mean absolute error = {}", - // // predicted_mae - // // ); - // // Correct for allele frequency over- and under-flows, as we are assuming all loci are represented by all of its alleles (see assumptions above) - // for j in 0..(loci_idx.len() - 1) { - // let idx_locus_ini = loci_idx[j]; - // let idx_locus_fin = loci_idx[j + 1]; - // for i in 0..n { - // if self_clone.intercept_and_allele_frequencies[(i, idx_locus_ini)].is_nan() { - // let sum = self - // .intercept_and_allele_frequencies - // .slice(s![i, idx_locus_ini..idx_locus_fin]) - // .sum(); - // if sum != 1.0 { - // if (idx_locus_fin - idx_locus_ini) == 2 { - // self.intercept_and_allele_frequencies[(i, idx_locus_ini + 1)] = - // 1.00 - self.intercept_and_allele_frequencies[(i, idx_locus_ini)] - // } else { - // for k in idx_locus_ini..idx_locus_fin { - // self.intercept_and_allele_frequencies[(i, k)] /= sum; - // } - // } - // } - // self.coverages[(i, j)] = f64::INFINITY; - // } - // } - // } - // Ok((self, predicted_mae)) - Ok((self, 0.0)) } } @@ -526,8 +599,8 @@ pub fn impute_aldknni( filter_stats: &FilterStats, min_loci_corr: &f64, max_pool_dist: &f64, - min_l_loci: &u64, - min_k_neighbours: &u64, + min_l_loci: &usize, + min_k_neighbours: &usize, restrict_linked_loci_per_chromosome: bool, n_reps: &usize, n_threads: &usize, @@ -539,8 +612,8 @@ pub fn impute_aldknni( // Extract loci indices let (loci_idx, _loci_chr, _loci_pos) = genotypes_and_phenotypes.count_loci().expect("Error calling count_loci() method within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes struct."); - // Clone - let mut self_clone = genotypes_and_phenotypes.clone(); + // // Clone + // let mut self_clone = genotypes_and_phenotypes.clone(); // Calculate LD across the entire genome println!("Estimating linkage between loci across the entire genome."); @@ -555,55 +628,55 @@ pub fn impute_aldknni( println!("Estimating imputation accuracy."); } let start = std::time::SystemTime::now(); - let (_self, mae) = self_clone + let _ = genotypes_and_phenotypes .adaptive_ld_knn_imputation( - &genotypes_and_phenotypes, min_loci_corr, max_pool_dist, min_l_loci, min_k_neighbours, - &loci_idx, &corr, restrict_linked_loci_per_chromosome, n_reps, - ) - .expect("Error calling adaptive_ld_knn_imputation() within impute_aldknni()."); - let end = std::time::SystemTime::now(); - let duration = end.duration_since(start).expect("Error measuring the duration of running adaptive_ld_knn_imputation() within impute_aldknni()."); - println!("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); - println!( - "Expected imputation accuracy in terms of mean absolute error: {}", - mae - ); - println!("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); - println!( - "Adaptive LD-kNN imputation: {} pools x {} loci | Missingness: {}% | Duration: {} seconds", - self_clone.coverages.nrows(), - self_clone.coverages.ncols(), - self_clone.missing_rate().expect("Error measuring sparsity of the data using missing_rate() method within impute_aldknni()."), - duration.as_secs() - ); - // Remove 100% of the loci with missing data - let start = std::time::SystemTime::now(); - self_clone - .filter_out_top_missing_loci(&1.00) - .expect("Error calling filter_out_top_missing_loci() method within impute_aldknni()."); - let end = std::time::SystemTime::now(); - let duration = end.duration_since(start).expect("Error measuring the duration of running filter_out_top_missing_loci() within impute_aldknni()."); - println!( - "Missing data removed, i.e. loci which cannot be imputed because of extreme sparsity: {} pools x {} loci | Missingness: {}% | Duration: {} seconds", - self_clone.coverages.nrows(), - self_clone.coverages.ncols(), - self_clone.missing_rate().expect("Error measuring sparsity of the data using missing_rate() method after filtering for missing top loci within impute_aldknni()."), - duration.as_secs() - ); - // Output - let out = self_clone - .write_csv(filter_stats, false, out, n_threads) - .expect( - "Error writing the output file using the write_csv() method within impute_aldknni().", + n_threads ); - Ok(out) + // .expect("Error calling adaptive_ld_knn_imputation() within impute_aldknni()."); + let end = std::time::SystemTime::now(); + // let duration = end.duration_since(start).expect("Error measuring the duration of running adaptive_ld_knn_imputation() within impute_aldknni()."); + // println!("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); + // println!( + // "Expected imputation accuracy in terms of mean absolute error: {}", + // mae + // ); + // println!("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); + // println!( + // "Adaptive LD-kNN imputation: {} pools x {} loci | Missingness: {}% | Duration: {} seconds", + // self_clone.coverages.nrows(), + // self_clone.coverages.ncols(), + // self_clone.missing_rate().expect("Error measuring sparsity of the data using missing_rate() method within impute_aldknni()."), + // duration.as_secs() + // ); + // // Remove 100% of the loci with missing data + // let start = std::time::SystemTime::now(); + // self_clone + // .filter_out_top_missing_loci(&1.00) + // .expect("Error calling filter_out_top_missing_loci() method within impute_aldknni()."); + // let end = std::time::SystemTime::now(); + // let duration = end.duration_since(start).expect("Error measuring the duration of running filter_out_top_missing_loci() within impute_aldknni()."); + // println!( + // "Missing data removed, i.e. loci which cannot be imputed because of extreme sparsity: {} pools x {} loci | Missingness: {}% | Duration: {} seconds", + // self_clone.coverages.nrows(), + // self_clone.coverages.ncols(), + // self_clone.missing_rate().expect("Error measuring sparsity of the data using missing_rate() method after filtering for missing top loci within impute_aldknni()."), + // duration.as_secs() + // ); + // // Output + // let out = self_clone + // .write_csv(filter_stats, false, out, n_threads) + // .expect( + // "Error writing the output file using the write_csv() method within impute_aldknni().", + // ); + // Ok(out) + Ok("".to_owned()) } //////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -699,7 +772,7 @@ mod tests { 7020, &corr, &0.99, - 500, + &500, false, &"chr1".to_owned(), &frequencies_and_phenotypes.chromosome, @@ -714,7 +787,7 @@ mod tests { // GENETIC DISTANCES BETWEEN POOLS USING LINKED LOCI let idx_pool = 0; let distances = calculate_genetic_distances_between_pools( - idx_pool, + &idx_pool, &linked_loci_idx, &frequencies_and_phenotypes.intercept_and_allele_frequencies, ) @@ -727,8 +800,8 @@ mod tests { let (distances, frequencies) = find_k_nearest_neighbours( &distances, &0.5, - 3, - idx_locus_ini, + &3, + &idx_locus_ini, &frequencies_and_phenotypes.intercept_and_allele_frequencies, ) .unwrap(); @@ -742,33 +815,32 @@ mod tests { assert_eq!(imputed_freq, 0.2249532554929301); let (loci_idx, _loci_chr, _loci_pos) = frequencies_and_phenotypes.count_loci().unwrap(); - let frequencies_and_phenotypes_clone = frequencies_and_phenotypes.clone(); + // let frequencies_and_phenotypes_clone = frequencies_and_phenotypes.clone(); let n_reps = 5; let min_loci_corr = f64::NAN; let max_pool_dist = f64::NAN; let min_l_loci = 10; let min_k_neighbours = 10; let restrict_linked_loci_per_chromosome = false; + let n_threads = 8; - let (_self, mae) = frequencies_and_phenotypes - .adaptive_ld_knn_imputation( - &frequencies_and_phenotypes_clone, + let _ = frequencies_and_phenotypes + .adaptive_ld_knn_imputation( &min_loci_corr, &max_pool_dist, &min_l_loci, &min_k_neighbours, - &loci_idx, &corr, restrict_linked_loci_per_chromosome, &n_reps, - ) - .unwrap(); - println!( - "After imputation:\n{:?}", - frequencies_and_phenotypes.intercept_and_allele_frequencies - ); - println!("Estimated MAE={}", mae); + &n_threads + ); + // println!( + // "After imputation:\n{:?}", + // frequencies_and_phenotypes.intercept_and_allele_frequencies + // ); + // println!("Estimated MAE={}", mae); assert_eq!(0, 1) // let n_nan = frequencies_and_phenotypes // .intercept_and_allele_frequencies diff --git a/src/rust/src/lib.rs b/src/rust/src/lib.rs index f2dacb6..90744dd 100644 --- a/src/rust/src/lib.rs +++ b/src/rust/src/lib.rs @@ -332,8 +332,8 @@ fn impute( &filter_stats, &min_loci_corr, &max_pool_dist, - &min_l_loci, - &min_k_neighbours, + &(min_l_loci as usize), + &(min_k_neighbours as usize), restrict_linked_loci_per_chromosome, &(n_reps as usize), &(n_threads as usize), From 3224de0f590db70284b23c5d02fc4189e1d9de7a Mon Sep 17 00:00:00 2001 From: jeffersonfparil Date: Fri, 9 Feb 2024 11:31:19 +1100 Subject: [PATCH 7/8] drafted new parallelisation and new optimisation algorithms + will test next --- src/rust/src/aldknni.rs | 226 ++++++++++++++++++++++------------------ 1 file changed, 123 insertions(+), 103 deletions(-) diff --git a/src/rust/src/aldknni.rs b/src/rust/src/aldknni.rs index 69a95f7..32add61 100644 --- a/src/rust/src/aldknni.rs +++ b/src/rust/src/aldknni.rs @@ -4,7 +4,7 @@ use std::cmp::Ordering; use std::io; use std::sync::{Arc, Mutex}; use std::time::{SystemTime, UNIX_EPOCH}; -use std::fs::OpenOptions; +use std::fs::{OpenOptions, remove_file}; use std::io::Write; use crate::helpers::*; @@ -283,7 +283,7 @@ impl GenotypesAndPhenotypes { corr: &Vec>, restrict_linked_loci_per_chromosome: bool, n_reps: &usize, - ) -> io::Result<(Array2, f64)> { + ) -> io::Result<(Array2, f64, f64)> { let idx_ini = loci_idx[*idx_loci_idx_ini]; @@ -390,16 +390,14 @@ impl GenotypesAndPhenotypes { }); // Extract average minimum MAE across loci and pools (Note that we used 0: u8 as missing) - let mut predicted_mae = 0.0; + let mut sum_mae = 0.0; let mut n_missing = 0.0; for mu8 in mae_u8.into_iter() { if mu8 > 0 { - predicted_mae += (mu8 as f64 - 1.0) / 255.0; + sum_mae += (mu8 as f64 - 1.0) / 255.0; n_missing += 1.0; } } - predicted_mae = predicted_mae / n_missing; - // Correct for allele frequency over- and under-flows, as we are assuming all loci are represented by all of its alleles (see assumptions above) let n = allele_frequencies.nrows(); @@ -432,12 +430,13 @@ impl GenotypesAndPhenotypes { } } - println!("allele_frequencies={:?}", allele_frequencies); - Ok((allele_frequencies, predicted_mae)) + // println!("allele_frequencies={:?}", allele_frequencies); + Ok((allele_frequencies, sum_mae, n_missing)) } pub fn adaptive_ld_knn_imputation( &self, + loci_idx: &Vec, min_loci_corr: &f64, max_pool_dist: &f64, min_l_loci: &usize, @@ -447,14 +446,14 @@ impl GenotypesAndPhenotypes { n_reps: &usize, n_threads: &usize, - ) -> () { + ) -> io::Result<(String, f64)> { // We are assuming that all non-zero alleles across pools are kept, i.e. biallelic loci have 2 columns, triallelic have 3, and so on. // - Input vcf file will have all alleles per locus extracted. // - Similarly, input sync file will have all alleles per locus extracted. // - Finally, input allele frequency table file which can be represented by all alleles or one less allele per locus will be appended with the alternative allele if the sum of allele frequencies per locus do not add up to one. self.check().expect("Error self.check() within adaptive_ld_knn_imputation() method of GenotypesAndPhenotypes struct."); - let (loci_idx, loci_chr, loci_pos) = self.count_loci().expect("Error counting loci of self within the adaptive_ld_knn_imputation method of GenotypesAndPhenotypes struct."); + // let (loci_idx, loci_chr, loci_pos) = self.count_loci().expect("Error counting loci of self within the adaptive_ld_knn_imputation method of GenotypesAndPhenotypes struct."); // Define fixed linkage and distance thresholds, i.e. the minimum number of loci to use in estimating genetic distances, and the minimum number of nearest neighbours to include in imputation @@ -487,7 +486,7 @@ impl GenotypesAndPhenotypes { let l = loci_idx.len(); let chunk_size = l / n_chunks; // Define the indices of the indices of loci - let mut vec_idx_loci_idx_ini: Vec = (0..(l-chunk_size)).step_by(chunk_size).collect(); + let vec_idx_loci_idx_ini: Vec = (0..(l-chunk_size)).step_by(chunk_size).collect(); let mut vec_idx_loci_idx_fin: Vec = (chunk_size..l).step_by(chunk_size).collect(); let idx_fin = vec_idx_loci_idx_fin[vec_idx_loci_idx_fin.len()-1]; if idx_fin != (l-1) { @@ -498,7 +497,7 @@ impl GenotypesAndPhenotypes { n_chunks = vec_idx_loci_idx_ini.len(); // Instantiate thread object for parallel execution - let mut thread_objects: Vec = vec![]; + let mut thread_objects: Vec<(String, f64, f64)> = vec![]; for i in 0..n_chunks { let idx_loci_idx_ini = vec_idx_loci_idx_ini[i]; let idx_loci_idx_fin = vec_idx_loci_idx_fin[i]; @@ -507,7 +506,7 @@ impl GenotypesAndPhenotypes { // let mut allele_frequencies: Array2 = self.intercept_and_allele_frequencies.slice(s![.., *idx_ini..*idx_fin]).to_owned(); - let (allele_frequencies, mae) = self.per_chunk_aldknni( + let (allele_frequencies, sum_mae, n_missing) = self.per_chunk_aldknni( &idx_loci_idx_ini, &idx_loci_idx_fin, &loci_idx, @@ -554,12 +553,14 @@ impl GenotypesAndPhenotypes { .append(false) .open(&fname_intermediate_file) .expect(&error_writing_file); - // Write the header - file_out - .write_all( - ("#chr,pos,allele,".to_owned() + &self.pool_names.join(",") + "\n").as_bytes(), - ) - .expect("Error calling write_all() within the write_csv() method for GenotypesAndPhenotypes struct."); + // Write the header only for the first chunk + if i == 0 { + file_out + .write_all( + ("#chr,pos,allele,".to_owned() + &self.pool_names.join(",") + "\n").as_bytes(), + ) + .expect("Error calling write_all() within the write_csv() method for GenotypesAndPhenotypes struct."); + } // Write allele frequencies line by line for j in 0..p { let line = [ @@ -579,18 +580,38 @@ impl GenotypesAndPhenotypes { } - return fname_intermediate_file + return (fname_intermediate_file, sum_mae, n_missing) }); thread_objects.push(thread); } - // Waiting for all threads to finish - let mut vec_fname_intermediate_files: Vec = vec![]; + // Concatenate output per chunk + let mut vec_fname_intermediate_files_and_mae: Vec<(String, f64, f64)> = vec![]; for thread in thread_objects { - vec_fname_intermediate_files.push(thread); + vec_fname_intermediate_files_and_mae.push(thread); } - - - + // Sort by intermediate output filenames (named according to indices) + vec_fname_intermediate_files_and_mae.sort_by(|a, b| a.0.partial_cmp(&b.0).expect("Error sorting intermediate output filenames")); + + // Concatenate intermediate output filenames together and calculate mae + let mut file_0 = OpenOptions::new() + .append(true) + .open(vec_fname_intermediate_files_and_mae[0].0.to_owned()) + .expect("Error opening the intermediate file of the first chunk."); + let mut sum_mae = vec_fname_intermediate_files_and_mae[0].1; + let mut n_missing = vec_fname_intermediate_files_and_mae[0].2; + for i in 1..vec_fname_intermediate_files_and_mae.len() { + let mut file_1 = OpenOptions::new() + .read(true) + .open(vec_fname_intermediate_files_and_mae[i].0.to_owned()) + .expect("Error opening the intermediate file of a chunk.");; + io::copy(&mut file_1, &mut file_0).expect("Error concatenating intermediate output files."); + remove_file(vec_fname_intermediate_files_and_mae[i].0.to_owned()) + .expect("Error removing the intermediate file of a chunk."); + sum_mae += vec_fname_intermediate_files_and_mae[i].1; + n_missing += vec_fname_intermediate_files_and_mae[i].2; + } + let mae = sum_mae / n_missing; + Ok((vec_fname_intermediate_files_and_mae[0].0.to_owned(), mae)) } } @@ -628,8 +649,9 @@ pub fn impute_aldknni( println!("Estimating imputation accuracy."); } let start = std::time::SystemTime::now(); - let _ = genotypes_and_phenotypes + let (fname_imputed, mae) = genotypes_and_phenotypes .adaptive_ld_knn_imputation( + &loci_idx, min_loci_corr, max_pool_dist, min_l_loci, @@ -638,45 +660,47 @@ pub fn impute_aldknni( restrict_linked_loci_per_chromosome, n_reps, n_threads - ); - // .expect("Error calling adaptive_ld_knn_imputation() within impute_aldknni()."); + ).expect("Error calling adaptive_ld_knn_imputation() within impute_aldknni()."); let end = std::time::SystemTime::now(); - // let duration = end.duration_since(start).expect("Error measuring the duration of running adaptive_ld_knn_imputation() within impute_aldknni()."); - // println!("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); - // println!( - // "Expected imputation accuracy in terms of mean absolute error: {}", - // mae - // ); - // println!("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); - // println!( - // "Adaptive LD-kNN imputation: {} pools x {} loci | Missingness: {}% | Duration: {} seconds", - // self_clone.coverages.nrows(), - // self_clone.coverages.ncols(), - // self_clone.missing_rate().expect("Error measuring sparsity of the data using missing_rate() method within impute_aldknni()."), - // duration.as_secs() - // ); - // // Remove 100% of the loci with missing data - // let start = std::time::SystemTime::now(); - // self_clone - // .filter_out_top_missing_loci(&1.00) - // .expect("Error calling filter_out_top_missing_loci() method within impute_aldknni()."); - // let end = std::time::SystemTime::now(); - // let duration = end.duration_since(start).expect("Error measuring the duration of running filter_out_top_missing_loci() within impute_aldknni()."); - // println!( - // "Missing data removed, i.e. loci which cannot be imputed because of extreme sparsity: {} pools x {} loci | Missingness: {}% | Duration: {} seconds", - // self_clone.coverages.nrows(), - // self_clone.coverages.ncols(), - // self_clone.missing_rate().expect("Error measuring sparsity of the data using missing_rate() method after filtering for missing top loci within impute_aldknni()."), - // duration.as_secs() - // ); - // // Output - // let out = self_clone - // .write_csv(filter_stats, false, out, n_threads) - // .expect( - // "Error writing the output file using the write_csv() method within impute_aldknni().", - // ); - // Ok(out) - Ok("".to_owned()) + let duration = end.duration_since(start).expect("Error measuring the duration of running adaptive_ld_knn_imputation() within impute_aldknni()."); + println!("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); + println!( + "Expected imputation accuracy in terms of mean absolute error: {}", + mae + ); + println!("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); + + let mut genotypes_and_phenotypes = FileGeno{filename: fname_imputed.to_owned()}.into_genotypes_and_phenotypes(filter_stats, false, n_threads).expect("Error loading the imputed genotype file."); + remove_file(fname_imputed).expect("Error removing concatenated intermediate file."); + + println!( + "Adaptive LD-kNN imputation: {} pools x {} loci | Missingness: {}% | Duration: {} seconds", + genotypes_and_phenotypes.coverages.nrows(), + genotypes_and_phenotypes.coverages.ncols(), + genotypes_and_phenotypes.missing_rate().expect("Error measuring sparsity of the data using missing_rate() method within impute_aldknni()."), + duration.as_secs() + ); + // Remove 100% of the loci with missing data + let start = std::time::SystemTime::now(); + genotypes_and_phenotypes + .filter_out_top_missing_loci(&1.00) + .expect("Error calling filter_out_top_missing_loci() method within impute_aldknni()."); + let end = std::time::SystemTime::now(); + let duration = end.duration_since(start).expect("Error measuring the duration of running filter_out_top_missing_loci() within impute_aldknni()."); + println!( + "Missing data removed, i.e. loci which cannot be imputed because of extreme sparsity: {} pools x {} loci | Missingness: {}% | Duration: {} seconds", + genotypes_and_phenotypes.coverages.nrows(), + genotypes_and_phenotypes.coverages.ncols(), + genotypes_and_phenotypes.missing_rate().expect("Error measuring sparsity of the data using missing_rate() method after filtering for missing top loci within impute_aldknni()."), + duration.as_secs() + ); + // Output + let out = genotypes_and_phenotypes + .write_csv(filter_stats, false, out, n_threads) + .expect( + "Error writing the output file using the write_csv() method within impute_aldknni().", + ); + Ok(out) } //////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -825,8 +849,9 @@ mod tests { let n_threads = 8; - let _ = frequencies_and_phenotypes + let (fname_imputed, mae) = frequencies_and_phenotypes .adaptive_ld_knn_imputation( + &loci_idx, &min_loci_corr, &max_pool_dist, &min_l_loci, @@ -835,43 +860,38 @@ mod tests { restrict_linked_loci_per_chromosome, &n_reps, &n_threads - ); - // println!( - // "After imputation:\n{:?}", - // frequencies_and_phenotypes.intercept_and_allele_frequencies - // ); - // println!("Estimated MAE={}", mae); - assert_eq!(0, 1) - // let n_nan = frequencies_and_phenotypes - // .intercept_and_allele_frequencies - // .iter() - // .fold(0, |n_nan, &x| if x.is_nan() { n_nan + 1 } else { n_nan }); - // println!("n_nan={}", n_nan); - // assert_eq!(n_nan, 1_915); // corresponds to the 1_915 alleles completely missing across all pools - - // let keep_p_minus_1 = false; - // let genotypes_and_phenotypes = file_sync_phen - // .into_genotypes_and_phenotypes(&filter_stats, keep_p_minus_1, &n_threads) - // .unwrap(); - - // let outname = impute_aldknni( - // genotypes_and_phenotypes, - // &filter_stats, - // &min_loci_corr, - // &max_pool_dist, - // &min_l_loci, - // &min_k_neighbours, - // restrict_linked_loci_per_chromosome, - // &optimise_n_steps_min_loci_corr, - // &optimise_n_steps_max_pool_dist, - // &optimise_max_l_loci, - // &optimise_max_k_neighbours, - // &optimise_n_reps, - // &n_threads, - // &"test-impute_aldknni.csv".to_owned(), - // ) - // .unwrap(); - // assert_eq!(outname, "test-impute_aldknni.csv".to_owned()); // Do better!!! Load data - thus working on improving load_table() + ).unwrap(); + println!( + "After imputation:\n{:?}", + frequencies_and_phenotypes.intercept_and_allele_frequencies + ); + println!("Estimated MAE={}", mae); + let n_nan = frequencies_and_phenotypes + .intercept_and_allele_frequencies + .iter() + .fold(0, |n_nan, &x| if x.is_nan() { n_nan + 1 } else { n_nan }); + println!("n_nan={}", n_nan); + assert_eq!(n_nan, 1_915); // corresponds to the 1_915 alleles completely missing across all pools + + let keep_p_minus_1 = false; + let genotypes_and_phenotypes = file_sync_phen + .into_genotypes_and_phenotypes(&filter_stats, keep_p_minus_1, &n_threads) + .unwrap(); + + let outname = impute_aldknni( + genotypes_and_phenotypes, + &filter_stats, + &min_loci_corr, + &max_pool_dist, + &min_l_loci, + &min_k_neighbours, + restrict_linked_loci_per_chromosome, + &n_reps, + &n_threads, + &"test-impute_aldknni.csv".to_owned(), + ) + .unwrap(); + assert_eq!(outname, "test-impute_aldknni.csv".to_owned()); // Do better!!! Load data - thus working on improving load_table() // println!("frequencies_and_phenotypes.intercept_and_allele_frequencies.slice(s![0..5, 39..42])={:?}", frequencies_and_phenotypes.intercept_and_allele_frequencies.slice(s![0..5, 39..42])); // assert_eq!( From c59364b8665e509a0bdf36a27bdf4f96cecb767e Mon Sep 17 00:00:00 2001 From: jeffersonfparil Date: Fri, 9 Feb 2024 16:14:17 +1100 Subject: [PATCH 8/8] rectified allele/locus index during double parallel imputation (std::thread::scope and Zip::indexed) + will test --- R/imputef.R | 10 +-- man/aldknni.Rd | 6 +- res/perf_functions.R | 8 +- src/rust/src/aldknni.rs | 186 +++++++++++++++++++++------------------- 4 files changed, 108 insertions(+), 102 deletions(-) diff --git a/R/imputef.R b/R/imputef.R index fa67b47..2bc495b 100644 --- a/R/imputef.R +++ b/R/imputef.R @@ -108,10 +108,10 @@ mvi = function(fname, #' frac_top_missing_loci=0.0, #' min_loci_corr=0.9, #' max_pool_dist=0.1, -#' min_l_loci=10, -#' min_k_neighbours=5, +#' min_l_loci=1, +#' min_k_neighbours=1, #' restrict_linked_loci_per_chromosome=TRUE, -#' n_reps=1, +#' n_reps=5, #' n_threads=2, #' fname_out_prefix="") #' @param fname @@ -189,8 +189,8 @@ aldknni = function(fname, frac_top_missing_loci=0.0, min_loci_corr=NA, max_pool_dist=NA, - min_l_loci=10, - min_k_neighbours=5, + min_l_loci=1, + min_k_neighbours=1, restrict_linked_loci_per_chromosome=TRUE, n_reps=5, n_threads=2, diff --git a/man/aldknni.Rd b/man/aldknni.Rd index 110ce46..bd8b88e 100644 --- a/man/aldknni.Rd +++ b/man/aldknni.Rd @@ -15,10 +15,10 @@ aldknni(fname, frac_top_missing_loci=0.0, min_loci_corr=0.9, max_pool_dist=0.1, - min_l_loci=10, - min_k_neighbours=5, + min_l_loci=1, + min_k_neighbours=1, restrict_linked_loci_per_chromosome=TRUE, - n_reps=1, + n_reps=5, n_threads=2, fname_out_prefix="") } diff --git a/res/perf_functions.R b/res/perf_functions.R index 8a59492..c9c2eeb 100644 --- a/res/perf_functions.R +++ b/res/perf_functions.R @@ -1,9 +1,9 @@ ### Load imputef library # dir_src = "/group/pasture/Jeff/imputef/res" -library(imputef) +# library(imputef) # system("conda activate rustenv") -# devtools::load_all() -# rextendr::document() +devtools::load_all() +rextendr::document() ### Extract allele frequencies into a pxn matrix where we have p loci and n entries @@ -357,7 +357,7 @@ fn_test_imputation = function(vcf, mat_genotypes, mat_idx_high_conf_data, ploidy mat_genotypes_for_linkimpute[is.na(mat_genotypes_for_linkimpute)] = -1 write.table(mat_genotypes_for_linkimpute, file=fname_for_linkimpute, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE) time_ini = Sys.time() - system(paste0("java -jar ", dir_src, "/linkimpute/LinkImpute.jar --verbose -a ", fname_for_linkimpute, " ", output_for_linkimpute)) + system(paste0("java -jar ", dir_src, "/linkimpute/LinkImpute.jar --verbose -a ", fname_for_linkimpute, " ", output_for_linkimpute)) ### dir_src is defined in perf.R which sources this Rsccipt duration_linkimpute = difftime(Sys.time(), time_ini, units="mins") mat_linkimputed = read.delim(output_for_linkimpute, header=FALSE) rownames(mat_linkimputed) = rownames(mat_genotypes_for_linkimpute) diff --git a/src/rust/src/aldknni.rs b/src/rust/src/aldknni.rs index 32add61..e987fcd 100644 --- a/src/rust/src/aldknni.rs +++ b/src/rust/src/aldknni.rs @@ -1,11 +1,10 @@ use ndarray::{prelude::*, Zip}; -use rand::prelude::IteratorRandom; +use rand::{prelude::IteratorRandom, Rng}; use std::cmp::Ordering; +use std::fs::{remove_file, OpenOptions}; use std::io; -use std::sync::{Arc, Mutex}; -use std::time::{SystemTime, UNIX_EPOCH}; -use std::fs::{OpenOptions, remove_file}; use std::io::Write; +use std::time::{SystemTime, UNIX_EPOCH}; use crate::helpers::*; @@ -241,10 +240,7 @@ fn find_k_nearest_neighbours( Ok((dist, freq)) } -fn impute_allele_frequencies( - frequencies: &Array1, - distances: &Vec, -) -> io::Result { +fn impute_allele_frequencies(frequencies: &Array1, distances: &Vec) -> io::Result { let n = frequencies.len(); for i in 0..n { assert!( @@ -284,21 +280,25 @@ impl GenotypesAndPhenotypes { restrict_linked_loci_per_chromosome: bool, n_reps: &usize, ) -> io::Result<(Array2, f64, f64)> { - - + // Define loci indices let idx_ini = loci_idx[*idx_loci_idx_ini]; let idx_fin = loci_idx[*idx_loci_idx_fin]; - let mut allele_frequencies: Array2 = self.intercept_and_allele_frequencies.slice(s![.., idx_ini..idx_fin]).to_owned(); - - + let mut allele_frequencies: Array2 = self + .intercept_and_allele_frequencies + .slice(s![.., idx_ini..idx_fin]) + .to_owned(); // Define the 2D array used for storing the minimum mean absolute error (MAE) estimates across all missing data across pools and loci. // We encode the MAE as u8 for memory efficiency, where we set 0: u8 as missing. let mut mae_u8: Array2 = Array::from_elem(allele_frequencies.dim(), 0); Zip::indexed(&mut allele_frequencies) .and(&mut mae_u8) - .par_for_each(|(i, j), q, mu8| { + .par_for_each(|(i, j_local), q, mu8| { if q.is_nan() { + // Define global locus index + let j = j_local + idx_ini; + // Define the current chromosome let current_chromosome = self.chromosome[j].to_owned(); + // Define the allele frequencies at the current allele/locus let vec_q: ArrayView1 = self.intercept_and_allele_frequencies.column(j); let n_non_missing = vec_q.fold(0, |t, &x| if !x.is_nan() {t+1}else{t}); let n_reps = if *n_reps <= n_non_missing { @@ -310,8 +310,8 @@ impl GenotypesAndPhenotypes { let mut rng = rand::thread_rng(); let idx_random_pools: Vec = (0..vec_q.len()).filter(|&idx| !vec_q[idx].is_nan()).choose_multiple(&mut rng, n_reps); let mut optimum_mae = 1.0; - let mut optimum_min_loci_corr = 0.0; - let mut optimum_max_pool_dist = 1.0; + let mut optimum_min_loci_corr = vec_min_loci_corr[0]; + let mut optimum_max_pool_dist = vec_max_pool_dist[0]; // Find the optimal min_loci_corr and max_pool_dist which minimise imputation error (MAE: mean absolute error) // Across minimum loci correlation thresholds for min_loci_corr in vec_min_loci_corr.iter() { @@ -385,10 +385,8 @@ impl GenotypesAndPhenotypes { .expect("Error calling find_k_nearest_neighbours() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); // Impute missing allele frequencies at the current locus *q = impute_allele_frequencies(&frequencies, &distances).expect("Error calling impute_allele_frequencies() within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes trait."); - // println!("q={:?}; mae={:?}; corr={}; dist={}", q, optimum_mae, optimum_min_loci_corr, optimum_max_pool_dist); } }); - // Extract average minimum MAE across loci and pools (Note that we used 0: u8 as missing) let mut sum_mae = 0.0; let mut n_missing = 0.0; @@ -398,39 +396,31 @@ impl GenotypesAndPhenotypes { n_missing += 1.0; } } - // Correct for allele frequency over- and under-flows, as we are assuming all loci are represented by all of its alleles (see assumptions above) let n = allele_frequencies.nrows(); - let mut vec_chunk_loci_idx: Vec = vec![]; // Local positions in the chunk (add loci_idx[*idx_loci_idx_ini] to get the global index) for ix in *idx_loci_idx_ini..=*idx_loci_idx_fin { vec_chunk_loci_idx.push(loci_idx[ix] - loci_idx[*idx_loci_idx_ini]); } - - for j in 0..(vec_chunk_loci_idx.len() - 1) { let idx_locus_ini = vec_chunk_loci_idx[j]; let idx_locus_fin = vec_chunk_loci_idx[j + 1]; for i in 0..n { - if self.intercept_and_allele_frequencies[(i, (idx_locus_ini+loci_idx[*idx_loci_idx_ini]))].is_nan() { + if self.intercept_and_allele_frequencies + [(i, (idx_locus_ini + loci_idx[*idx_loci_idx_ini]))] + .is_nan() + { let sum = allele_frequencies .slice(s![i, idx_locus_ini..idx_locus_fin]) .sum(); if sum != 1.0 { - if (idx_locus_fin - idx_locus_ini) == 2 { - allele_frequencies[(i, idx_locus_ini + 1)] = - 1.00 - allele_frequencies[(i, idx_locus_ini)] - } else { - for k in idx_locus_ini..idx_locus_fin { - allele_frequencies[(i, k)] /= sum; - } + for k in idx_locus_ini..idx_locus_fin { + allele_frequencies[(i, k)] /= sum; } } } } } - - // println!("allele_frequencies={:?}", allele_frequencies); Ok((allele_frequencies, sum_mae, n_missing)) } @@ -445,7 +435,7 @@ impl GenotypesAndPhenotypes { restrict_linked_loci_per_chromosome: bool, n_reps: &usize, n_threads: &usize, - + prefix: &String, ) -> io::Result<(String, f64)> { // We are assuming that all non-zero alleles across pools are kept, i.e. biallelic loci have 2 columns, triallelic have 3, and so on. // - Input vcf file will have all alleles per locus extracted. @@ -455,7 +445,6 @@ impl GenotypesAndPhenotypes { self.check().expect("Error self.check() within adaptive_ld_knn_imputation() method of GenotypesAndPhenotypes struct."); // let (loci_idx, loci_chr, loci_pos) = self.count_loci().expect("Error counting loci of self within the adaptive_ld_knn_imputation method of GenotypesAndPhenotypes struct."); - // Define fixed linkage and distance thresholds, i.e. the minimum number of loci to use in estimating genetic distances, and the minimum number of nearest neighbours to include in imputation let (n, p) = self.intercept_and_allele_frequencies.dim(); let min_l_loci = if *min_l_loci >= p { @@ -486,12 +475,12 @@ impl GenotypesAndPhenotypes { let l = loci_idx.len(); let chunk_size = l / n_chunks; // Define the indices of the indices of loci - let vec_idx_loci_idx_ini: Vec = (0..(l-chunk_size)).step_by(chunk_size).collect(); + let vec_idx_loci_idx_ini: Vec = (0..(l - chunk_size)).step_by(chunk_size).collect(); let mut vec_idx_loci_idx_fin: Vec = (chunk_size..l).step_by(chunk_size).collect(); - let idx_fin = vec_idx_loci_idx_fin[vec_idx_loci_idx_fin.len()-1]; - if idx_fin != (l-1) { + let idx_fin = vec_idx_loci_idx_fin[vec_idx_loci_idx_fin.len() - 1]; + if idx_fin != (l - 1) { vec_idx_loci_idx_fin.pop(); - vec_idx_loci_idx_fin.push(l-1); + vec_idx_loci_idx_fin.push(l - 1); } assert_eq!(vec_idx_loci_idx_ini.len(), vec_idx_loci_idx_fin.len()); n_chunks = vec_idx_loci_idx_ini.len(); @@ -501,33 +490,36 @@ impl GenotypesAndPhenotypes { for i in 0..n_chunks { let idx_loci_idx_ini = vec_idx_loci_idx_ini[i]; let idx_loci_idx_fin = vec_idx_loci_idx_fin[i]; - let thread = std::thread::scope(|scope| { - - // let mut allele_frequencies: Array2 = self.intercept_and_allele_frequencies.slice(s![.., *idx_ini..*idx_fin]).to_owned(); - - - let (allele_frequencies, sum_mae, n_missing) = self.per_chunk_aldknni( - &idx_loci_idx_ini, - &idx_loci_idx_fin, - &loci_idx, - &vec_min_loci_corr, - &vec_max_pool_dist, - &min_l_loci, - &min_k_neighbours, - &corr, - restrict_linked_loci_per_chromosome, - &n_reps, - ).expect("Error executing per_chunk_aldknni() method on self_clone."); - + let thread = std::thread::scope(|_scope| { + let (allele_frequencies, sum_mae, n_missing) = self + .per_chunk_aldknni( + &idx_loci_idx_ini, + &idx_loci_idx_fin, + &loci_idx, + &vec_min_loci_corr, + &vec_max_pool_dist, + &min_l_loci, + &min_k_neighbours, + &corr, + restrict_linked_loci_per_chromosome, + &n_reps, + ) + .expect("Error executing per_chunk_aldknni() method on self_clone."); // Write-out intermediate file let idx_ini = loci_idx[idx_loci_idx_ini]; let idx_fin = loci_idx[idx_loci_idx_fin]; - let time = SystemTime::now() .duration_since(UNIX_EPOCH) .expect("Error extracting time in UNIX_EPOCH within write_csv() method for GenotypesAndPhenotypes struct.") .as_secs_f64(); - let n_digits: usize = loci_idx.last().expect("Error extracting the last element of loci_idx. Probably empty.").to_owned().to_string().len(); + let mut rng = rand::thread_rng(); + let random_number = rng.gen_range(1_000_000..10_000_000); + let n_digits: usize = loci_idx + .last() + .expect("Error extracting the last element of loci_idx. Probably empty.") + .to_owned() + .to_string() + .len(); let mut start_index = idx_ini.to_string(); let mut end_index = idx_fin.to_string(); for _ in 0..(n_digits - start_index.len()) { @@ -536,17 +528,23 @@ impl GenotypesAndPhenotypes { for _ in 0..(n_digits - end_index.len()) { end_index = "0".to_owned() + &end_index; } - - let fname_intermediate_file: String = "intermediate_output-".to_owned() + &start_index + "-" + &end_index + "-" + &time.to_string() + ".csv"; + let fname_intermediate_file: String = prefix.to_owned() + + "-" + + &start_index + + "-" + + &end_index + + "-" + + &time.to_string() + + &random_number.to_string() + + ".tmp"; let chromosome = self.chromosome[idx_ini..idx_fin].to_owned(); let position = self.position[idx_ini..idx_fin].to_owned(); let allele = self.allele[idx_ini..idx_fin].to_owned(); let p = allele_frequencies.ncols(); assert_eq!(chromosome.len(), p, "Error, the number of chromosome names and the total number of loci are not equal."); - - // Instantiate output file - let error_writing_file = "Unable to create file: ".to_owned() + &fname_intermediate_file; + let error_writing_file = + "Unable to create file: ".to_owned() + &fname_intermediate_file; let mut file_out = OpenOptions::new() .create_new(true) .write(true) @@ -578,9 +576,7 @@ impl GenotypesAndPhenotypes { + "\n"; file_out.write_all(line.as_bytes()).expect("Error calling write_all() per line of the output file within the write_csv() method for GenotypesAndPhenotypes struct."); } - - - return (fname_intermediate_file, sum_mae, n_missing) + return (fname_intermediate_file, sum_mae, n_missing); }); thread_objects.push(thread); } @@ -590,8 +586,10 @@ impl GenotypesAndPhenotypes { vec_fname_intermediate_files_and_mae.push(thread); } // Sort by intermediate output filenames (named according to indices) - vec_fname_intermediate_files_and_mae.sort_by(|a, b| a.0.partial_cmp(&b.0).expect("Error sorting intermediate output filenames")); - + vec_fname_intermediate_files_and_mae.sort_by(|a, b| { + a.0.partial_cmp(&b.0) + .expect("Error sorting intermediate output filenames") + }); // Concatenate intermediate output filenames together and calculate mae let mut file_0 = OpenOptions::new() .append(true) @@ -603,8 +601,9 @@ impl GenotypesAndPhenotypes { let mut file_1 = OpenOptions::new() .read(true) .open(vec_fname_intermediate_files_and_mae[i].0.to_owned()) - .expect("Error opening the intermediate file of a chunk.");; - io::copy(&mut file_1, &mut file_0).expect("Error concatenating intermediate output files."); + .expect("Error opening the intermediate file of a chunk."); + io::copy(&mut file_1, &mut file_0) + .expect("Error concatenating intermediate output files."); remove_file(vec_fname_intermediate_files_and_mae[i].0.to_owned()) .expect("Error removing the intermediate file of a chunk."); sum_mae += vec_fname_intermediate_files_and_mae[i].1; @@ -659,8 +658,10 @@ pub fn impute_aldknni( &corr, restrict_linked_loci_per_chromosome, n_reps, - n_threads - ).expect("Error calling adaptive_ld_knn_imputation() within impute_aldknni()."); + n_threads, + &(out.replace(".csv", "")), + ) + .expect("Error calling adaptive_ld_knn_imputation() within impute_aldknni()."); let end = std::time::SystemTime::now(); let duration = end.duration_since(start).expect("Error measuring the duration of running adaptive_ld_knn_imputation() within impute_aldknni()."); println!("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); @@ -670,7 +671,11 @@ pub fn impute_aldknni( ); println!("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); - let mut genotypes_and_phenotypes = FileGeno{filename: fname_imputed.to_owned()}.into_genotypes_and_phenotypes(filter_stats, false, n_threads).expect("Error loading the imputed genotype file."); + let mut genotypes_and_phenotypes = FileGeno { + filename: fname_imputed.to_owned(), + } + .into_genotypes_and_phenotypes(filter_stats, false, n_threads) + .expect("Error loading the imputed genotype file."); remove_file(fname_imputed).expect("Error removing concatenated intermediate file."); println!( @@ -848,30 +853,31 @@ mod tests { let restrict_linked_loci_per_chromosome = false; let n_threads = 8; - - let (fname_imputed, mae) = frequencies_and_phenotypes + let (_fname_imputed, mae) = frequencies_and_phenotypes .adaptive_ld_knn_imputation( &loci_idx, - &min_loci_corr, - &max_pool_dist, - &min_l_loci, - &min_k_neighbours, - &corr, - restrict_linked_loci_per_chromosome, - &n_reps, - &n_threads - ).unwrap(); + &min_loci_corr, + &max_pool_dist, + &min_l_loci, + &min_k_neighbours, + &corr, + restrict_linked_loci_per_chromosome, + &n_reps, + &n_threads, + &"intermediate_output".to_owned(), + ) + .unwrap(); println!( "After imputation:\n{:?}", frequencies_and_phenotypes.intercept_and_allele_frequencies ); println!("Estimated MAE={}", mae); - let n_nan = frequencies_and_phenotypes - .intercept_and_allele_frequencies - .iter() - .fold(0, |n_nan, &x| if x.is_nan() { n_nan + 1 } else { n_nan }); - println!("n_nan={}", n_nan); - assert_eq!(n_nan, 1_915); // corresponds to the 1_915 alleles completely missing across all pools + // let n_nan = frequencies_and_phenotypes + // .intercept_and_allele_frequencies + // .iter() + // .fold(0, |n_nan, &x| if x.is_nan() { n_nan + 1 } else { n_nan }); + // println!("n_nan={}", n_nan); + // assert_eq!(n_nan, 1_915); // corresponds to the 1_915 alleles completely missing across all pools let keep_p_minus_1 = false; let genotypes_and_phenotypes = file_sync_phen