diff --git a/res/perf.R b/res/perf.R index 4788bf0..5b5eb14 100644 --- a/res/perf.R +++ b/res/perf.R @@ -123,7 +123,7 @@ fn_simulate_missing_data = function(vcf, mat_genotypes, maf=0.25, missing_rate=0 )) } ### Imputation accuracy metrics -fn_metrics = function(y_predicted, y_expected) { +fn_metrics = function(q_predicted, q_expected) { ## Overall metrics deviation = q_predicted - q_expected mae = mean(abs(deviation), na.rm=TRUE) @@ -132,7 +132,7 @@ fn_metrics = function(y_predicted, y_expected) { r2 = 1.00 - (mean(mse, na.rm=TRUE) / mean((q_expected-mean(q_expected))^2, na.rm=TRUE)) concordance = mean(q_predicted == q_expected, na.rm=TRUE) ### Metrics across the range of expected allele frequencies - vec_q_max = c(0.01, 0.05, c(1:9)/10, 0.95, 0.99, 1.00) + vec_q_max = c(0.00, 0.01, 0.05, c(1:9)/10, 0.95, 0.99, 1.00) vec_n = rep(0, each=length(vec_q_max)) vec_mae = rep(NA, each=length(vec_q_max)) vec_mse = rep(NA, each=length(vec_q_max)) @@ -142,12 +142,12 @@ fn_metrics = function(y_predicted, y_expected) { for (i in 1:length(vec_q_max)) { # i = 1 if (i==1) { - q_min = 0.0 + q_min = -1e-20 } else { q_min = vec_q_max[i-1] } q_max = vec_q_max[i] - idx = which((q_expected > q_min) & (q_expected <= q_max)) + idx = which((q_expected > q_min) & (q_expected <= q_max) & (is.na(q_expected)==FALSE) & (is.na(q_predicted)==FALSE)) if (length(idx) > 0) { vec_n[i] = length(idx) vec_deviations = q_predicted[idx] - q_expected[idx] @@ -222,20 +222,22 @@ fn_imputation_accuracy = function(fname_imputed, list_sim_missing, ploidy=4, str # print(paste0("Total number of simulated missing data = ", n_missing)) # print(paste0("Total number of imputed missing data = ", n_imputed)) ### Metrics using allele frequencies - metrics_allele_frequencies = fn_metrics(y_predicted=vec_imputed, y_expected=list_sim_missing$expected_allele_frequencies) + metrics_allele_frequencies = fn_metrics(q_predicted=vec_imputed, q_expected=list_sim_missing$expected_allele_frequencies) ### Metrics using genotype classes vec_expected_classes = fn_classify_allele_frequencies(mat_genotypes=list_sim_missing$expected_allele_frequencies, ploidy=ploidy, strict_boundaries=strict_boundaries) vec_imputed_classes = fn_classify_allele_frequencies(mat_genotypes=vec_imputed, ploidy=ploidy, strict_boundaries=strict_boundaries) - metrics_genotype_classes = fn_metrics(y_predicted=vec_imputed_classes, y_expected=vec_expected_classes) + metrics_genotype_classes = fn_metrics(q_predicted=vec_imputed_classes, q_expected=vec_expected_classes) return(list( frac_imputed = n_imputed / n_missing, - mae_freqs = metrics_allele_frequencies$mae, - rmse_freqs = metrics_allele_frequencies$rmse, - r2_freqs = metrics_allele_frequencies$r2, + mae_frequencies = metrics_allele_frequencies$mae, + rmse_frequencies = metrics_allele_frequencies$rmse, + r2_frequencies = metrics_allele_frequencies$r2, mae_classes = metrics_genotype_classes$mae, rmse_classes = metrics_genotype_classes$rmse, r2_classes = metrics_genotype_classes$r2, - concordance_classes = metrics_genotype_classes$concordance + concordance_classes = metrics_genotype_classes$concordance, + df_metrics_across_allele_freqs_frequencies = metrics_allele_frequencies$df_metrics_across_allele_freqs, + df_metrics_across_allele_freqs_classes = metrics_genotype_classes$df_metrics_across_allele_freqs )) } ### Performance assessment function @@ -270,31 +272,9 @@ fn_test_imputation = function(vcf, mat_genotypes, ploidy=4, maf=0.25, missing_ra min_k_neighbours=5, n_threads=n_threads) duration_aldknni_fixed = difftime(Sys.time(), time_ini, units="mins") - ### Adaptive LD-kNN imputation with optimisation for min_loci_corr, and max_pool_dist, and fixed min_l_loci, and min_k_neighbours - time_ini = Sys.time() - fname_out_aldknni_optimcd = aldknni(fname=list_sim_missing$fname_vcf, - fname_out_prefix=paste0("AOPTCD-maf", maf, "-missing_rate", missing_rate, "-", rand_number_id), - optimise_n_steps_min_loci_corr=10, - optimise_n_steps_max_pool_dist=10, - min_l_loci=10, - min_k_neighbours=5, - n_threads=n_threads) - duration_aldknni_optimcd = difftime(Sys.time(), time_ini, units="mins") - ### Adaptive LD-kNN imputation with optimisation for min_l_loci, and min_k_neighbours, and fixed min_loci_corr, and max_pool_dist - time_ini = Sys.time() - fname_out_aldknni_optimlk = aldknni(fname=list_sim_missing$fname_vcf, - fname_out_prefix=paste0("AOPTLK-maf", maf, "-missing_rate", missing_rate, "-", rand_number_id), - min_loci_corr=0.9, - max_pool_dist=0.1, - optimise_n_steps_min_l_loci=10, - optimise_n_steps_min_k_neighbours=10, - optimise_max_l_loci=100, - optimise_max_k_neighbours=50, - n_threads=n_threads) - duration_aldknni_optimlk = difftime(Sys.time(), time_ini, units="mins") ### Adaptive LD-kNN imputation with optimisation for min_loci_corr, max_pool_dist, min_l_loci, and min_k_neighbours time_ini = Sys.time() - fname_out_aldknni_optimall = aldknni(fname=list_sim_missing$fname_vcf, + fname_out_aldknni_optim = aldknni(fname=list_sim_missing$fname_vcf, fname_out_prefix=paste0("AOPTIM-maf", maf, "-missing_rate", missing_rate, "-", rand_number_id), optimise_n_steps_min_loci_corr=10, optimise_n_steps_max_pool_dist=10, @@ -303,30 +283,34 @@ fn_test_imputation = function(vcf, mat_genotypes, ploidy=4, maf=0.25, missing_ra optimise_max_l_loci=100, optimise_max_k_neighbours=50, n_threads=n_threads) - duration_aldknni_optimall = difftime(Sys.time(), time_ini, units="mins") + duration_aldknni_optim = difftime(Sys.time(), time_ini, units="mins") ### LinkImpute's LD-kNN imputation algorithm for unordered genotype data (forcing all data to be diploids) fname_for_linkimpute = paste0("LINKIMPUTE_INPUT-maf", maf, "-missing_rate", missing_rate, "-", rand_number_id,".tsv") output_for_linkimpute = paste0("LINKIMPUTE_INPUT-maf", maf, "-missing_rate", missing_rate, "-", rand_number_id,"-IMPUTED.tsv") fname_out_linkimpute = paste0("LINKIMPUTE_INPUT-maf", maf, "-missing_rate", missing_rate, "-", rand_number_id,"-IMPUTED.csv") vcf_for_linkimpute = vcfR::read.vcfR(list_sim_missing$fname_vcf) mat_genotypes_for_linkimpute = t(fn_classify_allele_frequencies(fn_extract_allele_frequencies(vcf_for_linkimpute), ploidy=2)) * 2 - mat_genotypes_for_linkimpute[is.na(mat_genotypes_for_linkimpute)] = -1 - write.table(mat_genotypes_for_linkimpute, file=fname_for_linkimpute, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE) - time_ini = Sys.time() - system(paste0("java -jar linkimpute/LinkImpute.jar --verbose -a ", fname_for_linkimpute, " ", output_for_linkimpute)) - duration_linkimpute = difftime(Sys.time(), time_ini, units="mins") - mat_linkimputed = read.delim(output_for_linkimpute, header=FALSE) - rownames(mat_linkimputed) = rownames(mat_genotypes_for_linkimpute) - colnames(mat_linkimputed) = colnames(mat_genotypes_for_linkimpute) - mat_linkimputed = t(mat_linkimputed / 2) - list_loci_names = strsplit(rownames(mat_linkimputed), "_") - chr = unlist(lapply(list_loci_names, FUN=function(x){(paste(x[1:(length(x)-2)], collapse="_"))})) - pos = unlist(lapply(list_loci_names, FUN=function(x){as.numeric(x[(length(x)-1)])})) - allele = unlist(lapply(list_loci_names, FUN=function(x){x[length(x)]})) - df_linkimputed = data.frame(chr, pos, allele) - df_linkimputed = cbind(df_linkimputed, mat_linkimputed) - colnames(df_linkimputed)[1] = "#chr" - write.table(df_linkimputed, file=fname_out_linkimpute, sep=",", row.names=FALSE, col.names=TRUE, quote=FALSE) + bool_enough_data_to_simulate_10k_missing = prod(dim(mat_genotypes_for_linkimpute)) >= (20000) + if (bool_enough_data_to_simulate_10k_missing == TRUE) { + ### LinkImpute stalls if it cannot mask 10,000 data points for optimising l and k, because the number of non-missing data points is not enough to reach the fixed 10,000 random data points. + mat_genotypes_for_linkimpute[is.na(mat_genotypes_for_linkimpute)] = -1 + write.table(mat_genotypes_for_linkimpute, file=fname_for_linkimpute, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE) + time_ini = Sys.time() + system(paste0("java -jar linkimpute/LinkImpute.jar --verbose -a ", fname_for_linkimpute, " ", output_for_linkimpute)) + duration_linkimpute = difftime(Sys.time(), time_ini, units="mins") + mat_linkimputed = read.delim(output_for_linkimpute, header=FALSE) + rownames(mat_linkimputed) = rownames(mat_genotypes_for_linkimpute) + colnames(mat_linkimputed) = colnames(mat_genotypes_for_linkimpute) + mat_linkimputed = t(mat_linkimputed / 2) + list_loci_names = strsplit(rownames(mat_linkimputed), "_") + chr = unlist(lapply(list_loci_names, FUN=function(x){(paste(x[1:(length(x)-2)], collapse="_"))})) + pos = unlist(lapply(list_loci_names, FUN=function(x){as.numeric(x[(length(x)-1)])})) + allele = unlist(lapply(list_loci_names, FUN=function(x){x[length(x)]})) + df_linkimputed = data.frame(chr, pos, allele) + df_linkimputed = cbind(df_linkimputed, mat_linkimputed) + colnames(df_linkimputed)[1] = "#chr" + write.table(df_linkimputed, file=fname_out_linkimpute, sep=",", row.names=FALSE, col.names=TRUE, quote=FALSE) + } ### Validating imputation metrics_mvi = fn_imputation_accuracy(fname_imputed=fname_out_mvi, list_sim_missing=list_sim_missing, @@ -338,29 +322,37 @@ fn_test_imputation = function(vcf, mat_genotypes, ploidy=4, maf=0.25, missing_ra ploidy=ploidy, strict_boundaries=strict_boundaries, n_threads=n_threads) - metrics_aldknni_optimcd = fn_imputation_accuracy(fname_imputed=fname_out_aldknni_optimcd, - list_sim_missing=list_sim_missing, - ploidy=ploidy, - strict_boundaries=strict_boundaries, - n_threads=n_threads) - metrics_aldknni_optimlk = fn_imputation_accuracy(fname_imputed=fname_out_aldknni_optimlk, + metrics_aldknni_optim = fn_imputation_accuracy(fname_imputed=fname_out_aldknni_optim, list_sim_missing=list_sim_missing, ploidy=ploidy, strict_boundaries=strict_boundaries, n_threads=n_threads) - metrics_aldknni_optimall = fn_imputation_accuracy(fname_imputed=fname_out_aldknni_optimall, - list_sim_missing=list_sim_missing, - ploidy=ploidy, - strict_boundaries=strict_boundaries, - n_threads=n_threads) - metrics_linkimpute = fn_imputation_accuracy(fname_imputed=fname_out_linkimpute, - list_sim_missing=list_sim_missing, - ploidy=2, - strict_boundaries=strict_boundaries, - n_threads=n_threads) + if (bool_enough_data_to_simulate_10k_missing == TRUE) { + metrics_linkimpute = fn_imputation_accuracy(fname_imputed=fname_out_linkimpute, + list_sim_missing=list_sim_missing, + ploidy=2, + strict_boundaries=strict_boundaries, + n_threads=n_threads) + } else { + df_metrics_across_allele_freqs_frequencies = metrics_aldknni_optimall$df_metrics_across_allele_freqs_frequencies + df_metrics_across_allele_freqs_classes = metrics_aldknni_optimall$df_metrics_across_allele_freqs_classes + df_metrics_across_allele_freqs_frequencies[,2:7] = NA + df_metrics_across_allele_freqs_classes[,2:7] = NA + metrics_linkimpute = list( + frac_imputed = 0.0, + mae_frequencies = NA, + rmse_frequencies = NA, + r2_frequencies = NA, + mae_classes = NA, + rmse_classes = NA, + r2_classes = NA, + concordance_classes = NA, + df_metrics_across_allele_freqs_frequencies = df_metrics_across_allele_freqs_frequencies, + df_metrics_across_allele_freqs_classes = df_metrics_across_allele_freqs_classes + ) + } ### Merge imputation accuracy metrics into the output data.frame - # string_metric_lists = c("metrics_mvi", "metrics_aldknni", "metrics_lukes") - string_metric_lists = c("metrics_mvi", "metrics_aldknni", "metrics_aldknni_optim_thresholds", "metrics_aldknni_optim_counts", "metrics_linkimpute") + string_metric_lists = c("metrics_mvi", "metrics_aldknni_fixed", "metrics_aldknni_optim", "metrics_linkimpute") for (m in string_metric_lists) { # m = string_metric_lists[1] algorithm = gsub("metrics_", "", m) @@ -374,7 +366,7 @@ fn_test_imputation = function(vcf, mat_genotypes, ploidy=4, maf=0.25, missing_ra algorithm, 2, as.numeric(eval(parse(text=paste0("duration_", algorithm)))), - matrix(unlist(eval(parse(text=m))), nrow=1)) + matrix(unlist(eval(parse(text=paste0(m, "[1:8]")))), nrow=1)) } else { df_metrics = data.frame( maf, @@ -384,7 +376,7 @@ fn_test_imputation = function(vcf, mat_genotypes, ploidy=4, maf=0.25, missing_ra algorithm, ploidy, as.numeric(eval(parse(text=paste0("duration_", algorithm)))), - matrix(unlist(eval(parse(text=m))), nrow=1)) + matrix(unlist(eval(parse(text=paste0(m, "[1:8]")))), nrow=1)) } colnames(df_metrics) = c( "maf", @@ -394,7 +386,14 @@ fn_test_imputation = function(vcf, mat_genotypes, ploidy=4, maf=0.25, missing_ra "algorithm", "ploidy", "duration_mins", - names(eval(parse(text=m)))) + names(eval(parse(text=paste0(m, "[1:8]"))))) + ### Insert metrics across allele frequency bins + df_metrics_across_allele_freqs_frequencies = eval(parse(text=paste0(m, "$df_metrics_across_allele_freqs_frequencies"))) + for (q in df_metrics_across_allele_freqs_frequencies$q) { + idx = which(df_metrics_across_allele_freqs_frequencies$q == q) + eval(parse(text=paste0("df_metrics$`mae_", q, "` = df_metrics_across_allele_freqs_frequencies$mae[idx]"))) + } + ### Bind if (m==string_metric_lists[1]) { df_out = df_metrics colnames(df_out) = names(df_metrics) @@ -409,9 +408,6 @@ fn_test_imputation = function(vcf, mat_genotypes, ploidy=4, maf=0.25, missing_ra system(paste0("rm ", gsub("-IMPUTED.csv$", "*", fname_out_aldknni_optim_thresholds))) system(paste0("rm ", gsub("-IMPUTED.csv$", "*", fname_out_aldknni_optim_counts))) system(paste0("rm ", gsub("-IMPUTED.csv$", "*", fname_out_linkimpute))) - # system(paste0("rm ", gsub("-IMPUTED.tsv$", "*", fname_out_linkimpute))) - # system(paste0("rm ", fname_lukes_ldknni_input_rds)) - # system(paste0("rm ", fname_out_lukes)) ### Output return(df_out) } @@ -423,11 +419,8 @@ ploidy = as.numeric(args[2]) i = as.numeric(args[3]) n_reps = as.numeric(args[4]) n_threads = as.numeric(args[5]) -# fname_vcf="/group/pasture/Jeff/imputef/misc/lucerne.vcf"; ploidy=4; i=1; n_reps=3; n_threads=32; strict_boundaries=FALSE -# fname_vcf="/group/pasture/Jeff/imputef/misc/soybean.vcf"; ploidy=84; i=44; n_reps=3; n_threads=32; strict_boundaries=FALSE -# fname_vcf="/group/pasture/Jeff/imputef/misc/zucchini.vcf"; ploidy=2; i=44; n_reps=3; n_threads=32; strict_boundaries=FALSE -# fname_vcf="/group/pasture/Jeff/imputef/misc/apple.vcf"; ploidy=2; i=44; n_reps=3; n_threads=32; strict_boundaries=FALSE -# fname_vcf="/group/pasture/Jeff/imputef/misc/grape.vcf"; ploidy=2; i=1; n_reps=3; n_threads=32; strict_boundaries=FALSE +# fname_vcf="/group/pasture/Jeff/imputef/misc/lucerne.vcf"; ploidy=2; i=19; n_reps=3; n_threads=32; strict_boundaries=FALSE; r=1 +# fname_vcf="/group/pasture/Jeff/imputef/misc/grape.vcf"; ploidy=2; i=19; n_reps=3; n_threads=32; strict_boundaries=FALSE; r=1 ### Load genotype data vcf = vcfR::read.vcfR(fname_vcf) ### high-confidence genotype data: 154 pools X 124,151 loci mat_genotypes = fn_extract_allele_frequencies(vcf) diff --git a/src/rust/src/aldknni.rs b/src/rust/src/aldknni.rs index 52e9ea8..a667390 100644 --- a/src/rust/src/aldknni.rs +++ b/src/rust/src/aldknni.rs @@ -259,33 +259,32 @@ fn impute_allele_frequencies( } let mut imputed_freqs = vec![0.0; p]; // LD-kNN imputations (weighted mode and mean) + // let do_linkimpute_weighted_mode = false; // Testing forcing mean for individual diploid biallelic loci imputation if do_linkimpute_weighted_mode { // Perform weighted modal imputation as in LinkImpute for biallelic diploids - the only 2 differences are that we are performing this per chromosome and the distance metric is MAE rather than Manhattan distance assert!(frequencies.ncols() <= 2, "Error in the number of alleles per locus. We expect a biallelic locus, set do_linkimpute_weighted_mode to false."); - for idx_allele in 0..p { - let vec_geno = vec![0.0, 0.5, 1.0]; - let mut max_score = 0.0; - let mut weighted_mode = 0.0; - for j in 0..vec_geno.len() { - let a = vec_geno[j]; - let mut score = 0.0; - for i in 0..frequencies.column(idx_allele).len() { - let f = 1.00 / (&distances[i] + f64::EPSILON); - let g = if frequencies.column(idx_allele)[i] == a { - 1.0 - } else { - 0.0 - }; - score += f * g; - } - if score > max_score { - max_score = score; - weighted_mode = vec_geno[j]; - } + let vec_geno = vec![0.0, 0.5, 1.0]; + let mut max_score = 0.0; + let mut weighted_mode = 0.0; + for j in 0..vec_geno.len() { + let a = vec_geno[j]; + let mut score = 0.0; + for i in 0..frequencies.column(0).len() { + let f = 1.00 - &distances[i]; + let g = if frequencies.column(0)[i] == a { + 1.0 + } else { + 0.0 + }; + score += f * g; + } + if score > max_score { + max_score = score; + weighted_mode = vec_geno[j]; } - // println!("weighted_mode={:?}", weighted_mode); - imputed_freqs[idx_allele] = weighted_mode; } + imputed_freqs[0] = weighted_mode; + imputed_freqs[1] = 1.0 - weighted_mode; } else { // Perform weighted mean allele frequencies let additive_inverse_plus_epsilon: Array1 = @@ -343,8 +342,14 @@ impl GenotypesAndPhenotypes { // Extract loci indices let (loci_idx, loci_chr, loci_pos) = self.count_loci().expect("Error calling count_loci() method within adaptive_ld_knn_imputation() method for GenotypesAndPhenotypes struct."); // Calculate LD across the entire genome + if (self.intercept_and_allele_frequencies.nrows() * self.intercept_and_allele_frequencies.ncols()) > 1_000_000 { + println!("Estimating linkage between loci across the entire genome...") + } let corr = calculate_genomewide_ld(&self.intercept_and_allele_frequencies) .expect("Error estimating pairwise linkage between loci across the entire genome."); + if (self.intercept_and_allele_frequencies.nrows() * self.intercept_and_allele_frequencies.ncols()) > 1_000_000 { + println!("LD estimation finished.") + } // Parallel imputation let mat_freqs = self.intercept_and_allele_frequencies.clone(); Zip::indexed(&mut self.intercept_and_allele_frequencies) diff --git a/src/rust/src/optim.rs b/src/rust/src/optim.rs index 335ce1a..a1cf921 100644 --- a/src/rust/src/optim.rs +++ b/src/rust/src/optim.rs @@ -1,4 +1,4 @@ -use ndarray::prelude::*; + use rand::prelude::IteratorRandom; use std::io; @@ -149,6 +149,7 @@ impl GenotypesAndPhenotypes { } else { sum_abs += (vec_vec_masked_alleles_freqs[i][j] - vec_vec_imputed_mask[i][j]).abs(); + // (vec_vec_masked_alleles_freqs[i][j] - vec_vec_imputed_mask[i][j]).powf(2.0); // MSE seem to have the same behaviour as MAE a += 1.00; } } @@ -159,7 +160,6 @@ impl GenotypesAndPhenotypes { sum_abs / a } }; - // println!("mean_absolute_error={:?}", mean_absolute_error); Ok(mean_absolute_error) } @@ -202,6 +202,7 @@ impl GenotypesAndPhenotypes { } else { sum_abs += (vec_vec_masked_alleles_freqs[i][j] - vec_vec_imputed_mask[i][j]).abs(); + // (vec_vec_masked_alleles_freqs[i][j] - vec_vec_imputed_mask[i][j]).powf(2.0); // MSE seem to have the same behaviour as MAE a += 1.00; } } @@ -279,24 +280,38 @@ pub fn optimise_params_and_estimate_accuracy( } else { vec![*min_k_neighbours] }; - // Use the same simulated missing data across the range of corr/l and dist/k... - let mut array5_mae: Array5 = Array5::from_elem( - ( - *optimise_n_reps, - vec_min_loci_corr.len(), - vec_max_pool_dist.len(), - vec_min_l_loci.len(), - vec_min_k_neighbours.len(), - ), - f64::NAN, - ); - println!("-----------------------------------------------"); - if *optimise_n_reps > 1 { - println!("rep\tmin_loci_corr\tmax_pool_dist\tmin_l_loci\tmin_k_neighbours\tmae"); + // Optimisation via coordinate-descent-like algorithm starting with the strictest parameters + // Parameter optimisation order: + // (1) k-neighbours (from low to high) + // (2) maximum pool distances (from low to high) + // (3) l loci (from low to high) + // (4) minimum loci correlation (from high to low) + // As we explore the parameter space one at a time at decreasing stringency, + // we replace the optimum parameter if mae decreases, and + // break if mae increases so that we are more computationally efficient. + let all_parameters_are_fixed = (vec_min_loci_corr.len() == 1) + & (vec_max_pool_dist.len() == 1) + & (vec_min_l_loci.len() == 1) + & (vec_min_k_neighbours.len() == 1); + let optimise_n_reps = if all_parameters_are_fixed & (*optimise_n_reps > 1) { + 1 } else { - println!("min_loci_corr\tmax_pool_dist\tmin_l_loci\tmin_k_neighbours\tmae"); + *optimise_n_reps + }; + let mut min_loci_corr = vec![]; + let mut max_pool_dist = vec![]; + let mut min_l_loci = vec![]; + let mut min_k_neighbours = vec![]; + let mut mae = vec![]; + if all_parameters_are_fixed == false { + println!("-----------------------------------------------"); + if optimise_n_reps > 1 { + println!("rep\tmin_k_neighbours\tmax_pool_dist\tmin_l_loci\tmin_loci_corr\tmae"); + } else { + println!("min_k_neighbours\tmax_pool_dist\tmin_l_loci\tmin_loci_corr\tmae"); + } } - for r in 0..*optimise_n_reps { + for r in 0..optimise_n_reps { // Simulate 10% sparsity to determine accuracy (in terms of MAE) and to optimise for the best k and l, if applicable, i.e. n_loci_to_estimate_distance==0 or k_neighbours==0 let mut genotype_data_for_optimisation = genotypes_and_phenotypes.clone(); let ( @@ -313,119 +328,364 @@ pub fn optimise_params_and_estimate_accuracy( max_sparsity, 0.1, "Ooops! Missing unexpected simulated sparsity!" ); - for i in 0..vec_min_loci_corr.len() { + // Coordinate-descent-like optimisation starting with the strictest parameters + let mut optimum_min_loci_corr = vec_min_loci_corr + .last() + .expect("Error: vec_min_loci_corr is empty.") + .to_owned(); + let mut optimum_max_pool_dist = vec_max_pool_dist[0]; + let mut optimum_min_l_loci = vec_min_l_loci[0]; + let mut optimum_min_k_neighbours = vec_min_k_neighbours[0]; + let mut optimum_mae = genotype_data_for_optimisation + .clone() + .estimate_expected_mae_in_aldknni( + &optimum_min_loci_corr, + &optimum_max_pool_dist, + &optimum_min_l_loci, + &optimum_min_k_neighbours, + restrict_linked_loci_per_chromosome, + do_linkimpute_weighted_mode, + &loci_idx, + &vec_masked_loci_idx, + &vec_vec_masked_alleles_freqs, + ) + .expect("Error calling estimate_expected_mae_in_aldknni() method within optimise_params_and_estimate_accuracy()."); + if all_parameters_are_fixed == false { + for l in 0..vec_min_k_neighbours.len() { + let mae = genotype_data_for_optimisation + .clone() + .estimate_expected_mae_in_aldknni( + &optimum_min_loci_corr, + &optimum_max_pool_dist, + &optimum_min_l_loci, + &vec_min_k_neighbours[l], + restrict_linked_loci_per_chromosome, + do_linkimpute_weighted_mode, + &loci_idx, + &vec_masked_loci_idx, + &vec_vec_masked_alleles_freqs, + ) + .expect("Error calling estimate_expected_mae_in_aldknni() method within optimise_params_and_estimate_accuracy()."); + if optimise_n_reps > 1 { + println!( + "{}\t{}\t{}\t{}\t{}\t{}", + r, + vec_min_k_neighbours[l], + optimum_max_pool_dist, + optimum_min_l_loci, + optimum_min_loci_corr, + mae + ); + } else { + println!( + "{}\t{}\t{}\t{}\t{}", + vec_min_k_neighbours[l], + optimum_max_pool_dist, + optimum_min_l_loci, + optimum_min_loci_corr, + mae + ); + } + if mae < optimum_mae { + optimum_min_k_neighbours = vec_min_k_neighbours[l]; + optimum_mae = mae; + } else if mae > optimum_mae { + break; + } + } for j in 0..vec_max_pool_dist.len() { - for k in 0..vec_min_l_loci.len() { - for l in 0..vec_min_k_neighbours.len() { - let mae = genotype_data_for_optimisation - .clone() - .estimate_expected_mae_in_aldknni( - &vec_min_loci_corr[i], - &vec_max_pool_dist[j], - &vec_min_l_loci[k], - &vec_min_k_neighbours[l], - restrict_linked_loci_per_chromosome, - do_linkimpute_weighted_mode, - &loci_idx, - &vec_masked_loci_idx, - &vec_vec_masked_alleles_freqs, - ) - .expect("Error calling estimate_expected_mae_in_aldknni() method within optimise_params_and_estimate_accuracy()."); - array5_mae[(r, i, j, k, l)] = mae; - if *optimise_n_reps > 1 { - println!( - "{}\t{}\t{}\t{}\t{}\t{}", - r, - vec_min_loci_corr[i], - vec_max_pool_dist[j], - vec_min_l_loci[k], - vec_min_k_neighbours[l], - mae - ); - } else { - println!( - "{}\t{}\t{}\t{}\t{}", - vec_min_loci_corr[i], - vec_max_pool_dist[j], - vec_min_l_loci[k], - vec_min_k_neighbours[l], - mae - ); - } - } + let mae = genotype_data_for_optimisation + .clone() + .estimate_expected_mae_in_aldknni( + &optimum_min_loci_corr, + &vec_max_pool_dist[j], + &optimum_min_l_loci, + &optimum_min_k_neighbours, + restrict_linked_loci_per_chromosome, + do_linkimpute_weighted_mode, + &loci_idx, + &vec_masked_loci_idx, + &vec_vec_masked_alleles_freqs, + ) + .expect("Error calling estimate_expected_mae_in_aldknni() method within optimise_params_and_estimate_accuracy()."); + if optimise_n_reps > 1 { + println!( + "{}\t{}\t{}\t{}\t{}\t{}", + r, + optimum_min_k_neighbours, + vec_max_pool_dist[j], + optimum_min_l_loci, + optimum_min_loci_corr, + mae + ); + } else { + println!( + "{}\t{}\t{}\t{}\t{}", + optimum_min_k_neighbours, + vec_max_pool_dist[j], + optimum_min_l_loci, + optimum_min_loci_corr, + mae + ); + } + if mae < optimum_mae { + optimum_max_pool_dist = vec_max_pool_dist[j]; + optimum_mae = mae; + } else if mae > optimum_mae { + break; } } - } - } - println!("-----------------------------------------------"); - // Identify best min_loci_corr, max_pool_dist, min_l_loci, and min_k_neighbours - let mut optimum_mae: f64 = 1.0; - let mut optimum_min_loci_corr: f64 = 0.0; - let mut optimum_max_pool_dist: f64 = 1.0; - let mut optimum_min_l_loci: u64 = 0; - let mut optimum_min_k_neighbours: u64 = 0; - if *optimise_n_reps > 1 { - println!("-----------------------------------------------"); - println!("min_loci_corr\tmax_pool_dist\tmin_l_loci\tmin_k_neighbours\tmae\tsd"); - } - for i in 0..vec_min_loci_corr.len() { - for j in 0..vec_max_pool_dist.len() { for k in 0..vec_min_l_loci.len() { - for l in 0..vec_min_k_neighbours.len() { - for _r in 0..*optimise_n_reps { - let mae: Array1 = array5_mae - .slice(s![.., i, j, k, l]) - .iter() - .filter(|&x| !x.is_nan()) - .map(|&x| x.to_owned()) - .collect(); - let (mu, sd) = if mae.len() > 0 { - let mu = mae.iter().fold(0.0, |sum, &x| sum + x) / (mae.len() as f64); - let sd = if mae.len() > 1 { - mae.iter().fold(0.0, |sum, &x| sum + (x - mu).powf(2.0)) - / (mae.len() as f64 - 1.00).sqrt() - } else { - 0.0 - }; - (mu, sd) - } else { - (f64::NAN, f64::NAN) - }; - if *optimise_n_reps > 1 { - println!( - "{}\t{}\t{}\t{}\t{}\t{}", - vec_min_loci_corr[i], - vec_max_pool_dist[j], - vec_min_l_loci[k], - vec_min_k_neighbours[l], - mu, - sd - ); - } - if mu.is_nan() == false { - if optimum_mae > mu { - optimum_mae = mu; - optimum_min_loci_corr = vec_min_loci_corr[i]; - optimum_max_pool_dist = vec_max_pool_dist[j]; - optimum_min_l_loci = vec_min_l_loci[k]; - optimum_min_k_neighbours = vec_min_k_neighbours[l]; - } - } - } + let mae = genotype_data_for_optimisation + .clone() + .estimate_expected_mae_in_aldknni( + &optimum_min_loci_corr, + &optimum_max_pool_dist, + &vec_min_l_loci[k], + &optimum_min_k_neighbours, + restrict_linked_loci_per_chromosome, + do_linkimpute_weighted_mode, + &loci_idx, + &vec_masked_loci_idx, + &vec_vec_masked_alleles_freqs, + ) + .expect("Error calling estimate_expected_mae_in_aldknni() method within optimise_params_and_estimate_accuracy()."); + if optimise_n_reps > 1 { + println!( + "{}\t{}\t{}\t{}\t{}\t{}", + r, + optimum_min_k_neighbours, + optimum_max_pool_dist, + vec_min_l_loci[k], + optimum_min_loci_corr, + mae + ); + } else { + println!( + "{}\t{}\t{}\t{}\t{}", + optimum_min_k_neighbours, + optimum_max_pool_dist, + vec_min_l_loci[k], + optimum_min_loci_corr, + mae + ); + } + if mae < optimum_mae { + optimum_min_l_loci = vec_min_l_loci[k]; + optimum_mae = mae; + } else if mae > optimum_mae { + break; + } + } + for i in (0..vec_min_loci_corr.len()).rev() { + let mae = genotype_data_for_optimisation + .clone() + .estimate_expected_mae_in_aldknni( + &vec_min_loci_corr[i], + &optimum_max_pool_dist, + &optimum_min_l_loci, + &optimum_min_k_neighbours, + restrict_linked_loci_per_chromosome, + do_linkimpute_weighted_mode, + &loci_idx, + &vec_masked_loci_idx, + &vec_vec_masked_alleles_freqs, + ) + .expect("Error calling estimate_expected_mae_in_aldknni() method within optimise_params_and_estimate_accuracy()."); + if optimise_n_reps > 1 { + println!( + "{}\t{}\t{}\t{}\t{}\t{}", + r, + optimum_min_k_neighbours, + optimum_max_pool_dist, + optimum_min_l_loci, + vec_min_loci_corr[i], + mae + ); + } else { + println!( + "{}\t{}\t{}\t{}\t{}", + optimum_min_k_neighbours, + optimum_max_pool_dist, + optimum_min_l_loci, + vec_min_loci_corr[i], + mae + ); + } + if mae < optimum_mae { + optimum_min_loci_corr = vec_min_loci_corr[i]; + optimum_mae = mae; + } else if mae > optimum_mae { + break; } } } + // Append parameters + mae.push(optimum_mae); + min_loci_corr.push(optimum_min_loci_corr); + max_pool_dist.push(optimum_max_pool_dist); + min_l_loci.push(optimum_min_l_loci); + min_k_neighbours.push(optimum_min_k_neighbours); } - if *optimise_n_reps > 1 { + if all_parameters_are_fixed == false { println!("-----------------------------------------------"); } Ok(( - optimum_min_loci_corr, - optimum_max_pool_dist, - optimum_min_l_loci, - optimum_min_k_neighbours, - optimum_mae, + min_loci_corr.iter().fold(0.0, |sum, &x| sum + x) / (optimise_n_reps as f64), + max_pool_dist.iter().fold(0.0, |sum, &x| sum + x) / (optimise_n_reps as f64), + min_l_loci.iter().fold(0, |sum, &x| sum + x) / (optimise_n_reps as u64), + min_k_neighbours.iter().fold(0, |sum, &x| sum + x) / (optimise_n_reps as u64), + mae.iter().fold(0.0, |sum, &x| sum + x) / (optimise_n_reps as f64), )) + // // Optimisation across the entire parameter space + // let mut array5_mae: Array5 = Array5::from_elem( + // ( + // *optimise_n_reps, + // vec_min_loci_corr.len(), + // vec_max_pool_dist.len(), + // vec_min_l_loci.len(), + // vec_min_k_neighbours.len(), + // ), + // f64::NAN, + // ); + // println!("-----------------------------------------------"); + //if *optimise_n_reps > 1 { + // println!("rep\tmin_loci_corr\tmax_pool_dist\tmin_l_loci\tmin_k_neighbours\tmae"); + // } else { + // println!("min_loci_corr\tmax_pool_dist\tmin_l_loci\tmin_k_neighbours\tmae"); + // } + // for r in 0..*optimise_n_reps { + // // Use the same simulated missing data across the range of corr/l and dist/k... + // // Simulate 10% sparsity to determine accuracy (in terms of MAE) and to optimise for the best k and l, if applicable, i.e. n_loci_to_estimate_distance==0 or k_neighbours==0 + // let mut genotype_data_for_optimisation = genotypes_and_phenotypes.clone(); + // let ( + // _, + // max_sparsity, + // loci_idx, + // vec_masked_loci_idx, + // _vec_masked_coverages, + // vec_vec_masked_alleles_freqs, + // ) = genotype_data_for_optimisation + // .simulate_missing(&0.1) + // .expect("Error calling simulate_missing() method within optimise_params_and_estimate_accuracy()."); + // assert_eq!( + // max_sparsity, 0.1, + // "Ooops! Missing unexpected simulated sparsity!" + // ); + // for i in 0..vec_min_loci_corr.len() { + // for j in 0..vec_max_pool_dist.len() { + // for k in 0..vec_min_l_loci.len() { + // for l in 0..vec_min_k_neighbours.len() { + // let mae = genotype_data_for_optimisation + // .clone() + // .estimate_expected_mae_in_aldknni( + // &vec_min_loci_corr[i], + // &vec_max_pool_dist[j], + // &vec_min_l_loci[k], + // &vec_min_k_neighbours[l], + // restrict_linked_loci_per_chromosome, + // do_linkimpute_weighted_mode, + // &loci_idx, + // &vec_masked_loci_idx, + // &vec_vec_masked_alleles_freqs, + // ) + // .expect("Error calling estimate_expected_mae_in_aldknni() method within optimise_params_and_estimate_accuracy()."); + // array5_mae[(r, i, j, k, l)] = mae; + // if *optimise_n_reps > 1 { + // println!( + // "{}\t{}\t{}\t{}\t{}\t{}", + // r, + // vec_min_loci_corr[i], + // vec_max_pool_dist[j], + // vec_min_l_loci[k], + // vec_min_k_neighbours[l], + // mae + // ); + // } else { + // println!( + // "{}\t{}\t{}\t{}\t{}", + // vec_min_loci_corr[i], + // vec_max_pool_dist[j], + // vec_min_l_loci[k], + // vec_min_k_neighbours[l], + // mae + // ); + // } + // } + // } + // } + // } + // } + // println!("-----------------------------------------------"); + // // Identify best min_loci_corr, max_pool_dist, min_l_loci, and min_k_neighbours + // let mut optimum_mae: f64 = 1.0; + // let mut optimum_min_loci_corr: f64 = 0.0; + // let mut optimum_max_pool_dist: f64 = 1.0; + // let mut optimum_min_l_loci: u64 = 0; + // let mut optimum_min_k_neighbours: u64 = 0; + // if *optimise_n_reps > 1 { + // println!("-----------------------------------------------"); + // println!("min_loci_corr\tmax_pool_dist\tmin_l_loci\tmin_k_neighbours\tmae\tsd"); + // } + // // Iterating across the parameters such that we iterate across increasing number of loci and pools to use in imputation and select the optimal parameters while minimising the loci and pools to use in imputation + // for i in (0..vec_min_loci_corr.len()).rev() { + // for j in 0..vec_max_pool_dist.len() { + // for k in 0..vec_min_l_loci.len() { + // for l in 0..vec_min_k_neighbours.len() { + // for _r in 0..*optimise_n_reps { + // let mae: Array1 = array5_mae + // .slice(s![.., i, j, k, l]) + // .iter() + // .filter(|&x| !x.is_nan()) + // .map(|&x| x.to_owned()) + // .collect(); + // let (mu, sd) = if mae.len() > 0 { + // let mu = mae.iter().fold(0.0, |sum, &x| sum + x) / (mae.len() as f64); + // let sd = if mae.len() > 1 { + // mae.iter().fold(0.0, |sum, &x| sum + (x - mu).powf(2.0)) + // / (mae.len() as f64 - 1.00).sqrt() + // } else { + // 0.0 + // }; + // (mu, sd) + // } else { + // (f64::NAN, f64::NAN) + // }; + // if *optimise_n_reps > 1 { + // println!( + // "{}\t{}\t{}\t{}\t{}\t{}", + // vec_min_loci_corr[i], + // vec_max_pool_dist[j], + // vec_min_l_loci[k], + // vec_min_k_neighbours[l], + // mu, + // sd + // ); + // } + // if mu.is_nan() == false { + // if optimum_mae > mu { + // optimum_mae = mu; + // optimum_min_loci_corr = vec_min_loci_corr[i]; + // optimum_max_pool_dist = vec_max_pool_dist[j]; + // optimum_min_l_loci = vec_min_l_loci[k]; + // optimum_min_k_neighbours = vec_min_k_neighbours[l]; + // } + // } + // } + // } + // } + // } + // } + // if *optimise_n_reps > 1 { + // println!("-----------------------------------------------"); + // } + // Ok(( + // optimum_min_loci_corr, + // optimum_max_pool_dist, + // optimum_min_l_loci, + // optimum_min_k_neighbours, + // optimum_mae, + // )) } // Make tests