From 72ceea4b27294d2ade920b882e4a10b5f55f0bdf Mon Sep 17 00:00:00 2001 From: jeffersonfparil Date: Thu, 11 Jan 2024 16:52:20 +1100 Subject: [PATCH] trying to fix issues #3 and #4 --- README.md | 18 +- src/load.R | 11 +- src/main.R | 475 ++++++++++++++++++++++++++---------------------- test/test.sh | 2 +- test/test.slurm | 4 +- 5 files changed, 276 insertions(+), 234 deletions(-) diff --git a/README.md b/README.md index 44cc6da..40045b9 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ Rscript src/main.R \ -g "test/grape.vcf" \ -p "test/grape_pheno.txt" \ --phenotype-delimiter "\t" \ - --phenotype-column-data 4 \ + --phenotype-column-data "3,4" \ --k-folds 5 \ --n-reps 1 \ --n-threads 8 \ @@ -100,7 +100,7 @@ Rscript src/main.R \ -g "test/grape.vcf" \ -p "test/grape_pheno.txt" \ --phenotype-delimiter "\t" \ - --phenotype-column-data 4 \ + --phenotype-column-data "3,4" \ --k-folds 5 \ --n-reps 1 \ --n-threads ${SLURM_CPUS_PER_TASK} \ @@ -113,7 +113,7 @@ Rscript src/main.R \ - [Variant call format (`*.vcf`)](https://www.internationalgenome.org/wiki/Analysis/vcf4.0/#:~:text=VCF%20is%20a%20text%20file,for%20each%20position%20or%20not.) which requires either the genotype call (**GT**) or allele depth (**AD**) field, where the AD field has precedence if both are present (i.e., allele frequencies rather than binary genotypes are assumed by default). See [test/test.vcf](test/test.vcf) for a very small example. - [R object file format (`*.rds`)](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/readRDS) which codes for a matrix where each row correspond to a sample and each column to a locus (or an allele of a multi-allelic locus). See [test/test.rds](test/test.rds) for a very small example. + The rows are expected to have names of the samples corresponding to the names in the phenotype file. - + The columns are expected to contain the chromosome or scaffold names, the position in bases, and allele names separated by underscores, e.g., `chr1_12345_A` - specifying the locus is at chr1, position 12,345 and the data encoded in the column represent the dosage of allele A. + + The columns are expected to contain the chromosome or scaffold names, the position in bases, and allele names separated by underscores, e.g., `chr1_12345_A` - specifying the locus is at chr1, position 12,345 and the data encoded in the column represent the dosage of allele A. Each locus may be represented by all of its alleles or less one allele, e.g. one column per locus for biallelic loci. Additional control to this is available via the `--retain-minus-one-alleles-per-locus` parameter which may be `TRUE` of `FALSE`. - It is highly recommended to use genotype data without missing values. Missing data will be naively imputed using mean genotype value per allele/locus. 2. Phenotype data in text-delimited format (`-p; --phenotype-file`) - e.g. tab-delimited (`*.txt`) and comma-separated (`*.csv`) @@ -145,11 +145,13 @@ Rscript src/main.R \ Tables of genomic prediction cross-validation results and predicted phenotypes (full path and filenames of these output files are printed out by the main Rscript): -1. `*-METRICS_WITHIN_POP-{uniqueID}.csv`: table of genomic prediction accuracy metrics across replications, folds, and models per population. -2. `*-YPRED_WITHIN_POP-{uniqueID}.csv`: table of predicted and expected phenotypes across samples, replications and models per population. -3. `*-METRICS_ACROSS_POP-{uniqueID}.csv`: table of genomic prediction accuracy metrics across 1 replication, 1 fold, and all models per validation population **(present if there are at least 2 populations)**. -4. `*-YPRED_ACROSS_POP-{uniqueID}.csv`: table of predicted and expected phenotypes across samples, 1 replication and all models per validation population **(present if there are at least 2 populations)**. -5. `*-GENOMIC_PREDICTIONS-{uniqueID}.csv`: table of predicted phenotypes of samples with known genotype and missing phenotypes **(present if there are samples with known genotype and unknown phenotype data)**. +-{time and random integer}.Rds + +1. `*-ETRICS_WITHIN_POP`: table of genomic prediction accuracy metrics across replications, folds, and models per population. +2. `*-PRED_WITHIN_POP`: table of predicted and expected phenotypes across samples, replications and models per population. +3. `*-ETRICS_ACROSS_POP`: table of genomic prediction accuracy metrics across 1 replication, 1 fold, and all models per validation population, i.e., via leave-one-population-out cross-validation **(present if there are at least 2 populations)**. +4. `*-PRED_ACROSS_POP`: table of predicted and expected phenotypes across samples, 1 replication and all models per validation population, i.e., via leave-one-population-out cross-validation **(present if there are at least 2 populations)**. +5. `*-ENOMIC_PREDICTIONS`: table of predicted phenotypes of samples with known genotype and missing phenotypes **(present if there are samples with known genotype and unknown phenotype data)**. With the following fields: diff --git a/src/load.R b/src/load.R index cded53b..42bef5d 100644 --- a/src/load.R +++ b/src/load.R @@ -115,7 +115,12 @@ fn_load_phenotype = function(fname_csv_txt, sep=",", header=TRUE, idx_col_id=1, pop = as.character(df[, idx_col_pop]) y = df[, idx_col_y] names(y) = entry - return(list(y=y, pop=pop)) + if (header==TRUE) { + trait_name = colnames(df)[idx_col_y] + } else { + trait_name = paste0("trait_", idx_col_y) + } + return(list(y=y, pop=pop, trait_name=trait_name)) } fn_filter_outlying_phenotypes = function(list_y_pop, verbose=FALSE) { @@ -124,6 +129,7 @@ fn_filter_outlying_phenotypes = function(list_y_pop, verbose=FALSE) { # sigma = 1 y = list_y_pop$y pop = list_y_pop$pop + trait_name = list_y_pop$trait_name ### Identify outliers with boxplot, i.e. values beyond -2.698 standard deviations (definition of R::boxplot whiskers) b = boxplot(y, plot=FALSE) idx = which(!(y %in% b$out)) @@ -135,7 +141,7 @@ fn_filter_outlying_phenotypes = function(list_y_pop, verbose=FALSE) { print(paste0("n=", length(y[idx]))) txtplot::txtdensity(y[idx][!is.na(y[idx])]) } - return(list(y=y[idx], pop=pop[idx])) + return(list(y=y[idx], pop=pop[idx], trait_name=trait_name)) } ### Merge genotype and phenotype data by their entry names, i.e. rownames for G and names for y @@ -272,6 +278,7 @@ tests_load = function() { yex = c(0.32, 0.67, 0.93); names(yex) = paste0("Entry-", 1:3) expect_equal(y, yex) expect_equal(list_y_pop$pop, c("pop-A", "pop-A", "pop-A")) + expect_equal(list_y_pop$trait_name, "yield") } ) diff --git a/src/main.R b/src/main.R index c8fe03a..446e6a6 100644 --- a/src/main.R +++ b/src/main.R @@ -4,23 +4,24 @@ parser = ArgumentParser() parser$add_argument("-g", "--genotype-file", dest="fname_rds_or_vcf", type="character", help="Genotype file in Rds or vcf format. The Rds file needs to contain an n samples x p loci numeric matrix with row and column names corresponding to the samples and loci (chromosome or scaffold names, position and allele separated by underscores, e.g. 'chr1_12345_A') names. The vcf file needs to have either the genotype call (**GT**) or allele depth (**AD**) field, where the AD field has precedence is both are present (i.e., allele frequencies are first-class citizens rather than binary genotypes). The sample names should be the same as the names in the phenotype file.") parser$add_argument("-p", "--phenotype-file", dest="fname_pheno", type="character", help="Phenotype data in text-delimited format, e.g. tab-delimited (`*.txt`) and comma-separated (`*.csv`) which may or may not have a header line. At least three columns are required: a column for the names of the samples, a column for the population or grouping ID of each sample, and a column for the numeric phenotype data (which can have missing data coded as whatever is convenient).") parser$add_argument("-c", "--covariate-file", dest="fname_covar_rds", type="character", default=NULL, help="**NOT IMPLEMENTED YET** | Covariate matrix where the row names overlap with the row names (sample names) of the genotype and phenotype files. [Default=NULL]") -parser$add_argument("--retain-minus-one-alleles-per-locus", type="logical", default=TRUE, help="Remove the minor allele from the genotype matrix? [Default=TRUE]") +parser$add_argument("--retain-minus-one-alleles-per-locus", dest="retain_minus_one_alleles_per_locus", type="logical", default=TRUE, help="Remove the minor allele from the genotype matrix? [Default=TRUE]") parser$add_argument("--phenotype-delimiter", dest="sep", type="character", default=",", help="Delimiter of the phenotype file [Default=',']") parser$add_argument("--phenotype-header", dest="header", type="logical", default=TRUE, help="Does the phenotype file have a header line? [Default=TRUE]") parser$add_argument("--phenotype-column-id", dest="idx_col_id", type="numeric", default=1, help="Which column are the sample names located? [Default=1]") parser$add_argument("--phenotype-column-pop", dest="idx_col_pop", type="numeric", default=2, help="Which column are the population or group names located? [Default=2]") -parser$add_argument("--phenotype-column-data", dest="idx_col_y", type="numeric", default=3, help="Which column are the phenotype data located? [Default=3]") +parser$add_argument("--phenotype-column-data", dest="idx_col_y", type="character", default=3, help="Which column are the phenotype data located? [Default=3]") parser$add_argument("--phenotype-missing-strings", dest="na.strings", type="character", default=c("", "-", "NA", "na", "NaN", "missing", "MISSING"), help="How were the missing data in the phenotype file coded? [Default=c('', '-', 'NA', 'na', 'NaN', 'missing', 'MISSING')]") parser$add_argument("--models", dest="models_to_test", type="character", default=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C"), help="Genomic prediction models to use [Default=c('ridge','lasso','elastic_net','Bayes_A','Bayes_B','Bayes_C')]") parser$add_argument("--k-folds", dest="k_folds", type="numeric", default=10, help="Number of folds to perform in within population k-fold cross-validation [Default=10]") parser$add_argument("--n-reps", dest="n_reps", type="numeric", default=10, help="Number of reshuffled replications per fold of within population k-fold cross-validation [Default=3]") parser$add_argument("--n-threads", dest="n_threads", type="numeric", default=8, help="Number of computing threads to use in within and across population cross-validations [Default=8]") +parser$add_argument("-o", "--output-file-prefix", dest="output_file_prefix", type="character", default="", help="Filename of the output Rds file containing a list of metrics and predicted phenotype dataframes [Default='']") parser$add_argument("--verbose", dest="verbose", type="logical", default=FALSE, help="Do you wish to print detailed progress of the cross-validation including scatter plots of expected vs predicted phenotypes? [Default=FALSE]") args = parser$parse_args() -# args = list(fname_pheno="test/grape_pheno.txt", fname_rds_or_vcf="test/grape.vcf", fname_covar_rds=NULL, header=TRUE, idx_col_id=1, idx_col_pop=2, idx_col_y=3, na.strings=c("","-","NA","na","NaN","missing","MISSING"), retain_minus_one_alleles_per_locus=TRUE, sep="\t", models_to_test=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C"), k_folds=5, n_reps=3, n_threads=8, verbose=FALSE) -# args = list(fname_pheno="test/grape_pheno.txt", fname_rds_or_vcf="test/grape.rds", fname_covar_rds=NULL, header=TRUE, idx_col_id=1, idx_col_pop=2, idx_col_y=3, na.strings=c("","-","NA","na","NaN","missing","MISSING"), retain_minus_one_alleles_per_locus=TRUE, sep="\t", models_to_test=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C"), k_folds=5, n_reps=3, n_threads=8, verbose=FALSE) -# args = list(fname_pheno="test/grape_pheno.txt", fname_rds_or_vcf="test/grape.vcf", fname_covar_rds="test/grape_covar.rds", header=TRUE, idx_col_id=1, idx_col_pop=2, idx_col_y=3, na.strings=c("","-","NA","na","NaN","missing","MISSING"), retain_minus_one_alleles_per_locus=TRUE, sep="\t", models_to_test=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C"), k_folds=5, n_reps=3, n_threads=8, verbose=FALSE) -# args = list(fname_pheno="test/grape_pheno.txt", fname_rds_or_vcf="test/grape.vcf", fname_covar_rds=NULL, header=TRUE, idx_col_id=1, idx_col_pop=2, idx_col_y=4, na.strings=c("","-","NA","na","NaN","missing","MISSING"), retain_minus_one_alleles_per_locus=TRUE, sep="\t", models_to_test=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C"), k_folds=3, n_reps=3, n_threads=8, verbose=TRUE) +# args = list(fname_pheno="test/grape_pheno.txt", fname_rds_or_vcf="test/grape.vcf", fname_covar_rds=NULL, header=TRUE, idx_col_id=1, idx_col_pop=2, idx_col_y="3", na.strings=c("","-","NA","na","NaN","missing","MISSING"), retain_minus_one_alleles_per_locus=TRUE, sep="\t", models_to_test=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C"), k_folds=5, n_reps=3, n_threads=8, output_file_prefix="", verbose=FALSE) +# args = list(fname_pheno="test/grape_pheno.txt", fname_rds_or_vcf="test/grape.rds", fname_covar_rds=NULL, header=TRUE, idx_col_id=1, idx_col_pop=2, idx_col_y="3", na.strings=c("","-","NA","na","NaN","missing","MISSING"), retain_minus_one_alleles_per_locus=TRUE, sep="\t", models_to_test=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C"), k_folds=5, n_reps=3, n_threads=8, output_file_prefix="", verbose=FALSE) +# args = list(fname_pheno="test/grape_pheno.txt", fname_rds_or_vcf="test/grape.vcf", fname_covar_rds="test/grape_covar.rds", header=TRUE, idx_col_id=1, idx_col_pop=2, idx_col_y="3", na.strings=c("","-","NA","na","NaN","missing","MISSING"), retain_minus_one_alleles_per_locus=TRUE, sep="\t", models_to_test=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C"), k_folds=5, n_reps=3, n_threads=8, output_file_prefix="", verbose=FALSE) +# args = list(fname_pheno="test/grape_pheno.txt", fname_rds_or_vcf="test/grape.vcf", fname_covar_rds=NULL, header=TRUE, idx_col_id=1, idx_col_pop=2, idx_col_y="3,4", na.strings=c("","-","NA","na","NaN","missing","MISSING"), retain_minus_one_alleles_per_locus=TRUE, sep="\t", models_to_test=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C"), k_folds=3, n_reps=3, n_threads=8, output_file_prefix="", verbose=TRUE) print("#######################################################################################################################") print("(。•̀ᴗ-)✧(✿◠‿◠)(◕‿◕✿)(ง'̀-'́)ง(つ▀¯▀)つ(。♥‿♥。)(っ◔◡◔)っ ♥(。•̀ᴗ-)✧(✿◠‿◠)(◕‿◕✿)(ง'̀-'́)ง(つ▀¯▀)つ(。♥‿♥。)(っ◔◡◔)っ ♥") print("GENOMIC PREDICTION / SELECTION CROSS-VALIDATION WITHIN (K-FOLD) AND ACROSS (LEAVE-ONE-OUT) POPULATIONS") @@ -34,7 +35,6 @@ print(args) args_dummy = commandArgs(trailingOnly=FALSE) dir_src = dirname(gsub("^--file=", "", args_dummy[grepl("--file=", args_dummy)])) # dir_src = "/group/pasture/Jeff/genomic_selection/src" -print(dir_src) source(paste0(dir_src, "/load.R")) source(paste0(dir_src, "/metrics.R")) source(paste0(dir_src, "/models.R")) @@ -44,243 +44,276 @@ source(paste0(dir_src, "/cross_validation.R")) print("#################") print("Load genotype") print("#################") -G = fn_load_genotype(fname_rds_or_vcf=args$fname_rds_or_vcf, +X = fn_load_genotype(fname_rds_or_vcf=args$fname_rds_or_vcf, retain_minus_one_alleles_per_locus=args$retain_minus_one_alleles_per_locus) ### Deal with missing data (stop and impute using an external tool or continue using mean value imputation) -if (sum(is.na(G)) > 0) { +if (sum(is.na(X)) > 0) { print("Please consider stopping this analysis to impute missing data using an appropriate imputation tool on your dataset.") print("Missing data detected and will be naively imputed using mean genotypes per allele/locus.") print("You have 30 seconds to stare at this message and cancel anytime before or after this pause.") Sys.sleep(30) - for (j in 1:ncol(G)) { - idx = which(is.na(G[,j])) + for (j in 1:ncol(X)) { + idx = which(is.na(X[,j])) if (length(idx) > 0) { - mu = mean(G[,j], na.rm=TRUE) + mu = mean(X[,j], na.rm=TRUE) if (is.na(mu)) { - mu = mean(G, na.rm=TRUE) + mu = mean(X, na.rm=TRUE) } - G[idx, j] = mu + X[idx, j] = mu } } } -### Load phenotype data -print("#################") -print("Load phenotype") -print("#################") -list_y_pop = fn_load_phenotype(fname_csv_txt=args$fname_pheno, - sep=args$sep, - header=args$header, - idx_col_id=args$idx_col_id, - idx_col_pop=args$idx_col_pop, - idx_col_y=args$idx_col_y, - na.strings=args$na.strings) -### Remove phenotype outlier/s -list_y_pop = fn_filter_outlying_phenotypes(list_y_pop=list_y_pop, - verbose=args$verbose) -### Load covariate -if (is.null(args$fname_covar_rds) == TRUE) { - COVAR = NULL -} else { + +### Parse the list of column indexes referring to the phenotypes which will be iteratively processed +vec_idx_col_y = as.numeric(unlist(strsplit(args$idx_col_y, ","))) + +### Genomic prediction cross-validation within and across populations to find the best model and prediction of missing phenotypes with genotype data +GP_PER_PHENOTYPE = function(G, idx_col_y, args) { + # G=X; idx_col_y=vec_idx_col_y[1]; args=args + ### Load phenotype data print("#################") - print("Load covariate") + print("Load phenotype") print("#################") - COVAR = readRDS(args$fname_covar_rds) -} -### Merge genotype, phenotype, and covariate matrices, where they all need to be matrices with overlapping rownames i.e., sample names (requirement of fn_cross_validation(...)) -print("#################") -print("Merge data") -print("#################") -list_G_list_y_pop_COVAR = fn_merge_genotype_and_phenotype(G=G, - list_y_pop=list_y_pop, - COVAR=COVAR, - verbose=args$verbose) -idx_no_missing = which(!is.na(list_G_list_y_pop_COVAR$list_y_pop$y)) -y = matrix(list_G_list_y_pop_COVAR$list_y_pop$y[idx_no_missing], ncol=1) -G = list_G_list_y_pop_COVAR$G[idx_no_missing, ] -rownames(y) = names(list_G_list_y_pop_COVAR$list_y_pop$y[idx_no_missing]) -pop = list_G_list_y_pop_COVAR$list_y_pop$pop[idx_no_missing] -COVAR = list_G_list_y_pop_COVAR$COVAR[idx_no_missing, ] -### k-fold cross-validation within population -print("##################################################") -print("Within population cross-validation:") -print("##################################################") -df_metrics = NULL -df_y_pred = NULL -vec_fnames_metrics = c() -vec_fnames_y_pred = c() -vec_pop = sort(unique(pop)) -for (pop_id in vec_pop) { - # pop_id = vec_pop[1] - print("-----------------------------------") - print(paste0("Population: ", pop_id)) - idx = which(pop == pop_id) - prefix = gsub(".vcf", "", gsub(".Rds", "", gsub(".rds$", "", args$fname_rds_or_vcf))) - list_out = fn_cross_validation(G=G[idx, , drop=FALSE], - y=y[ idx, , drop=FALSE], - COVAR=COVAR[idx, , drop=FALSE], - models_to_test=args$models_to_test, - k_folds=args$k_folds, - n_reps=args$n_reps, - n_threads=args$n_threads, - prefix_tmp=paste0(prefix, "-pop_", pop_id), - verbose=args$verbose) - if (length(list_out$df_metrics) == 1) { - ### Quit, if cross-validation failed because the number size of each set is less than 2. - ### The error message from fn_cross_validation(...) will recommend decreasing k_folds. - if (is.na(list_out$df_metrics)) { - quit() + list_y_pop = fn_load_phenotype(fname_csv_txt=args$fname_pheno, + sep=args$sep, + header=args$header, + idx_col_id=args$idx_col_id, + idx_col_pop=args$idx_col_pop, + idx_col_y=idx_col_y, + na.strings=args$na.strings) + ### Remove phenotype outlier/s + list_y_pop = fn_filter_outlying_phenotypes(list_y_pop=list_y_pop, + verbose=args$verbose) + ### Load covariate + if (is.null(args$fname_covar_rds) == TRUE) { + COVAR = NULL + } else { + print("#################") + print("Load covariate") + print("#################") + COVAR = readRDS(args$fname_covar_rds) + } + ### Merge genotype, phenotype, and covariate matrices, where they all need to be matrices with overlapping rownames i.e., sample names (requirement of fn_cross_validation(...)) + print("#################") + print("Merge data") + print("#################") + list_G_list_y_pop_COVAR = fn_merge_genotype_and_phenotype(G=G, + list_y_pop=list_y_pop, + COVAR=COVAR, + verbose=args$verbose) + idx_no_missing = which(!is.na(list_G_list_y_pop_COVAR$list_y_pop$y)) + y = matrix(list_G_list_y_pop_COVAR$list_y_pop$y[idx_no_missing], ncol=1) + G = list_G_list_y_pop_COVAR$G[idx_no_missing, ] + rownames(y) = names(list_G_list_y_pop_COVAR$list_y_pop$y[idx_no_missing]) + pop = list_G_list_y_pop_COVAR$list_y_pop$pop[idx_no_missing] + COVAR = list_G_list_y_pop_COVAR$COVAR[idx_no_missing, ] + + ### k-fold cross-validation within population + print("##################################################") + print("Within population cross-validation:") + print("##################################################") + df_metrics = NULL + df_y_pred = NULL + vec_fnames_metrics = c() + vec_fnames_y_pred = c() + vec_pop = sort(unique(pop)) + for (pop_id in vec_pop) { + # pop_id = vec_pop[1] + print("-----------------------------------") + print(paste0("Population: ", pop_id)) + idx = which(pop == pop_id) + prefix = gsub(".vcf.gz$", "", ignore.case=TRUE, gsub(".vcf$", "", ignore.case=TRUE, gsub(".rds$", "", ignore.case=TRUE, args$fname_rds_or_vcf))) + list_out = fn_cross_validation(G=G[idx, , drop=FALSE], + y=y[ idx, , drop=FALSE], + COVAR=COVAR[idx, , drop=FALSE], + models_to_test=args$models_to_test, + k_folds=args$k_folds, + n_reps=args$n_reps, + n_threads=args$n_threads, + prefix_tmp=paste0(prefix, "-pop_", pop_id), + verbose=args$verbose) + if (length(list_out$df_metrics) == 1) { + ### Quit, if cross-validation failed because the number size of each set is less than 2. + ### The error message from fn_cross_validation(...) will recommend decreasing k_folds. + if (is.na(list_out$df_metrics)) { + quit() + } } + list_out$df_metrics$pop = pop_id + vec_fnames_metrics = c(vec_fnames_metrics, list_out$fnames_metrics) + vec_fnames_y_pred = c(vec_fnames_y_pred, list_out$fnames_y_pred) + if (is.null(df_metrics)) { + df_metrics = list_out$df_metrics + df_y_pred = list_out$df_y_pred + } else { + df_metrics = rbind(df_metrics, list_out$df_metrics) + df_y_pred = rbind(df_y_pred, list_out$df_y_pred) + } + } + ### Cleanup + for (i in 1:length(vec_fnames_metrics)) { + system(paste("rm ", vec_fnames_metrics[i], vec_fnames_y_pred[i])) } - list_out$df_metrics$pop = pop_id - vec_fnames_metrics = c(vec_fnames_metrics, list_out$fnames_metrics) - vec_fnames_y_pred = c(vec_fnames_y_pred, list_out$fnames_y_pred) - if (is.null(df_metrics)) { - df_metrics = list_out$df_metrics - df_y_pred = list_out$df_y_pred + + ### Leave-one-population-out cross-validation + skip_lopo_cv = length(vec_pop) == 1 + if (skip_lopo_cv == TRUE) { + print("Skipping leave-one-population-out cross-validation as only one population was supplied.") } else { - df_metrics = rbind(df_metrics, list_out$df_metrics) - df_y_pred = rbind(df_y_pred, list_out$df_y_pred) + print("##################################################") + print("Across population cross-validation:") + print("(Leave-one-population-out CV)") + print("##################################################") + list_out = fn_leave_one_population_out_cross_validation(G=G, + y=y, + pop=pop, + COVAR=COVAR, + models_to_test=args$models_to_test, + n_threads=args$n_threads, + prefix_tmp=paste0(prefix, "-LOPO"), + verbose=args$verbose) + ### Cleanup + for (i in 1:length(list_out$fnames_metrics)) { + system(paste("rm ", list_out$fnames_metrics[i], list_out$fnames_y_pred[i])) + } } -} -suffix = paste0(gsub("/", "", format(Sys.time(), format="%D%H%M%S")), round(runif(n=1, min=1e6, max=9e6))) -fname_within_cv_metrics = paste0(prefix, "-METRICS_WITHIN_POP-", suffix, ".csv") -fname_within_cv_ypred = paste0(prefix, "-YPRED_WITHIN_POP-", suffix, ".csv") -write.table(df_metrics, file=fname_within_cv_metrics, sep=",", quote=FALSE, row.names=FALSE, col.names=TRUE) -write.table(df_y_pred, file=fname_within_cv_ypred, sep=",", quote=FALSE, row.names=FALSE, col.names=TRUE) -### Cleanup -for (i in 1:length(vec_fnames_metrics)) { - # i = 1 - system(paste("rm ", vec_fnames_metrics[i], vec_fnames_y_pred[i])) -} -### Leave-one-population-out cross-validation -if (length(vec_pop) == 1) { - print("Skipping leave-one-population-out cross-validation as only one population was supplied.") - quit() -} -print("##################################################") -print("Across population cross-validation:") -print("##################################################") -list_out = fn_leave_one_population_out_cross_validation(G=G, - y=y, - pop=pop, - COVAR=COVAR, - models_to_test=args$models_to_test, - n_threads=args$n_threads, - prefix_tmp=paste0(prefix, "-LOPO"), - verbose=args$verbose) -fname_across_cv_metrics = paste0(prefix, "-METRICS_ACROSS_POP-", suffix, ".csv") -fname_across_cv_ypred = paste0(prefix, "-YPRED_ACROSS_POP-", suffix, ".csv") -write.table(list_out$df_metrics, file=fname_across_cv_metrics, sep=",", quote=FALSE, row.names=FALSE, col.names=TRUE) -write.table(list_out$df_y_pred, file=fname_across_cv_ypred, sep=",", quote=FALSE, row.names=FALSE, col.names=TRUE) -### Cleanup -for (i in 1:length(list_out$fnames_metrics)) { - # i = 1 - system(paste("rm ", list_out$fnames_metrics[i], list_out$fnames_y_pred[i])) -} -print("##################################################") -print("Finished cross-validation tests successfully!") -print("##################################################") -print("Please find the following output files:") -vec_fnames_out = list.files(path=dirname(prefix), pattern=basename(prefix)) -vec_fnames_out = vec_fnames_out[grepl(paste0("_POP-", suffix, ".csv$"), vec_fnames_out)] -for (f in vec_fnames_out) { - print(paste0(" - ", getwd(), "/", dirname(prefix), "/", f)) -} -print("##################################################") -print("Identifying the best models per population") -print("(using Pearson's correlation)") -print("##################################################") -### Only considering within population k-fold cross-validation to identify the best model per population -### Using Pearson's correlation as the genomic prediction accuracy metric -agg_corr = aggregate(corr ~ model + pop, FUN=mean, data=df_metrics) -agg_corr_sd = aggregate(corr ~ model + pop, FUN=sd, data=df_metrics) -agg_corr = merge(agg_corr, agg_corr_sd, by=c("model", "pop")) -colnames(agg_corr) = c("model", "pop", "corr", "corr_sd") -vec_pop = c() -vec_model = c() -vec_corr = c() -vec_corr_sd = c() -for (pop in unique(agg_corr$pop)) { - agg_sub = agg_corr[agg_corr$pop==pop, ] - idx = which(agg_sub$corr == max(agg_sub$corr, na.rm=TRUE))[1] - model = as.character(agg_sub$model[idx]) - corr = agg_sub$corr[idx] - corr_sd = agg_sub$corr_sd[idx] - vec_pop = c(vec_pop, pop) - vec_model = c(vec_model, model) - vec_corr = c(vec_corr, corr) - vec_corr_sd = c(vec_corr_sd, corr_sd) -} -### Append overall best model across all populations -agg_overall_mean = aggregate(corr ~ model, FUN=mean, data=df_metrics) -agg_overall_sdev = aggregate(corr ~ model, FUN=sd, data=df_metrics) -agg_overall = merge(agg_overall_mean, agg_overall_sdev, by="model") -colnames(agg_overall) = c("model", "corr", "corr_sd") -agg_overall = agg_overall[which(agg_overall[,2]==max(agg_overall[,2], na.rm=TRUE))[1], ] -vec_pop = c(vec_pop, "overall") -vec_model = c(vec_model, as.character(agg_overall$model)) -vec_corr = c(vec_corr, agg_overall$corr) -vec_corr_sd = c(vec_corr_sd, agg_overall$corr_sd) -df_top_models = data.frame(pop=vec_pop, model=vec_model, corr=vec_corr, corr_sd=vec_corr_sd) -print(df_top_models) + ### Best models per population + print("##################################################") + print("Identifying the best models per population") + print("(using Pearson's correlation)") + print("##################################################") + ### Only considering within population k-fold cross-validation to identify the best model per population + ### Using Pearson's correlation as the genomic prediction accuracy metric + agg_corr = aggregate(corr ~ model + pop, FUN=mean, data=df_metrics) + agg_corr_sd = aggregate(corr ~ model + pop, FUN=sd, data=df_metrics) + agg_corr = merge(agg_corr, agg_corr_sd, by=c("model", "pop")) + colnames(agg_corr) = c("model", "pop", "corr", "corr_sd") + vec_pop = c() + vec_model = c() + vec_corr = c() + vec_corr_sd = c() + for (pop in unique(agg_corr$pop)) { + agg_sub = agg_corr[agg_corr$pop==pop, ] + idx = which(agg_sub$corr == max(agg_sub$corr, na.rm=TRUE))[1] + model = as.character(agg_sub$model[idx]) + corr = agg_sub$corr[idx] + corr_sd = agg_sub$corr_sd[idx] + vec_pop = c(vec_pop, pop) + vec_model = c(vec_model, model) + vec_corr = c(vec_corr, corr) + vec_corr_sd = c(vec_corr_sd, corr_sd) + } + ### Append overall best model across all populations + agg_overall_mean = aggregate(corr ~ model, FUN=mean, data=df_metrics) + agg_overall_sdev = aggregate(corr ~ model, FUN=sd, data=df_metrics) + agg_overall = merge(agg_overall_mean, agg_overall_sdev, by="model") + colnames(agg_overall) = c("model", "corr", "corr_sd") + agg_overall = agg_overall[which(agg_overall[,2]==max(agg_overall[,2], na.rm=TRUE))[1], ] + vec_pop = c(vec_pop, "overall") + vec_model = c(vec_model, as.character(agg_overall$model)) + vec_corr = c(vec_corr, agg_overall$corr) + vec_corr_sd = c(vec_corr_sd, agg_overall$corr_sd) + df_top_models = data.frame(pop=vec_pop, model=vec_model, corr=vec_corr, corr_sd=vec_corr_sd) + print(df_top_models) -print("##################################################") -print("Genomic prediction per se using the best models") -print("(predict missing phenotypes using known genotypes)") -print("##################################################") -idx_entries_for_genomic_prediction = which(is.na(list_G_list_y_pop_COVAR$list_y_pop$y)) -n = length(idx_entries_for_genomic_prediction) -if (n == 0) { - print("No entries with missing phenotype and known genotypes detected in the input dataset.") - print("Exiting.") - quit() -} -print(paste0("For each entry belonging to a population assessed via within population ", args$k_folds, "-fold cross-validation,")) -print("the best model in terms of Person's correlation will be used to predict their unknown phenotypes.") -vec_all_pop = list_G_list_y_pop_COVAR$list_y_pop$pop -vec_pop = c() -vec_entry = c() -vec_y_pred = c() -vec_model = c() -vec_corr = c() -vec_corr_sd = c() -vec_corr_pop = c() -for (pop in unique(vec_all_pop[idx_entries_for_genomic_prediction])) { - # pop = unique(vec_all_pop[idx_entries_for_genomic_prediction])[1] - idx = which(vec_all_pop==pop) - G = list_G_list_y_pop_COVAR$G[idx, ] - y = matrix(list_G_list_y_pop_COVAR$list_y_pop$y[idx], ncol=1) - rownames(y) = names(list_G_list_y_pop_COVAR$list_y_pop$y[idx]) - idx_training = which(!is.na(y[,1])) - idx_validation = which(is.na(y[,1])) - idx_top_model = which(df_top_models$pop == pop)[1] - if (length(idx_top_model)==0) { - ### Use the overall top model if the entries do not belong to any other populations - idx_top_model = which(df_top_models$pop == "overall")[1] + ### Genomic prediction per se + idx_entries_for_genomic_prediction = which(is.na(list_G_list_y_pop_COVAR$list_y_pop$y)) + skip_gp_perse = length(idx_entries_for_genomic_prediction) == 0 + if (skip_gp_perse == TRUE) { + print("No entries with missing phenotype and known genotypes detected in the input dataset.") + } else { + print("##################################################") + print("Genomic prediction per se using the best models") + print("(predict missing phenotypes using known genotypes)") + print("##################################################") + print(paste0("For each entry belonging to a population assessed via within population ", args$k_folds, "-fold cross-validation,")) + print("the best model in terms of Person's correlation will be used to predict their unknown phenotypes.") + vec_all_pop = list_G_list_y_pop_COVAR$list_y_pop$pop + vec_pop = c() + vec_entry = c() + vec_y_pred = c() + vec_model = c() + vec_corr = c() + vec_corr_sd = c() + vec_corr_pop = c() + for (pop in unique(vec_all_pop[idx_entries_for_genomic_prediction])) { + # pop = unique(vec_all_pop[idx_entries_for_genomic_prediction])[1] + idx = which(vec_all_pop==pop) + G = list_G_list_y_pop_COVAR$G[idx, ] + y = matrix(list_G_list_y_pop_COVAR$list_y_pop$y[idx], ncol=1) + rownames(y) = names(list_G_list_y_pop_COVAR$list_y_pop$y[idx]) + idx_training = which(!is.na(y[,1])) + idx_validation = which(is.na(y[,1])) + idx_top_model = which(df_top_models$pop == pop)[1] + if (length(idx_top_model)==0) { + ### Use the overall top model if the entries do not belong to any other populations + idx_top_model = which(df_top_models$pop == "overall")[1] + } + model = df_top_models$model[idx_top_model] + corr = df_top_models$corr[idx_top_model] + corr_sd = df_top_models$corr_sd[idx_top_model] + corr_pop = df_top_models$pop[idx_top_model] + gp = eval(parse(text=paste0("fn_", model, "(G=G, y=y, idx_training=idx_training, idx_validation=idx_validation)"))) + vec_pop = c(vec_pop, rep(pop, length(gp$y_pred))) + if (is.null(names(gp$y_pred))) { + vec_entry = c(vec_entry, rownames(gp$y_pred)) + } else { + vec_entry = c(vec_entry, names(gp$y_pred)) + } + vec_y_pred = c(vec_y_pred, gp$y_pred) + vec_model = c(vec_model, rep(model, length(gp$y_pred))) + vec_corr = c(vec_corr, rep(corr, length(gp$y_pred))) + vec_corr_sd = c(vec_corr_sd, rep(corr_sd, length(gp$y_pred))) + vec_corr_pop = c(vec_corr_pop, rep(corr_pop, length(gp$y_pred))) + } + df_genomic_predictions = data.frame(pop=vec_pop, entry=vec_entry, y_pred=vec_y_pred, top_model=vec_model, corr_from_kfoldcv=paste0(round(vec_corr, 2), "(±", round(vec_corr_sd, 2), "|", vec_corr_pop, ")")) + } + if (skip_lopo_cv == TRUE) { + METRICS_ACROSS_POP = NULL + YPRED_ACROSS_POP = NULL + } else { + METRICS_ACROSS_POP = list_out$df_metrics + YPRED_ACROSS_POP = list_out$df_y_pred } - model = df_top_models$model[idx_top_model] - corr = df_top_models$corr[idx_top_model] - corr_sd = df_top_models$corr_sd[idx_top_model] - corr_pop = df_top_models$pop[idx_top_model] - gp = eval(parse(text=paste0("fn_", model, "(G=G, y=y, idx_training=idx_training, idx_validation=idx_validation)"))) - vec_pop = c(vec_pop, rep(pop, length(gp$y_pred))) - if (is.null(names(gp$y_pred))) { - vec_entry = c(vec_entry, rownames(gp$y_pred)) + if (skip_gp_perse == TRUE) { + GENOMIC_PREDICTIONS = NULL } else { - vec_entry = c(vec_entry, names(gp$y_pred)) + GENOMIC_PREDICTIONS = df_genomic_predictions } - vec_y_pred = c(vec_y_pred, gp$y_pred) - vec_model = c(vec_model, rep(model, length(gp$y_pred))) - vec_corr = c(vec_corr, rep(corr, length(gp$y_pred))) - vec_corr_sd = c(vec_corr_sd, rep(corr_sd, length(gp$y_pred))) - vec_corr_pop = c(vec_corr_pop, rep(corr_pop, length(gp$y_pred))) + out_per_phenotype = list( + TRAIT_NAME = list_y_pop$trait_name, + METRICS_WITHIN_POP = df_metrics, + YPRED_WITHIN_POP = df_y_pred, + METRICS_ACROSS_POP = METRICS_ACROSS_POP, + YPRED_ACROSS_POP = YPRED_ACROSS_POP, + GENOMIC_PREDICTIONS = GENOMIC_PREDICTIONS + ) + return(out_per_phenotype) +} + +### Perform genomic predictions per phenotype +out = list() +for (idx_col_y in c(vec_idx_col_y)) { + gp = GP_PER_PHENOTYPE(G=X, idx_col_y=idx_col_y, args=args) + trait_name = gp$TRAIT_NAME + eval(parse(text=paste0("out$", trait_name, " = gp"))) +} + +### Output Rds of list of dataframes +suffix = paste0(gsub("/", "", format(Sys.time(), format="%D%H%M%S")), round(runif(n=1, min=1e6, max=9e6))) +if (args$output_file_prefix == "") { + output_file_rds = paste0(gsub(".vcf.gz$", "", ignore.case=TRUE, gsub(".vcf$", "", ignore.case=TRUE, gsub(".rds$", "", ignore.case=TRUE, args$fname_rds_or_vcf))), + "-GENOMIC_PREDICTION_CV_GEBV-", suffix, ".rds") +} else { + output_file_rds = gsub(".rds$", paste0("-", suffix, ".Rds"), ignore.case=TRUE, output_file_rds) } -df_genomic_predictions = data.frame(pop=vec_pop, entry=vec_entry, y_pred=vec_y_pred, top_model=vec_model, corr_from_kfoldcv=paste0(round(vec_corr, 2), "(±", round(vec_corr_sd, 2), "|", vec_corr_pop, ")")) -fname_genomic_predictions = paste0(prefix, "-GENOMIC_PREDICTIONS-", suffix, ".csv") -write.table(df_genomic_predictions, file=fname_genomic_predictions, sep=",", quote=FALSE, row.names=FALSE, col.names=TRUE) +saveRDS(out, file=output_file_rds) print("##################################################") -print("Finished genomic prediction successfully!") +print("Finished successfully!") print("##################################################") -print("Please find the predicted phenotypes in the following output file:") -print(paste0(" ", fname_genomic_predictions)) +print("Please find the output Rds file containing a list of lists of data frames (top level list refer to each trait used) consisting of:") +print(paste0(" - within population ", args$k_folds, "-fold cross-validation metrics and predicted-expected phenotypes (2 data frames)")) +print(" - leave-one-population-out cross-validation metrics and predicted-expected phenotypes, if there are more than 1 population with known genotype and phenotype data supplied (another 2 data frames)") +print(" - predicted phenotypes from samples with known genotype and unknown phenotype data (1 data frame)") +print(paste0("OUTPUT FILENAME: ", output_file_rds)) diff --git a/test/test.sh b/test/test.sh index a792957..e297efc 100644 --- a/test/test.sh +++ b/test/test.sh @@ -41,7 +41,7 @@ Rscript src/main.R \ -g "test/grape.rds" \ -p "test/grape_pheno.txt" \ --phenotype-delimiter "\t" \ - --phenotype-column-data 4 \ + --phenotype-column-data "3,4" \ --k-folds 5 \ --n-reps 1 \ --n-threads ${NTHREADS} \ diff --git a/test/test.slurm b/test/test.slurm index b4bb982..1b449c0 100644 --- a/test/test.slurm +++ b/test/test.slurm @@ -3,8 +3,8 @@ #SBATCH --account="dbiopast2" #SBATCH --ntasks=1 #SBATCH --cpus-per-task=32 -#SBATCH --mem=500G -#SBATCH --time=14-0:0:00 +#SBATCH --mem=250G +#SBATCH --time=1-0:0:00 ### Load the conda environment module load Miniconda3/22.11.1-1 conda init bash