fixed trait_names dropping during merging and fixed issues #3 and #4

jeffersonfparil · Jan 11, 2024 · 65c82eb · 65c82eb
1 parent 72ceea4
commit 65c82eb
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 19 deletions.
diff --git a/README.md b/README.md
@@ -133,45 +133,43 @@ Rscript src/main.R \
 6. Does the phenotype file have a header line? (`--phenotype-header`) [Default=TRUE]
 7. Which column contains the sample names? (`--phenotype-column-id`) [Default=1]
 8. Which column contains the population or group names? (`--phenotype-column-pop`) [Default=2]
-9. Which column contains the phenotype data? (`--phenotype-column-data`) [Default=3]
+9. Which column/s contains the phenotype data? If more than 2 columns are requested then seperate them with commas, e.g. "3,4". (`--phenotype-column-data`) [Default="3"]
 10. How are the missing data in the phenotype file coded? (`--phenotype-missing-strings`) [Default=c("", "-", "NA", "na", "NaN", "missing", "MISSING")]
 11. Genomic prediction models to use (`--models`) [Default=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C")]
 12. Number of folds to perform in within population k-fold cross-validation (`--k-folds`) [Default=10]
 13. Number of reshuffled replications per fold of within population k-fold cross-validation (`--n-reps`) [Default=3]
 14. Number of computing threads to use in within and across population cross-validations (`--n-threads`) [Default=8]
-15. Do you wish to print detailed progress reports of the workflow? (`--verbose`) [Default=FALSE]
+15. Prefix of the filename of the output Rds file containing a list of lists (each corresponding to a trait requested by `--phenotype-column-pop`) of metrics and predicted phenotype dataframes (`-o; --output-file-prefix`) [Default='']"
+16. Do you wish to print detailed progress reports of the workflow? (`--verbose`) [Default=FALSE]
 
 ## Output data
 
-Tables of genomic prediction cross-validation results and predicted phenotypes (full path and filenames of these output files are printed out by the main Rscript):
+Rds file containing a list of lists each corresponding to a trait requested by `--phenotype-column-pop`. Each list or trait consists of the following dataframes:
 
--{time and random integer}.Rds
+1. `METRICS_WITHIN_POP`: table of genomic prediction accuracy metrics across replications, folds, and models per population.
+2. `YPRED_WITHIN_POP`: table of predicted and expected phenotypes across samples, replications and models per population.
+3. `METRICS_ACROSS_POP`: table of genomic prediction accuracy metrics across 1 replication, 1 fold, and all models per validation population, i.e., via leave-one-population-out cross-validation **(present if there are at least 2 populations)**.
+4. `YPRED_ACROSS_POP`: table of predicted and expected phenotypes across samples, 1 replication and all models per validation population, i.e., via leave-one-population-out cross-validation **(present if there are at least 2 populations)**.
+5. `GENOMIC_PREDICTIONS`: table of predicted phenotypes of samples with known genotype and missing phenotypes **(present if there are samples with known genotype and unknown phenotype data)**.
 
-1. `*-ETRICS_WITHIN_POP`: table of genomic prediction accuracy metrics across replications, folds, and models per population.
-2. `*-PRED_WITHIN_POP`: table of predicted and expected phenotypes across samples, replications and models per population.
-3. `*-ETRICS_ACROSS_POP`: table of genomic prediction accuracy metrics across 1 replication, 1 fold, and all models per validation population, i.e., via leave-one-population-out cross-validation **(present if there are at least 2 populations)**.
-4. `*-PRED_ACROSS_POP`: table of predicted and expected phenotypes across samples, 1 replication and all models per validation population, i.e., via leave-one-population-out cross-validation **(present if there are at least 2 populations)**.
-5. `*-ENOMIC_PREDICTIONS`: table of predicted phenotypes of samples with known genotype and missing phenotypes **(present if there are samples with known genotype and unknown phenotype data)**.
+These dataframes have the following fields:
 
-With the following fields:
-
-`*-METRICS_*_POP-*.csv*` output fields:
+`METRICS_*_POP-*.csv*` output fields:
 | r | k | model | mbe | mae | rmse | r2 | corr | n_non_zero | duration_mins | pop (within) or validation_pop (across) |
 |:-:|:-:|:-----:|:---:|:---:|:----:|:--:|:----:|:----------:|:-------------:|:---------------------------------------:|
 |:-:|:-:|:-----:|:---:|:---:|:----:|:--:|:----:|:----------:|:-------------:|:---------------------------------------:|
 
-`*-YPRED_*_POP-*.csv` output fields:
+`YPRED_*_POP-*.csv` output fields:
 | entry | rep | model | y_pred | y_true |
 |:-----:|:---:|:-----:|:------:|:------:|
 |:-----:|:---:|:-----:|:------:|:------:|
 
-`*-GENOMIC_PREDICTIONS-*.csv` output fields:
+`GENOMIC_PREDICTIONS-*.csv` output fields:
 | pop | entry | y_pred | top_model | corr_from_kfoldcv |
 |:---:|:-----:|:------:|:---------:|:-----------------:|
 |:---:|:-----:|:------:|:---------:|:-----------------:|
 
-
-Defined as:
+These fields are defined as:
 
 - **r**: replication
 - **k**: fold

diff --git a/src/load.R b/src/load.R
@@ -150,6 +150,7 @@ fn_merge_genotype_and_phenotype = function(G, list_y_pop, COVAR=NULL, verbose=FA
     # y = fn_load_phenotype(fname_csv_txt=paste0("/group/pasture/Jeff/genomic_selection/test/test_pheno.csv"), header=TRUE, idx_col_id=1, idx_col_pop=2, idx_col_y=3)$y
     y = list_y_pop$y
     pop = list_y_pop$pop
+    trait_name = list_y_pop$trait_name
     if (is.null(COVAR) == FALSE) {
         idx_G = (rownames(G) %in% names(y)) & (rownames(G) %in% rownames(COVAR))
     } else {
@@ -194,7 +195,7 @@ fn_merge_genotype_and_phenotype = function(G, list_y_pop, COVAR=NULL, verbose=FA
             print("Covariate is null.")
         }
     }
-    return(list(G=G, list_y_pop=list(y=y, pop=pop), COVAR=COVAR))
+    return(list(G=G, list_y_pop=list(y=y, pop=pop, trait_name=trait_name), COVAR=COVAR))
 }
 
 ### Tests

diff --git a/src/main.R b/src/main.R
@@ -9,13 +9,13 @@ parser$add_argument("--phenotype-delimiter", dest="sep", type="character", defau
 parser$add_argument("--phenotype-header", dest="header", type="logical", default=TRUE, help="Does the phenotype file have a header line? [Default=TRUE]")
 parser$add_argument("--phenotype-column-id", dest="idx_col_id", type="numeric", default=1, help="Which column are the sample names located? [Default=1]")
 parser$add_argument("--phenotype-column-pop", dest="idx_col_pop", type="numeric", default=2, help="Which column are the population or group names located? [Default=2]")
-parser$add_argument("--phenotype-column-data", dest="idx_col_y", type="character", default=3, help="Which column are the phenotype data located? [Default=3]")
+parser$add_argument("--phenotype-column-data", dest="idx_col_y", type="character", default="3", help="Which column are the phenotype data located? If more than 2 columns are requested then seperate them with commas, e.g. '3,4'. [Default=3]")
 parser$add_argument("--phenotype-missing-strings", dest="na.strings", type="character", default=c("", "-", "NA", "na", "NaN", "missing", "MISSING"), help="How were the missing data in the phenotype file coded? [Default=c('', '-', 'NA', 'na', 'NaN', 'missing', 'MISSING')]")
 parser$add_argument("--models", dest="models_to_test", type="character", default=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C"), help="Genomic prediction models to use [Default=c('ridge','lasso','elastic_net','Bayes_A','Bayes_B','Bayes_C')]")
 parser$add_argument("--k-folds", dest="k_folds", type="numeric", default=10, help="Number of folds to perform in within population k-fold cross-validation [Default=10]")
 parser$add_argument("--n-reps", dest="n_reps", type="numeric", default=10, help="Number of reshuffled replications per fold of within population k-fold cross-validation [Default=3]")
 parser$add_argument("--n-threads", dest="n_threads", type="numeric", default=8, help="Number of computing threads to use in within and across population cross-validations [Default=8]")
-parser$add_argument("-o", "--output-file-prefix", dest="output_file_prefix", type="character", default="", help="Filename of the output Rds file containing a list of metrics and predicted phenotype dataframes [Default='']")
+parser$add_argument("-o", "--output-file-prefix", dest="output_file_prefix", type="character", default="", help="Prefix of the filename of the output Rds file containing a list of metrics and predicted phenotype dataframes [Default='']")
 parser$add_argument("--verbose", dest="verbose", type="logical", default=FALSE, help="Do you wish to print detailed progress of the cross-validation including scatter plots of expected vs predicted phenotypes? [Default=FALSE]")
 args = parser$parse_args()
 # args = list(fname_pheno="test/grape_pheno.txt", fname_rds_or_vcf="test/grape.vcf", fname_covar_rds=NULL, header=TRUE, idx_col_id=1, idx_col_pop=2, idx_col_y="3", na.strings=c("","-","NA","na","NaN","missing","MISSING"), retain_minus_one_alleles_per_locus=TRUE, sep="\t", models_to_test=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C"), k_folds=5, n_reps=3, n_threads=8, output_file_prefix="", verbose=FALSE)