diff --git a/PRSice.R b/PRSice.R index bc782381..da1e3952 100755 --- a/PRSice.R +++ b/PRSice.R @@ -19,7 +19,7 @@ In_Regression <- R2 <- print.p <- R <- P <- value <- Phenotype <- Set <- PRS.R2 <- LCI <- UCI <- quant.ref <- NULL -r.version <- "2.3.0.e" +r.version <- "2.3.1" # Help Messages -------------------------------------- help_message <- "usage: Rscript PRSice.R [options] <-b base_file> <-t target_file> <--prsice prsice_location>\n @@ -431,7 +431,7 @@ UsePackage <- function(package, dir, no.install) } use.data.table <- T -use.ggplot <- T #cerr +use.ggplot <- T for (library in libraries) { package.directory <- "." @@ -762,7 +762,6 @@ writeLines(paste0("Current Rscript version = ",r.version)) expand_scale <- function(mult = 0, add = 0) { stopifnot(is.numeric(mult) && is.numeric(add)) stopifnot((length(mult) %in% 1:2) && (length(add) %in% 1:2)) - mult <- rep(mult, length.out = 2) add <- rep(add, length.out = 2) c(mult[1], add[1], mult[2], add[2]) @@ -784,6 +783,8 @@ shorten_label <- function(x) { } +# Quantile functions ------------------------------------------------------ + get_quantile <- function(x, num.quant, quant.ref){ quant <- as.numeric(cut(x, breaks = unique(quantile( @@ -840,9 +841,10 @@ set_uneven_quant <- function(quant.cutoff, ref.cutoff, num.quant, prs, quant.ind return(list(quant, quant.index)) } # Determine Default ------------------------------------------------------- -# First, determine the bar levels +# Bar level default ------------------------------------------------------- if(!provided("bar_levels", argv)){ if(!provided("msigdb", argv) & !provided("gtf", argv) & !provided("bed", argv)) { + # Not PRSet argv$bar_levels <- paste(0.001, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, sep=",") if (!provided("no_full", argv)) { argv$bar_levels <- paste(argv$bar_levels, 1, sep=",") @@ -850,67 +852,98 @@ if(!provided("bar_levels", argv)){ } else if (!provided("fastscore", argv) & !provided("lower", argv) & !provided("upper", argv) & !provided("interval", argv)) { - # This is prset, so by default, we don't do all threshold - # unless user use some of the parameter related to the thresholding + # This is prset, no thresholding unless user use parameters related to thresholding argv$bar_levels <- "1" }else{ - # this should be PRSet but user want thresholding argv$bar_levels <- paste(0.001, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, sep=",") if (!provided("no_full", argv)) { argv$bar_levels <- paste(argv$bar_levels, 1, sep=",") } } } -# Next, we need to determine if we are doing binary target -# This is only required when user does not provide --binary-target - -if(!provided("binary_target", argv)){ - # Now we want to check if base is beta - base_beta <- F - if(provided("beta", argv)){ - # Base is beta - base_beta <- T - }else{ - if(!provided("base", argv)){ - writeLines("Warning: Without base file, we cannot determine if the summary statistic is beta or OR, which is used to determine if the target phenotype is binary or not. We will now proceed assuming the target phenotype is binary. If that is incorrect, please use --binary-target to specify the correct target phenotype type, or you can provide the base file") - } - else{ - zz <- gzfile(argv$base) - base_header <- readLines(zz,n=1) - close(zz) - or <- length(grep("or",base_header,ignore.case=TRUE))==1 - beta <- length(grep("beta",base_header,ignore.case=TRUE))==1 - if(or & beta){ - stop("Both OR and BETA detected. Cannot determine which one should be used. Please use --beta or --binary-target") - } - if(beta){ - base_beta <- T - } - if(!or & !beta){ - stop("Do not detect either BETA or OR. Please ensure your base input is correct") - } - } - } - # Now we know if the base is beta or not, we can determine the target binary status - if(!provided("pheno_col", argv)){ - if(base_beta){ - argv$binary_target <- "F" - }else{ - argv$binary_target <- "T" - } - }else if(length(strsplit(argv$pheno_col, split=",")[[1]])==1){ - if(base_beta){ - argv$binary_target <- "F" - }else{ - argv$binary_target <- "T" - } + +# Base check -------------------------------------------------------------- +# We need to check the base when both --beta, --or and --binary-target isn't provided +# as PRSice will try to determine the binary-target flag with those input +base.beta <- provided("beta", argv) +base.or <- provided("or", argv) +if(!base.beta & + !base.or & + !provided("binary_target", argv)) { + if (!provided("base", argv)) { + stop( + "Warning: Without base file, we cannot determine if the summary statistic is beta or OR, which is used to determine if the target phenotype is binary or not. Please use --binary-target to specify the correct target phenotype type." + ) + } + zz <- gzfile(argv$base) + base_header <- readLines(zz, n = 1) + close(zz) + or <- length(grep("or", base_header, ignore.case = TRUE)) == 1 + beta <- length(grep("beta", base_header, ignore.case = TRUE)) == 1 + if (or & beta) { + stop( + "Both OR and BETA detected. Cannot determine which one should be used. Please use --beta or --binary-target" + ) + } + if (beta) { + base.beta <- T + base.or <- F + } else if (or) { + base.or <- T + base.beta <- F + } + else { + stop("Do not detect either BETA or OR. Please ensure your base input is correct") + } +} + +# Determine default of binary-target ---------------------------------------- +get_binary_vector <- function(input){ + # Assume the input is correct. If not, then user is likely using --plot + vec <- strsplit(input, split=",")[[1]] + binary.vec <- NULL + for(i in vec){ + if(endsWith(toupper(i), "F") |endsWith(toupper(i), "FALSE")){ + count <- as.numeric(gsub("F", "", gsub("FALSE","", i, ignore.case=T), ignore.case=T)) + if(!is.na(count)){ + binary.vec <- c(binary.vec, rep(F, count)) + }else{ + binary.vec <- c(binary.vec, F) + } + }else if(endsWith(toupper(i), "T") |endsWith(toupper(i), "TRUE")){ + count <- as.numeric(gsub("T", "", gsub("TRUE","", i, ignore.case=T), ignore.case=T)) + if(!is.na(count)){ + binary.vec <- c(binary.vec, rep(T,count)) + }else{ + binary.vec <- c(binary.vec, T) + } } + } + return(binary.vec) +} +binary.vector <- NULL +if(!provided("binary_target", argv)){ + num.pheno <- 1 + if(provided("pheno_col", argv)){ + num.pheno <- length(strsplit(argv$pheno_col, str=",")[[1]]) + } + if(base.beta){ + binary.vector <- rep(F, num.pheno) + }else if(base.or){ + binary.vector <- rep(T, num.pheno) + }else{ + stop("Error: Fatal logic error from Sam when parsing binary target status") + } +}else{ + # Try to parse binary_target into vector of TF + binary.vector <- get_binary_vector(argv$binary_target) } + # Sanity check for binary-target -if(provided("pheno_col", argv) & provided("binary_target", argv)){ +if(provided("pheno_col", argv) ){ pheno_length <- length(strsplit(argv$pheno_col, split=",")[[1]]) - binary_length <- length(strsplit(argv$binary_target, split=",")[[1]]) + binary_length <- length(binary.vector) if(pheno_length==0 & binary_length==1){ # This is ok }else if(pheno_length!=binary_length){ @@ -2065,8 +2098,6 @@ multi_set_plot <- function(prefix, prs.summary, pheno.name, parameters, use.ggpl } # Sanity Check ------------------------------------------------------------ - - if (provided("no_regress", argv)) { quit("yes") } @@ -2088,62 +2119,70 @@ extract_matrix <- function(x, y) { # With this update, we only allow a single base file therefore we don't even need the # information of base here +# Check if target provided ------------------------------------------------ + if (!provided("target", argv) & !provided("target_list", argv)) { stop("Target file name not found. You'll need to provide the target name for plotting! (even with --plot)") } -phenos <- NULL - -binary_target <- strsplit(argv$binary_target, split = ",")[[1]] +# Phenotype file check ---------------------------------------------------- +pheno.cols <- NULL pheno.index <- 6 + if (provided("pheno_col", argv)) { - phenos <- strsplit(argv$pheno_col, split = ",")[[1]] - if (!provided("pheno_file", argv)) { - writeLines( - strwrap( - "WARNING: Cannot have multiple phenotypes if pheno_file is not provided. We will ignore the pheno_col option.", - width = 80 - ) - ) - }else if (length(binary_target) != length(phenos)) { - message <- - "Number of binray target should match number of phenotype provided!" - message <- paste( - message, - "There are ", - length(binary_target), - " binary target information and ", - length(phenos), - "phenotypes", - sep = "" - ) - stop(message) - } else{ - header <- read.table(argv$pheno_file, nrows = 1, header = TRUE, check.names=FALSE) - # This will automatically filter out un-used phenos - valid.pheno <- phenos %in% colnames(header) - valid.file.index <- colnames(header) %in% phenos - if (sum(valid.pheno) == 0) { - stop("Error: None of the phenotype is identified in phenotype header!") - }else if(sum(valid.pheno) != length(phenos)){ - writeLines("WARNING: Some phenotypes were not identified from the phenotype file:") - for(i in phenos[!phenos %in% colnames(header)] ){ - writeLines(i) - } - } - binary_target <- binary_target[valid.pheno] - phenos <- phenos[valid.pheno] - pheno.index <- c(1:ncol(header))[valid.file.index] + pheno.cols <- strsplit(argv$pheno_col, split = ",")[[1]] + if (!provided("pheno_file", argv)) { + stop("Error: You must provide a phenotype file for multiple phenotype analysis") + } else if (length(binary.vector) != length(pheno.cols)) { + message <- + "Number of binray target should match number of phenotype provided!" + message <- paste( + message, + "There are ", + length(binary_target), + " binary target information and ", + length(pheno.cols), + "phenotypes", + sep = "" + ) + stop(message) + } else{ + header <- + read.table( + argv$pheno_file, + nrows = 1, + header = TRUE, + check.names = FALSE + ) + if (length(unique(header)) != length(header)) { + # Duplicated phenotype + stop( + "Error: Duplicated phenotype column detected. Please make sure you have provided the correct input" + ) + } + valid.pheno <- pheno.cols %in% colnames(header) + if (sum(valid.pheno) == 0) { + stop("Error: None of the phenotype is identified in phenotype header!") + } else if (sum(valid.pheno) != length(pheno.cols)) { + writeLines("WARNING: Some phenotypes were not identified from the phenotype file:") + for (i in pheno.cols[!pheno.cols %in% colnames(header)]) { + writeLines(i) + } } + binary.vector <- binary.vector[valid.pheno] + pheno.cols <- pheno.cols[valid.pheno] + pheno.index <- 1:length(header) + pheno.index <- pheno.index[valid.pheno] + } } else if (provided("pheno_file", argv)) { - pheno.index <- 3 - if (ignore_fid) - pheno.index <- 2 + pheno.index <- 3 + if (ignore_fid) + pheno.index <- 2 } else{ - if (length(binary_target) != 1) { - stop("Too many binary target information. We only have one phenotype") - } + if (length(binary.vector) != 1) { + stop("Too many binary target information. We only have one phenotype") + } } @@ -2274,104 +2313,119 @@ prefix <- argv$out # Process plot functions -------------------------------------------------- process_plot <- - function(prefix, - covariance, - is_binary, - pheno.file, - parameters, - pheno.index, - use.data.table, - use.ggplot, - pheno.name) { - sum.prefix <- prefix - if(pheno.name!="-"){ - prefix <- paste(prefix, pheno.name, sep=".") - } - best <- NULL - prs.summary <- NULL - prsice.result <- NULL - phenotype <- NULL - if (use.data.table) { - best <- fread(paste0(prefix, ".best"), data.table = F, colClasses=c("FID"="character","IID"="character")) - prs.summary <- - fread(paste0(sum.prefix, ".summary"), data.table = F) - prsice.result <- - fread(paste0(sum.prefix, ".prsice"), data.table = F) - phenotype <- - fread(pheno.file, data.table = F, header = F, colClasses=c("V1"="character","V2"="character")) - } else{ - best <- read.table(paste0(prefix, ".best"), header = T, colClasses=c("FID"="character","IID"="character")) - prs.summary <- - read.table(paste0(sum.prefix, ".summary"), header = T) - prsice.result <- - read.table(paste0(sum.prefix, ".prsice"), header = T) - # Allow header = false for fam or for phenotype files that does not contain phenotype name - phenotype <- read.table(pheno.file, header = F, colClasses=c("V1"="character","V2"="character")) - } - best <- subset(best, In_Regression == "Yes") - # We know the format of the best file, and it will always contain FID and IID - phenos <- unique(prsice.result$Pheno) - if(length(phenos)!= 1) { - prsice.result <- subset(prsice.result, Pheno == pheno.name) - } - base.prs <- best[,c(1,2,4)] - if(provided("plot_set", parameters) & (provided("msigdb", parameters) | provided("bed", parameters) | provided("gtf", parameters)| provided("snp_set", parameters)| provided("snp_sets", parameters))){ - base.prs <- best[,colnames(best)%in%c("FID", "IID", parameters$plot_set)] - colnames(base.prs)[3] <- "PRS" - } -# Generate phenotype matrix ----------------------------------------------- - # extract the phenotype column - # And only retain samples with phenotype and covariate information - # They will be found in the best data.frame - - ignore_fid <- provided("ignore_fid", parameters) - if (!ignore_fid) { - phenotype <- phenotype[, c(1:2, pheno.index)] - colnames(phenotype) <- c("FID", "IID", "Pheno") - phenotype <- - phenotype[phenotype$FID %in% best$FID & - phenotype$IID %in% best$IID, ] - } else{ - phenotype <- phenotype[, c(1, pheno.index)] - colnames(phenotype) <- c("IID", "Pheno") - phenotype <- phenotype[phenotype$IID %in% best$IID, ] - } - phenotype$Pheno <- as.numeric(as.character(phenotype$Pheno)) - pheno <- phenotype - use.residual <- F - if(is_binary){ - if(max(pheno$Pheno)==2){ - pheno$Pheno <- pheno$Pheno-1 - } - } - -# Start calling functions ------------------------------------------------- - if (provided("quantile", parameters) && parameters$quantile > 0) { - # Need to plot the quantile plot (Remember to remove the iid when performing the regression) - if(!provided("quant_break", parameters)){ - quantile_plot(base.prs, pheno, covariance, prefix, parameters, is_binary, use.ggplot) - }else{ - uneven_quantile_plot(base.prs, pheno, covariance, prefix, parameters, is_binary, use.ggplot) - } - } - if(provided("msigdb", parameters) | provided("bed", parameters) | provided("gtf", parameters) - | provided("snp_test", parameters) | provided("snp_tests", parameters)){ - if(length(strsplit(argv$bar_levels, split=",")[[1]])>1){ - bar_plot(prsice.result, prefix, parameters, use.ggplot) - if(!provided("fastscore", parameters)){ - high_res_plot(prsice.result, prefix, parameters, use.ggplot) - } - } - }else{ - bar_plot(prsice.result, prefix, parameters, use.ggplot) - if(!provided("fastscore", parameters)){ - high_res_plot(prsice.result, prefix, parameters, use.ggplot) - } - } - if(provided("multi_plot", parameters)){ - multi_set_plot(prefix, prs.summary, pheno.name, parameters, use.ggplot, argv$device) + function(prefix, + phenotype, + covariance, + pheno.index, + is_binary, + prsice.summary, + prsice.result, + pheno.name, + parameters, + use.data.table, + use.ggplot) { + if (pheno.name != "-") { + prefix <- paste(prefix, pheno.name, sep = ".") + } + best <- NULL + if (use.data.table) { + best <- + fread( + paste0(prefix, ".best"), + data.table = F, + colClasses = c("FID" = "character", "IID" = "character") + ) + print(best) + } else{ + best <- + read.table( + paste0(prefix, ".best"), + header = T, + colClasses = c("FID" = "character", "IID" = "character") + ) + } + best <- subset(best, In_Regression == "Yes") + # We know the format of the best file, and it will always contain FID and IID + base.prs <- best[, c(1, 2, 4)] + if (provided("plot_set", parameters) & + ( + provided("msigdb", parameters) | + provided("bed", parameters) | + provided("gtf", parameters) | + provided("snp_set", parameters) + )) { + base.prs <- + best[, colnames(best) %in% c("FID", "IID", parameters$plot_set)] + colnames(base.prs)[3] <- "PRS" + } + ignore_fid <- provided("ignore_fid", parameters) + if (!ignore_fid) { + phenotype <- phenotype[, c(1:2, pheno.index)] + colnames(phenotype) <- c("FID", "IID", "Pheno") + phenotype <- + phenotype[phenotype$FID %in% best$FID & + phenotype$IID %in% best$IID,] + } else{ + phenotype <- phenotype[, c(1, pheno.index)] + colnames(phenotype) <- c("IID", "Pheno") + phenotype <- phenotype[phenotype$IID %in% best$IID,] + } + # Because we read with header=F, it is likely our pheno is parsed as character + phenotype$Pheno <- as.numeric(as.character(phenotype$Pheno)) + pheno <- phenotype + if (is_binary) { + pheno$Pheno[pheno$Pheno == -9] <- NA + if (max(pheno$Pheno) == 2) { + pheno$Pheno <- pheno$Pheno - 1 + } + } + if (provided("quantile", parameters) && + parameters$quantile > 0) { + # Need to plot the quantile plot (Remember to remove the iid when performing the regression) + if (!provided("quant_break", parameters)) { + quantile_plot(base.prs, + pheno, + covariance, + prefix, + parameters, + is_binary, + use.ggplot) + } else{ + uneven_quantile_plot(base.prs, + pheno, + covariance, + prefix, + parameters, + is_binary, + use.ggplot) + } + } + if (provided("msigdb", parameters) | + provided("bed", parameters) | + provided("gtf", parameters) | + provided("snp_test", parameters)) + { + if (length(strsplit(argv$bar_levels, split = ",")[[1]]) > 1) { + bar_plot(prsice.result, prefix, parameters, use.ggplot) + if (!provided("fastscore", parameters)) { + high_res_plot(prsice.result, prefix, parameters, use.ggplot) } + } + } else{ + bar_plot(prsice.result, prefix, parameters, use.ggplot) + if (!provided("fastscore", parameters)) { + high_res_plot(prsice.result, prefix, parameters, use.ggplot) + } } + if (provided("multi_plot", parameters)) { + multi_set_plot(prefix, + prsice.summary, + pheno.name, + parameters, + use.ggplot, + argv$device) + } + } # Check if phenotype file is of sample format ----------------------------- is_sample_format <- function(file) { @@ -2389,7 +2443,7 @@ is_sample_format <- function(file) { first <- strsplit(first_line, split = " ")[[1]] } second <- strsplit(second_line, split = "\t")[[1]] - if (length(first) == 1) { + if (length(second) == 1) { second <- strsplit(second_line, split = " ")[[1]] } if (length(first) != length(second) | length(first) < 3) { @@ -2414,111 +2468,108 @@ is_sample_format <- function(file) { pheno.file <- NULL if (provided("pheno_file", argv)) { - pheno.file <- argv$pheno_file -} else if(provided("target", argv)){ - # Check if external fam / sample file is provided - target.info <- strsplit(argv$target, split = ",")[[1]] - if (length(target.info) == 2) { - pheno.file <- target.info[2] - if (provided("type", argv)) { - if (argv$type == "bgen") { - # sample file should contain FID and IID by format requirement - pheno.index <- 3 - if (ignore_fid & - !is_sample_format(pheno.file)) - pheno.index <- 2 - } - } + pheno.file <- argv$pheno_file +} else if (provided("target", argv)) { + if (provided("type", argv)) { + if (argv$type == "bgen") { + # We don't have a default phenotype file + stop("Error: You must provide a phenotype file for bgen input") + } + } + target.info <- strsplit(argv$target, split = ",")[[1]] + + if (length(target.info) == 2) { + pheno.file <- target.info[2] + } else{ + if (provided("type", argv)) { + pheno.file <- paste0(argv$target, ".fam") } else{ - if (provided("type", argv)) { - if (argv$type == "bgen") { - stop("Error: You must provide either a phenotype or sample file for bgen input") - } else if (argv$type == "bed") { - pheno.file <- paste0(argv$target, ".fam") - } - } else{ - # Because default is always plink - pheno.file <- paste0(argv$target, ".fam") - } + # Because default is always plink + pheno.file <- paste0(argv$target, ".fam") } -}else if(provided("target_list", argv)){ - # Assume no header - target.info <- strsplit(argv$target_list, split = ",")[[1]] - target.list <- read.table(argv$target_list) - target.prefix <- target.list[1,1] - if (length(target.info) == 2) { - pheno.file <- target.info[2] - if (provided("type", argv)) { - if (argv$type == "bgen") { - # sample file should contain FID and IID by format requirement - pheno.index <- 3 - if (ignore_fid & - !is_sample_format(pheno.file)) - pheno.index <- 2 - } - } + } +} else if (provided("target_list", argv)) { + # Assume no header + if (provided("type", argv)) { + if (argv$type == "bgen") { + # We don't have a default phenotype file + stop("Error: You must provide a phenotype file for bgen input") + } + } + target.info <- strsplit(argv$target_list, split = ",")[[1]] + target.list <- read.table(argv$target_list) + target.prefix <- target.list[1, 1] + if (length(target.info) == 2) { + pheno.file <- target.info[2] + } else{ + if (provided("type", argv)) { + pheno.file <- paste0(target.prefix, ".fam") } else{ - if (provided("type", argv)) { - if (argv$type == "bgen") { - stop("Error: You must provide either a phenotype or sample file for bgen input") - } else if (argv$type == "bed") { - pheno.file <- paste0(target.prefix, ".fam") - } - } else{ - # Because default is always plink - pheno.file <- paste0(target.prefix, ".fam") - } + # Because default is always plink + pheno.file <- paste0(target.prefix, ".fam") } + } } # To account for the chromosome number - pheno.file <- gsub("#", "1", pheno.file) -if (!is.null(phenos) & - length(phenos) > 1) { - for (i in 1:length(phenos)) { - # Update the covariance matrix accordingly - - process_plot( - argv$out, - covariance.base, - binary_target[i], - pheno.file, - argv, - pheno.index[i], - use.data.table, - use.ggplot, - phenos[i] - ) - } - if(provided("multi_plot", argv)){ - multi_pheno_plot(argv, use.ggplot, use.data.table, argv$device) - } -} else if (!is.null(phenos)) { - process_plot( - argv$out, - covariance.base, - binary_target[1], - pheno.file, - argv, - pheno.index[1], - use.data.table, - use.ggplot, - "-" - ) + +# Read in required files -------------------------------------------------- +# With exception of best, we have 1 file for each PRSice run +prs.summary <- NULL +prsice.result <- NULL +phenotype <- NULL +if (use.data.table) { + prs.summary <- + fread(paste0(argv$out, ".summary"), data.table = F) + prsice.result <- + fread(paste0(argv$out, ".prsice"), data.table = F) + phenotype <- + fread(pheno.file, data.table = F, header = F, colClasses=c("V1"="character","V2"="character")) } else{ + prs.summary <- + read.table(paste0(argv$out, ".summary"), header = T) + prsice.result <- + read.table(paste0(argv$out, ".prsice"), header = T) + phenotype <- read.table(pheno.file, header = F, colClasses=c("V1"="character","V2"="character")) +} + + +if (!is.null(pheno.cols) & + length(pheno.cols) > 1) { + for (i in 1:length(pheno.cols)) { + # Update the covariance matrix accordingly process_plot( - argv$out, - covariance.base, - binary_target[1], - pheno.file, - argv, - pheno.index[1], - use.data.table, - use.ggplot, - "-" + argv$out, + phenotype, + covariance.base, + pheno.index[i], + binary.vector[i], + prs.summary, + subset(prsice.result, Pheno == pheno.cols[i]), + pheno.cols[i], + argv, + use.data.table, + use.ggplot ) + } + if (provided("multi_plot", argv)) { + multi_pheno_plot(argv, use.ggplot, use.data.table, argv$device) + } +} else{ + process_plot( + argv$out, + phenotype, + covariance.base, + pheno.index[1], + binary.vector[1], + prs.summary, + prsice.result, + "-", + argv, + use.data.table, + use.ggplot + ) } - diff --git a/docs/index.md b/docs/index.md index 12c7339e..d03bc4b0 100644 --- a/docs/index.md +++ b/docs/index.md @@ -18,29 +18,16 @@ PRSice (pronounced 'precise') is a Polygenic Risk Score software for calculating # Executable downloads [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3703335.svg)](https://doi.org/10.5281/zenodo.3703335)[![Coverage Status](https://coveralls.io/repos/github/choishingwan/PRSice/badge.svg?branch=master)](https://coveralls.io/github/choishingwan/PRSice?branch=master) | Operating System | Link | | -----------------|:----:| -| Linux 64-bit | [v2.3.0.d](https://github.com/choishingwan/PRSice/releases/download/2.3.0/PRSice_linux.230d.zip) | -| OS X 64-bit | [v2.3.0.d](https://github.com/choishingwan/PRSice/releases/download/2.3.0/PRSice_mac.230d.zip) | +| Linux 64-bit | [v2.3.1](https://github.com/choishingwan/PRSice/releases/download/2.3.1/PRSice_linux.zip) | +| OS X 64-bit | [v2.3.1](https://github.com/choishingwan/PRSice/releases/download/2.3.1/PRSice_mac.zip) | | Windows 32-bit | Not available | | Windows 64-bit | Not available | !!! Note "Latest Update" - # 2020-05-21 (v2.3.0.d) - - Fix problem introduced by previous fix. - - Was hoping 2.3.0's unit test will help reducing the amount of bugs. Sorry for the troubles. + # 2020-05-23 (v2.3.1) + - Update Rscript such that it match features in executable (thus avoid problem in plotting) + - Fix a bug where PRSice will crash when there are missing covariates - # 2020-05-20 (v2.3.0.c) - - Fix all score output format - - Fix problem with `--no-regress`. Might still have problem with `--no-regress --score con-std` - - # 2020-05-19 (v2.3.0.b) - - Fix error where sample selection will distort phenotype loading, loading the wrong phenotype to wrong sample. As this is a major bug, we deleted the previous 2 releases. Sorry for the troubles. - - # 2020-05-19 (v2.3.0.a) - - Fix output error where we always say 0 valid phenotype were included for continuous trait - - Fix problem with permutation where PRSice will crash if input are rank deficient - - Fix problem when provide a binary phenotype file with a fam file containing -9 as phenotype, PRSice will wrongly state that there are no phenotype presented - - Fix problem in Rscript where if sample ID is numeric and starts with 0, the best file will not merge with the phenotype file, causing 0 valid PRS to be observed - # 2020-05-18 (v2.3.0) - We now support multi-threaded clumping (separated by chromosome) - Genotypes will be stored to memory during clumping (increase memory usage, significantly speed up clumping) @@ -48,13 +35,8 @@ PRSice (pronounced 'precise') is a Polygenic Risk Score software for calculating - .prsice file now has additional column call "Pheno" - Introduced `--chr-id` which generate rs id based on user provided formula (see [detail](command_detail.md) for more info) - Format of `--base-maf` and `--base-info` are now changed to `:` from `,` - - Fix a bug related to ambiguous allele dosage flipping when `--keep-ambig` is used - - Better mismatch handling. For example, if your base file only provide the effective allele A without the non-effective allele information, PRSice will now do dosage flipping if your target file has G/C as effective allele and A /T as an non-effective allele (whereas previous this SNP will be considered as a mismatch) - - Fix bug in 2.2.13 where PRSice won't output the error message during command parsing stage - If user provided the `--stat` information, PRSice will now error out instead of trying to look for BETA or OR in the file. - - PRSice should now better recognize if phenotype file contains a header - - various small bug fix - + update log for previous release can be found [here](update_log.md) diff --git a/docs/update_log.md b/docs/update_log.md index 5f44044b..95fa68c1 100644 --- a/docs/update_log.md +++ b/docs/update_log.md @@ -1,4 +1,13 @@ From now on, I will try to archive our update log here. + +# 2020-05-23 (v2.3.1) +- Update Rscript such that it match features in executable (thus avoid problem in plotting) +- Fix a bug where PRSice will crash when there are missing covariates + + +# 2020-05-21 ([v2.3.0.e](https://github.com/choishingwan/PRSice/tree/2b057f0eafa28762ec0c1245bc2f20aacadda05b)) +- Fix Rscript bar plot problem + # 2020-05-21 ([v2.3.0.d](https://github.com/choishingwan/PRSice/tree/8784ab58b5171c5e4bbc5341de5baa68f5f5238f)) - Fix problem introduced by previous fix. - Was hoping 2.3.0's unit test will help reducing the amount of bugs. Sorry for the troubles. diff --git a/inc/commander.hpp b/inc/commander.hpp index 1bcf5ee0..aeb2651c 100644 --- a/inc/commander.hpp +++ b/inc/commander.hpp @@ -42,8 +42,8 @@ #include #endif -const std::string version = "2.3.0.d"; -const std::string date = "2020-05-21"; +const std::string version = "2.3.1"; +const std::string date = "2020-05-23"; class Commander { public: @@ -652,7 +652,7 @@ class Commander m_parameter_log["score"] = input; return true; } - + void add_base_to_log(); bool in_file(const std::vector& column_names, const size_t index, const std::string& warning, bool no_default, bool case_sensitive = true, diff --git a/src/commander.cpp b/src/commander.cpp index 3e852ed1..2ac7456f 100644 --- a/src/commander.cpp +++ b/src/commander.cpp @@ -1041,51 +1041,66 @@ std::vector get_base_header(const std::string& file) bool Commander::get_statistic_column( const std::vector& column_names) { - bool has_col; // don't allow both OR and BETA to be set if (m_base_info.is_or && m_base_info.is_beta) return false; - if (m_base_info.is_or || m_base_info.is_beta) - { - // guess default based on --or and --beta - const std::string target = m_base_info.is_or ? "OR" : "BETA"; - m_base_info.column_name[+BASE_INDEX::STAT] = target; - has_col = in_file(column_names, +BASE_INDEX::STAT, "Error", - m_user_no_default, false); - m_base_info.has_column[+BASE_INDEX::STAT] = has_col; - return has_col; - } - else - { - // go through file and look for either OR or BETA - m_base_info.column_name[+BASE_INDEX::STAT] = "OR"; - bool or_found = in_file(column_names, +BASE_INDEX::STAT, "Warning", - false, false, false); - m_base_info.column_name[+BASE_INDEX::STAT] = "BETA"; - bool beta_found = in_file(column_names, +BASE_INDEX::STAT, "Warning", - false, false, false); - if (or_found && beta_found) + const bool or_has_set = m_base_info.is_or; + const bool beta_has_set = m_base_info.is_beta; + + // go through file and look for either OR or BETA + m_base_info.column_name[+BASE_INDEX::STAT] = "OR"; + bool or_found = in_file(column_names, +BASE_INDEX::STAT, "Warning", false, + false, false); + m_base_info.column_name[+BASE_INDEX::STAT] = "BETA"; + bool beta_found = in_file(column_names, +BASE_INDEX::STAT, "Warning", false, + false, false); + if ((beta_has_set || or_has_set) && (beta_found || or_found)) + { + if (beta_has_set && beta_found) { - m_error_message.append("Error: Both OR and BETA " - "found in base file! We cannot determine " - "which statistic to use, please provide it " - "through --stat\n"); - return false; + // don't need to do anything as beta_found come last } - else if (or_found || beta_found) + else if (or_has_set && or_found) { - // don't need to update column_index as column_index is only set - // when we find the item in the base file - m_base_info.is_or = or_found; - m_base_info.is_beta = beta_found; - m_base_info.has_column[+BASE_INDEX::STAT] = true; - return true; + // we need to recall or to set the index + m_base_info.column_name[+BASE_INDEX::STAT] = "OR"; + in_file(column_names, +BASE_INDEX::STAT, "Warning", false, false, + false); } - else + else if (or_has_set && beta_found) + { + } + else if (beta_has_set && or_found) { - // cannot find either - m_error_message.append("Error: No statistic column in file!\n"); - return false; + m_base_info.column_name[+BASE_INDEX::STAT] = "OR"; + in_file(column_names, +BASE_INDEX::STAT, "Warning", false, false, + false); } + m_base_info.has_column[+BASE_INDEX::STAT] = true; + return true; + } + else if (or_found && beta_found) + { + m_error_message.append("Error: Both OR and BETA " + "found in base file! We cannot determine " + "which statistic to use, please provide it " + "through --stat\n"); + return false; + } + else if (or_found || beta_found) + { + // don't need to update column_index as column_index is only set + // when we find the item in the base file so this will only be set to + // the correct one + m_base_info.is_or = or_found; + m_base_info.is_beta = beta_found; + m_base_info.has_column[+BASE_INDEX::STAT] = true; + return true; + } + else + { + // cannot find either + m_error_message.append("Error: No statistic column in file!\n"); + return false; } } @@ -1195,9 +1210,45 @@ bool Commander::base_column_check(std::vector& column_names) } m_base_info.column_index[+BASE_INDEX::MAX] = *max_element( m_base_info.column_index.begin(), m_base_info.column_index.end()); + add_base_to_log(); return !error; } +void Commander::add_base_to_log() +{ + for (size_t i = 0; i < +BASE_INDEX::MAX; ++i) + { + std::string flag = ""; + switch (i) + { + case +BASE_INDEX::P: + if (m_base_info.has_column[i]) { flag = "pvalue"; } + break; + case +BASE_INDEX::BP: + if (m_base_info.has_column[i]) { flag = "bp"; } + break; + case +BASE_INDEX::RS: + if (m_base_info.has_column[i]) { flag = "snp"; } + break; + case +BASE_INDEX::CHR: + if (m_base_info.has_column[i]) { flag = "chr"; } + break; + case +BASE_INDEX::INFO: + if (m_base_info.has_column[i]) { flag = "base-info"; } + break; + case +BASE_INDEX::STAT: + if (m_base_info.has_column[i]) { flag = "stat"; } + break; + case +BASE_INDEX::EFFECT: + if (m_base_info.has_column[i]) { flag = "a1"; } + break; + case +BASE_INDEX::NONEFFECT: + if (m_base_info.has_column[i]) { flag = "a2"; } + break; + } + if (!flag.empty()) m_parameter_log[flag] = m_base_info.column_name[i]; + } +} bool Commander::clump_check() { bool error = false; diff --git a/src/prsice.cpp b/src/prsice.cpp index 5042c56a..2123bfa6 100644 --- a/src/prsice.cpp +++ b/src/prsice.cpp @@ -830,11 +830,12 @@ void PRSice::gen_cov_matrix(const std::vector& cov_names, + "\n==============================\n"); auto factor_levels = cov_check_and_factor_level_count( is_factor, cov_names, cov_idx, delim, ignore_fid, cov_file, target); + const auto num_valid_sample = m_phenotype.rows(); auto [cov_start, num_matrix_col] = get_cov_start(factor_levels, is_factor, cov_idx); // update size of independent variable m_independent_variables = - Eigen::MatrixXd::Zero(static_cast(num_sample), + Eigen::MatrixXd::Zero(static_cast(num_valid_sample), static_cast(num_matrix_col)); m_independent_variables.col(0).setOnes(); m_independent_variables.col(1).setOnes(); diff --git a/test/csrc/prsice_covariate.cpp b/test/csrc/prsice_covariate.cpp index d0483a92..9fc0ec08 100644 --- a/test/csrc/prsice_covariate.cpp +++ b/test/csrc/prsice_covariate.cpp @@ -456,3 +456,9 @@ TEST_CASE("Generate covariate matrix") } } } + + +TEST_CASE("Full init_matrix test") +{ + SECTION("No regress") {} +}