From f5d2faeb20124a71d56b40337e4c55812ffac2d0 Mon Sep 17 00:00:00 2001 From: "Brian S. Yandell" Date: Thu, 20 Jun 2024 08:54:54 -0500 Subject: [PATCH] use multiple cores in scan1_pg.R routine scan1_pg_clean (#234) * modify scan1_pg_clean to branch by chr, pheno and pos * modified scan1_pg_clean to better divide positions relative to phenos when using cores --- R/scan1_pg.R | 66 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 19 deletions(-) diff --git a/R/scan1_pg.R b/R/scan1_pg.R index c62f4449..479d363e 100644 --- a/R/scan1_pg.R +++ b/R/scan1_pg.R @@ -314,14 +314,53 @@ scan1_pg_clean <- else no_x <- FALSE } else loco <- TRUE - batches <- list(chr=rep(seq_len(length(genoprobs)), ncol(pheno)), - phecol=rep(seq_len(ncol(pheno)), each=length(genoprobs))) - + # This creates a batch for every chr, pheno and set of positions. + # If cores == 1, use all positions on each chr (original code). + # if cores != 1, then + # for each chr, divide positions into contiguous intervals + # number of intervals across chr and phenos =~ cores + npos_by_chr <- dim(genoprobs)[3,] + # Genoprob names not retained for qtl2fst + names(npos_by_chr) <- names(genoprobs) + if(n_cores(cores) == 1) { + # Add beginning and end pos of each chr to original batches list. + batches <- list(chr=rep(seq_len(length(genoprobs)), ncol(pheno)), + pos1 = rep(1, length(genoprobs) * ncol(pheno)), + pos2 = rep(npos_by_chr, ncol(pheno)), + phecol=rep(seq_len(ncol(pheno)), each=length(genoprobs))) + } else { + # Divide each chr into intervals pos1:pos2 contiguous from 1 to length of chr. + tmpfn <- function(x) { + npos <- ceiling(x * ncol(pheno) * length(npos_by_chr) / n_cores(cores)) + pos1 <- seq(1, x, by = npos) + pos2 <- c(pos1[-1] - 1, x) + list(pos1 = pos1, pos2 = pos2) + } + npos <- lapply(npos_by_chr, tmpfn) + + # Collapse pos1 and pos2 into vectors. + pos1 <- pos2 <- NULL + for(i in names(npos)) { + pos1 <- c(pos1, npos[[i]]$pos1) + pos2 <- c(pos2, npos[[i]]$pos2) + } + # npos = number of intervals by chr + npos <- sapply(npos, function(x) length(x$pos1)) + + # Batches list + batches <- list(chr=rep(rep(seq_len(length(genoprobs)),npos), ncol(pheno)), + pos1=rep(pos1, ncol(pheno)), + pos2=rep(pos2, ncol(pheno)), + phecol=rep(seq_len(ncol(pheno)), each=sum(npos))) + } + # function that does the work by_batch_func <- function(batch) { chr <- batches$chr[batch] + pos1 <- batches$pos1[batch] + pos2 <- batches$pos2[batch] phecol <- batches$phecol[batch] if(loco) { @@ -342,11 +381,12 @@ scan1_pg_clean <- # subset the genotype probabilities: drop cols with all 0s, plus the first column Xcol2drop <- genoprob_Xcol2drop[[chr]] if(length(Xcol2drop) > 0) { - pr <- genoprobs[[chr]][ind2keep,-Xcol2drop,,drop=FALSE] + pr <- genoprobs[[chr]][ind2keep,-Xcol2drop,pos1:pos2,drop=FALSE] pr <- pr[,-1,,drop=FALSE] } - else - pr <- genoprobs[[chr]][ind2keep,-1,,drop=FALSE] + else { + pr <- genoprobs[[chr]][ind2keep,-1,pos1:pos2,drop=FALSE] + } # weight the probabilities pr <- weight_array(pr, weights) @@ -384,19 +424,7 @@ scan1_pg_clean <- if(any(result_is_null)) stop("cluster problem: returned ", sum(result_is_null), " NULLs.") - npos_by_chr <- dim(genoprobs)[3,] - totpos <- sum(npos_by_chr) - pos_index <- split(seq_len(totpos), rep(seq_len(length(genoprobs)), npos_by_chr)) - - # to contain the results - result <- matrix(nrow=totpos, ncol=ncol(pheno)) - for(batch in seq_along(batches$chr)) { - chr <- batches$chr[batch] - phecol <- batches$phecol[batch] - result[pos_index[[chr]], phecol] <- lod_list[[batch]] - } - - result + matrix(nrow=sum(npos_by_chr), ncol=ncol(pheno), unlist(lod_list)) }