-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement find_dup_markers(), port of qtl::findDupMarkers()
- Issue rqtl#225
- Loading branch information
Showing
10 changed files
with
325 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
# find_dup_markers | ||
# | ||
#' Find markers with identical genotype data | ||
#' | ||
#' Identify sets of markers with identical genotype data. | ||
#' | ||
#' @param cross | ||
#' | ||
#' @param chr Optional vector specifying which chromosomes to consider. | ||
#' This may be a logical, numeric, or character string vector. | ||
#' | ||
#' @param exact_only If TRUE, look only for markers that have matching | ||
#' genotypes and the same pattern of missing data; if FALSE, also look for | ||
#' cases where the observed genotypes at one marker match those at | ||
#' another, and where the first marker has missing genotype whenever the | ||
#' genotype for the second marker is missing. | ||
#' | ||
#' @param adjacent_only If TRUE, look only for sets of markers that are | ||
#' adjacent to each other. | ||
#' | ||
#' @return A list of marker names; each component is a set of markers whose | ||
#' genotypes match one other marker, and the name of the component is the | ||
#' name of the marker that they match. | ||
#' | ||
#' @details | ||
#' If `exact.only=TRUE`, we look only for groups of markers whose | ||
#' pattern of missing data and observed genotypes match exactly. One | ||
#' marker (chosen at random) is selected as the name of the group (in the | ||
#' output of the function). | ||
#' | ||
#' If `exact.only=FALSE`, we look also for markers whose observed genotypes | ||
#' are contained in the observed genotypes of another marker. We use a | ||
#' pair of nested loops, working from the markers with the most observed | ||
#' genotypes to the markers with the fewest observed genotypes. | ||
#' | ||
#' @export | ||
#' @keywords utilities | ||
#' | ||
#' @seealso [drop_markers()], [drop_nullmarkers()], [reduce_markers()] | ||
#' | ||
#' @examples | ||
#' grav2 <- read_cross2(system.file("extdata", "grav2.zip", package="qtl2")) | ||
#' dup <- find_dup_markers(grav2) | ||
#' grav2_nodup <- drop_markers(grav2, unlist(dup)) | ||
|
||
find_dup_markers <- | ||
function(cross, chr, exact_only=TRUE, adjacent_only=FALSE) | ||
{ | ||
if(!missing(chr)) cross <- subset(cross, chr=chr) | ||
|
||
if(!is.cross2(cross)) | ||
stop('Input cross should be a "cross2" object.') | ||
|
||
g <- do.call("cbind", cross$geno) | ||
markers <- colnames(g) | ||
markerloc <- lapply(n_mar(cross), function(a) 1:a) | ||
if(length(markerloc) > 1) { | ||
for(j in 2:length(markerloc)) | ||
markerloc[[j]] <- markerloc[[j]] + max(markerloc[[j-1]]) + 10 | ||
} | ||
markerloc <- unlist(markerloc) | ||
|
||
if(exact_only) { | ||
g[is.na(g)] <- 0 | ||
|
||
# genotype data patterns | ||
pat <- apply(g, 2, paste, collapse="") | ||
|
||
# table of unique values | ||
tab <- table(pat) | ||
|
||
# no duplicates; return | ||
if(all(tab == 1)) return(NULL) | ||
|
||
wh <- which(tab > 1) | ||
theloc <- themar <- vector("list", length(wh)) | ||
for(i in seq(along=wh)) { | ||
themar[[i]] <- names(pat)[pat==names(tab)[wh[i]]] | ||
theloc[[i]] <- markerloc[pat==names(tab)[wh[i]]] | ||
} | ||
|
||
if(adjacent_only) { | ||
extraloc <- list() | ||
extramar <- list() | ||
for(i in seq(along=theloc)) { | ||
d <- diff(theloc[[i]]) # look for adjacency | ||
if(any(d>1)) { # split into adjacent groups | ||
temp <- which(d>1) | ||
first <- c(1, temp+1) | ||
last <- c(temp, length(theloc[[i]])) | ||
for(j in 2:length(first)) { | ||
extraloc[[length(extraloc)+1]] <- theloc[[i]][first[j]:last[j]] | ||
extramar[[length(extramar)+1]] <- themar[[i]][first[j]:last[j]] | ||
} | ||
themar[[i]] <- themar[[i]][first[1]:last[1]] | ||
theloc[[i]] <- theloc[[i]][first[1]:last[1]] | ||
} | ||
} | ||
themar <- c(themar, extramar) | ||
theloc <- c(theloc, extraloc) | ||
|
||
nm <- sapply(themar, length) | ||
if(all(nm==1)) return(NULL) # nothing left | ||
themar <- themar[nm>1] | ||
theloc <- theloc[nm>1] | ||
} | ||
|
||
# order by first locus | ||
o <- order(sapply(theloc, min)) | ||
themar <- themar[o] | ||
|
||
randompics <- sapply(themar, function(a) sample(length(a), 1)) | ||
for(i in seq(along=themar)) { | ||
names(themar)[i] <- themar[[i]][randompics[i]] | ||
themar[[i]] <- themar[[i]][-randompics[i]] | ||
} | ||
|
||
} | ||
else { | ||
themar <- NULL | ||
|
||
ntyp <- n_typed(cross, "marker") | ||
o <- order(ntyp, decreasing=TRUE) | ||
|
||
g[is.na(g)] <- 0 | ||
result <- .find_dup_markers_notexact(g, o, markerloc, adjacent_only) | ||
|
||
if(all(result==0)) return(NULL) | ||
u <- unique(result[result != 0]) | ||
themar <- vector("list", length(u)) | ||
names(themar) <- colnames(g)[u] | ||
for(i in seq(along=themar)) | ||
themar[[i]] <- colnames(g)[result==u[i]] | ||
} | ||
|
||
themar | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
// find subsets of markers with identical genotypes | ||
|
||
#include "find_dup_markers.h" | ||
#include <Rcpp.h> | ||
|
||
using namespace Rcpp; | ||
|
||
// [[Rcpp::export(".find_dup_markers_notexact")]] | ||
IntegerVector find_dup_markers_notexact(const IntegerMatrix& Geno, // matrix of genotypes, individuals x markers | ||
const IntegerVector& order, // vector indicating order to be considered, most data to least | ||
const IntegerVector& markerloc, // | ||
const bool adjacent_only) // if true, consider only adjacent markers | ||
{ | ||
const int n_ind = Geno.rows(); | ||
const int n_mar = Geno.cols(); | ||
if(order.size() != n_mar) | ||
throw std::invalid_argument("length(order) != ncol(Geno)"); | ||
if(markerloc.size() != n_mar) | ||
throw std::invalid_argument("length(markerloc) != ncol(Geno)"); | ||
|
||
IntegerVector result(n_mar); | ||
for(int i=0; i<n_mar; i++) result[i] = 0; | ||
|
||
for(int i=0; i<n_mar-1; i++) { | ||
int oi = order[i]-1; | ||
for(int j=(i+1); j<n_mar; j++) { | ||
int oj = order[j]-1; | ||
|
||
if(result[oj] != 0 || | ||
(adjacent_only && abs(markerloc[oi] - markerloc[oj]) > 1)) { | ||
/* skip */ | ||
} | ||
else { | ||
int flag = 0; | ||
for(int k=0; k<n_ind; k++) { | ||
if((Geno(k,oi)==0 && Geno(k,oj)!=0) || | ||
(Geno(k,oi)!=0 && Geno(k,oj)!=0 && Geno(k,oi) != Geno(k,oj))) { | ||
flag = 1; | ||
break; | ||
} | ||
} | ||
if(!flag) { /* it worked */ | ||
if(result[oi] != 0) result[oj] = result[oi]; | ||
else result[oj] = oi+1; | ||
} | ||
} | ||
} | ||
} | ||
|
||
return(result); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
// find subsets of markers with identical genotypes | ||
#ifndef FIND_DUP_MARKERS_H | ||
#define FIND_DUP_MARKERS_H | ||
|
||
#include <Rcpp.h> | ||
|
||
Rcpp::IntegerVector find_dup_markers_notexact(const Rcpp::IntegerMatrix& Geno, // matrix of genotypes, individuals x markers | ||
const Rcpp::IntegerVector& order, // vector indicating order to be considered, most data to least | ||
const Rcpp::IntegerVector markerloc, // integer vector indicating "position" | ||
const bool adjacent_only); // if true, consider only adjacent markers | ||
|
||
#endif // FIND_DUP_MARKERS_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
context("find duplicate markers") | ||
|
||
test_that("find_dup_markers matches qtl::findDupMarkers", { | ||
|
||
library(qtl) | ||
data(hyper) | ||
|
||
hyper2 <- convert2cross2(hyper) | ||
|
||
# exact only, not adjacent only | ||
set.seed(20231110) | ||
dup2 <- find_dup_markers(hyper2) | ||
|
||
set.seed(20231110) | ||
dup <- qtl::findDupMarkers(hyper) | ||
|
||
expect_equal(dup2, dup) | ||
|
||
# exact only, adjacent only | ||
set.seed(20231110) | ||
dup2 <- find_dup_markers(hyper2, adjacent_only=TRUE) | ||
|
||
set.seed(20231110) | ||
dup <- qtl::findDupMarkers(hyper, adjacent.only=TRUE) | ||
|
||
expect_equal(dup2, dup) | ||
|
||
# not exact only, not adjacent only | ||
set.seed(20231110) | ||
dup2 <- find_dup_markers(hyper2, exact_only=FALSE) | ||
|
||
set.seed(20231110) | ||
dup <- qtl::findDupMarkers(hyper, exact.only=FALSE) | ||
|
||
expect_equal(dup2, dup) | ||
|
||
# not exact only, adjacent only | ||
set.seed(20231110) | ||
dup2 <- find_dup_markers(hyper2, exact_only=FALSE, adjacent_only=TRUE) | ||
|
||
set.seed(20231110) | ||
dup <- qtl::findDupMarkers(hyper, exact.only=FALSE, adjacent.only=TRUE) | ||
|
||
expect_equal(dup2, dup) | ||
|
||
|
||
}) |