From 5076cafcffe4be00745b82cd9bdf2203c1dcd988 Mon Sep 17 00:00:00 2001 From: Karl Broman Date: Fri, 10 Nov 2023 09:00:53 -0600 Subject: [PATCH 1/3] Implement find_dup_markers(), port of qtl::findDupMarkers() - Issue #225 --- DESCRIPTION | 4 +- NAMESPACE | 1 + NEWS.md | 6 +- R/RcppExports.R | 4 + R/find_dup_markers.R | 137 +++++++++++++++++++++++++ man/find_dup_markers.Rd | 51 +++++++++ src/RcppExports.cpp | 15 +++ src/find_dup_markers.cpp | 51 +++++++++ src/find_dup_markers.h | 12 +++ tests/testthat/test-find_dup_markers.R | 47 +++++++++ 10 files changed, 325 insertions(+), 3 deletions(-) create mode 100644 R/find_dup_markers.R create mode 100644 man/find_dup_markers.Rd create mode 100644 src/find_dup_markers.cpp create mode 100644 src/find_dup_markers.h create mode 100644 tests/testthat/test-find_dup_markers.R diff --git a/DESCRIPTION b/DESCRIPTION index 4b729b40..fc65ddc7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: qtl2 -Version: 0.33-3 -Date: 2023-11-04 +Version: 0.33-4 +Date: 2023-11-10 Title: Quantitative Trait Locus Mapping in Experimental Crosses Description: Provides a set of tools to perform quantitative trait locus (QTL) analysis in experimental crosses. It is a diff --git a/NAMESPACE b/NAMESPACE index ee8fee9a..1d73aee7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -80,6 +80,7 @@ export(drop_markers) export(drop_nullmarkers) export(est_herit) export(est_map) +export(find_dup_markers) export(find_ibd_segments) export(find_index_snp) export(find_map_gaps) diff --git a/NEWS.md b/NEWS.md index a6e3995d..a86df39e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -## qtl2 0.33-3 (2023-11-04) +## qtl2 0.33-4 (2023-11-10) ### Major changes @@ -7,6 +7,10 @@ - Similarly, changed `read_csv_numer()` to `fread_csv_numer()`. +- Added function `fund_dup_markers()`, for identifying subsets of + markers with identical genotype data. This is a port of + `qtl::findDupMarkers()`. + ### Minor changes - Have `calc_het()` stop with an error if the genotypes have labels diff --git a/R/RcppExports.R b/R/RcppExports.R index 717a6304..52a7b0e8 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -149,6 +149,10 @@ invert_founder_index <- function(cross_info) { .Call(`_qtl2_is_phase_known`, crosstype) } +.find_dup_markers_notexact <- function(Geno, order, markerloc, adjacent_only) { + .Call(`_qtl2_find_dup_markers_notexact`, Geno, order, markerloc, adjacent_only) +} + .find_ibd_segments <- function(g1, g2, p, error_prob) { .Call(`_qtl2_find_ibd_segments`, g1, g2, p, error_prob) } diff --git a/R/find_dup_markers.R b/R/find_dup_markers.R new file mode 100644 index 00000000..72f261a3 --- /dev/null +++ b/R/find_dup_markers.R @@ -0,0 +1,137 @@ +# find_dup_markers +# +#' Find markers with identical genotype data +#' +#' Identify sets of markers with identical genotype data. +#' +#' @param cross +#' +#' @param chr Optional vector specifying which chromosomes to consider. +#' This may be a logical, numeric, or character string vector. +#' +#' @param exact_only If TRUE, look only for markers that have matching +#' genotypes and the same pattern of missing data; if FALSE, also look for +#' cases where the observed genotypes at one marker match those at +#' another, and where the first marker has missing genotype whenever the +#' genotype for the second marker is missing. +#' +#' @param adjacent_only If TRUE, look only for sets of markers that are +#' adjacent to each other. +#' +#' @return A list of marker names; each component is a set of markers whose +#' genotypes match one other marker, and the name of the component is the +#' name of the marker that they match. +#' +#' @details +#' If `exact.only=TRUE`, we look only for groups of markers whose +#' pattern of missing data and observed genotypes match exactly. One +#' marker (chosen at random) is selected as the name of the group (in the +#' output of the function). +#' +#' If `exact.only=FALSE`, we look also for markers whose observed genotypes +#' are contained in the observed genotypes of another marker. We use a +#' pair of nested loops, working from the markers with the most observed +#' genotypes to the markers with the fewest observed genotypes. +#' +#' @export +#' @keywords utilities +#' +#' @seealso [drop_markers()], [drop_nullmarkers()], [reduce_markers()] +#' +#' @examples +#' grav2 <- read_cross2(system.file("extdata", "grav2.zip", package="qtl2")) +#' dup <- find_dup_markers(grav2) +#' grav2_nodup <- drop_markers(grav2, unlist(dup)) + +find_dup_markers <- + function(cross, chr, exact_only=TRUE, adjacent_only=FALSE) +{ + if(!missing(chr)) cross <- subset(cross, chr=chr) + + if(!is.cross2(cross)) + stop('Input cross should be a "cross2" object.') + + g <- do.call("cbind", cross$geno) + markers <- colnames(g) + markerloc <- lapply(n_mar(cross), function(a) 1:a) + if(length(markerloc) > 1) { + for(j in 2:length(markerloc)) + markerloc[[j]] <- markerloc[[j]] + max(markerloc[[j-1]]) + 10 + } + markerloc <- unlist(markerloc) + + if(exact_only) { + g[is.na(g)] <- 0 + + # genotype data patterns + pat <- apply(g, 2, paste, collapse="") + + # table of unique values + tab <- table(pat) + + # no duplicates; return + if(all(tab == 1)) return(NULL) + + wh <- which(tab > 1) + theloc <- themar <- vector("list", length(wh)) + for(i in seq(along=wh)) { + themar[[i]] <- names(pat)[pat==names(tab)[wh[i]]] + theloc[[i]] <- markerloc[pat==names(tab)[wh[i]]] + } + + if(adjacent_only) { + extraloc <- list() + extramar <- list() + for(i in seq(along=theloc)) { + d <- diff(theloc[[i]]) # look for adjacency + if(any(d>1)) { # split into adjacent groups + temp <- which(d>1) + first <- c(1, temp+1) + last <- c(temp, length(theloc[[i]])) + for(j in 2:length(first)) { + extraloc[[length(extraloc)+1]] <- theloc[[i]][first[j]:last[j]] + extramar[[length(extramar)+1]] <- themar[[i]][first[j]:last[j]] + } + themar[[i]] <- themar[[i]][first[1]:last[1]] + theloc[[i]] <- theloc[[i]][first[1]:last[1]] + } + } + themar <- c(themar, extramar) + theloc <- c(theloc, extraloc) + + nm <- sapply(themar, length) + if(all(nm==1)) return(NULL) # nothing left + themar <- themar[nm>1] + theloc <- theloc[nm>1] + } + + # order by first locus + o <- order(sapply(theloc, min)) + themar <- themar[o] + + randompics <- sapply(themar, function(a) sample(length(a), 1)) + for(i in seq(along=themar)) { + names(themar)[i] <- themar[[i]][randompics[i]] + themar[[i]] <- themar[[i]][-randompics[i]] + } + + } + else { + themar <- NULL + + ntyp <- n_typed(cross, "marker") + o <- order(ntyp, decreasing=TRUE) + + g[is.na(g)] <- 0 + result <- .find_dup_markers_notexact(g, o, markerloc, adjacent_only) + + if(all(result==0)) return(NULL) + u <- unique(result[result != 0]) + themar <- vector("list", length(u)) + names(themar) <- colnames(g)[u] + for(i in seq(along=themar)) + themar[[i]] <- colnames(g)[result==u[i]] + } + + themar +} diff --git a/man/find_dup_markers.Rd b/man/find_dup_markers.Rd new file mode 100644 index 00000000..7124a9c3 --- /dev/null +++ b/man/find_dup_markers.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/find_dup_markers.R +\name{find_dup_markers} +\alias{find_dup_markers} +\title{Find markers with identical genotype data} +\usage{ +find_dup_markers(cross, chr, exact_only = TRUE, adjacent_only = FALSE) +} +\arguments{ +\item{cross}{} + +\item{chr}{Optional vector specifying which chromosomes to consider. +This may be a logical, numeric, or character string vector.} + +\item{exact_only}{If TRUE, look only for markers that have matching +genotypes and the same pattern of missing data; if FALSE, also look for +cases where the observed genotypes at one marker match those at +another, and where the first marker has missing genotype whenever the +genotype for the second marker is missing.} + +\item{adjacent_only}{If TRUE, look only for sets of markers that are +adjacent to each other.} +} +\value{ +A list of marker names; each component is a set of markers whose +genotypes match one other marker, and the name of the component is the +name of the marker that they match. +} +\description{ +Identify sets of markers with identical genotype data. +} +\details{ +If \code{exact.only=TRUE}, we look only for groups of markers whose +pattern of missing data and observed genotypes match exactly. One +marker (chosen at random) is selected as the name of the group (in the +output of the function). + +If \code{exact.only=FALSE}, we look also for markers whose observed genotypes +are contained in the observed genotypes of another marker. We use a +pair of nested loops, working from the markers with the most observed +genotypes to the markers with the fewest observed genotypes. +} +\examples{ +grav2 <- read_cross2(system.file("extdata", "grav2.zip", package="qtl2")) +dup <- find_dup_markers(grav2) +grav2_nodup <- drop_markers(grav2, unlist(dup)) +} +\seealso{ +\code{\link[=drop_markers]{drop_markers()}}, \code{\link[=drop_nullmarkers]{drop_nullmarkers()}}, \code{\link[=reduce_markers]{reduce_markers()}} +} +\keyword{utilities} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 503efe9e..2090f685 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -545,6 +545,20 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// find_dup_markers_notexact +IntegerVector find_dup_markers_notexact(const IntegerMatrix& Geno, const IntegerVector& order, const IntegerVector& markerloc, const bool adjacent_only); +RcppExport SEXP _qtl2_find_dup_markers_notexact(SEXP GenoSEXP, SEXP orderSEXP, SEXP markerlocSEXP, SEXP adjacent_onlySEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const IntegerMatrix& >::type Geno(GenoSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type order(orderSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type markerloc(markerlocSEXP); + Rcpp::traits::input_parameter< const bool >::type adjacent_only(adjacent_onlySEXP); + rcpp_result_gen = Rcpp::wrap(find_dup_markers_notexact(Geno, order, markerloc, adjacent_only)); + return rcpp_result_gen; +END_RCPP +} // find_ibd_segments NumericMatrix find_ibd_segments(const IntegerVector& g1, const IntegerVector& g2, const NumericVector& p, const double error_prob); RcppExport SEXP _qtl2_find_ibd_segments(SEXP g1SEXP, SEXP g2SEXP, SEXP pSEXP, SEXP error_probSEXP) { @@ -2455,6 +2469,7 @@ static const R_CallMethodDef CallEntries[] = { {"_qtl2_mpp_geno_names", (DL_FUNC) &_qtl2_mpp_geno_names, 2}, {"_qtl2_invert_founder_index", (DL_FUNC) &_qtl2_invert_founder_index, 1}, {"_qtl2_is_phase_known", (DL_FUNC) &_qtl2_is_phase_known, 1}, + {"_qtl2_find_dup_markers_notexact", (DL_FUNC) &_qtl2_find_dup_markers_notexact, 4}, {"_qtl2_find_ibd_segments", (DL_FUNC) &_qtl2_find_ibd_segments, 4}, {"_qtl2_R_find_peaks", (DL_FUNC) &_qtl2_R_find_peaks, 3}, {"_qtl2_R_find_peaks_and_lodint", (DL_FUNC) &_qtl2_R_find_peaks_and_lodint, 4}, diff --git a/src/find_dup_markers.cpp b/src/find_dup_markers.cpp new file mode 100644 index 00000000..1c628e73 --- /dev/null +++ b/src/find_dup_markers.cpp @@ -0,0 +1,51 @@ +// find subsets of markers with identical genotypes + +#include "find_dup_markers.h" +#include + +using namespace Rcpp; + +// [[Rcpp::export(".find_dup_markers_notexact")]] +IntegerVector find_dup_markers_notexact(const IntegerMatrix& Geno, // matrix of genotypes, individuals x markers + const IntegerVector& order, // vector indicating order to be considered, most data to least + const IntegerVector& markerloc, // + const bool adjacent_only) // if true, consider only adjacent markers +{ + const int n_ind = Geno.rows(); + const int n_mar = Geno.cols(); + if(order.size() != n_mar) + throw std::invalid_argument("length(order) != ncol(Geno)"); + if(markerloc.size() != n_mar) + throw std::invalid_argument("length(markerloc) != ncol(Geno)"); + + IntegerVector result(n_mar); + for(int i=0; i 1)) { + /* skip */ + } + else { + int flag = 0; + for(int k=0; k + +Rcpp::IntegerVector find_dup_markers_notexact(const Rcpp::IntegerMatrix& Geno, // matrix of genotypes, individuals x markers + const Rcpp::IntegerVector& order, // vector indicating order to be considered, most data to least + const Rcpp::IntegerVector markerloc, // integer vector indicating "position" + const bool adjacent_only); // if true, consider only adjacent markers + +#endif // FIND_DUP_MARKERS_H diff --git a/tests/testthat/test-find_dup_markers.R b/tests/testthat/test-find_dup_markers.R new file mode 100644 index 00000000..7ef5bc30 --- /dev/null +++ b/tests/testthat/test-find_dup_markers.R @@ -0,0 +1,47 @@ +context("find duplicate markers") + +test_that("find_dup_markers matches qtl::findDupMarkers", { + + library(qtl) + data(hyper) + + hyper2 <- convert2cross2(hyper) + + # exact only, not adjacent only + set.seed(20231110) + dup2 <- find_dup_markers(hyper2) + + set.seed(20231110) + dup <- qtl::findDupMarkers(hyper) + + expect_equal(dup2, dup) + + # exact only, adjacent only + set.seed(20231110) + dup2 <- find_dup_markers(hyper2, adjacent_only=TRUE) + + set.seed(20231110) + dup <- qtl::findDupMarkers(hyper, adjacent.only=TRUE) + + expect_equal(dup2, dup) + + # not exact only, not adjacent only + set.seed(20231110) + dup2 <- find_dup_markers(hyper2, exact_only=FALSE) + + set.seed(20231110) + dup <- qtl::findDupMarkers(hyper, exact.only=FALSE) + + expect_equal(dup2, dup) + + # not exact only, adjacent only + set.seed(20231110) + dup2 <- find_dup_markers(hyper2, exact_only=FALSE, adjacent_only=TRUE) + + set.seed(20231110) + dup <- qtl::findDupMarkers(hyper, exact.only=FALSE, adjacent.only=TRUE) + + expect_equal(dup2, dup) + + +}) From 7416ee57dd0b64f9f175af0c3744580d0a83d52e Mon Sep 17 00:00:00 2001 From: Karl Broman Date: Fri, 10 Nov 2023 09:01:31 -0600 Subject: [PATCH 2/3] Add cross-refs in help info --- R/drop_markers.R | 2 +- R/reduce_markers.R | 2 ++ man/drop_markers.Rd | 2 +- man/reduce_markers.Rd | 3 +++ 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/R/drop_markers.R b/R/drop_markers.R index 264c8f5d..68ec22e1 100644 --- a/R/drop_markers.R +++ b/R/drop_markers.R @@ -10,7 +10,7 @@ #' @return The input `cross` with the specified markers removed. #' #' @export -#' @seealso [pull_markers()], [drop_nullmarkers()] +#' @seealso [pull_markers()], [drop_nullmarkers()], [reduce_markers()], [find_dup_markers()] #' #' @examples #' grav2 <- read_cross2(system.file("extdata", "grav2.zip", package="qtl2")) diff --git a/R/reduce_markers.R b/R/reduce_markers.R index 3d98cc94..26d55f0e 100644 --- a/R/reduce_markers.R +++ b/R/reduce_markers.R @@ -31,6 +31,8 @@ #' the algorithm applied to each batch with min_distance smaller by a #' factor `min_distance_mult`, and then merged together for one last pass. #' +#' @seealso [find_dup_markers()], [drop_markers()] +#' #' @references Broman KW, Weber JL (1999) Method for constructing #' confidently ordered linkage maps. Genet Epidemiol 16:337--343 #' diff --git a/man/drop_markers.Rd b/man/drop_markers.Rd index 7a3b63a6..07e3a416 100644 --- a/man/drop_markers.Rd +++ b/man/drop_markers.Rd @@ -24,5 +24,5 @@ markers2drop <- c("BH.342C/347L-Col", "GH.94L", "EG.357C/359L-Col", "CD.245L", " grav2_rev <- drop_markers(grav2, markers2drop) } \seealso{ -\code{\link[=pull_markers]{pull_markers()}}, \code{\link[=drop_nullmarkers]{drop_nullmarkers()}} +\code{\link[=pull_markers]{pull_markers()}}, \code{\link[=drop_nullmarkers]{drop_nullmarkers()}}, \code{\link[=reduce_markers]{reduce_markers()}}, \code{\link[=find_dup_markers]{find_dup_markers()}} } diff --git a/man/reduce_markers.Rd b/man/reduce_markers.Rd index 4ae12a21..0ddbbabc 100644 --- a/man/reduce_markers.Rd +++ b/man/reduce_markers.Rd @@ -71,3 +71,6 @@ grav2_sub <- pull_markers(grav2, markers2keep) Broman KW, Weber JL (1999) Method for constructing confidently ordered linkage maps. Genet Epidemiol 16:337--343 } +\seealso{ +\code{\link[=find_dup_markers]{find_dup_markers()}}, \code{\link[=drop_markers]{drop_markers()}} +} From 2ddfc21fe88171bb9c1aad68b911174a0f30fb23 Mon Sep 17 00:00:00 2001 From: Karl Broman Date: Fri, 10 Nov 2023 13:06:05 -0600 Subject: [PATCH 3/3] Oops forgot to explain cross arg in find_dup_markers() --- R/find_dup_markers.R | 3 ++- man/find_dup_markers.Rd | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/R/find_dup_markers.R b/R/find_dup_markers.R index 72f261a3..ad3f7ffa 100644 --- a/R/find_dup_markers.R +++ b/R/find_dup_markers.R @@ -4,7 +4,8 @@ #' #' Identify sets of markers with identical genotype data. #' -#' @param cross +#' @param cross Object of class `"cross2"`. For details, see the +#' [R/qtl2 developer guide](https://kbroman.org/qtl2/assets/vignettes/developer_guide.html). #' #' @param chr Optional vector specifying which chromosomes to consider. #' This may be a logical, numeric, or character string vector. diff --git a/man/find_dup_markers.Rd b/man/find_dup_markers.Rd index 7124a9c3..43d77d67 100644 --- a/man/find_dup_markers.Rd +++ b/man/find_dup_markers.Rd @@ -7,7 +7,8 @@ find_dup_markers(cross, chr, exact_only = TRUE, adjacent_only = FALSE) } \arguments{ -\item{cross}{} +\item{cross}{Object of class \code{"cross2"}. For details, see the +\href{https://kbroman.org/qtl2/assets/vignettes/developer_guide.html}{R/qtl2 developer guide}.} \item{chr}{Optional vector specifying which chromosomes to consider. This may be a logical, numeric, or character string vector.}