From 5076cafcffe4be00745b82cd9bdf2203c1dcd988 Mon Sep 17 00:00:00 2001
From: Karl Broman <kbroman@gmail.com>
Date: Fri, 10 Nov 2023 09:00:53 -0600
Subject: [PATCH 1/3] Implement find_dup_markers(), port of
 qtl::findDupMarkers()

- Issue #225
---
 DESCRIPTION                            |   4 +-
 NAMESPACE                              |   1 +
 NEWS.md                                |   6 +-
 R/RcppExports.R                        |   4 +
 R/find_dup_markers.R                   | 137 +++++++++++++++++++++++++
 man/find_dup_markers.Rd                |  51 +++++++++
 src/RcppExports.cpp                    |  15 +++
 src/find_dup_markers.cpp               |  51 +++++++++
 src/find_dup_markers.h                 |  12 +++
 tests/testthat/test-find_dup_markers.R |  47 +++++++++
 10 files changed, 325 insertions(+), 3 deletions(-)
 create mode 100644 R/find_dup_markers.R
 create mode 100644 man/find_dup_markers.Rd
 create mode 100644 src/find_dup_markers.cpp
 create mode 100644 src/find_dup_markers.h
 create mode 100644 tests/testthat/test-find_dup_markers.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 4b729b40..fc65ddc7 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: qtl2
-Version: 0.33-3
-Date: 2023-11-04
+Version: 0.33-4
+Date: 2023-11-10
 Title: Quantitative Trait Locus Mapping in Experimental Crosses
 Description: Provides a set of tools to perform quantitative
     trait locus (QTL) analysis in experimental crosses. It is a
diff --git a/NAMESPACE b/NAMESPACE
index ee8fee9a..1d73aee7 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -80,6 +80,7 @@ export(drop_markers)
 export(drop_nullmarkers)
 export(est_herit)
 export(est_map)
+export(find_dup_markers)
 export(find_ibd_segments)
 export(find_index_snp)
 export(find_map_gaps)
diff --git a/NEWS.md b/NEWS.md
index a6e3995d..a86df39e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,4 @@
-## qtl2 0.33-3 (2023-11-04)
+## qtl2 0.33-4 (2023-11-10)
 
 ### Major changes
 
@@ -7,6 +7,10 @@
 
 - Similarly, changed `read_csv_numer()` to `fread_csv_numer()`.
 
+- Added function `fund_dup_markers()`, for identifying subsets of
+  markers with identical genotype data. This is a port of
+  `qtl::findDupMarkers()`.
+
 ### Minor changes
 
 - Have `calc_het()` stop with an error if the genotypes have labels
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 717a6304..52a7b0e8 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -149,6 +149,10 @@ invert_founder_index <- function(cross_info) {
     .Call(`_qtl2_is_phase_known`, crosstype)
 }
 
+.find_dup_markers_notexact <- function(Geno, order, markerloc, adjacent_only) {
+    .Call(`_qtl2_find_dup_markers_notexact`, Geno, order, markerloc, adjacent_only)
+}
+
 .find_ibd_segments <- function(g1, g2, p, error_prob) {
     .Call(`_qtl2_find_ibd_segments`, g1, g2, p, error_prob)
 }
diff --git a/R/find_dup_markers.R b/R/find_dup_markers.R
new file mode 100644
index 00000000..72f261a3
--- /dev/null
+++ b/R/find_dup_markers.R
@@ -0,0 +1,137 @@
+# find_dup_markers
+#
+#' Find markers with identical genotype data
+#'
+#' Identify sets of markers with identical genotype data.
+#'
+#' @param cross
+#'
+#' @param chr Optional vector specifying which chromosomes to consider.
+#' This may be a logical, numeric, or character string vector.
+#'
+#' @param exact_only If TRUE, look only for markers that have matching
+#' genotypes and the same pattern of missing data; if FALSE, also look for
+#' cases where the observed genotypes at one marker match those at
+#' another, and where the first marker has missing genotype whenever the
+#' genotype for the second marker is missing.
+#'
+#' @param adjacent_only If TRUE, look only for sets of markers that are
+#' adjacent to each other.
+#'
+#' @return A list of marker names; each component is a set of markers whose
+#' genotypes match one other marker, and the name of the component is the
+#'  name of the marker that they match.
+#'
+#' @details
+#'  If `exact.only=TRUE`, we look only for groups of markers whose
+#'  pattern of missing data and observed genotypes match exactly.  One
+#'  marker (chosen at random) is selected as the name of the group (in the
+#'  output of the function).
+#'
+#'  If `exact.only=FALSE`, we look also for markers whose observed genotypes
+#'  are contained in the observed genotypes of another marker.  We use a
+#'  pair of nested loops, working from the markers with the most observed
+#'  genotypes to the markers with the fewest observed genotypes.
+#'
+#' @export
+#' @keywords utilities
+#'
+#' @seealso [drop_markers()], [drop_nullmarkers()], [reduce_markers()]
+#'
+#' @examples
+#' grav2 <- read_cross2(system.file("extdata", "grav2.zip", package="qtl2"))
+#' dup <- find_dup_markers(grav2)
+#' grav2_nodup <- drop_markers(grav2, unlist(dup))
+
+find_dup_markers <-
+    function(cross, chr, exact_only=TRUE, adjacent_only=FALSE)
+{
+    if(!missing(chr)) cross <- subset(cross, chr=chr)
+
+    if(!is.cross2(cross))
+        stop('Input cross should be a "cross2" object.')
+
+    g <- do.call("cbind", cross$geno)
+    markers <- colnames(g)
+    markerloc <- lapply(n_mar(cross), function(a) 1:a)
+    if(length(markerloc) > 1) {
+        for(j in 2:length(markerloc))
+            markerloc[[j]] <- markerloc[[j]] + max(markerloc[[j-1]]) + 10
+    }
+    markerloc <- unlist(markerloc)
+
+    if(exact_only) {
+        g[is.na(g)] <- 0
+
+        # genotype data patterns
+        pat <- apply(g, 2, paste, collapse="")
+
+        # table of unique values
+        tab <- table(pat)
+
+        # no duplicates; return
+        if(all(tab == 1)) return(NULL)
+
+        wh <- which(tab > 1)
+        theloc <- themar <- vector("list", length(wh))
+        for(i in seq(along=wh)) {
+            themar[[i]] <- names(pat)[pat==names(tab)[wh[i]]]
+            theloc[[i]] <- markerloc[pat==names(tab)[wh[i]]]
+        }
+
+        if(adjacent_only) {
+            extraloc <- list()
+            extramar <- list()
+            for(i in seq(along=theloc)) {
+                d <- diff(theloc[[i]]) # look for adjacency
+                if(any(d>1)) { # split into adjacent groups
+                    temp <- which(d>1)
+                    first <- c(1, temp+1)
+                    last <- c(temp, length(theloc[[i]]))
+                    for(j in 2:length(first)) {
+                        extraloc[[length(extraloc)+1]] <- theloc[[i]][first[j]:last[j]]
+                        extramar[[length(extramar)+1]] <- themar[[i]][first[j]:last[j]]
+                    }
+                    themar[[i]] <- themar[[i]][first[1]:last[1]]
+                    theloc[[i]] <- theloc[[i]][first[1]:last[1]]
+                }
+            }
+            themar <- c(themar, extramar)
+            theloc <- c(theloc, extraloc)
+
+            nm <- sapply(themar, length)
+            if(all(nm==1)) return(NULL) # nothing left
+            themar <- themar[nm>1]
+            theloc <- theloc[nm>1]
+        }
+
+        # order by first locus
+        o <- order(sapply(theloc, min))
+        themar <- themar[o]
+
+        randompics <- sapply(themar, function(a) sample(length(a), 1))
+        for(i in seq(along=themar)) {
+            names(themar)[i] <- themar[[i]][randompics[i]]
+            themar[[i]] <- themar[[i]][-randompics[i]]
+        }
+
+    }
+    else {
+        themar <- NULL
+
+        ntyp <- n_typed(cross, "marker")
+        o <- order(ntyp, decreasing=TRUE)
+
+        g[is.na(g)] <- 0
+        result <- .find_dup_markers_notexact(g, o, markerloc, adjacent_only)
+
+        if(all(result==0)) return(NULL)
+        u <- unique(result[result != 0])
+        themar <- vector("list", length(u))
+        names(themar) <- colnames(g)[u]
+        for(i in seq(along=themar))
+            themar[[i]] <- colnames(g)[result==u[i]]
+    }
+
+    themar
+}
diff --git a/man/find_dup_markers.Rd b/man/find_dup_markers.Rd
new file mode 100644
index 00000000..7124a9c3
--- /dev/null
+++ b/man/find_dup_markers.Rd
@@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/find_dup_markers.R
+\name{find_dup_markers}
+\alias{find_dup_markers}
+\title{Find markers with identical genotype data}
+\usage{
+find_dup_markers(cross, chr, exact_only = TRUE, adjacent_only = FALSE)
+}
+\arguments{
+\item{cross}{}
+
+\item{chr}{Optional vector specifying which chromosomes to consider.
+This may be a logical, numeric, or character string vector.}
+
+\item{exact_only}{If TRUE, look only for markers that have matching
+genotypes and the same pattern of missing data; if FALSE, also look for
+cases where the observed genotypes at one marker match those at
+another, and where the first marker has missing genotype whenever the
+genotype for the second marker is missing.}
+
+\item{adjacent_only}{If TRUE, look only for sets of markers that are
+adjacent to each other.}
+}
+\value{
+A list of marker names; each component is a set of markers whose
+genotypes match one other marker, and the name of the component is the
+name of the marker that they match.
+}
+\description{
+Identify sets of markers with identical genotype data.
+}
+\details{
+If \code{exact.only=TRUE}, we look only for groups of markers whose
+pattern of missing data and observed genotypes match exactly.  One
+marker (chosen at random) is selected as the name of the group (in the
+output of the function).
+
+If \code{exact.only=FALSE}, we look also for markers whose observed genotypes
+are contained in the observed genotypes of another marker.  We use a
+pair of nested loops, working from the markers with the most observed
+genotypes to the markers with the fewest observed genotypes.
+}
+\examples{
+grav2 <- read_cross2(system.file("extdata", "grav2.zip", package="qtl2"))
+dup <- find_dup_markers(grav2)
+grav2_nodup <- drop_markers(grav2, unlist(dup))
+}
+\seealso{
+\code{\link[=drop_markers]{drop_markers()}}, \code{\link[=drop_nullmarkers]{drop_nullmarkers()}}, \code{\link[=reduce_markers]{reduce_markers()}}
+}
+\keyword{utilities}
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 503efe9e..2090f685 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -545,6 +545,20 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// find_dup_markers_notexact
+IntegerVector find_dup_markers_notexact(const IntegerMatrix& Geno, const IntegerVector& order, const IntegerVector& markerloc, const bool adjacent_only);
+RcppExport SEXP _qtl2_find_dup_markers_notexact(SEXP GenoSEXP, SEXP orderSEXP, SEXP markerlocSEXP, SEXP adjacent_onlySEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const IntegerMatrix& >::type Geno(GenoSEXP);
+    Rcpp::traits::input_parameter< const IntegerVector& >::type order(orderSEXP);
+    Rcpp::traits::input_parameter< const IntegerVector& >::type markerloc(markerlocSEXP);
+    Rcpp::traits::input_parameter< const bool >::type adjacent_only(adjacent_onlySEXP);
+    rcpp_result_gen = Rcpp::wrap(find_dup_markers_notexact(Geno, order, markerloc, adjacent_only));
+    return rcpp_result_gen;
+END_RCPP
+}
 // find_ibd_segments
 NumericMatrix find_ibd_segments(const IntegerVector& g1, const IntegerVector& g2, const NumericVector& p, const double error_prob);
 RcppExport SEXP _qtl2_find_ibd_segments(SEXP g1SEXP, SEXP g2SEXP, SEXP pSEXP, SEXP error_probSEXP) {
@@ -2455,6 +2469,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_qtl2_mpp_geno_names", (DL_FUNC) &_qtl2_mpp_geno_names, 2},
     {"_qtl2_invert_founder_index", (DL_FUNC) &_qtl2_invert_founder_index, 1},
     {"_qtl2_is_phase_known", (DL_FUNC) &_qtl2_is_phase_known, 1},
+    {"_qtl2_find_dup_markers_notexact", (DL_FUNC) &_qtl2_find_dup_markers_notexact, 4},
     {"_qtl2_find_ibd_segments", (DL_FUNC) &_qtl2_find_ibd_segments, 4},
     {"_qtl2_R_find_peaks", (DL_FUNC) &_qtl2_R_find_peaks, 3},
     {"_qtl2_R_find_peaks_and_lodint", (DL_FUNC) &_qtl2_R_find_peaks_and_lodint, 4},
diff --git a/src/find_dup_markers.cpp b/src/find_dup_markers.cpp
new file mode 100644
index 00000000..1c628e73
--- /dev/null
+++ b/src/find_dup_markers.cpp
@@ -0,0 +1,51 @@
+// find subsets of markers with identical genotypes
+
+#include "find_dup_markers.h"
+#include <Rcpp.h>
+
+using namespace Rcpp;
+
+// [[Rcpp::export(".find_dup_markers_notexact")]]
+IntegerVector find_dup_markers_notexact(const IntegerMatrix& Geno, // matrix of genotypes, individuals x markers
+                                        const IntegerVector& order, // vector indicating order to be considered, most data to least
+                                        const IntegerVector& markerloc, //
+                                        const bool adjacent_only)   // if true, consider only adjacent markers
+{
+    const int n_ind = Geno.rows();
+    const int n_mar = Geno.cols();
+    if(order.size() != n_mar)
+        throw std::invalid_argument("length(order) != ncol(Geno)");
+    if(markerloc.size() != n_mar)
+        throw std::invalid_argument("length(markerloc) != ncol(Geno)");
+
+    IntegerVector result(n_mar);
+    for(int i=0; i<n_mar; i++) result[i] = 0;
+
+    for(int i=0; i<n_mar-1; i++) {
+        int oi = order[i]-1;
+        for(int j=(i+1); j<n_mar; j++) {
+            int oj = order[j]-1;
+
+            if(result[oj] != 0 ||
+               (adjacent_only && abs(markerloc[oi] - markerloc[oj]) > 1)) {
+                /* skip */
+            }
+            else {
+                int flag = 0;
+                for(int k=0; k<n_ind; k++) {
+                    if((Geno(k,oi)==0 && Geno(k,oj)!=0) ||
+                       (Geno(k,oi)!=0 && Geno(k,oj)!=0 && Geno(k,oi) != Geno(k,oj))) {
+                        flag = 1;
+                        break;
+                    }
+                }
+                if(!flag) { /* it worked */
+                    if(result[oi] != 0) result[oj] = result[oi];
+                    else result[oj] = oi+1;
+                }
+            }
+        }
+    }
+
+    return(result);
+}
diff --git a/src/find_dup_markers.h b/src/find_dup_markers.h
new file mode 100644
index 00000000..032a93a8
--- /dev/null
+++ b/src/find_dup_markers.h
@@ -0,0 +1,12 @@
+// find subsets of markers with identical genotypes
+#ifndef FIND_DUP_MARKERS_H
+#define FIND_DUP_MARKERS_H
+
+#include <Rcpp.h>
+
+Rcpp::IntegerVector find_dup_markers_notexact(const Rcpp::IntegerMatrix& Geno, // matrix of genotypes, individuals x markers
+                                              const Rcpp::IntegerVector& order, // vector indicating order to be considered, most data to least
+                                              const Rcpp::IntegerVector markerloc, // integer vector indicating "position"
+                                              const bool adjacent_only);   // if true, consider only adjacent markers
+
+#endif // FIND_DUP_MARKERS_H
diff --git a/tests/testthat/test-find_dup_markers.R b/tests/testthat/test-find_dup_markers.R
new file mode 100644
index 00000000..7ef5bc30
--- /dev/null
+++ b/tests/testthat/test-find_dup_markers.R
@@ -0,0 +1,47 @@
+context("find duplicate markers")
+
+test_that("find_dup_markers matches qtl::findDupMarkers", {
+
+    library(qtl)
+    data(hyper)
+
+    hyper2 <- convert2cross2(hyper)
+
+    # exact only, not adjacent only
+    set.seed(20231110)
+    dup2 <- find_dup_markers(hyper2)
+
+    set.seed(20231110)
+    dup <- qtl::findDupMarkers(hyper)
+
+    expect_equal(dup2, dup)
+
+    # exact only, adjacent only
+    set.seed(20231110)
+    dup2 <- find_dup_markers(hyper2, adjacent_only=TRUE)
+
+    set.seed(20231110)
+    dup <- qtl::findDupMarkers(hyper, adjacent.only=TRUE)
+
+    expect_equal(dup2, dup)
+
+    # not exact only, not adjacent only
+    set.seed(20231110)
+    dup2 <- find_dup_markers(hyper2, exact_only=FALSE)
+
+    set.seed(20231110)
+    dup <- qtl::findDupMarkers(hyper, exact.only=FALSE)
+
+    expect_equal(dup2, dup)
+
+    # not exact only, adjacent only
+    set.seed(20231110)
+    dup2 <- find_dup_markers(hyper2, exact_only=FALSE, adjacent_only=TRUE)
+
+    set.seed(20231110)
+    dup <- qtl::findDupMarkers(hyper, exact.only=FALSE, adjacent.only=TRUE)
+
+    expect_equal(dup2, dup)
+
+
+})

From 7416ee57dd0b64f9f175af0c3744580d0a83d52e Mon Sep 17 00:00:00 2001
From: Karl Broman <kbroman@gmail.com>
Date: Fri, 10 Nov 2023 09:01:31 -0600
Subject: [PATCH 2/3] Add cross-refs in help info

---
 R/drop_markers.R      | 2 +-
 R/reduce_markers.R    | 2 ++
 man/drop_markers.Rd   | 2 +-
 man/reduce_markers.Rd | 3 +++
 4 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/R/drop_markers.R b/R/drop_markers.R
index 264c8f5d..68ec22e1 100644
--- a/R/drop_markers.R
+++ b/R/drop_markers.R
@@ -10,7 +10,7 @@
 #' @return The input `cross` with the specified markers removed.
 #'
 #' @export
-#' @seealso [pull_markers()], [drop_nullmarkers()]
+#' @seealso [pull_markers()], [drop_nullmarkers()], [reduce_markers()], [find_dup_markers()]
 #'
 #' @examples
 #' grav2 <- read_cross2(system.file("extdata", "grav2.zip", package="qtl2"))
diff --git a/R/reduce_markers.R b/R/reduce_markers.R
index 3d98cc94..26d55f0e 100644
--- a/R/reduce_markers.R
+++ b/R/reduce_markers.R
@@ -31,6 +31,8 @@
 #' the algorithm applied to each batch with min_distance smaller by a
 #' factor `min_distance_mult`, and then merged together for one last pass.
 #'
+#' @seealso [find_dup_markers()], [drop_markers()]
+#'
 #' @references Broman KW, Weber JL (1999) Method for constructing
 #' confidently ordered linkage maps. Genet Epidemiol 16:337--343
 #'
diff --git a/man/drop_markers.Rd b/man/drop_markers.Rd
index 7a3b63a6..07e3a416 100644
--- a/man/drop_markers.Rd
+++ b/man/drop_markers.Rd
@@ -24,5 +24,5 @@ markers2drop <- c("BH.342C/347L-Col", "GH.94L", "EG.357C/359L-Col", "CD.245L", "
 grav2_rev <- drop_markers(grav2, markers2drop)
 }
 \seealso{
-\code{\link[=pull_markers]{pull_markers()}}, \code{\link[=drop_nullmarkers]{drop_nullmarkers()}}
+\code{\link[=pull_markers]{pull_markers()}}, \code{\link[=drop_nullmarkers]{drop_nullmarkers()}}, \code{\link[=reduce_markers]{reduce_markers()}}, \code{\link[=find_dup_markers]{find_dup_markers()}}
 }
diff --git a/man/reduce_markers.Rd b/man/reduce_markers.Rd
index 4ae12a21..0ddbbabc 100644
--- a/man/reduce_markers.Rd
+++ b/man/reduce_markers.Rd
@@ -71,3 +71,6 @@ grav2_sub <- pull_markers(grav2, markers2keep)
 Broman KW, Weber JL (1999) Method for constructing
 confidently ordered linkage maps. Genet Epidemiol 16:337--343
 }
+\seealso{
+\code{\link[=find_dup_markers]{find_dup_markers()}}, \code{\link[=drop_markers]{drop_markers()}}
+}

From 2ddfc21fe88171bb9c1aad68b911174a0f30fb23 Mon Sep 17 00:00:00 2001
From: Karl Broman <kbroman@gmail.com>
Date: Fri, 10 Nov 2023 13:06:05 -0600
Subject: [PATCH 3/3] Oops forgot to explain cross arg in find_dup_markers()

---
 R/find_dup_markers.R    | 3 ++-
 man/find_dup_markers.Rd | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/R/find_dup_markers.R b/R/find_dup_markers.R
index 72f261a3..ad3f7ffa 100644
--- a/R/find_dup_markers.R
+++ b/R/find_dup_markers.R
@@ -4,7 +4,8 @@
 #'
 #' Identify sets of markers with identical genotype data.
 #'
-#' @param cross
+#' @param cross Object of class `"cross2"`. For details, see the
+#' [R/qtl2 developer guide](https://kbroman.org/qtl2/assets/vignettes/developer_guide.html).
 #'
 #' @param chr Optional vector specifying which chromosomes to consider.
 #' This may be a logical, numeric, or character string vector.
diff --git a/man/find_dup_markers.Rd b/man/find_dup_markers.Rd
index 7124a9c3..43d77d67 100644
--- a/man/find_dup_markers.Rd
+++ b/man/find_dup_markers.Rd
@@ -7,7 +7,8 @@
 find_dup_markers(cross, chr, exact_only = TRUE, adjacent_only = FALSE)
 }
 \arguments{
-\item{cross}{}
+\item{cross}{Object of class \code{"cross2"}. For details, see the
+\href{https://kbroman.org/qtl2/assets/vignettes/developer_guide.html}{R/qtl2 developer guide}.}
 
 \item{chr}{Optional vector specifying which chromosomes to consider.
 This may be a logical, numeric, or character string vector.}