Skip to content

Commit

Permalink
Implement find_dup_markers(), port of qtl::findDupMarkers()
Browse files Browse the repository at this point in the history
- Issue rqtl#225
  • Loading branch information
kbroman committed Nov 10, 2023
1 parent d8dd24b commit 5076caf
Show file tree
Hide file tree
Showing 10 changed files with 325 additions and 3 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: qtl2
Version: 0.33-3
Date: 2023-11-04
Version: 0.33-4
Date: 2023-11-10
Title: Quantitative Trait Locus Mapping in Experimental Crosses
Description: Provides a set of tools to perform quantitative
trait locus (QTL) analysis in experimental crosses. It is a
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ export(drop_markers)
export(drop_nullmarkers)
export(est_herit)
export(est_map)
export(find_dup_markers)
export(find_ibd_segments)
export(find_index_snp)
export(find_map_gaps)
Expand Down
6 changes: 5 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## qtl2 0.33-3 (2023-11-04)
## qtl2 0.33-4 (2023-11-10)

### Major changes

Expand All @@ -7,6 +7,10 @@

- Similarly, changed `read_csv_numer()` to `fread_csv_numer()`.

- Added function `fund_dup_markers()`, for identifying subsets of
markers with identical genotype data. This is a port of
`qtl::findDupMarkers()`.

### Minor changes

- Have `calc_het()` stop with an error if the genotypes have labels
Expand Down
4 changes: 4 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,10 @@ invert_founder_index <- function(cross_info) {
.Call(`_qtl2_is_phase_known`, crosstype)
}

.find_dup_markers_notexact <- function(Geno, order, markerloc, adjacent_only) {
.Call(`_qtl2_find_dup_markers_notexact`, Geno, order, markerloc, adjacent_only)
}

.find_ibd_segments <- function(g1, g2, p, error_prob) {
.Call(`_qtl2_find_ibd_segments`, g1, g2, p, error_prob)
}
Expand Down
137 changes: 137 additions & 0 deletions R/find_dup_markers.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# find_dup_markers
#
#' Find markers with identical genotype data
#'
#' Identify sets of markers with identical genotype data.
#'
#' @param cross
#'
#' @param chr Optional vector specifying which chromosomes to consider.
#' This may be a logical, numeric, or character string vector.
#'
#' @param exact_only If TRUE, look only for markers that have matching
#' genotypes and the same pattern of missing data; if FALSE, also look for
#' cases where the observed genotypes at one marker match those at
#' another, and where the first marker has missing genotype whenever the
#' genotype for the second marker is missing.
#'
#' @param adjacent_only If TRUE, look only for sets of markers that are
#' adjacent to each other.
#'
#' @return A list of marker names; each component is a set of markers whose
#' genotypes match one other marker, and the name of the component is the
#' name of the marker that they match.
#'
#' @details
#' If `exact.only=TRUE`, we look only for groups of markers whose
#' pattern of missing data and observed genotypes match exactly. One
#' marker (chosen at random) is selected as the name of the group (in the
#' output of the function).
#'
#' If `exact.only=FALSE`, we look also for markers whose observed genotypes
#' are contained in the observed genotypes of another marker. We use a
#' pair of nested loops, working from the markers with the most observed
#' genotypes to the markers with the fewest observed genotypes.
#'
#' @export
#' @keywords utilities
#'
#' @seealso [drop_markers()], [drop_nullmarkers()], [reduce_markers()]
#'
#' @examples
#' grav2 <- read_cross2(system.file("extdata", "grav2.zip", package="qtl2"))
#' dup <- find_dup_markers(grav2)
#' grav2_nodup <- drop_markers(grav2, unlist(dup))

find_dup_markers <-
function(cross, chr, exact_only=TRUE, adjacent_only=FALSE)
{
if(!missing(chr)) cross <- subset(cross, chr=chr)

if(!is.cross2(cross))
stop('Input cross should be a "cross2" object.')

g <- do.call("cbind", cross$geno)
markers <- colnames(g)
markerloc <- lapply(n_mar(cross), function(a) 1:a)
if(length(markerloc) > 1) {
for(j in 2:length(markerloc))
markerloc[[j]] <- markerloc[[j]] + max(markerloc[[j-1]]) + 10
}
markerloc <- unlist(markerloc)

if(exact_only) {
g[is.na(g)] <- 0

# genotype data patterns
pat <- apply(g, 2, paste, collapse="")

# table of unique values
tab <- table(pat)

# no duplicates; return
if(all(tab == 1)) return(NULL)

wh <- which(tab > 1)
theloc <- themar <- vector("list", length(wh))
for(i in seq(along=wh)) {
themar[[i]] <- names(pat)[pat==names(tab)[wh[i]]]
theloc[[i]] <- markerloc[pat==names(tab)[wh[i]]]
}

if(adjacent_only) {
extraloc <- list()
extramar <- list()
for(i in seq(along=theloc)) {
d <- diff(theloc[[i]]) # look for adjacency
if(any(d>1)) { # split into adjacent groups
temp <- which(d>1)
first <- c(1, temp+1)
last <- c(temp, length(theloc[[i]]))
for(j in 2:length(first)) {
extraloc[[length(extraloc)+1]] <- theloc[[i]][first[j]:last[j]]
extramar[[length(extramar)+1]] <- themar[[i]][first[j]:last[j]]
}
themar[[i]] <- themar[[i]][first[1]:last[1]]
theloc[[i]] <- theloc[[i]][first[1]:last[1]]
}
}
themar <- c(themar, extramar)
theloc <- c(theloc, extraloc)

nm <- sapply(themar, length)
if(all(nm==1)) return(NULL) # nothing left
themar <- themar[nm>1]
theloc <- theloc[nm>1]
}

# order by first locus
o <- order(sapply(theloc, min))
themar <- themar[o]

randompics <- sapply(themar, function(a) sample(length(a), 1))
for(i in seq(along=themar)) {
names(themar)[i] <- themar[[i]][randompics[i]]
themar[[i]] <- themar[[i]][-randompics[i]]
}

}
else {
themar <- NULL

ntyp <- n_typed(cross, "marker")
o <- order(ntyp, decreasing=TRUE)

g[is.na(g)] <- 0
result <- .find_dup_markers_notexact(g, o, markerloc, adjacent_only)

if(all(result==0)) return(NULL)
u <- unique(result[result != 0])
themar <- vector("list", length(u))
names(themar) <- colnames(g)[u]
for(i in seq(along=themar))
themar[[i]] <- colnames(g)[result==u[i]]
}

themar
}
51 changes: 51 additions & 0 deletions man/find_dup_markers.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 15 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,20 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// find_dup_markers_notexact
IntegerVector find_dup_markers_notexact(const IntegerMatrix& Geno, const IntegerVector& order, const IntegerVector& markerloc, const bool adjacent_only);
RcppExport SEXP _qtl2_find_dup_markers_notexact(SEXP GenoSEXP, SEXP orderSEXP, SEXP markerlocSEXP, SEXP adjacent_onlySEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< const IntegerMatrix& >::type Geno(GenoSEXP);
Rcpp::traits::input_parameter< const IntegerVector& >::type order(orderSEXP);
Rcpp::traits::input_parameter< const IntegerVector& >::type markerloc(markerlocSEXP);
Rcpp::traits::input_parameter< const bool >::type adjacent_only(adjacent_onlySEXP);
rcpp_result_gen = Rcpp::wrap(find_dup_markers_notexact(Geno, order, markerloc, adjacent_only));
return rcpp_result_gen;
END_RCPP
}
// find_ibd_segments
NumericMatrix find_ibd_segments(const IntegerVector& g1, const IntegerVector& g2, const NumericVector& p, const double error_prob);
RcppExport SEXP _qtl2_find_ibd_segments(SEXP g1SEXP, SEXP g2SEXP, SEXP pSEXP, SEXP error_probSEXP) {
Expand Down Expand Up @@ -2455,6 +2469,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_qtl2_mpp_geno_names", (DL_FUNC) &_qtl2_mpp_geno_names, 2},
{"_qtl2_invert_founder_index", (DL_FUNC) &_qtl2_invert_founder_index, 1},
{"_qtl2_is_phase_known", (DL_FUNC) &_qtl2_is_phase_known, 1},
{"_qtl2_find_dup_markers_notexact", (DL_FUNC) &_qtl2_find_dup_markers_notexact, 4},
{"_qtl2_find_ibd_segments", (DL_FUNC) &_qtl2_find_ibd_segments, 4},
{"_qtl2_R_find_peaks", (DL_FUNC) &_qtl2_R_find_peaks, 3},
{"_qtl2_R_find_peaks_and_lodint", (DL_FUNC) &_qtl2_R_find_peaks_and_lodint, 4},
Expand Down
51 changes: 51 additions & 0 deletions src/find_dup_markers.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// find subsets of markers with identical genotypes

#include "find_dup_markers.h"
#include <Rcpp.h>

using namespace Rcpp;

// [[Rcpp::export(".find_dup_markers_notexact")]]
IntegerVector find_dup_markers_notexact(const IntegerMatrix& Geno, // matrix of genotypes, individuals x markers
const IntegerVector& order, // vector indicating order to be considered, most data to least
const IntegerVector& markerloc, //
const bool adjacent_only) // if true, consider only adjacent markers
{
const int n_ind = Geno.rows();
const int n_mar = Geno.cols();
if(order.size() != n_mar)
throw std::invalid_argument("length(order) != ncol(Geno)");
if(markerloc.size() != n_mar)
throw std::invalid_argument("length(markerloc) != ncol(Geno)");

IntegerVector result(n_mar);
for(int i=0; i<n_mar; i++) result[i] = 0;

for(int i=0; i<n_mar-1; i++) {
int oi = order[i]-1;
for(int j=(i+1); j<n_mar; j++) {
int oj = order[j]-1;

if(result[oj] != 0 ||
(adjacent_only && abs(markerloc[oi] - markerloc[oj]) > 1)) {
/* skip */
}
else {
int flag = 0;
for(int k=0; k<n_ind; k++) {
if((Geno(k,oi)==0 && Geno(k,oj)!=0) ||
(Geno(k,oi)!=0 && Geno(k,oj)!=0 && Geno(k,oi) != Geno(k,oj))) {
flag = 1;
break;
}
}
if(!flag) { /* it worked */
if(result[oi] != 0) result[oj] = result[oi];
else result[oj] = oi+1;
}
}
}
}

return(result);
}
12 changes: 12 additions & 0 deletions src/find_dup_markers.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// find subsets of markers with identical genotypes
#ifndef FIND_DUP_MARKERS_H
#define FIND_DUP_MARKERS_H

#include <Rcpp.h>

Rcpp::IntegerVector find_dup_markers_notexact(const Rcpp::IntegerMatrix& Geno, // matrix of genotypes, individuals x markers
const Rcpp::IntegerVector& order, // vector indicating order to be considered, most data to least
const Rcpp::IntegerVector markerloc, // integer vector indicating "position"
const bool adjacent_only); // if true, consider only adjacent markers

#endif // FIND_DUP_MARKERS_H
47 changes: 47 additions & 0 deletions tests/testthat/test-find_dup_markers.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
context("find duplicate markers")

test_that("find_dup_markers matches qtl::findDupMarkers", {

library(qtl)
data(hyper)

hyper2 <- convert2cross2(hyper)

# exact only, not adjacent only
set.seed(20231110)
dup2 <- find_dup_markers(hyper2)

set.seed(20231110)
dup <- qtl::findDupMarkers(hyper)

expect_equal(dup2, dup)

# exact only, adjacent only
set.seed(20231110)
dup2 <- find_dup_markers(hyper2, adjacent_only=TRUE)

set.seed(20231110)
dup <- qtl::findDupMarkers(hyper, adjacent.only=TRUE)

expect_equal(dup2, dup)

# not exact only, not adjacent only
set.seed(20231110)
dup2 <- find_dup_markers(hyper2, exact_only=FALSE)

set.seed(20231110)
dup <- qtl::findDupMarkers(hyper, exact.only=FALSE)

expect_equal(dup2, dup)

# not exact only, adjacent only
set.seed(20231110)
dup2 <- find_dup_markers(hyper2, exact_only=FALSE, adjacent_only=TRUE)

set.seed(20231110)
dup <- qtl::findDupMarkers(hyper, exact.only=FALSE, adjacent.only=TRUE)

expect_equal(dup2, dup)


})

0 comments on commit 5076caf

Please sign in to comment.