Skip to content

Commit

Permalink
Merge pull request #96 from MariaAVC/Comparing_rowNames
Browse files Browse the repository at this point in the history
Matching row names
  • Loading branch information
svteichman authored Nov 27, 2024
2 parents a7de668 + a7c3aeb commit 49b50dc
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 6 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
Depends:
MASS,
Matrix,
Expand Down
51 changes: 46 additions & 5 deletions R/emuFit.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
#' @param use_both_cov logical: should score tests be run using information and
#' empirical score covariance evaluated both under the null and full models?
#' Used in simulations
#' @param match_row_names logical: Make sure rows on covariate data and response data correspond to
#' the same sample by comparing row names and subsetting/reordering if necessary.
#' @param constraint_fn function g defining a constraint on rows of B; g(B_k) = 0
#' for rows k = 1, ..., p of B. Default function is a smoothed median (minimizer of
#' pseudohuber loss). If a number is provided a single category constraint will be used
Expand Down Expand Up @@ -139,6 +141,7 @@ emuFit <- function(Y,
use_fullmodel_info = FALSE,
use_fullmodel_cov = FALSE,
use_both_cov = FALSE,
match_row_names = TRUE,
constraint_fn = pseudohuber_center,
constraint_grad_fn = dpseudohuber_center_dx,
constraint_param = 0.1,
Expand Down Expand Up @@ -219,11 +222,49 @@ covariates in formula must be provided.")
}
}

# check that if X and Y have rownames, they match
if (!is.null(rownames(Y)) & !is.null(rownames(X))) {
if (all.equal(rownames(Y), rownames(X)) != TRUE) {
message("There is a different row ordering between covariate data and response data. Covariate data will be reordered to match response data.")
X <- X[rownames(Y), ]
# check that if X and Y match in the row names
if (is.null(rownames(X)) || is.null(rownames(Y))){
if (nrow(X) == nrow(Y)){
if(match_row_names){
if(is.null(rownames(X))){
message("Row names are missing from the covariate matrix X. We will assume the rows are in the same order as in the response matrix Y. You are responsible for ensuring the order of your observations is the same in both matrices.")
} else {
message("Row names are missing from the response matrix Y. We will assume the rows are in the same order as in the covariate matrix X. You are responsible for ensuring the order of your observations is the same in both matrices.")
}
}
} else {
if(is.null(rownames(X))){
stop("Row names are missing from the covariate matrix X, and the number of rows does not match the number of rows in the response matrix Y. Please resolve this issue before refitting the model.")
} else {
stop("Row names are missing from the response matrix Y, and the number of rows does not match the number of rows in the covariate matrix X. Please resolve this issue before refitting the model.")
}
}
} else{
if(match_row_names){
names_X <- rownames(X)
names_Y <- rownames(Y)

#Checking if any row names are duplicated
if (any(duplicated(names_X))) stop("Covariate matrix X has duplicated row names. Please ensure all row names are unique before refitting the model.")
if (any(duplicated(names_Y))) stop("Response matrix Y has duplicated row names. Please ensure all row names are unique before refitting the model.")

# Find common row names
common_names <- intersect(names_X, names_Y)

if (length(common_names) < length(names_X) || length(common_names) < length(names_Y)) {
warning(sprintf("According to the rownames, there are observations that are missing either in the covariate matrix (X) and/or the response matrix (Y). We will subset to common rows only, resulting in %d samples.", length(common_names)))

X <- X[common_names, , drop = FALSE]
Y <- Y[common_names, , drop = FALSE]

} else if(all.equal(rownames(Y), rownames(X)) != TRUE){
message("There is a different row ordering between the covariate matrix (X) and the response matrix (Y). Covariate data will be reordered to match response data.")
X <- X[rownames(Y), , drop = FALSE]
}
} else {
if(nrow(X) != nrow(Y)){
stop("The number of rows does not match between the covariate matrix (X) and the response matrix (Y), and subsetting/matching by row name has been disabled. Please resolve this issue before refitting the model.")
}
}
}

Expand Down
4 changes: 4 additions & 0 deletions man/emuFit.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions tests/testthat/test-emuFit.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ for(i in 1:n){
}
}

#To ensure the messages about lack of row names do not show in the tests
rownames(X) <- paste0("Sample_",1:12)
rownames(Y) <- paste0("Sample_",1:12)

# Y <- structure(c(534337, 0, 0, 0, 376, 41, 19, 103, 0, 0, 85, 0, 42794,
# 0, 0, 0, 95, 0, 0, 15, 0, 0, 0, 26, 0, 149, 0, 0, 0, 0, 0, 211,
# 0, 0, 0, 0, 0, 103, 0, 0, 0, 1372, 83, 337, 0, 0, 0, 0, 0, 53,
Expand Down
75 changes: 75 additions & 0 deletions tests/testthat/test-row_name_matching.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
dat <- data.frame(cov1 = rep(c("A", "B", "C"), each = 6),
cov2 = rep(c("D", "E"), each = 9),
cov3 = rnorm(18),
cov4 = rnorm(18),
cov5 <- rep(c("G", "H", "I"), 6))

form <- ~ cov1 + cov2 + cov3 + cov4 + cov5
X.based <- model.matrix(form, dat)

Y <- matrix(rpois(18*6, 3), nrow = 18, ncol = 6)
colnames(Y) <- paste0("category_", 1:ncol(Y))
rownames(Y) <- paste0("sample_", 1:nrow(Y))

test_that("emuFit handles missing row names", {
X1 <- matrix(X.based, nrow = 18) # No row names

expect_message(emuFit(Y = Y, X = X1),
"Row names are missing from the covariate matrix X. We will assume the rows are in the same order as in the response matrix Y. You are responsible for ensuring the order of your observations is the same in both matrices.")
})

test_that("emuFit throws error on duplicate row names", {
X2 <- X.based
rownames(X2) <- rownames(Y)
rownames(X2)[5] <- "sample_4" #Repeating one of the sample labels

expect_error(emuFit(Y = Y, X = X2),
"Covariate matrix X has duplicated row names. Please ensure all row names are unique before refitting the model.")
})

test_that("emuFit subsets to common row names with warning", {
X3 <- X.based
rownames(X3) <- rownames(Y)
X3 <- X3[c(1:4,7:14,16:18), , drop = FALSE]
Y3 <- Y[c(1:2,5:18), , drop = FALSE]

expect_warning(emuFit(Y = Y3, X = X3),
regexp = "According to the rownames, there are observations that are missing either in the covariate matrix \\(X\\) and/or the response matrix \\(Y\\)\\. We will subset to common rows only, resulting in [0-9]+ samples\\.")
})

test_that("emuFit reorders rows of X when", {
X4 <- X.based
rownames(X4) <- rownames(Y)
X4.p <- X4[c(1,(nrow(Y):2)), , drop = FALSE]

expect_message(model.p <- emuFit(Y = Y, X = X4.p),
"There is a different row ordering between the covariate matrix \\(X\\) and the response matrix \\(Y\\)\\. Covariate data will be reordered to match response data\\.")
model.o <- emuFit(Y = Y, X = X4)
expect_equal(model.o$coef$estimate, model.p$coef$estimate)
})

test_that("emuFit does not reorder rows of X when match_row_names is FALSE", {
X5 <- X.based
X5.p <- X.based
rownames(X5) <- rownames(Y)
rownames(X5.p) <- rownames(Y)[c(1,nrow(Y):2)]

model.o <- emuFit(Y = Y, X = X5)
model.p <- emuFit(Y = Y, X = X5.p, match_row_names = FALSE)

expect_silent(model.o <- emuFit(Y = Y, X = X5))
expect_silent(model.p <- emuFit(Y = Y, X = X5.p, match_row_names = FALSE))

expect_equal(model.o$coef$estimate, model.p$coef$estimate)
})

test_that("emuFit stops when match_row_names is FALSE, but nrow does not coincide",{

X6 <- X.based
rownames(X6) <- rownames(Y)
X6 <- X6[c(1,(nrow(Y):2)),]
X6 <- X6[(1:16), , drop = FALSE]

expect_error(emuFit(Y = Y, X = X6, match_row_names = FALSE),
"The number of rows does not match between the covariate matrix \\(X\\) and the response matrix \\(Y\\), and subsetting/matching by row name has been disabled\\. Please resolve this issue before refitting the model\\.")
})

0 comments on commit 49b50dc

Please sign in to comment.