Skip to content

Commit

Permalink
check for empty strings in gene names, fixes #149
Browse files Browse the repository at this point in the history
  • Loading branch information
assaron committed Aug 5, 2024
1 parent eea285e commit 3268ac0
Show file tree
Hide file tree
Showing 7 changed files with 63 additions and 65 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: fgsea
Title: Fast Gene Set Enrichment Analysis
Version: 1.31.2
Version: 1.31.3
Authors@R: c(person("Gennady", "Korotkevich", role = "aut"),
person("Vladimir", "Sukhov", role = "aut"),
person("Nikolay", "Budin", role = "ctb"),
Expand Down
3 changes: 3 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
CHANGES IN VERSION 1.31.3
* check for empty strings in gene names

CHANGES IN VERSION 1.31.2
* fora() provides fold enrichment scores for more effective prioritisation of results

Expand Down
36 changes: 3 additions & 33 deletions R/fgsea.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,26 +39,6 @@ fgsea <- function(pathways, stats, minSize = 1, maxSize = length(stats)-1, gseaP


preparePathwaysAndStats <- function(pathways, stats, minSize, maxSize, gseaParam, scoreType){
# Error if pathways is not a list
if (!is.list(pathways)) {
stop("pathways should be a list with each element containing names of the stats argument")
}

# Error if stats is not named
if (is.null(names(stats))) {
stop("stats should be named")
}

# Error if stats names are NA
if (any(is.na(names(stats)))) {
stop("NAs in names(stats) are not allowed")
}

# Error for duplicate gene names
if (any(duplicated(names(stats)))) {
stop("Duplicate names(stats) are not allowed")
}

# Error if stats are non-finite
if (any(!is.finite(stats))){
stop("Not all stats values are finite numbers")
Expand All @@ -81,21 +61,11 @@ preparePathwaysAndStats <- function(pathways, stats, minSize, maxSize, gseaParam
stats <- sort(stats, decreasing=TRUE)
stats <- abs(stats) ^ gseaParam

res <- preparePathways(pathways, universe=names(stats), minSize, maxSize)

minSize <- max(minSize, 1)
maxSize <- min(maxSize, length(stats)-1)
res$stats <- stats

pathwaysFiltered <- lapply(pathways, function(p) { unique(na.omit(fmatch(p, names(stats)))) })
pathwaysSizes <- sapply(pathwaysFiltered, length)

toKeep <- which(minSize <= pathwaysSizes & pathwaysSizes <= maxSize)

pathwaysFiltered <- pathwaysFiltered[toKeep]
pathwaysSizes <- pathwaysSizes[toKeep]

list(filtered=pathwaysFiltered,
sizes=pathwaysSizes,
stats=stats)
res
}


Expand Down
25 changes: 5 additions & 20 deletions R/fgseaORA.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,25 +22,12 @@
#' data(exampleRanks)
#' foraRes <- fora(examplePathways, genes=tail(names(exampleRanks), 200), universe=names(exampleRanks))
fora <- function(pathways, genes, universe, minSize=1, maxSize=length(universe)-1) {
# Error if pathways is not a list
if (!is.list(pathways)) {
stop("pathways should be a list with each element containing genes from the universe")
}

# Warning message for duplicate gene names
if (any(duplicated(universe))) {
warning("There were duplicate genes in universe, they were collapsed")
universe <- unique(universe)
}

minSize <- max(minSize, 1)
pp <- preparePathways(pathways, universe, minSize, maxSize)
pathwaysFiltered <- pp$filtered
pathwaysSizes <- pp$sizes

pathwaysFiltered <- lapply(pathways, function(p) { unique(na.omit(fmatch(p, universe))) })
pathwaysSizes <- sapply(pathwaysFiltered, length)

toKeep <- which(minSize <= pathwaysSizes & pathwaysSizes <= maxSize)

if (length(toKeep) == 0){
if (length(pathwaysFiltered) == 0){
return(data.table(pathway=character(),
pval=numeric(),
padj=numeric(),
Expand All @@ -50,13 +37,11 @@ fora <- function(pathways, genes, universe, minSize=1, maxSize=length(universe)-
overlapGenes=list()))
}

pathwaysFiltered <- pathwaysFiltered[toKeep]
pathwaysSizes <- pathwaysSizes[toKeep]


if (!all(genes %in% universe)) {
warning("Not all of the input genes belong to the universe, such genes were removed")
}

genesFiltered <- unique(na.omit(fmatch(genes, universe)))

overlaps <- lapply(pathwaysFiltered, intersect, genesFiltered)
Expand Down
13 changes: 2 additions & 11 deletions R/geseca-utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,9 @@ checkGesecaArgs <- function(E, pathways){
}

gesecaPreparePathways <- function(E, pathways, minSize, maxSize){
minSize <- max(minSize, 1)
maxSize <- min(nrow(E) - 1, maxSize)
res <- preparePathways(pathways, universe = rownames(E), minSize, maxSize)

pathwaysFiltered <- lapply(pathways, function(p) {unique(na.omit(fmatch(p, rownames(E))))})
pathwaysSizes <- sapply(pathwaysFiltered, length)

toKeep <- which(minSize <= pathwaysSizes & pathwaysSizes <= maxSize)
pathwaysFiltered <- pathwaysFiltered[toKeep]
pathwaysSizes <- pathwaysSizes[toKeep]

return(list(filtered=pathwaysFiltered,
sizes=pathwaysSizes))
return(res)
}


Expand Down
45 changes: 45 additions & 0 deletions R/util.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Validate the gene universe and restrict pathways to it.
#
# Arguments:
#   pathways  list of gene sets; each element holds gene identifiers.
#   universe  vector of gene identifiers the pathways are matched against.
#             The expression the caller supplied for it (e.g. `names(stats)`)
#             is quoted in the error messages.
#   minSize   lower bound on the matched pathway size; raised to at least 1.
#   maxSize   upper bound on the matched pathway size; capped at
#             length(universe) - 1.
#
# Returns a list with two elements, both restricted to the pathways whose
# matched size lies in [minSize, maxSize]:
#   filtered  per-pathway unique fmatch() positions in `universe`
#             (identifiers absent from `universe` are dropped);
#   sizes     the lengths of the elements of `filtered`.
#
# Raises an error if `pathways` is not a list, or if `universe` is NULL or
# contains NAs, empty strings or duplicated values.
preparePathways <- function(pathways, universe, minSize, maxSize){
    # deparse() returns several strings for a long expression; collapse them
    # into one so that each check below produces a single, readable message
    # instead of the same sentence repeated once per fragment.
    universeArg <- paste(trimws(deparse(match.call()$universe)), collapse=" ")

    # Error if pathways is not a list
    if (!is.list(pathways)) {
        stop("pathways should be a list with each element containing gene identifiers")
    }

    # Error if the universe is missing altogether (e.g. names() of an unnamed vector)
    if (is.null(universe)) {
        .stopf("%s should not be null", universeArg)
    }

    # Error if the universe contains NAs
    if (any(is.na(universe))) {
        .stopf("NAs in %s are not allowed", universeArg)
    }

    # Error if the universe contains empty strings
    # (safe to compare with == here: NAs were rejected above)
    if (any(universe == "")) {
        .stopf("Empty strings are not allowed in %s", universeArg)
    }

    # Error for duplicate gene identifiers
    if (any(duplicated(universe))) {
        .stopf("Duplicate values in %s not allowed", universeArg)
    }

    # Clamp the size limits to what is meaningful for this universe
    minSize <- max(minSize, 1)
    maxSize <- min(maxSize, length(universe)-1)

    # Map identifiers to positions in the universe, dropping unmatched ones
    pathwaysFiltered <- lapply(pathways, function(p) { unique(na.omit(fmatch(p, universe))) })
    pathwaysSizes <- lengths(pathwaysFiltered)

    toKeep <- which(minSize <= pathwaysSizes & pathwaysSizes <= maxSize)

    pathwaysFiltered <- pathwaysFiltered[toKeep]
    pathwaysSizes <- pathwaysSizes[toKeep]

    list(filtered=pathwaysFiltered,
         sizes=pathwaysSizes)
}

# Raise an error whose message is built with sprintf().
#
# Arguments:
#   fmt  sprintf() format string.
#   ...  values substituted into `fmt`.
#
# The error is signalled with call.=FALSE so that the message is not prefixed
# with the internal ".stopf(...)" call, which tells the user nothing about
# where the problem is. The message text itself is unchanged.
.stopf <- function(fmt, ...) {
    stop(sprintf(fmt, ...), call.=FALSE)
}
4 changes: 4 additions & 0 deletions tests/testthat/test_gsea_analysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,10 @@ test_that("fgseaSimple correctly checks gene names", {
names(ranks)[41] <- NA
expect_error(fgseaSimple(examplePathways, ranks, nperm=100, minSize=10, maxSize=50, nproc=1))

ranks <- exampleRanks
names(ranks)[41] <- ""
expect_error(fgseaSimple(examplePathways, ranks, nperm=100, minSize=10, maxSize=50, nproc=1))

ranks <- unname(exampleRanks)
expect_error(fgseaSimple(examplePathways, ranks, nperm=100, minSize=10, maxSize=50, nproc=1))

Expand Down

0 comments on commit 3268ac0

Please sign in to comment.