Skip to content

Commit

Permalink
unique, duplicated and uniqueN default by= to all columns. Option to …
Browse files Browse the repository at this point in the history
…restore old default (key(x)) provided. Startup message added. Closes #1284.
  • Loading branch information
mattdowle committed Sep 14, 2016
1 parent 5f3262a commit 11e6497
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 18 deletions.
29 changes: 14 additions & 15 deletions R/duplicated.R
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
# handles #1284
duplicated_warning <- function() {
message("Default value of duplicated.data.table method's 'by' argument will be changed to seq_along(x) (from key(x)) from the next release to be consistent with the default behaviour of base::unique.data.frame.")
}

duplicated.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=key(x), ...) {
duplicated.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) {
if (!cedta()) return(NextMethod("duplicated"))
if (!identical(incomparables, FALSE)) {
.NotYetUsed("incomparables != FALSE")
}
if (missing(by) && !is.null(key(x))) duplicated_warning()
if (missing(by) && isTRUE(getOption("datatable.old.unique.by.key"))) #1284
by = key(x)
if (nrow(x) == 0L || ncol(x) == 0L) return(logical(0)) # fix for bug #5582
if (is.na(fromLast) || !is.logical(fromLast)) stop("'fromLast' must be TRUE or FALSE")
query <- .duplicated.helper(x, by)
Expand All @@ -30,9 +27,10 @@ duplicated.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=key
res
}

unique.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=key(x), ...) {
unique.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) {
if (!cedta()) return(NextMethod("unique"))
if (missing(by) && !is.null(key(x))) duplicated_warning()
if (missing(by) && isTRUE(getOption("datatable.old.unique.by.key"))) #1284
by = key(x)
dups <- duplicated.data.table(x, incomparables, fromLast, by, ...)
ans <- .Call(CsubsetDT, x, which_(dups, FALSE), seq_len(ncol(x))) # more memory efficient version of which(!dups)
if (nrow(x) != nrow(ans)) setindexv(ans, NULL)[] else ans #1760
Expand All @@ -47,7 +45,7 @@ unique.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=key(x),
##
## This was dropped into a helper because initial implementation of
## unique.data.table and duplicated.data.table both needed this. However,
## unique.data.table has bene refactored to simply call duplicated.data.table
## unique.data.table has been refactored to simply call duplicated.data.table
## making the refactor unnecessary, but let's leave it here just in case
.duplicated.helper <- function(x, by) {
use.sub.cols <- !is.null(by) # && !isTRUE(by) # Fixing bug #5424
Expand Down Expand Up @@ -85,27 +83,27 @@ unique.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=key(x),
# Note that base's anyDuplicated is faster than any(duplicated(.)) (for vectors) - for data.frames it still pastes before calling duplicated
# In that sense, this anyDuplicated is *not* the same as base's - meaning it's not a different implementation
# This is just a wrapper. That being said, it should be incredibly fast on data.tables (due to data.table's fast forder)
anyDuplicated.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=key(x), ...) {
anyDuplicated.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) {
if (!cedta()) return(NextMethod("anyDuplicated"))
if (missing(by) && !is.null(key(x))) duplicated_warning()
if (missing(by) && isTRUE(getOption("datatable.old.unique.by.key"))) #1284
by = key(x)
dups <- duplicated(x, incomparables, fromLast, by, ...)
if (fromLast) idx = tail(which(dups), 1L) else idx = head(which(dups), 1L)
if (!length(idx)) idx=0L
idx
}


# simple straightforward helper function to get the number
# of groups in a vector or data.table. Here by data.table,
# we really mean `.SD` - used in a grouping operation
# TODO: optimise uniqueN further with GForce.
uniqueN <- function(x, by = if (is.data.table(x)) key(x) else NULL, na.rm=FALSE) { # na.rm, #1455
if (missing(by) && !is.null(key(x))) duplicated_warning()
uniqueN <- function(x, by = if (is.list(x)) seq_along(x) else NULL, na.rm=FALSE) { # na.rm, #1455
if (missing(by) && is.data.table(x) && isTRUE(getOption("datatable.old.unique.by.key"))) #1284
by = key(x)
if (is.null(x)) return(0L)
if (!is.atomic(x) && !is.data.frame(x))
stop("x must be an atomic vector or data.frames/data.tables")
if (is.atomic(x)) x = as_list(x)
if (is.null(by)) by = seq_along(x)
o = forderv(x, by=by, retGrp=TRUE, na.last=if (!na.rm) FALSE else NA)
starts = attr(o, 'starts')
if (!na.rm) {
Expand All @@ -116,3 +114,4 @@ uniqueN <- function(x, by = if (is.data.table(x)) key(x) else NULL, na.rm=FALSE)
sum( (if (length(o)) o[starts] else starts) != 0L)
}
}

2 changes: 2 additions & 0 deletions R/onAttach.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,7 @@
if (dev && (Sys.Date() - as.Date(d))>28) packageStartupMessage("**********\nThis development version of data.table was built more than 4 weeks ago. Please update.\n**********")
packageStartupMessage('For help type ?data.table or https://github.com/Rdatatable/data.table/wiki')
packageStartupMessage('The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way')
packageStartupMessage("By default all columns are now used by unique(), duplicated() and uniqueN() data.table methods. To restore old behaviour: options(datatable.old.unique.by.key=TRUE).")
}
}

3 changes: 2 additions & 1 deletion R/onLoad.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@
"datatable.fread.datatable"="TRUE",
"datatable.fread.dec.experiment"="TRUE", # temp. will remove once stable
"datatable.fread.dec.locale"=if (.Platform$OS.type=="unix") "'fr_FR.utf8'" else "'French_France.1252'",
"datatable.prettyprint.char" = NULL # FR #1091
"datatable.prettyprint.char" = NULL, # FR #1091
"datatable.old.unique.by.key" = "FALSE" # temp. TODO: warn 1 year, remove after 2 years
)
for (i in setdiff(names(opts),names(options()))) {
eval(parse(text=paste("options(",i,"=",opts[i],")",sep="")))
Expand Down
4 changes: 2 additions & 2 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -6228,7 +6228,7 @@ test(1502.2, dt1["a", z := 42L], dt2["a", z := 42L])
# fix for #1080
dt = data.table(col1 = c(1,2,3,2,5,3,2), col2 = c(0,9,8,9,6,5,4), key=c("col1"))
test(1503.1, uniqueN(dt, by=key(dt)), 4L) # on key columns only (explicit by=; no longer the default)
test(1503.2, uniqueN(dt, by=NULL), 6L) # on all columns
test(1503.2, uniqueN(dt), 6L) # on all columns
test(1503.3, uniqueN(dt$col1), 4L) # on just that column

# .SDcols and with=FALSE understands colstart:colend syntax
Expand Down Expand Up @@ -8984,7 +8984,7 @@ dt <- data.table(
dt[ depart_time<=8 & travel_time < 60, condition1 := TRUE]
dt[ depart_time>=16 & travel_time < 60, condition2 := TRUE]
setkey(dt, origin, destination)
res <- unique(dt[(condition1)])[unique(dt[(condition2)]),
res <- unique(dt[(condition1)],by=key(dt))[unique(dt[(condition2)], by=key(dt)),
on = c(destination = "origin", origin = "destination"),
nomatch = 0L]
test(1690.3, res[, .(points = sum(points_in_dest)), keyby = origin], data.table(origin=LETTERS[1:3], points=c(9,7,12), key="origin"))
Expand Down

0 comments on commit 11e6497

Please sign in to comment.