unique, duplicated and uniqueN default by= to all columns. Option to restore old default (key(x)) provided. Startup message added. Closes #1284.
Rdatatable · Sep 14, 2016 · 11e6497 · 11e6497
1 parent 5f3262a
commit 11e6497
Show file tree

Hide file tree

Showing 4 changed files with 20 additions and 18 deletions.
diff --git a/R/duplicated.R b/R/duplicated.R
@@ -1,14 +1,11 @@
-# handles #1284
-duplicated_warning <- function() {
-    message("Default value of duplicated.data.table method's 'by' argument will be changed to seq_along(x) (from key(x)) from the next release to be consistent with the default behaviour of base::unique.data.frame.")
-}
 
-duplicated.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=key(x), ...) {
+duplicated.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) {
     if (!cedta()) return(NextMethod("duplicated"))
     if (!identical(incomparables, FALSE)) {
         .NotYetUsed("incomparables != FALSE")
     }
-    if (missing(by) && !is.null(key(x))) duplicated_warning()
+    if (missing(by) && isTRUE(getOption("datatable.old.unique.by.key")))  #1284
+        by = key(x)
     if (nrow(x) == 0L || ncol(x) == 0L) return(logical(0)) # fix for bug #5582
     if (is.na(fromLast) || !is.logical(fromLast)) stop("'fromLast' must be TRUE or FALSE")
     query <- .duplicated.helper(x, by)
@@ -30,9 +27,10 @@ duplicated.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=key
     res
 }
 
-unique.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=key(x), ...) {
+unique.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) {
     if (!cedta()) return(NextMethod("unique"))
-    if (missing(by) && !is.null(key(x))) duplicated_warning()
+    if (missing(by) && isTRUE(getOption("datatable.old.unique.by.key")))  #1284
+        by = key(x)
     dups <- duplicated.data.table(x, incomparables, fromLast, by, ...)
     ans <- .Call(CsubsetDT, x, which_(dups, FALSE), seq_len(ncol(x))) # more memory efficient version of which(!dups)
     if (nrow(x) != nrow(ans)) setindexv(ans, NULL)[] else ans #1760
@@ -47,7 +45,7 @@ unique.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=key(x),
 ##
 ## This was dropped into a helper because initial implementation of
 ## unique.data.table and duplicated.data.table both needed this. However,
-## unique.data.table has bene refactored to simply call duplicated.data.table
+## unique.data.table has been refactored to simply call duplicated.data.table
 ## making the refactor unnecessary, but let's leave it here just in case
 .duplicated.helper <- function(x, by) {
     use.sub.cols <- !is.null(by) # && !isTRUE(by) # Fixing bug #5424
@@ -85,27 +83,27 @@ unique.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=key(x),
 # Note that base's anyDuplicated is faster than any(duplicated(.)) (for vectors) - for data.frames it still pastes before calling duplicated
 # In that sense, this anyDuplicated is *not* the same as base's - meaning it's not a different implementation
 # This is just a wrapper. That being said, it should be incredibly fast on data.tables (due to data.table's fast forder)
-anyDuplicated.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=key(x), ...) {
+anyDuplicated.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) {
     if (!cedta()) return(NextMethod("anyDuplicated"))
-    if (missing(by) && !is.null(key(x))) duplicated_warning()
+    if (missing(by) && isTRUE(getOption("datatable.old.unique.by.key")))  #1284
+        by = key(x)
     dups <- duplicated(x, incomparables, fromLast, by, ...)
     if (fromLast) idx = tail(which(dups), 1L) else idx = head(which(dups), 1L)
     if (!length(idx)) idx=0L
     idx
 }
 
-
 # simple straightforward helper function to get the number 
 # of groups in a vector or data.table. Here by data.table, 
 # we really mean `.SD` - used in a grouping operation
 # TODO: optimise uniqueN further with GForce.
-uniqueN <- function(x, by = if (is.data.table(x)) key(x) else NULL, na.rm=FALSE) { # na.rm, #1455
-    if (missing(by) && !is.null(key(x))) duplicated_warning()
+uniqueN <- function(x, by = if (is.list(x)) seq_along(x) else NULL, na.rm=FALSE) { # na.rm, #1455
+    if (missing(by) && is.data.table(x) && isTRUE(getOption("datatable.old.unique.by.key")))  #1284
+        by = key(x)
     if (is.null(x)) return(0L)
     if (!is.atomic(x) && !is.data.frame(x))
         stop("x must be an atomic vector or data.frames/data.tables")
     if (is.atomic(x)) x = as_list(x)
-    if (is.null(by)) by = seq_along(x)
     o = forderv(x, by=by, retGrp=TRUE, na.last=if (!na.rm) FALSE else NA)
     starts = attr(o, 'starts')
     if (!na.rm) {
@@ -116,3 +114,4 @@ uniqueN <- function(x, by = if (is.data.table(x)) key(x) else NULL, na.rm=FALSE)
         sum( (if (length(o)) o[starts] else starts) != 0L)
     }
 }
+
diff --git a/R/onAttach.R b/R/onAttach.R
@@ -18,5 +18,7 @@
     if (dev && (Sys.Date() - as.Date(d))>28) packageStartupMessage("**********\nThis development version of data.table was built more than 4 weeks ago. Please update.\n**********")
     packageStartupMessage('For help type ?data.table or https://github.com/Rdatatable/data.table/wiki')
     packageStartupMessage('The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way')
+    packageStartupMessage("By default all columns are now used by unique(), duplicated() and uniqueN() data.table methods. To restore old behaviour: options(datatable.old.unique.by.key=TRUE).")
   }
 }
+
diff --git a/R/onLoad.R b/R/onLoad.R
@@ -45,7 +45,8 @@
              "datatable.fread.datatable"="TRUE",
              "datatable.fread.dec.experiment"="TRUE", # temp.  will remove once stable
              "datatable.fread.dec.locale"=if (.Platform$OS.type=="unix") "'fr_FR.utf8'" else "'French_France.1252'",
-             "datatable.prettyprint.char" = NULL # FR #1091
+             "datatable.prettyprint.char" = NULL, # FR #1091
+             "datatable.old.unique.by.key" = "FALSE"  # temp. TODO: warn 1 year, remove after 2 years
              )
     for (i in setdiff(names(opts),names(options()))) {
         eval(parse(text=paste("options(",i,"=",opts[i],")",sep="")))

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -6228,7 +6228,7 @@ test(1502.2, dt1["a", z := 42L], dt2["a", z := 42L])
 # fix for #1080
 dt = data.table(col1 = c(1,2,3,2,5,3,2), col2 = c(0,9,8,9,6,5,4), key=c("col1"))
 test(1503.1, uniqueN(dt, by=key(dt)), 4L) # explicitly on key columns (no longer the default)
-test(1503.2, uniqueN(dt, by=NULL), 6L) # on all columns
+test(1503.2, uniqueN(dt), 6L) # on all columns
 test(1503.3, uniqueN(dt$col1), 4L) # on just that column
 
 # .SDcols and with=FALSE understands colstart:colend syntax
@@ -8984,7 +8984,7 @@ dt <-  data.table(
 dt[ depart_time<=8  & travel_time < 60, condition1 := TRUE]
 dt[ depart_time>=16 & travel_time < 60, condition2 := TRUE] 
 setkey(dt, origin, destination)
-res <- unique(dt[(condition1)])[unique(dt[(condition2)]), 
+res <- unique(dt[(condition1)],by=key(dt))[unique(dt[(condition2)], by=key(dt)), 
                                 on = c(destination = "origin", origin = "destination"), 
                                 nomatch = 0L]
 test(1690.3, res[, .(points = sum(points_in_dest)),  keyby = origin], data.table(origin=LETTERS[1:3], points=c(9,7,12), key="origin"))