new option "datatable.use.index"=TRUE to control usage of indices, closes #1422
Rdatatable · Apr 20, 2016 · f961f7b · f961f7b
1 parent c1b0972
commit f961f7b
Show file tree

Hide file tree

Showing 6 changed files with 67 additions and 10 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -237,6 +237,8 @@
 
   25. `data.table`'s dependency has been moved forward from R 2.14.1 to R 2.15.0 (Mar 2012; i.e. 4 years old). We keep this dependency as old as possible for as long as possible as requested by users in managed environments. This bump allows `data.table` to use `paste0()` internally and in tests for the first time. Before release to CRAN [our procedures](https://github.com/Rdatatable/data.table/blob/master/CRAN_Release.cmd) include running the test suite using this stated dependency.
 
+  26. New option `options(datatable.use.index = TRUE)` (default) gives better control over the usage of indices. When combined with `options(datatable.auto.index = FALSE)`, it allows using only indices created manually with `setindex` or `setindexv`. Closes [#1422](https://github.com/Rdatatable/data.table/issues/1422).
+
 ### Changes in v1.9.6  (on CRAN 19 Sep 2015)
 
 #### NEW FEATURES

diff --git a/R/data.table.R b/R/data.table.R
@@ -505,11 +505,13 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
             assign("x", x, order_env)
             i = eval(isub, order_env, parent.frame())             # for optimisation of 'order' to 'forder'
             # that forder returns integer(0) is taken care of internally within forder
-          } else if (is.call(isub) && getOption("datatable.auto.index") &&
-                   as.character(isub[[1L]]) %chin% c("==","%in%") &&
-                   is.name(isub[[2L]]) &&
-                   (isub2<-as.character(isub[[2L]])) %chin% names(x) && 
-                   is.null(attr(x, '.data.table.locked'))) {  # fix for #958, don't create auto index on '.SD'.
+          } else if (is.call(isub) &&
+                     getOption("datatable.use.index") && # #1422
+                     as.character(isub[[1L]]) %chin% c("==","%in%") &&
+                     is.name(isub[[2L]]) &&
+                     (isub2<-as.character(isub[[2L]])) %chin% names(x) &&
+                     (getOption("datatable.auto.index") || (isub2 %chin% indices(x))) && # `||` used to either auto.index or already have index #1422
+                     is.null(attr(x, '.data.table.locked'))) {  # fix for #958, don't create auto index on '.SD'.
             # LHS is a column name symbol
             # simplest case for now (single ==).  Later, top level may be &,|,< or >
             # TO DO: print method could print physical and secondary keys at end.
@@ -545,6 +547,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
                     xo = get2key(x,isub2)  # Can't be any index with that col as the first one because those indexes will reorder within each group
                     if (is.null(xo)) {   # integer() would be valid and signifies o=1:.N
                         if (verbose) {cat("Creating new index '",isub2,"'\n",sep="");flush.console()}
+                        if (identical(getOption("datatable.auto.index"), FALSE)) warning("Index is being created on '",isub2,"' besides the fact that option 'datatable.auto.index' is FALSE. Please report to data.table#1422.") # why not double check that, even if won't happen now may be a good check for future changes
                         setindexv(x,isub2)
                         xo = get2key(x,isub2)
                     } else {
@@ -653,11 +656,13 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
                     } else nqgrp = integer(0)
                 }
                 if (nqmaxgrp == 1L) { # equi join. Reuse secondary index, #1439
-                    if (verbose) cat("Looking for existing (secondary) index... ")
-                    xo = attr(attr(x, 'index'), paste("__", names(x)[rightcols], sep="", collapse=""))
+                    xo = if (isTRUE(getOption("datatable.use.index"))) {
+                        if (verbose) cat("Looking for existing (secondary) index... ")
+                        attr(attr(x, 'index'), paste("__", names(x)[rightcols], sep="", collapse=""))
+                    }
                     if (is.null(xo)) {
                         if (verbose) {
-                            cat("not found.\n")
+                            if (isTRUE(getOption("datatable.use.index"))) cat("not found.\n")
                             tt = system.time(xo <- forderv(x, by=rightcols))
                             cat("forder took", tt["user.self"] + tt["sys.self"], "sec\n")
                         } else xo = forderv(x, by = rightcols)

diff --git a/R/onLoad.R b/R/onLoad.R
@@ -41,6 +41,7 @@
              "datatable.integer64"="'integer64'",    # datatable.<argument name>    integer64|double|character
              "datatable.showProgress"="1L",          # in fread
              "datatable.auto.index"="TRUE",          # DT[col=="val"] to auto add index so 2nd time faster
+             "datatable.use.index"="TRUE",           # global switch to address #1422
              "datatable.fread.datatable"="TRUE",
              "datatable.old.bywithoutby"="FALSE",    # temp rollback method for code migration, will be removed in future
              "datatable.fread.dec.experiment"="TRUE", # temp.  will remove once stable

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -8745,6 +8745,53 @@ test(1665.1, rbindlist(list(dt1,dt2)), setDT(rbind(as.data.frame(dt1), as.data.f
 # print method now works (when rows > 100 it uses rbind/rbindlist internally)
 test(1665.2, ans <- capture.output(dt2), ans) # just checking that it doesn't error, really.
 
+# Use existing index even when auto index is disabled #1422
+d = data.table(k=3:1) # subset - no index
+options("datatable.use.index"=TRUE, "datatable.auto.index"=TRUE)
+test(1666.1, d[k==1L, verbose=TRUE], d[3L], output="Creating new index 'k'")
+d = data.table(k=3:1)
+options("datatable.use.index"=TRUE, "datatable.auto.index"=FALSE)
+test(1666.2, grep("Creating new index", capture.output(d[k==1L, verbose=TRUE])), integer(0)) # do not create index
+d = data.table(k=3:1)
+options("datatable.use.index"=FALSE, "datatable.auto.index"=FALSE)
+test(1666.3, grep("Creating new index", capture.output(d[k==1L, verbose=TRUE])), integer(0))
+d = data.table(k=3:1)
+options("datatable.use.index"=FALSE, "datatable.auto.index"=TRUE)
+test(1666.4, grep("Creating new index", capture.output(d[k==1L, verbose=TRUE])), integer(0))
+d = data.table(k=3:1) # subset - index
+setindex(d, k)
+options("datatable.use.index"=TRUE, "datatable.auto.index"=TRUE)
+test(1666.5, d[k==1L, verbose=TRUE], d[3L], output="Using existing index 'k'")
+options("datatable.use.index"=TRUE, "datatable.auto.index"=FALSE)
+test(1666.6, d[k==1L, verbose=TRUE], d[3L], output="Using existing index 'k'")
+options("datatable.use.index"=FALSE, "datatable.auto.index"=FALSE)
+test(1666.7, grep("Using existing index", capture.output(d[k==1L, verbose=TRUE])), integer(0)) # not using existing index
+options("datatable.use.index"=FALSE, "datatable.auto.index"=TRUE)
+test(1666.8, grep("Using existing index", capture.output(d[k==1L, verbose=TRUE])), integer(0))
+d1 = data.table(k=3:1) # join - no index
+d2 = data.table(k=2:4)
+options("datatable.use.index"=TRUE, "datatable.auto.index"=TRUE)
+test(1666.9, d1[d2, on="k", verbose=TRUE], d1[d2, on="k"], output="not found.")
+options("datatable.use.index"=TRUE, "datatable.auto.index"=FALSE)
+test(1666.10, d1[d2, on="k", verbose=TRUE], d1[d2, on="k"], output="not found.")
+options("datatable.use.index"=FALSE, "datatable.auto.index"=FALSE)
+test(1666.11, grep("Looking for existing (secondary) index", capture.output(d1[d2, on="k", verbose=TRUE])), integer(0)) # not looking for index
+options("datatable.use.index"=FALSE, "datatable.auto.index"=TRUE)
+test(1666.12, grep("Looking for existing (secondary) index", capture.output(d1[d2, on="k", verbose=TRUE])), integer(0))
+d1 = data.table(k=3:1) # join - index
+d2 = data.table(k=2:4)
+setindex(d1, k)
+options("datatable.use.index"=TRUE, "datatable.auto.index"=TRUE)
+test(1666.13, d1[d2, on="k", verbose=TRUE], d1[d2, on="k"], output="found. Reusing index.")
+options("datatable.use.index"=TRUE, "datatable.auto.index"=FALSE)
+test(1666.14, d1[d2, on="k", verbose=TRUE], d1[d2, on="k"], output="found. Reusing index.")
+options("datatable.use.index"=FALSE, "datatable.auto.index"=FALSE)
+test(1666.15, grep("Looking for existing (secondary) index", capture.output(d1[d2, on="k", verbose=TRUE])), integer(0)) # not looking for index
+options("datatable.use.index"=FALSE, "datatable.auto.index"=TRUE)
+test(1666.16, grep("Looking for existing (secondary) index", capture.output(d1[d2, on="k", verbose=TRUE])), integer(0))
+# reset defaults
+options("datatable.use.index"=TRUE, "datatable.auto.index"=TRUE)
+
 ##########################
 
 # TODO: Tests involving GForce functions needs to be run with optimisation level 1 and 2, so that both functions are tested all the time.

diff --git a/man/datatable-optimize.Rd b/man/datatable-optimize.Rd
@@ -52,7 +52,7 @@
 
  	At the moment, expressions of the form \code{dt[col == val]} and \code{dt[col \%in\% val]} are both optimised. We plan to expand this to more operators and conditions in the future.
 
- 	Auto indexing can be switched off with the global option \code{options(datatable.auto.index = FALSE)}.
+ 	Auto indexing can be switched off with the global option \code{options(datatable.auto.index = FALSE)}. To switch off the use of existing indices as well, set the global option \code{options(datatable.use.index = FALSE)}.
 
  	\bold{Numeric rounding:} While grouping by column(s) of \code{numeric} type, e.g., \code{double}, \code{POSIXct} etc., \code{data.table} by default rounds off the last two bytes before computing groups to avoid unexpected behaviours due to limitations in representing floating point numbers. See \code{\link{setNumericRounding}} for more.
 
@@ -105,7 +105,7 @@ identical(ans1, ans2)
 options(datatable.optimize = Inf)
 
 # auto indexing
-options(datatable.auto.index=FALSE)
+options(datatable.auto.index = FALSE)
 system.time(ans1 <- dt[id == 100L]) # vector scan
 system.time(ans2 <- dt[id == 100L]) # vector scan
 system.time(dt[id %in% 100:500])    # vector scan

diff --git a/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd b/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd
@@ -319,6 +319,8 @@ system.time(dt[x %in% 1989:2012])
 
 * Auto indexing can be disabled by setting the global argument `options(datatable.auto.index = FALSE)`.
 
+* Disabling auto indexing still allows the use of indices created explicitly with `setindex` or `setindexv`. You can disable indices entirely by setting the global argument `options(datatable.use.index = FALSE)`.
+
 # 
 
 In the future, we plan to extend auto indexing to expressions involving more than one column. Also we are working on extending binary search to work with more binary operators like `<`, `<=`, `>` and `>=`. Once done, it would be straightforward to extend it to these operators as well.
-Original file line number
+Diff line change
@@ Expand Up / @@ -319,6 +319,8 @@ system.time(dt[x %in% 1989:2012]) @@
     * Auto indexing can be disabled by setting the global argument `options(datatable.auto.index = FALSE)`.
+    * Disabling auto indexing still allows the use of indices created explicitly with `setindex` or `setindexv`. You can disable indices entirely by setting the global argument `options(datatable.use.index = FALSE)`.
     #
     In the future, we plan to extend auto indexing to expressions involving more than one column. Also we are working on extending binary search to work with more binary operators like `<`, `<=`, `>` and `>=`. Once done, it would be straightforward to extend it to these operators as well.
@@ Expand Down @@