Rdatatable · 2005m · Jan 17, 2020 · Jan 18, 2020 · Jan 18, 2020 · Jan 18, 2020
@@ -18,6 +18,7 @@ export(chmatch, "%chin%", chorder, chgroup)
 export(rbindlist)
 export(fifelse)
 export(fcase)
+export(topn)
 export(fread)
 export(fwrite)
 export(foverlaps)

@@ -81,6 +81,33 @@ unit = "s")
 
 14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR.
 
+15. New function `topn`, implemented in C by Morgan Jacob, [#3804](https://github.com/Rdatatable/data.table/issues/3804). It returns the top largest or smallest `n` values for a given numeric vector `vec`. Please see `?topn` for more details. Similar to `dplyr::top_n`.
+
+```R
+set.seed(123)
+x = rnorm(5e7) # 382 MB
+microbenchmark::microbenchmark(
+  topn(x, 6L),
+  order(x)[1:6], 
+  times = 10L
+)
+# Unit: seconds
+#          expr    min    lq  mean  median    uq   max neval
+# topn(x, 6L)     0.19  0.19  0.20    0.20  0.20  0.22    10
+# order(x)[1:6]   4.56  4.60  4.65    4.62  4.70  4.77    10
+
+microbenchmark::microbenchmark(
+  x[topn(x, 6L)],
+  sort(x, partial = 1:6)[1:6],
+  times = 10L,
+  unit = "s"
+)
+# Unit: seconds
+#                        expr   min    lq  mean  median    uq   max neval
+# x[topn(x, 6L)]               0.19  0.20  0.20    0.20  0.20  0.21    10
+# sort(x, partial = 1:6)[1:6]  1.20  1.22  1.23    1.24  1.25  1.27    10
+```
+
 ## BUG FIXES
 
 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085).

@@ -7,6 +7,7 @@ setcoalesce = function(...) .Call(Ccoalesce, list(...), TRUE)
 
 fifelse = function(test, yes, no, na=NA) .Call(CfifelseR, test, yes, no, na)
 fcase   = function(..., default=NA) .Call(CfcaseR, default, parent.frame(), as.list(substitute(list(...)))[-1L])
+topn    = function(vec, n=6L, decreasing=FALSE) .Call(CtopnR, vec, n, decreasing)
 
 colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, check_dups)
 coerceFill = function(x) .Call(CcoerceFillR, x)

@@ -16846,3 +16846,87 @@ A = data.table(A=c(complex(real = 1:3, imaginary=c(0, -1, 1)), NaN))
 test(2138.3, rbind(A,B), data.table(A=c(as.character(A$A), B$A)))
 A = data.table(A=as.complex(rep(NA, 5)))
 test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A)))
+
+
+# topn, #3804
+x0 = c(3L, 2L, 10L, NA_integer_, 1L, 1L, NA_integer_,  NA_integer_, 10L, 20L, 20L, 20L, 30L)
+x1 = as.numeric(x0)
+x2 = c(NA_integer_, NA_integer_, NA_integer_)
+x3 = as.numeric(x2)
+x4 = as.raw(c(1,2,3))
+
+class2134 = setClass("class2134", slots=list(x="numeric"))
+s1 = class2134(x=20191231)
+
+test(2139.001, topn(x0, 1L), order(x0)[1:1])
+test(2139.002, topn(x0, 2L), order(x0)[1:2])
+test(2139.003, topn(x0, 3L), order(x0)[1:3])
+test(2139.004, topn(x0, 4L), order(x0)[1:4])
+test(2139.005, topn(x0, 5L), order(x0)[1:5])
+test(2139.006, topn(x0, 6L), order(x0)[1:6])
+test(2139.007, topn(x0, 7L), order(x0)[1:7])
+test(2139.008, topn(x0, 8L), order(x0)[1:8])
+test(2139.009, topn(x0, 9L), order(x0)[1:9])
+test(2139.010, topn(x0, 10L), order(x0)[1:10])
+test(2139.011, topn(x0, 11L), order(x0)[1:11])
+test(2139.012, topn(x0, 12L), order(x0)[1:12])
+test(2139.013, topn(x0, 13L), order(x0)[1:13])
+test(2139.014, topn(x1, 1L), order(x1)[1:1])
+test(2139.015, topn(x1, 2L), order(x1)[1:2])
+test(2139.016, topn(x1, 3L), order(x1)[1:3])
+test(2139.017, topn(x1, 4L), order(x1)[1:4])
+test(2139.018, topn(x1, 5L), order(x1)[1:5])
+test(2139.019, topn(x1, 6L), order(x1)[1:6])
+test(2139.020, topn(x1, 7L), order(x1)[1:7])
+test(2139.021, topn(x1, 8L), order(x1)[1:8])
+test(2139.022, topn(x1, 9L), order(x1)[1:9])
+test(2139.023, topn(x1, 10L), order(x1)[1:10])
+test(2139.024, topn(x1, 11L), order(x1)[1:11])
+test(2139.025, topn(x1, 12L), order(x1)[1:12])
+test(2139.026, topn(x1, 13L), order(x1)[1:13])
+test(2139.027, topn(x2, 1L), order(x2)[1:1])
+test(2139.028, topn(x2, 2L), order(x2)[1:2])
+test(2139.029, topn(x2, 3L), order(x2)[1:3])
+test(2139.030, topn(x3, 1L), order(x3)[1:1])
+test(2139.031, topn(x3, 2L), order(x3)[1:2])
+test(2139.032, topn(x3, 3L), order(x3)[1:3])
+test(2139.033, topn(x0, 1L, decreasing=TRUE), order(x0, decreasing=TRUE)[1:1])
+test(2139.034, topn(x0, 2L, decreasing=TRUE), order(x0, decreasing=TRUE)[1:2])
+test(2139.035, topn(x0, 3L, decreasing=TRUE), order(x0, decreasing=TRUE)[1:3])
+test(2139.036, topn(x0, 4L, decreasing=TRUE), order(x0, decreasing=TRUE)[1:4])
+test(2139.037, topn(x0, 5L, decreasing=TRUE), order(x0, decreasing=TRUE)[1:5])
+test(2139.038, topn(x0, 6L, decreasing=TRUE), order(x0, decreasing=TRUE)[1:6])
+test(2139.039, topn(x0, 7L, decreasing=TRUE), order(x0, decreasing=TRUE)[1:7])
+test(2139.040, topn(x0, 8L, decreasing=TRUE), order(x0, decreasing=TRUE)[1:8])
+test(2139.041, topn(x0, 9L, decreasing=TRUE), order(x0, decreasing=TRUE)[1:9])
+test(2139.042, topn(x0, 10L, decreasing=TRUE), order(x0, decreasing=TRUE)[1:10])
+test(2139.043, topn(x0, 11L, decreasing=TRUE), order(x0, decreasing=TRUE)[1:11])
+test(2139.044, topn(x0, 12L, decreasing=TRUE), order(x0, decreasing=TRUE)[1:12])
+test(2139.045, topn(x0, 13L, decreasing=TRUE), order(x0, decreasing=TRUE)[1:13])
+test(2139.046, topn(x1, 1L, decreasing=TRUE), order(x1, decreasing=TRUE)[1:1])
+test(2139.047, topn(x1, 2L, decreasing=TRUE), order(x1, decreasing=TRUE)[1:2])
+test(2139.048, topn(x1, 3L, decreasing=TRUE), order(x1, decreasing=TRUE)[1:3])
+test(2139.049, topn(x1, 4L, decreasing=TRUE), order(x1, decreasing=TRUE)[1:4])
+test(2139.050, topn(x1, 5L, decreasing=TRUE), order(x1, decreasing=TRUE)[1:5])
+test(2139.051, topn(x1, 6L, decreasing=TRUE), order(x1, decreasing=TRUE)[1:6])
+test(2139.052, topn(x1, 7L, decreasing=TRUE), order(x1, decreasing=TRUE)[1:7])
+test(2139.053, topn(x1, 8L, decreasing=TRUE), order(x1, decreasing=TRUE)[1:8])
+test(2139.054, topn(x1, 9L, decreasing=TRUE), order(x1, decreasing=TRUE)[1:9])
+test(2139.055, topn(x1, 10L, decreasing=TRUE), order(x1, decreasing=TRUE)[1:10])
+test(2139.056, topn(x1, 11L, decreasing=TRUE), order(x1, decreasing=TRUE)[1:11])
+test(2139.057, topn(x1, 12L, decreasing=TRUE), order(x1, decreasing=TRUE)[1:12])
+test(2139.058, topn(x1, 13L, decreasing=TRUE), order(x1, decreasing=TRUE)[1:13])
+test(2139.060, topn(x2, 1L, decreasing=TRUE), order(x2, decreasing=TRUE)[1:1])
+test(2139.061, topn(x2, 2L, decreasing=TRUE), order(x2, decreasing=TRUE)[1:2])
+test(2139.062, topn(x2, 3L, decreasing=TRUE), order(x2, decreasing=TRUE)[1:3])
+test(2139.063, topn(x3, 1L, decreasing=TRUE), order(x3, decreasing=TRUE)[1:1])
+test(2139.064, topn(x3, 2L, decreasing=TRUE), order(x3, decreasing=TRUE)[1:2])
+test(2139.065, topn(x3, 3L, decreasing=TRUE), order(x3, decreasing=TRUE)[1:3])
+test(2139.066, topn(x0, -1L), error = "Please enter a positive integer larger or equal to 1.")
+test(2139.067, topn(x0, 1001L), error = "Function 'topn' is not built for large value of 'n'. The algorithm is made for small values. Please prefer the 'order' function if you want to proceed with such large value.")
+test(2139.068, topn(x0, 100L), order(x0)[1:13], warning = "'n' is larger than length of 'vec'. 'n' will be set to length of 'vec'.")
+test(2139.069, topn(x0, 10L, decreasing = NA), error = "Argument 'decreasing' must be TRUE or FALSE and length 1.")
+test(2139.070, topn(s1, 10L, decreasing = NA), error = "S4 class objects (excluding nanotime) are not supported.")
+test(2139.071, topn(x4, 2L), error = "Type raw is not supported.")
+test(2139.072, topn(x4, 2L, decreasing = TRUE), error = "Type raw is not supported.")
+rm(s1, class2134)
@@ -0,0 +1,35 @@
+\name{topn}
+\alias{topn}
+\title{ Top N values index}
+\description{
+  \code{topn} is used to get the indices of the few values of an input. This is an extension of \code{\link{which.max}}/\code{\link{which.min}} which provide \emph{only} the first such index. 
+
+  The output is the same as \code{order(vec)[1:n]}, but internally optimized not to sort the irrelevant elements of the input (and therefore much faster, for small \code{n} relative to input size).
+}
+\usage{
+  topn(vec, n=6L, decreasing=FALSE)
+}
+\arguments{
+  \item{vec}{ A numeric vector of type numeric or integer. Other types are not supported yet. }
+  \item{n}{ A positive integer value greater or equal to 1. Maximum value is 1000. }
+  \item{decreasing}{ A logical value (default \code{FALSE}) to indicate whether to sort \code{vec} in decreasing or increasing order. Equivalent to argument \code{decreasing} in function \code{base::order}. }
+}
+\value{
+  \code{integer} vector of indices of the most extreme (according to \code{decreasing}) \code{n} values in vector \code{vec}.
+}
+\examples{
+x = rnorm(1e6)
+
+# Example 1: index of top 6 negative values 
+topn(x, 6L)
+order(x)[1:6]
+
+# Example 2: index of top 6 positive values
+topn(x, 6L, decreasing = TRUE)
+order(x, decreasing = TRUE)[1:6]
+
+# Example 3: top 6 negative values
+x[topn(x, 6L)]
+sort(x)[1:6]
+}
+\keyword{ data }
@@ -245,3 +245,4 @@ SEXP testMsgR(SEXP status, SEXP x, SEXP k);
 //fifelse.c
 SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na);
 SEXP fcaseR(SEXP na, SEXP rho, SEXP args);
+SEXP topnR(SEXP vec, SEXP n, SEXP dec);
@@ -344,3 +344,194 @@ SEXP fcaseR(SEXP na, SEXP rho, SEXP args) {
   UNPROTECT(nprotect);
   return ans;
 }
+
+SEXP topnR(SEXP vec, SEXP n, SEXP dec) {
+  int nprotect = 0;
+  int64_t i, j, idx = 0;
+  int len0 = asInteger(n);
+  const int64_t len1 = xlength(vec);
+
+  if (isS4(vec) && !INHERITS(vec, char_nanotime)) {
+    error(_("S4 class objects (excluding nanotime) are not supported."));
+  }
+  if (len0 > 1000) {
+    error(_("Function 'topn' is not built for large value of 'n'. The algorithm is made for small values. Please prefer the 'order' function if you want to proceed with such large value."));
+  }
+  if (len0 > len1) {
+    warning(_("'n' is larger than length of 'vec'. 'n' will be set to length of 'vec'."));
+    len0 = len1;
+  }
+  if (len0 < 1) {
+    error(_("Please enter a positive integer larger or equal to 1."));
+  }
+  if (!IS_TRUE_OR_FALSE(dec)) {
+    error(_("Argument 'decreasing' must be TRUE or FALSE and length 1."));
+  }
+
+  const bool vdec = LOGICAL(dec)[0];
+  SEXPTYPE tvec = TYPEOF(vec);
+  SEXP ans = PROTECT(allocVector(INTSXP, len0)); nprotect++;
+  int *restrict pans = INTEGER(ans);
+  int tmp;
+
+  if (vdec) {
+    switch(tvec) {
+    case INTSXP: {
+      const int *restrict pvec = INTEGER(vec);
+      int min_value = pvec[0];
+      for (i = 0; i < len0; ++i) {
+        pans[i] = i;
+        if (pvec[i] <= min_value || pvec[i] == NA_INTEGER) {
+          min_value = pvec[i];
+          idx = i;
+        }
+      }
+      for (i = len0; i < len1; ++i) {
+        if (pvec[i] == NA_INTEGER) {
+          continue;
+        }
+        if (pvec[i] > min_value) {
+          min_value = pvec[i];
+          pans[idx] = i;
+          for (j = 0; j <len0; ++j) {
+            if ((min_value > pvec[pans[j]] || (min_value == pvec[pans[j]] && pans[idx] < pans[j])) || pvec[pans[j]] == NA_INTEGER) {
+              min_value = pvec[pans[j]];
+              idx = j;
+            }
+          }
+        }
+      }
+      for (i = 0; i < len0; ++i) {
+        tmp = pans[i];
+        for (j = i; j > 0 && (pvec[tmp] > pvec[pans[j-1]] || (pvec[tmp] == pvec[pans[j-1]] && tmp < pans[j-1])); --j) {
+          pans[j] = pans[j-1];
+        }
+        pans[j] = tmp;
+      }
+      for (i =0; i < len0; ++i) {
+        pans[i]++;
+      }
+    } break;
+    case REALSXP: {
+      const double *restrict pvec = REAL(vec);
+      double min_value = pvec[0];
+      for (i = 0; i < len0; ++i) {
+        pans[i] = i;
+        if (pvec[i] <= min_value || ISNAN(pvec[i])) {
+          min_value = pvec[i];
+          idx = i;
+        }
+      }
+      for (i = len0; i < len1; ++i) {
+        if (ISNAN(pvec[i])) {
+          continue;
+        }
+        if (pvec[i] > min_value || ISNAN(min_value)) {
+          min_value = pvec[i];
+          pans[idx] = i;
+          for (j = 0; j <len0; ++j) {
+            if ((min_value > pvec[pans[j]] || (min_value == pvec[pans[j]] && pans[idx] < pans[j])) || ISNAN(pvec[pans[j]])) {
+              min_value = pvec[pans[j]];
+              idx = j;
+            }
+          }
+        }
+      }
+      for (i = 0; i < len0; ++i) {
+        tmp = pans[i];
+        for (j = i; j > 0 && (pvec[tmp] > pvec[pans[j-1]] || (pvec[tmp] == pvec[pans[j-1]] && tmp < pans[j-1]) || (!ISNAN(pvec[tmp]) && ISNAN(pvec[pans[j-1]]))); --j) {
+          pans[j] = pans[j-1];
+        }
+        pans[j] = tmp;
+      }
+      for (i =0; i < len0; ++i) {
+        pans[i]++;
+      }
+    } break;
+    default:
+      error(_("Type %s is not supported."), type2char(tvec));
+    }
+  } else {
+    switch(tvec) {
+    case INTSXP: {
+      const int *restrict pvec = INTEGER(vec);
+      int min_value = pvec[0];
+      for (i = 0; i < len0; ++i) {
+        pans[i] = i;
+        if ((pvec[i] >= min_value && min_value != NA_INTEGER) || pvec[i] == NA_INTEGER) {
+          min_value = pvec[i];
+          idx = i;
+        }
+      }
+      for (i = len0; i < len1; ++i) {
+        if (pvec[i] == NA_INTEGER) {
+          continue;
+        }
+        if (pvec[i] < min_value || min_value == NA_INTEGER) {
+          min_value = pvec[i];
+          pans[idx] = i;
+          for (j = 0; j <len0; ++j) {
+            if (((min_value < pvec[pans[j]] || (min_value == pvec[pans[j]] && pans[idx] < pans[j])) && min_value != NA_INTEGER) || pvec[pans[j]] == NA_INTEGER) {
+              min_value = pvec[pans[j]];
+              idx = j;
+            }
+          }
+        }
+      }
+      for (i = 0; i < len0; ++i) {
+        tmp = pans[i];
+        if (pvec[tmp] == NA_INTEGER) {
+          continue;
+        }
+        for (j = i; j > 0 && (pvec[tmp] < pvec[pans[j-1]] || (pvec[tmp] == pvec[pans[j-1]] && tmp < pans[j-1]) || pvec[pans[j-1]] == NA_INTEGER); --j) {
+          pans[j] = pans[j-1];
+        }
+        pans[j] = tmp;
+      }
+      for (i =0; i < len0; ++i) {
+        pans[i]++;
+      }
+    } break;
+    case REALSXP: {
+      const double *restrict pvec = REAL(vec);
+      double min_value = pvec[0];
+      for (i = 0; i < len0; ++i) {
+        pans[i] = i;
+        if (pvec[i] >= min_value || ISNAN(pvec[i])) {
+          min_value = pvec[i];
+          idx = i;
+        }
+      }
+      for (i = len0; i < len1; ++i) {
+        if (ISNAN(pvec[i])) {
+          continue;
+        }
+        if (pvec[i] < min_value || ISNAN(min_value)) {
+          min_value = pvec[i];
+          pans[idx] = i;
+          for (j = 0; j <len0; ++j) {
+            if ((min_value < pvec[pans[j]] || (min_value == pvec[pans[j]] && pans[idx] < pans[j])) || ISNAN(pvec[pans[j]])) {
+              min_value = pvec[pans[j]];
+              idx = j;
+            }
+          }
+        }
+      }
+      for (i = 0; i < len0; ++i) {
+        tmp = pans[i];
+        for (j = i; j > 0 && (pvec[tmp] < pvec[pans[j-1]] || (pvec[tmp] == pvec[pans[j-1]] && tmp < pans[j-1]) || (!ISNAN(pvec[tmp]) && ISNAN(pvec[pans[j-1]]))); --j) {
+          pans[j] = pans[j-1];
+        }
+        pans[j] = tmp;
+      }
+      for (i =0; i < len0; ++i) {
+        pans[i]++;
+      }
+    } break;
+    default:
+      error(_("Type %s is not supported."), type2char(tvec));
+    }
+  }
+  UNPROTECT(nprotect);
+  return ans;
+}
@@ -53,6 +53,7 @@ SEXP chmatchdup_R();
 SEXP chin_R();
 SEXP fifelseR();
 SEXP fcaseR();
+SEXP topnR();
 SEXP freadR();
 SEXP fwriteR();
 SEXP reorder();
@@ -205,6 +206,7 @@ R_CallMethodDef callMethods[] = {
 {"Ccoalesce", (DL_FUNC) &coalesce, -1},
 {"CfifelseR", (DL_FUNC) &fifelseR, -1},
 {"CfcaseR", (DL_FUNC) &fcaseR, -1},
+{"CtopnR", (DL_FUNC) &topnR, -1},
 {"C_lock", (DL_FUNC) &lock, -1},  // _ for these 3 to avoid Clock as in time
 {"C_unlock", (DL_FUNC) &unlock, -1},
 {"C_islocked", (DL_FUNC) &islockedR, -1},