Rdatatable · mattdowle · Jul 19, 2019 · Jul 10, 2019 · Jul 10, 2019 · Jul 11, 2019
@@ -128,6 +128,8 @@
     identical(y1,y2) && identical(y1,y3)
     # TRUE
     ```
+
+19. Sorting now extended to complex vectors, [#1703](https://github.com/Rdatatable/data.table/issues/1703). Consistent with `base::order`, sorting is done lexicographically (`z1<z2` means `Re(z1) < Re(z2) | (Re(z1) == Re(z2) & Im(z1) < Im(z2))`).
 
 #### BUG FIXES
 

@@ -14,7 +14,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
   # careful to only plonk syntax (full column) on i/x from now on otherwise user's i and x would change;
   #   this is why shallow() is very importantly internal only, currently.
 
-  supported = c("logical", "integer", "double", "character", "factor", "integer64")
+  supported = c(ORDERING_TYPES, "factor", "integer64")
 
   getClass = function(x) {
     ans = typeof(x)

@@ -829,7 +829,7 @@ replace_order = function(isub, verbose, env) {
         if (!is.list(byval)) stop("'by' or 'keyby' must evaluate to a vector or a list of vectors (where 'list' includes data.table and data.frame which are lists, too)")
         if (length(byval)==1L && is.null(byval[[1L]])) bynull=TRUE #3530 when by=(function()NULL)()
         if (!bynull) for (jj in seq_len(length(byval))) {
-          if (!typeof(byval[[jj]]) %chin% c("integer","logical","character","double")) stop("column or expression ",jj," of 'by' or 'keyby' is type ",typeof(byval[[jj]]),". Do not quote column names. Usage: DT[,sum(colC),by=list(colA,month(colB))]")
+          if (!typeof(byval[[jj]]) %chin% ORDERING_TYPES) stop("column or expression ",jj," of 'by' or 'keyby' is type ",typeof(byval[[jj]]),". Do not quote column names. Usage: DT[,sum(colC),by=list(colA,month(colB))]")
         }
         tt = vapply_1i(byval,length)
         if (any(tt!=xnrow)) stop("The items in the 'by' or 'keyby' list are length (",paste(tt,collapse=","),"). Each must be length ", xnrow, "; the same length as there are rows in x (after subsetting if i is provided).")

@@ -51,14 +51,9 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU
   }
   if (identical(cols,"")) stop("cols is the empty string. Use NULL to remove the key.")
   if (!all(nzchar(cols))) stop("cols contains some blanks.")
-  if (!length(cols)) {
-    cols = colnames(x)   # All columns in the data.table, usually a few when used in this form
-  } else {
-    # remove backticks from cols
-    cols = gsub("`", "", cols, fixed = TRUE)
-    miss = !(cols %chin% colnames(x))
-    if (any(miss)) stop("some columns are not in the data.table: ", paste(cols[miss], collapse=","))
-  }
+  cols = gsub("`", "", cols, fixed = TRUE)
+  miss = !(cols %chin% colnames(x))
+  if (any(miss)) stop("some columns are not in the data.table: ", paste(cols[miss], collapse=","))
 
   ## determine, whether key is already present:
   if (identical(key(x),cols)) {
@@ -83,7 +78,7 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU
   if (".xi" %chin% names(x)) stop("x contains a column called '.xi'. Conflicts with internal use by data.table.")
   for (i in cols) {
     .xi = x[[i]]  # [[ is copy on write, otherwise checking type would be copying each column
-    if (!typeof(.xi) %chin% c("integer","logical","character","double")) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported as a key column type, currently.")
+    if (!typeof(.xi) %chin% ORDERING_TYPES) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported as a key column type, currently.")
   }
   if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov
 
@@ -181,6 +176,7 @@ is.sorted = function(x, by=seq_along(x)) {
   # Important to call forder.c::fsorted here, for consistent character ordering and numeric/integer64 twiddling.
 }
 
+ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character')  # typeof() values accepted for key, order, by= and (with factor/integer64) join columns
 forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE)
 {
   if (!(sort || retGrp)) stop("At least one of retGrp or sort must be TRUE")
@@ -208,7 +204,7 @@ forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.las
       stop("'by' is type 'double' and one or more items in it are not whole integers")
     }
     by = as.integer(by)
-    if ( (length(order) != 1L && length(order) != length(by)) || any(!order %in% c(1L, -1L)) )
+    if ( (length(order) != 1L && length(order) != length(by)) || !all(order %in% c(1L, -1L)) )
       stop("x is a list, length(order) must be either =1 or =length(by) and each value should be 1 or -1 for each column in 'by', corresponding to ascending or descending order, respectively. If length(order) == 1, it will be recycled to length(by).")
     if (length(order) == 1L) order = rep(order, length(by))
   }
@@ -330,7 +326,7 @@ setorderv = function(x, cols = colnames(x), order=1L, na.last=FALSE)
   if (".xi" %chin% colnames(x)) stop("x contains a column called '.xi'. Conflicts with internal use by data.table.")
   for (i in cols) {
     .xi = x[[i]]  # [[ is copy on write, otherwise checking type would be copying each column
-    if (!typeof(.xi) %chin% c("integer","logical","character","double")) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported for ordering currently.")
+    if (!typeof(.xi) %chin% ORDERING_TYPES) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported for ordering currently.")
   }
   if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov
 

@@ -11701,11 +11701,11 @@ test(1844.2, forder(DT,V1,V2,na.last=NA), INT(2,1,3,0,4))  # prior to v1.12.0 th
 # now with two NAs in that 2-group covers forder.c:forder line 1269 starting: else if (nalast == 0 && tmp==-2) {
 DT = data.table(c("a","a","a","b","b"),c(2,1,3,NA,NA))
 test(1844.3, forder(DT,V1,V2,na.last=NA), INT(2,1,3,0,0))
-DT = data.table((0+0i)^(-3:3), 7:1)
-test(1844.4, forder(DT,V1,V2), error="Column 1 of by= (1) is type 'complex', not yet supported")
-test(1844.5, forder(DT,V2,V1), error="Column 2 of by= (2) is type 'complex', not yet supported")
-DT = data.table((0+0i)^(-3:3), c(5L,5L,1L,2L,2L,2L,2L))
-test(1844.6, forder(DT,V2,V1), error="Column 2 of by= (2) is type 'complex', not yet supported")
+DT = data.table(as.raw(0:6), 7:1)
+test(1844.4, forder(DT,V1,V2), error="Column 1 of by= (1) is type 'raw', not yet supported")
+test(1844.5, forder(DT,V2,V1), error="Column 2 of by= (2) is type 'raw', not yet supported")
+DT = data.table(as.raw(0:6), c(5L,5L,1L,2L,2L,2L,2L))
+test(1844.6, forder(DT,V2,V1), error="Column 2 of by= (2) is type 'raw', not yet supported")
 
 # fix for non-equi joins issue #1991. Thanks to Henrik for the nice minimal example.
 d1 <- data.table(x = c(rep(c("b", "a", "c"), each = 3), c("a", "b")), y = c(rep(c(1, 3, 6), 3), 6, 6), id = 1:11)
@@ -13158,9 +13158,9 @@ setnames(DT, '.xi')
 setkey(DT, NULL)
 test(1962.037, setkey(DT, .xi),
      error = "x contains a column called '.xi'")
-DT = data.table(a = 1+3i)
+DT = data.table(a = as.raw(0))
 test(1962.038, setkey(DT, a),
-     error = "Column 'a' is type 'complex'")
+     error = "Column 'a' is type 'raw'")
 
 test(1962.039, is.sorted(3:1, by = 'x'),
      error = 'x is vector but')
@@ -13216,8 +13216,8 @@ test(1962.064, setorderv(copy(DT)),
 test(1962.065, setorderv(DT, 'c'), error = 'some columns are not in the data.table')
 setnames(DT, 1L, '.xi')
 test(1962.066, setorderv(DT, 'b'), error = "x contains a column called '.xi'")
-test(1962.067, setorderv(data.table(a = 1+3i), 'a'),
-     error = "Column 'a' is type 'complex'")
+test(1962.067, setorderv(data.table(a = as.raw(0)), 'a'),
+     error = "Column 'a' is type 'raw'")
 
 DT = data.table(
   color = c("yellow", "red", "green", "red", "green", "red",
@@ -13743,7 +13743,7 @@ test(1984.05, DT[ , sum(b), keyby = c, verbose = TRUE],
 ### hitting byval = eval(bysub, setattr(as.list(seq_along(xss)), ...)
 test(1984.06, DT[1:3, sum(a), by=b:c], data.table(b=10:8, c=1:3, V1=1:3))
 test(1984.07, DT[, sum(a), by=call('sin',pi)], error='must evaluate to a vector or a list of vectors')
-test(1984.08, DT[, sum(a), by=1+3i],           error='column or expression.*type complex')
+test(1984.08, DT[, sum(a), by=as.raw(0)],           error='column or expression.*type raw')
 test(1984.09, DT[, sum(a), by=.(1,1:2)],       error='The items.*list are length [(]1,2[)].*Each must be length 10; .*rows in x.*after subsetting')
 options('datatable.optimize' = Inf)
 test(1984.10, DT[ , 1, by = .(a %% 2), verbose = TRUE],
@@ -14755,14 +14755,14 @@ dt1 <- data.table(int = 1L:10L,
                   bool = c(rep(FALSE, 9), TRUE),
                   char = letters[1L:10L],
                   fact = factor(letters[1L:10L]),
-                  complex = as.complex(1:5))
+                  raw = as.raw(1:5))
 dt2 <- data.table(int = 1L:5L,
                   doubleInt = as.numeric(1:5),
                   realDouble = seq(0.5, 2.5, by = 0.5),
                   bool = TRUE,
                   char = letters[1L:5L],
                   fact = factor(letters[1L:5L]),
-                  complex = as.complex(1:5))
+                  raw = as.raw(1:5))
 if (test_bit64) {
   dt1[, int64 := as.integer64(c(1:9, 3e10))]
   dt2[, int64 := as.integer64(c(1:4, 3e9))]
@@ -14779,8 +14779,8 @@ test(2044.08, nrow(dt1[dt2, on="fact==fact",             verbose=TRUE]), nrow(dt
 if (test_bit64) {
   test(2044.09, nrow(dt1[dt2, on = "int64==int64",       verbose=TRUE]), nrow(dt2), output="No coercion needed")
 }
-test(2044.10, dt1[dt2, on = "int==complex"],   error = "i.complex is type complex which is not supported by data.table join")
-test(2044.11, dt1[dt2, on = "complex==int"],   error = "x.complex is type complex which is not supported by data.table join")
+test(2044.10, dt1[dt2, on = "int==raw"],   error = "i.raw is type raw which is not supported by data.table join")
+test(2044.11, dt1[dt2, on = "raw==int"],   error = "x.raw is type raw which is not supported by data.table join")
 # incompatible types
 test(2044.20, dt1[dt2, on="bool==int"],        error="Incompatible join types: x.bool (logical) and i.int (integer)")
 test(2044.21, dt1[dt2, on="bool==doubleInt"],  error="Incompatible join types: x.bool (logical) and i.doubleInt (double)")
@@ -15242,6 +15242,55 @@ ll = list(1:2, NULL, 3:4)
 test(2063.4, transpose(ll, ignore=TRUE), list(c(1L, 3L), c(2L, 4L)))
 options(old)
 
+# forderv (and downstream functions) handles complex vector input, part of #3690
+DT = data.table(
+  a = c(1L, 1L, 8L, 2L, 1L, 9L, 3L, 2L, 6L, 6L),
+  b = c(3+9i, 10+5i, 8+2i, 10+4i, 3+3i, 1+2i, 5+1i, 8+1i, 8+2i, 10+6i),
+  c = 6
+)
+test(2064.01, DT[order(a, b)], DT[base::order(a, b)])
+test(2064.02, DT[order(a, -b)], DT[base::order(a, -b)])
+test(2064.03, forderv(DT$b, order = 1L), base::order(DT$b))
+test(2064.04, forderv(DT$b, order = -1L), base::order(-DT$b))
+test(2064.05, forderv(DT, by = 2:1), forderv(DT[ , 2:1]))
+test(2064.06, forderv(DT, by = 2:1, order = c(1L, -1L)), DT[order(b, -a), which = TRUE])
+
+# downstreams of forder
+DT = data.table(
+  z = c(0, 0, 1, 1, 2, 3) + c(1, 1, 2, 2, 3, 4)*1i,
+  grp = rep(1:2, 3L),
+  v = c(3, 1, 4, 1, 5, 9)
+)
+unq_z = 0:3 + (1:4)*1i
+test(2064.07, DT[ , .N, by=z], data.table(z=unq_z, N=c(2L, 2L, 1L, 1L)))
+# TODO: uniqlist.c needs work before keyby=/setkey on complex can be tested; the drafts below are incomplete
+# test(2064.08, DT[ , .N, keyby = z],
+# DT = setkey(copy(DT[.N:1]), z)
+# test(2064.09, key(DT), 'z')
+# test(2064.10, DT
+test(2064.11, dcast(DT, z ~ grp, value.var='v', fill=0),
+     data.table(z=unq_z, `1`=c(3, 4, 5, 0), `2`=c(1, 1, 0, 9), key='z'))
+test(2064.12, frank(DT$z), c(1.5, 1.5, 3.5, 3.5, 5, 6))
+test(2064.13, frank(DT$z, ties.method='max'), c(2L, 2L, 4L, 4L, 5L, 6L))
+test(2064.14, frank(-DT$z, ties.method='min'), c(5L, 5L, 3L, 3L, 2L, 1L))
+test(2064.15, DT[ , rowid(z, grp)], rep(1L, 6L))
+test(2064.16, DT[ , rowid(z)], c(1:2, 1:2, 1L, 1L))
+test(2064.17, rleid(c(1i, 1i, 1i, 0, 0, 1-1i, 2+3i, 2+3i)), rep(1:4, c(3:1, 2L)))
+
+## assorted coverage tests from along the way
+if (test_bit64) {
+  test(2064.50, is.sorted(as.integer64(10:1)), FALSE)
+  test(2064.51, is.sorted(as.integer64(1:10)))
+}
+# sort by vector outside of table
+ord = 3:1
+test(2064.52, forder(data.table(a = 3:1), ord), 3:1)
+
+# DT1 = data.table(z = c(0+1i, 2-3i, 4+1i))
+# DT2 = data.table(z = c(2-3i, 0+1i, 0+0i))
+# DT1[DT2, on = 'z']
+
+
 
 ###################################
 #  Add new tests above this line  #

@@ -440,11 +440,61 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S
   if (!isInteger(by) || !LENGTH(by)) error("DT has %d columns but 'by' is either not integer or is length 0", length(DT));  // seq_along(x) at R level
   if (!isInteger(ascArg) || LENGTH(ascArg)!=LENGTH(by)) error("Either 'ascArg' is not integer or its length (%d) is different to 'by's length (%d)", LENGTH(ascArg), LENGTH(by));
   nrow = length(VECTOR_ELT(DT,0));
+  int n_cplx = 0;  // number of 'by' columns that are complex
   for (int i=0; i<LENGTH(by); i++) {
     if (INTEGER(by)[i] < 1 || INTEGER(by)[i] > length(DT))
       error("'by' value %d out of range [1,%d]", INTEGER(by)[i], length(DT));
     if ( nrow != length(VECTOR_ELT(DT, INTEGER(by)[i]-1)) )
       error("Column %d is length %d which differs from length of column 1 (%d)\n", INTEGER(by)[i], length(VECTOR_ELT(DT, INTEGER(by)[i]-1)), nrow);
+    if (TYPEOF(VECTOR_ELT(DT, INTEGER(by)[i]-1)) == CPLXSXP) n_cplx++;  // look up the column named by by[i]; i only indexes 'by'
+  }
+  if (n_cplx) {
+    // Complex columns are not sorted natively: each complex column in 'by' is
+    //   split into two double columns (real part first, then imaginary part)
+    //   and the regular forder machinery orders on those, which yields the
+    //   lexicographic order. A baremetal implementation would at root do the
+    //   same; this approach trades memory efficiency for simplicity, as we
+    //   don't expect complex sorting to be used extensively or on massive data
+    //   (seen clearly here at C from the 3+2*n_cplx PROTECT() calls)
+    int n_out = length(by) + n_cplx;
+    SEXP new_dt = PROTECT(allocVector(VECSXP, n_out)); n_protect++;
+    SEXP new_asc = PROTECT(allocVector(INTSXP, n_out)); n_protect++;
+    // will be simply 1:n_out
+    SEXP new_by = PROTECT(allocVector(INTSXP, n_out)); n_protect++;
+    int j = 0;  // next free slot in new_dt/new_asc/new_by
+    for (int i=0; i<length(by); i++) {
+      int by_idx = INTEGER(by)[i]-1;
+      if (TYPEOF(VECTOR_ELT(DT, by_idx)) == CPLXSXP) {
+        // I don't see any shorthand way of splitting off the real & imaginary
+        //   components at C level, i.e. a shorthand for Re(z), Im(z), so just
+        //   copy them element by element, as done in do_cmathfuns in R's
+        //   complex.c. Both halves inherit this column's ascending/descending flag.
+        SEXP realPart = PROTECT(allocVector(REALSXP, nrow)); n_protect++;
+        SEXP imagPart = PROTECT(allocVector(REALSXP, nrow)); n_protect++;
+        double *pre = REAL(realPart);
+        double *pim = REAL(imagPart);
+        Rcomplex *pz = COMPLEX(VECTOR_ELT(DT, by_idx));
+        for (int k = 0; k < nrow; k++) {
+          pre[k] = pz[k].r;
+          pim[k] = pz[k].i;
+        }
+        SET_VECTOR_ELT(new_dt, j, realPart);
+        SET_VECTOR_ELT(new_dt, j+1, imagPart);
+        INTEGER(new_asc)[j] = INTEGER(ascArg)[i];
+        INTEGER(new_asc)[j+1] = INTEGER(ascArg)[i];
+        INTEGER(new_by)[j] = j+1;
+        INTEGER(new_by)[j+1] = j+2;
+        j += 2;
+      } else {
+        SET_VECTOR_ELT(new_dt, j, VECTOR_ELT(DT, by_idx));
+        INTEGER(new_asc)[j] = INTEGER(ascArg)[i];
+        INTEGER(new_by)[j] = j+1;
+        j += 1;
+      }
+    }
+    DT = new_dt;
+    ascArg = new_asc;
+    by = new_by;
   }
 
   if (!isLogical(retGrpArg) || LENGTH(retGrpArg)!=1 || INTEGER(retGrpArg)[0]==NA_LOGICAL) error("retGrp must be TRUE or FALSE");

@@ -1,4 +1,5 @@
 #include "data.table.h"
+#include <complex.h>
+#include <string.h>
 
 // DONE: return 'uniqlist' as a vector (same as duplist) and write a separate function to get group sizes
 // Also improvements for numeric type with a hack of checking unsigned int (to overcome NA/NaN/Inf/-Inf comparisons) (> 2x speed-up)
@@ -200,6 +201,11 @@ SEXP rleid(SEXP l, SEXP cols) {
           // 8 bytes of bits are identical. For real (no rounding currently) and integer64
           // long long == 8 bytes checked in init.c
           break;
+        case CPLXSXP: {
+          // same run only if all 16 bytes match; casting the parts to long long would truncate them (1.2 and 1.7 both become 1)
+          Rcomplex *pz = COMPLEX(jcol);
+          same = memcmp(&pz[i], &pz[i-1], sizeof(Rcomplex))==0;
+        } break;
         default :
           error("Type '%s' not supported", type2char(TYPEOF(jcol)));  // # nocov
         }
@@ -232,6 +238,13 @@ SEXP rleid(SEXP l, SEXP cols) {
         }
       }
       break;
+    case CPLXSXP: {
+      Rcomplex *pzjcol = COMPLEX(jcol);
+      for (R_xlen_t i=1; i<nrow; i++) {
+        bool same = memcmp(&pzjcol[i], &pzjcol[i-1], sizeof(Rcomplex))==0;  // bitwise compare; a (long long) value cast would truncate the parts to integers
+        ians[i] = (grp += !same);
+      }
+    } break;
     default :
       error("Type '%s' not supported", type2char(TYPEOF(jcol)));
     }
-Original file line number
+Diff line change
@@ Expand Up / @@ -128,6 +128,8 @@ @@
         identical(y1,y2) && identical(y1,y3)
         # TRUE
         ```
+19. Sorting now extended to complex vectors, [#1703](https://github.com/Rdatatable/data.table/issues/1703). Consistent with `base::order`, sorting is done lexicographically (`z1<z2` means `Re(z1) < Re(z2) | (Re(z1) == Re(z2) & Im(z1) < Im(z2))`).
     #### BUG FIXES
@@ Expand Down @@