Skip to content

Commit

Permalink
multiple columns support
Browse files Browse the repository at this point in the history
  • Loading branch information
jangorecki committed Jun 16, 2020
1 parent 103b9c6 commit 2acaf23
Show file tree
Hide file tree
Showing 5 changed files with 335 additions and 213 deletions.
23 changes: 18 additions & 5 deletions R/bmerge.R
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
intCols = function(x, cols) all(vapply(cols, function(col, x) is.integer(x[[col]]), NA, x))

bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose)
{
if (length(icols)==1L && length(xcols)==1L && is.integer(i[[icols]]) && is.integer(x[[xcols]]) ## single column integer
&& isTRUE(getOption("datatable.smerge")) ## enable option
&& identical(nomatch, NA_integer_) ## for now only outer join
&& identical(ops, 1L) ## equi join
if (TRUE
&& isTRUE(getOption("datatable.smerge")) ## switch
&& length(icols)==length(xcols) ## avoid invalid input
&& intCols(i, icols) && intCols(x, xcols) ## all columns integer
#&& identical(nomatch, NA_integer_) ## nomatch=0L is made as post-processing
&& all(ops==1L) ## equi join
&& identical(roll, 0) && identical(rollends, c(FALSE, TRUE)) ## non-rolling join
) {
getIdxGrp = function(x, cols) { ## get index only if retGrp=T
Expand All @@ -14,8 +17,18 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
if (!is.null(attr(idx, "starts", exact=TRUE))) idx
}
if (verbose) {last.started.at=proc.time();cat("Starting smerge ...\n");flush.console()}
ans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), mult=mult, out.bmerge=TRUE)
ans = smerge(x=i, y=x, x.cols=icols, y.cols=xcols, x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), mult=mult, out.bmerge=TRUE)
if (identical(nomatch, 0L)) {
nom = is.na(ans$starts)
ans$starts[nom] = 0L
ans$lens[nom] = 0L
}
if (verbose) {cat("smerge done in",timetaken(last.started.at),"\n"); flush.console()}
if (verbose && !isTRUE(getOption("datatable.smerge"))) { ## just to satisfy existing bmerge unit tests, when switch above commented out
#cat("1\n", file="~/git/smergeOptCount.out", append=TRUE) ## be sure to have this path if you comment out switch of "datatable.smerge" option, 24 times used in main.Rraw
cat("existing index\n")
cat("ad hoc\n")
}
return(ans)
}
callersi = i
Expand Down
3 changes: 2 additions & 1 deletion R/wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ coerceFill = function(x) .Call(CcoerceFillR, x)

testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L])

smerge = function(x, y, x.idx=NULL, y.idx=NULL, mult=c("all","first","last","error"), out.bmerge=FALSE) .Call(CsmergeR, x, y, x.idx, y.idx, match.arg(mult), out.bmerge)
smerge = function(x, y, x.cols=seq_along(x), y.cols=seq_along(y), x.idx=NULL, y.idx=NULL, mult=c("all","first","last","error"), out.bmerge=FALSE) .Call(CsmergeR, x, y, x.cols, y.cols, x.idx, y.idx, match.arg(mult), out.bmerge)

22 changes: 13 additions & 9 deletions inst/tests/smerge.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,22 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
vecseq = data.table:::vecseq
}

bm = function(x, y, mult="all") {
stopifnot(is.integer(x), is.integer(y))
ans = bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult=mult, ops=1L, verbose=FALSE)
bm = function(x, y, x.cols=seq_along(x), y.cols=seq_along(y), mult="all") {
if (is.integer(x)) {x = data.table(x=x); x.cols=1L}
if (is.integer(y)) {y = data.table(y=y); y.cols=1L}
stopifnot(is.data.table(x), is.data.table(y))
ans = bmerge(x, y, x.cols, y.cols, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult=mult, ops=rep.int(1L, length(x.cols)), verbose=FALSE)
## if undefining SMERGE_STATS then we have to ignore allLen1 as well
ans$nMatch = as.numeric(sum(!is.na(vecseq(ans$starts, ans$lens, NULL))))
ans
}
sm = function(x, y, mult="all") {
stopifnot(is.integer(x), is.integer(y))
ans = smerge(x, y, mult=mult, out.bmerge=TRUE)
sm = function(x, y, x.cols=seq_along(x), y.cols=seq_along(y), mult="all") {
if (is.integer(x)) {x = data.table(x=x); x.cols=1L}
if (is.integer(y)) {y = data.table(y=y); y.cols=1L}
stopifnot(is.data.table(x), is.data.table(y))
ans = smerge(x, y, x.cols, y.cols, mult=mult, out.bmerge=TRUE)
## if undefining SMERGE_STATS then we have to ignore allLen1 as well
ans$nMatch = smerge(x, y, mult=mult, out.bmerge=FALSE)$nMatch
ans$nMatch = smerge(x, y, x.cols, y.cols, mult=mult, out.bmerge=FALSE)$nMatch
ans
}

Expand Down Expand Up @@ -216,8 +220,8 @@ y = sample.int(2e2L, 1e2L, TRUE)
test(21.02, sm(x, y), bm(x, y))

# [.data.table join
d1 = data.table(x=sample.int(2e2L, 1e2L, TRUE), v1=seq_along(x))
d2 = data.table(y=sample.int(2e2L, 1e2L, TRUE), v2=seq_along(y))
d1 = data.table(x=sample.int(2e2L, 1e2L, TRUE), v1=seq_len(1e2L))
d2 = data.table(y=sample.int(2e2L, 1e2L, TRUE), v2=seq_len(1e2L))
options(datatable.smerge=FALSE, datatable.verbose=TRUE) ## verbose=2L after #4491
test(101.01, expected <- d1[d2, on="x==y"], output="bmerge", notOutput="smerge")
options(datatable.smerge=TRUE)
Expand Down
2 changes: 1 addition & 1 deletion src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -250,4 +250,4 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na);
SEXP fcaseR(SEXP na, SEXP rho, SEXP args);

// smjoin.c
SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmerge);
SEXP smergeR(SEXP x, SEXP y,SEXP x_cols, SEXP y_cols, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmerge);
Loading

0 comments on commit 2acaf23

Please sign in to comment.