From 2d2892cb873ff7a26d7dce6b977df2616383235d Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 15 Nov 2022 04:06:51 -0700 Subject: [PATCH] Move ram tests (#5520) --- R/test.data.table.R | 22 +- R/utils.R | 10 + inst/tests/benchmark.Rraw | 317 +++++++++++++++++++++ inst/tests/other.Rraw | 28 ++ inst/tests/tests.Rraw | 564 ++++++++++---------------------------- 5 files changed, 514 insertions(+), 427 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 3ea6683d5..bc512bdfa 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -126,8 +126,8 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F on.exit(setwd(owd)) if (memtest) { - catf("\n***\n*** memtest=%d. This should be the first task in a fresh R session for best results. Ctrl-C now if not.\n***\n\n", memtest) - if (is.na(ps_mem())) stopf("memtest intended for Linux. Step through ps_mem() to see what went wrong.") + catf("\n***\n*** memtest=%d. This should be the first call in a fresh R_GC_MEM_GROW=0 R session for best results. Ctrl-C now if not.\n***\n\n", memtest) + if (is.na(rss())) stopf("memtest intended for Linux. Step through data.table:::rss() to see what went wrong.") } err = try(sys.source(fn, envir=env), silent=silent) @@ -197,7 +197,12 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F ans = timings[, diff:=c(NA,round(diff(RSS),1))][y+1L][,time:=NULL] # time is distracting and influenced by gc() calls; just focus on RAM usage here catf("10 largest RAM increases (MB); see plot for cumulative effect (if any)\n") print(ans, class=FALSE) - plot(timings$RSS, main=basename(fn), ylab="RSS (MB)") + dev.new(width=14, height=7) + par(mfrow=c(1,2)) + plot(timings$RSS, main=paste(basename(fn),"\nylim[0]=0 for context"), ylab="RSS (MB)", ylim=c(0,max(timings$RSS))) + mtext(lastRSS<-as.integer(ceiling(last(timings$RSS))), side=4, at=lastRSS, las=1, font=2) + plot(timings$RSS, main=paste(basename(fn),"\nylim=range for inspection"), ylab="RSS (MB)") + mtext(lastRSS, side=4, at=lastRSS, las=1, font=2) } catf("All %d tests (last %.8g) in %s completed ok in %s\n", ntest, env$prevtest, names(fn), timetaken(env$started.at)) @@ -227,15 +232,6 @@ compactprint = function(DT, topn=2L) { INT = function(...) { as.integer(c(...)) } # utility used in tests.Rraw -ps_mem = function() { - # nocov start - cmd = paste0("ps -o rss --no-headers ", Sys.getpid()) # ps returns KB - ans = tryCatch(as.numeric(system(cmd, intern=TRUE)), warning=function(w) NA_real_, error=function(e) NA_real_) - if (length(ans)!=1L || !is.numeric(ans)) ans=NA_real_ # just in case - round(ans / 1024, 1L) # return MB - # nocov end -} - gc_mem = function() { # nocov start # gc reports memory in MB @@ -280,7 +276,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no timings[as.integer(num), `:=`(time=time+took, nTest=nTest+1L), verbose=FALSE] if (memtest) { if (memtest==1L) gc() # see #5515 for before/after - timings[as.integer(num), RSS:=max(ps_mem(),RSS), verbose=FALSE] + timings[as.integer(num), RSS:=max(rss(),RSS), verbose=FALSE] if (memtest==2L) gc() } assign("lasttime", proc.time()[3L], parent.frame(), inherits=TRUE) # after gc() to exclude gc() time from next test when memtest diff --git a/R/utils.R b/R/utils.R index c81f9e58c..a78e5450f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -156,3 +156,13 @@ edit.data.table = function(name, ...) { setDT(NextMethod('edit', name))[] } # nocov end + +rss = function() { #5515 #5517 + # nocov start + cmd = paste0("ps -o rss --no-headers ", Sys.getpid()) # ps returns KB + ans = tryCatch(as.numeric(system(cmd, intern=TRUE)), warning=function(w) NA_real_, error=function(e) NA_real_) + if (length(ans)!=1L || !is.numeric(ans)) ans=NA_real_ # just in case + round(ans / 1024, 1L) # return MB + # nocov end +} + diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index 401331a13..04c5c490b 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -175,6 +175,322 @@ t2 = system.time(DT2 <- do.call(data.table, list(DF))) # 3.07s before, 0.02s af test(, identical(DT1, DT2)) test(, t2["elapsed"]/t1["elapsed"]<2) +########################################################### +# largest tests by ram usage moved out of tests.Rraw, #5517 +########################################################### + +# Test ad hoc by of more than 100,000 levels, see 2nd part of bug #1387 (100,000 from the limit of base::sort.list radix) +# This does need to be this large, like this in CRAN checks, because sort.list(method="radix") has this limit, which +# this tests. But it's well under 10 seconds. +DT = data.table(A=1:10,B=rnorm(10),C=factor(paste("a",1:100010,sep=""))) +test(301, nrow(DT[,sum(B),by=C])==100010) +DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep="")) +test(301.1, nrow(DT[,sum(B),by=C])==100010) + +# Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too. +options(datatable.optimize=0L) +set.seed(1) +DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") +test(637.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) +test(637.2, key(DT[J(43L),a:=99L]), NULL) +setkey(DT,a) +test(637.3, key(DT[,a:=99L,by=a]), NULL) +options(datatable.optimize=2L) +set.seed(1) +DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") +test(638.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) +test(638.2, key(DT[J(43L),a:=99L]), NULL) +setkey(DT,a) +test(638.3, key(DT[,a:=99L,by=a]), NULL) + +# Test X[Y] slowdown, #2216 +# Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes +# in R or elsewhere cause the 2 minute (!) bug to return. Hence not moving out to benmark.Rraw. +X = CJ(a=seq_len(1e3),b=seq_len(1e3)) +Y = copy(X) +X[4,b:=3L] # create a dup group, to force allLen1=FALSE +setkey(X) +test(819, system.time(X[Y,allow.cartesian=TRUE])["user.self"] < 10) # this system.time usage ok in this case +test(820, system.time(X[Y,mult="first"])["user.self"] < 10) # this system.time usage ok in this case + +# test uniqlengths +set.seed(45) +x <- sample(c(NA_integer_, 1:1e4), 1e6, TRUE) +ox <- forderv(x) +o1 <- uniqlist(list(x), ox) +test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) +o1 <- uniqlist(list(x)) +test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) +rm(list=c("x","ox","o1")) +gc() + +# Fix for (usually small) memory leak when grouping, #2648. +# Deliberate worst case: largest group (100000 rows) followed last by a small group (1 row). +DT = data.table(A=rep(1:2,c(100000,1)), B=runif(100001)) +before = gc()["Vcells",2] +for (i in 1:50) DT[, sum(B), by=A] +after = gc()["Vcells",2] +test(1157, after < before+3) # +3 = 3MB +# Before the patch, Vcells grew dramatically from 6MB to 60MB. Now stable at 6MB. Increase 50 to 1000 and it grew to over 1GB for this case. + +# Similar for when dogroups writes less rows than allocated, #2648. +DT = data.table(k = 1:50, g = 1:20, val = rnorm(1e4)) +before = gc()["Vcells",2] +for (i in 1:50) DT[ , unlist(.SD), by = 'k'] +after = gc()["Vcells",2] +test(1158, after < before+3) # 177.6MB => 179.2MB. Needs to be +3 now from v1.9.8 with alloccol up from 100 to 1024 + +# fix DT[TRUE, :=] using too much working memory for i, #1249 +if (!inherits(try(Rprofmem(NULL), silent=TRUE), "try-error")) { # in case R not compiled with memory profiling enabled + f = tempfile() + N = 1000000 # or any large number of rows + DT = data.table(A=1:N, B=rnorm(N)) + DT[TRUE, B := B * 2] # stabilize with initial dummy update + Rprofmem(f) + DT[TRUE, B := B * 2] # or some in-place update + Rprofmem(NULL) + test(1542, length(grep("000",readLines(f, warn=FALSE))), 1L) # one allocation for the RHS only + unlink(f) +} + +if (FALSE) { + # Full range takes too long for CRAN. + dts = seq(as.Date("0000-03-01"), as.Date("9999-12-31"), by="day") + dtsCh = as.character(dts) # 36s + dtsCh = gsub(" ","0",sprintf("%10s",dtsCh)) # R does not 0 pad years < 1000 + test(1739.1, length(dtsCh)==3652365 && identical(dtsCh[c(1,3652365)],c("0000-03-01","9999-12-31"))) +} else { + # test on CRAN a reduced but important range + dts = seq(as.Date("1899-12-31"), as.Date("2100-01-01"), by="day") + dtsCh = as.character(dts) + test(1739.2, length(dtsCh)==73051 && identical(dtsCh[c(1,73051)],c("1899-12-31","2100-01-01"))) +} +DT = data.table(A=dts, B=as.IDate(dts)) +test(1739.3, sapply(DT,typeof), c(A="double",B="integer")) +test(1739.4, typeof(dts), "double") +f = tempfile() +g = tempfile() # Full range +fwrite(DT,f) # 0.092s +write.csv(DT,g,row.names=FALSE,quote=FALSE) # 65.250s +test(1739.5, readLines(f), c("A,B",paste(dtsCh,dtsCh,sep=","))) +test(1739.6, readLines(f), readLines(g)) +unlink(f) +unlink(g) +rm(list=c("dtsCh","dts")) +gc() + +# catch malformed factor in rbindlist, #3315 +set.seed(32940) +NN=7e5; KK=4e4; TT=25 +DT = data.table( id = sample(KK, NN, TRUE), tt = sample(TT, NN, TRUE), ff = factor(sample(3, NN, TRUE)) ) +test(1978, print(DT[ , diff(ff), by = id]), error="Column 2 of item 1 has type 'factor' but has no levels; i.e. malformed.") # the print invokes rbindlist which bites + +# print.data.table row id in non-scientific notation, #1167 +DT <- data.table(a = rep(1:5,3*1e5), b = rep(letters[1:3],5*1e5)) +test(1549, capture.output(print(DT)), c(" a b", " 1: 1 a", " 2: 2 b", " 3: 3 c", " 4: 4 a", " 5: 5 b", " --- ", "1499996: 1 b", "1499997: 2 c", "1499998: 3 a", "1499999: 4 b", "1500000: 5 c")) +rm(DT) + +# Create a file to test a sample jump being skipped due to format error. It will fail later in the read step because +# this is a real error. Currently have not constructed an error for which nextGoodLine looks good, but in fact is not. +# Would need a very complicated construction of embedded new lines in quoted fields, to test that. +# This test size with default buffMB results in 2 threads being used. 2 is important to pass on CRAN. +DT = as.data.table(CO2) +f = tempfile() +for (i in 0:1000) { + start = nrow(CO2)*i + fwrite(DT[,Plant:=start:(start+nrow(CO2)-1)], f, append=TRUE, col.names=FALSE) + if (i==502) write("-999,Bad,Line,0.0,0.0,extra\n", f, append=TRUE) +} +test(1835, fread(f, verbose=TRUE), + output = "A line with too-many.*jump 50.*jump landed awkwardly.*skipped", + warning = "Stopped.*line 42253. Expected 5 fields but found 6.*discarded.*<<-999,Bad,Line,0.0,0.0,extra>>") +unlink(f) + +# test no memory leak, #2191 and #2284 +# These take a few seconds each, and it's important to run these on CRAN to check no leak +gc(); before = gc()["Vcells","(Mb)"] +for (i in 1:2000) { DT = data.table(1:3); rm(DT) } # in 1.8.2 would leak 3MB +gc(); after = gc()["Vcells","(Mb)"] +test(861, after < before+0.5) # close to 0.0 difference, but 0.5 for safe margin +gc(); before = gc()["Vcells","(Mb)"] +DF = data.frame(x=1:20, y=runif(20)) +for (i in 1:2000) { DT = as.data.table(DF); rm(DT) } +gc(); after = gc()["Vcells","(Mb)"] +test(862, after < before+0.5) +gc(); before = gc()["Vcells","(Mb)"] +DT = data.table(x=1:20, y=runif(20)) +for (i in 1:2000) { x <- DT[1:5,]; rm(x) } +gc(); after = gc()["Vcells","(Mb)"] +test(863, after < before+0.5) + +# fread should use multiple threads on single column input. +# tests 2 threads; the very reasonable limit on CRAN +# file needs to be reasonably large for threads to kick in (minimum chunkSize is 1MB currently) +if (getDTthreads() == 1L) { + cat("Test 1760 not run because this session either has no OpenMP or has been limited to one thread (e.g. under UBSAN and ASAN)\n") +} else { + N = if (TRUE) 2e6 else 1e9 # offline speed check + fwrite(data.table(A=sample(10,N,replace=TRUE)), f<-tempfile()) + test(1760.1, file.info(f)$size > 4*1024*1024) + test(1760.2, fread(f, verbose=TRUE, nThread=2), output="using 2 threads") + unlink(f) +} + +# segfault of unprotected var caught with the help of address sanitizer; was test 1509 +# in #5517 I figured this test shouldn't be reduced in size due to its nature +set.seed(1) +val = sample(c(1:5, NA), 1e4L, TRUE) +dt <- setDT(replicate(100L, val, simplify=FALSE)) +## to ensure there's no segfault... +ans <- melt(dt, measure.vars=names(dt), na.rm=TRUE) +test(1035.21, ans, ans) + +# gc race with altrep in R-devel May 2018, #2866 & #2767, PR#2882 +# This runs with 2 threads in the test suite on CRAN and AppVeyor etc. +# 2 threads are sufficient to fail before the fix. +N = 20 +DF = data.frame(a=rnorm(N), + b=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5]), + c=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5])) +DT = setDT(DF) # setDT required since data.table() already expanded altrep's +before = sum(gc()[, 2]) +fff = function(aref) { + ff = lapply(1:5, function(i) { + DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]] + }) + return(rbindlist(ff)) +} +for(i in 1:100) { + f = fff("a") + rm("f") +} +gc() # extra gc() (i.e. two including the one on next line) seems to reduce `after` + # from 29.7 to 27.2 (exactly `before`). Keeping the extra gc() as no harm. +after = sum(gc()[, 2]) +test(1912.1, after < before + 10) # 10MB very wide margin. With the gc race, heap usage grew much more which is all we're testing here (no blow up). +# +before = sum(gc()[, 2]) +fff = function(aref) { + DT = setDT(data.frame(a=1:N, b=1:N, c=1:N, d=1:N, e=1:N, f=1:N, g=1:N, h=1:N)) # 1:N creates altrep. A few of them too to tickle (the fixed) race. + lapply(1:5, function(i) { + DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]] + }) +} +for(i in 1:100) { + fff("a") +} +gc() +after = sum(gc()[, 2]) +test(1912.2, after < before + 10) + +DT = data.table(A=seq(1, 1000000), B="x", C=TRUE) +fwrite(DT, f<-tempfile()) +test(1815, fread(f, nrows=5), DT[1:5]) #2243: nrows small vs large nrow(DT) + +# Better jump sync and run-on in PR#2627 +# +# Reproduces error 'did not finish exactly where jump 1 found ...' in #2561 in master before PR #2627 +# the jump point is just before an empty line and the nextGoodLine() wasn't sync'd properly +x = sprintf("ABCDEFGHIJKLMNOPQRST%06d", 1:102184) +x[51094]="" +cat(x, file=f<-tempfile(), sep="\n") +test(1874.1, fread(f,header=FALSE,verbose=TRUE)[c(1,51094,.N),], + data.table(V1=c("ABCDEFGHIJKLMNOPQRST000001","","ABCDEFGHIJKLMNOPQRST102184")), + output="jumps=[0..2)") # ensure jump 1 happened +# +# out-of-sample short lines in the first jump, not near the jump point +x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) +x[5021:5041] = "small,batch,short,lines" # 4 fields not 5 +cat(x, file=f, sep="\n") +test(1874.2, fread(f), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:5020), + warning="Stopped early on line 5021.*<>") +test(1874.3, fread(f,fill=TRUE,verbose=TRUE)[c(1,5020,5021,5041,5042,.N),], + data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"), + V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"), + V3=c("KLMN","KLMN","short","short","KLMN","KLMN"), + V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"), + V5=c(1L,5020L,NA,NA,5042L,102184L)), + output="jumps=[0..2)") +# +# jump just before a set of 30 or more too-few lines, to reproduce "No good line could be found" error in #2267 +# confirmed fails in master with that error before PR#2627 +x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) +x[51094:51150] = "small,batch,short,lines" # 4 fields not 5 +cat(x, file=f, sep="\n") +test(1874.4, fread(f,verbose=TRUE), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:51093), + warning="Stopped early on line 51094.*<>", + output="jumps=[0..2)") +test(1874.5, fread(f,fill=TRUE,verbose=TRUE)[c(1,51093,51094,51150,51151,.N),], + data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"), + V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"), + V3=c("KLMN","KLMN","short","short","KLMN","KLMN"), + V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"), + V5=c(1L,51093L,NA,NA,51151L,102184L)), + output="jumps=[0..2)") +# +# jump inside a quoted field containing many new lines, to simulate a dirty jump +# we'll make this jump landing even harder for nextGoodLine() by making the lines resemble the number and types of the true lines, too. +# Rather than needing to make nextGoodLine() better and better (at some point it's impossible), in these rare cases we'll just sweep dirty jumps. +x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) +x[51093] = "\"A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n\",FGHI,KLMN,PQRS,51093" +cat(x, file=f, sep="\n") +test(1875.6, fread(f,verbose=TRUE)[c(1,51092:51094,.N),][3,V1:=gsub("\r","",V1)], # gsub since R on Windows replaces \n with \r\n + data.table(V1=c("ABCD","ABCD", "A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n", "ABCD","ABCD"), + V2="FGHI", V3="KLMN", V4="PQRS", V5=c(1L,51092:51094,102184L)), + output = "too-few.*sample jump 50.*jump landed awkwardly.*skipped.*Read the data.*jumps=\\[0..2\\).*jumps=\\[1..2\\).*Reading 2 chunks \\(1 swept\\)") +# Aside: although the file (with over 100,000 lines) is big enough for 100 sampling jumps (of which just 1, the middle sample jump, skipped), it's +# still too small for more than 2 reading chunks to be worth it which is correct (based on buffMB not nth) +unlink(f) + +# chmatchdup test from benchmark at the bottom of chmatch.c +set.seed(45L) +x = sample(letters, 1e5, TRUE) +y = sample(letters, 1e6, TRUE) +test(2000, c(head(ans<-chmatchdup(x,y,0L)),tail(ans)), INT(7,49,11,20,69,25,99365,100750,97596,99671,103320,99406)) +rm(list=c("x","y")) + +# Add nq tests 1641-1652 here with larger sizes and calls that have been turned off in the past as took too long, and +# restore the exact parameters w.r.t. Jan's comment: https://github.com/Rdatatable/data.table/pull/5520#discussion_r1020180583 + +# issue 2351 +set.seed(1) +DT = data.table(id=paste0("id",1:1e5), v=sample(100,1e5,replace=TRUE)) +fwrite(DT, file=f<-tempfile(), eol="\r") +test(1826.1, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id99999","id100000"), v=c(27L,38L,10L,13L))) +cat("id888,42", file=f, append=TRUE) # without final \r after last line +test(1826.2, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id100000","id888"), v=c(27L,38L,13L,42L))) +unlink(f) + +# segfault when rbindlist is asked to create a DT with more than 2bn rows +DT = data.table(1:1e6) +L = vector("list", 2148) +for (i in seq_along(L)) L[[i]] = DT # many references to the same DT to avoid actually using large RAM for this test +test(1850, rbindlist(L), error="Total rows in the list is 2148000000 which is larger than the maximum number of rows, currently 2147483647") +rm(L, DT) +gc() + +# segfault in forder when nrow/throttle=255 && nrow>=65536; #5077 +# Matt ran these on clang's ASAN+OpenMP which correctly faulted v1.14.0; these tests segfault consistently without ASAN too +set.seed(1) +DT = data.table(grp=sample(255L, 65536L ,replace=TRUE)) # >=255 && >=65536 necessary +setDTthreads(throttle=nrow(DT)) # increase throttle to reduce threads to 1 for this nrow +test(2201.1, nrow(DT[, .N, by=grp]), 255L) +test(2201.2, nrow(setkey(DT, grp)), 65536L) +set.seed(1) +DT = data.table(grp=sample(65536L)) # extra case with all size 1 groups too just for fun +test(2201.3, nrow(DT[, .N, by=grp]), 65536L) +test(2201.4, nrow(setkey(DT, grp)), 65536L) +setDTthreads() # restore default throttle + +# print of DT with many columns reordered them, #3306. +DT = as.data.table(lapply(1:255, function(i)rep.int(i, 105L))) # 105 to be enough for 'top 5 ... bottom 5' to print +out = capture.output(print(DT)) +tt = out[grep("V",out)] +tt = unlist(strsplit(gsub(" ","",tt), "V")) +test(1982.1, tt[1L], "") +tt = as.integer(tt[tt!=""]) +test(1982.2, tt, seq_along(tt)) + # fread leak, #3292 dummy = rep("1\t2\t3\t4\t5", 10000000) writeLines(dummy, "out.tsv") @@ -183,3 +499,4 @@ for (i in 1:10) data.table::fread("out.tsv") end = gc()["Vcells",2] test(, end/start < 1.05) + diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index c6520a377..807a67c19 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -690,4 +690,32 @@ if (loaded[["nanotime"]]) { } +# that plot works; moved from tests.Rraw 167 to here to save ram of loading graphics package and possible screen device issues on overloaded servers, #5517 +DT = data.table( a=1:5, b=11:50, d=c("A","B","C","D"), f=1:5, grp=1:5 ) +test(28.1, DT[,plot(b,f)], NULL) +test(28.2, as.integer(DT[,hist(b)]$breaks), seq.int(10L,50L,by=5L)) # as.integer needed for R 3.1.0 +test(28.3, DT[,plot(b,f),by=.(grp)], data.table(grp=integer())) +try(graphics.off(),silent=TRUE) + +# test DT$.<- in a data.table-unaware package +# moved from tests.Rraw 1890 to here to save ram of loading stats package and plot, #5517 +DT = data.table(A=1:5) +test(29.1, stats::ts.plot(gpars=DT), error="object must have one or more observations") +# Inside ts.plot is a gpars$ylab<- which happens before its error. That dispatches to our $<- which does the alloc.col() +test(29.2, DT, data.table(A=1:5)) + +if (FALSE) { # moved from tests.Rraw in #5517 and not yet back on; wasn't sure we need to still test reshape2 + # test dispatch for non-data.table objects, #4864. + if (inherits(try(getNamespace("reshape2"), silent=TRUE),"try-error")) { + test(1038.001, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), + error="The melt generic in data.table has been passed a data.frame") + } else { + # 1) GLCI rel-cran has reshape2 installed because caret in other.Rraw depends on reshape2 + # 2) a user running test.data.table() with reshape2 installed (doesn't have to be loaded) + # 3) in dev locally I have reshape2 installed to run caret in other.Rraw + test(1038.002, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), + as.data.frame(melt(DT, id.vars=1:2, measure.vars=5:6)), + warning="The melt generic in data.table has been passed a data.frame") + } +} diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b242290df..7947ac009 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7,6 +7,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { } if ((tt<-compiler::enableJIT(-1))>0) cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") + rm_all = function() {} } else { require(data.table) # Make symbols to the installed version's ::: so that we can i) test internal-only not-exposed R functions @@ -52,6 +53,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { print.data.table = data.table:::print.data.table replace_dot_alias = data.table:::replace_dot_alias rollup.data.table = data.table:::rollup.data.table + rss = data.table:::rss selfrefok = data.table:::selfrefok setcoalesce = data.table:::setcoalesce setdiff_ = data.table:::setdiff_ @@ -98,6 +100,13 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { year = data.table::year # lubridate yearmon = data.table::yearmon # zoo yearqtr = data.table::yearqtr # zoo + + rm_all = function(env=parent.frame()) { + tt = setdiff(ls(envir=env), .do_not_rm) + rm(list=tt, envir=env) + gc() + invisible() + } } # Optional suggests are now tested in other.Rraw, #5516. No calls to require() or library() should occur @@ -156,6 +165,8 @@ base_messages = list( mixed_subscripts = get_msg(letters[-1:1]) ) +########################## +.do_not_rm = ls() # objects that exist at this point should not be removed by rm_all(); e.g. test_*, base_messages, Ctest_dt_win_snprintf, prevtest, etc ########################## test(1.1, tables(env=new.env()), null.data.table(), output = "No objects of class") @@ -520,11 +531,7 @@ test(164, foo(f), DT[,mean(b),by=d]) test(165, subset(DT,a>2), DT[a>2]) test(166, suppressWarnings(split(DT,DT$grp)[[2]]), DT[grp==2]) -# and that plotting works -test(167.1, DT[,plot(b,f)], NULL) -test(167.2, as.integer(DT[,hist(b)]$breaks), seq.int(10L,50L,by=5L)) # as.integer needed for R 3.1.0 -test(167.3, DT[,plot(b,f),by=.(grp)], data.table(grp=integer())) -try(graphics.off(),silent=TRUE) +# 167 tested graphics::plot, moved to other.Rraw 28 to save ram, #5517 # IDateTime conversion methods that ggplot2 uses (it calls as.data.frame method) # Since %b is e.g. "nov." in LC_TIME=fr_FR.UTF-8 locale, we need to @@ -961,13 +968,7 @@ DT = data.table(a=1:3,b=1:9,v=1:9,key="a,b") test(300, DT[J(1),sum(v),by=b], data.table(b=c(1L,4L,7L),V1=c(1L,4L,7L))) # should not retain key because by= is not on head(key(DT)) test(300.1, DT[J(1:2),sum(v),by=b], data.table(b=c(1L,4L,7L,2L,5L,8L),V1=c(1L,4L,7L,2L,5L,8L))) -# Test ad hoc by of more than 100,000 levels, see 2nd part of bug #1387 (100,000 from the limit of base::sort.list radix) -# This does need to be this large, like this in CRAN checks, because sort.list(method="radix") has this limit, which -# this tests. But it's well under 10 seconds. -DT = data.table(A=1:10,B=rnorm(10),C=factor(paste("a",1:100010,sep=""))) -test(301, nrow(DT[,sum(B),by=C])==100010) -DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep="")) -test(301.1, nrow(DT[,sum(B),by=C])==100010) +# 301 moved to benchmark.Rraw, #5517 # Test fast assign DT = data.table(a=c(1L,2L,2L,3L),b=4:7,key="a") @@ -1930,21 +1931,7 @@ DT = data.table(x=1:3,y=1:3) test(635, names(DT[,list(x,y,a=y)]), c("x","y","a")) test(636, names(DT[,list(x,a=y)]), c("x","a")) -# Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too. -options(datatable.optimize=0L) -set.seed(1) -DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") -test(637.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) -test(637.2, key(DT[J(43L),a:=99L]), NULL) -setkey(DT,a) -test(637.3, key(DT[,a:=99L,by=a]), NULL) -options(datatable.optimize=2L) -set.seed(1) -DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") -test(638.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) -test(638.2, key(DT[J(43L),a:=99L]), NULL) -setkey(DT,a) -test(638.3, key(DT[,a:=99L,by=a]), NULL) +# 637-638 moved to benchmark.Rraw, #5517 # Test printing is right aligned without quotes etc, and rownames are repeated ok for more than 20 rows DT=data.table(a=8:10,b=c("xy","x","xyz"),c=c(1.1,22.1,0)) @@ -1964,9 +1951,9 @@ test(645, setkey(DT,b), error="Column 2 is length 2 which differs from length of # Test faster mean with a lot of very small groups. Example from (now not needed as much) data.table wiki point 3. # benchmarks.Rraw contains the same, to be scaled up. set.seed(9) -n=1e4 # very small n so as not to overload daily CRAN checks. -DT=data.table(grp1=sample(1:150, n, replace=TRUE), - grp2=sample(1:150, n, replace=TRUE), +n=1e3 # very small n (1e4) so as not to overload daily CRAN checks. Then reduced even further to just 1e3, #5517 +DT=data.table(grp1=sample.int(150L, n, replace=TRUE), + grp2=sample.int(150L, n, replace=TRUE), x=rnorm(n), y=rnorm(n)) DT[c(2,5),x:=NA] # seed chosen to get a group of size 2 and 3 in the first 5 to easily inspect. @@ -2423,16 +2410,7 @@ mycols = 2 test(814.12, DT[,!..mycols], ans) test(814.13, DT[,-..mycols], ans) - -# Test X[Y] slowdown, #2216 -# Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes -# in R or elsewhere cause the 2 minute (!) bug to return. Hence not moving out to benmark.Rraw. -X = CJ(a=seq_len(1e3),b=seq_len(1e3)) -Y = copy(X) -X[4,b:=3L] # create a dup group, to force allLen1=FALSE -setkey(X) -test(819, system.time(X[Y,allow.cartesian=TRUE])["user.self"] < 10) # this system.time usage ok in this case -test(820, system.time(X[Y,mult="first"])["user.self"] < 10) # this system.time usage ok in this case +# 819-820 moved to benchmark.Rraw, #5517 # Optimization of lapply(,"+"), #2212 DT = data.table(a=rep(1:3,each=2L),b=1:6,c=7:12) @@ -2534,24 +2512,7 @@ i = data.frame(foo=1) test(859, DT[i], DT[J(i)]) test(860, DT[i], DT[data.table(i)]) -# test no memory leak, #2191 and #2284 -# These take a few seconds each, and it's important to run these on CRAN to check no leak -gc(); before = gc()["Vcells","(Mb)"] -for (i in 1:2000) { DT = data.table(1:3); rm(DT) } # in 1.8.2 would leak 3MB -gc(); after = gc()["Vcells","(Mb)"] -test(861, after < before+0.5) # close to 0.0 difference, but 0.5 for safe margin - -gc(); before = gc()["Vcells","(Mb)"] -DF = data.frame(x=1:20, y=runif(20)) -for (i in 1:2000) { DT = as.data.table(DF); rm(DT) } -gc(); after = gc()["Vcells","(Mb)"] -test(862, after < before+0.5) - -gc(); before = gc()["Vcells","(Mb)"] -DT = data.table(x=1:20, y=runif(20)) -for (i in 1:2000) { x <- DT[1:5,]; rm(x) } -gc(); after = gc()["Vcells","(Mb)"] -test(863, after < before+0.5) +# 861-863 moved to benchmark.Rraw, #5517 # rbindlist should look for the first non-empty data.table - New changes (from Arun). Explanation below: # Even if data.table is empty, as long as there are column names, they should be considered. @@ -3251,13 +3212,7 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) test(1035.20, melt(DT, id.vars=1:2), data.table(A=1:2, B=3:4, variable=factor(rep(1L, 4L), labels="D"), value=5:8)) - # segfault of unprotected var caught with the help of address sanitizer; was test 1509 - set.seed(1) - val = sample(c(1:5, NA), 1e4L, TRUE) - dt <- setDT(replicate(100L, val, simplify=FALSE)) - ## to ensure there's no segfault... - ans <- melt(dt, measure.vars=names(dt), na.rm=TRUE) - test(1035.21, ans, ans) + # 1035.21 moved to benchmark.Rraw, #5517 # improper levels fix, #1359; was test 1563 dt = data.table(id=1:3, x=NA_character_, y=c('a', NA_character_, 'c')) @@ -3360,18 +3315,8 @@ Sep,33.5,19.4,15.7,11.9,0,100.8,100.8,0,12.7,12.7,0,174.1") test(1037.414, melt(x, id.vars='x1', measure.vars='r'), error="Unknown column type 'raw' for column 'r'") - # test dispatch for non-data.table objects, #4864. - if (inherits(try(getNamespace("reshape2"), silent=TRUE),"try-error")) { - test(1038.001, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), - error="The melt generic in data.table has been passed a data.frame") - } else { - # 1) GLCI rel-cran has reshape2 installed because caret in other.Rraw depends on reshape2 - # 2) a user running test.data.table() with reshape2 installed (doesn't have to be loaded) - # 3) in dev locally I have reshape2 installed to run caret in other.Rraw - test(1038.002, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), - as.data.frame(melt(DT, id.vars=1:2, measure.vars=5:6)), - warning="The melt generic in data.table has been passed a data.frame") - } + # 1038 moved to other.Rraw, #5517 + } # sorting and grouping of Inf, -Inf, NA and NaN, #117, #112 & #105 @@ -4070,7 +4015,8 @@ if (test_longdouble) { old = getNumericRounding() set.seed(6) - x = rnorm(1e6)*1e4 + x = rnorm(1e4)*1e4 # first 1e4 reduced from 1e6 to save ram, #5517 + x = c(x, 11969.235757385, 11969.235757322) # add back 2 numbers from the 1e6 sample whose order is changed in test 1147.3 ans = base::sort.list(x, method="shell") setNumericRounding(0) test(1147.1, ans, forderv(x)) @@ -4104,16 +4050,7 @@ if (test_longdouble) { test(1149.1, forderv(integer(0)), integer(0)) test(1149.2, forderv(numeric(0)), integer(0)) -# test uniqlengths -set.seed(45) -x <- sample(c(NA_integer_, 1:1e4), 1e6, TRUE) -ox <- forderv(x) -o1 <- uniqlist(list(x), ox) -test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) -o1 <- uniqlist(list(x)) -test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) -rm(list=c("x","ox","o1")) -gc() +# 1151 moved to benchmark.Rraw, #5517 # #67 fix - grouping with .SDcols gave "symbol not subsettable error" - consequence of FR #355 implementation dt = data.table(grp = sample(letters[1:3],20, replace = TRUE), v1 = rnorm(20), v2 = rnorm(20)) @@ -4153,21 +4090,7 @@ setkey(dt, x) test(1155.4, dt[J(NaN)], dt[is.nan(x)]) test(1155.5, dt[J(NA_real_)], dt[is.na(x) & !is.nan(x)]) -# Fix for (usually small) memory leak when grouping, #2648. -# Deliberate worst case: largest group (100000 rows) followed last by a small group (1 row). -DT = data.table(A=rep(1:2,c(100000,1)), B=runif(100001)) -before = gc()["Vcells",2] -for (i in 1:50) DT[, sum(B), by=A] -after = gc()["Vcells",2] -test(1157, after < before+3) # +3 = 3MB -# Before the patch, Vcells grew dramatically from 6MB to 60MB. Now stable at 6MB. Increase 50 to 1000 and it grew to over 1GB for this case. - -# Similar for when dogroups writes less rows than allocated, #2648. -DT = data.table(k = 1:50, g = 1:20, val = rnorm(1e4)) -before = gc()["Vcells",2] -for (i in 1:50) DT[ , unlist(.SD), by = 'k'] -after = gc()["Vcells",2] -test(1158, after < before+3) # 177.6MB => 179.2MB. Needs to be +3 now from v1.9.8 with alloccol up from 100 to 1024 +# 1157-1158 moved to benchmark.Rraw, #5517 # tests for 'setDT' - convert list, DF to DT without copy x <- data.frame(a=1:4, b=5:8) @@ -4479,48 +4402,46 @@ seed = as.integer(Sys.time()) # sample(9999L, 1L) temporary fix, because all the seedInfo = paste("forder decreasing argument test: seed = ", seed," ", sep="") # no NaN (because it's hard to match with base::order); tested below in 1988.4-8 set.seed(seed) -foo <- function(n) apply(matrix(sample(letters, n*8L, TRUE), ncol=8L), 1, paste, sep="") +foo <- function(n) apply(matrix(sample(letters, n*8L, TRUE), ncol=8L), 1, paste, collapse="") i1 = as.integer(sample(c(-100:100), 1e3, TRUE)) i2 = as.integer(sample(c(-100:100, -1e6, 1e6), 1e3, TRUE)) d1 = as.numeric(sample(c(-100:100,Inf,-Inf), 1e3, TRUE)) d2 = as.numeric(rnorm(1e3)) -c1 = sample(c(letters), 1e3, TRUE) -c2 = sample(foo(200), 1e3, TRUE) +c1 = sample(letters, 1e3, TRUE) +c2 = sample(foo(50), 1e3, TRUE) DT = data.table(i1, i2, d1, d2, c1, c2) # randomise col order as well colorder=sample(ncol(DT)) setcolorder(DT, names(DT)[colorder]) seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="") -ans = vector("list", length(names(DT))) test_no = 1223.0 oldnfail = nfail -for (i in seq_along(names(DT))) { - cj = as.matrix(do.call(CJ, split(rep(c(1L,-1L), each=i), 1:i))) - ans[[i]] = combn(names(DT), i, function(x) { - tmp = apply(cj, 1, function(y) { +for (nvars in seq_along(names(DT))) { + signs = expand.grid(replicate(nvars, c(-1L,1L), simplify=FALSE)) + combn(names(DT), nvars, function(x) { + for (i in seq_len(nrow(signs))) { test_no <<- signif(test_no+.001, 7) ll = as.call(c(as.name("order"), lapply(seq_along(x), function(j) { - if (y[j] == 1L) + if (signs[i,j] == 1L) as.name(x[j]) else { - if (class(DT[[x[j]]]) =="character") + if (is.character(DT[[x[j]]])) as.call(c(as.name("-"), as.call(list(as.name("xtfrm"), as.name(x[j]))))) else as.call(list(as.name("-"), as.name(x[j]))) } }) )) - test(test_no, forderv(DT, by=x, order=y), with(DT, eval(ll))) - }) - dim(tmp)=NULL - list(tmp) + test(test_no, forderv(DT, by=x, order=signs[i,]), with(DT, eval(ll))) + } + integer() }) } -ans = NULL if (nfail > oldnfail) cat(seedInfo, "\n") # to reproduce +rm_all() # fix for bug #44 - unique on null data.table should return null data.table test(1224, unique(data.table(NULL)), data.table(NULL)) @@ -4616,7 +4537,7 @@ if (base::getRversion() < "3.3.0") { # Test for optimisation of 'order' to 'forder'. Copied to benchmarks.Rraw too. set.seed(45L) -DT = data.table(x=sample(1e2, 1e5, TRUE), y=sample(1e2, 1e5, TRUE)) +DT = data.table(x=sample.int(1e2, 1e3, TRUE), y=sample.int(1e2, 1e3, TRUE)) # 1e5 reduced again to 1e3, #5517 test(1241, DT[order(x,-y)], # optimized to forder() DT[base_order(x,-y)]) # not optimized @@ -4890,7 +4811,7 @@ test(1268.22, dt[, c(as.list(c), lapply(.SD, mean)), by=a], # Wide range numeric and integer64, to test all bits old_rounding = getNumericRounding() -x = sample( c(seq(-1e100, 1e100, length.out=1e5), c(seq(-1e-100,1e-100,length.out=1e5))) ) +x = sample( c(seq(-1e100, 1e100, length.out=1e3), c(seq(-1e-100,1e-100,length.out=1e3))) ) # 1e5 reduced to 1e3, #5517 setNumericRounding(0) test(1269, forderv(x), base::order(x)) setNumericRounding(2) # not affected by rounding @@ -5212,8 +5133,8 @@ DT = DT[1L] set(DT,1L,"b",FALSE) # passing 1L as i here is needed to avoid column plonk, so changes the logical singleton in place test(1297, as.integer(TRUE[1]), 1L) # In R 3.1, TRUE[1] returns the global TRUE but TRUE doesn't yet (parses as new vector) test(1298, as.integer(TRUE), 1L) -# orignal example, verbatim from James Sams : -upc_table = data.table(upc=1:100000, upc_ver_uc=rep(c(1,2), times=50000), is_PL=rep(c(TRUE, FALSE, FALSE, TRUE), each=25000), product_module_code=rep(1:4, times=25000), ignore.column=2:100001) +# orignal example, verbatim from James Sams; sizes reduced to save ram in #5517 +upc_table = data.table(upc=1:1000, upc_ver_uc=rep(c(1,2), times=500), is_PL=rep(c(TRUE, FALSE, FALSE, TRUE), each=250), product_module_code=rep(1:4, times=250), ignore.column=2:1001) test(1299, upc_table[, .N, by=list(upc, upc_ver_uc)][,max(N)], 1L) # all size 1 groups test(1300, upc_table[, list(is_PL, product_module_code), keyby=list(upc, upc_ver_uc)][,upc[1:3]], 1:3L) # was warning "internal TRUE value has been modified" rm(list="upc_table") @@ -7629,18 +7550,8 @@ dtab <- data.table(pid = factor(c("i", "nouana")), c("pid", "year")) test(1541, key(dtp[dtab]), c("pid", "year")) -# fix DT[TRUE, :=] using too much working memory for i, #1249 -if (!inherits(try(Rprofmem(NULL), silent=TRUE), "try-error")) { # in case R not compiled with memory profiling enabled - f = tempfile() - N = 1000000 # or any large number of rows - DT = data.table(A=1:N, B=rnorm(N)) - DT[TRUE, B := B * 2] # stabilize with initial dummy update - Rprofmem(f) - DT[TRUE, B := B * 2] # or some in-place update - Rprofmem(NULL) - test(1542, length(grep("000",readLines(f, warn=FALSE))), 1L) # one allocation for the RHS only - unlink(f) -} +# 1542.0 moved to benchmark.Rraw, #5517 + # DT[TRUE] should shallow copy as v1.11.8 and earlier did (#3214); in future more will shallow copy too DT = data.table(id = 1:5, key="id") DT1 = DT[TRUE] @@ -7789,10 +7700,7 @@ ans2 <- fread(f, sep=",", header=TRUE, encoding="UTF-8") test(1548.1, unique(unlist(lapply(ans1, Encoding))), "unknown") test(1548.2, unique(unlist(lapply(ans2, Encoding))), "UTF-8") -# #1167 print.data.table row id in non-scientific notation -DT <- data.table(a = rep(1:5,3*1e5), b = rep(letters[1:3],5*1e5)) -test(1549, capture.output(print(DT)), c(" a b", " 1: 1 a", " 2: 2 b", " 3: 3 c", " 4: 4 a", " 5: 5 b", " --- ", "1499996: 1 b", "1499997: 2 c", "1499998: 3 a", "1499999: 4 b", "1500000: 5 c")) -rm(DT) +# 1549 moved to benchmark.Rraw, #5517 # PR by @dselivanov # fixes #504 - handle nastring while reading (without coercion to character) @@ -9193,6 +9101,8 @@ dt = data.table(x=1:5, y=6:10, z=c(1,1,1,2,2)) test(1638, dt[, .SD, by=z, verbose=TRUE], output="All optimizations are turned off") options(datatable.optimize=Inf) +rm_all() + #1389 - split.data.table - big chunk of unit tests set.seed(123) dt = data.table(x1 = rep(letters[1:2], 6), x2 = rep(letters[3:5], 4), x3 = rep(letters[5:8], 3), y = rnorm(12)) @@ -9284,14 +9194,14 @@ test(1639.056, TRUE, all( sapply(l, sapply, ncol) == rep(4L, 4) )) l = split(fdt, by = c("x1","x2","x3"), flatten=FALSE) # empty levels in x3 after subset are expanded -test(1639.057, TRUE, all( - is.list(l), identical(names(l), c("b","a")), - sapply(l, function(x) !is.data.table(x) && is.list(x)), - sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)), - identical(lapply(l, lapply, names), list(b=list(d=c("h","f","e","g"), e=c("h","f","e","g"), c=c("f","h","e","g")), a=list(e=c("g","e","f","h"), d=c("e","g","f","h"), c=c("e","g","f","h")))), - sapply(l, sapply, sapply, nrow) == rep(c(1L,1L,0L,0L), 6), - sapply(l, sapply, sapply, ncol) == rep(4L, 24) -)) +# memtest tracing in #5520 showed this split() and the one before 1639.188 (both by 3 columns) account for the RAM usage in 1639. But they should be gc()'d eventually after rm_all(). +test(1639.0571, is.list(l)) +test(1639.0572, names(l), c("b","a")) +test(1639.0573, all(sapply(l, function(x) !is.data.table(x) && is.list(x)))) +test(1639.0574, all(sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)))) +test(1639.0575, lapply(l, lapply, names), list(b=list(d=c("h","f","e","g"), e=c("h","f","e","g"), c=c("f","h","e","g")), a=list(e=c("g","e","f","h"), d=c("e","g","f","h"), c=c("e","g","f","h")))) +test(1639.0576, all(sapply(l, sapply, sapply, nrow) == rep(c(1L,1L,0L,0L), 6))) +test(1639.0577, all(sapply(l, sapply, sapply, ncol) == rep(4L, 24))) l = split(fdt, by = c("x3","x1"), drop=TRUE, flatten=FALSE) # multi col rev test(1639.058, TRUE, all( is.list(l), identical(names(l), c("h","f","g","e")), @@ -9656,6 +9566,7 @@ test(1639.141, all(sapply(dtL, truelength) > 1000)) dt <- data.table(x = factor("a"), y = 1) test(1639.142, x = split(dt, by = "x"), y = list(a = dt)) test(1639.143, x = split(dt, by = "y"), y = list(`1` = dt)) +rm_all() # allow x's cols (specifically x's join cols) to be referred to using 'x.' syntax # patch for #1615. Note that I specifically have not implemented x[y, aa, on=c(aa="bb")] @@ -9668,10 +9579,10 @@ test(1640.2, x[y, c(.SD, .(x.aa=x.aa)), on=c(aa="bb")], data.table(aa=3:5, cc=c( # tests for non-equi joins # function to create a random data.table with all necessary columns nq_fun = function(n=100L) { - i1 = sample(sample(n, 10L), n, TRUE) - i2 = sample(-n/2:n/2, n, TRUE) - i3 = sample(-1e6:1e6, n, TRUE) - i4 = sample(c(NA_integer_, sample(-n:n, 10L, FALSE)), n, TRUE) + i1 = sample(sample.int(n, 10L), n, TRUE) + i2 = sample.int(n, n, TRUE) - as.integer(n/2) # this used to be type numeric before #5517 which didn't seem intentional + i3 = sample.int(2e6, n, TRUE) - as.integer(1e6) # used to sample from -1e6:1e6 which if allocated would be 8MB, #5517 + i4 = sample(c(NA_integer_, sample.int(n*2L, 10L, FALSE)-n), n, TRUE) d1 = sample(rnorm(10L), n, TRUE) d2 = sample(rnorm(50), n, TRUE) @@ -9683,15 +9594,55 @@ nq_fun = function(n=100L) { dt = data.table(i1,i2,i3,i4, d1,d2,d3,d4, c1,c2) if (test_bit64) { - I1 = as.integer64(sample(sample(n, 10L), n, TRUE)) - I2 = as.integer64(sample(-n/2:n/2, n, TRUE)) - I3 = as.integer64(sample(-1e6:1e6, n, TRUE)) - I4 = as.integer64(sample(c(NA_integer_, sample(-n:n, 10L, FALSE)), n, TRUE)) + I1 = as.integer64(sample(sample.int(n, 10L), n, TRUE)) + I2 = as.integer64(sample.int(n, n, TRUE) - as.integer(n/2)) + I3 = as.integer64(sample.int(2e6, n, TRUE) - as.integer(1e6)) # there used to be another -1e6:1e6 here whose altrep likely allocated when sample accessed it, #5517 + I4 = as.integer64(sample(c(NA_integer_, sample.int(n*2L, 10L, FALSE)-n), n, TRUE)) dt = cbind(dt, data.table(I1,I2,I3,I4)) } dt } +construct <- function(cols, vals, ops, x, y) { + expr = lapply(seq_along(cols), function(i) { + GT_or_LT = ops[i]==">" || ops[i]=="<" + if (inherits(vals[[i]], "integer64")) { + if (is.na.integer64(vals[[i]])) if (GT_or_LT) quote(logical()) else as.call(list(quote(is.na.integer64), as.name(cols[[i]]))) + else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), as.integer(vals[[i]]))) + # don't know how to construct a call with int64 -- vals[[i]] gets converted to NAN + } else { + if (is.nan(vals[[i]])) if (GT_or_LT) quote(logical(0)) else as.call(list(quote(is.nan), as.name(cols[[i]]))) + else if (is_only_na(vals[[i]])) if (GT_or_LT) quote(logical()) else as.call(list(quote(is_only_na), as.name(cols[[i]]))) + else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), vals[[i]])) + } + }) + Reduce(function(x,y)call("&",x,y), expr) +} + +check <- function(x, y, cols, ops, mult="all") { + # gather just row numbers here and then select all rows once afterwards, rather than rbindlist + rowNums = unlist(lapply(1:nrow(y), function(i) { + e = construct(cols, y[i, ..cols], ops, x, y) + rowNums = which(with(x, eval(e))) # raw expression, isolated from both [.data.table overhead and subset optimization + if (!length(rowNums) || mult=="all") + rowNums + else if (mult=="first") + rowNums[1L] + else # mult=="last" + rowNums[length(rowNums)] + })) + x[rowNums] +} + +nq <- function(x, y, cols, ops, nomatch=0L, mult="all") { + sd_cols = c(paste0("x.", cols), setdiff(names(x), cols)) + ans = x[y, mget(sd_cols, as.environment(-1)), on = paste0(cols, ops, cols), allow.cartesian=TRUE, nomatch=nomatch, mult=mult] + setnames(ans, gsub("^x[.]", "", names(ans))) + setcolorder(ans, names(x))[] +} + +is_only_na <- function(x) is.na(x) & !is.nan(x) + nqjoin_test <- function(x, y, k=1L, test_no, mult="all") { ops = c("==", ">=", "<=", ">", "<") xclass = sapply(x, class) @@ -9702,42 +9653,6 @@ nqjoin_test <- function(x, y, k=1L, test_no, mult="all") { thisops[startsWith(cols, "c")] = "==" thisops }) - is_only_na <- function(x) is.na(x) & !is.nan(x) - construct <- function(cols, vals, ops) { - expr = lapply(seq_along(cols), function(i) { - GT_or_LT = ops[i]==">" || ops[i]=="<" - if (inherits(vals[[i]], "integer64")) { - if (is.na.integer64(vals[[i]])) if (GT_or_LT) quote(logical()) else as.call(list(quote(is.na.integer64), as.name(cols[[i]]))) - else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), as.integer(vals[[i]]))) - # don't know how to construct a call with int64 -- vals[[i]] gets converted to NAN - } else { - if (is.nan(vals[[i]])) if (GT_or_LT) quote(logical(0)) else as.call(list(quote(is.nan), as.name(cols[[i]]))) - else if (is_only_na(vals[[i]])) if (GT_or_LT) quote(logical()) else as.call(list(quote(is_only_na), as.name(cols[[i]]))) - else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), vals[[i]])) - } - }) - Reduce(function(x,y)call("&",x,y), expr) - } - check <- function(x, y, cols, ops, mult="all") { - # gather just row numbers here and then select all rows once afterwards, rather than rbindlist - rowNums = unlist(lapply(1:nrow(y), function(i) { - e = construct(cols, y[i, ..cols], ops) - rowNums = which(with(x, eval(e))) # raw expression, isolated from both [.data.table overhead and subset optimization - if (!length(rowNums) || mult=="all") - rowNums - else if (mult=="first") - rowNums[1L] - else # mult=="last" - rowNums[length(rowNums)] - })) - x[rowNums] - } - nq <- function(x, y, cols, ops, nomatch=0L, mult="all") { - sd_cols = c(paste0("x.", cols), setdiff(names(x), cols)) - ans = x[y, mget(sd_cols, as.environment(-1)), on = paste0(cols, ops, cols), allow.cartesian=TRUE, nomatch=nomatch, mult=mult] - setnames(ans, gsub("^x[.]", "", names(ans))) - setcolorder(ans, names(x))[] - } for (i in seq_along(runcmb)) { thiscols = runcmb[[i]] thisops = runops[[i]] @@ -9750,7 +9665,7 @@ nqjoin_test <- function(x, y, k=1L, test_no, mult="all") { gc() # no longer needed but left in place just in case, no harm } -dt1 = nq_fun(400L) +dt1 = nq_fun(100L) # 400 reduced to 100, #5517 dt2 = nq_fun(50L) x = na.omit(dt1) y = na.omit(dt2) @@ -10881,31 +10796,7 @@ test(1738.3, sapply(DT,typeof), c(A="double",B="integer")) test(1738.4, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) test(1738.5, as.integer(as.Date(c("0000-03-01","9999-12-31"))), c(-719468L,2932896L)) -if (FALSE) { - # Full range takes too long for CRAN. - dts = seq(as.Date("0000-03-01"), as.Date("9999-12-31"), by="day") - dtsCh = as.character(dts) # 36s - dtsCh = gsub(" ","0",sprintf("%10s",dtsCh)) # R does not 0 pad years < 1000 - test(1739.1, length(dtsCh)==3652365 && identical(dtsCh[c(1,3652365)],c("0000-03-01","9999-12-31"))) -} else { - # test on CRAN a reduced but important range - dts = seq(as.Date("1899-12-31"), as.Date("2100-01-01"), by="day") - dtsCh = as.character(dts) - test(1739.2, length(dtsCh)==73051 && identical(dtsCh[c(1,73051)],c("1899-12-31","2100-01-01"))) -} -DT = data.table(A=dts, B=as.IDate(dts)) -test(1739.3, sapply(DT,typeof), c(A="double",B="integer")) -test(1739.4, typeof(dts), "double") -f = tempfile() -g = tempfile() # Full range -fwrite(DT,f) # 0.092s -write.csv(DT,g,row.names=FALSE,quote=FALSE) # 65.250s -test(1739.5, readLines(f), c("A,B",paste(dtsCh,dtsCh,sep=","))) -test(1739.6, readLines(f), readLines(g)) -unlink(f) -unlink(g) -rm(list=c("dtsCh","dts")) -gc() +# 1739 moved to benchmark.Rraw, #5517 # dateTimeAs DT = data.table( @@ -11223,12 +11114,13 @@ test(1750.07, # 0 length `by`, must also use `sets=list()`, so 0L rows result nrow(groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = character(), .SDcols=c("amount","value"), sets=list(), id=TRUE)), 0L ) -test(1750.08, all( # for any single value from dataset there should be always same aggregate result on any level of grouping - sapply(seq_len(nrow(dt)), function(i) uniqueN( +# for any single value from dataset there should be always be the same aggregate result on any level of grouping +# changed from all(sapply()) to for() to save ram, #5517 +for (i in seq_len(nrow(dt))) { + test(1750.08+i/10000, uniqueN( groupingsets(dt[i], j = lapply(.SD, sum), by = c("color","year","status"), sets=list(c("color","year","status"), c("year"), c("status"), character())), - by=c("amount","value") - )) == 1L -), TRUE) + by=c("amount","value")) == 1L) +} # all grouping id matches in all totals r = groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=list(c("color","year","status"), c("year"), c("status"), character()), id=TRUE) test(1750.09, uniqueN( @@ -11457,18 +11349,7 @@ if (test_R.utils) test(1759, fread(testDir("alluniquechar.csv.gz"))[c(1,2,499,50 H=c("tokakysooopwtmlkeimzbgpein","hguwmynjhecsxpxldyzlemavmw", "lyclruzkazfqhyxnppaafwcveo","myfqhltlwzwwxyvshwrzrdmfyq"))) -# fread should use multiple threads on single column input. -# tests 2 threads; the very reasonable limit on CRAN -# file needs to be reasonably large for threads to kick in (minimum chunkSize is 1MB currently) -if (getDTthreads() == 1L) { - cat("Test 1760 not run because this session either has no OpenMP or has been limited to one thread (e.g. under UBSAN and ASAN)\n") -} else { - N = if (TRUE) 2e6 else 1e9 # offline speed check - fwrite(data.table(A=sample(10,N,replace=TRUE)), f<-tempfile()) - test(1760.1, file.info(f)$size > 4*1024*1024) - test(1760.2, fread(f, verbose=TRUE, nThread=2), output="using 2 threads") - unlink(f) -} +# 1760 moved to benchmark.Rraw, #5517 # fread single column with superfluous fill=TRUE, #2118 test(1761.1, fread("1\n2\n3", fill=TRUE), data.table(V1=1:3)) @@ -11813,10 +11694,10 @@ ld = sapply(same, as.IDate) test(1779.01, uniqueN(ld)==1L) lt = sapply(same[1:2], as.ITime) # exclude date test(1779.02, uniqueN(lt)==1L) -# some random 1e6 timestamps old defaults vs new methods UTC +# some random timestamps old defaults vs new methods UTC intpx = function(x) as.integer(as.POSIXct(x, origin = "1970-01-01", tz = "UTC")) set.seed(1) -i = sample(intpx("2015-10-12")-intpx("2014-10-12"), 1e5, TRUE) + intpx("2014-10-12") +i = sample(intpx("2015-10-12")-intpx("2014-10-12"), 1e3, TRUE) + intpx("2014-10-12") # 1e5 reduced to 1e3, #5517 p = as.POSIXct(i, origin = "1970-01-01", tz = "UTC") test(1779.03, identical(as.ITime.default(p), as.ITime(p))) test(1779.04, identical(as.IDate.default(p), as.IDate(p))) @@ -11888,9 +11769,7 @@ test(1812, fread("A,B\n1,2\n3,4\n", skip="4", verbose=TRUE), data.table(V1=3L, V test(1813, fread("A,B\n1,2\n3,4", skip=10L), error="skip=10 but the input only has 3 lines") test(1814, fread("A,B\n1,2\n3,4\n \n\t", skip=3L), error="skip has been set after the last non-whitespace") -DT = data.table(A=seq(1, 1000000), B="x", C=TRUE) -fwrite(DT, f<-tempfile()) -test(1815, fread(f, nrows=5), DT[1:5]) #2243 +# 1815 moved to benchmark.Rraw, #5517 test(1816.1, fread("A,E\n1,2\n5,7\n4,6\n\x1A\x1A", verbose=TRUE), data.table(A=c(1L, 5L, 4L), E=c(2L, 7L, 6L)), @@ -12007,14 +11886,7 @@ fwrite(DT, f) test(1825.22, fread(f, colClasses = c(a = "numeric", b = "integer")), DT, warning="Attempt to override column 2.*ignored") unlink(f) -# issue 2351 -set.seed(1) -DT = data.table(id=paste0("id",1:1e5), v=sample(100,1e5,replace=TRUE)) -fwrite(DT, file=f<-tempfile(), eol="\r") -test(1826.1, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id99999","id100000"), v=c(27L,38L,10L,13L))) -cat("id888,42", file=f, append=TRUE) # without final \r after last line -test(1826.2, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id100000","id888"), v=c(27L,38L,13L,42L))) -unlink(f) +# 1826 moved to benchmark.Rraw, #5517 # Issue 2222 test(1827.1, fread("A,B\n1987,1\n1987,3\n", na.strings=c("1987", "NA")), data.table(A=c(NA,NA),B=c(1L,3L))) @@ -12102,21 +11974,7 @@ if (test_R.utils) { V12=c("AAAAAAAAAAAAA","","AAAAAAA","AAA"))) } -# Create a file to test a sample jump being skipped due to format error. It will fail later in the read step because -# this is a real error. Currently have not constructed an error for which nextGoodLine looks good, but in fact is not. -# Would need a very complicated construction of embedded new lines in quoted fields, to test that. -# This test size with default buffMB results in 2 threads being used. 2 is important to pass on CRAN. -DT = as.data.table(CO2) -f = tempfile() -for (i in 0:1000) { - start = nrow(CO2)*i - fwrite(DT[,Plant:=start:(start+nrow(CO2)-1)], f, append=TRUE, col.names=FALSE) - if (i==502) write("-999,Bad,Line,0.0,0.0,extra\n", f, append=TRUE) -} -test(1835, fread(f, verbose=TRUE), - output = "A line with too-many.*jump 50.*jump landed awkwardly.*skipped", - warning = "Stopped.*line 42253. Expected 5 fields but found 6.*discarded.*<<-999,Bad,Line,0.0,0.0,extra>>") -unlink(f) +# 1835 moved to benchmark.Rraw, #5517 test(1836, fread('1,2,"3,a"\n4,5,"6,b"'), data.table(V1=c(1L,4L), V2=c(2L,5L), V3=c("3,a","6,b"))) # 2196 @@ -12221,7 +12079,7 @@ rand_strings = function(n) { apply(M, 1, function(x) paste0(letters[x], collapse="")) } set.seed(123) # the random data here doesn't match the data in issue 2275 because they used stringi::stri_rand_strings which has a different RNG -n = 100000 +n = 1000 # reduced from 100000 to 1000 for #5517 DT1 = data.table(RANDOM_STRING = rand_strings(n), DATE = sample(seq(as.Date('2016-01-01'), as.Date('2016-12-31'), by="day"), n, replace=TRUE)) DT2 = data.table(RANDOM_STRING = rand_strings(n), @@ -12266,13 +12124,7 @@ test(1849.9, fread(f, select=c("Date", "Description", "Balance")), data.table(Date=20150725L,Description="abcd",Balance="$5,006")) unlink(f) -# segfault when rbindlist is asked to create a DT with more than 2bn rows -DT = data.table(1:1e6) -L = vector("list", 2148) -for (i in seq_along(L)) L[[i]] = DT # many references to the same DT to avoid actually using large RAM for this test -test(1850, rbindlist(L), error="Total rows in the list is 2148000000 which is larger than the maximum number of rows, currently 2147483647") -rm(list=c("L","DT")) -gc() +# 1850 moved to benchmark.Rraw, #5517 # by=.EACHI missings to list columns, #2300 dt = data.table(a=factor(1:5, levels=1:10), b=as.list(letters[1:5])) @@ -12577,60 +12429,7 @@ fwrite(DT,f<-tempfile()) test(1873, fread(f), DT) unlink(f) -# Better jump sync and run-on in PR#2627 -# -# Reproduces error 'did not finish exactly where jump 1 found ...' in #2561 in master before PR #2627 -# the jump point is just before an empty line and the nextGoodLine() wasn't sync'd properly -x = sprintf("ABCDEFGHIJKLMNOPQRST%06d", 1:102184) -x[51094]="" -cat(x, file=f<-tempfile(), sep="\n") -test(1874.1, fread(f,header=FALSE,verbose=TRUE)[c(1,51094,.N),], - data.table(V1=c("ABCDEFGHIJKLMNOPQRST000001","","ABCDEFGHIJKLMNOPQRST102184")), - output="jumps=[0..2)") # ensure jump 1 happened -# -# out-of-sample short lines in the first jump, not near the jump point -x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) -x[5021:5041] = "small,batch,short,lines" # 4 fields not 5 -cat(x, file=f, sep="\n") -test(1874.2, fread(f), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:5020), - warning="Stopped early on line 5021.*<>") -test(1874.3, fread(f,fill=TRUE,verbose=TRUE)[c(1,5020,5021,5041,5042,.N),], - data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"), - V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"), - V3=c("KLMN","KLMN","short","short","KLMN","KLMN"), - V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"), - V5=c(1L,5020L,NA,NA,5042L,102184L)), - output="jumps=[0..2)") -# -# jump just before a set of 30 or more too-few lines, to reproduce "No good line could be found" error in #2267 -# confirmed fails in master with that error before PR#2627 -x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) -x[51094:51150] = "small,batch,short,lines" # 4 fields not 5 -cat(x, file=f, sep="\n") -test(1874.4, fread(f,verbose=TRUE), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:51093), - warning="Stopped early on line 51094.*<>", - output="jumps=[0..2)") -test(1874.5, fread(f,fill=TRUE,verbose=TRUE)[c(1,51093,51094,51150,51151,.N),], - data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"), - V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"), - V3=c("KLMN","KLMN","short","short","KLMN","KLMN"), - V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"), - V5=c(1L,51093L,NA,NA,51151L,102184L)), - output="jumps=[0..2)") -# -# jump inside a quoted field containing many new lines, to simulate a dirty jump -# we'll make this jump landing even harder for nextGoodLine() by making the lines resemble the number and types of the true lines, too. -# Rather than needing to make nextGoodLine() better and better (at some point it's impossible), in these rare cases we'll just sweep dirty jumps. -x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) -x[51093] = "\"A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n\",FGHI,KLMN,PQRS,51093" -cat(x, file=f, sep="\n") -test(1875.6, fread(f,verbose=TRUE)[c(1,51092:51094,.N),][3,V1:=gsub("\r","",V1)], # gsub since R on Windows replaces \n with \r\n - data.table(V1=c("ABCD","ABCD", "A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n", "ABCD","ABCD"), - V2="FGHI", V3="KLMN", V4="PQRS", V5=c(1L,51092:51094,102184L)), - output = "too-few.*sample jump 50.*jump landed awkwardly.*skipped.*Read the data.*jumps=\\[0..2\\).*jumps=\\[1..2\\).*Reading 2 chunks \\(1 swept\\)") -# Aside: although the file (with over 100,000 lines) is big enough for 100 sampling jumps (of which just 1, the middle sample jump, skipped), it's -# still too small for more than 2 reading chunks to be worth it which is correct (based on buffMB not nth) -unlink(f) +# 1874-1875 moved to benchmark.Rraw, #5517 test(1876, fread("http://hkhfsk\nhttp://fhdkf\nhttp://kjfhskd\nhttp://hfkjf", header=FALSE), # data not a download, #2531 data.table(V1=c("http://hkhfsk","http://fhdkf","http://kjfhskd","http://hfkjf"))) @@ -12724,7 +12523,7 @@ DT = fread(",2,3\n1,,3\n1,2,\n") # all rows contain an NA, #2784 test(1887.3, na.omit(DT), DT[0L]) test(1887.4, na.omit(DT, invert=TRUE), DT) -x = runif(1e4) +x = runif(1e3) # 1e4 reduced to 1e3 in #5517 but really it was the 1e6 just after 1888.5 below which is now 1e3 too test(1888, fsort(x), base::sort(x)) test(1888.1, fsort(x, decreasing = TRUE), base::sort(x, decreasing = TRUE), warning = "New parallel sort has not been implemented for decreasing=TRUE.*one thread") @@ -12738,7 +12537,7 @@ test(1888.4, fsort(x, decreasing = TRUE, na.last = TRUE), base::sort(x, decreasi x <- as.integer(x) test(1888.5, fsort(x), base::sort(x, na.last = FALSE), warning = "Input is not a vector of type double. New parallel sort has only been done for double vectors so far.*Using one thread") -x = runif(1e6) +x = runif(1e3) test(1888.6, y<-fsort(x,verbose=TRUE), output="nth=.*Top 20 MSB counts") test(1888.7, !base::is.unsorted(y)) test(1888.8, fsort(x,verbose=1), error="verbose must be TRUE or FALSE") @@ -12751,11 +12550,7 @@ test(1889, chmatch(x,x), 1:1000) rm(list=x) gc() -# test DT$.<- in a data.table-unaware package -DT = data.table(A=1:5) -test(1890.1, stats::ts.plot(gpars=DT), error="object must have one or more observations") -# Inside ts.plot is a gpars$ylab<- which happens before its error. That dispatches to our $<- which does the alloc.col() -test(1890.2, DT, data.table(A=1:5)) +# 1890 used stats::ts.plot, moved to other.Rraw 29 to save ram, #5517 # na="" default, #2524 test(1891.1, fread('A,B,C\n1,foo,4\n2,,5\n3,bar,6\n', na.strings=""), data.table(A=1:3, B=c("foo",NA,"bar"), C=4:6)) @@ -12971,43 +12766,7 @@ test(1911.2, DT[, COL_INT := integer(0)], error = "RHS of assignment to existing column 'COL_INT' is zero length but not NULL.*") -# gc race with altrep in R-devel May 2018, #2866 & #2767, PR#2882 -# This runs with 2 threads in the test suite on CRAN and AppVeyor etc. -# 2 threads are sufficient to fail before the fix. -N = 20 -DF = data.frame(a=rnorm(N), - b=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5]), - c=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5])) -DT = setDT(DF) # setDT required since data.table() already expanded altrep's -before = sum(gc()[, 2]) -fff = function(aref) { - ff = lapply(1:5, function(i) { - DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]] - }) - return(rbindlist(ff)) -} -for(i in 1:100) { - f = fff("a") - rm("f") -} -gc() # extra gc() (i.e. two including the one on next line) seems to reduce `after` - # from 29.7 to 27.2 (exactly `before`). Keeping the extra gc() as no harm. -after = sum(gc()[, 2]) -test(1912.1, after < before + 10) # 10MB very wide margin. With the gc race, heap usage grew much more which is all we're testing here (no blow up). -# -before = sum(gc()[, 2]) -fff = function(aref) { - DT = setDT(data.frame(a=1:N, b=1:N, c=1:N, d=1:N, e=1:N, f=1:N, g=1:N, h=1:N)) # 1:N creates altrep. A few of them too to tickle (the fixed) race. - lapply(1:5, function(i) { - DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]] - }) -} -for(i in 1:100) { - fff("a") -} -gc() -after = sum(gc()[, 2]) -test(1912.2, after < before + 10) +# 1912 moved to benchmark.Rraw, #5517 # BEGIN port of old testthat tests, #2740. Issue numbers may be from R-forge. # @@ -14212,11 +13971,7 @@ test(1977.4, DT["D", -"GRP"], data.table(ID="D", X=NA_real_, key="ID")) test(1977.5, DT["D", c("ID","GRP")], data.table(ID="D", GRP=NA_integer_, key="ID")) test(1977.6, DT[c("A","D"), c("ID","GRP")], data.table(ID=c("A","A","D"), GRP=INT(1,1,NA))) -# catch malformed factor in rbindlist, #3315 -set.seed(32940) -NN=7e5; KK=4e4; TT=25 -DT = data.table( id = sample(KK, NN, TRUE), tt = sample(TT, NN, TRUE), ff = factor(sample(3, NN, TRUE)) ) -test(1978, print(DT[ , diff(ff), by = id]), error="Column 2 of item 1 has type 'factor' but has no levels; i.e. malformed.") # the print invokes rbindlist which bites +# 1978 moved to benchmark.Rraw, #5517 # Drop Null Values from `j` list elements #1406 DT = data.table(a = 1:3,b = letters[1:3],c = LETTERS[1:3]) @@ -14236,14 +13991,7 @@ DT = data.table( id = 1:5 , val = letters[1:5] ) test(1981.3, DT[, new_col := shift(val, "lead")], error="is.numeric(n) is not TRUE") test(1981.4, DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") -# print of DT with many columns reordered them, #3306. -DT = as.data.table(lapply(1:255, function(i)rep.int(i, 105L))) # 105 to be enough for 'top 5 ... bottom 5' to print -out = capture.output(print(DT)) -tt = out[grep("V",out)] -tt = unlist(strsplit(gsub(" ","",tt), "V")) -test(1982.1, tt[1L], "") -tt = as.integer(tt[tt!=""]) -test(1982.2, tt, seq_along(tt)) +# 1982 moved to benchmark.Rraw, #5517 # parse(text = 'list(`\\phantom{.}`)') fails, #3319 DT <- data.table(x=1, y=1:5) @@ -14532,12 +14280,7 @@ dx = data.table(id = 1L, key = "id") di = list(z=c(2L, 1L)) test(1999.2, key(dx[di]), NULL) -# chmatchdup test from benchmark at the bottom of chmatch.c -set.seed(45L) -x = sample(letters, 1e5, TRUE) -y = sample(letters, 1e6, TRUE) -test(2000, c(head(ans<-chmatchdup(x,y,0L)),tail(ans)), INT(7,49,11,20,69,25,99365,100750,97596,99671,103320,99406)) -rm(list=c("x","y")) +# 2000 moved to benchmark.Rraw, #5517 # rbindlist use.names=TRUE returned random column order when ncol>255; #3373 DT = setDT(replicate(300, rnorm(3L), simplify = FALSE)) @@ -16483,8 +16226,8 @@ g = function(x) { if (x==1L) factor(c("a","b")) else factor(c("a","b","c")) } test(2114.2, DT[,g(.GRP),by=A], data.table(A=INT(1,1,2,2,2), V1=as.factor(c("a","b","a","b","c")))) # original test verbatim from the same issue #2199 set.seed(2) -ids = sample(letters, 20) -dates = 1:40 +ids = sample(letters, 10) # reduced from 20 to 10 +dates = 1:10 # and 40 to 10 to save ram, #5517 dt = data.table(CJ(dates, ids, ids)) setnames(dt, c("date", "id1", "id2")) dt[, value := rnorm(length(date))] @@ -16495,8 +16238,8 @@ f1 = function(sdt) { melt.data.table(dt1, id.vars = "id1") } res = dt[, f1(.SD), by=date] -test(2114.3, setnames(res[c(1,.N)],"variable","id2")[,id2:=as.character(id2)][], dt[c(1,.N)]) -test(2114.4, print(res), output="date.*0.433") +test(2114.3, setnames(res[c(1,.N)],"variable","id2")[,id2:=as.character(id2)], dt[c(1,.N)]) +test(2114.4, print(res), output="date.*-0.830") # and from #2522 DT = data.table(id=1:9, grp=rep(1:3,each=3), val=c("a","b","c", "a","b","c", "a","b","c")) test(2114.5, as.character(DT[, valfactor1 := factor(val), by = grp]$valfactor1), ans<-rep(c("a","b","c"),3)) @@ -17695,18 +17438,7 @@ d[1:50, "a"] = d[51:100, "a"] setDT(d) test(2200, nrow(d[a==99]), 2L) -# segfault in forder when nrow/throttle=255 && nrow>=65536; #5077 -# Matt ran these on clang's ASAN+OpenMP which correctly faulted v1.14.0; these tests segfault consistently without ASAN too -set.seed(1) -DT = data.table(grp=sample(255L, 65536L ,replace=TRUE)) # >=255 && >=65536 necessary -setDTthreads(throttle=nrow(DT)) # increase throttle to reduce threads to 1 for this nrow -test(2201.1, nrow(DT[, .N, by=grp]), 255L) -test(2201.2, nrow(setkey(DT, grp)), 65536L) -set.seed(1) -DT = data.table(grp=sample(65536L)) # extra case with all size 1 groups too just for fun -test(2201.3, nrow(DT[, .N, by=grp]), 65536L) -test(2201.4, nrow(setkey(DT, grp)), 65536L) -setDTthreads() # restore default throttle +# 2201 moved to benchmark.Rraw, #5517 # fwrite now allows sep="", #4817 test(2202.1, fwrite(data.frame(a="id", b=letters[1:5], c=1:5), sep=""), @@ -18021,26 +17753,30 @@ DT = data.table(x = sample(letters[1:5], 20, TRUE), c = sample(c(0+3i,1,-1-1i,NA), 20, TRUE), l = sample(c(TRUE, FALSE, NA), 20, TRUE), r = as.raw(sample(1:5, 20, TRUE))) -load(testDir("test2224.Rdata")) # ans array +load(testDir("test2224.Rdata")) # 47KB array 24x8 where each cell contains a length-20 result if (test_bit64) { DT[, i64:=as.integer64(sample(c(-2L,0L,2L,NA), 20, TRUE))] } else { ans = ans[, -match("i64",colnames(ans))] } -test(2224.01, sapply(names(DT)[-1], function(col) { - sapply(list(1, 5, -1, -5, c(1,2), c(-1,1)), function(n) list( - # fill is tested by group in tests 2218.*; see comments in #5205 - EVAL(sprintf("DT[, shift(%s, %d, type='lag'), by=x]$V1", col, n)), - EVAL(sprintf("DT[, shift(%s, %d, type='lead'), by=x]$V1", col, n)), - EVAL(sprintf("DT[, shift(%s, %d, type='shift'), by=x]$V1", col, n)), - EVAL(sprintf("DT[, shift(%s, %d, type='cyclic'), by=x]$V1", col, n)) - )) -}), ans) +i = 1L +for (col in names(DT)[-1]) { + for (n in list(1, 5, -1, -5, c(1,2), c(-1,1))) { + for (type in c('lag','lead','shift','cyclic')) { + # fill is tested by group in tests 2218.*; see comments in #5205 + # sapply(sapply()) changed to for(for(for())) to save 29MB, #5517 + test(2224.1+i/10000, # 192 tests here when test_bit64=TRUE; 168 when FALSE + EVAL(sprintf("DT[, shift(%s, %d, type='%s'), by=x]$V1", col, n, type)), + ans[[i]]) + i = i+1L + } + } +} a = 1:2 # fill argument with length > 1 which is not a call -test(2224.02, DT[, shift(i, fill=a), by=x], error="fill must be a vector of length 1") +test(2224.2, DT[, shift(i, fill=a), by=x], error="fill must be a vector of length 1") DT = data.table(x=pairlist(1), g=1) # unsupported type as argument -test(2224.03, DT[, shift(x), g], error="Type 'list' is not supported by GForce gshift.") +test(2224.3, DT[, shift(x), g], error="Type 'list' is not supported by GForce gshift.") # groupingsets by named by argument test(2225.1, groupingsets(data.table(iris), j=sum(Sepal.Length), by=c('Sp'='Species'), sets=list('Species')),