Commit 2d2892c: Move ram tests (#5520)
mattdowle authored Nov 15, 2022
1 parent 5affced commit 2d2892c
Showing 5 changed files with 514 additions and 427 deletions.
22 changes: 9 additions & 13 deletions R/test.data.table.R
@@ -126,8 +126,8 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F
on.exit(setwd(owd))

if (memtest) {
catf("\n***\n*** memtest=%d. This should be the first task in a fresh R session for best results. Ctrl-C now if not.\n***\n\n", memtest)
if (is.na(ps_mem())) stopf("memtest intended for Linux. Step through ps_mem() to see what went wrong.")
catf("\n***\n*** memtest=%d. This should be the first call in a fresh R_GC_MEM_GROW=0 R session for best results. Ctrl-C now if not.\n***\n\n", memtest)
if (is.na(rss())) stopf("memtest intended for Linux. Step through data.table:::rss() to see what went wrong.")
}

err = try(sys.source(fn, envir=env), silent=silent)
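# Note (illustrative, not part of this commit): how a memtest run is intended to be
# launched, based on the semantics visible in this hunk. Start a fresh session from a
# shell with
#   R_GC_MEM_GROW=0 R
# so heap-growth steps don't mask per-test RSS deltas, then as the first call:
#   require(data.table)
#   test.data.table(memtest=1L)  # memtest=1L gc()s before each RSS reading, 2L after; see #5515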
@@ -197,7 +197,12 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F
ans = timings[, diff:=c(NA,round(diff(RSS),1))][y+1L][,time:=NULL] # time is distracting and influenced by gc() calls; just focus on RAM usage here
catf("10 largest RAM increases (MB); see plot for cumulative effect (if any)\n")
print(ans, class=FALSE)
plot(timings$RSS, main=basename(fn), ylab="RSS (MB)")
dev.new(width=14, height=7)
par(mfrow=c(1,2))
plot(timings$RSS, main=paste(basename(fn),"\nylim[0]=0 for context"), ylab="RSS (MB)", ylim=c(0,max(timings$RSS)))
mtext(lastRSS<-as.integer(ceiling(last(timings$RSS))), side=4, at=lastRSS, las=1, font=2)
plot(timings$RSS, main=paste(basename(fn),"\nylim=range for inspection"), ylab="RSS (MB)")
mtext(lastRSS, side=4, at=lastRSS, las=1, font=2)
}

catf("All %d tests (last %.8g) in %s completed ok in %s\n", ntest, env$prevtest, names(fn), timetaken(env$started.at))
@@ -227,15 +232,6 @@ compactprint = function(DT, topn=2L) {

INT = function(...) { as.integer(c(...)) } # utility used in tests.Rraw

ps_mem = function() {
# nocov start
cmd = paste0("ps -o rss --no-headers ", Sys.getpid()) # ps returns KB
ans = tryCatch(as.numeric(system(cmd, intern=TRUE)), warning=function(w) NA_real_, error=function(e) NA_real_)
if (length(ans)!=1L || !is.numeric(ans)) ans=NA_real_ # just in case
round(ans / 1024, 1L) # return MB
# nocov end
}

gc_mem = function() {
# nocov start
# gc reports memory in MB
@@ -280,7 +276,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
timings[as.integer(num), `:=`(time=time+took, nTest=nTest+1L), verbose=FALSE]
if (memtest) {
if (memtest==1L) gc() # see #5515 for before/after
timings[as.integer(num), RSS:=max(ps_mem(),RSS), verbose=FALSE]
timings[as.integer(num), RSS:=max(rss(),RSS), verbose=FALSE]
if (memtest==2L) gc()
}
assign("lasttime", proc.time()[3L], parent.frame(), inherits=TRUE) # after gc() to exclude gc() time from next test when memtest
10 changes: 10 additions & 0 deletions R/utils.R
@@ -156,3 +156,13 @@ edit.data.table = function(name, ...) {
setDT(NextMethod('edit', name))[]
}
# nocov end

rss = function() { #5515 #5517
# nocov start
cmd = paste0("ps -o rss --no-headers ", Sys.getpid()) # ps returns KB
ans = tryCatch(as.numeric(system(cmd, intern=TRUE)), warning=function(w) NA_real_, error=function(e) NA_real_)
if (length(ans)!=1L || !is.numeric(ans)) ans=NA_real_ # just in case
round(ans / 1024, 1L) # return MB
# nocov end
}
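# Note (illustrative, not part of this commit): what rss() wraps, assuming a Linux ps
# that reports rss in KB as the comment above states.
#   system(paste("ps -o rss --no-headers", Sys.getpid()), intern=TRUE)  # e.g. "524288" (KB)
#   data.table:::rss()                                                  # e.g. 512.0 (MB, 1 decimal place)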

317 changes: 317 additions & 0 deletions inst/tests/benchmark.Rraw
@@ -175,6 +175,322 @@ t2 = system.time(DT2 <- do.call(data.table, list(DF))) # 3.07s before, 0.02s af
test(, identical(DT1, DT2))
test(, t2["elapsed"]/t1["elapsed"]<2)

###########################################################
# largest tests by ram usage moved out of tests.Rraw, #5517
###########################################################

# Test an ad hoc 'by' of more than 100,000 levels, see 2nd part of bug #1387 (100,000 is the limit of base::sort.list radix)
# This does need to be this large, even in CRAN checks, because sort.list(method="radix") has that limit, which is
# exactly what is being tested. Even so, it runs well under 10 seconds.
DT = data.table(A=1:10,B=rnorm(10),C=factor(paste("a",1:100010,sep="")))
test(301, nrow(DT[,sum(B),by=C])==100010)
DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep=""))
test(301.1, nrow(DT[,sum(B),by=C])==100010)

# Test := by key, and that := to the key column by key unsets the key. Make it non-trivial in size too.
options(datatable.optimize=0L)
set.seed(1)
DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a")
test(637.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a"))
test(637.2, key(DT[J(43L),a:=99L]), NULL)
setkey(DT,a)
test(637.3, key(DT[,a:=99L,by=a]), NULL)
options(datatable.optimize=2L)
set.seed(1)
DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a")
test(638.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a"))
test(638.2, key(DT[J(43L),a:=99L]), NULL)
setkey(DT,a)
test(638.3, key(DT[,a:=99L,by=a]), NULL)
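# Note (illustrative, not part of this commit): the two blocks above run the same tests at
# datatable.optimize=0L (internal optimizations off) and 2L (the level at which grouped
# sum() is GForce-optimized), so both the optimized and unoptimized paths stay covered.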

# Test X[Y] slowdown, #2216
# Many minutes in 1.8.2! Now well under 1s, but with a very wide 10s tolerance for CRAN. We'd like CRAN to tell us if any changes
# in R or elsewhere cause the 2 minute (!) bug to return. Hence not moving out to benchmark.Rraw.
X = CJ(a=seq_len(1e3),b=seq_len(1e3))
Y = copy(X)
X[4,b:=3L] # create a dup group, to force allLen1=FALSE
setkey(X)
test(819, system.time(X[Y,allow.cartesian=TRUE])["user.self"] < 10) # this system.time usage ok in this case
test(820, system.time(X[Y,mult="first"])["user.self"] < 10) # this system.time usage ok in this case

# test uniqlengths
set.seed(45)
x <- sample(c(NA_integer_, 1:1e4), 1e6, TRUE)
ox <- forderv(x)
o1 <- uniqlist(list(x), ox)
test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x)))
o1 <- uniqlist(list(x))
test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x)))
rm(list=c("x","ox","o1"))
gc()

# Fix for (usually small) memory leak when grouping, #2648.
# Deliberate worst case: largest group (100000 rows) first, followed by a small group (1 row) last.
DT = data.table(A=rep(1:2,c(100000,1)), B=runif(100001))
before = gc()["Vcells",2]
for (i in 1:50) DT[, sum(B), by=A]
after = gc()["Vcells",2]
test(1157, after < before+3) # +3 = 3MB
# Before the patch, Vcells grew dramatically from 6MB to 60MB. Now stable at 6MB. Increase 50 to 1000 and it grew to over 1GB for this case.
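# Note (illustrative, not part of this commit): the before/after pattern used in these
# leak tests reads the "Vcells" row of the matrix gc() returns, i.e. R's vector heap;
# column 2 is its current size in MB (the same column addressed as gc()["Vcells","(Mb)"]
# later in this file), so the delta bounds any growth attributable to the loop.
#   gc()["Vcells", 2]   # current vector-heap usage in MB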

# Similar for when dogroups writes fewer rows than allocated, #2648.
DT = data.table(k = 1:50, g = 1:20, val = rnorm(1e4))
before = gc()["Vcells",2]
for (i in 1:50) DT[ , unlist(.SD), by = 'k']
after = gc()["Vcells",2]
test(1158, after < before+3) # 177.6MB => 179.2MB. Needs to be +3 now from v1.9.8 with alloccol up from 100 to 1024

# fix DT[TRUE, :=] using too much working memory for i, #1249
if (!inherits(try(Rprofmem(NULL), silent=TRUE), "try-error")) { # in case R not compiled with memory profiling enabled
f = tempfile()
N = 1000000 # or any large number of rows
DT = data.table(A=1:N, B=rnorm(N))
DT[TRUE, B := B * 2] # stabilize with initial dummy update
Rprofmem(f)
DT[TRUE, B := B * 2] # or some in-place update
Rprofmem(NULL)
test(1542, length(grep("000",readLines(f, warn=FALSE))), 1L) # one allocation for the RHS only
unlink(f)
}
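# Note (illustrative, not part of this commit): Rprofmem() logs one line per large-vector
# allocation, each beginning with the byte count, which is what the grep("000",...) above
# is counting. A minimal hedged sketch (log format varies by R version):
#   Rprofmem(f2 <- tempfile())
#   x = numeric(1e6)               # one ~8MB vector => one log line beginning "8000048 :"
#   Rprofmem(NULL)
#   readLines(f2, warn=FALSE)[1L]
#   unlink(f2)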

if (FALSE) {
# Full range takes too long for CRAN.
dts = seq(as.Date("0000-03-01"), as.Date("9999-12-31"), by="day")
dtsCh = as.character(dts) # 36s
dtsCh = gsub(" ","0",sprintf("%10s",dtsCh)) # R does not 0 pad years < 1000
test(1739.1, length(dtsCh)==3652365 && identical(dtsCh[c(1,3652365)],c("0000-03-01","9999-12-31")))
} else {
# test on CRAN a reduced but important range
dts = seq(as.Date("1899-12-31"), as.Date("2100-01-01"), by="day")
dtsCh = as.character(dts)
test(1739.2, length(dtsCh)==73051 && identical(dtsCh[c(1,73051)],c("1899-12-31","2100-01-01")))
}
DT = data.table(A=dts, B=as.IDate(dts))
test(1739.3, sapply(DT,typeof), c(A="double",B="integer"))
test(1739.4, typeof(dts), "double")
f = tempfile()
g = tempfile() # Full range
fwrite(DT,f) # 0.092s
write.csv(DT,g,row.names=FALSE,quote=FALSE) # 65.250s
test(1739.5, readLines(f), c("A,B",paste(dtsCh,dtsCh,sep=",")))
test(1739.6, readLines(f), readLines(g))
unlink(f)
unlink(g)
rm(list=c("dtsCh","dts"))
gc()

# catch malformed factor in rbindlist, #3315
set.seed(32940)
NN=7e5; KK=4e4; TT=25
DT = data.table( id = sample(KK, NN, TRUE), tt = sample(TT, NN, TRUE), ff = factor(sample(3, NN, TRUE)) )
test(1978, print(DT[ , diff(ff), by = id]), error="Column 2 of item 1 has type 'factor' but has no levels; i.e. malformed.") # the print invokes rbindlist which bites

# print.data.table row id in non-scientific notation, #1167
DT <- data.table(a = rep(1:5,3*1e5), b = rep(letters[1:3],5*1e5))
test(1549, capture.output(print(DT)), c(" a b", " 1: 1 a", " 2: 2 b", " 3: 3 c", " 4: 4 a", " 5: 5 b", " --- ", "1499996: 1 b", "1499997: 2 c", "1499998: 3 a", "1499999: 4 b", "1500000: 5 c"))
rm(DT)

# Create a file to test a sample jump being skipped due to a format error. It will fail later in the read step because
# this is a real error. We have not yet constructed an error for which nextGoodLine() looks good but in fact is not;
# that would need a very complicated construction of embedded new lines in quoted fields.
# This test size with the default buffMB results in 2 threads being used. 2 is important to pass on CRAN.
DT = as.data.table(CO2)
f = tempfile()
for (i in 0:1000) {
start = nrow(CO2)*i
fwrite(DT[,Plant:=start:(start+nrow(CO2)-1)], f, append=TRUE, col.names=FALSE)
if (i==502) write("-999,Bad,Line,0.0,0.0,extra\n", f, append=TRUE)
}
test(1835, fread(f, verbose=TRUE),
output = "A line with too-many.*jump 50.*jump landed awkwardly.*skipped",
warning = "Stopped.*line 42253. Expected 5 fields but found 6.*discarded.*<<-999,Bad,Line,0.0,0.0,extra>>")
unlink(f)

# test no memory leak, #2191 and #2284
# These take a few seconds each, and it's important to run these on CRAN to check no leak
gc(); before = gc()["Vcells","(Mb)"]
for (i in 1:2000) { DT = data.table(1:3); rm(DT) } # in 1.8.2 would leak 3MB
gc(); after = gc()["Vcells","(Mb)"]
test(861, after < before+0.5) # close to 0.0 difference, but 0.5 for safe margin
gc(); before = gc()["Vcells","(Mb)"]
DF = data.frame(x=1:20, y=runif(20))
for (i in 1:2000) { DT = as.data.table(DF); rm(DT) }
gc(); after = gc()["Vcells","(Mb)"]
test(862, after < before+0.5)
gc(); before = gc()["Vcells","(Mb)"]
DT = data.table(x=1:20, y=runif(20))
for (i in 1:2000) { x <- DT[1:5,]; rm(x) }
gc(); after = gc()["Vcells","(Mb)"]
test(863, after < before+0.5)

# fread should use multiple threads on single column input.
# tests 2 threads; the very reasonable limit on CRAN
# file needs to be reasonably large for threads to kick in (minimum chunkSize is 1MB currently)
if (getDTthreads() == 1L) {
cat("Test 1760 not run because this session either has no OpenMP or has been limited to one thread (e.g. under UBSAN and ASAN)\n")
} else {
N = if (TRUE) 2e6 else 1e9 # offline speed check
fwrite(data.table(A=sample(10,N,replace=TRUE)), f<-tempfile())
test(1760.1, file.info(f)$size > 4*1024*1024)
test(1760.2, fread(f, verbose=TRUE, nThread=2), output="using 2 threads")
unlink(f)
}

# segfault of unprotected var caught with the help of address sanitizer; was test 1509
# in #5517 I figured this test shouldn't be reduced in size due to its nature
set.seed(1)
val = sample(c(1:5, NA), 1e4L, TRUE)
dt <- setDT(replicate(100L, val, simplify=FALSE))
## to ensure there's no segfault...
ans <- melt(dt, measure.vars=names(dt), na.rm=TRUE)
test(1035.21, ans, ans)

# gc race with altrep in R-devel May 2018, #2866 & #2767, PR#2882
# This runs with 2 threads in the test suite on CRAN and AppVeyor etc.
# 2 threads are sufficient to fail before the fix.
N = 20
DF = data.frame(a=rnorm(N),
b=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5]),
c=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5]))
DT = setDT(DF) # setDT required since data.table() would have already expanded the altreps
before = sum(gc()[, 2])
fff = function(aref) {
ff = lapply(1:5, function(i) {
DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]]
})
return(rbindlist(ff))
}
for(i in 1:100) {
f = fff("a")
rm("f")
}
gc() # an extra gc() (i.e. two including the one on the next line) seems to reduce `after`
# from 29.7 to 27.2 (exactly `before`). Keeping the extra gc() since it does no harm.
after = sum(gc()[, 2])
test(1912.1, after < before + 10) # 10MB very wide margin. With the gc race, heap usage grew much more which is all we're testing here (no blow up).
#
before = sum(gc()[, 2])
fff = function(aref) {
DT = setDT(data.frame(a=1:N, b=1:N, c=1:N, d=1:N, e=1:N, f=1:N, g=1:N, h=1:N)) # 1:N creates altrep. A few of them too to tickle (the fixed) race.
lapply(1:5, function(i) {
DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]]
})
}
for(i in 1:100) {
fff("a")
}
gc()
after = sum(gc()[, 2])
test(1912.2, after < before + 10)

DT = data.table(A=seq(1, 1000000), B="x", C=TRUE)
fwrite(DT, f<-tempfile())
test(1815, fread(f, nrows=5), DT[1:5]) #2243: nrows small vs large nrow(DT)

# Better jump sync and run-on in PR#2627
#
# Reproduces error 'did not finish exactly where jump 1 found ...' in #2561 in master before PR #2627.
# The jump point is just before an empty line and nextGoodLine() wasn't sync'd properly.
x = sprintf("ABCDEFGHIJKLMNOPQRST%06d", 1:102184)
x[51094]=""
cat(x, file=f<-tempfile(), sep="\n")
test(1874.1, fread(f,header=FALSE,verbose=TRUE)[c(1,51094,.N),],
data.table(V1=c("ABCDEFGHIJKLMNOPQRST000001","","ABCDEFGHIJKLMNOPQRST102184")),
output="jumps=[0..2)") # ensure jump 1 happened
#
# out-of-sample short lines in the first jump, not near the jump point
x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184)
x[5021:5041] = "small,batch,short,lines" # 4 fields not 5
cat(x, file=f, sep="\n")
test(1874.2, fread(f), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:5020),
warning="Stopped early on line 5021.*<<small,batch,short,lines>>")
test(1874.3, fread(f,fill=TRUE,verbose=TRUE)[c(1,5020,5021,5041,5042,.N),],
data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"),
V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"),
V3=c("KLMN","KLMN","short","short","KLMN","KLMN"),
V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"),
V5=c(1L,5020L,NA,NA,5042L,102184L)),
output="jumps=[0..2)")
#
# jump just before a set of 30 or more too-few lines, to reproduce "No good line could be found" error in #2267
# confirmed fails in master with that error before PR#2627
x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184)
x[51094:51150] = "small,batch,short,lines" # 4 fields not 5
cat(x, file=f, sep="\n")
test(1874.4, fread(f,verbose=TRUE), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:51093),
warning="Stopped early on line 51094.*<<small,batch,short,lines>>",
output="jumps=[0..2)")
test(1874.5, fread(f,fill=TRUE,verbose=TRUE)[c(1,51093,51094,51150,51151,.N),],
data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"),
V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"),
V3=c("KLMN","KLMN","short","short","KLMN","KLMN"),
V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"),
V5=c(1L,51093L,NA,NA,51151L,102184L)),
output="jumps=[0..2)")
#
# jump inside a quoted field containing many new lines, to simulate a dirty jump
# we'll make this jump landing even harder for nextGoodLine() by making the lines resemble the number and types of the true lines, too.
# Rather than needing to make nextGoodLine() better and better (at some point it's impossible), in these rare cases we'll just sweep dirty jumps.
x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184)
x[51093] = "\"A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n\",FGHI,KLMN,PQRS,51093"
cat(x, file=f, sep="\n")
test(1875.6, fread(f,verbose=TRUE)[c(1,51092:51094,.N),][3,V1:=gsub("\r","",V1)], # gsub since R on Windows replaces \n with \r\n
data.table(V1=c("ABCD","ABCD", "A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n", "ABCD","ABCD"),
V2="FGHI", V3="KLMN", V4="PQRS", V5=c(1L,51092:51094,102184L)),
output = "too-few.*sample jump 50.*jump landed awkwardly.*skipped.*Read the data.*jumps=\\[0..2\\).*jumps=\\[1..2\\).*Reading 2 chunks \\(1 swept\\)")
# Aside: although the file (with over 100,000 lines) is big enough for 100 sampling jumps (of which just 1, the middle sample jump, is skipped), it's
# still too small for more than 2 reading chunks to be worthwhile, which is correct (based on buffMB, not nth)
unlink(f)

# chmatchdup test from benchmark at the bottom of chmatch.c
set.seed(45L)
x = sample(letters, 1e5, TRUE)
y = sample(letters, 1e6, TRUE)
test(2000, c(head(ans<-chmatchdup(x,y,0L)),tail(ans)), INT(7,49,11,20,69,25,99365,100750,97596,99671,103320,99406))
rm(list=c("x","y"))
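# Note (illustrative, not part of this commit): chmatchdup()'s contract, per the benchmark
# comment at the bottom of chmatch.c, is that duplicates in x match successive positions
# in y, unlike chmatch() which reuses the first matching position; e.g.
#   data.table:::chmatchdup(c("a","a"), c("b","a","a"), 0L)  # expected 2 3, where chmatch gives 2 2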

# Add nq tests 1641-1652 here with larger sizes and calls that have been turned off in the past as they took too long, and
# restore the exact parameters w.r.t. Jan's comment: https://github.com/Rdatatable/data.table/pull/5520#discussion_r1020180583

# issue 2351
set.seed(1)
DT = data.table(id=paste0("id",1:1e5), v=sample(100,1e5,replace=TRUE))
fwrite(DT, file=f<-tempfile(), eol="\r")
test(1826.1, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id99999","id100000"), v=c(27L,38L,10L,13L)))
cat("id888,42", file=f, append=TRUE) # without final \r after last line
test(1826.2, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id100000","id888"), v=c(27L,38L,13L,42L)))
unlink(f)

# segfault when rbindlist is asked to create a DT with more than 2bn rows
DT = data.table(1:1e6)
L = vector("list", 2148)
for (i in seq_along(L)) L[[i]] = DT # many references to the same DT to avoid actually using large RAM for this test
test(1850, rbindlist(L), error="Total rows in the list is 2148000000 which is larger than the maximum number of rows, currently 2147483647")
rm(L, DT)
gc()
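# Note (illustrative, not part of this commit): 2147483647 in the error above is
# .Machine$integer.max, the 32-bit signed limit on data.table row counts.
#   2148 * 1e6 > .Machine$integer.max   # TRUE: why 2148 copies of a 1e6-row DT overflow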

# segfault in forder when nrow/throttle<nth && ngrp>=255 && nrow>=65536; #5077
# Matt ran these on clang's ASAN+OpenMP which correctly faulted v1.14.0; these tests segfault consistently without ASAN too
set.seed(1)
DT = data.table(grp=sample(255L, 65536L, replace=TRUE)) # >=255 && >=65536 necessary
setDTthreads(throttle=nrow(DT)) # increase throttle to reduce threads to 1 for this nrow
test(2201.1, nrow(DT[, .N, by=grp]), 255L)
test(2201.2, nrow(setkey(DT, grp)), 65536L)
set.seed(1)
DT = data.table(grp=sample(65536L)) # extra case with all size 1 groups too just for fun
test(2201.3, nrow(DT[, .N, by=grp]), 65536L)
test(2201.4, nrow(setkey(DT, grp)), 65536L)
setDTthreads() # restore default throttle
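# Note (illustrative, not part of this commit): the throttle exercised above acts as a
# minimum number of rows per thread, so raising it to nrow(DT) confines a table of that
# size to a single thread, per the comment on tests 2201.1-2201.4.
#   setDTthreads(throttle=65536L)  # illustrative value equal to nrow(DT) above
#   getDTthreads(verbose=TRUE)     # prints the current throttle alongside the thread count
#   setDTthreads()                 # restore defaults, as done above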

# print of DT with many columns reordered them, #3306.
DT = as.data.table(lapply(1:255, function(i)rep.int(i, 105L))) # 105 to be enough for 'top 5 ... bottom 5' to print
out = capture.output(print(DT))
tt = out[grep("V",out)]
tt = unlist(strsplit(gsub(" ","",tt), "V"))
test(1982.1, tt[1L], "")
tt = as.integer(tt[tt!=""])
test(1982.2, tt, seq_along(tt))

# fread leak, #3292
dummy = rep("1\t2\t3\t4\t5", 10000000)
writeLines(dummy, "out.tsv")
@@ -183,3 +499,4 @@ for (i in 1:10) data.table::fread("out.tsv")
end = gc()["Vcells",2]
test(, end/start < 1.05)

