Skip to content

Commit

Permalink
fread(file, nrows=0) file with header does not determine types (#5253)
Browse files Browse the repository at this point in the history
  • Loading branch information
ben-schwen authored Nov 18, 2021
1 parent 45e7da8 commit 96860f2
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 8 deletions.
2 changes: 1 addition & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@
2. `print(DT, trunc.cols=TRUE)` and the corresponding `datatable.print.trunc.cols` option (new feature 3 in v1.13.0) could incorrectly display an extra column, [#4266](https://github.com/Rdatatable/data.table/issues/4266). Thanks to @tdhock for the bug report and @MichaelChirico for the PR.
3. `fread(..., nrows=0L)` now works as intended and the same as `nrows=0`; i.e. returning the column names and typed empty columns determined by the large sample, [#4686](https://github.com/Rdatatable/data.table/issues/4686). Thanks to @hongyuanjia for reporting, and Benjamin Schwendinger for the PR.
3. `fread(..., nrows=0L)` now works as intended and the same as `nrows=0`; i.e. returning the column names and typed empty columns determined by the large sample, [#4686](https://github.com/Rdatatable/data.table/issues/4686), [#4029](https://github.com/Rdatatable/data.table/issues/4029). Thanks to @hongyuanjia and @michaelpaulhirsch for reporting, and Benjamin Schwendinger for the PR.
4. Passing `.SD` to `frankv()` with `ties.method='random'` or with `na.last=NA` failed with `.SD is locked`, [#4429](https://github.com/Rdatatable/data.table/issues/4429). Thanks @smarches for the report.
Expand Down
31 changes: 26 additions & 5 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -13487,13 +13487,34 @@ test(1958.01, fread('\U0001f64d', encoding = 'UTF-16'), error = "Argument 'encod
test(1958.02, fread('a,b\n1,2', nrows = NA_real_), data.table(a = 1L, b = 2L))
test(1958.03, fread('a,b\n1,2', nrows = -1), data.table(a = 1L, b = 2L))
test(1958.04, fread('a,b\n1,2', key = 1), error = 'must be a character vector naming columns')
test(1958.05, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0), data.table(A=logical(), B=logical(), C=logical())) #2747
test(1958.06, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0, sep=','), data.table(A=logical(), B=logical(), C=logical()))
test(1958.05, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0), data.table(A=integer(), B=integer(), C=integer())) #2747
test(1958.06, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0, sep=','), data.table(A=integer(), B=integer(), C=integer()))
test(1958.07, fread('A,B,C,D\n"a,b",4,5,6\n"c,d",6,7\n', fill=TRUE), data.table(A=c("a,b","c,d"), B=INT(4,6), C=INT(5,7), D=INT(6,NA))) # 2547
test(1958.08, fread('A,B,C,D\n"a,b",4,5\n"c,d",6,7,8\n', fill=TRUE), data.table(A=c("a,b","c,d"), B=INT(4,6), C=INT(5,7), D=INT(NA,8)))
# 4686
test(1958.09, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0L), data.table(A=logical(), B=logical(), C=logical()))
test(1958.10, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0L, sep=','), data.table(A=logical(), B=logical(), C=logical()))
test(1958.09, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0L), data.table(A=integer(), B=integer(), C=integer())) # nrows=0 vs 0L, 4686
test(1958.10, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0L, sep=','), data.table(A=integer(), B=integer(), C=integer()))
# nrows=0 should perform a full sample to get the empty column types right as documented, #4029
test(1958.11, fread('A,B,C,D\n1,CHAR,"CHAR",3.1', nrows=0L), data.table(A=integer(), B=character(), C=character(), D=numeric()))
# .. one row of a different type in the middle of a file with fewer than 100 rows
txt = paste(c("A,B\n1,2\n", rep("3,4\n",48), "3,4.1\n", rep("5,6\n",48)), collapse="")
test(1958.12, fread(text=txt, nrows=0L), data.table(A=integer(), B=numeric()))
test(1958.13, fread(text=txt, nrows=0L, skip=1L), data.table(V1=integer(), V2=numeric()))
test(1958.14, fread(text=txt, nrows=1L), data.table(A=1L, B=2L)) # B integer not numeric because sample is min(nrows,100) when nrows>=1
test(1958.15, fread(text=txt, nrows=1L, skip=1L), data.table(V1=1L, V2=2L))
test(1958.16, fread(text=txt, nrows=2L), data.table(A=c(1L,3L), B=c(2L,4L)))
test(1958.17, fread(text=txt, nrows=2L, skip=1L), data.table(V1=c(1L,3L), V2=c(2L,4L)))
# .. one row of a different type following 148 repeated rows when there are just under 200 lines
txt = paste(c("A,B\n1,2\n", rep("3,4\n",148), "3,4.1\n", rep("5,6\n",48)), collapse="")
test(1958.18, fread(text=txt, nrows=0L, verbose=TRUE), data.table(A=integer(), B=numeric()),
output="Sampled 149 rows.*at 2 jump points")
# .. one different type within sample for large number of lines
txt = paste(c("A,B\n1,2\n", rep("3,4\n",5000), "3,4.1\n", rep("5,6\n",5000)), collapse="")
test(1958.19, fread(text=txt, nrows=0L, verbose=TRUE), data.table(A=integer(), B=numeric()),
output="Sampled 1049 rows.*at 11 jump points")
# .. one different type out of sample for large number of lines
txt = paste(c("A,B\n1,2\n", rep("3,4\n",5100), "3,4.1\n", rep("5,6\n",4900)), collapse="")
test(1958.20, fread(text=txt, nrows=0L, verbose=TRUE), data.table(A=integer(), B=integer()),
output="Sampled 1049 rows.*at 11 jump points")

# Skip should work with all types of newlines #3006
eols = c("\n", "\r\n", "\r", "\n\r")
Expand Down
4 changes: 2 additions & 2 deletions src/fread.c
Original file line number Diff line number Diff line change
Expand Up @@ -1579,7 +1579,7 @@ int freadMain(freadMainArgs _args) {
int ncol; // Detected number of columns in the file
const char *firstJumpEnd=NULL; // remember where the winning jumpline from jump 0 ends, to know its size excluding header
const char *prevStart = NULL; // the start of the non-empty line before the first not-ignored row (for warning message later, or taking as column names)
int jumpLines = (int)umin(100,nrowLimit); // how many lines from each jump point to use. If nrowLimit is supplied, nJumps is later set to 1 as well.
int jumpLines = nrowLimit==0 ? 100 : (int)umin(100, nrowLimit); // how many lines from each jump point to use. If nrows>0 is supplied, nJumps is later set to 1. #4029
{
if (verbose) DTPRINT(_("[06] Detect separator, quoting rule, and ncolumns\n"));

Expand Down Expand Up @@ -1812,7 +1812,7 @@ int freadMain(freadMainArgs _args) {
(uint64_t)sz, (uint64_t)jump0size, (uint64_t)(sz/(2*jump0size)));
}
nJumps++; // the extra sample at the very end (up to eof) is sampled and format checked but not jumped to when reading
if (nrowLimit<INT64_MAX) nJumps=1; // when nrowLimit supplied by user, no jumps (not even at the end) and single threaded
if (nrowLimit<INT64_MAX && nrowLimit>0) nJumps=1; // when nrows>0 supplied by user, no jumps (not even at the end) and single threaded

sampleLines = 0;
double sumLen=0.0, sumLenSq=0.0;
Expand Down

0 comments on commit 96860f2

Please sign in to comment.