diff --git a/.travis.yml b/.travis.yml
index ea0a185852..42e740c899 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,6 @@
language: julia
julia:
- - 0.4
- 0.5
- nightly
os:
@@ -15,6 +14,5 @@ script:
- if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
- julia --check-bounds=yes -e 'Pkg.clone(pwd()); Pkg.build("DataFrames"); Pkg.test("DataFrames"; coverage=true)'
after_success:
- - julia -e 'cd(Pkg.dir("DataFrames")); Pkg.clone("https://github.com/MichaelHatherly/Documenter.jl"); include(joinpath("docs", "make.jl"))'
+ - julia -e 'cd(Pkg.dir("DataFrames")); Pkg.add("Documenter"); include(joinpath("docs", "make.jl"))'
- julia -e 'cd(Pkg.dir("DataFrames")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
-
diff --git a/REQUIRE b/REQUIRE
index b20aea22d9..79622049cf 100644
--- a/REQUIRE
+++ b/REQUIRE
@@ -1,6 +1,7 @@
-julia 0.4
-DataArrays 0.3.4
-StatsBase 0.8.3
+julia 0.5
+NullableArrays 0.0.8
+CategoricalArrays 0.0.6
+StatsBase 0.11.0
GZip
SortingAlgorithms
Reexport
diff --git a/appveyor.yml b/appveyor.yml
index cfc1085114..84c37acbda 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,7 +1,5 @@
environment:
matrix:
- - JULIAVERSION: "julialang/bin/winnt/x86/0.4/julia-0.4-latest-win32.exe"
- - JULIAVERSION: "julialang/bin/winnt/x64/0.4/julia-0.4-latest-win64.exe"
- JULIAVERSION: "julialang/bin/winnt/x86/0.5/julia-0.5-latest-win32.exe"
- JULIAVERSION: "julialang/bin/winnt/x64/0.5/julia-0.5-latest-win64.exe"
- JULIAVERSION: "julianightlies/bin/winnt/x86/julia-latest-win32.exe"
diff --git a/benchmark/datamatrix.jl b/benchmark/datamatrix.jl
deleted file mode 100644
index c856a15020..0000000000
--- a/benchmark/datamatrix.jl
+++ /dev/null
@@ -1,37 +0,0 @@
-a = eye(100)
-b = eye(100)
-
-dm_a = data(a)
-dm_b = data(b)
-
-dm_a_na = copy(dm_a)
-dm_a_na[:, :] = NA
-dm_b_na = copy(dm_b)
-dm_b_na[:, :] = NA
-
-f1() = *(a, b)
-f2() = *(dm_a, dm_b)
-f3() = *(dm_a_na, dm_b_na)
-
-df1 = benchmark(f1,
- "Linear Algebra",
- "Matrix Multiplication w/ No NA's",
- 1_000)
-df2 = benchmark(f2,
- "Linear Algebra",
- "DataMatrix Multiplication w/ No NA's",
- 1_000)
-df3 = benchmark(f3,
- "Linear Algebra",
- "DataMatrix Multiplication w/ NA's",
- 1_000)
-
-# TODO: Keep permanent record
-printtable(vcat(df1, df2, df3), header=false)
-
-# Compare with R
-# We're 10x as fast!
-# a <- diag(100)
-# b <- diag(100)
-# a %*% b
-# s <- Sys.time(); a %*% b; e <- Sys.time(); e - s
diff --git a/benchmark/datavector.jl b/benchmark/datavector.jl
deleted file mode 100644
index 1139fa9114..0000000000
--- a/benchmark/datavector.jl
+++ /dev/null
@@ -1,56 +0,0 @@
-srand(1)
-N = 1_000_000
-v = randn(N)
-dv = DataArray(v)
-dvna = deepcopy(dv)
-dvna[rand(1:N, 10_000)] = NA
-idxv = shuffle([1:N])
-idxdv = DataArray(idxv)
-
-f1(v) = sum(v)
-f2(v) = sum(dropna(v))
-f3(v) = sum(dropna(v)) # Make this an iterator
-f4(v) = mean(v)
-f5(v) = mean(dropna(v))
-f6(v) = mean(dropna(v)) # Make this an iterator
-f7(v1, v2) = v1 + v2
-f8(v1, v2) = v1 .> v2
-f9(v, i) = v[i]
-
-perf_test = Dict()
-
-perf_test["sum(v): Vector with no NA's"] = () -> f1(v)
-perf_test["sum(dv): DataVector with no NA's"] = () -> f1(dv)
-perf_test["sum(dropna(dv)): DataVector with no NA's"] = () -> f2(dv)
-perf_test["sum(*dropna(dv)): DataVector with no NA's"] = () -> f3(dv)
-
-perf_test["sum(dvna): DataVector with NA's"] = () -> f4(dv)
-perf_test["sum(dropna(dvna)): DataVector with NA's"] = () -> f5(dv)
-perf_test["sum(*dropna(dvna)): DataVector with NA's"] = () -> f6(dv)
-
-perf_test["mean(v): Vector with no NA's"] = () -> f4(v)
-perf_test["mean(dv): DataVector with no NA's"] = () -> f4(dv)
-perf_test["mean(dropna(dv)): DataVector with no NA's"] = () -> f5(dv)
-perf_test["mean(*dropna(dv)): DataVector with no NA's"] = () -> f6(dv)
-
-perf_test["mean(dvna): DataVector with NA's"] = () -> f4(dv)
-perf_test["mean(dropna(dvna)): DataVector with NA's"] = () -> f5(dv)
-perf_test["mean(*dropna(dvna)): DataVector with NA's"] = () -> f6(dv)
-
-perf_test["v + 1.0 : Vector"] = () -> f7(v, 1.0)
-perf_test["dv + 1.0 : DataVector with no NA's"] = () -> f7(dv, 1.0)
-perf_test["dvna + 1.0 : DataVector with NA's"] = () -> f7(dvna, 1.0)
-
-perf_test["v .> 1.0 : Vector"] = () -> f8(v, 1.0)
-perf_test["dv .> 1.0 : DataVector with no NA's"] = () -> f8(dv, 1.0)
-perf_test["dvna .> 1.0 : DataVector with NA's"] = () -> f8(dvna, 1.0)
-
-perf_test["v[idxv] : Vector"] = () -> f9(v, idxv)
-perf_test["dv[idxv] : DataVector and Vector indexing"] = () -> f9(dv, idxv)
-perf_test["dv[idxdv] : DataVector and DataVector indexing"] = () -> f9(dv, idxdv)
-
-for (name, f) in perf_test
- res = benchmark(f, "DataArray Operations", name, 10)
- # TODO: Keep permanent record
- printtable(res, header=false)
-end
diff --git a/benchmark/results.csv b/benchmark/results.csv
index 803a8f1963..2f828b1374 100644
--- a/benchmark/results.csv
+++ b/benchmark/results.csv
@@ -1,27 +1,4 @@
Category,Benchmark,Iterations,TotalWall,AverageWall,MaxWall,MinWall,Timestamp,JuliaHash,CodeHash,OS,CPUCores
-"DataArray Operations","sum(v): Vector with no NA's",10,0.00857686996459961,0.000857686996459961,0.0008759498596191406,0.0008528232574462891,"2013-01-14 10:20:09","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","mean(dv): DataVector with no NA's",10,0.01034688949584961,0.001034688949584961,0.0015439987182617188,0.0009601116180419922,"2013-01-14 10:20:09","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","mean(removeNA(dvna)): DataVector with NA's",10,0.3172330856323242,0.031723308563232425,0.03600907325744629,0.031102895736694336,"2013-01-14 10:20:10","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","dv .> 1.0 : DataVector with no NA's",10,2.0662119388580322,0.20662119388580322,0.2186141014099121,0.19791007041931152,"2013-01-14 10:20:12","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","dv[idxdv] : DataVector and DataVector indexing",10,1.309283971786499,0.1309283971786499,0.1430819034576416,0.12758612632751465,"2013-01-14 10:20:14","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","mean(v): Vector with no NA's",10,0.008614063262939453,0.0008614063262939453,0.0008738040924072266,0.0008530616760253906,"2013-01-14 10:20:14","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","dv + 1.0 : DataVector with no NA's",10,0.06403565406799316,0.006403565406799316,0.018197059631347656,0.002321958541870117,"2013-01-14 10:20:14","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","v .> 1.0 : Vector",10,0.6535539627075195,0.06535539627075196,0.08094000816345215,0.059654951095581055,"2013-01-14 10:20:15","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","dv[idxv] : DataVector and Vector indexing",10,1.0075325965881348,0.10075325965881347,0.10670304298400879,0.09811711311340332,"2013-01-14 10:20:16","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","sum(removeNA(dv)): DataVector with no NA's",10,0.31229615211486816,0.031229615211486816,0.03513312339782715,0.029542922973632812,"2013-01-14 10:20:16","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","v[idxv] : Vector",10,0.17796993255615234,0.017796993255615234,0.03006291389465332,0.01340794563293457,"2013-01-14 10:20:16","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","dvna .> 1.0 : DataVector with NA's",10,2.047502040863037,0.2047502040863037,0.21759915351867676,0.19756603240966797,"2013-01-14 10:20:19","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","sum(dvna): DataVector with NA's",10,0.009662866592407227,0.0009662866592407227,0.0009889602661132812,0.0009551048278808594,"2013-01-14 10:20:19","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","mean(*removeNA(dv)): DataVector with no NA's",10,0.33266425132751465,0.033266425132751465,0.03953218460083008,0.031510114669799805,"2013-01-14 10:20:19","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","mean(*removeNA(dvna)): DataVector with NA's",10,0.3514890670776367,0.035148906707763675,0.04177212715148926,0.030744075775146484,"2013-01-14 10:20:20","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","dvna + 1.0 : DataVector with NA's",10,0.06465601921081543,0.006465601921081543,0.018489837646484375,0.0023229122161865234,"2013-01-14 10:20:20","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","sum(*removeNA(dvna)): DataVector with NA's",10,0.3297770023345947,0.03297770023345947,0.039092063903808594,0.031161069869995117,"2013-01-14 10:20:20","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","sum(dv): DataVector with no NA's",10,0.009810924530029297,0.0009810924530029297,0.0010459423065185547,0.0009570121765136719,"2013-01-14 10:20:20","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","sum(removeNA(dvna)): DataVector with NA's",10,0.3214747905731201,0.03214747905731201,0.03607583045959473,0.03061199188232422,"2013-01-14 10:20:20","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","mean(removeNA(dv)): DataVector with no NA's",10,0.31479310989379883,0.03147931098937988,0.03677701950073242,0.030520200729370117,"2013-01-14 10:20:21","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","v + 1.0 : Vector",10,0.06811809539794922,0.006811809539794922,0.030359983444213867,0.0016188621520996094,"2013-01-14 10:20:21","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","sum(*removeNA(dv)): DataVector with no NA's",10,0.313723087310791,0.0313723087310791,0.036063194274902344,0.030745983123779297,"2013-01-14 10:20:21","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","mean(dvna): DataVector with NA's",10,0.010206937789916992,0.0010206937789916993,0.001260995864868164,0.0009670257568359375,"2013-01-14 10:20:21","9e0ff15b52","61162cd918","Darwin",4
"Linear Algebra","Matrix Multiplication w/ No NA's",1000,0.1434164047241211,0.0001434164047241211,0.03787708282470703,5.888938903808594e-5,"2013-01-14 10:20:22","9e0ff15b52","61162cd918","Darwin",4
"Linear Algebra","DataMatrix Multiplication w/ No NA's",1000,2.0335049629211426,0.0020335049629211428,0.025124788284301758,0.001116037368774414,"2013-01-14 10:20:24","9e0ff15b52","61162cd918","Darwin",4
"Linear Algebra","DataMatrix Multiplication w/ NA's",1000,4.978086709976196,0.004978086709976196,0.038012027740478516,0.003064870834350586,"2013-01-14 10:20:29","9e0ff15b52","61162cd918","Darwin",4
@@ -40,29 +17,6 @@ Category,Benchmark,Iterations,TotalWall,AverageWall,MaxWall,MinWall,Timestamp,Ju
"DataFrame I/O","space_before_delimiter.csv",10,0.007896661758422852,0.0007896661758422851,0.0009069442749023438,0.0005939006805419922,"2013-01-14 10:21:54","9e0ff15b52","61162cd918","Darwin",4
"DataFrame I/O","types.csv",10,0.01001596450805664,0.001001596450805664,0.0011119842529296875,0.0007750988006591797,"2013-01-14 10:21:54","9e0ff15b52","61162cd918","Darwin",4
"DataFrame I/O","utf8.csv",10,0.007441043853759766,0.0007441043853759766,0.0008280277252197266,0.0007090568542480469,"2013-01-14 10:21:54","9e0ff15b52","61162cd918","Darwin",4
-"DataArray Operations","sum(v): Vector with no NA's",10,0.009074211120605469,0.0009074211120605469,0.0009720325469970703,0.0008630752563476562,"2013-01-14 10:44:57","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","mean(dv): DataVector with no NA's",10,0.00978994369506836,0.000978994369506836,0.0010378360748291016,0.0009520053863525391,"2013-01-14 10:44:57","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","mean(removeNA(dvna)): DataVector with NA's",10,0.370542049407959,0.0370542049407959,0.07280802726745605,0.032627105712890625,"2013-01-14 10:44:58","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","dv .> 1.0 : DataVector with no NA's",10,2.0874040126800537,0.20874040126800536,0.22231101989746094,0.2008979320526123,"2013-01-14 10:45:00","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","dv[idxdv] : DataVector and DataVector indexing",10,1.3245141506195068,0.13245141506195068,0.13637208938598633,0.12865090370178223,"2013-01-14 10:45:02","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","mean(v): Vector with no NA's",10,0.010338783264160156,0.0010338783264160155,0.001631021499633789,0.0008609294891357422,"2013-01-14 10:45:02","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","dv + 1.0 : DataVector with no NA's",10,0.06729602813720703,0.006729602813720703,0.019953012466430664,0.0023021697998046875,"2013-01-14 10:45:02","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","v .> 1.0 : Vector",10,0.6516821384429932,0.06516821384429931,0.07804393768310547,0.059031009674072266,"2013-01-14 10:45:03","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","dv[idxv] : DataVector and Vector indexing",10,1.0018041133880615,0.10018041133880615,0.10875797271728516,0.09689211845397949,"2013-01-14 10:45:04","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","sum(removeNA(dv)): DataVector with no NA's",10,0.3324899673461914,0.03324899673461914,0.03690004348754883,0.029747962951660156,"2013-01-14 10:45:04","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","v[idxv] : Vector",10,0.21311020851135254,0.021311020851135253,0.03587007522583008,0.01586008071899414,"2013-01-14 10:45:05","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","dvna .> 1.0 : DataVector with NA's",10,2.057245969772339,0.2057245969772339,0.21527600288391113,0.19754600524902344,"2013-01-14 10:45:07","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","sum(dvna): DataVector with NA's",10,0.010241031646728516,0.0010241031646728515,0.0012030601501464844,0.0009589195251464844,"2013-01-14 10:45:07","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","mean(*removeNA(dv)): DataVector with no NA's",10,0.32787513732910156,0.032787513732910153,0.03671598434448242,0.03182506561279297,"2013-01-14 10:45:07","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","mean(*removeNA(dvna)): DataVector with NA's",10,0.3225698471069336,0.03225698471069336,0.03831791877746582,0.03119802474975586,"2013-01-14 10:45:08","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","dvna + 1.0 : DataVector with NA's",10,0.0656731128692627,0.006567311286926269,0.0189821720123291,0.002315044403076172,"2013-01-14 10:45:08","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","sum(*removeNA(dvna)): DataVector with NA's",10,0.33132195472717285,0.033132195472717285,0.04097390174865723,0.031419992446899414,"2013-01-14 10:45:08","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","sum(dv): DataVector with no NA's",10,0.009993314743041992,0.0009993314743041992,0.0010750293731689453,0.0009589195251464844,"2013-01-14 10:45:08","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","sum(removeNA(dvna)): DataVector with NA's",10,0.32796382904052734,0.032796382904052734,0.03624391555786133,0.030983924865722656,"2013-01-14 10:45:09","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","mean(removeNA(dv)): DataVector with no NA's",10,0.3377056121826172,0.03377056121826172,0.03966093063354492,0.03125596046447754,"2013-01-14 10:45:09","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","v + 1.0 : Vector",10,0.06936144828796387,0.0069361448287963865,0.030318021774291992,0.0016639232635498047,"2013-01-14 10:45:09","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","sum(*removeNA(dv)): DataVector with no NA's",10,0.325742244720459,0.0325742244720459,0.035830020904541016,0.031311988830566406,"2013-01-14 10:45:09","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","mean(dvna): DataVector with NA's",10,0.009889841079711914,0.0009889841079711915,0.0011150836944580078,0.0009551048278808594,"2013-01-14 10:45:09","11f365ef08","319eab675d","Darwin",4
"Linear Algebra","Matrix Multiplication w/ No NA's",1000,0.16434788703918457,0.00016434788703918458,0.03700113296508789,5.888938903808594e-5,"2013-01-14 10:45:10","11f365ef08","319eab675d","Darwin",4
"Linear Algebra","DataMatrix Multiplication w/ No NA's",1000,2.043034553527832,0.002043034553527832,0.02640819549560547,0.0011169910430908203,"2013-01-14 10:45:12","11f365ef08","319eab675d","Darwin",4
"Linear Algebra","DataMatrix Multiplication w/ NA's",1000,5.303671598434448,0.005303671598434448,0.04494500160217285,0.0030400753021240234,"2013-01-14 10:45:17","11f365ef08","319eab675d","Darwin",4
@@ -81,29 +35,6 @@ Category,Benchmark,Iterations,TotalWall,AverageWall,MaxWall,MinWall,Timestamp,Ju
"DataFrame I/O","space_before_delimiter.csv",10,0.005663871765136719,0.0005663871765136719,0.0005869865417480469,0.0005459785461425781,"2013-01-14 10:46:44","11f365ef08","319eab675d","Darwin",4
"DataFrame I/O","types.csv",10,0.006924867630004883,0.0006924867630004883,0.0007150173187255859,0.0006740093231201172,"2013-01-14 10:46:44","11f365ef08","319eab675d","Darwin",4
"DataFrame I/O","utf8.csv",10,0.007310152053833008,0.0007310152053833008,0.0008001327514648438,0.0007109642028808594,"2013-01-14 10:46:44","11f365ef08","319eab675d","Darwin",4
-"DataArray Operations","sum(v): Vector with no NA's",10,0.008659124374389648,0.0008659124374389649,0.0009109973907470703,0.0008530616760253906,"2013-01-14 21:09:54","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","mean(dv): DataVector with no NA's",10,0.010535955429077148,0.001053595542907715,0.0016469955444335938,0.0009620189666748047,"2013-01-14 21:09:55","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","mean(removeNA(dvna)): DataVector with NA's",10,0.31494879722595215,0.031494879722595216,0.036512136459350586,0.0302579402923584,"2013-01-14 21:09:55","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","dv .> 1.0 : DataVector with no NA's",10,2.0206620693206787,0.20206620693206787,0.21630287170410156,0.19572210311889648,"2013-01-14 21:09:57","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","dv[idxdv] : DataVector and DataVector indexing",10,1.2879621982574463,0.12879621982574463,0.13094496726989746,0.1274411678314209,"2013-01-14 21:09:59","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","mean(v): Vector with no NA's",10,0.010784149169921875,0.0010784149169921875,0.0013751983642578125,0.0008599758148193359,"2013-01-14 21:09:59","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","dv + 1.0 : DataVector with no NA's",10,0.06326985359191895,0.006326985359191894,0.018002986907958984,0.002331972122192383,"2013-01-14 21:09:59","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","v .> 1.0 : Vector",10,0.6304340362548828,0.06304340362548828,0.07402396202087402,0.058135032653808594,"2013-01-14 21:10:00","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","dv[idxv] : DataVector and Vector indexing",10,0.9911110401153564,0.09911110401153564,0.10813021659851074,0.09464001655578613,"2013-01-14 21:10:01","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","sum(removeNA(dv)): DataVector with no NA's",10,0.323793888092041,0.0323793888092041,0.03618288040161133,0.03157186508178711,"2013-01-14 21:10:01","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","v[idxv] : Vector",10,0.1790471076965332,0.01790471076965332,0.030202150344848633,0.013410091400146484,"2013-01-14 21:10:02","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","dvna .> 1.0 : DataVector with NA's",10,2.020591974258423,0.20205919742584227,0.21394085884094238,0.19475698471069336,"2013-01-14 21:10:04","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","sum(dvna): DataVector with NA's",10,0.009839057922363281,0.0009839057922363282,0.0010139942169189453,0.000965118408203125,"2013-01-14 21:10:04","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","mean(*removeNA(dv)): DataVector with no NA's",10,0.3176560401916504,0.03176560401916504,0.036063194274902344,0.02964496612548828,"2013-01-14 21:10:04","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","mean(*removeNA(dvna)): DataVector with NA's",10,0.3231468200683594,0.03231468200683594,0.03954195976257324,0.029547929763793945,"2013-01-14 21:10:05","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","dvna + 1.0 : DataVector with NA's",10,0.06484484672546387,0.006484484672546387,0.018603086471557617,0.002290964126586914,"2013-01-14 21:10:05","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","sum(*removeNA(dvna)): DataVector with NA's",10,0.32024693489074707,0.032024693489074704,0.036015987396240234,0.0309140682220459,"2013-01-14 21:10:05","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","sum(dv): DataVector with no NA's",10,0.009638071060180664,0.0009638071060180664,0.0009829998016357422,0.0009531974792480469,"2013-01-14 21:10:05","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","sum(removeNA(dvna)): DataVector with NA's",10,0.3199589252471924,0.03199589252471924,0.03625988960266113,0.030423879623413086,"2013-01-14 21:10:06","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","mean(removeNA(dv)): DataVector with no NA's",10,0.31856274604797363,0.03185627460479736,0.03632402420043945,0.031010150909423828,"2013-01-14 21:10:06","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","v + 1.0 : Vector",10,0.07034850120544434,0.007034850120544433,0.030943870544433594,0.0016410350799560547,"2013-01-14 21:10:06","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","sum(*removeNA(dv)): DataVector with no NA's",10,0.3294239044189453,0.03294239044189453,0.03928399085998535,0.030253887176513672,"2013-01-14 21:10:06","3549f803f9","3b67c77708","Darwin",4
-"DataArray Operations","mean(dvna): DataVector with NA's",10,0.010222911834716797,0.0010222911834716797,0.0012698173522949219,0.0009789466857910156,"2013-01-14 21:10:06","3549f803f9","3b67c77708","Darwin",4
"Linear Algebra","Matrix Multiplication w/ No NA's",1000,0.1328754425048828,0.00013287544250488282,0.011707067489624023,5.888938903808594e-5,"2013-01-14 21:10:07","3549f803f9","3b67c77708","Darwin",4
"Linear Algebra","DataMatrix Multiplication w/ No NA's",1000,1.9026072025299072,0.0019026072025299073,0.026885986328125,0.0011301040649414062,"2013-01-14 21:10:09","3549f803f9","3b67c77708","Darwin",4
"Linear Algebra","DataMatrix Multiplication w/ NA's",1000,4.693065643310547,0.004693065643310547,0.034111976623535156,0.0031609535217285156,"2013-01-14 21:10:14","3549f803f9","3b67c77708","Darwin",4
diff --git a/benchmark/runbenchmarks.jl b/benchmark/runbenchmarks.jl
index fa9859b406..adbcef595e 100644
--- a/benchmark/runbenchmarks.jl
+++ b/benchmark/runbenchmarks.jl
@@ -5,9 +5,7 @@
using DataFrames
using Benchmark
-benchmarks = ["datavector.jl",
- "datamatrix.jl",
- "io.jl"]
+benchmarks = ["io.jl"]
# TODO: Print summary to stdout_stream, while printing results
# to file with appends.
diff --git a/docs/make.jl b/docs/make.jl
index cfd5a2df8a..467ceb2f0f 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,4 +1,4 @@
-using Documenter, DataFrames, DataArrays
+using Documenter, DataFrames
# Build documentation.
# ====================
@@ -6,18 +6,41 @@ using Documenter, DataFrames, DataArrays
makedocs(
# options
modules = [DataFrames],
- doctest = false,
- clean = false
+ doctest = true,
+ clean = false,
+ sitename = "DataFrames.jl",
+ format = Documenter.Formats.HTML,
+ pages = Any[
+ "Introduction" => "index.md",
+ "User Guide" => Any[
+ "Getting Started" => "man/getting_started.md",
+ "IO" => "man/io.md",
+ "Joins" => "man/joins.md",
+ "Split-apply-combine" => "man/split_apply_combine.md",
+ "Reshaping" => "man/reshaping_and_pivoting.md",
+ "Sorting" => "man/sorting.md",
+ "Formulas" => "man/formulas.md",
+ "Pooling" => "man/pooling.md",
+ ],
+ "API" => Any[
+ "Main types" => "lib/maintypes.md",
+ "Utilities" => "lib/utilities.md",
+ "Data manipulation" => "lib/manipulation.md",
+ ],
+ "About" => Any[
+ "Release Notes" => "NEWS.md",
+ "License" => "LICENSE.md",
+ ]
+ ]
)
# Deploy built documentation from Travis.
# =======================================
-# Needs to install an additional dep, mkdocs-material, so provide a custom `deps`.
-custom_deps() = run(`pip install --user pygments mkdocs mkdocs-material`)
-
deploydocs(
# options
- deps = custom_deps,
- repo = "github.com/JuliaStats/DataFrames.jl.git"
+ repo = "github.com/JuliaStats/DataFrames.jl.git",
+ target = "build",
+ deps = nothing,
+ make = nothing,
)
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
deleted file mode 100644
index ea5c0f8695..0000000000
--- a/docs/mkdocs.yml
+++ /dev/null
@@ -1,43 +0,0 @@
-
-site_name: DataFrames.jl
-site_description: package for working with tabular data in Julia
-repo_url: https://github.com/JuliaStats/DataFrames.jl
-
-theme: material
-
-extra:
- palette:
- primary: 'indigo'
- accent: 'blue'
-
-extra_css:
- - assets/Documenter.css
-
-markdown_extensions:
- - codehilite
- - extra
- - tables
- - fenced_code
-
-extra_javascript:
- - https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML
- - assets/mathjaxhelper.js
-
-docs_dir: 'build'
-
-pages:
-- [index.md, Introduction]
-- [man/getting_started.md, User guide, Getting Started]
-- [man/io.md, User guide, IO]
-- [man/joins.md, User guide, Joins]
-- [man/split_apply_combine.md, User guide, Split-apply-combine]
-- [man/reshaping_and_pivoting.md, User guide, Reshaping]
-- [man/sorting.md, User guide, Sorting]
-- [man/formulas.md, User guide, Formulas]
-- [man/pooling.md, User guide, Pooling]
-- [lib/maintypes.md, API, Main types]
-- [lib/utilities.md, API, Utilities]
-- [lib/manipulation.md, API, Data manipulation]
-- [NEWS.md, About, Release notes]
-- [LICENSE.md, About, License]
-
diff --git a/docs/src/index.md b/docs/src/index.md
index 9a08a33548..7943a0597d 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -2,17 +2,20 @@
## Package Manual
- {contents}
- Pages = ["man/getting_started.md", "man/io.md", "man/joins.md", "man/split_apply_combine.md", "man/reshaping_and_pivoting.md", "man/sorting.md", "man/formulas.md", "man/pooling.md"]
- Depth = 2
+```@contents
+Pages = ["man/getting_started.md", "man/io.md", "man/joins.md", "man/split_apply_combine.md", "man/reshaping_and_pivoting.md", "man/sorting.md", "man/formulas.md", "man/pooling.md"]
+Depth = 2
+```
## API
- {contents}
- Pages = ["lib/maintypes.md", "lib/manipulation.md", "lib/utilities.md"]
- Depth = 2
+```@contents
+Pages = ["lib/maintypes.md", "lib/manipulation.md", "lib/utilities.md"]
+Depth = 2
+```
## Documentation Index
- {index}
- Pages = ["lib/maintypes.md", "lib/manipulation.md", "lib/utilities.md", "man/io.md"]
+```@index
+Pages = ["lib/maintypes.md", "lib/manipulation.md", "lib/utilities.md", "man/io.md"]
+```
diff --git a/docs/src/lib/maintypes.md b/docs/src/lib/maintypes.md
index 8b8f300283..ccc62d530c 100644
--- a/docs/src/lib/maintypes.md
+++ b/docs/src/lib/maintypes.md
@@ -1,16 +1,16 @@
- {meta}
- CurrentModule = DataFrames
+```@meta
+CurrentModule = DataFrames
+```
# Main Types
- {index}
- Pages = ["maintypes.md"]
-
-...
-
- {docs}
- AbstractDataFrame
- DataFrame
- SubDataFrame
+```@index
+Pages = ["maintypes.md"]
+```
+```@docs
+AbstractDataFrame
+DataFrame
+SubDataFrame
+```
diff --git a/docs/src/lib/manipulation.md b/docs/src/lib/manipulation.md
index dae992c48b..1f9f578d25 100644
--- a/docs/src/lib/manipulation.md
+++ b/docs/src/lib/manipulation.md
@@ -1,22 +1,25 @@
-
- {meta}
- CurrentModule = DataFrames
+```@meta
+CurrentModule = DataFrames
+```
# Data Manipulation
-
- {index}
- Pages = ["manipulation.md"]
+
+```@index
+Pages = ["manipulation.md"]
+```
## Joins
- {docs}
- join
-
+```@docs
+join
+```
+
## Reshaping
- {docs}
- melt
- stack
- unstack
- stackdf
- meltdf
+```@docs
+melt
+stack
+unstack
+stackdf
+meltdf
+```
diff --git a/docs/src/lib/utilities.md b/docs/src/lib/utilities.md
index 6b2d8d2564..23c9c76d65 100644
--- a/docs/src/lib/utilities.md
+++ b/docs/src/lib/utilities.md
@@ -1,27 +1,25 @@
-
- {meta}
- CurrentModule = DataFrames
+```@meta
+CurrentModule = DataFrames
+```
# Utilities
- {index}
- Pages = ["utilities.md"]
+```@index
+Pages = ["utilities.md"]
+```
-...
-
- {docs}
- eltypes
- head
- complete_cases
- complete_cases!
- describe
- dump
- names!
- nonunique
- rename
- rename!
- tail
- unique
- unique!
-
-
+```@docs
+eltypes
+head
+complete_cases
+complete_cases!
+describe
+dump
+names!
+nonunique
+rename
+rename!
+tail
+unique
+unique!
+```
diff --git a/docs/src/man/formulas.md b/docs/src/man/formulas.md
index e9203439e8..c283a696a8 100644
--- a/docs/src/man/formulas.md
+++ b/docs/src/man/formulas.md
@@ -33,7 +33,7 @@ If you would like to specify both main effects and an interaction term at once,
mm = ModelMatrix(ModelFrame(Z ~ X*Y, df))
```
-You can control how categorical variables (e.g., `PooledDataArray` columns) are converted to `ModelMatrix` columns by specifying _contrasts_ when you construct a `ModelFrame`:
+You can control how categorical variables (e.g., `CategoricalArray` columns) are converted to `ModelMatrix` columns by specifying _contrasts_ when you construct a `ModelFrame`:
```julia
mm = ModelMatrix(ModelFrame(Z ~ X*Y, df, contrasts = Dict(:X => HelmertCoding())))
@@ -47,4 +47,3 @@ contrasts!(mf, X = HelmertCoding())
```
The construction of model matrices makes it easy to formulate complex statistical models. These are used to good effect by the [GLM Package.](https://github.com/JuliaStats/GLM.jl)
-
diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md
index 89a2f7e101..b4fc4e1a17 100644
--- a/docs/src/man/getting_started.md
+++ b/docs/src/man/getting_started.md
@@ -2,75 +2,75 @@
## Installation
-The DataFrames package is available through the Julia package system. Throughout the rest of this tutorial, we will assume that you have installed the DataFrames package and have already typed `using DataArrays, DataFrames` to bring all of the relevant variables into your current namespace. In addition, we will make use of the `RDatasets` package, which provides access to hundreds of classical data sets.
+The DataFrames package is available through the Julia package system. Throughout the rest of this tutorial, we will assume that you have installed the DataFrames package and have already typed `using NullableArrays, DataFrames` to bring all of the relevant variables into your current namespace. In addition, we will make use of the `RDatasets` package, which provides access to hundreds of classical data sets.
-## The `NA` Value
+## The `Nullable` Type
-To get started, let's examine the `NA` value. Type the following into the REPL:
+To get started, let's examine the `Nullable` type. Objects of this type can either hold a value, or represent a missing value (`null`). For example, this is a `Nullable` holding the integer `1`:
```julia
-NA
+Nullable(1)
```
-One of the essential properties of `NA` is that it poisons other items. To see this, try to add something like `1` to `NA`:
-
+And this represents a missing value:
```julia
-1 + NA
+Nullable()
```
-## The `DataArray` Type
-
-Now that we see that `NA` is working, let's insert one into a `DataArray`. We'll create one now using the `@data` macro:
+`Nullable` objects support all standard operators, which return another `Nullable`. One of the essential properties of `null` values is that they poison other items. To see this, try to add something like `Nullable(1)` to `Nullable()`:
```julia
-dv = @data([NA, 3, 2, 5, 4])
+Nullable(1) + Nullable()
```
-To see how `NA` poisons even complex calculations, let's try to take the mean of the five numbers stored in `dv`:
+Note that operations mixing `Nullable` and scalars (e.g. `1 + Nullable()`) are not supported.
+
+## The `NullableArray` Type
+
+`Nullable` objects can be stored in a standard `Array` just like any value:
```julia
-mean(dv)
+v = Nullable{Int}[1, 3, 4, 5, 4]
```
-In many cases we're willing to just ignore `NA` values and remove them from our vector. We can do that using the `dropna` function:
+But arrays of `Nullable` are inefficient, both in terms of computation costs and of memory use. `NullableArrays` provide a more efficient storage, and behave like `Array{Nullable}` objects.
```julia
-dropna(dv)
-mean(dropna(dv))
+nv = NullableArray(Nullable{Int}[Nullable(), 3, 2, 5, 4])
```
-Instead of removing `NA` values, you can try to conver the `DataArray` into a normal Julia `Array` using `convert`:
+In many cases we're willing to just ignore missing values and remove them from our vector. We can do that using the `dropnull` function:
```julia
-convert(Array, dv)
+dropnull(nv)
+mean(dropnull(nv))
```
-This fails in the presence of `NA` values, but will succeed if there are no `NA` values:
+Instead of removing `null` values, you can try to convert the `NullableArray` into a normal Julia `Array` using `convert`:
```julia
-dv[1] = 3
-convert(Array, dv)
+convert(Array, nv)
```
-In addition to removing `NA` values and hoping they won't occur, you can also replace any `NA` values using the `convert` function, which takes a replacement value as an argument:
+This fails in the presence of `null` values, but will succeed if there are no `null` values:
```julia
-dv = @data([NA, 3, 2, 5, 4])
-mean(convert(Array, dv, 11))
+nv[1] = 3
+convert(Array, nv)
```
-Which strategy for dealing with `NA` values is most appropriate will typically depend on the specific details of your data analysis pathway.
-
-Although the examples above employed only 1D `DataArray` objects, the `DataArray` type defines a completely generic N-dimensional array type. Operations on generic `DataArray` objects work in higher dimensions in the same way that they work on Julia's Base `Array` type:
+In addition to removing `null` values and hoping they won't occur, you can also replace any `null` values using the `convert` function, which takes a replacement value as an argument:
```julia
-dm = @data([NA 0.0; 0.0 1.0])
-dm * dm
+nv = NullableArray(Nullable{Int}[Nullable(), 3, 2, 5, 4])
+mean(convert(Array, nv, 0))
```
+Which strategy for dealing with `null` values is most appropriate will typically depend on the specific details of your data analysis pathway.
+
## The `DataFrame` Type
-The `DataFrame` type can be used to represent data tables, each column of which is a `DataArray`. You can specify the columns using keyword arguments:
+The `DataFrame` type can be used to represent data tables, each column of which is an array (by default, a `NullableArray`). You can specify the columns using keyword arguments:
```julia
df = DataFrame(A = 1:4, B = ["M", "F", "F", "M"])
@@ -110,22 +110,22 @@ describe(df)
To focus our search, we start looking at just the means and medians of specific columns. In the example below, we use numeric indexing to access the columns of the `DataFrame`:
```julia
-mean(df[1])
-median(df[1])
+mean(dropnull(df[1]))
+median(dropnull(df[1]))
```
We could also have used column names to access individual columns:
```julia
-mean(df[:A])
-median(df[:A])
+mean(dropnull(df[:A]))
+median(dropnull(df[:A]))
```
We can also apply a function to each column of a `DataFrame` with the `colwise` function. For example:
```julia
df = DataFrame(A = 1:4, B = randn(4))
-colwise(cumsum, df)
+colwise(c->cumsum(dropnull(c)), df)
```
## Accessing Classic Data Sets
@@ -135,10 +135,15 @@ To see more of the functionality for working with `DataFrame` objects, we need a
For example, we can access Fisher's iris data set using the following functions:
```julia
-using RDatasets
-iris = dataset("datasets", "iris")
+iris = readtable(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv"))
head(iris)
```
In the next section, we'll discuss generic I/O strategy for reading and writing `DataFrame` objects that you can use to import and export your own data files.
+## Querying DataFrames
+
+While the `DataFrames` package provides basic data manipulation capabilities, users are encouraged to use the following packages for more powerful and complete data querying functionality in the spirit of [dplyr](https://github.com/hadley/dplyr) and [LINQ](https://msdn.microsoft.com/en-us/library/bb397926.aspx):
+
+- [DataFramesMeta.jl](https://github.com/JuliaStats/DataFramesMeta.jl) provides metaprogramming tools for `DataFrames` and associative objects. These macros improve performance and provide more convenient syntax.
+- [Query.jl](https://github.com/davidanthoff/Query.jl) provides a LINQ like interface to a large number of data sources, including `DataFrame` instances.
diff --git a/docs/src/man/io.md b/docs/src/man/io.md
index fdb0869355..0ebcc94d60 100644
--- a/docs/src/man/io.md
+++ b/docs/src/man/io.md
@@ -4,8 +4,9 @@
To read data from a CSV-like file, use the `readtable` function:
- {docs}
- readtable
+```@docs
+readtable
+```
`readtable` requires that you specify the path of the file that you would like to read as a `String`. To read data from a non-file source, you may also supply an `IO` object. It supports many additional keyword arguments: these are documented in the section on advanced I/O operations.
@@ -13,8 +14,9 @@ To read data from a CSV-like file, use the `readtable` function:
To write data to a CSV file, use the `writetable` function:
- {docs}
- writetable
+```@docs
+writetable
+```
## Supplying `DataFrame`s inline with non-standard string literals
diff --git a/docs/src/man/joins.md b/docs/src/man/joins.md
index 558bf317e3..c152ee9fa3 100644
--- a/docs/src/man/joins.md
+++ b/docs/src/man/joins.md
@@ -15,7 +15,7 @@ full = join(names, jobs, on = :ID)
Output:
-| Row | ID | Name | Job |
+| Row | ID | Name | Job |
|-----|----|------------|----------|
| 1 | 1 | "John Doe" | "Lawyer" |
| 2 | 1 | "Jane Doe" | "Doctor" |
diff --git a/docs/src/man/pooling.md b/docs/src/man/pooling.md
index 17757d8526..fcffaaba29 100644
--- a/docs/src/man/pooling.md
+++ b/docs/src/man/pooling.md
@@ -1,44 +1,49 @@
-# Pooling Data (Representing Factors)
+# Categorical Data
Often, we have to deal with factors that take on a small number of levels:
```julia
-dv = @data(["Group A", "Group A", "Group A",
- "Group B", "Group B", "Group B"])
+v = ["Group A", "Group A", "Group A",
+ "Group B", "Group B", "Group B"]
```
-The naive encoding used in a `DataArray` represents every entry of this vector as a full string. In contrast, we can represent the data more efficiently by replacing the strings with indices into a small pool of levels. This is what the `PooledDataArray` does:
+The naive encoding used in an `Array` or in a `NullableArray` represents every entry of this vector as a full string. In contrast, we can represent the data more efficiently by replacing the strings with indices into a small pool of levels. This is what the `CategoricalArray` type does:
```julia
-pdv = @pdata(["Group A", "Group A", "Group A",
- "Group B", "Group B", "Group B"])
+cv = CategoricalArray(["Group A", "Group A", "Group A",
+ "Group B", "Group B", "Group B"])
```
-In addition to representing repeated data efficiently, the `PooledDataArray` allows us to determine the levels of the factor at any time using the `levels` function:
+A companion type, `NullableCategoricalArray`, allows storing missing values in the array: it is to `CategoricalArray` what `NullableArray` is to the standard `Array` type.
+
+In addition to representing repeated data efficiently, the `CategoricalArray` type allows us to efficiently determine the allowed levels of the variable at any time using the `levels` function (note that levels may or may not actually be used in the data):
```julia
-levels(pdv)
+levels(cv)
```
-By default, a `PooledDataArray` is able to represent 232differents levels. You can use less memory by calling the `compact` function:
+The `levels!` function also allows changing the order of appearance of the levels, which can be useful for display purposes or when working with ordered variables.
+
+By default, a `CategoricalArray` is able to represent 2^32 different levels. You can use less memory by calling the `compact` function:
```julia
-pdv = compact(pdv)
+cv = compact(cv)
```
-Often, you will have factors encoded inside a DataFrame with `DataArray` columns instead of `PooledDataArray` columns. You can do conversion of a single column using the `pool` function:
+Often, you will have factors encoded inside a DataFrame with `Array` or `NullableArray` columns instead of `CategoricalArray` or `NullableCategoricalArray` columns. You can do conversion of a single column using the `categorical` function:
```julia
-pdv = pool(dv)
+cv = categorical(v)
```
-Or you can edit the columns of a `DataFrame` in-place using the `pool!` function:
+Or you can edit the columns of a `DataFrame` in-place using the `categorical!` function:
```julia
df = DataFrame(A = [1, 1, 1, 2, 2, 2],
B = ["X", "X", "X", "Y", "Y", "Y"])
-pool!(df, [:A, :B])
+categorical!(df, [:A, :B])
```
-Pooling columns is important for working with the [GLM package](https://github.com/JuliaStats/GLM.jl) When fitting regression models, `PooledDataArray` columns in the input are translated into 0/1 indicator columns in the `ModelMatrix` with one column for each of the levels of the `PooledDataArray`. This allows one to analyze categorical data efficiently.
+Using categorical arrays is important for working with the [GLM package](https://github.com/JuliaStats/GLM.jl). When fitting regression models, `CategoricalArray` and `NullableCategoricalArray` columns in the input are translated into 0/1 indicator columns in the `ModelMatrix` with one column for each of the levels of the `CategoricalArray`/`NullableCategoricalArray`. This allows one to analyze categorical data efficiently.
+See the [CategoricalArrays package](https://github.com/nalimilan/CategoricalArrays.jl) for more information regarding categorical arrays.
diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md
index 959dd00bfb..dcd02c70d9 100644
--- a/docs/src/man/reshaping_and_pivoting.md
+++ b/docs/src/man/reshaping_and_pivoting.md
@@ -3,10 +3,10 @@
Reshape data from wide to long format using the `stack` function:
```julia
-using DataFrames, RDatasets
-iris = dataset("datasets", "iris")
+using DataFrames
+iris = readtable(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv"))
iris[:id] = 1:size(iris, 1) # this makes it easier to unstack
-d = stack(iris, [1:4])
+d = stack(iris, 1:4)
```
The second optional argument to `stack` indicates the columns to be stacked. These are normally referred to as the measured variables. Column names can also be given:
@@ -79,6 +79,6 @@ None of these reshaping functions perform any aggregation. To do aggregation, us
```julia
d = stack(iris)
-x = by(d, [:variable, :Species], df -> DataFrame(vsum = mean(df[:value])))
+x = by(d, [:variable, :Species], df -> DataFrame(vsum = mean(dropnull(df[:value]))))
unstack(x, :Species, :vsum)
```
diff --git a/docs/src/man/sorting.md b/docs/src/man/sorting.md
index 4d2140a258..14224a8b55 100644
--- a/docs/src/man/sorting.md
+++ b/docs/src/man/sorting.md
@@ -3,9 +3,8 @@
Sorting is a fundamental component of data analysis. Basic sorting is trivial: just calling `sort!` will sort all columns, in place:
```julia
-using DataFrames, RDatasets
-
-iris = dataset("datasets", "iris")
+using DataFrames
+iris = readtable(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv"))
sort!(iris)
```
diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md
index 912279f3ac..8caa9b1b1e 100644
--- a/docs/src/man/split_apply_combine.md
+++ b/docs/src/man/split_apply_combine.md
@@ -7,12 +7,11 @@ The DataFrames package supports the Split-Apply-Combine strategy through the `by
We show several examples of the `by` function applied to the `iris` dataset below:
```julia
-using DataFrames, RDatasets
-
-iris = dataset("datasets", "iris")
+using DataFrames
+iris = readtable(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv"))
by(iris, :Species, size)
-by(iris, :Species, df -> mean(df[:PetalLength]))
+by(iris, :Species, df -> mean(dropnull(df[:PetalLength])))
by(iris, :Species, df -> DataFrame(N = size(df, 1)))
```
@@ -20,7 +19,7 @@ The `by` function also support the `do` block form:
```julia
by(iris, :Species) do df
- DataFrame(m = mean(df[:PetalLength]), s² = var(df[:PetalLength]))
+ DataFrame(m = mean(dropnull(df[:PetalLength])), s² = var(dropnull(df[:PetalLength])))
end
```
@@ -30,7 +29,7 @@ We show several examples of the `aggregate` function applied to the `iris` datas
```julia
aggregate(iris, :Species, sum)
-aggregate(iris, :Species, [sum, mean])
+aggregate(iris, :Species, [sum, x->mean(dropnull(x))])
```
If you only want to split the data set into subsets, use the `groupby` function:
diff --git a/docs/src/man/subsets.md b/docs/src/man/subsets.md
index 2049d5cd1a..a1a899a848 100644
--- a/docs/src/man/subsets.md
+++ b/docs/src/man/subsets.md
@@ -1,59 +1,12 @@
# Subsets
-## DataArrays
-
-The `DataArray` type is meant to behave like a standard Julia `Array` and tries to implement identical indexing rules:
-
-One dimensional `DataArray`:
-
-```julia
-julia> using DataArrays
-
-julia> dv = data([1, 2, 3])
-3-element DataArray{Int64,1}:
- 1
- 2
- 3
-
-julia> dv[1]
-1
-
-julia> dv[2] = NA
-NA
-
-julia> dv[2]
-NA
-```
-
-Two dimensional `DataArray`:
-
-```julia
-julia> using DataArrays
-
-julia> dm = data([1 2; 3 4])
-2×2 DataArray{Int64,2}:
- 1 2
- 3 4
-
-julia> dm[1, 1]
-1
-
-julia> dm[2, 1] = NA
-NA
-
-julia> dm[2, 1]
-NA
-```
-
-DataFrames
-
-In contrast, a `DataFrame` offers substantially more forms of indexing because columns can be referred to by name:
+A `DataFrame` supports many forms of indexing.
```julia
julia> using DataFrames
julia> df = DataFrame(A = 1:10, B = 2:2:20)
-10×2 DataFrame
+10×2 DataFrames.DataFrame
│ Row │ A │ B │
├─────┼────┼────┤
│ 1 │ 1 │ 2 │
@@ -68,11 +21,11 @@ julia> df = DataFrame(A = 1:10, B = 2:2:20)
│ 10 │ 10 │ 20 │
```
-Refering to the first column by index or name:
+Referring to the first column by index or name:
```julia
julia> df[1]
-10-element DataArray{Int64,1}:
+10-element NullableArrays.NullableArray{Int64,1}:
1
2
3
@@ -85,7 +38,7 @@ julia> df[1]
10
julia> df[:A]
-10-element DataArray{Int64,1}:
+10-element NullableArrays.NullableArray{Int64,1}:
1
2
3
@@ -102,17 +55,17 @@ Refering to the first element of the first column:
```julia
julia> df[1, 1]
-1
+Nullable{Int64}(1)
julia> df[1, :A]
-1
+Nullable{Int64}(1)
```
Selecting a subset of rows by index and an (ordered) subset of columns by name:
```julia
julia> df[1:3, [:A, :B]]
-3×2 DataFrame
+3×2 DataFrames.DataFrame
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 2 │
@@ -120,39 +73,10 @@ julia> df[1:3, [:A, :B]]
│ 3 │ 3 │ 6 │
julia> df[1:3, [:B, :A]]
-3×2 DataFrame
+3×2 DataFrames.DataFrame
│ Row │ B │ A │
├─────┼───┼───┤
│ 1 │ 2 │ 1 │
│ 2 │ 4 │ 2 │
│ 3 │ 6 │ 3 │
```
-
-Selecting a subset of rows by using a condition:
-
-```julia
-julia> df[df[:A] % 2 .== 0, :]
-5×2 DataFrame
-│ Row │ A │ B │
-├─────┼────┼────┤
-│ 1 │ 2 │ 4 │
-│ 2 │ 4 │ 8 │
-│ 3 │ 6 │ 12 │
-│ 4 │ 8 │ 16 │
-│ 5 │ 10 │ 20 │
-
-julia> df[df[:B] % 2 .== 0, :]
-10×2 DataFrame
-│ Row │ A │ B │
-├─────┼────┼────┤
-│ 1 │ 1 │ 2 │
-│ 2 │ 2 │ 4 │
-│ 3 │ 3 │ 6 │
-│ 4 │ 4 │ 8 │
-│ 5 │ 5 │ 10 │
-│ 6 │ 6 │ 12 │
-│ 7 │ 7 │ 14 │
-│ 8 │ 8 │ 16 │
-│ 9 │ 9 │ 18 │
-│ 10 │ 10 │ 20 │
-```
diff --git a/src/DataFrames.jl b/src/DataFrames.jl
index 3c2a53003f..e19750286c 100644
--- a/src/DataFrames.jl
+++ b/src/DataFrames.jl
@@ -12,7 +12,8 @@ using Compat
import Compat.String
using Reexport
@reexport using StatsBase
-@reexport using DataArrays
+@reexport using NullableArrays
+@reexport using CategoricalArrays
using GZip
using SortingAlgorithms
@@ -50,6 +51,7 @@ export @~,
aggregate,
by,
+ categorical!,
coefnames,
colwise,
combine,
@@ -70,8 +72,6 @@ export @~,
nrow,
nullable!,
order,
- pool,
- pool!,
printtable,
readtable,
rename!,
@@ -82,9 +82,14 @@ export @~,
unique!,
unstack,
writetable,
+ head,
+ tail,
# Remove after deprecation period
- read_rda
+ read_rda,
+ pool,
+ pool!
+
##############################################################################
##
diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 3df74edcf7..6e7f9adedf 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -11,24 +11,24 @@ type in that it allows indexing by a key (the columns).
The following are normally implemented for AbstractDataFrames:
-* [`describe`]({ref}) : summarize columns
-* [`dump`]({ref}) : show structure
+* [`describe`](@ref) : summarize columns
+* [`dump`](@ref) : show structure
* `hcat` : horizontal concatenation
* `vcat` : vertical concatenation
* `names` : columns names
-* [`names!`]({ref}) : set columns names
-* [`rename!`]({ref}) : rename columns names based on keyword arguments
-* [`eltypes`]({ref}) : `eltype` of each column
+* [`names!`](@ref) : set columns names
+* [`rename!`](@ref) : rename columns names based on keyword arguments
+* [`eltypes`](@ref) : `eltype` of each column
* `length` : number of columns
* `size` : (nrows, ncols)
-* [`head`]({ref}) : first `n` rows
-* [`tail`]({ref}) : last `n` rows
+* [`head`](@ref) : first `n` rows
+* [`tail`](@ref) : last `n` rows
* `convert` : convert to an array
-* `DataArray` : convert to a DataArray
-* [`complete_cases`]({ref}) : indexes of complete cases (rows with no NA's)
-* [`complete_cases!`]({ref}) : remove rows with NA's
-* [`nonunique`]({ref}) : indexes of duplicate rows
-* [`unique!`]({ref}) : remove duplicate rows
+* `NullableArray` : convert to a NullableArray
+* [`complete_cases`](@ref) : indexes of complete cases (rows with no null values)
+* [`complete_cases!`](@ref) : remove rows with null values
+* [`nonunique`](@ref) : indexes of duplicate rows
+* [`unique!`](@ref) : remove duplicate rows
* `similar` : a DataFrame with similar columns as `d`
**Indexing**
@@ -79,13 +79,17 @@ abstract AbstractDataFrame
##
##############################################################################
-immutable Cols{T <: AbstractDataFrame}
+immutable Cols{T <: AbstractDataFrame} <: AbstractVector{Any}
df::T
end
Base.start(::Cols) = 1
Base.done(itr::Cols, st) = st > length(itr.df)
Base.next(itr::Cols, st) = (itr.df[st], st + 1)
Base.length(itr::Cols) = length(itr.df)
+Base.size(itr::Cols, ix) = ix==1 ? length(itr) : throw(ArgumentError("Incorrect dimension"))
+Base.size(itr::Cols) = (length(itr.df),)
+Base.linearindexing{T}(::Type{Cols{T}}) = Base.LinearFast()
+Base.getindex(itr::Cols, inds...) = getindex(itr.df, inds...)
# N.B. where stored as a vector, 'columns(x) = x.vector' is a bit cheaper
columns{T <: AbstractDataFrame}(df::T) = Cols{T}(df)
@@ -175,7 +179,7 @@ rename!(df, @compat(Dict(:i=>:A, :x=>:X)))
(rename!, rename)
"""
-Column elemental types
+Return element types of columns
```julia
eltypes(df::AbstractDataFrame)
@@ -187,7 +191,7 @@ eltypes(df::AbstractDataFrame)
**Result**
-* `::Vector{Type}` : the elemental type of each column
+* `::Vector{Type}` : the element type of each column
**Examples**
@@ -197,14 +201,7 @@ eltypes(df)
```
"""
-function eltypes(df::AbstractDataFrame)
- ncols = size(df, 2)
- res = Array(Type, ncols)
- for j in 1:ncols
- res[j] = eltype(df[j])
- end
- return res
-end
+eltypes(df::AbstractDataFrame) = map!(eltype, Vector{Type}(size(df,2)), columns(df))
Base.size(df::AbstractDataFrame) = (nrow(df), ncol(df))
function Base.size(df::AbstractDataFrame, i::Integer)
@@ -213,7 +210,7 @@ function Base.size(df::AbstractDataFrame, i::Integer)
elseif i == 2
ncol(df)
else
- throw(ArgumentError("DataFrames have only two dimensions"))
+ throw(ArgumentError("DataFrames only have two dimensions"))
end
end
@@ -231,21 +228,15 @@ Base.ndims(::AbstractDataFrame) = 2
Base.similar(df::AbstractDataFrame, dims::Int) =
DataFrame(Any[similar(x, dims) for x in columns(df)], copy(index(df)))
-nas{T}(dv::AbstractArray{T}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) = # TODO move to datavector.jl?
- DataArray(Array(T, dims), trues(dims))
-
-nas{T,R}(dv::PooledDataArray{T,R}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) =
- PooledDataArray(DataArrays.RefArray(zeros(R, dims)), dv.pool)
-
-nas(df::AbstractDataFrame, dims::Int) =
- DataFrame(Any[nas(x, dims) for x in columns(df)], copy(index(df)))
-
##############################################################################
##
## Equality
##
##############################################################################
+# Imported in DataFrames.jl for compatibility across Julia 0.4 and 0.5
+@compat(Base.:(==))(df1::AbstractDataFrame, df2::AbstractDataFrame) = isequal(df1, df2)
+
function Base.isequal(df1::AbstractDataFrame, df2::AbstractDataFrame)
size(df1, 2) == size(df2, 2) || return false
isequal(index(df1), index(df2)) || return false
@@ -255,20 +246,6 @@ function Base.isequal(df1::AbstractDataFrame, df2::AbstractDataFrame)
return true
end
-# Imported in DataFrames.jl for compatibility across Julia 0.4 and 0.5
-function (==)(df1::AbstractDataFrame, df2::AbstractDataFrame)
- size(df1, 2) == size(df2, 2) || return false
- isequal(index(df1), index(df2)) || return false
- eq = true
- for idx in 1:size(df1, 2)
- coleq = df1[idx] == df2[idx]
- # coleq could be NA
- !isequal(coleq, false) || return false
- eq &= coleq
- end
- return eq
-end
-
##############################################################################
##
## Associative methods
@@ -285,10 +262,10 @@ Base.isempty(df::AbstractDataFrame) = ncol(df) == 0
##
##############################################################################
-DataArrays.head(df::AbstractDataFrame, r::Int) = df[1:min(r,nrow(df)), :]
-DataArrays.head(df::AbstractDataFrame) = head(df, 6)
-DataArrays.tail(df::AbstractDataFrame, r::Int) = df[max(1,nrow(df)-r+1):nrow(df), :]
-DataArrays.tail(df::AbstractDataFrame) = tail(df, 6)
+head(df::AbstractDataFrame, r::Int) = df[1:min(r,nrow(df)), :]
+head(df::AbstractDataFrame) = head(df, 6)
+tail(df::AbstractDataFrame, r::Int) = df[max(1,nrow(df)-r+1):nrow(df), :]
+tail(df::AbstractDataFrame) = tail(df, 6)
"""
Show the first or last part of an AbstractDataFrame
@@ -341,7 +318,7 @@ dump(io::IO, df::AbstractDataFrame, n::Int = 5)
```julia
df = DataFrame(i = 1:10, x = rand(10), y = rand(["a", "b", "c"], 10))
-str(df)
+dump(df)
```
"""
@@ -355,10 +332,6 @@ function Base.dump(io::IO, df::AbstractDataFrame, n::Int, indent)
end
end
-function Base.dump(io::IO, dv::AbstractDataVector, n::Int, indent)
- println(io, typeof(dv), "(", length(dv), ") ", dv[1:min(4, end)])
-end
-
# summarize the columns of a DF
# TODO: clever layout in rows
"""
@@ -404,32 +377,33 @@ function StatsBase.describe(io, df::AbstractDataFrame)
println(io, )
end
end
-StatsBase.describe(dv::AbstractArray) = describe(STDOUT, dv)
-function StatsBase.describe{T<:Number}(io, dv::AbstractArray{T})
- if all(isna(dv))
+StatsBase.describe(nv::AbstractArray) = describe(STDOUT, nv)
+function StatsBase.describe{T<:Number}(io, nv::AbstractArray{T})
+ if all(_isnull, nv)
println(io, " * All NA * ")
return
end
- filtered = float(dropna(dv))
+ filtered = float(dropnull(nv))
qs = quantile(filtered, [0, .25, .5, .75, 1])
statNames = ["Min", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max"]
statVals = [qs[1:3]; mean(filtered); qs[4:5]]
for i = 1:6
- println(io, string(rpad(statNames[i], 8, " "), " ", string(statVals[i])))
+ println(io, string(rpad(statNames[i], 10, " "), " ", string(statVals[i])))
end
- nas = sum(isna(dv))
- println(io, "NAs $nas")
- println(io, "NA% $(round(nas*100/length(dv), 2))%")
+ nulls = countnull(nv)
+ println(io, "NULLs $(nulls)")
+ println(io, "NULL % $(round(nulls*100/length(nv), 2))%")
return
end
-function StatsBase.describe{T}(io, dv::AbstractArray{T})
- ispooled = isa(dv, PooledDataVector) ? "Pooled " : ""
+function StatsBase.describe{T}(io, nv::AbstractArray{T})
+ ispooled = isa(nv, CategoricalVector) ? "Pooled " : ""
+ nulls = countnull(nv)
# if nothing else, just give the length and element type and NA count
- println(io, "Length $(length(dv))")
- println(io, "Type $(ispooled)$(string(eltype(dv)))")
- println(io, "NAs $(sum(isna(dv)))")
- println(io, "NA% $(round(sum(isna(dv))*100/length(dv), 2))%")
- println(io, "Unique $(length(unique(dv)))")
+ println(io, "Length $(length(nv))")
+ println(io, "Type $(ispooled)$(string(eltype(nv)))")
+ println(io, "NULLs $(nulls)")
+ println(io, "NULL % $(round(nulls*100/length(nv), 2))%")
+ println(io, "Unique $(length(unique(nv)))")
return
end
@@ -439,8 +413,27 @@ end
##
##############################################################################
+function _nonnull!(res, col)
+ for (i, el) in enumerate(col)
+ res[i] &= !_isnull(el)
+ end
+end
+
+function _nonnull!(res, col::NullableArray)
+ for (i, el) in enumerate(col.isnull)
+ res[i] &= !el
+ end
+end
+
+function _nonnull!(res, col::NullableCategoricalArray)
+ for (i, el) in enumerate(col.refs)
+ res[i] &= el > 0
+ end
+end
+
+
"""
-Indexes of complete cases (rows without NA's)
+Indexes of complete cases (rows without null values)
```julia
complete_cases(df::AbstractDataFrame)
@@ -454,29 +447,28 @@ complete_cases(df::AbstractDataFrame)
* `::Vector{Bool}` : indexes of complete cases
-See also [`complete_cases!`]({ref}).
+See also [`complete_cases!`](@ref).
**Examples**
```julia
df = DataFrame(i = 1:10, x = rand(10), y = rand(["a", "b", "c"], 10))
-df[[1,4,5], :x] = NA
-df[[9,10], :y] = NA
+df[[1,4,5], :x] = Nullable()
+df[[9,10], :y] = Nullable()
complete_cases(df)
```
"""
function complete_cases(df::AbstractDataFrame)
- ## Returns a Vector{Bool} of indexes of complete cases (rows with no NA's).
- res = !isna(df[1])
- for i in 2:ncol(df)
- res &= !isna(df[i])
+ res = fill(true, size(df, 1))
+ for i in 1:size(df, 2)
+ _nonnull!(res, df[i])
end
res
end
"""
-Delete rows with NA's.
+Delete rows with null values.
```julia
complete_cases!(df::AbstractDataFrame)
@@ -490,14 +482,14 @@ complete_cases!(df::AbstractDataFrame)
* `::AbstractDataFrame` : the updated version
-See also [`complete_cases`]({ref}).
+See also [`complete_cases`](@ref).
**Examples**
```julia
df = DataFrame(i = 1:10, x = rand(10), y = rand(["a", "b", "c"], 10))
-df[[1,4,5], :x] = NA
-df[[9,10], :y] = NA
+df[[1,4,5], :x] = Nullable()
+df[[9,10], :y] = Nullable()
complete_cases!(df)
```
@@ -508,7 +500,8 @@ function Base.convert(::Type{Array}, df::AbstractDataFrame)
convert(Matrix, df)
end
function Base.convert(::Type{Matrix}, df::AbstractDataFrame)
- T = reduce(typejoin, eltypes(df))
+ T = reduce(promote_type, eltypes(df))
+ T <: Nullable && (T = eltype(T))
convert(Matrix{T}, df)
end
function Base.convert{T}(::Type{Array{T}}, df::AbstractDataFrame)
@@ -518,27 +511,28 @@ function Base.convert{T}(::Type{Matrix{T}}, df::AbstractDataFrame)
n, p = size(df)
res = Array(T, n, p)
idx = 1
- for col in columns(df)
- anyna(col) && error("DataFrame contains NAs")
- copy!(res, idx, data(col))
+ for (name, col) in zip(names(df), columns(df))
+ anynull(col) && error("cannot convert a DataFrame containing null values to array (found for column $name)")
+ copy!(res, idx, convert(Vector{T}, col))
idx += n
end
return res
end
-function Base.convert(::Type{DataArray}, df::AbstractDataFrame)
- convert(DataMatrix, df)
+function Base.convert(::Type{NullableArray}, df::AbstractDataFrame)
+ convert(NullableMatrix, df)
end
-function Base.convert(::Type{DataMatrix}, df::AbstractDataFrame)
- T = reduce(typejoin, eltypes(df))
- convert(DataMatrix{T}, df)
+function Base.convert(::Type{NullableMatrix}, df::AbstractDataFrame)
+ T = reduce(promote_type, eltypes(df))
+ T <: Nullable && (T = eltype(T))
+ convert(NullableMatrix{T}, df)
end
-function Base.convert{T}(::Type{DataArray{T}}, df::AbstractDataFrame)
- convert(DataMatrix{T}, df)
+function Base.convert{T}(::Type{NullableArray{T}}, df::AbstractDataFrame)
+ convert(NullableMatrix{T}, df)
end
-function Base.convert{T}(::Type{DataMatrix{T}}, df::AbstractDataFrame)
+function Base.convert{T}(::Type{NullableMatrix{T}}, df::AbstractDataFrame)
n, p = size(df)
- res = DataArray(T, n, p)
+ res = NullableArray(T, n, p)
idx = 1
for col in columns(df)
copy!(res, idx, col)
@@ -548,7 +542,7 @@ function Base.convert{T}(::Type{DataMatrix{T}}, df::AbstractDataFrame)
end
"""
-Indexes of complete cases (rows without NA's)
+Indexes of duplicate rows (a row that is a duplicate of a prior row)
```julia
nonunique(df::AbstractDataFrame)
@@ -565,7 +559,7 @@ nonunique(df::AbstractDataFrame, cols)
* `::Vector{Bool}` : indicates whether the row is a duplicate of some
prior row
-See also [`unique`]({ref}) and [`unique!`]({ref}).
+See also [`unique`](@ref) and [`unique!`](@ref).
**Examples**
@@ -623,7 +617,7 @@ specifying the column(s) to compare.
When `cols` is specified, the return DataFrame contains complete rows,
retaining in each case the first instance for which `df[cols]` is unique.
-See also [`nonunique`]({ref}).
+See also [`nonunique`](@ref).
**Examples**
@@ -641,7 +635,7 @@ unique!(df) # modifies df
function nonuniquekey(df::AbstractDataFrame)
# Here's another (probably a lot faster) way to do `nonunique`
# by grouping on all columns. It will fail if columns cannot be
- # made into PooledDataVector's.
+ # made into CategoricalVector's.
gd = groupby(df, _names(df))
idx = [1:length(gd.idx)][gd.idx][gd.starts]
res = fill(true, nrow(df))
@@ -654,7 +648,7 @@ function colmissing(df::AbstractDataFrame) # -> Vector{Int}
nrows, ncols = size(df)
missing = zeros(Int, ncols)
for j in 1:ncols
- missing[j] = countna(df[j])
+ missing[j] = countnull(df[j])
end
return missing
end
@@ -673,7 +667,7 @@ without(df::AbstractDataFrame, c::Any) = without(df, index(df)[c])
##############################################################################
# hcat's first argument must be an AbstractDataFrame
-# Trailing arguments (currently) may also be DataVectors, Vectors, or scalars.
+# Trailing arguments (currently) may also be NullableVectors, Vectors, or scalars.
# hcat! is defined in dataframes/dataframes.jl
# Its first argument (currently) must be a DataFrame.
@@ -684,84 +678,63 @@ Base.hcat(df::AbstractDataFrame, x) = hcat!(df[:, :], x)
Base.hcat(df::AbstractDataFrame, x, y...) = hcat!(hcat(df, x), y...)
# vcat only accepts DataFrames. Finds union of columns, maintaining order
-# of first df. Missing data becomes NAs.
+# of first df. Missing data become null values.
Base.vcat(df::AbstractDataFrame) = df
Base.vcat(dfs::AbstractDataFrame...) = vcat(AbstractDataFrame[dfs...])
Base.vcat(dfs::Vector{Void}) = dfs
+
+_isnullable{A<:AbstractArray}(::Type{A}) = eltype(A) <: Nullable
+
function Base.vcat{T<:AbstractDataFrame}(dfs::Vector{T})
isempty(dfs) && return DataFrame()
- coltyps, colnams, similars = _colinfo(dfs)
-
res = DataFrame()
- Nrow = sum(nrow, dfs)
- for j in 1:length(colnams)
- colnam = colnams[j]
- col = similar(similars[j], coltyps[j], Nrow)
-
- i = 1
- for df in dfs
- if haskey(df, colnam) && eltype(df[colnam]) != NAtype
- copy!(col, i, df[colnam])
- end
- i += size(df, 1)
+ nrows = sum(nrow, dfs)
+ for colnam in unique(Base.flatten(names.(dfs)))
+ k = Bool[haskey(df, colnam) for df in dfs]
+ if all(k)
+ res[colnam] = vcat((dfs[i][colnam] for i in 1:length(dfs))...)
+ continue
end
- res[colnam] = col
- end
- res
-end
+ c = ((typeof(dfs[i][colnam]) for i in 1:length(dfs) if k[i])...)
+ C = Base.return_types(vcat, c)
+
+ if length(C)==1 && isleaftype(C[1])
+ if _isnullable(C[1])
+ NC = C[1]
+ else
+ NC = NullableArray{eltype(C[1])}
+ end
-_isnullable(::AbstractArray) = false
-_isnullable(::AbstractDataArray) = true
-const EMPTY_DATA = DataArray(Void, 0)
-
-function _colinfo{T<:AbstractDataFrame}(dfs::Vector{T})
- df1 = dfs[1]
- colindex = copy(index(df1))
- coltyps = eltypes(df1)
- similars = collect(columns(df1))
- nonnull_ct = Int[_isnullable(c) for c in columns(df1)]
-
- for i in 2:length(dfs)
- df = dfs[i]
- for j in 1:size(df, 2)
- col = df[j]
- cn, ct = _names(df)[j], eltype(col)
- if haskey(colindex, cn)
- idx = colindex[cn]
-
- oldtyp = coltyps[idx]
- if !(ct <: oldtyp)
- coltyps[idx] = promote_type(oldtyp, ct)
+ col = NC(nrows)
+ j = 1
+ for i in 1:length(dfs)
+ if k[i]
+ copy!(col, j, dfs[i][colnam])
end
- nonnull_ct[idx] += !_isnullable(col)
- else # new column
- push!(colindex, cn)
- push!(coltyps, ct)
- push!(similars, col)
- push!(nonnull_ct, !_isnullable(col))
+ j += nrow(dfs[i])
end
- end
- end
+ else
+ # warn("Unstable return types: ", C, " from vcat of ", [typeof(dfs[i][colnam]) for i in 1:length(dfs) if k[i]])
- for j in 1:length(colindex)
- if nonnull_ct[j] < length(dfs) && !_isnullable(similars[j])
- similars[j] = EMPTY_DATA
+ E = Base.promote_eltype(c...)
+ TN = NullableArray{E <: Nullable ? eltype(E) : E}
+ col = vcat((k[i] ? dfs[i][colnam] : TN(nrow(dfs[i])) for i in 1:length(dfs))...)
end
- end
- colnams = _names(colindex)
- coltyps, colnams, similars
+ res[colnam] = col
+ end
+ res
end
##############################################################################
##
## Hashing
##
-## Make sure this agrees with is_equals()
+## Make sure this agrees with isequal()
##
##############################################################################
@@ -792,7 +765,7 @@ ncol(df::AbstractDataFrame)
* `::AbstractDataFrame` : the updated version
-See also [`size`]({ref}).
+See also [`size`](@ref).
NOTE: these functions may be depreciated for `size`.
diff --git a/src/abstractdataframe/io.jl b/src/abstractdataframe/io.jl
index debd979db3..fc1f9ca418 100644
--- a/src/abstractdataframe/io.jl
+++ b/src/abstractdataframe/io.jl
@@ -23,7 +23,7 @@ function printtable(io::IO,
header::Bool = true,
separator::Char = ',',
quotemark::Char = '"',
- nastring::AbstractString = "NA")
+ nastring::AbstractString = "NULL")
n, p = size(df)
etypes = eltypes(df)
if header
@@ -42,10 +42,10 @@ function printtable(io::IO,
quotestr = string(quotemark)
for i in 1:n
for j in 1:p
- if ! (isna(df[j],i))
+ if !isnull(df[j],i)
if ! (etypes[j] <: Real)
print(io, quotemark)
- escapedprint(io, df[i, j], quotestr)
+ escapedprint(io, get(df[i, j]), quotestr)
print(io, quotemark)
else
print(io, df[i, j])
@@ -67,7 +67,7 @@ function printtable(df::AbstractDataFrame;
header::Bool = true,
separator::Char = ',',
quotemark::Char = '"',
- nastring::AbstractString = "NA")
+ nastring::AbstractString = "NULL")
printtable(STDOUT,
df,
header = header,
@@ -94,7 +94,7 @@ writetable(filename, df, [keyword options])
* `separator::Char` -- The separator character that you would like to use. Defaults to the output of `getseparator(filename)`, which uses commas for files that end in `.csv`, tabs for files that end in `.tsv` and a single space for files that end in `.wsv`.
* `quotemark::Char` -- The character used to delimit string fields. Defaults to `'"'`.
* `header::Bool` -- Should the file contain a header that specifies the column names from `df`. Defaults to `true`.
-* `nastring::AbstractString` -- What to write in place of missing data. Defaults to `"NA"`.
+* `nastring::AbstractString` -- What to write in place of missing data. Defaults to `"NULL"`.
### Result
@@ -115,7 +115,7 @@ function writetable(filename::AbstractString,
header::Bool = true,
separator::Char = getseparator(filename),
quotemark::Char = '"',
- nastring::AbstractString = "NA",
+ nastring::AbstractString = "NULL",
append::Bool = false)
if endswith(filename, ".bz") || endswith(filename, ".bz2")
@@ -169,7 +169,6 @@ function html_escape(cell::AbstractString)
end
@compat function Base.show(io::IO, ::MIME"text/html", df::AbstractDataFrame)
- n = size(df, 1)
cnames = _names(df)
write(io, "
")
write(io, "")
@@ -178,13 +177,19 @@ end
write(io, "$column_name | ")
end
write(io, "
")
- tty_rows, tty_cols = _displaysize(io)
- mxrow = min(n,tty_rows)
+ haslimit = get(io, :limit, true)
+ n = size(df, 1)
+ if haslimit
+ tty_rows, tty_cols = _displaysize(io)
+ mxrow = min(n,tty_rows)
+ else
+ mxrow = n
+ end
for row in 1:mxrow
write(io, "")
write(io, "$row | ")
for column_name in cnames
- cell = string(df[row, column_name])
+ cell = sprint(ourshowcompact, df[row, column_name])
write(io, "$(html_escape(cell)) | ")
end
write(io, "
")
@@ -200,6 +205,60 @@ end
write(io, "
")
end
+##############################################################################
+#
+# LaTeX output
+#
+##############################################################################
+
+function latex_char_escape(char::AbstractString)
+ if char == "\\"
+ return "\\textbackslash{}"
+ elseif char == "~"
+ return "\\textasciitilde{}"
+ else
+ return string("\\", char)
+ end
+end
+
+function latex_escape(cell::AbstractString)
+ cell = replace(cell, ['\\','~','#','$','%','&','_','^','{','}'], latex_char_escape)
+ return cell
+end
+
+function Base.show(io::IO, ::MIME"text/latex", df::AbstractDataFrame)
+ nrows = size(df, 1)
+ ncols = size(df, 2)
+ cnames = _names(df)
+ alignment = repeat("c", ncols)
+ write(io, "\\begin{tabular}{r|")
+ write(io, alignment)
+ write(io, "}\n")
+ write(io, "\t& ")
+ header = join(map(c -> latex_escape(string(c)), cnames), " & ")
+ write(io, header)
+ write(io, "\\\\\n")
+ write(io, "\t\\hline\n")
+ for row in 1:nrows
+ write(io, "\t")
+ write(io, @sprintf("%d", row))
+ for col in 1:ncols
+ write(io, " & ")
+ cell = df[row,col]
+ if !isnull(cell)
+ content = get(cell)
+ if mimewritable(MIME("text/latex"), content)
+ show(io, MIME("text/latex"), content)
+ else
+ print(io, latex_escape(string(content)))
+ end
+ end
+ end
+ write(io, " \\\\\n")
+ end
+ write(io, "\\end{tabular}\n")
+end
+
##############################################################################
#
# MIME
diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl
index c8f7e8c7d9..c57f8e344b 100644
--- a/src/abstractdataframe/join.jl
+++ b/src/abstractdataframe/join.jl
@@ -2,13 +2,26 @@
## Join / merge
##
+# Like similar, but returns a nullable array
+similar_nullable{T}(dv::AbstractArray{T}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) =
+ NullableArray(T, dims)
+
+similar_nullable{T<:Nullable}(dv::AbstractArray{T}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) =
+ NullableArray(eltype(T), dims)
+
+similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) =
+ NullableCategoricalArray(T, dims)
+
+similar_nullable(df::AbstractDataFrame, dims::Int) =
+ DataFrame(Any[similar_nullable(x, dims) for x in columns(df)], copy(index(df)))
+
function join_idx(left, right, max_groups)
## adapted from Wes McKinney's full_outer_join in pandas (file: src/join.pyx).
- # NA group in location 0
+ # NULL group in location 0
- left_sorter, where, left_count = DataArrays.groupsort_indexer(left, max_groups)
- right_sorter, where, right_count = DataArrays.groupsort_indexer(right, max_groups)
+ left_sorter, where, left_count = groupsort_indexer(left, max_groups)
+ right_sorter, where, right_count = groupsort_indexer(right, max_groups)
# First pass, determine size of result set
tcount = 0
@@ -27,7 +40,7 @@ function join_idx(left, right, max_groups)
end
end
- # group 0 is the NA group
+ # group 0 is the NULL group
tposition = 0
lposition = 0
rposition = 0
@@ -72,66 +85,134 @@ function join_idx(left, right, max_groups)
right_sorter[right_indexer], right_sorter[rightonly_indexer])
end
-function DataArrays.PooledDataVecs(df1::AbstractDataFrame,
- df2::AbstractDataFrame)
+function sharepools{S,N}(v1::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}},
+ v2::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}},
+ index::Vector{S},
+ R)
+ tidx1 = convert(Vector{R}, indexin(CategoricalArrays.index(v1.pool), index))
+ tidx2 = convert(Vector{R}, indexin(CategoricalArrays.index(v2.pool), index))
+ refs1 = zeros(R, length(v1))
+ refs2 = zeros(R, length(v2))
+ for i in 1:length(refs1)
+ if v1.refs[i] != 0
+ refs1[i] = tidx1[v1.refs[i]]
+ end
+ end
+ for i in 1:length(refs2)
+ if v2.refs[i] != 0
+ refs2[i] = tidx2[v2.refs[i]]
+ end
+ end
+ pool = CategoricalPool{S, R}(index)
+ return (CategoricalArray(refs1, pool),
+ CategoricalArray(refs2, pool))
+end
+
+function sharepools{S,N}(v1::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}},
+ v2::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}})
+ index = sort(unique([levels(v1); levels(v2)]))
+ sz = length(index)
+
+ R = sz <= typemax(UInt8) ? UInt8 :
+ sz <= typemax(UInt16) ? UInt16 :
+ sz <= typemax(UInt32) ? UInt32 :
+ UInt64
+
+ # To ensure type stability during actual work
+ sharepools(v1, v2, index, R)
+end
+
+sharepools{S,N}(v1::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}},
+ v2::AbstractArray{S,N}) =
+ sharepools(v1, oftype(v1, v2))
+
+sharepools{S,N}(v1::AbstractArray{S,N},
+ v2::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}}) =
+ sharepools(oftype(v2, v1), v2)
+
+# TODO: write an optimized version for (Nullable)CategoricalArray
+function sharepools(v1::AbstractArray,
+ v2::AbstractArray)
+ ## Return two categorical arrays that share the same pool.
+
+ ## TODO: allow specification of R
+ R = CategoricalArrays.DefaultRefType
+ refs1 = Array(R, size(v1))
+ refs2 = Array(R, size(v2))
+ poolref = Dict{promote_type(eltype(v1), eltype(v2)), R}()
+ maxref = 0
+
+ # loop through once to fill the poolref dict
+ for i = 1:length(v1)
+ if !_isnull(v1[i])
+ poolref[v1[i]] = 0
+ end
+ end
+ for i = 1:length(v2)
+ if !_isnull(v2[i])
+ poolref[v2[i]] = 0
+ end
+ end
+
+ # fill positions in poolref
+ pool = sort(collect(keys(poolref)))
+ i = 1
+ for p in pool
+ poolref[p] = i
+ i += 1
+ end
+
+ # fill in newrefs
+ zeroval = zero(R)
+ for i = 1:length(v1)
+ if _isnull(v1[i])
+ refs1[i] = zeroval
+ else
+ refs1[i] = poolref[v1[i]]
+ end
+ end
+ for i = 1:length(v2)
+ if _isnull(v2[i])
+ refs2[i] = zeroval
+ else
+ refs2[i] = poolref[v2[i]]
+ end
+ end
+
+ pool = CategoricalPool(pool)
+ return (NullableCategoricalArray(refs1, pool),
+ NullableCategoricalArray(refs2, pool))
+end
+
+function sharepools(df1::AbstractDataFrame, df2::AbstractDataFrame)
# This method exists to allow merge to work with multiple columns.
- # It takes the columns of each DataFrame and returns a DataArray
+ # It takes the columns of each DataFrame and returns a categorical array
# with a merged pool that "keys" the combination of column values.
# The pools of the result don't really mean anything.
- dv1, dv2 = PooledDataVecs(df1[1], df2[1])
- # use UInt32 instead of the minimum integer size chosen by PooledDataVecs
+ dv1, dv2 = sharepools(df1[1], df2[1])
+ # use UInt32 instead of the minimum integer size chosen by sharepools
# since the number of levels can be high
refs1 = Vector{UInt32}(dv1.refs)
refs2 = Vector{UInt32}(dv2.refs)
- # the + 1 handles NA's
+ # the + 1 handles nulls
refs1[:] += 1
refs2[:] += 1
- ngroups = length(dv1.pool) + 1
+ ngroups = length(levels(dv1)) + 1
for j = 2:ncol(df1)
- dv1, dv2 = PooledDataVecs(df1[j], df2[j])
+ dv1, dv2 = sharepools(df1[j], df2[j])
for i = 1:length(refs1)
refs1[i] += (dv1.refs[i]) * ngroups
end
for i = 1:length(refs2)
refs2[i] += (dv2.refs[i]) * ngroups
end
- ngroups *= (length(dv1.pool) + 1)
+ ngroups *= length(levels(dv1)) + 1
end
# recode refs1 and refs2 to drop the unused column combinations and
# limit the pool size
- PooledDataVecs( refs1, refs2 )
+ sharepools(refs1, refs2)
end
-function DataArrays.PooledDataArray{R}(df::AbstractDataFrame, ::Type{R})
- # This method exists to allow another way for merge to work with
- # multiple columns. It takes the columns of the DataFrame and
- # returns a DataArray with a merged pool that "keys" the
- # combination of column values.
- # Notes:
- # - I skipped the sort to make it faster.
- # - Converting each individual one-row DataFrame to a Tuple
- # might be faster.
- refs = zeros(R, nrow(df))
- poolref = Dict{AbstractDataFrame, Int}()
- pool = Array(UInt64, 0)
- j = 1
- for i = 1:nrow(df)
- val = df[i,:]
- if haskey(poolref, val)
- refs[i] = poolref[val]
- else
- push!(pool, hash(val))
- refs[i] = j
- poolref[val] = j
- j += 1
- end
- end
- return PooledDataArray(DataArrays.RefArray(refs), pool)
-end
-
-DataArrays.PooledDataArray(df::AbstractDataFrame) = PooledDataArray(df, DEFAULT_POOLED_REF_TYPE)
-
-
"""
Join two DataFrames
@@ -164,11 +245,11 @@ join(df1::AbstractDataFrame,
- `:cross` : a full Cartesian product of the key combinations; every
row of `df1` is matched with every row of `df2`
-`NA`s are filled in where needed to complete joins.
+Null values are filled in where needed to complete joins.
### Result
-* `::DataFrame` : the joined DataFrame
+* `::DataFrame` : the joined DataFrame
### Examples
@@ -199,7 +280,7 @@ function Base.join(df1::AbstractDataFrame,
throw(ArgumentError("Missing join argument 'on'."))
end
- dv1, dv2 = PooledDataVecs(df1[on], df2[on])
+ dv1, dv2 = sharepools(df1[on], df2[on])
left_idx, leftonly_idx, right_idx, rightonly_idx =
join_idx(dv1.refs, dv2.refs, length(dv1.pool))
@@ -216,14 +297,14 @@ function Base.join(df1::AbstractDataFrame,
left = df1[[left_idx; leftonly_idx], :]
right = vcat(df2w[right_idx, :],
- nas(df2w, length(leftonly_idx)))
+ similar_nullable(df2w, length(leftonly_idx)))
return hcat!(left, right)
elseif kind == :right
df1w = without(df1, on)
left = vcat(df1w[left_idx, :],
- nas(df1w, length(rightonly_idx)))
+ similar_nullable(df1w, length(rightonly_idx)))
right = df2[[right_idx; rightonly_idx], :]
return hcat!(left, right)
@@ -232,8 +313,8 @@ function Base.join(df1::AbstractDataFrame,
mixed = hcat!(df1[left_idx, :], df2w[right_idx, :])
leftonly = hcat!(df1[leftonly_idx, :],
- nas(df2w, length(leftonly_idx)))
- rightonly = hcat!(nas(df1w, length(rightonly_idx)),
+ similar_nullable(df2w, length(leftonly_idx)))
+ rightonly = hcat!(similar_nullable(df1w, length(rightonly_idx)),
df2[rightonly_idx, :])
return vcat(mixed, leftonly, rightonly)
diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl
index 47f2e33b90..75c27bfaa0 100644
--- a/src/abstractdataframe/reshape.jl
+++ b/src/abstractdataframe/reshape.jl
@@ -78,25 +78,24 @@ function stack(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Vector
[Compat.repeat(df[c], outer=N) for c in id_vars]...], # id_var columns
cnames)
end
-function stack(df::AbstractDataFrame, measure_vars::Int, id_vars::Int)
- stack(df, [measure_vars], [id_vars])
+function stack(df::AbstractDataFrame, measure_var::Int, id_var::Int)
+ stack(df, [measure_var], [id_var])
end
-function stack(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Int)
- stack(df, measure_vars, [id_vars])
+function stack(df::AbstractDataFrame, measure_vars::Vector{Int}, id_var::Int)
+ stack(df, measure_vars, [id_var])
end
-function stack(df::AbstractDataFrame, measure_vars::Int, id_vars::Vector{Int})
- stackdf(df, [measure_vars], id_vars)
+function stack(df::AbstractDataFrame, measure_var::Int, id_vars::Vector{Int})
+ stackdf(df, [measure_var], id_vars)
end
stack(df::AbstractDataFrame, measure_vars, id_vars) =
stack(df, index(df)[measure_vars], index(df)[id_vars])
-function stack(df::AbstractDataFrame, measure_vars)
+# no vars specified, by default select only numeric columns
+numeric_vars(df::AbstractDataFrame) = [T <: AbstractFloat || (T <: Nullable && eltype(T) <: AbstractFloat)
+ for T in eltypes(df)]
+function stack(df::AbstractDataFrame, measure_vars = numeric_vars(df))
mv_inds = index(df)[measure_vars]
stack(df, mv_inds, _setdiff(1:ncol(df), mv_inds))
end
-function stack(df::AbstractDataFrame)
- idx = [1:length(df);][[t <: AbstractFloat for t in eltypes(df)]]
- stack(df, idx)
-end
"""
Stacks a DataFrame; convert from a wide to long format; see
@@ -163,27 +162,30 @@ function unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, value::Int)
# `rowkey` integer indicating which column to place along rows
# `colkey` integer indicating which column to place along column headers
# `value` integer indicating which column has values
- refkeycol = PooledDataArray(df[rowkey])
+ refkeycol = NullableCategoricalArray(df[rowkey])
valuecol = df[value]
- # TODO make a version with a default refkeycol
- keycol = PooledDataArray(df[colkey])
+ keycol = NullableCategoricalArray(df[colkey])
Nrow = length(refkeycol.pool)
Ncol = length(keycol.pool)
- # TODO make fillNA(type, length)
- payload = DataFrame(Any[DataArray(eltype(valuecol), Nrow) for i in 1:Ncol], map(Symbol, keycol.pool))
+ T = eltype(valuecol)
+ if T <: Nullable
+ T = eltype(T)
+ end
+ payload = DataFrame(Any[NullableArray(T, Nrow) for i in 1:Ncol],
+ map(Symbol, levels(keycol)))
nowarning = true
for k in 1:nrow(df)
- j = @compat Int(keycol.refs[k])
- i = @compat Int(refkeycol.refs[k])
+ j = Int(CategoricalArrays.order(keycol.pool)[keycol.refs[k]])
+ i = Int(CategoricalArrays.order(refkeycol.pool)[refkeycol.refs[k]])
if i > 0 && j > 0
- if nowarning && !isna(payload[j][i])
+ if nowarning && !isnull(payload[j][i])
warn("Duplicate entries in unstack.")
nowarning = false
end
payload[j][i] = valuecol[k]
end
end
- insert!(payload, 1, refkeycol.pool, _names(df)[rowkey])
+ insert!(payload, 1, NullableArray(levels(refkeycol)), _names(df)[rowkey])
end
unstack(df::AbstractDataFrame, rowkey, colkey, value) =
unstack(df, index(df)[rowkey], index(df)[colkey], index(df)[value])
@@ -196,24 +198,28 @@ function unstack(df::AbstractDataFrame, colkey::Int, value::Int)
# group on anything not a key or value:
g = groupby(df, setdiff(_names(df), _names(df)[[colkey, value]]))
groupidxs = [g.idx[g.starts[i]:g.ends[i]] for i in 1:length(g.starts)]
- rowkey = PooledDataArray(zeros(Int, size(df, 1)), [1:length(groupidxs);])
+ rowkey = zeros(Int, size(df, 1))
for i in 1:length(groupidxs)
rowkey[groupidxs[i]] = i
end
- keycol = PooledDataArray(df[colkey])
+ keycol = NullableCategoricalArray(df[colkey])
valuecol = df[value]
df1 = df[g.idx[g.starts], g.cols]
- keys = unique(keycol)
Nrow = length(g)
- Ncol = length(keycol.pool)
- df2 = DataFrame(Any[DataArray(fill(valuecol[1], Nrow), fill(true, Nrow)) for i in 1:Ncol], map(@compat(Symbol), keycol.pool))
+ Ncol = length(levels(keycol))
+ T = eltype(valuecol)
+ if T <: Nullable
+ T = eltype(T)
+ end
+ df2 = DataFrame(Any[NullableArray(T, Nrow) for i in 1:Ncol],
+ map(@compat(Symbol), levels(keycol)))
nowarning = true
for k in 1:nrow(df)
- j = @compat Int(keycol.refs[k])
+ j = Int(CategoricalArrays.order(keycol.pool)[keycol.refs[k]])
i = rowkey[k]
if i > 0 && j > 0
- if nowarning && !isna(df2[j][i])
- warn("Duplicate entries in unstack.")
+ if nowarning && !isnull(df2[j][i])
+ warn("Duplicate entries in unstack at row $k.")
nowarning = false
end
df2[j][i] = valuecol[k]
@@ -243,7 +249,7 @@ NOTE: Not exported.
### Constructor
```julia
-RepeatedVector(d::AbstractVector...)
+StackedVector(d::AbstractVector...)
```
### Arguments
@@ -289,7 +295,7 @@ Base.ndims(v::StackedVector) = 1
Base.eltype(v::StackedVector) = promote_type(map(eltype, v.components)...)
Base.similar(v::StackedVector, T, dims::Dims) = similar(v.components[1], T, dims)
-DataArrays.PooledDataArray(v::StackedVector) = PooledDataArray(v[:]) # could be more efficient
+CategoricalArrays.CategoricalArray(v::StackedVector) = CategoricalArray(v[:]) # could be more efficient
"""
@@ -349,8 +355,8 @@ Base.reverse(v::RepeatedVector) = RepeatedVector(reverse(v.parent), v.inner, v.o
Base.similar(v::RepeatedVector, T, dims::Dims) = similar(v.parent, T, dims)
Base.unique(v::RepeatedVector) = unique(v.parent)
-function DataArrays.PooledDataArray(v::RepeatedVector)
- res = DataArrays.PooledDataArray(v.parent)
+function CategoricalArrays.CategoricalArray(v::RepeatedVector)
+ res = CategoricalArrays.CategoricalArray(v.parent)
res.refs = repeat(res.refs, inner = [v.inner], outer = [v.outer])
res
end
@@ -424,26 +430,22 @@ function stackdf(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Vect
[RepeatedVector(df[:,c], 1, N) for c in id_vars]...], # id_var columns
cnames)
end
-function stackdf(df::AbstractDataFrame, measure_vars::Int, id_vars::Int)
- stackdf(df, [measure_vars], [id_vars])
+function stackdf(df::AbstractDataFrame, measure_var::Int, id_var::Int)
+ stackdf(df, [measure_var], [id_var])
end
-function stackdf(df::AbstractDataFrame, measure_vars, id_vars::Int)
- stackdf(df, measure_vars, [id_vars])
+function stackdf(df::AbstractDataFrame, measure_vars, id_var::Int)
+ stackdf(df, measure_vars, [id_var])
end
-function stackdf(df::AbstractDataFrame, measure_vars::Int, id_vars)
- stackdf(df, [measure_vars], id_vars)
+function stackdf(df::AbstractDataFrame, measure_var::Int, id_vars)
+ stackdf(df, [measure_var], id_vars)
end
function stackdf(df::AbstractDataFrame, measure_vars, id_vars)
stackdf(df, index(df)[measure_vars], index(df)[id_vars])
end
-function stackdf(df::AbstractDataFrame, measure_vars)
+function stackdf(df::AbstractDataFrame, measure_vars = numeric_vars(df))
m_inds = index(df)[measure_vars]
stackdf(df, m_inds, _setdiff(1:ncol(df), m_inds))
end
-function stackdf(df::AbstractDataFrame)
- idx = [1:length(df);][[t <: AbstractFloat for t in eltypes(df)]]
- stackdf(df, idx)
-end
"""
A stacked view of a DataFrame (long format); see `stackdf`
diff --git a/src/abstractdataframe/show.jl b/src/abstractdataframe/show.jl
index 4effdf1965..b981623b85 100644
--- a/src/abstractdataframe/show.jl
+++ b/src/abstractdataframe/show.jl
@@ -62,8 +62,9 @@ end
#' ourshowcompact(STDOUT, "abc")
#' ourshowcompact(STDOUT, 10000)
ourshowcompact(io::IO, x::Any) = showcompact(io, x) # -> Void
-ourshowcompact(io::IO, x::AbstractString) = showcompact(io, x) # -> Void
+ourshowcompact(io::IO, x::AbstractString) = print(io, x) # -> Void
ourshowcompact(io::IO, x::Symbol) = print(io, x) # -> Void
+ourshowcompact(io::IO, x::Nullable{String}) = isnull(x) ? showcompact(io, x) : print(io, get(x)) # -> Void
#' @description
#'
@@ -100,8 +101,6 @@ function getmaxwidths(df::AbstractDataFrame,
rowlabel::Symbol) # -> Vector{Int}
maxwidths = Array(Int, size(df, 2) + 1)
- # TODO: Move this definition somewhere else
- NAstrwidth = 2
undefstrwidth = ourstrwidth(Base.undef_ref_str)
j = 1
@@ -110,17 +109,11 @@ function getmaxwidths(df::AbstractDataFrame,
maxwidth = ourstrwidth(name)
# (2) Consider length of longest entry in that column
- for indices in (rowindices1, rowindices2)
- for i in indices
- if isna(col, i)
- maxwidth = max(maxwidth, NAstrwidth)
- else
- try
- maxwidth = max(maxwidth, ourstrwidth(col[i]))
- catch
- maxwidth = max(maxwidth, undefstrwidth)
- end
- end
+ for indices in (rowindices1, rowindices2), i in indices
+ try
+ maxwidth = max(maxwidth, ourstrwidth(col[i]))
+ catch
+ maxwidth = max(maxwidth, undefstrwidth)
end
end
maxwidths[j] = maxwidth
diff --git a/src/abstractdataframe/sort.jl b/src/abstractdataframe/sort.jl
index c92021f2e8..1d05e2c9da 100644
--- a/src/abstractdataframe/sort.jl
+++ b/src/abstractdataframe/sort.jl
@@ -308,7 +308,3 @@ end
Base.sort(df::AbstractDataFrame, a::Algorithm, o::Ordering) = df[sortperm(df, a, o),:]
Base.sortperm(df::AbstractDataFrame, a::Algorithm, o::@compat(Union{Perm,DFPerm})) = sort!([1:size(df, 1);], a, o)
Base.sortperm(df::AbstractDataFrame, a::Algorithm, o::Ordering) = sortperm(df, a, DFPerm(o,df))
-
-# Extras to speed up sorting
-Base.sortperm{V}(df::AbstractDataFrame, a::Algorithm, o::FastPerm{Sort.ForwardOrdering,V}) = sortperm(o.vec)
-Base.sortperm{V}(df::AbstractDataFrame, a::Algorithm, o::FastPerm{Sort.ReverseOrdering,V}) = reverse(sortperm(o.vec))
diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
index 3c33fc1505..4ce59b324f 100644
--- a/src/dataframe/dataframe.jl
+++ b/src/dataframe/dataframe.jl
@@ -2,7 +2,7 @@
An AbstractDataFrame that stores a set of named columns
The columns are normally AbstractVectors stored in memory,
-particularly a Vector, DataVector, or PooledDataVector.
+particularly a Vector, NullableVector, or CategoricalVector.
**Constructors**
@@ -30,9 +30,9 @@ Each column in `columns` should be the same length.
**Notes**
-Most of the default constructors convert columns to `DataArrays`. The
+Most of the default constructors convert columns to `NullableArray`. The
base constructor, `DataFrame(columns::Vector{Any},
-names::Vector{Symbol})` does not convert to `DataArrays`.
+names::Vector{Symbol})` does not convert to `NullableArray`.
A `DataFrame` is a lightweight object. As long as columns are not
manipulated, creation of a DataFrame from existing AbstractVectors is
@@ -48,12 +48,12 @@ loops.
```julia
df = DataFrame()
v = ["x","y","z"][rand(1:3, 10)]
-df1 = DataFrame(Any[[1:10], v, rand(10)], [:A, :B, :C]) # columns are Arrays
-df2 = DataFrame(A = 1:10, B = v, C = rand(10)) # columns are DataArrays
+df1 = DataFrame(Any[collect(1:10), v, rand(10)], [:A, :B, :C]) # columns are Arrays
+df2 = DataFrame(A = 1:10, B = v, C = rand(10)) # columns are NullableArrays
dump(df1)
dump(df2)
describe(df2)
-head(df1)
+DataFrames.head(df1)
df1[:A] + df2[:C]
df1[1:4, 1:2]
df1[[:A,:C]]
@@ -102,9 +102,9 @@ function DataFrame(; kwargs...)
return result
end
-function DataFrame(columns::Vector{Any},
- cnames::Vector{Symbol} = gennames(length(columns)))
- return DataFrame(columns, Index(cnames))
+function DataFrame(columns::AbstractVector,
+ cnames::AbstractVector{Symbol} = gennames(length(columns)))
+ return DataFrame(convert(Vector{Any}, columns), Index(convert(Vector{Symbol}, cnames)))
end
@@ -112,7 +112,7 @@ end
function DataFrame(t::Type, nrows::Integer, ncols::Integer)
columns = Array(Any, ncols)
for i in 1:ncols
- columns[i] = DataArray(t, nrows)
+ columns[i] = NullableArray(t, nrows)
end
cnames = gennames(ncols)
return DataFrame(columns, Index(cnames))
@@ -123,19 +123,21 @@ function DataFrame(column_eltypes::Vector, cnames::Vector, nrows::Integer)
p = length(column_eltypes)
columns = Array(Any, p)
for j in 1:p
- columns[j] = DataArray(column_eltypes[j], nrows)
+ columns[j] = NullableArray(column_eltypes[j], nrows)
end
return DataFrame(columns, Index(cnames))
end
-# Initialize an empty DataFrame with specific eltypes and names and whether is pooled data array
-function DataFrame(column_eltypes::Vector{DataType}, cnames::Vector{Symbol}, ispda::Vector{Bool}, nrows::Integer)
+# Initialize an empty DataFrame with specific eltypes and names
+# and whether a nominal array should be created
+function DataFrame(column_eltypes::Vector{DataType}, cnames::Vector{Symbol},
+ nominal::Vector{Bool}, nrows::Integer)
p = length(column_eltypes)
columns = Array(Any, p)
for j in 1:p
- if ispda[j]
- columns[j] = PooledDataArray(column_eltypes[j], nrows)
+ if nominal[j]
+ columns[j] = NullableCategoricalArray(column_eltypes[j], nrows)
else
- columns[j] = DataArray(column_eltypes[j], nrows)
+ columns[j] = NullableArray(column_eltypes[j], nrows)
end
end
return DataFrame(columns, Index(cnames))
@@ -147,7 +149,7 @@ function DataFrame(column_eltypes::Vector, nrows::Integer)
columns = Array(Any, p)
cnames = gennames(p)
for j in 1:p
- columns[j] = DataArray(column_eltypes[j], nrows)
+ columns[j] = NullableArray(column_eltypes[j], nrows)
end
return DataFrame(columns, Index(cnames))
end
@@ -167,8 +169,7 @@ function DataFrame{D <: Associative}(ds::Vector{D}, ks::Vector)
col_eltypes = Type[@compat(Union{}) for _ = 1:length(ks)]
for d in ds
for (i,k) in enumerate(ks)
- # TODO: check for user-defined "NA" values, ala pandas
- if haskey(d, k) && !isna(d[k])
+ if haskey(d, k) && !_isnull(d[k])
col_eltypes[i] = promote_type(col_eltypes[i], typeof(d[k]))
end
end
@@ -179,7 +180,7 @@ function DataFrame{D <: Associative}(ds::Vector{D}, ks::Vector)
df = DataFrame(col_eltypes, ks, length(ds))
for (i,d) in enumerate(ds)
for (j,k) in enumerate(ks)
- df[i,j] = get(d, k, NA)
+ df[i,j] = get(d, k, Nullable())
end
end
@@ -230,7 +231,9 @@ function Base.getindex(df::DataFrame, col_ind::ColumnIndex)
end
# df[MultiColumnIndex] => (Sub)?DataFrame
-function Base.getindex{T <: ColumnIndex}(df::DataFrame, col_inds::AbstractVector{T})
+function Base.getindex{T <: ColumnIndex}(df::DataFrame,
+ col_inds::Union{AbstractVector{T},
+ AbstractVector{Nullable{T}}})
selected_columns = index(df)[col_inds]
new_columns = df.columns[selected_columns]
return DataFrame(new_columns, Index(_names(df)[selected_columns]))
@@ -246,20 +249,29 @@ function Base.getindex(df::DataFrame, row_ind::Real, col_ind::ColumnIndex)
end
# df[SingleRowIndex, MultiColumnIndex] => (Sub)?DataFrame
-function Base.getindex{T <: ColumnIndex}(df::DataFrame, row_ind::Real, col_inds::AbstractVector{T})
+function Base.getindex{T <: ColumnIndex}(df::DataFrame,
+ row_ind::Real,
+ col_inds::Union{AbstractVector{T},
+ AbstractVector{Nullable{T}}})
selected_columns = index(df)[col_inds]
new_columns = Any[dv[[row_ind]] for dv in df.columns[selected_columns]]
return DataFrame(new_columns, Index(_names(df)[selected_columns]))
end
# df[MultiRowIndex, SingleColumnIndex] => (Sub)?AbstractDataVector
-function Base.getindex{T <: Real}(df::DataFrame, row_inds::AbstractVector{T}, col_ind::ColumnIndex)
+function Base.getindex{T <: Real}(df::DataFrame,
+ row_inds::Union{AbstractVector{T}, AbstractVector{Nullable{T}}},
+ col_ind::ColumnIndex)
selected_column = index(df)[col_ind]
return df.columns[selected_column][row_inds]
end
# df[MultiRowIndex, MultiColumnIndex] => (Sub)?DataFrame
-function Base.getindex{R <: Real, T <: ColumnIndex}(df::DataFrame, row_inds::AbstractVector{R}, col_inds::AbstractVector{T})
+function Base.getindex{R <: Real, T <: ColumnIndex}(df::DataFrame,
+ row_inds::Union{AbstractVector{R},
+ AbstractVector{Nullable{R}}},
+ col_inds::Union{AbstractVector{T},
+ AbstractVector{Nullable{T}}})
selected_columns = index(df)[col_inds]
new_columns = Any[dv[row_inds] for dv in df.columns[selected_columns]]
return DataFrame(new_columns, Index(_names(df)[selected_columns]))
@@ -267,13 +279,20 @@ end
# df[:, SingleColumnIndex] => (Sub)?AbstractVector
# df[:, MultiColumnIndex] => (Sub)?DataFrame
-Base.getindex{T<:ColumnIndex}(df::DataFrame, row_inds::Colon, col_inds::@compat(Union{T, AbstractVector{T}})) = df[col_inds]
+Base.getindex{T<:ColumnIndex}(df::DataFrame,
+ row_inds::Colon,
+ col_inds::Union{T, AbstractVector{T},
+ AbstractVector{Nullable{T}}}) =
+ df[col_inds]
# df[SingleRowIndex, :] => (Sub)?DataFrame
Base.getindex(df::DataFrame, row_ind::Real, col_inds::Colon) = df[[row_ind], col_inds]
# df[MultiRowIndex, :] => (Sub)?DataFrame
-function Base.getindex{R<:Real}(df::DataFrame, row_inds::AbstractVector{R}, col_inds::Colon)
+function Base.getindex{R<:Real}(df::DataFrame,
+ row_inds::Union{AbstractVector{R},
+ AbstractVector{Nullable{R}}},
+ col_inds::Colon)
new_columns = Any[dv[row_inds] for dv in df.columns]
return DataFrame(new_columns, copy(index(df)))
end
@@ -344,17 +363,17 @@ function insert_multiple_entries!{T <: Real}(df::DataFrame,
end
end
-upgrade_vector(v::Vector) = DataArray(v, falses(length(v)))
-upgrade_vector(v::Range) = DataArray([v;], falses(length(v)))
-upgrade_vector(v::BitVector) = DataArray(convert(Array{Bool}, v), falses(length(v)))
-upgrade_vector(adv::AbstractDataArray) = adv
+upgrade_vector{T<:Nullable}(v::AbstractArray{T}) = v
+upgrade_vector(v::CategoricalArray) = NullableCategoricalArray(v)
+upgrade_vector(v::AbstractArray) = NullableArray(v)
+
function upgrade_scalar(df::DataFrame, v::AbstractArray)
msg = "setindex!(::DataFrame, ...) only broadcasts scalars, not arrays"
throw(ArgumentError(msg))
end
function upgrade_scalar(df::DataFrame, v::Any)
n = (ncol(df) == 0) ? 1 : nrow(df)
- DataArray(fill(v, n), falses(n))
+ NullableArray(fill(v, n))
end
# df[SingleColumnIndex] = AbstractVector
@@ -365,10 +384,13 @@ function Base.setindex!(df::DataFrame,
end
# df[SingleColumnIndex] = Single Item (EXPANDS TO NROW(DF) if NCOL(DF) > 0)
-function Base.setindex!(df::DataFrame,
- v::Any,
- col_ind::ColumnIndex)
- insert_single_column!(df, upgrade_scalar(df, v), col_ind)
+function Base.setindex!(df::DataFrame, v, col_ind::ColumnIndex)
+ if haskey(index(df), col_ind)
+ fill!(df[col_ind], v)
+ else
+ insert_single_column!(df, upgrade_scalar(df, v), col_ind)
+ end
+ return df
end
# df[MultiColumnIndex] = DataFrame
@@ -397,7 +419,7 @@ function Base.setindex!{T <: ColumnIndex}(df::DataFrame,
col_inds::AbstractVector{T})
dv = upgrade_vector(v)
for col_ind in col_inds
- insert_single_column!(df, dv, col_ind)
+ df[col_ind] = dv
end
return df
end
@@ -411,9 +433,8 @@ end
function Base.setindex!{T <: ColumnIndex}(df::DataFrame,
val::Any,
col_inds::AbstractVector{T})
- dv = upgrade_scalar(df, val)
for col_ind in col_inds
- insert_single_column!(df, dv, col_ind)
+ df[col_ind] = val
end
return df
end
@@ -621,8 +642,20 @@ function Base.insert!(df::DataFrame, col_ind::Int, item::AbstractVector, name::S
insert!(df.columns, col_ind, item)
df
end
-Base.insert!(df::DataFrame, col_ind::Int, item, name::Symbol) =
+
+# FIXME: Needed to work around a crash: JuliaLang/julia#18299
+function Base.insert!(df::DataFrame, col_ind::Int, item::NullableArray, name::Symbol)
+ 0 < col_ind <= ncol(df) + 1 || throw(BoundsError())
+ size(df, 1) == length(item) || size(df, 1) == 0 || error("number of rows does not match")
+
+ insert!(index(df), col_ind, name)
+ insert!(df.columns, col_ind, item)
+ df
+end
+
+function Base.insert!(df::DataFrame, col_ind::Int, item, name::Symbol)
insert!(df, col_ind, upgrade_scalar(df, item), name)
+end
function Base.merge!(df::DataFrame, others::AbstractDataFrame...)
for other in others
@@ -721,9 +754,11 @@ function hcat!(df1::DataFrame, df2::AbstractDataFrame)
return df1
end
-hcat!{T}(df::DataFrame, x::DataVector{T}) = hcat!(df, DataFrame(Any[x]))
-hcat!{T}(df::DataFrame, x::Vector{T}) = hcat!(df, DataFrame(Any[DataArray(x)]))
-hcat!{T}(df::DataFrame, x::T) = hcat!(df, DataFrame(Any[DataArray([x])]))
+hcat!(df::DataFrame, x::CategoricalArray) = hcat!(df, DataFrame(Any[x]))
+hcat!(df::DataFrame, x::NullableCategoricalArray) = hcat!(df, DataFrame(Any[x]))
+hcat!(df::DataFrame, x::NullableVector) = hcat!(df, DataFrame(Any[x]))
+hcat!(df::DataFrame, x::Vector) = hcat!(df, DataFrame(Any[NullableArray(x)]))
+hcat!(df::DataFrame, x) = hcat!(df, DataFrame(Any[NullableArray([x])]))
# hcat! for 1-n arguments
hcat!(df::DataFrame) = df
@@ -739,7 +774,7 @@ Base.hcat(df::DataFrame, x) = hcat!(copy(df), x)
##############################################################################
function nullable!(df::DataFrame, col::ColumnIndex)
- df[col] = DataArray(df[col])
+ df[col] = NullableArray(df[col])
df
end
function nullable!{T <: ColumnIndex}(df::DataFrame, cols::Vector{T})
@@ -755,25 +790,23 @@ end
##
##############################################################################
-pool(a::AbstractVector) = compact(PooledDataArray(a))
-
-function pool!(df::DataFrame, cname::@compat(Union{Integer, Symbol}))
- df[cname] = pool(df[cname])
+function categorical!(df::DataFrame, cname::@compat(Union{Integer, Symbol}), compact::Bool=true)
+ df[cname] = categorical(df[cname], compact)
return
end
-function pool!{T <: @compat(Union{Integer, Symbol})}(df::DataFrame, cnames::Vector{T})
+function categorical!{T <: @compat(Union{Integer, Symbol})}(df::DataFrame, cnames::Vector{T},
+ compact::Bool=true)
for cname in cnames
- df[cname] = pool(df[cname])
+ df[cname] = categorical(df[cname], compact)
end
return
end
-# TODO: Deprecate or change for being too inconsistent with other pool methods
-function pool!(df::DataFrame)
+function categorical!(df::DataFrame, compact::Bool=true)
for i in 1:size(df, 2)
if eltype(df[i]) <: AbstractString
- df[i] = pool(df[i])
+ df[i] = categorical(df[i], compact)
end
end
return
@@ -811,7 +844,7 @@ function _dataframe_from_associative(dnames, d::Associative)
if length(col) != n
throw(ArgumentError("All columns in Dict must have the same length"))
end
- columns[j] = DataArray(col)
+ columns[j] = NullableArray(col)
colnames[j] = Symbol(name)
end
return DataFrame(columns, Index(colnames))
diff --git a/src/dataframe/io.jl b/src/dataframe/io.jl
index c7f76baae6..6a86c2466d 100644
--- a/src/dataframe/io.jl
+++ b/src/dataframe/io.jl
@@ -517,7 +517,7 @@ function builddf(rows::Integer,
values = Array(o.eltypes[j], rows)
end
- missing = falses(rows)
+ missing = fill(false, rows)
is_int = true
is_float = true
is_bool = true
@@ -640,9 +640,9 @@ function builddf(rows::Integer,
end
if o.makefactors && !(is_int || is_float || is_bool)
- columns[j] = PooledDataArray(values, missing)
+ columns[j] = NullableCategoricalArray(values, missing)
else
- columns[j] = DataArray(values, missing)
+ columns[j] = NullableArray(values, missing)
end
end
@@ -801,7 +801,7 @@ function readtable(io::IO,
separator::Char = ',',
quotemark::Vector{Char} = ['"'],
decimal::Char = '.',
- nastrings::Vector = ["", "NA"],
+ nastrings::Vector = ["", "NULL", "NA"],
truestrings::Vector = ["T", "t", "TRUE", "true"],
falsestrings::Vector = ["F", "f", "FALSE", "false"],
makefactors::Bool = false,
@@ -874,10 +874,10 @@ readtable(filename, [keyword options])
* `separator::Char` -- Assume that fields are split by the `separator` character. If not specified, it will be guessed from the filename: `.csv` defaults to `','`, `.tsv` defaults to `'\t'`, `.wsv` defaults to `' '`.
* `quotemark::Vector{Char}` -- Assume that fields contained inside of two `quotemark` characters are quoted, which disables processing of separators and linebreaks. Set to `Char[]` to disable this feature and slightly improve performance. Defaults to `['"']`.
* `decimal::Char` -- Assume that the decimal place in numbers is written using the `decimal` character. Defaults to `'.'`.
-* `nastrings::Vector{String}` -- Translate any of the strings into this vector into an `NA`. Defaults to `["", "NA"]`.
+* `nastrings::Vector{String}` -- Translate any of the strings in this vector into a NULL value. Defaults to `["", "NULL", "NA"]`.
* `truestrings::Vector{String}` -- Translate any of the strings into this vector into a Boolean `true`. Defaults to `["T", "t", "TRUE", "true"]`.
* `falsestrings::Vector{String}` -- Translate any of the strings into this vector into a Boolean `false`. Defaults to `["F", "f", "FALSE", "false"]`.
-* `makefactors::Bool` -- Convert string columns into `PooledDataVector`'s for use as factors. Defaults to `false`.
+* `makefactors::Bool` -- Convert string columns into `CategoricalVector`'s for use as factors. Defaults to `false`.
* `nrows::Int` -- Read only `nrows` from the file. Defaults to `-1`, which indicates that the entire file should be read.
* `names::Vector{Symbol}` -- Use the values in this array as the names for all columns instead of or in lieu of the names in the file's header. Defaults to `[]`, which indicates that the header should be used if present or that numeric names should be invented if there is no header.
* `eltypes::Vector` -- Specify the types of all columns. Defaults to `[]`.
@@ -909,7 +909,7 @@ function readtable(pathname::AbstractString;
separator::Char = getseparator(pathname),
quotemark::Vector{Char} = ['"'],
decimal::Char = '.',
- nastrings::Vector = String["", "NA"],
+ nastrings::Vector = String["", "NULL", "NA"],
truestrings::Vector = String["T", "t", "TRUE", "true"],
falsestrings::Vector = String["F", "f", "FALSE", "false"],
makefactors::Bool = false,
@@ -975,7 +975,7 @@ literals. Parses the string `s` containing delimiter-separated tabular data
argument contains a list of flag characters, which, if present, are equivalent
to supplying named arguments to `readtable` as follows:
-- `f`: `makefactors=true`, convert string columns to `PooledData` columns
+- `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns
- `c`: `allowcomments=true`, ignore lines beginning with `#`
- `H`: `header=false`, do not interpret the first line as column names
"""
@@ -1004,7 +1004,7 @@ separated values (CSV) using `readtable`, just as if it were being loaded from
an external file. The suffix flags `f`, `c`, and `H` are optional. If present,
they are equivalent to supplying named arguments to `readtable` as follows:
-* `f`: `makefactors=true`, convert string columns to `PooledDataArray` columns
+* `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns
* `c`: `allowcomments=true`, ignore lines beginning with `#`
* `H`: `header=false`, do not interpret the first line as column names
@@ -1038,7 +1038,7 @@ character, just as if it were being loaded from an external file. The suffix
flags `f`, `c`, and `H` are optional. If present, they are equivalent to
supplying named arguments to `readtable` as follows:
-* `f`: `makefactors=true`, convert string columns to `PooledDataArray` columns
+* `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns
* `c`: `allowcomments=true`, ignore lines beginning with `#`
* `H`: `header=false`, do not interpret the first line as column names
@@ -1074,7 +1074,7 @@ loaded from an external file. The suffix flags `f`, `c`, and `H` are optional.
If present, they are equivalent to supplying named arguments to `readtable` as
follows:
-* `f`: `makefactors=true`, convert string columns to `PooledDataArray` columns
+* `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns
* `c`: `allowcomments=true`, ignore lines beginning with `#`
* `H`: `header=false`, do not interpret the first line as column names
@@ -1107,7 +1107,7 @@ separated values (TSV) using `readtable`, just as if it were being loaded from
an external file. The suffix flags `f`, `c`, and `H` are optional. If present,
they are equivalent to supplying named arguments to `readtable` as follows:
-* `f`: `makefactors=true`, convert string columns to `PooledDataArray` columns
+* `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns
* `c`: `allowcomments=true`, ignore lines beginning with `#`
* `H`: `header=false`, do not interpret the first line as column names
diff --git a/src/dataframe/sort.jl b/src/dataframe/sort.jl
index 9255557469..e6cd01a88d 100644
--- a/src/dataframe/sort.jl
+++ b/src/dataframe/sort.jl
@@ -12,7 +12,14 @@ end
function Base.sort!(df::DataFrame, a::Base.Sort.Algorithm, o::Base.Sort.Ordering)
p = sortperm(df, a, o)
pp = similar(p)
- for col in columns(df)
+ c = columns(df)
+
+ for (i,col) in enumerate(c)
+ # Check if this column has been sorted already
+ if any(j -> c[j]===col, 1:i-1)
+ continue
+ end
+
copy!(pp,p)
Base.permute!!(col, pp)
end
diff --git a/src/dataframerow/dataframerow.jl b/src/dataframerow/dataframerow.jl
index daf7db86d7..1614e317b6 100644
--- a/src/dataframerow/dataframerow.jl
+++ b/src/dataframerow/dataframerow.jl
@@ -41,7 +41,7 @@ Base.convert(::Type{Array}, r::DataFrameRow) = convert(Array, r.df[r.row,:])
# so that duplicate rows would have the same hash
function Base.hash(r::DataFrameRow, h::UInt)
for col in columns(r.df)
- if isna(col, r.row)
+ if _isnull(col[r.row])
h = hash(false, h)
else
h = hash(true, hash(col[r.row], h))
@@ -50,34 +50,16 @@ function Base.hash(r::DataFrameRow, h::UInt)
return h
end
-# compare two elements in the array
-_isequalelms(a::Array, i::Int, j::Int) = isequal(a[i], a[j])
-
-# compare the two elements in the data array
-function _isequalelms(a::DataArray, i::Int, j::Int)
- if isna(a, i)
- return isna(a, j)
- else
- return !isna(a, j) && isequal(a.data[i], a.data[j])
- end
-end
-
-# compare two elements in the pooled array
-# NOTE assume there are no duplicated elements in the pool
-_isequalelms(a::PooledDataArray, i::Int, j::Int) = isequal(a.refs[i], a.refs[j])
-
# comparison of DataFrame rows
# only the rows of the same DataFrame could be compared
# rows are equal if they have the same values (while the row indices could differ)
+@compat(Base.:(==))(r1::DataFrameRow, r2::DataFrameRow) = isequal(r1, r2)
+
function Base.isequal(r1::DataFrameRow, r2::DataFrameRow)
- if r1.df !== r2.df
- throw(ArgumentError("Comparing rows from different frames not supported"))
- end
- if r1.row == r2.row
- return true
- end
+ r1.df == r2.df || throw(ArgumentError("Comparing rows from different frames not supported"))
+ r1.row == r2.row && return true
for col in columns(r1.df)
- if !_isequalelms(col, r1.row, r2.row)
+ if !isequal(col[r1.row], col[r2.row])
return false
end
end
diff --git a/src/deprecated.jl b/src/deprecated.jl
index 0c7e43c31d..286aa61b93 100644
--- a/src/deprecated.jl
+++ b/src/deprecated.jl
@@ -4,17 +4,13 @@ import Base: @deprecate
@deprecate by(d::AbstractDataFrame, cols, s::Symbol) aggregate(d, cols, eval(s))
@deprecate nullable!(colnames::Array{Symbol,1}, df::AbstractDataFrame) nullable!(df, colnames)
@deprecate nullable!(colnums::Array{Int,1}, df::AbstractDataFrame) nullable!(df, colnums)
+
import Base: keys, values, insert!
@deprecate keys(df::AbstractDataFrame) names(df)
@deprecate values(df::AbstractDataFrame) DataFrames.columns(df)
@deprecate insert!(df::DataFrame, df2::AbstractDataFrame) merge!(df, df2)
-import DataArrays: array, DataArray
-@deprecate array(df::AbstractDataFrame) convert(Array, df)
-@deprecate array(r::DataFrameRow) convert(Array, r)
-if VERSION < v"0.4.0-"
- @deprecate DataArray(df::AbstractDataFrame) convert(DataArray, df)
-end
-@deprecate DataArray(df::AbstractDataFrame, T::DataType) convert(DataArray{T}, df)
-
@deprecate read_rda(args...) FileIO.load(args...)
+
+@deprecate pool categorical
+@deprecate pool! categorical!
diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl
index 50dbc288fc..9caca98f24 100644
--- a/src/groupeddataframe/grouping.jl
+++ b/src/groupeddataframe/grouping.jl
@@ -25,6 +25,41 @@ end
#
# Split
#
+
+function groupsort_indexer(x::AbstractVector, ngroups::Integer, null_last::Bool=false)
+ # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).
+
+ # count group sizes, location 0 for NULL
+ n = length(x)
+ # counts = x.pool
+ counts = fill(0, ngroups + 1)
+ for i = 1:n
+ counts[x[i] + 1] += 1
+ end
+
+ # mark the start of each contiguous group of like-indexed data
+ where = fill(1, ngroups + 1)
+ if null_last
+ for i = 3:ngroups+1
+ where[i] = where[i - 1] + counts[i - 1]
+ end
+ where[1] = where[end] + counts[end]
+ else
+ for i = 2:ngroups+1
+ where[i] = where[i - 1] + counts[i - 1]
+ end
+ end
+
+ # this is our indexer
+ result = fill(0, n)
+ for i = 1:n
+ label = x[i] + 1
+ result[where[label]] = i
+ where[label] += 1
+ end
+ result, where, counts
+end
+
"""
A view of an AbstractDataFrame split into row groups
@@ -35,14 +70,13 @@ groupby(cols)
### Arguments
-* `d` : an AbstractDataFrame
-* `cols` : an
-
-If `d` is not provided, a curried version of groupby is given.
+* `d` : an AbstractDataFrame to split (optional, see [Returns](#returns))
+* `cols` : data frame columns to group by
### Returns
* `::GroupedDataFrame` : a grouped view into `d`
+* `::Function`: a function `x -> groupby(x, cols)` (if `d` is not specified)
### Details
@@ -76,8 +110,8 @@ vcat([g[:b] for g in gd]...)
for g in gd
println(g)
end
-map(d -> mean(d[:c]), gd) # returns a GroupApplied object
-combine(map(d -> mean(d[:c]), gd))
+map(d -> mean(dropnull(d[:c])), gd) # returns a GroupApplied object
+combine(map(d -> mean(dropnull(d[:c])), gd))
df |> groupby(:a) |> [sum, length]
df |> groupby([:a, :b]) |> [sum, length]
```
@@ -88,25 +122,34 @@ function groupby{T}(d::AbstractDataFrame, cols::Vector{T})
## http://wesmckinney.com/blog/?p=489
ncols = length(cols)
- # use the pool trick to get a set of integer references for each unique item
- dv = PooledDataArray(d[cols[ncols]])
- # if there are NAs, add 1 to the refs to avoid underflows in x later
- dv_has_nas = (findfirst(dv.refs, 0) > 0 ? 1 : 0)
- # use UInt32 instead of the PDA's integer size since the number of levels can be high
- x = copy!(similar(dv.refs, UInt32), dv.refs) .+ dv_has_nas
+ # use CategoricalArray to get a set of integer references for each unique item
+ nv = NullableCategoricalArray(d[cols[ncols]])
+ # if there are NULLs, add 1 to the refs to avoid underflows in x later
+ anynulls = (findfirst(nv.refs, 0) > 0 ? 1 : 0)
+ # use UInt32 instead of the original array's integer size since the number of levels can be high
+ x = similar(nv.refs, UInt32)
+ for i = 1:nrow(d)
+ if nv.refs[i] == 0
+ x[i] = 1
+ else
+ x[i] = CategoricalArrays.order(nv.pool)[nv.refs[i]] + anynulls
+ end
+ end
# also compute the number of groups, which is the product of the set lengths
- ngroups = length(dv.pool) + dv_has_nas
+ ngroups = length(levels(nv)) + anynulls
# if there's more than 1 column, do roughly the same thing repeatedly
for j = (ncols - 1):-1:1
- dv = PooledDataArray(d[cols[j]])
- dv_has_nas = (findfirst(dv.refs, 0) > 0 ? 1 : 0)
+ nv = NullableCategoricalArray(d[cols[j]])
+ anynulls = (findfirst(nv.refs, 0) > 0 ? 1 : 0)
for i = 1:nrow(d)
- x[i] += (dv.refs[i] + dv_has_nas- 1) * ngroups
+ if nv.refs[i] != 0
+ x[i] += (CategoricalArrays.order(nv.pool)[nv.refs[i]] + anynulls - 1) * ngroups
+ end
end
- ngroups = ngroups * (length(dv.pool) + dv_has_nas)
+ ngroups = ngroups * (length(levels(nv)) + anynulls)
# TODO if ngroups is really big, shrink it
end
- (idx, starts) = DataArrays.groupsort_indexer(x, ngroups)
+ (idx, starts) = groupsort_indexer(x, ngroups)
# Remove zero-length groupings
starts = _uniqueofsorted(starts)
ends = starts[2:end] - 1
@@ -159,15 +202,14 @@ Not meant to be constructed directly, see `groupby` abnd
provided for a GroupApplied object.
"""
-type GroupApplied
+immutable GroupApplied{T<:AbstractDataFrame}
gd::GroupedDataFrame
- vals::Vector
+ vals::Vector{T}
- function GroupApplied(gd, vals)
- if length(gd) != length(vals)
- error("GroupApplied requires keys and vals be of equal length.")
- end
- new(gd, vals)
+ @compat function (::Type{GroupApplied})(gd::GroupedDataFrame, vals::Vector)
+ length(gd) == length(vals) ||
+ throw(DimensionMismatch("GroupApplied requires keys and vals be of equal length (got $(length(gd)) and $(length(vals)))."))
+ new{eltype(vals)}(gd, vals)
end
end
@@ -178,10 +220,10 @@ end
# map() sweeps along groups
function Base.map(f::Function, gd::GroupedDataFrame)
- GroupApplied(gd, AbstractDataFrame[wrap(f(d)) for d in gd])
+ GroupApplied(gd, [wrap(f(df)) for df in gd])
end
function Base.map(f::Function, ga::GroupApplied)
- GroupApplied(ga.gd, AbstractDataFrame[wrap(f(d)) for d in ga.vals])
+ GroupApplied(ga.gd, [wrap(f(df)) for df in ga.vals])
end
wrap(df::AbstractDataFrame) = df
@@ -209,23 +251,21 @@ combine(ga::GroupApplied)
df = DataFrame(a = repeat([1, 2, 3, 4], outer=[2]),
b = repeat([2, 1], outer=[4]),
c = randn(8))
-combine(map(d -> mean(d[:c]), gd))
+combine(map(d -> mean(dropnull(d[:c])), gd))
```
"""
function combine(ga::GroupApplied)
gd, vals = ga.gd, ga.vals
- # Could be made shorter with a rep(x, lengths) function
- # See JuliaLang/julia#16443
- idx = Vector{Int}(sum(Int[size(val, 1) for val in vals]))
+ valscat = vcat(vals)
+ idx = Vector{Int}(size(valscat, 1))
j = 0
- for i in 1:length(vals)
- n = size(vals[i], 1)
- @inbounds idx[j + (1:n)] = gd.idx[gd.starts[i]]
+ @inbounds for (start, val) in zip(gd.starts, vals)
+ n = size(val, 1)
+ idx[j + (1:n)] = gd.idx[start]
j += n
end
- ret = gd.parent[idx, gd.cols]
- hcat!(ret, vcat(vals))
+ hcat!(gd.parent[idx, gd.cols], valscat)
end
@@ -260,12 +300,14 @@ colwise(sum, groupby(df, :a))
```
"""
-colwise(f::Function, d::AbstractDataFrame) = Any[[f(d[idx])] for idx in 1:size(d, 2)]
+colwise(f::Function, d::AbstractDataFrame) = Any[vcat(f(d[idx])) for idx in 1:size(d, 2)]
colwise(f::Function, gd::GroupedDataFrame) = map(colwise(f), gd)
colwise(f::Function) = x -> colwise(f, x)
colwise(f) = x -> colwise(f, x)
# apply several functions to each column in a DataFrame
-colwise{T<:Function}(fns::Vector{T}, d::AbstractDataFrame) = Any[[f(d[idx])] for f in fns, idx in 1:size(d, 2)][:]
+colwise{T<:Function}(fns::Vector{T}, d::AbstractDataFrame) =
+ reshape(Any[vcat(f(d[idx])) for f in fns, idx in 1:size(d, 2)],
+ length(fns)*size(d, 2))
colwise{T<:Function}(fns::Vector{T}, gd::GroupedDataFrame) = map(colwise(fns), gd)
colwise{T<:Function}(fns::Vector{T}) = x -> colwise(fns, x)
@@ -299,7 +341,7 @@ notation can be used.
### Returns
-* `::DataFrame`
+* `::DataFrame`
### Examples
@@ -308,11 +350,11 @@ df = DataFrame(a = repeat([1, 2, 3, 4], outer=[2]),
b = repeat([2, 1], outer=[4]),
c = randn(8))
by(df, :a, d -> sum(d[:c]))
-by(df, :a, d -> 2 * d[:c])
-by(df, :a, d -> DataFrame(c_sum = sum(d[:c]), c_mean = mean(d[:c])))
-by(df, :a, d -> DataFrame(c = d[:c], c_mean = mean(d[:c])))
+by(df, :a, d -> 2 * dropnull(d[:c]))
+by(df, :a, d -> DataFrame(c_sum = sum(d[:c]), c_mean = mean(dropnull(d[:c]))))
+by(df, :a, d -> DataFrame(c = d[:c], c_mean = mean(dropnull(d[:c]))))
by(df, [:a, :b]) do d
- DataFrame(m = mean(d[:c]), v = var(d[:c]))
+ DataFrame(m = mean(dropnull(d[:c])), v = var(dropnull(d[:c])))
end
```
@@ -347,7 +389,7 @@ same length.
### Returns
-* `::DataFrame`
+* `::DataFrame`
### Examples
@@ -356,9 +398,9 @@ df = DataFrame(a = repeat([1, 2, 3, 4], outer=[2]),
b = repeat([2, 1], outer=[4]),
c = randn(8))
aggregate(df, :a, sum)
-aggregate(df, :a, [sum, mean])
-aggregate(groupby(df, :a), [sum, mean])
-df |> groupby(:a) |> [sum, mean] # equivalent
+aggregate(df, :a, [sum, x->mean(dropnull(x))])
+aggregate(groupby(df, :a), [sum, x->mean(dropnull(x))])
+df |> groupby(:a) |> [sum, x->mean(dropnull(x))] # equivalent
```
"""
@@ -369,7 +411,7 @@ function aggregate{T<:Function}(d::AbstractDataFrame, fs::Vector{T})
end
# Applies aggregate to non-key cols of each SubDataFrame of a GroupedDataFrame
-aggregate(gd::GroupedDataFrame, fs::Function) = aggregate(gd, [fs])
+aggregate(gd::GroupedDataFrame, f::Function) = aggregate(gd, [f])
function aggregate{T<:Function}(gd::GroupedDataFrame, fs::Vector{T})
headers = _makeheaders(fs, _setdiff(_names(gd), gd.cols))
combine(map(x -> _aggregate(without(x, gd.cols), fs, headers), gd))
@@ -386,8 +428,8 @@ end
function _makeheaders{T<:Function}(fs::Vector{T}, cn::Vector{Symbol})
fnames = _fnames(fs) # see other/utils.jl
- scn = [string(x) for x in cn]
- [Symbol("$(colname)_$(fname)") for fname in fnames, colname in scn][:]
+ reshape([Symbol(colname,'_',fname) for fname in fnames, colname in cn],
+ length(fnames)*length(cn))
end
function _aggregate{T<:Function}(d::AbstractDataFrame, fs::Vector{T}, headers::Vector{Symbol})
diff --git a/src/other/index.jl b/src/other/index.jl
index 4ba61fd94d..ef50b3b710 100644
--- a/src/other/index.jl
+++ b/src/other/index.jl
@@ -113,8 +113,10 @@ end
Base.getindex(x::Index, idx::Symbol) = x.lookup[idx]
Base.getindex(x::AbstractIndex, idx::Real) = @compat Int(idx)
-Base.getindex(x::AbstractIndex, idx::AbstractDataVector{Bool}) = getindex(x, convert(Array, idx, false))
-Base.getindex{T}(x::AbstractIndex, idx::AbstractDataVector{T}) = getindex(x, dropna(idx))
+Base.getindex(x::AbstractIndex, idx::AbstractVector{Nullable{Bool}}) =
+ getindex(x, convert(Vector{Bool}, idx, false))
+Base.getindex{T<:Nullable}(x::AbstractIndex, idx::AbstractVector{T}) =
+ getindex(x, dropnull(idx))
Base.getindex(x::AbstractIndex, idx::AbstractVector{Bool}) = find(idx)
Base.getindex(x::AbstractIndex, idx::Range) = [idx;]
Base.getindex{T <: Real}(x::AbstractIndex, idx::AbstractVector{T}) = convert(Vector{Int}, idx)
diff --git a/src/other/utils.jl b/src/other/utils.jl
index 7227f7027d..a0ceef879f 100644
--- a/src/other/utils.jl
+++ b/src/other/utils.jl
@@ -54,7 +54,7 @@ function make_unique(names::Vector{Symbol}; allow_duplicates=true)
name = names[i]
in(name, seen) ? push!(dups, i) : push!(seen, name)
end
-
+
if !allow_duplicates && length(dups) > 0
d = unique(names[dups])
msg = """Duplicate variable names: $d.
@@ -99,50 +99,55 @@ function gennames(n::Integer)
return res
end
+
#' @description
#'
-#' Count the number of missing values in an Array.
+#' Count the number of null values in an array.
#'
-#' NOTE: This function always returns 0.
+#' @field a::AbstractArray The array whose null values are to be counted.
#'
-#' @field a::Array The Array whose missing values are to be counted.
-#'
-#' @returns count::Int The number of missing values in `a`.
+#' @returns count::Int The number of null values in `a`.
#'
#' @examples
#'
-#' DataFrames.countna([1, 2, 3])
-countna(a::Array) = 0
+#' DataFrames.countnull([1, 2, 3])
+function countnull(a::AbstractArray)
+ res = 0
+ for x in a
+ res += _isnull(x)
+ end
+ return res
+end
#' @description
#'
-#' Count the number of missing values in a DataArray.
+#' Count the number of null values in a NullableArray.
#'
-#' @field da::DataArray The DataArray whose missing values are to be counted.
+#' @field a::NullableArray The NullableArray whose null values are to be counted.
#'
-#' @returns count::Int The number of missing values in `a`.
+#' @returns count::Int The number of null values in `a`.
#'
#' @examples
#'
-#' DataFrames.countna(@data([1, 2, 3]))
-countna(da::DataArray) = sum(da.na)
+#' DataFrames.countnull(NullableArray([1, 2, 3]))
+countnull(a::NullableArray) = sum(a.isnull)
#' @description
#'
-#' Count the number of missing values in a PooledDataArray.
+#' Count the number of null values in a CategoricalArray.
#'
-#' @field pda::PooledDataArray The PooledDataArray whose missing values
+#' @field a::CategoricalArray The CategoricalArray whose null values
#' are to be counted.
#'
-#' @returns count::Int The number of missing values in `a`.
+#' @returns count::Int The number of null values in `a`.
#'
#' @examples
#'
-#' DataFrames.countna(@pdata([1, 2, 3]))
-function countna(da::PooledDataArray)
+#' DataFrames.countnull(CategoricalArray([1, 2, 3]))
+function countnull(a::CategoricalArray)
res = 0
- for i in 1:length(da)
- res += da.refs[i] == 0
+ for x in a.refs
+ res += x == 0
end
return res
end
@@ -193,3 +198,6 @@ function _fnames{T<:Function}(fs::Vector{T})
end
names
end
+
+_isnull(x::Any) = false
+_isnull(x::Nullable) = isnull(x)
diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl
index 095ea5da0d..47e7b97434 100644
--- a/src/statsmodels/contrasts.jl
+++ b/src/statsmodels/contrasts.jl
@@ -140,19 +140,21 @@ end
# Methods for constructing ContrastsMatrix from data. These are called in
# ModelFrame constructor and setcontrasts!.
-# TODO: add methods for new categorical types
-
-ContrastsMatrix(C::AbstractContrasts, v::PooledDataArray) =
+ContrastsMatrix(C::AbstractContrasts,
+ v::Union{CategoricalArray, NullableCategoricalArray}) =
ContrastsMatrix(C, levels(v))
-ContrastsMatrix{C <: AbstractContrasts}(c::Type{C}, col::PooledDataArray) =
+ContrastsMatrix{C <: AbstractContrasts}(c::Type{C},
+ col::Union{CategoricalArray, NullableCategoricalArray}) =
throw(ArgumentError("contrast types must be instantiated (use $c() instead of $c)"))
+
# given an existing ContrastsMatrix, check that all of the levels present in the
# data are present in the contrasts. Note that this behavior is different from the
# ContrastsMatrix constructor, which requires that the levels be exactly the same.
# This method exists to support things like `predict` that can operate on new data
# which may contain only a subset of the original data's levels. Checking here
# (instead of in `modelmat_cols`) allows an informative error message.
-function ContrastsMatrix(c::ContrastsMatrix, col::PooledDataArray)
+function ContrastsMatrix(c::ContrastsMatrix,
+ col::Union{CategoricalArray, NullableCategoricalArray})
if !isempty(setdiff(levels(col), c.levels))
throw(ArgumentError("there are levels in data that are not in ContrastsMatrix: " *
"$(setdiff(levels(col), c.levels))" *
@@ -171,7 +173,8 @@ nullify(x::Nullable) = x
nullify(x) = Nullable(x)
# Making a contrast type T only requires that there be a method for
-# contrasts_matrix(T, v::PooledDataArray). The rest is boilerplate.
+# contrasts_matrix(T, v::Union{CategoricalArray, NullableCategoricalArray}).
+# The rest is boilerplate.
for contrastType in [:DummyCoding, :EffectsCoding, :HelmertCoding]
@eval begin
type $contrastType <: AbstractContrasts
diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl
index 40f1cdb949..79777a5cab 100644
--- a/src/statsmodels/formula.jl
+++ b/src/statsmodels/formula.jl
@@ -40,6 +40,8 @@ type Terms
intercept::Bool # is there an intercept column in the model matrix?
end
+Base.:(==)(t1::Terms, t2::Terms) = all(getfield(t1, f)==getfield(t2, f) for f in fieldnames(t1))
+
type ModelFrame
df::AbstractDataFrame
terms::Terms
@@ -85,19 +87,26 @@ function dospecials(ex::Expr)
if !(a1 in specials) return ex end
excp = copy(ex)
excp.args = vcat(a1,map(dospecials, ex.args[2:end]))
- if a1 != :* return excp end
- aa = excp.args
- a2 = aa[2]
- a3 = aa[3]
- if length(aa) > 3
- excp.args = vcat(a1, aa[3:end])
- a3 = dospecials(excp)
+ if a1 == :-
+ a2, a3 = excp.args[2:3]
+ a3 == 1 || error("invalid expression $ex; subtraction only supported for -1")
+ return :($a2 + -1)
+ elseif a1 == :*
+ aa = excp.args
+ a2 = aa[2]
+ a3 = aa[3]
+ if length(aa) > 3
+ excp.args = vcat(a1, aa[3:end])
+ a3 = dospecials(excp)
+ end
+ ## this order of expansion gives the R-style ordering of interaction
+ ## terms (after sorting in increasing interaction order) for higher-
+ ## order interaction terms (e.g. x1 * x2 * x3 should expand to x1 +
+ ## x2 + x3 + x1&x2 + x1&x3 + x2&x3 + x1&x2&x3)
+ :($a2 + $a2 & $a3 + $a3)
+ else
+ excp
end
- ## this order of expansion gives the R-style ordering of interaction
- ## terms (after sorting in increasing interaction order) for higher-
- ## order interaction terms (e.g. x1 * x2 * x3 should expand to x1 +
- ## x2 + x3 + x1&x2 + x1&x3 + x2&x3 + x1&x2&x3)
- :($a2 + $a2 & $a3 + $a3)
end
dospecials(a::Any) = a
@@ -216,27 +225,16 @@ function Terms(f::Formula)
Terms(tt, ev, facs, non_redundants, oo, haslhs, !any(noint))
end
-## Default NA handler. Others can be added as keyword arguments
-function na_omit(df::DataFrame)
+## Default NULL handler. Others can be added as keyword arguments
+function null_omit(df::DataFrame)
cc = complete_cases(df)
df[cc,:], cc
end
-## Trim the pool field of da to only those levels that occur in the refs
-function dropunusedlevels!(da::PooledDataArray)
- rr = da.refs
- uu = unique(rr)
- length(uu) == length(da.pool) && return da
- T = eltype(rr)
- su = sort!(uu)
- dict = Dict(zip(su, one(T):convert(T, length(uu))))
- da.refs = map(x -> dict[x], rr)
- da.pool = da.pool[uu]
- da
-end
-dropunusedlevels!(x) = x
+_droplevels!(x::Any) = x
+_droplevels!(x::Union{CategoricalArray, NullableCategoricalArray}) = droplevels!(x)
-is_categorical(::PooledDataArray) = true
+is_categorical(::Union{CategoricalArray, NullableCategoricalArray}) = true
is_categorical(::Any) = false
## Check for non-redundancy of columns. For instance, if x is a factor with two
@@ -285,16 +283,11 @@ end
const DEFAULT_CONTRASTS = DummyCoding
-function ModelFrame(trms::Terms, d::AbstractDataFrame;
- contrasts::Dict = Dict())
- df, msng = na_omit(DataFrame(map(x -> d[x], trms.eterms)))
- names!(df, convert(Vector{Symbol}, map(string, trms.eterms)))
- for c in eachcol(df) dropunusedlevels!(c[2]) end
-
- ## Set up contrasts:
- ## Combine actual DF columns and contrast types if necessary to compute the
- ## actual contrasts matrices, levels, and term names (using DummyCoding
- ## as the default)
+## Set up contrasts:
+## Combine actual DF columns and contrast types if necessary to compute the
+## actual contrasts matrices, levels, and term names (using DummyCoding
+## as the default)
+function evalcontrasts(df::AbstractDataFrame, contrasts::Dict = Dict())
evaledContrasts = Dict()
for (term, col) in eachcol(df)
is_categorical(col) || continue
@@ -303,6 +296,16 @@ function ModelFrame(trms::Terms, d::AbstractDataFrame;
DEFAULT_CONTRASTS(),
col)
end
+ return evaledContrasts
+end
+
+function ModelFrame(trms::Terms, d::AbstractDataFrame;
+ contrasts::Dict = Dict())
+ df, msng = null_omit(DataFrame(map(x -> d[x], trms.eterms)))
+ names!(df, convert(Vector{Symbol}, map(string, trms.eterms)))
+ for c in eachcol(df) _droplevels!(c[2]) end
+
+ evaledContrasts = evalcontrasts(df, contrasts)
## Check for non-redundant terms, modifying terms in place
check_non_redundancy!(trms, df)
@@ -310,6 +313,7 @@ function ModelFrame(trms::Terms, d::AbstractDataFrame;
ModelFrame(df, trms, msng, evaledContrasts)
end
+ModelFrame(df::AbstractDataFrame, term::Terms, msng::BitArray) = ModelFrame(df, term, msng, evalcontrasts(df))
ModelFrame(f::Formula, d::AbstractDataFrame; kwargs...) = ModelFrame(Terms(f), d; kwargs...)
ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), d; kwargs...)
@@ -348,8 +352,11 @@ function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, name::Symbol, mf::Mode
end
end
-modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::DataVector) = convert(T, reshape(v.data, length(v), 1))
-modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::Vector) = convert(T, reshape(v, length(v), 1))
+modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::AbstractVector) =
+ convert(T, reshape(v, length(v), 1))
+# FIXME: this inefficient method should not be needed, cf. JuliaLang/julia#18264
+modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::NullableVector) =
+ convert(T, Matrix(reshape(v, length(v), 1)))
"""
modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix)
@@ -357,16 +364,21 @@ modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::Vector) = convert(T, reshape
Construct `ModelMatrix` columns of type `T` based on specified contrasts, ensuring that
levels align properly.
"""
-function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix)
+function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T},
+ v::Union{CategoricalVector, NullableCategoricalVector},
+ contrast::ContrastsMatrix)
## make sure the levels of the contrast matrix and the categorical data
## are the same by constructing a re-indexing vector. Indexing into
## reindex with v.refs will give the corresponding row number of the
## contrast matrix
reindex = [findfirst(contrast.levels, l) for l in levels(v)]
contrastmatrix = convert(T, contrast.matrix)
- return contrastmatrix[reindex[v.refs], :]
+ return indexrows(contrastmatrix, reindex[v.refs])
end
+indexrows(m::SparseMatrixCSC, ind::Vector{Int}) = m'[:, ind]'
+indexrows(m::AbstractMatrix, ind::Vector{Int}) = m[ind, :]
+
"""
expandcols{T<:AbstractFloatMatrix}(trm::Vector{T})
Create pairwise products of columns from a vector of matrices
@@ -421,7 +433,6 @@ function dropresponse!(trms::Terms)
end
end
-
"""
ModelMatrix{T<:AbstractFloatMatrix}(mf::ModelFrame)
Create a `ModelMatrix` of type `T` (default `Matrix{Float64}`) from the
@@ -503,7 +514,8 @@ ModelMatrix(mf::ModelFrame) = ModelMatrix{Matrix{Float64}}(mf)
termnames(term::Symbol, col)
Returns a vector of strings with the names of the coefficients
associated with a term. If the column corresponding to the term
-is not a `PooledDataArray` a one-element vector is returned.
+is not a `CategoricalArray` or `NullableCategoricalArray`,
+a one-element vector is returned.
"""
termnames(term::Symbol, col) = [string(term)]
function termnames(term::Symbol, mf::ModelFrame; non_redundant::Bool = false)
diff --git a/src/statsmodels/statsmodel.jl b/src/statsmodels/statsmodel.jl
index 3424dc2c97..3dd5768d35 100644
--- a/src/statsmodels/statsmodel.jl
+++ b/src/statsmodels/statsmodel.jl
@@ -62,7 +62,7 @@ typealias DataFrameModels @compat(Union{DataFrameStatisticalModel, DataFrameRegr
@delegate DataFrameModels.model [StatsBase.coef, StatsBase.confint,
StatsBase.deviance, StatsBase.nulldeviance,
StatsBase.loglikelihood, StatsBase.nullloglikelihood,
- StatsBase.df, StatsBase.df_residual, StatsBase.nobs,
+ StatsBase.dof, StatsBase.dof_residual, StatsBase.nobs,
StatsBase.stderr, StatsBase.vcov]
@delegate DataFrameRegressionModel.model [StatsBase.residuals, StatsBase.model_response,
StatsBase.predict, StatsBase.predict!]
@@ -81,7 +81,7 @@ function StatsBase.predict(mm::DataFrameRegressionModel, df::AbstractDataFrame;
mf = ModelFrame(newTerms, df; contrasts = mm.mf.contrasts)
newX = ModelMatrix(mf).m
yp = predict(mm, newX; kwargs...)
- out = DataArray(eltype(yp), size(df, 1))
+ out = NullableArray(eltype(yp), size(df, 1))
out[mf.msng] = yp
return(out)
end
diff --git a/test/REQUIRE b/test/REQUIRE
index 22fc9ebb18..84bc366b4d 100644
--- a/test/REQUIRE
+++ b/test/REQUIRE
@@ -2,3 +2,4 @@ Compat 0.9.0
DataStructures
RDatasets # can be removed when deprecated.jl doesn't test read_rda anymore
RData
+LaTeXStrings
diff --git a/test/cat.jl b/test/cat.jl
index af45afd77b..1ec8cc2b4a 100644
--- a/test/cat.jl
+++ b/test/cat.jl
@@ -6,13 +6,13 @@ module TestCat
# hcat
#
- dvint = @data([1, 2, NA, 4])
- dvstr = @data(["one", "two", NA, "four"])
+ nvint = NullableArray(Nullable{Int}[1, 2, Nullable(), 4])
+ nvstr = NullableArray(Nullable{String}["one", "two", Nullable(), "four"])
- df2 = DataFrame(Any[dvint, dvstr])
- df3 = DataFrame(Any[dvint])
+ df2 = DataFrame(Any[nvint, nvstr])
+ df3 = DataFrame(Any[nvint])
df4 = convert(DataFrame, [1:4 1:4])
- df5 = DataFrame(Any[@data([1,2,3,4]), dvstr])
+ df5 = DataFrame(Any[NullableArray([1,2,3,4]), nvstr])
dfh = hcat(df3, df4)
@test size(dfh, 2) == 3
@@ -38,18 +38,40 @@ module TestCat
# Assignment of rows
df[1, :] = df[1, :]
df[1:2, :] = df[1:2, :]
+ df[[true,false,false,true], :] = df[2:3, :]
- # Broadcasting assignment of rows
+ # Scalar broadcasting assignment of rows
df[1, :] = 1
+ df[1:2, :] = 1
+ df[[true,false,false,true], :] = 3
+
+ # Vector broadcasting assignment of rows
+ df[1:2, :] = [2,3]
+ df[[true,false,false,true], :] = [2,3]
# Assignment of columns
df[1] = zeros(4)
+ df[:, 2] = ones(4)
# Broadcasting assignment of columns
df[:, 1] = 1
df[1] = 3
df[:x3] = 2
+ # assignment of subframes
+ df[1, 1:2] = df[2, 2:3]
+ df[1:2, 1:2] = df[2:3, 2:3]
+ df[[true,false,false,true], 2:3] = df[1:2,1:2]
+
+ # scalar broadcasting assignment of subframes
+ df[1, 1:2] = 3
+ df[1:2, 1:2] = 3
+ df[[true,false,false,true], 2:3] = 3
+
+ # vector broadcasting assignment of subframes
+ df[1:2, 1:2] = [3,2]
+ df[[true,false,false,true], 2:3] = [2,3]
+
vcat([])
vcat(null_df)
vcat(null_df, null_df)
@@ -78,37 +100,56 @@ module TestCat
dfr = vcat(df2, df3)
@test size(dfr) == (8,2)
@test names(df2) == names(dfr)
- @test isna(dfr[8,:x2])
+ @test isnull(dfr[8,:x2])
# Eltype promotion
- @test eltypes(vcat(DataFrame(a = [1]), DataFrame(a = [2.1]))) == [Float64]
- @test eltypes(vcat(DataFrame(a = [NA]), DataFrame(a = [2.1]))) == [Float64]
+ # Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T}
+ if VERSION >= v"0.5.0-dev"
+ @test eltypes(vcat(DataFrame(a = [1]), DataFrame(a = [2.1]))) == [Nullable{Float64}]
+ @test eltypes(vcat(DataFrame(a = NullableArray(Int, 1)), DataFrame(a = [2.1]))) == [Nullable{Float64}]
+ else
+ @test eltypes(vcat(DataFrame(a = [1]), DataFrame(a = [2.1]))) == [Nullable{Any}]
+ @test eltypes(vcat(DataFrame(a = NullableArray(Int, 1)), DataFrame(a = [2.1]))) == [Nullable{Any}]
+ end
# Minimal container type promotion
- dfa = DataFrame(a = @pdata([1, 2, 2]))
- dfb = DataFrame(a = @pdata([2, 3, 4]))
- dfc = DataFrame(a = @data([2, 3, 4]))
+ dfa = DataFrame(a = CategoricalArray([1, 2, 2]))
+ dfb = DataFrame(a = CategoricalArray([2, 3, 4]))
+ dfc = DataFrame(a = NullableArray([2, 3, 4]))
dfd = DataFrame(Any[2:4], [:a])
- @test vcat(dfa, dfb)[:a] == @pdata([1, 2, 2, 2, 3, 4])
- @test vcat(dfa, dfc)[:a] == @pdata([1, 2, 2, 2, 3, 4])
+ dfe = DataFrame(b = CategoricalArray([2, 3, 4]))
+ dfab = vcat(dfa, dfb)
+ dfac = vcat(dfa, dfc)
+ dfabcd = vcat(dfa, dfc, dfe)
+ @test isequal(dfab[:a], Nullable{Int}[1, 2, 2, 2, 3, 4])
+ @test isequal(dfac[:a], Nullable{Int}[1, 2, 2, 2, 3, 4])
+ @test isa(dfab[:a], NullableCategoricalVector{Int})
+ @test isa(dfabcd[:a], NullableCategoricalVector{Int})
+ @test isa(dfabcd[:b], NullableCategoricalVector{Int})
+ # Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T}
+ if VERSION >= v"0.5.0-dev"
+ @test isa(dfac[:a], NullableCategoricalVector{Int})
+ else
+ @test isa(dfac[:a], NullableCategoricalVector{Any})
+ end
# ^^ container may flip if container promotion happens in Base/DataArrays
dc = vcat(dfd, dfc)
- @test vcat(dfc, dfd) == dc
+ @test isequal(vcat(dfc, dfd), dc)
# Zero-row DataFrames
dfc0 = similar(dfc, 0)
- @test vcat(dfd, dfc0, dfc) == dc
+ @test isequal(vcat(dfd, dfc0, dfc), dc)
@test eltypes(vcat(dfd, dfc0)) == eltypes(dc)
# Missing columns
rename!(dfd, :a, :b)
- dfda = DataFrame(b = @data([2, 3, 4, NA, NA, NA]),
- a = @pdata([NA, NA, NA, 1, 2, 2]))
+ dfda = DataFrame(b = NullableArray(Nullable{Int}[2, 3, 4, Nullable(), Nullable(), Nullable()]),
+ a = NullableCategoricalVector(Nullable{Int}[Nullable(), Nullable(), Nullable(), 1, 2, 2]))
@test isequal(vcat(dfd, dfa), dfda)
# Alignment
@test isequal(vcat(dfda, dfd, dfa), vcat(dfda, dfda))
# vcat should be able to concatenate different implementations of AbstractDataFrame (PR #944)
- @test vcat(sub(DataFrame(A=1:3),2),DataFrame(A=4:5)) == DataFrame(A=[2,4,5])
+ @test isequal(vcat(sub(DataFrame(A=1:3),2),DataFrame(A=4:5)), DataFrame(A=[2,4,5]))
end
diff --git a/test/constructors.jl b/test/constructors.jl
index cba2e4eeed..600e067e25 100644
--- a/test/constructors.jl
+++ b/test/constructors.jl
@@ -10,27 +10,23 @@ module TestConstructors
@test isequal(df.columns, Any[])
@test isequal(df.colindex, Index())
- df = DataFrame(Any[data(zeros(3)), data(ones(3))],
+ df = DataFrame(Any[NullableCategoricalVector(zeros(3)),
+ NullableCategoricalVector(ones(3))],
Index([:x1, :x2]))
@test size(df, 1) == 3
@test size(df, 2) == 2
- @test isequal(df,
- DataFrame(Any[data(zeros(3)), data(ones(3))]))
- @test isequal(df,
- DataFrame(x1 = [0.0, 0.0, 0.0],
- x2 = [1.0, 1.0, 1.0]))
+ @test isequal(df, DataFrame(Any[NullableCategoricalVector(zeros(3)),
+ NullableCategoricalVector(ones(3))]))
+ @test isequal(df, DataFrame(x1 = [0.0, 0.0, 0.0],
+ x2 = [1.0, 1.0, 1.0]))
df2 = convert(DataFrame, [0.0 1.0;
0.0 1.0;
0.0 1.0])
names!(df2, [:x1, :x2])
- @test isequal(df, df2)
-
- @test isequal(df,
- convert(DataFrame, [0.0 1.0;
- 0.0 1.0;
- 0.0 1.0]))
+ @test isequal(df[:x1], NullableArray(df2[:x1]))
+ @test isequal(df[:x2], NullableArray(df2[:x2]))
@test isequal(df, DataFrame(x1 = [0.0, 0.0, 0.0],
x2 = [1.0, 1.0, 1.0]))
@@ -40,15 +36,12 @@ module TestConstructors
df = DataFrame(Int, 2, 2)
@test size(df) == (2, 2)
- @test all(eltypes(df) .== [Int, Int])
+ @test eltypes(df) == [Nullable{Int}, Nullable{Int}]
df = DataFrame([Int, Float64], [:x1, :x2], 2)
@test size(df) == (2, 2)
- @test all(eltypes(df) .== Any[Int, Float64])
+ @test eltypes(df) == [Nullable{Int}, Nullable{Float64}]
@test isequal(df, DataFrame([Int, Float64], 2))
-
-
-
end
diff --git a/test/contrasts.jl b/test/contrasts.jl
index 1ff2fed934..0a6b76671b 100644
--- a/test/contrasts.jl
+++ b/test/contrasts.jl
@@ -4,7 +4,7 @@ using Base.Test
using DataFrames
-d = DataFrame(x = @pdata( [:a, :b, :c, :a, :a, :b] ))
+d = DataFrame(x = CategoricalVector([:a, :b, :c, :a, :a, :b]))
mf = ModelFrame(Formula(nothing, :x), d)
@@ -75,7 +75,7 @@ setcontrasts!(mf, x = HelmertCoding())
@test_throws ArgumentError setcontrasts!(mf, x = EffectsCoding(levels = ["a", "b", "c"]))
# Missing data is handled gracefully, dropping columns when a level is lost
-d[3, :x] = NA
+d[3, :x] = Nullable()
mf_missing = ModelFrame(Formula(nothing, :x), d, contrasts = Dict(:x => EffectsCoding()))
@test ModelMatrix(mf_missing).m == [1 -1
1 1
diff --git a/test/conversions.jl b/test/conversions.jl
index adf1067c69..1a607b2cac 100644
--- a/test/conversions.jl
+++ b/test/conversions.jl
@@ -7,14 +7,17 @@ module TestConversions
df[:A] = 1:5
df[:B] = [:A, :B, :C, :D, :E]
@test isa(convert(Array, df), Matrix{Any})
- @test convert(Array, df) == convert(Array, convert(DataArray, df))
+ @test convert(Array, df) == convert(Array, convert(NullableArray, df))
@test isa(convert(Array{Any}, df), Matrix{Any})
df = DataFrame()
df[:A] = 1:5
df[:B] = 1.0:5.0
- @test isa(convert(Array, df), Matrix{Real})
- @test convert(Array, df) == convert(Array, convert(DataArray, df))
+ # Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T}
+ if VERSION >= v"0.5.0-dev"
+ @test isa(convert(Array, df), Matrix{Float64})
+ end
+ @test convert(Array, df) == convert(Array, convert(NullableArray, df))
@test isa(convert(Array{Any}, df), Matrix{Any})
@test isa(convert(Array{Float64}, df), Matrix{Float64})
@@ -25,24 +28,24 @@ module TestConversions
aa = convert(Array{Any}, df)
ai = convert(Array{Int}, df)
@test isa(a, Matrix{Float64})
- @test a == convert(Array, convert(DataArray, df))
+ @test a == convert(Array, convert(NullableArray, df))
@test a == convert(Matrix, df)
@test isa(aa, Matrix{Any})
@test aa == convert(Matrix{Any}, df)
@test isa(ai, Matrix{Int})
@test ai == convert(Matrix{Int}, df)
- df[1,1] = NA
+ df[1,1] = Nullable()
@test_throws ErrorException convert(Array, df)
- da = convert(DataArray, df)
- daa = convert(DataArray{Any}, df)
- dai = convert(DataArray{Int}, df)
- @test isa(da, DataMatrix{Float64})
- @test isequal(da, convert(DataMatrix, df))
- @test isa(daa, DataMatrix{Any})
- @test isequal(daa, convert(DataMatrix{Any}, df))
- @test isa(dai, DataMatrix{Int})
- @test isequal(dai, convert(DataMatrix{Int}, df))
+ na = convert(NullableArray, df)
+ naa = convert(NullableArray{Any}, df)
+ nai = convert(NullableArray{Int}, df)
+ @test isa(na, NullableMatrix{Float64})
+ @test isequal(na, convert(NullableMatrix, df))
+ @test isa(naa, NullableMatrix{Any})
+ @test isequal(naa, convert(NullableMatrix{Any}, df))
+ @test isa(nai, NullableMatrix{Int})
+ @test isequal(nai, convert(NullableMatrix{Int}, df))
a = [1.0,2.0]
b = [-0.1,3]
@@ -52,25 +55,25 @@ module TestConversions
df = convert(DataFrame,di)
@test isa(df,DataFrame)
@test names(df) == Symbol[x for x in sort(collect(keys(di)))]
- @test df[:a] == a
- @test df[:b] == b
- @test df[:c] == c
+ @test isequal(df[:a], NullableArray(a))
+ @test isequal(df[:b], NullableArray(b))
+ @test isequal(df[:c], NullableArray(c))
od = OrderedDict("c"=>c, "a"=>a, "b"=>b)
df = convert(DataFrame,od)
@test isa(df, DataFrame)
@test names(df) == Symbol[x for x in keys(od)]
- @test df[:a] == a
- @test df[:b] == b
- @test df[:c] == c
+ @test isequal(df[:a], NullableArray(a))
+ @test isequal(df[:b], NullableArray(b))
+ @test isequal(df[:c], NullableArray(c))
sd = SortedDict("c"=>c, "a"=>a, "b"=>b)
df = convert(DataFrame,sd)
@test isa(df, DataFrame)
@test names(df) == Symbol[x for x in keys(sd)]
- @test df[:a] == a
- @test df[:b] == b
- @test df[:c] == c
+ @test isequal(df[:a], NullableArray(a))
+ @test isequal(df[:b], NullableArray(b))
+ @test isequal(df[:c], NullableArray(c))
a = [1.0]
di = Dict("a"=>a, "b"=>b, "c"=>c)
diff --git a/test/data.jl b/test/data.jl
index d0fca036fe..1fc3f217cb 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -4,24 +4,24 @@ module TestData
using DataFrames
using Compat
- #test_group("DataVector creation")
- dvint = @data([1, 2, NA, 4])
- dvint2 = data([5:8;])
- dvint3 = data(5:8)
- dvflt = @data([1.0, 2, NA, 4])
- dvstr = @data(["one", "two", NA, "four"])
- dvdict = DataArray(Dict, 4) # for issue #199
+ #test_group("NullableArray creation")
+ nvint = NullableArray(Nullable{Int}[1, 2, Nullable(), 4])
+ nvint2 = NullableArray(5:8)
+ nvint3 = NullableArray(5:8)
+ nvflt = NullableArray(Nullable{Float64}[1.0, 2.0, Nullable(), 4.0])
+ nvstr = NullableArray(Nullable{Compat.ASCIIString}["one", "two", Nullable(), "four"])
+ dvdict = NullableArray(Dict, 4) # for issue #199
#test_group("constructors")
- df1 = DataFrame(Any[dvint, dvstr], [:Ints, :Strs])
- df2 = DataFrame(Any[dvint, dvstr])
- df3 = DataFrame(Any[dvint])
+ df1 = DataFrame(Any[nvint, nvstr], [:Ints, :Strs])
+ df2 = DataFrame(Any[nvint, nvstr])
+ df3 = DataFrame(Any[nvint])
df4 = convert(DataFrame, [1:4 1:4])
- df5 = DataFrame(Any[@data([1,2,3,4]), dvstr])
- df6 = DataFrame(Any[dvint, dvint, dvstr], [:A, :B, :C])
- df7 = DataFrame(x = dvint, y = dvstr)
+ df5 = DataFrame(Any[NullableArray([1,2,3,4]), nvstr])
+ df6 = DataFrame(Any[nvint, nvint, nvstr], [:A, :B, :C])
+ df7 = DataFrame(x = nvint, y = nvstr)
@test size(df7) == (4, 2)
- @test isequal(df7[:x], dvint)
+ @test isequal(df7[:x], nvint)
#test_group("description functions")
@test size(df6, 1) == 4
@@ -31,10 +31,10 @@ module TestData
@test names(df7) == [:x, :y]
#test_group("ref")
- @test df6[2, 3] == "two"
- @test isna(df6[3, 3])
- @test df6[2, :C] == "two"
- @test isequal(df6[:B], dvint)
+ @test isequal(df6[2, 3], Nullable("two"))
+ @test isnull(df6[3, 3])
+ @test isequal(df6[2, :C], Nullable("two"))
+ @test isequal(df6[:B], nvint)
@test size(df6[[2,3]], 2) == 2
@test size(df6[2,:], 1) == 1
@test size(df6[[1, 3], [1, 3]]) == (2, 2)
@@ -43,17 +43,17 @@ module TestData
# lots more to do
#test_group("assign")
- df6[3] = @data(["un", "deux", "troix", "quatre"])
- @test df6[1, 3] == "un"
+ df6[3] = NullableArray(["un", "deux", "troix", "quatre"])
+ @test isequal(df6[1, 3], Nullable("un"))
df6[:B] = [4, 3, 2, 1]
- @test df6[1,2] == 4
+ @test isequal(df6[1,2], Nullable(4))
df6[:D] = [true, false, true, false]
- @test df6[1,4] == true
+ @test isequal(df6[1,4], Nullable(true))
delete!(df6, :D)
@test names(df6) == [:A, :B, :C]
@test size(df6, 2) == 3
- #test_group("NA handling")
+ #test_group("null handling")
@test nrow(df5[complete_cases(df5), :]) == 3
#test_context("SubDataFrames")
@@ -68,7 +68,7 @@ module TestData
@test size(sdf6d) == (2,1)
#test_group("ref")
- @test sdf6a[1,2] == 4
+ @test isequal(sdf6a[1,2], Nullable(4))
#test_context("Within")
#test_group("Associative")
@@ -77,35 +77,37 @@ module TestData
srand(1)
N = 20
#Cast to Int64 as rand() behavior differs between Int32/64
- d1 = pdata(rand(@compat(map(Int64, 1:2)), N))
- d2 = (@pdata ["A", "B", NA])[rand(@compat(map(Int64, 1:3)), N)]
- d3 = data(randn(N))
- d4 = data(randn(N))
+ d1 = NullableArray(rand(map(Int64, 1:2), N))
+ d2 = NullableCategoricalArray(Nullable{String}["A", "B", Nullable()])[rand(map(Int64, 1:3), N)]
+ d3 = NullableArray(randn(N))
+ d4 = NullableArray(randn(N))
df7 = DataFrame(Any[d1, d2, d3], [:d1, :d2, :d3])
#test_group("groupby")
gd = groupby(df7, :d1)
@test length(gd) == 2
- # @test isequal(gd[2]["d2"], PooledDataVector["A", "B", NA, "A", NA, NA, NA, NA])
- @test sum(gd[2][:d3]) == sum(df7[:d3][dropna(df7[:d1] .== 2)])
+ # @test isequal(gd[2]["d2"], CategoricalVector["A", "B", Nullable(), "A", Nullable(), Nullable(), Nullable(), Nullable()])
+ @test isequal(sum(gd[2][:d3]), sum(df7[:d3][Vector(df7[:d1]) .== 2]))
g1 = groupby(df7, [:d1, :d2])
g2 = groupby(df7, [:d2, :d1])
- @test sum(g1[1][:d3]) == sum(g2[1][:d3])
+ @test isequal(sum(g1[1][:d3]), sum(g2[1][:d3]))
- res = 0.0
+ res = Nullable(0.0)
for x in g1
res += sum(x[:d1])
end
- @test res == sum(df7[:d1])
+ @test isequal(res, sum(df7[:d1]))
+
+ @test aggregate(DataFrame(a=1), identity) == DataFrame(a_identity=1)
df8 = aggregate(df7[[1, 3]], sum)
- @test df8[1, :d1_sum] == sum(df7[:d1])
+ @test isequal(df8[1, :d1_sum], sum(df7[:d1]))
df8 = aggregate(df7, :d2, [sum, length])
@test size(df8, 1) == 3
@test size(df8, 2) == 5
- @test df8[2, :d1_length] == 4
+ @test isequal(df8[2, :d1_length], Nullable(4))
@test isequal(df8, aggregate(groupby(df7, :d2), [sum, length]))
df9 = df7 |> groupby([:d2]) |> [sum, length]
@@ -189,11 +191,17 @@ module TestData
v2 = randn(5))
m1 = join(df1, df2, on = :a)
- @test isequal(m1[:a], @data([1, 2, 3, 4, 5]))
+ @test isequal(m1[:a], NullableArray([1, 2, 3, 4, 5]))
# TODO: Re-enable
- # m2 = join(df1, df2, on = :a, kind = :outer)
- # @test isequal(m2[:b2], DataVector["A", "B", "B", "B", "B", NA, NA, NA, NA, NA])
- # @test isequal(m2[:b2], DataVector["B", "B", "B", "C", "B", NA, NA, NA, NA, NA])
+ m2 = join(df1, df2, on = :a, kind = :outer)
+ # @test isequal(m2[:b2],
+ # NullableArray(Nullable{String}["A", "B", "B", "B", "B",
+ # Nullable(), Nullable(),
+ # Nullable(), Nullable(), Nullable()]))
+ # @test isequal(m2[:b2],
+ # NullableArray(Nullable{String}["B", "B", "B", "C", "B",
+ # Nullable(), Nullable(),
+ # Nullable(), Nullable(), Nullable()]))
df1 = DataFrame(a = [1, 2, 3],
b = ["America", "Europe", "Africa"])
@@ -201,33 +209,33 @@ module TestData
c = ["New World", "Old World", "New World"])
m1 = join(df1, df2, on = :a, kind = :inner)
- @test isequal(m1[:a], @data([1, 2]))
+ @test isequal(m1[:a], NullableArray([1, 2]))
m2 = join(df1, df2, on = :a, kind = :left)
- @test isequal(m2[:a], @data([1, 2, 3]))
+ @test isequal(m2[:a], NullableArray([1, 2, 3]))
m3 = join(df1, df2, on = :a, kind = :right)
- @test isequal(m3[:a], @data([1, 2, 4]))
+ @test isequal(m3[:a], NullableArray([1, 2, 4]))
m4 = join(df1, df2, on = :a, kind = :outer)
- @test isequal(m4[:a], @data([1, 2, 3, 4]))
+ @test isequal(m4[:a], NullableArray([1, 2, 3, 4]))
- # test with NAs (issue #185)
+ # test with nulls (issue #185)
df1 = DataFrame()
- df1[:A] = @data(["a", "b", "a", NA])
- df1[:B] = @data([1, 2, 1, 3])
+ df1[:A] = NullableArray(Nullable{Compat.ASCIIString}["a", "b", "a", Nullable()])
+ df1[:B] = NullableArray([1, 2, 1, 3])
df2 = DataFrame()
- df2[:A] = @data(["a", NA, "c"])
- df2[:C] = @data([1, 2, 4])
+ df2[:A] = NullableArray(Nullable{Compat.ASCIIString}["a", Nullable(), "c"])
+ df2[:C] = NullableArray([1, 2, 4])
m1 = join(df1, df2, on = :A)
@test size(m1) == (3,3)
- @test isequal(m1[:A], @data([NA,"a","a"]))
+ @test isequal(m1[:A], NullableArray(Nullable{Compat.ASCIIString}[Nullable(),"a","a"]))
m2 = join(df1, df2, on = :A, kind = :outer)
@test size(m2) == (5,3)
- @test isequal(m2[:A], @data([NA,"a","a","b","c"]))
+ @test isequal(m2[:A], NullableArray(Nullable{Compat.ASCIIString}[Nullable(),"a","a","b","c"]))
srand(1)
df1 = DataFrame(
@@ -241,20 +249,23 @@ module TestData
b = [:A,:B,:C][[1,1,1,2,3]],
v2 = randn(5)
)
- df2[1,:a] = NA
+ df2[1,:a] = Nullable()
# # TODO: Restore this functionality
# m1 = join(df1, df2, on = [:a,:b])
- # @test isequal(m1[:a], DataArray(["x", "x", "y", "y", fill("x", 5)]))
+ # @test isequal(m1[:a], NullableArray(["x", "x", "y", "y", fill("x", 5)]))
# m2 = join(df1, df2, on = ["a","b"], kind = :outer)
- # @test isequal(m2[10,:v2], NA)
- # @test isequal(m2[:a], DataVector["x", "x", "y", "y", "x", "x", "x", "x", "x", "y", NA, "y"])
+ # @test isequal(m2[10,:v2], Nullable())
+ # @test isequal(m2[:a],
+ # NullableArray(Nullable{String}["x", "x", "y", "y",
+ # "x", "x", "x", "x", "x", "y",
+ # Nullable(), "y"])
srand(1)
function spltdf(d)
- d[:x1] = map(x -> x[1], d[:a])
- d[:x2] = map(x -> x[2], d[:a])
- d[:x3] = map(x -> x[3], d[:a])
+ d[:x1] = map(x -> get(x)[1], d[:a])
+ d[:x2] = map(x -> get(x)[2], d[:a])
+ d[:x3] = map(x -> get(x)[3], d[:a])
d
end
df1 = DataFrame(
@@ -272,39 +283,6 @@ module TestData
# m2 = join(df1, df2, on = [:x1, :x2, :x3])
# @test isequal(sort(m1[:a]), sort(m2[:a]))
- #test_group("New DataVector constructors")
- dv = DataArray(Int, 5)
- @test all(isna(dv))
- dv = DataArray(Float64, 5)
- @test all(isna(dv))
- dv = @data(zeros(5))
- @test all(dv .== 0.0)
- dv = @data(ones(5))
- @test all(dv .== 1.0)
-
- # No more NA corruption
- dv = @data(ones(10_000))
- @test !any(isna(dv))
-
- PooledDataArray(falses(2), falses(2))
- PooledDataArray(falses(2), trues(2))
-
- # Test vectorized comparisons work for DataVector's and PooledDataVector's
- @data([1, 2, NA]) .== 1
- @pdata([1, 2, NA]) .== 1
- @data(["1", "2", NA]) .== "1"
- @pdata(["1", "2", NA]) .== "1"
-
- # Test unique()
- #test_group("unique()")
- # TODO: Restore this
- # dv = DataArray(1:4)
- # dv[4] = NA
- # @test (1 in unique(dv))
- # @test (2 in unique(dv))
- # @test (3 in unique(dv))
- # @test (NA in unique(dv))
-
# test nonunique() with extra argument
df1 = DataFrame(a = ["a", "b", "a", "b", "a", "b"], b = 1:6, c = [1:3;1:3])
df = vcat(df1, df1)
@@ -317,29 +295,16 @@ module TestData
@test find(nonunique(df, 1)) == collect(3:12)
# Test unique() with extra argument
- @test unique(df) == df1
- @test unique(df, :) == df1
- @test unique(df, Colon()) == df1
- @test unique(df, 2:3) == df1
- @test unique(df, 3) == df1[1:3,:]
- @test unique(df, [1, 3]) == df1
- @test unique(df, [:a, :c]) == df1
- @test unique(df, :a) == df1[1:2,:]
+ @test isequal(unique(df), df1)
+ @test isequal(unique(df, :), df1)
+ @test isequal(unique(df, Colon()), df1)
+ @test isequal(unique(df, 2:3), df1)
+ @test isequal(unique(df, 3), df1[1:3,:])
+ @test isequal(unique(df, [1, 3]), df1)
+ @test isequal(unique(df, [:a, :c]), df1)
+ @test isequal(unique(df, :a), df1[1:2,:])
#test unique!() with extra argument
unique!(df, [1, 3])
- @test df == df1
-
- #test_group("find()")
- dv = DataArray([true, false, true])
- @test isequal(find(dv), [1, 3])
-
- pdv = PooledDataArray([true, false, true])
- @test isequal(find(pdv), [1, 3])
-
- dv[1] = NA
- @test isequal(find(dv), [3])
-
- pdv[1] = NA
- @test isequal(find(pdv), [3])
+ @test isequal(df, df1)
end
diff --git a/test/data/iris.csv b/test/data/iris.csv
new file mode 100644
index 0000000000..603349e022
--- /dev/null
+++ b/test/data/iris.csv
@@ -0,0 +1,151 @@
+"SepalLength","SepalWidth","PetalLength","PetalWidth","Species"
+"5.1","3.5","1.4","0.2","setosa"
+"4.9","3.0","1.4","0.2","setosa"
+"4.7","3.2","1.3","0.2","setosa"
+"4.6","3.1","1.5","0.2","setosa"
+"5.0","3.6","1.4","0.2","setosa"
+"5.4","3.9","1.7","0.4","setosa"
+"4.6","3.4","1.4","0.3","setosa"
+"5.0","3.4","1.5","0.2","setosa"
+"4.4","2.9","1.4","0.2","setosa"
+"4.9","3.1","1.5","0.1","setosa"
+"5.4","3.7","1.5","0.2","setosa"
+"4.8","3.4","1.6","0.2","setosa"
+"4.8","3.0","1.4","0.1","setosa"
+"4.3","3.0","1.1","0.1","setosa"
+"5.8","4.0","1.2","0.2","setosa"
+"5.7","4.4","1.5","0.4","setosa"
+"5.4","3.9","1.3","0.4","setosa"
+"5.1","3.5","1.4","0.3","setosa"
+"5.7","3.8","1.7","0.3","setosa"
+"5.1","3.8","1.5","0.3","setosa"
+"5.4","3.4","1.7","0.2","setosa"
+"5.1","3.7","1.5","0.4","setosa"
+"4.6","3.6","1.0","0.2","setosa"
+"5.1","3.3","1.7","0.5","setosa"
+"4.8","3.4","1.9","0.2","setosa"
+"5.0","3.0","1.6","0.2","setosa"
+"5.0","3.4","1.6","0.4","setosa"
+"5.2","3.5","1.5","0.2","setosa"
+"5.2","3.4","1.4","0.2","setosa"
+"4.7","3.2","1.6","0.2","setosa"
+"4.8","3.1","1.6","0.2","setosa"
+"5.4","3.4","1.5","0.4","setosa"
+"5.2","4.1","1.5","0.1","setosa"
+"5.5","4.2","1.4","0.2","setosa"
+"4.9","3.1","1.5","0.2","setosa"
+"5.0","3.2","1.2","0.2","setosa"
+"5.5","3.5","1.3","0.2","setosa"
+"4.9","3.6","1.4","0.1","setosa"
+"4.4","3.0","1.3","0.2","setosa"
+"5.1","3.4","1.5","0.2","setosa"
+"5.0","3.5","1.3","0.3","setosa"
+"4.5","2.3","1.3","0.3","setosa"
+"4.4","3.2","1.3","0.2","setosa"
+"5.0","3.5","1.6","0.6","setosa"
+"5.1","3.8","1.9","0.4","setosa"
+"4.8","3.0","1.4","0.3","setosa"
+"5.1","3.8","1.6","0.2","setosa"
+"4.6","3.2","1.4","0.2","setosa"
+"5.3","3.7","1.5","0.2","setosa"
+"5.0","3.3","1.4","0.2","setosa"
+"7.0","3.2","4.7","1.4","versicolor"
+"6.4","3.2","4.5","1.5","versicolor"
+"6.9","3.1","4.9","1.5","versicolor"
+"5.5","2.3","4.0","1.3","versicolor"
+"6.5","2.8","4.6","1.5","versicolor"
+"5.7","2.8","4.5","1.3","versicolor"
+"6.3","3.3","4.7","1.6","versicolor"
+"4.9","2.4","3.3","1.0","versicolor"
+"6.6","2.9","4.6","1.3","versicolor"
+"5.2","2.7","3.9","1.4","versicolor"
+"5.0","2.0","3.5","1.0","versicolor"
+"5.9","3.0","4.2","1.5","versicolor"
+"6.0","2.2","4.0","1.0","versicolor"
+"6.1","2.9","4.7","1.4","versicolor"
+"5.6","2.9","3.6","1.3","versicolor"
+"6.7","3.1","4.4","1.4","versicolor"
+"5.6","3.0","4.5","1.5","versicolor"
+"5.8","2.7","4.1","1.0","versicolor"
+"6.2","2.2","4.5","1.5","versicolor"
+"5.6","2.5","3.9","1.1","versicolor"
+"5.9","3.2","4.8","1.8","versicolor"
+"6.1","2.8","4.0","1.3","versicolor"
+"6.3","2.5","4.9","1.5","versicolor"
+"6.1","2.8","4.7","1.2","versicolor"
+"6.4","2.9","4.3","1.3","versicolor"
+"6.6","3.0","4.4","1.4","versicolor"
+"6.8","2.8","4.8","1.4","versicolor"
+"6.7","3.0","5.0","1.7","versicolor"
+"6.0","2.9","4.5","1.5","versicolor"
+"5.7","2.6","3.5","1.0","versicolor"
+"5.5","2.4","3.8","1.1","versicolor"
+"5.5","2.4","3.7","1.0","versicolor"
+"5.8","2.7","3.9","1.2","versicolor"
+"6.0","2.7","5.1","1.6","versicolor"
+"5.4","3.0","4.5","1.5","versicolor"
+"6.0","3.4","4.5","1.6","versicolor"
+"6.7","3.1","4.7","1.5","versicolor"
+"6.3","2.3","4.4","1.3","versicolor"
+"5.6","3.0","4.1","1.3","versicolor"
+"5.5","2.5","4.0","1.3","versicolor"
+"5.5","2.6","4.4","1.2","versicolor"
+"6.1","3.0","4.6","1.4","versicolor"
+"5.8","2.6","4.0","1.2","versicolor"
+"5.0","2.3","3.3","1.0","versicolor"
+"5.6","2.7","4.2","1.3","versicolor"
+"5.7","3.0","4.2","1.2","versicolor"
+"5.7","2.9","4.2","1.3","versicolor"
+"6.2","2.9","4.3","1.3","versicolor"
+"5.1","2.5","3.0","1.1","versicolor"
+"5.7","2.8","4.1","1.3","versicolor"
+"6.3","3.3","6.0","2.5","virginica"
+"5.8","2.7","5.1","1.9","virginica"
+"7.1","3.0","5.9","2.1","virginica"
+"6.3","2.9","5.6","1.8","virginica"
+"6.5","3.0","5.8","2.2","virginica"
+"7.6","3.0","6.6","2.1","virginica"
+"4.9","2.5","4.5","1.7","virginica"
+"7.3","2.9","6.3","1.8","virginica"
+"6.7","2.5","5.8","1.8","virginica"
+"7.2","3.6","6.1","2.5","virginica"
+"6.5","3.2","5.1","2.0","virginica"
+"6.4","2.7","5.3","1.9","virginica"
+"6.8","3.0","5.5","2.1","virginica"
+"5.7","2.5","5.0","2.0","virginica"
+"5.8","2.8","5.1","2.4","virginica"
+"6.4","3.2","5.3","2.3","virginica"
+"6.5","3.0","5.5","1.8","virginica"
+"7.7","3.8","6.7","2.2","virginica"
+"7.7","2.6","6.9","2.3","virginica"
+"6.0","2.2","5.0","1.5","virginica"
+"6.9","3.2","5.7","2.3","virginica"
+"5.6","2.8","4.9","2.0","virginica"
+"7.7","2.8","6.7","2.0","virginica"
+"6.3","2.7","4.9","1.8","virginica"
+"6.7","3.3","5.7","2.1","virginica"
+"7.2","3.2","6.0","1.8","virginica"
+"6.2","2.8","4.8","1.8","virginica"
+"6.1","3.0","4.9","1.8","virginica"
+"6.4","2.8","5.6","2.1","virginica"
+"7.2","3.0","5.8","1.6","virginica"
+"7.4","2.8","6.1","1.9","virginica"
+"7.9","3.8","6.4","2.0","virginica"
+"6.4","2.8","5.6","2.2","virginica"
+"6.3","2.8","5.1","1.5","virginica"
+"6.1","2.6","5.6","1.4","virginica"
+"7.7","3.0","6.1","2.3","virginica"
+"6.3","3.4","5.6","2.4","virginica"
+"6.4","3.1","5.5","1.8","virginica"
+"6.0","3.0","4.8","1.8","virginica"
+"6.9","3.1","5.4","2.1","virginica"
+"6.7","3.1","5.6","2.4","virginica"
+"6.9","3.1","5.1","2.3","virginica"
+"5.8","2.7","5.1","1.9","virginica"
+"6.8","3.2","5.9","2.3","virginica"
+"6.7","3.3","5.7","2.5","virginica"
+"6.7","3.0","5.2","2.3","virginica"
+"6.3","2.5","5.0","1.9","virginica"
+"6.5","3.0","5.2","2.0","virginica"
+"6.2","3.4","5.4","2.3","virginica"
+"5.9","3.0","5.1","1.8","virginica"
diff --git a/test/dataframe.jl b/test/dataframe.jl
index 6b36e801fb..2814d45765 100644
--- a/test/dataframe.jl
+++ b/test/dataframe.jl
@@ -7,23 +7,28 @@ module TestDataFrame
# Equality
#
- @test isequal(DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])), DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])))
- @test !isequal(DataFrame(a=@data([1, 2]), b=@data([4, 5])), DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])))
- @test !isequal(DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])), DataFrame(a=@data([1, 2, 3])))
- @test !isequal(DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])), DataFrame(a=@data([1, 2, 3]), c=@data([4, 5, 6])))
- @test !isequal(DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])), DataFrame(b=@data([4, 5, 6]), a=@data([1, 2, 3])))
- @test !isequal(DataFrame(a=@data([1, 2, 2]), b=@data([4, 5, 6])), DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])))
- @test isequal(DataFrame(a=@data([1, 2, NA]), b=@data([4, 5, 6])), DataFrame(a=@data([1, 2, NA]), b=@data([4, 5, 6])))
-
- @test DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])) == DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6]))
- @test DataFrame(a=@data([1, 2]), b=@data([4, 5])) != DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6]))
- @test DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])) != DataFrame(a=@data([1, 2, 3]))
- @test DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])) != DataFrame(a=@data([1, 2, 3]), c=@data([4, 5, 6]))
- @test DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])) != DataFrame(b=@data([4, 5, 6]), a=@data([1, 2, 3]))
- @test DataFrame(a=@data([1, 2, 2]), b=@data([4, 5, 6])) != DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6]))
- @test DataFrame(a=@data([1, 3, NA]), b=@data([4, 5, 6])) != DataFrame(a=@data([1, 2, NA]), b=@data([4, 5, 6]))
- @test isna(DataFrame(a=@data([1, 2, NA]), b=@data([4, 5, 6])) == DataFrame(a=@data([1, 2, NA]), b=@data([4, 5, 6])))
- @test isna(DataFrame(a=@data([1, 2, NA]), b=@data([4, 5, 6])) == DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])))
+ @test isequal(DataFrame(a=[1, 2, 3], b=[4, 5, 6]), DataFrame(a=[1, 2, 3], b=[4, 5, 6]))
+ @test !isequal(DataFrame(a=[1, 2], b=[4, 5]), DataFrame(a=[1, 2, 3], b=[4, 5, 6]))
+ @test !isequal(DataFrame(a=[1, 2, 3], b=[4, 5, 6]), DataFrame(a=[1, 2, 3]))
+ @test !isequal(DataFrame(a=[1, 2, 3], b=[4, 5, 6]), DataFrame(a=[1, 2, 3], c=[4, 5, 6]))
+ @test !isequal(DataFrame(a=[1, 2, 3], b=[4, 5, 6]), DataFrame(b=[4, 5, 6], a=[1, 2, 3]))
+ @test !isequal(DataFrame(a=[1, 2, 2], b=[4, 5, 6]), DataFrame(a=[1, 2, 3], b=[4, 5, 6]))
+ @test isequal(DataFrame(a=Nullable{Int}[1, 2, Nullable()], b=[4, 5, 6]),
+ DataFrame(a=Nullable{Int}[1, 2, Nullable()], b=[4, 5, 6]))
+
+ # FIXME: equality operators won't work until JuliaStats/NullableArrays#84 is merged
+ #@test get(DataFrame(a=[1, 2, 3], b=[4, 5, 6]) == DataFrame(a=[1, 2, 3], b=[4, 5, 6]))
+ #@test get(DataFrame(a=[1, 2], b=[4, 5]) != DataFrame(a=[1, 2, 3], b=[4, 5, 6]))
+ #@test get(DataFrame(a=[1, 2, 3], b=[4, 5, 6]) != DataFrame(a=[1, 2, 3]))
+ #@test get(DataFrame(a=[1, 2, 3], b=[4, 5, 6]) != DataFrame(a=[1, 2, 3], c=[4, 5, 6]))
+ #@test get(DataFrame(a=[1, 2, 3], b=[4, 5, 6]) != DataFrame(b=[4, 5, 6], a=[1, 2, 3]))
+ #@test get(DataFrame(a=[1, 2, 2], b=[4, 5, 6]) != DataFrame(a=[1, 2, 3], b=[4, 5, 6]))
+ #@test get(DataFrame(a=Nullable{Int}[1, 3, Nullable()], b=[4, 5, 6]) !=
+ # DataFrame(a=Nullable{Int}[1, 2, Nullable()], b=[4, 5, 6]))
+ #@test isnull(DataFrame(a=Nullable{Int}[1, 2, Nullable()], b=[4, 5, 6]) ==
+ # DataFrame(a=Nullable{Int}[1, 2, Nullable()], b=[4, 5, 6]))
+ #@test isnull(DataFrame(a=Nullable{Int}[1, 2, Nullable()], b=[4, 5, 6]) ==
+ # DataFrame(a=Nullable{Int}[1, 2, 3], b=[4, 5, 6]))
#
# Copying
@@ -34,17 +39,17 @@ module TestDataFrame
dfdc = deepcopy(df)
df[1, :a] = 4
- df[1, :b][:e] = 5
+ get(df[1, :b])[:e] = 5
names!(df, [:f, :g])
@test names(dfc) == [:a, :b]
@test names(dfdc) == [:a, :b]
- @test dfc[1, :a] == 4
- @test dfdc[1, :a] == 2
+ @test get(dfc[1, :a]) === 4
+ @test get(dfdc[1, :a]) === 2
- @test names(dfc[1, :b]) == [:c, :e]
- @test names(dfdc[1, :b]) == [:c]
+ @test names(get(dfc[1, :b])) == [:c, :e]
+ @test names(get(dfdc[1, :b])) == [:c]
#
@@ -64,18 +69,18 @@ module TestDataFrame
# Insert single value
x[:d] = 3
- @test x[:d] == [3, 3, 3]
+ @test isequal(x[:d], NullableArray([3, 3, 3]))
x0[:d] = 3
@test x0[:d] == Int[]
- # similar / nas
- df = DataFrame(a = 1, b = "b", c = @pdata([3.3]))
- nadf = DataFrame(a = @data(Int[NA, NA]),
- b = DataArray(Array(String, 2), trues(2)),
- c = @pdata(Float64[NA, NA]))
- @test isequal(nadf, similar(df, 2))
- @test isequal(nadf, DataFrames.nas(df, 2))
+ # similar / nulls
+ df = DataFrame(a = 1, b = "b", c = CategoricalArray([3.3]))
+ nulldf = DataFrame(a = NullableArray(Int, 2),
+ b = NullableArray(String, 2),
+ c = NullableCategoricalArray(Float64, 2))
+ @test isequal(nulldf, similar(df, 2))
+ @test isequal(nulldf, DataFrames.similar_nullable(df, 2))
# Associative methods
@@ -93,58 +98,58 @@ module TestDataFrame
df = DataFrame(a=[1, 2], b=[3., 4.])
@test_throws BoundsError insert!(df, 5, ["a", "b"], :newcol)
@test_throws ErrorException insert!(df, 1, ["a"], :newcol)
- @test insert!(df, 1, ["a", "b"], :newcol) == df
- @test isequal(df, DataFrame(newcol=["a", "b"], a=[1, 2], b=[3., 4.]))
- df = DataFrame(a=[1, 2], b=[3., 4.])
- @test insert!(df, 3, ["a", "b"], :newcol) == df
- @test isequal(df, DataFrame(a=[1, 2], b=[3., 4.], newcol=["a", "b"]))
+ @test isequal(insert!(df, 1, ["a", "b"], :newcol), df)
+ @test names(df) == [:newcol, :a, :b]
+ @test isequal(df[:a], NullableArray([1, 2]))
+ @test isequal(df[:b], NullableArray([3., 4.]))
+ @test isequal(df[:newcol], ["a", "b"])
df = DataFrame(a=[1, 2], b=[3., 4.])
df2 = DataFrame(b=["a", "b"], c=[:c, :d])
- @test merge!(df, df2) == df
+ @test isequal(merge!(df, df2), df)
@test isequal(df, DataFrame(a=[1, 2], b=["a", "b"], c=[:c, :d]))
#test_group("Empty DataFrame constructors")
df = DataFrame(Int, 10, 3)
@test size(df, 1) == 10
@test size(df, 2) == 3
- @test typeof(df[:, 1]) == DataVector{Int}
- @test typeof(df[:, 2]) == DataVector{Int}
- @test typeof(df[:, 3]) == DataVector{Int}
- @test allna(df[:, 1])
- @test allna(df[:, 2])
- @test allna(df[:, 3])
+ @test typeof(df[:, 1]) == NullableVector{Int}
+ @test typeof(df[:, 2]) == NullableVector{Int}
+ @test typeof(df[:, 3]) == NullableVector{Int}
+ @test allnull(df[:, 1])
+ @test allnull(df[:, 2])
+ @test allnull(df[:, 3])
df = DataFrame(Any[Int, Float64, String], 100)
@test size(df, 1) == 100
@test size(df, 2) == 3
- @test typeof(df[:, 1]) == DataVector{Int}
- @test typeof(df[:, 2]) == DataVector{Float64}
- @test typeof(df[:, 3]) == DataVector{String}
- @test allna(df[:, 1])
- @test allna(df[:, 2])
- @test allna(df[:, 3])
+ @test typeof(df[:, 1]) == NullableVector{Int}
+ @test typeof(df[:, 2]) == NullableVector{Float64}
+ @test typeof(df[:, 3]) == NullableVector{String}
+ @test allnull(df[:, 1])
+ @test allnull(df[:, 2])
+ @test allnull(df[:, 3])
df = DataFrame(Any[Int, Float64, String], [:A, :B, :C], 100)
@test size(df, 1) == 100
@test size(df, 2) == 3
- @test typeof(df[:, 1]) == DataVector{Int}
- @test typeof(df[:, 2]) == DataVector{Float64}
- @test typeof(df[:, 3]) == DataVector{String}
- @test allna(df[:, 1])
- @test allna(df[:, 2])
- @test allna(df[:, 3])
+ @test typeof(df[:, 1]) == NullableVector{Int}
+ @test typeof(df[:, 2]) == NullableVector{Float64}
+ @test typeof(df[:, 3]) == NullableVector{String}
+ @test allnull(df[:, 1])
+ @test allnull(df[:, 2])
+ @test allnull(df[:, 3])
df = DataFrame(DataType[Int, Float64, Compat.UTF8String],[:A, :B, :C], [false,false,true],100)
@test size(df, 1) == 100
@test size(df, 2) == 3
- @test typeof(df[:, 1]) == DataVector{Int}
- @test typeof(df[:, 2]) == DataVector{Float64}
- @test typeof(df[:, 3]) == PooledDataVector{Compat.UTF8String,UInt32}
- @test allna(df[:, 1])
- @test allna(df[:, 2])
- @test allna(df[:, 3])
+ @test typeof(df[:, 1]) == NullableVector{Int}
+ @test typeof(df[:, 2]) == NullableVector{Float64}
+ @test typeof(df[:, 3]) == NullableCategoricalVector{Compat.UTF8String,UInt32}
+ @test allnull(df[:, 1])
+ @test allnull(df[:, 2])
+ @test allnull(df[:, 3])
df = convert(DataFrame, zeros(10, 5))
@@ -168,8 +173,8 @@ module TestDataFrame
@compat(Dict{Any,Any}(:a=>5))])
@test size(df, 1) == 3
@test size(df, 2) == 2
- @test typeof(df[:,:a]) == DataVector{Int}
- @test typeof(df[:,:b]) == DataVector{Char}
+ @test typeof(df[:,:a]) == NullableVector{Int}
+ @test typeof(df[:,:b]) == NullableVector{Char}
df = DataFrame([@compat(Dict{Any,Any}(:a=>1, :b=>'c')),
@compat(Dict{Any,Any}(:a=>3, :b=>'d')),
@@ -177,9 +182,10 @@ module TestDataFrame
[:a, :b])
@test size(df, 1) == 3
@test size(df, 2) == 2
- @test typeof(df[:,:a]) == DataVector{Int}
- @test typeof(df[:,:b]) == DataVector{Char}
+ @test typeof(df[:,:a]) == NullableVector{Int}
+ @test typeof(df[:,:b]) == NullableVector{Char}
+ @test DataFrame(NullableArray[[1,2,3],[2.5,4.5,6.5]], [:A, :B]) == DataFrame(A = [1,2,3], B = [2.5,4.5,6.5])
# This assignment was missing before
df = DataFrame(Column = [:A])
@@ -201,11 +207,11 @@ module TestDataFrame
dfb= DataFrame( first=[1,2], second=["apple","orange"] )
push!(dfb, Any[3,"pear"])
- @test df==dfb
+ @test isequal(df, dfb)
dfb= DataFrame( first=[1,2], second=["apple","orange"] )
push!(dfb, (3,"pear"))
- @test df==dfb
+ @test isequal(df, dfb)
dfb= DataFrame( first=[1,2], second=["apple","orange"] )
@test_throws ArgumentError push!(dfb, (33.33,"pear"))
@@ -215,22 +221,22 @@ module TestDataFrame
dfb= DataFrame( first=[1,2], second=["apple","orange"] )
push!(dfb, @compat(Dict(:first=>3, :second=>"pear")))
- @test df==dfb
+ @test isequal(df, dfb)
df=DataFrame( first=[1,2,3], second=["apple","orange","banana"] )
dfb= DataFrame( first=[1,2], second=["apple","orange"] )
push!(dfb, @compat(Dict("first"=>3, "second"=>"banana")))
- @test df==dfb
+ @test isequal(df, dfb)
df0= DataFrame( first=[1,2], second=["apple","orange"] )
dfb= DataFrame( first=[1,2], second=["apple","orange"] )
@test_throws ArgumentError push!(dfb, @compat(Dict(:first=>true, :second=>false)))
- @test df0==dfb
+ @test isequal(df0, dfb)
df0= DataFrame( first=[1,2], second=["apple","orange"] )
dfb= DataFrame( first=[1,2], second=["apple","orange"] )
@test_throws ArgumentError push!(dfb, @compat(Dict("first"=>"chicken", "second"=>"stuff")))
- @test df0==dfb
+ @test isequal(df0, dfb)
# delete!
df = DataFrame(a=1, b=2, c=3, d=4, e=5)
@@ -267,54 +273,61 @@ module TestDataFrame
@test deleterows!(df, [2, 3]) === df
@test isequal(df, DataFrame(a=[1], b=[3.]))
- df = DataFrame(a=@data([1, 2]), b=@data([3., 4.]))
+ df = DataFrame(a=NullableArray([1, 2]), b=NullableArray([3., 4.]))
@test deleterows!(df, 1) === df
- @test isequal(df, DataFrame(a=@data([2]), b=@data([4.])))
+ @test isequal(df, DataFrame(a=NullableArray([2]), b=NullableArray([4.])))
- df = DataFrame(a=@data([1, 2]), b=@data([3., 4.]))
+ df = DataFrame(a=NullableArray([1, 2]), b=NullableArray([3., 4.]))
@test deleterows!(df, 2) === df
- @test isequal(df, DataFrame(a=@data([1]), b=@data([3.])))
+ @test isequal(df, DataFrame(a=NullableArray([1]), b=NullableArray([3.])))
- df = DataFrame(a=@data([1, 2, 3]), b=@data([3., 4., 5.]))
+ df = DataFrame(a=NullableArray([1, 2, 3]), b=NullableArray([3., 4., 5.]))
@test deleterows!(df, 2:3) === df
- @test isequal(df, DataFrame(a=@data([1]), b=@data([3.])))
+ @test isequal(df, DataFrame(a=NullableArray([1]), b=NullableArray([3.])))
- df = DataFrame(a=@data([1, 2, 3]), b=@data([3., 4., 5.]))
+ df = DataFrame(a=NullableArray([1, 2, 3]), b=NullableArray([3., 4., 5.]))
@test deleterows!(df, [2, 3]) === df
- @test isequal(df, DataFrame(a=@data([1]), b=@data([3.])))
+ @test isequal(df, DataFrame(a=NullableArray([1]), b=NullableArray([3.])))
# describe
#suppress output and test that describe() does not throw
devnull = is_unix() ? "/dev/null" : "nul"
open(devnull, "w") do f
- @test nothing == describe(f, DataFrame(a=[1, 2], b=Any["3", NA]))
- @test nothing == describe(f, DataFrame(a=@data([1, 2]), b=@data(["3", NA])))
- @test nothing == describe(f, DataFrame(a=@pdata([1, 2]), b=@pdata(["3", NA])))
+ @test nothing == describe(f, DataFrame(a=[1, 2], b=Any["3", Nullable()]))
+ @test nothing ==
+ describe(f, DataFrame(a=NullableArray([1, 2]),
+ b=NullableArray(Nullable{String}["3", Nullable()])))
+ @test nothing ==
+ describe(f, DataFrame(a=CategoricalArray([1, 2]),
+ b=NullableCategoricalArray(Nullable{String}["3", Nullable()])))
@test nothing == describe(f, [1, 2, 3])
- @test nothing == describe(f, @data([1, 2, 3]))
- @test nothing == describe(f, @pdata([1, 2, 3]))
- @test nothing == describe(f, Any["1", "2", NA])
- @test nothing == describe(f, @data(["1", "2", NA]))
- @test nothing == describe(f, @pdata(["1", "2", NA]))
+ @test nothing == describe(f, NullableArray([1, 2, 3]))
+ @test nothing == describe(f, CategoricalArray([1, 2, 3]))
+ @test nothing == describe(f, Any["1", "2", Nullable()])
+ @test nothing == describe(f, NullableArray(Nullable{String}["1", "2", Nullable()]))
+ @test nothing == describe(f, NullableCategoricalArray(Nullable{String}["1", "2", Nullable()]))
end
#Check the output of unstack
- df = DataFrame(Fish = ["Bob", "Bob", "Batman", "Batman"],
- Key = ["Mass", "Color", "Mass", "Color"],
- Value = ["12 g", "Red", "18 g", "Grey"])
+ df = DataFrame(Fish = CategoricalArray(["Bob", "Bob", "Batman", "Batman"]),
+ Key = ["Mass", "Color", "Mass", "Color"],
+ Value = ["12 g", "Red", "18 g", "Grey"])
+ # Check that reordering levels does not confuse unstack
+ levels!(df[1], ["XXX", "Bob", "Batman"])
#Unstack specifying a row column
df2 = unstack(df,:Fish, :Key, :Value)
#Unstack without specifying a row column
df3 = unstack(df,:Key, :Value)
#The expected output
- df4 = DataFrame(Fish = ["Batman", "Bob"], Color = ["Grey", "Red"], Mass = ["18 g", "12 g"])
- @test df2 == df4
- @test df3 == df4
- #Make sure unstack works with NAs at the start of the value column
- df[1,:Value] = NA
+ df4 = DataFrame(Fish = ["XXX", "Bob", "Batman"],
+ Color = Nullable{String}[Nullable(), "Red", "Grey"],
+ Mass = Nullable{String}[Nullable(), "12 g", "18 g"])
+ @test isequal(df2, df4)
+ @test isequal(df3, df4[2:3, :])
+ #Make sure unstack works with NULLs at the start of the value column
+ df[1,:Value] = Nullable()
df2 = unstack(df,:Fish, :Key, :Value)
#This changes the expected result
- df4[2,:Mass] = NA
+ df4[2,:Mass] = Nullable()
@test isequal(df2, df4)
-
end
diff --git a/test/dataframerow.jl b/test/dataframerow.jl
index 4fcc2c66e0..0b69555db6 100644
--- a/test/dataframerow.jl
+++ b/test/dataframerow.jl
@@ -2,12 +2,14 @@ module TestDataFrameRow
using Base.Test
using DataFrames, Compat
- df = DataFrame(a=@data([1, 2, 3, 1, 2, 2 ]),
- b=@data([2.0, NA, 1.2, 2.0, NA, NA]),
- c=@data(["A", "B", "C", "A", "B", NA]),
- d=PooledDataArray(
- @data([:A, NA, :C, :A, NA, :C])))
- df2 = DataFrame(a = @data([1, 2, 3]))
+ df = DataFrame(a=NullableArray([1, 2, 3, 1, 2, 2 ]),
+ b=NullableArray(Nullable{Float64}[2.0, Nullable(),
+ 1.2, 2.0,
+ Nullable(), Nullable()]),
+ c=NullableArray(Nullable{String}["A", "B", "C", "A", "B", Nullable()]),
+ d=NullableCategoricalArray(Nullable{Symbol}[:A, Nullable(), :C, :A,
+ Nullable(), :C]))
+ df2 = DataFrame(a = NullableArray([1, 2, 3]))
#
# Equality
diff --git a/test/duplicates.jl b/test/duplicates.jl
index 848dded61c..5656cbbb51 100644
--- a/test/duplicates.jl
+++ b/test/duplicates.jl
@@ -9,10 +9,12 @@ module TestDuplicates
unique!(df)
@test isequal(df, udf)
- pdf = DataFrame( a = PooledDataArray( @data ["a", "a", NA, NA, "b", NA, "a", NA] ),
- b = PooledDataArray( @data ["a", "b", NA, NA, "b", "a", "a", "a"] ) )
- updf = DataFrame( a = PooledDataArray( @data ["a", "a", NA, "b", NA] ),
- b = PooledDataArray( @data ["a", "b", NA, "b", "a"] ) )
+ pdf = DataFrame(a = NullableCategoricalArray(Nullable{String}["a", "a", Nullable(),
+ Nullable(), "b", Nullable(), "a", Nullable()]),
+ b = NullableCategoricalArray(Nullable{String}["a", "b", Nullable(),
+ Nullable(), "b", "a", "a", "a"]))
+ updf = DataFrame(a = NullableCategoricalArray(Nullable{String}["a", "a", Nullable(), "b", Nullable()]),
+ b = NullableCategoricalArray(Nullable{String}["a", "b", Nullable(), "b", "a"]))
@test isequal(nonunique(pdf), [false, false, false, true, false, false, true, true])
@test isequal(nonunique(updf), falses(5) )
@test isequal(updf, unique(pdf))
diff --git a/test/formula.jl b/test/formula.jl
index 11c093158b..db7d5e00b9 100644
--- a/test/formula.jl
+++ b/test/formula.jl
@@ -53,9 +53,10 @@ module TestFormula
@test t.intercept == false
@test t.terms == [:x1, :x2]
- t = Terms(y ~ -1 + x1 + x2)
- @test t.intercept == false
- @test t.terms == [:x1, :x2]
+ @test t == Terms(y ~ -1 + x1 + x2) == Terms(y ~ x1 - 1 + x2) == Terms(y ~ x1 + x2 -1)
+
+ ## can't subtract terms other than 1
+ @test_throws ErrorException Terms(y ~ x1 - x2)
t = Terms(y ~ x1 & x2)
@test t.terms == [:(x1 & x2)]
@@ -133,11 +134,10 @@ module TestFormula
@test isa(mm.m, Matrix{Float64})
@test isa(smm.m, sparsetype)
- @test isa(ModelMatrix{DataMatrix{Float64}}(mf).m, DataMatrix{Float64})
- #test_group("expanding a PooledVec into a design matrix of indicators for each dummy variable")
+ #test_group("expanding a nominal array into a design matrix of indicators for each dummy variable")
- d[:x1p] = PooledDataArray(d[:x1])
+ d[:x1p] = NullableCategoricalArray(d[:x1])
mf = ModelFrame(y ~ x1p, d)
mm = ModelMatrix(mf)
@@ -182,24 +182,24 @@ module TestFormula
## @test r[:,1] == DataVector(df["x1"])
## @test r[:,2] == DataVector(df["x2"])
- ## df["x1"] = PooledDataArray(x1)
+ ## df["x1"] = CategoricalArray(x1)
## r = expand(:x1, df)
## @test isa(r, DataFrame)
## @test ncol(r) == 3
- ## @test r == expand(PooledDataArray(x1), "x1", DataFrame())
+ ## @test r == expand(CategoricalArray(x1), "x1", DataFrame())
## r = expand(:(x1 + x2), df)
## @test isa(r, DataFrame)
## @test ncol(r) == 4
- ## @test r[:,1:3] == expand(PooledDataArray(x1), "x1", DataFrame())
+ ## @test r[:,1:3] == expand(CategoricalArray(x1), "x1", DataFrame())
## @test r[:,4] == DataVector(df["x2"])
- ## df["x2"] = PooledDataArray(x2)
+ ## df["x2"] = CategoricalArray(x2)
## r = expand(:(x1 + x2), df)
## @test isa(r, DataFrame)
## @test ncol(r) == 6
- ## @test r[:,1:3] == expand(PooledDataArray(x1), "x1", DataFrame())
- ## @test r[:,4:6] == expand(PooledDataArray(x2), "x2", DataFrame())
+ ## @test r[:,1:3] == expand(CategoricalArray(x1), "x1", DataFrame())
+ ## @test r[:,4:6] == expand(CategoricalArray(x2), "x2", DataFrame())
#test_group("Creating a model matrix using full formulas: y ~ x1 + x2, etc")
@@ -216,7 +216,7 @@ module TestFormula
@test mm.m == [ones(4) x1 x2 x1.*x2]
@test mm.m == ModelMatrix{sparsetype}(mf).m
- df[:x1] = PooledDataArray(x1)
+ df[:x1] = CategoricalArray(x1)
x1e = [[0, 1, 0, 0] [0, 0, 1, 0] [0, 0, 0, 1]]
f = y ~ x1 * x2
mf = ModelFrame(f, df)
@@ -235,7 +235,7 @@ module TestFormula
## @test mm.m == [ones(4) x1 log(x2)]
## df = deepcopy(d)
- ## df["x1"] = PooledDataArray([5:8])
+ ## df["x1"] = CategoricalArray([5:8])
## f = Formula(:(y ~ x1 * (log(x2) + x3)))
## mf = ModelFrame(f, df)
## mm = ModelMatrix(mf)
@@ -277,7 +277,7 @@ module TestFormula
## @test model_response(mf) == y'' # fails: Int64 vs. Float64
df = deepcopy(d)
- df[:x1] = PooledDataArray(df[:x1])
+ df[:x1] = NullableCategoricalArray(df[:x1])
f = y ~ x2 + x3 + x3*x2
mm = ModelMatrix(ModelFrame(f, df))
@@ -334,9 +334,9 @@ module TestFormula
## FAILS: behavior is wrong when no lower-order terms (1+x1+x2+x1&x2...)
##
## df = DataFrame(y=1:27,
- ## x1 = PooledDataArray(vec([x for x in 1:3, y in 4:6, z in 7:9])),
- ## x2 = PooledDataArray(vec([y for x in 1:3, y in 4:6, z in 7:9])),
- ## x3 = PooledDataArray(vec([z for x in 1:3, y in 4:6, z in 7:9])))
+ ## x1 = CategoricalArray(vec([x for x in 1:3, y in 4:6, z in 7:9])),
+ ## x2 = CategoricalArray(vec([y for x in 1:3, y in 4:6, z in 7:9])),
+ ## x3 = CategoricalArray(vec([z for x in 1:3, y in 4:6, z in 7:9])))
## f = y ~ x1 & x2 & x3
## mf = ModelFrame(f, df)
## @test coefnames(mf)[2:end] ==
@@ -380,10 +380,10 @@ module TestFormula
@test size(mm_sub) == (3,3)
## Missing data
- d[:x1m] = @data [5, 6, NA, 7]
+ d[:x1m] = NullableArray(Nullable{Int}[5, 6, Nullable(), 7])
mf = ModelFrame(y ~ x1m, d)
mm = ModelMatrix(mf)
- @test mm.m[:, 2] == d[complete_cases(d), :x1m]
+ @test isequal(NullableArray(mm.m[:, 2]), d[complete_cases(d), :x1m])
@test mm.m == ModelMatrix{sparsetype}(mf).m
## Same variable on left and right side
@@ -396,7 +396,7 @@ module TestFormula
d = DataFrame(x = Compat.repeat([:a, :b], outer = 4),
y = Compat.repeat([:c, :d], inner = 2, outer = 2),
z = Compat.repeat([:e, :f], inner = 4))
-[pool!(d, name) for name in names(d)]
+[categorical!(d, name) for name in names(d)]
cs = Dict([Pair(name, EffectsCoding()) for name in names(d)])
d[:n] = 1.:8
@@ -545,5 +545,6 @@ df = DataFrame(x = [1.0,2.0,3.0], y = [4.0,5.0,6.0])
mf = ModelFrame(y ~ 0 + x, df)
X = ModelMatrix(mf).m
X[1] = 0.0
-@test mf.df[1, :x] == 1.0
+@test mf.df[1, :x] === Nullable(1.0)
+
end
diff --git a/test/grouping.jl b/test/grouping.jl
index c7dbb78dd1..c138584cec 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -5,8 +5,8 @@ module TestGrouping
df = DataFrame(a = repeat([1, 2, 3, 4], outer=[2]),
b = repeat([2, 1], outer=[4]),
c = randn(8))
- #df[6, :a] = NA
- #df[7, :b] = NA
+ #df[6, :a] = Nullable()
+ #df[7, :b] = Nullable()
cols = [:a, :b]
@@ -15,7 +15,7 @@ module TestGrouping
sdf = sort(df, cols=cols)
bdf = by(df, cols, f)
- @test bdf[cols] == unique(sdf[cols])
+ @test isequal(bdf[cols], unique(sdf[cols]))
byf = by(df, :a, df -> DataFrame(bsum = sum(df[:b])))
@@ -25,19 +25,68 @@ module TestGrouping
gd = groupby(df, cols)
ga = map(f, gd)
- @test bdf == combine(ga)
+ @test isequal(bdf, combine(ga))
- g(df) = DataFrame(cmax1 = df[:cmax] + 1)
+ g(df) = DataFrame(cmax1 = Vector(df[:cmax]) + 1)
h(df) = g(f(df))
- @test combine(map(h, gd)) == combine(map(g, ga))
+ @test isequal(combine(map(h, gd)), combine(map(g, ga)))
+
+ # testing pool overflow
+ df2 = DataFrame(v1 = categorical(collect(1:1000)), v2 = categorical(fill(1, 1000)))
+ @test groupby(df2, [:v1, :v2]).starts == collect(1:1000)
+ @test groupby(df2, [:v2, :v1]).starts == collect(1:1000)
+
+ # grouping empty frame
+ @test groupby(DataFrame(A=Int[]), :A).starts == Int[]
+ # grouping single row
+ @test groupby(DataFrame(A=Int[1]), :A).starts == Int[1]
# issue #960
- x = pool(collect(1:20))
+ x = CategoricalArray(collect(1:20))
df = DataFrame(v1=x, v2=x)
groupby(df, [:v1, :v2])
df2 = by(e->1, DataFrame(x=Int64[]), :x)
@test size(df2) == (0,1)
- @test sum(df2[:x]) == 0
+ @test isequal(sum(df2[:x]), Nullable(0))
+
+ # Check that reordering levels does not confuse groupby
+ df = DataFrame(Key1 = CategoricalArray(["A", "A", "B", "B"]),
+ Key2 = CategoricalArray(["A", "B", "A", "B"]),
+ Value = 1:4)
+ gd = groupby(df, :Key1)
+ @test isequal(gd[1], DataFrame(Key1=["A", "A"], Key2=["A", "B"], Value=1:2))
+ @test isequal(gd[2], DataFrame(Key1=["B", "B"], Key2=["A", "B"], Value=3:4))
+ gd = groupby(df, [:Key1, :Key2])
+ @test isequal(gd[1], DataFrame(Key1="A", Key2="A", Value=1))
+ @test isequal(gd[2], DataFrame(Key1="A", Key2="B", Value=2))
+ @test isequal(gd[3], DataFrame(Key1="B", Key2="A", Value=3))
+ @test isequal(gd[4], DataFrame(Key1="B", Key2="B", Value=4))
+ # Reorder levels, add unused level
+ levels!(df[:Key1], ["Z", "B", "A"])
+ levels!(df[:Key2], ["Z", "B", "A"])
+ gd = groupby(df, :Key1)
+ @test isequal(gd[1], DataFrame(Key1=["B", "B"], Key2=["A", "B"], Value=3:4))
+ @test isequal(gd[2], DataFrame(Key1=["A", "A"], Key2=["A", "B"], Value=1:2))
+ gd = groupby(df, [:Key1, :Key2])
+ @test isequal(gd[1], DataFrame(Key1="B", Key2="B", Value=4))
+ @test isequal(gd[2], DataFrame(Key1="B", Key2="A", Value=3))
+ @test isequal(gd[3], DataFrame(Key1="A", Key2="B", Value=2))
+ @test isequal(gd[4], DataFrame(Key1="A", Key2="A", Value=1))
+
+ a = DataFrame(x=categorical(1:200))
+ b = DataFrame(x=categorical(100:300))
+ a[:x] = compact(a[:x])
+ b[:x] = compact(b[:x])
+ r = vcat(a, b)
+ @test isequal(r, DataFrame(x=[categorical(1:200); categorical(100:300)]))
+
+ a = DataFrame(x=categorical(1:200))
+ b = DataFrame(y=categorical(100:300))
+ a[:x] = compact(a[:x])
+ b[:y] = compact(b[:y])
+ r = vcat(a, b)
+ @test isequal(r, DataFrame(x=NullableCategoricalArray(1:401, [fill(false, 200); fill(true, 201)]),
+ y=NullableCategoricalArray(-100:300, [fill(true, 200); fill(false, 201)])))
end
diff --git a/test/index.jl b/test/index.jl
index 9160da4249..41fc3f1495 100644
--- a/test/index.jl
+++ b/test/index.jl
@@ -16,16 +16,11 @@ inds = Any[1,
1:1,
1.0:1.0,
[:A],
- @data([true]),
- @data([1]),
- @data([1.0]),
- @data([:A]),
- DataArray([:A]),
- PooledDataArray([true]),
- @pdata([1]),
- @pdata([1.0]),
- @pdata([:A]),
- PooledDataArray([:A])]
+ NullableArray([true]),
+ NullableArray([1]),
+ NullableArray([1.0]),
+ NullableArray([:A]),
+ NullableArray([:A])]
for ind in inds
if isequal(ind, :A) || ndims(ind) == 0
@@ -56,4 +51,12 @@ for name in names(i)
i2[name] # Issue #715
end
+#= Aliasing & Mutation =#
+
+# columns should not alias if scalar broadcasted
+df = DataFrame(A=[0],B=[0])
+df[1:end] = 0.0
+df[1,:A] = 1.0
+@test df[1,:B] === Nullable(0)
+
end
diff --git a/test/io.jl b/test/io.jl
index c587a57b59..715de39e21 100644
--- a/test/io.jl
+++ b/test/io.jl
@@ -1,6 +1,7 @@
module TestIO
using Base.Test
using DataFrames, Compat
+ using LaTeXStrings
#test_group("We can read various file types.")
@@ -38,57 +39,57 @@ module TestIO
@test size(df) == (58788, 25)
- @test df[1, 1] === 1
- @test df[1, 2] == "\$"
- @test df[1, 3] === 1971
- @test df[1, 4] === 121
- @test df[1, 5] === NA
- @test df[1, 6] === 6.4
- @test df[1, 7] === 348
- @test df[1, 8] === 4.5
- @test df[1, 9] === 4.5
- @test df[1, 10] === 4.5
- @test df[1, 11] === 4.5
- @test df[1, 12] === 14.5
- @test df[1, 13] === 24.5
- @test df[1, 14] === 24.5
- @test df[1, 15] === 14.5
- @test df[1, 16] === 4.5
- @test df[1, 17] === 4.5
- @test df[1, 18] == ""
- @test df[1, 19] === 0
- @test df[1, 20] === 0
- @test df[1, 21] === 1
- @test df[1, 22] === 1
- @test df[1, 23] === 0
- @test df[1, 24] === 0
- @test df[1, 25] === 0
-
- @test df[end, 1] === 58788
- @test df[end, 2] == "xXx: State of the Union"
- @test df[end, 3] === 2005
- @test df[end, 4] === 101
- @test df[end, 5] === 87000000
- @test df[end, 6] === 3.9
- @test df[end, 7] === 1584
- @test df[end, 8] === 24.5
- @test df[end, 9] === 4.5
- @test df[end, 10] === 4.5
- @test df[end, 11] === 4.5
- @test df[end, 12] === 4.5
- @test df[end, 13] === 14.5
- @test df[end, 14] === 4.5
- @test df[end, 15] === 4.5
- @test df[end, 16] === 4.5
- @test df[end, 17] === 14.5
- @test df[end, 18] == "PG-13"
- @test df[end, 19] === 1
- @test df[end, 20] === 0
- @test df[end, 21] === 0
- @test df[end, 22] === 0
- @test df[end, 23] === 0
- @test df[end, 24] === 0
- @test df[end, 25] === 0
+ @test df[1, 1] === Nullable(1)
+ @test isequal(df[1, 2], Nullable("\$"))
+ @test df[1, 3] === Nullable(1971)
+ @test df[1, 4] === Nullable(121)
+ @test isnull(df[1, 5])
+ @test df[1, 6] === Nullable(6.4)
+ @test df[1, 7] === Nullable(348)
+ @test df[1, 8] === Nullable(4.5)
+ @test df[1, 9] === Nullable(4.5)
+ @test df[1, 10] === Nullable(4.5)
+ @test df[1, 11] === Nullable(4.5)
+ @test df[1, 12] === Nullable(14.5)
+ @test df[1, 13] === Nullable(24.5)
+ @test df[1, 14] === Nullable(24.5)
+ @test df[1, 15] === Nullable(14.5)
+ @test df[1, 16] === Nullable(4.5)
+ @test df[1, 17] === Nullable(4.5)
+ @test isequal(df[1, 18], Nullable(""))
+ @test df[1, 19] === Nullable(0)
+ @test df[1, 20] === Nullable(0)
+ @test df[1, 21] === Nullable(1)
+ @test df[1, 22] === Nullable(1)
+ @test df[1, 23] === Nullable(0)
+ @test df[1, 24] === Nullable(0)
+ @test df[1, 25] === Nullable(0)
+
+ @test df[end, 1] === Nullable(58788)
+ @test isequal(df[end, 2], Nullable("xXx: State of the Union"))
+ @test df[end, 3] === Nullable(2005)
+ @test df[end, 4] === Nullable(101)
+ @test df[end, 5] === Nullable(87000000)
+ @test df[end, 6] === Nullable(3.9)
+ @test df[end, 7] === Nullable(1584)
+ @test df[end, 8] === Nullable(24.5)
+ @test df[end, 9] === Nullable(4.5)
+ @test df[end, 10] === Nullable(4.5)
+ @test df[end, 11] === Nullable(4.5)
+ @test df[end, 12] === Nullable(4.5)
+ @test df[end, 13] === Nullable(14.5)
+ @test df[end, 14] === Nullable(4.5)
+ @test df[end, 15] === Nullable(4.5)
+ @test df[end, 16] === Nullable(4.5)
+ @test df[end, 17] === Nullable(14.5)
+ @test isequal(df[end, 18], Nullable("PG-13"))
+ @test df[end, 19] === Nullable(1)
+ @test df[end, 20] === Nullable(0)
+ @test df[end, 21] === Nullable(0)
+ @test df[end, 22] === Nullable(0)
+ @test df[end, 23] === Nullable(0)
+ @test df[end, 24] === Nullable(0)
+ @test df[end, 25] === Nullable(0)
#test_group("readtable handles common separators and infers them from extensions.")
@@ -97,9 +98,9 @@ module TestIO
df3 = readtable("$data/separators/sample_data.wsv")
df4 = readtable("$data/separators/sample_data_white.txt", separator = ' ')
- @test df1 == df2
- @test df2 == df3
- @test df3 == df4
+ @test isequal(df1, df2)
+ @test isequal(df2, df3)
+ @test isequal(df3, df4)
readtable("$data/quoting/quotedwhitespace.txt", separator = ' ')
@@ -129,23 +130,25 @@ module TestIO
# df10 = readtable("$data/skiplines/skipfront.csv", skipstart = 3, header = false, skiprows = [4, 6])
# names!(df10, names(df1))
- @test df2 == df1
- @test df3 == df1
- @test df4 == df1
+ @test isequal(df2, df1)
+ @test isequal(df3, df1)
+ @test isequal(df4, df1)
# Windows EOLS
- @test df5 == df1
- @test df6 == df1
- @test df7 == df1
- @test df8 == df1
+ @test isequal(df5, df1)
+ @test isequal(df6, df1)
+ @test isequal(df7, df1)
+ @test isequal(df8, df1)
- # @test df9 == df1[3:end]
- # @test df10 == df1[[1, 3:end]]
+ # @test isequal(df9, df1[3:end])
+ # @test isequal(df10, df1[[1, 3:end]])
function normalize_eol!(df)
for (name, col) in eachcol(df)
if eltype(col) <: AbstractString
df[name] = map(s -> replace(s, "\r\n", "\n"), col)
+ elseif eltype(col) <: Nullable && eltype(eltype(col)) <: AbstractString
+ df[name] = map(s -> replace(get(s), "\r\n", "\n"), col)
end
end
df
@@ -163,16 +166,16 @@ module TestIO
# df2w = readtable(winpath; opts2...)
# Normalize line endings in both and test equality
- @test normalize_eol!(df1w) == normalize_eol!(df1)
+ @test isequal(normalize_eol!(df1w), normalize_eol!(df1))
# @test normalize_eol!(df2w) == df1
opts1[:nrows] = 3
opts2[:nrows] = 3
- @test normalize_eol!(readtable(osxpath; opts1...)) == df1[1:3, :]
- # @test readtable(osxpath; opts2...) == df1[1:3, :]
- @test normalize_eol!(readtable(winpath; opts1...)) == df1[1:3, :]
- # @test readtable(winpath; opts2...) == df1[1:3, :]
+ @test isequal(normalize_eol!(readtable(osxpath; opts1...)), df1[1:3, :])
+    # @test isequal(readtable(osxpath; opts2...), df1[1:3, :])
+ @test isequal(normalize_eol!(readtable(winpath; opts1...)), df1[1:3, :])
+    # @test isequal(readtable(winpath; opts2...), df1[1:3, :])
#test_group("readtable handles custom delimiters.")
@@ -181,55 +184,55 @@ module TestIO
readtable("$data/separators/sample_data.csv", quotemark = Char[])
@test_throws ErrorException readtable("$data/newlines/embedded_osx.csv", quotemark = Char[])
df = readtable("$data/quoting/single.csv", quotemark = ['\''])
- @test df == readtable("$data/quoting/mixed.csv", quotemark = ['\'', '"'])
+ @test isequal(df, readtable("$data/quoting/mixed.csv", quotemark = ['\'', '"']))
# df = readtable("$data/decimal/period.csv")
- # @test df[2, :A] == 0.3
- # @test df[2, :B] == 4.0
+    # @test isequal(df[2, :A], 0.3)
+    # @test isequal(df[2, :B], 4.0)
- # @test df == readtable("$data/decimal/comma.tsv", decimal = ',')
+ # @test isequal(df, readtable("$data/decimal/comma.tsv", decimal = ','))
#test_group("readtable column names.")
ns = [:Var1, :Var2, :Var3, :Var4, :Var5]
df = readtable("$data/typeinference/mixedtypes.csv")
names!(df, ns)
- @test df == readtable("$data/typeinference/mixedtypes.csv", names = ns)
+ @test isequal(df, readtable("$data/typeinference/mixedtypes.csv", names = ns))
df = readtable("$data/separators/sample_data.csv", header = false, names = ns[1:3])
- @test df[1, :Var1] == 0
+ @test isequal(df[1, :Var1], Nullable(0))
df = readtable("$data/separators/sample_data.csv", names = ns[1:3])
- @test df[1, :Var1] == 1
+ @test isequal(df[1, :Var1], Nullable(1))
#test_group("Properties of data frames returned by readtable method.")
# Readtable ignorepadding
io = IOBuffer("A , \tB , C\n1 , \t2, 3\n")
- @test readtable(io, ignorepadding = true) == DataFrame(A = 1, B = 2, C = 3)
+ @test isequal(readtable(io, ignorepadding = true), DataFrame(A = 1, B = 2, C = 3))
# Readtable c-style escape options
df = readtable("$data/escapes/escapes.csv", allowescapes = true)
- @test df[1, :V] == "\t\r\n"
- @test df[2, :V] == "\\\\t"
- @test df[3, :V] == "\\\""
+ @test isequal(df[1, :V], Nullable("\t\r\n"))
+ @test isequal(df[2, :V], Nullable("\\\\t"))
+ @test isequal(df[3, :V], Nullable("\\\""))
df = readtable("$data/escapes/escapes.csv")
- @test df[1, :V] == "\\t\\r\\n"
- @test df[2, :V] == "\\\\t"
- @test df[3, :V] == "\\\""
+ @test isequal(df[1, :V], Nullable("\\t\\r\\n"))
+ @test isequal(df[2, :V], Nullable("\\\\t"))
+ @test isequal(df[3, :V], Nullable("\\\""))
# df = readtable("$data/escapes/escapes.csv", escapechars = ['"'], nrows = 2)
- # @test df[1, :V] == "\\t\\r\\n"
- # @test df[2, :V] == "\\\\\\\\t"
+ # @test isequal(df[1, :V], "\\t\\r\\n")
+ # @test isequal(df[2, :V], "\\\\\\\\t")
# Readtable with makefactors active should only make factors from columns
# of strings.
filename = "$data/factors/mixedvartypes.csv"
df = readtable(filename, makefactors = true)
- @test typeof(df[:factorvar]) == PooledDataArray{Compat.UTF8String,UInt32,1}
- @test typeof(df[:floatvar]) == DataArray{Float64,1}
+ @test isa(df[:factorvar], NullableCategoricalArray{Compat.UTF8String,1})
+ @test isa(df[:floatvar], NullableArray{Float64,1})
# Readtable shouldn't silently drop data when reading highly compressed gz.
df = readtable("$data/compressed/1000x2.csv.gz")
@@ -238,79 +241,79 @@ module TestIO
# Readtable type inference
filename = "$data/typeinference/bool.csv"
df = readtable(filename)
- @test typeof(df[:Name]) == DataArray{Compat.UTF8String,1}
- @test typeof(df[:IsMale]) == DataArray{Bool,1}
- @test df[:IsMale][1] == true
- @test df[:IsMale][4] == false
+ @test isa(df[:Name], NullableArray{Compat.UTF8String,1})
+ @test isa(df[:IsMale], NullableArray{Bool,1})
+ @test get(df[:IsMale][1])
+ @test !get(df[:IsMale][4])
filename = "$data/typeinference/standardtypes.csv"
df = readtable(filename)
- @test typeof(df[:IntColumn]) == DataArray{Int,1}
- @test typeof(df[:IntlikeColumn]) == DataArray{Float64,1}
- @test typeof(df[:FloatColumn]) == DataArray{Float64,1}
- @test typeof(df[:BoolColumn]) == DataArray{Bool,1}
- @test typeof(df[:StringColumn]) == DataArray{Compat.UTF8String,1}
+ @test isa(df[:IntColumn], NullableArray{Int,1})
+ @test isa(df[:IntlikeColumn], NullableArray{Float64,1})
+ @test isa(df[:FloatColumn], NullableArray{Float64,1})
+ @test isa(df[:BoolColumn], NullableArray{Bool,1})
+ @test isa(df[:StringColumn], NullableArray{Compat.UTF8String,1})
filename = "$data/typeinference/mixedtypes.csv"
df = readtable(filename)
- @test typeof(df[:c1]) == DataArray{Compat.UTF8String,1}
- @test df[:c1][1] == "1"
- @test df[:c1][2] == "2.0"
- @test df[:c1][3] == "true"
- @test typeof(df[:c2]) == DataArray{Float64,1}
- @test df[:c2][1] == 1.0
- @test df[:c2][2] == 3.0
- @test df[:c2][3] == 4.5
- @test typeof(df[:c3]) == DataArray{Compat.UTF8String,1}
- @test df[:c3][1] == "0"
- @test df[:c3][2] == "1"
- @test df[:c3][3] == "f"
- @test typeof(df[:c4]) == DataArray{Bool,1}
- @test df[:c4][1] == true
- @test df[:c4][2] == false
- @test df[:c4][3] == true
- @test typeof(df[:c5]) == DataArray{Compat.UTF8String,1}
- @test df[:c5][1] == "False"
- @test df[:c5][2] == "true"
- @test df[:c5][3] == "true"
+ @test isa(df[:c1], NullableArray{Compat.UTF8String,1})
+ @test isequal(df[:c1][1], Nullable("1"))
+ @test isequal(df[:c1][2], Nullable("2.0"))
+ @test isequal(df[:c1][3], Nullable("true"))
+ @test isa(df[:c2], NullableArray{Float64,1})
+ @test isequal(df[:c2][1], Nullable(1.0))
+ @test isequal(df[:c2][2], Nullable(3.0))
+ @test isequal(df[:c2][3], Nullable(4.5))
+ @test isa(df[:c3], NullableArray{Compat.UTF8String,1})
+ @test isequal(df[:c3][1], Nullable("0"))
+ @test isequal(df[:c3][2], Nullable("1"))
+ @test isequal(df[:c3][3], Nullable("f"))
+ @test isa(df[:c4], NullableArray{Bool,1})
+ @test isequal(df[:c4][1], Nullable(true))
+ @test isequal(df[:c4][2], Nullable(false))
+ @test isequal(df[:c4][3], Nullable(true))
+ @test isa(df[:c5], NullableArray{Compat.UTF8String,1})
+ @test isequal(df[:c5][1], Nullable("False"))
+ @test isequal(df[:c5][2], Nullable("true"))
+ @test isequal(df[:c5][3], Nullable("true"))
# Readtable defining column types
filename = "$data/definedtypes/mixedvartypes.csv"
df = readtable(filename)
- @test typeof(df[:n]) == DataArray{Int,1}
- @test df[:n][1] == 1
- @test typeof(df[:s]) == DataArray{Compat.UTF8String,1}
- @test df[:s][1] == "text"
- @test typeof(df[:f]) == DataArray{Float64,1}
- @test df[:f][1] == 2.3
- @test typeof(df[:b]) == DataArray{Bool,1}
- @test df[:b][1] == true
+ @test isa(df[:n], NullableArray{Int,1})
+ @test isequal(df[:n][1], Nullable(1))
+ @test isa(df[:s], NullableArray{Compat.UTF8String,1})
+ @test isequal(df[:s][1], Nullable("text"))
+ @test isa(df[:f], NullableArray{Float64,1})
+ @test isequal(df[:f][1], Nullable(2.3))
+ @test isa(df[:b], NullableArray{Bool,1})
+ @test isequal(df[:b][1], Nullable(true))
df = readtable(filename, eltypes = [Int64, Compat.UTF8String, Float64, Bool])
- @test typeof(df[:n]) == DataArray{Int64,1}
- @test df[:n][1] == 1
- @test typeof(df[:s]) == DataArray{Compat.UTF8String,1}
- @test df[:s][1] == "text"
- @test df[:s][4] == "text ole"
- @test typeof(df[:f]) == DataArray{Float64,1}
- @test df[:f][1] == 2.3
- @test typeof(df[:b]) == DataArray{Bool,1}
- @test df[:b][1] == true
- @test df[:b][2] == false
+ @test isa(df[:n], NullableArray{Int64,1})
+ @test isequal(df[:n][1], Nullable(1))
+ @test isa(df[:s], NullableArray{Compat.UTF8String,1})
+ @test isequal(df[:s][1], Nullable("text"))
+ @test isequal(df[:s][4], Nullable("text ole"))
+ @test isa(df[:f], NullableArray{Float64,1})
+ @test isequal(df[:f][1], Nullable(2.3))
+ @test isa(df[:b], NullableArray{Bool,1})
+ @test isequal(df[:b][1], Nullable(true))
+ @test isequal(df[:b][2], Nullable(false))
df = readtable(filename, eltypes = [Int64, Compat.UTF8String, Float64, Compat.UTF8String])
- @test typeof(df[:n]) == DataArray{Int64,1}
- @test df[:n][1] == 1.0
- @test isna(df[:s][3])
- @test typeof(df[:f]) == DataArray{Float64,1}
+ @test isa(df[:n], NullableArray{Int64,1})
+ @test isequal(df[:n][1], Nullable(1.0))
+ @test isnull(df[:s][3])
+ @test isa(df[:f], NullableArray{Float64,1})
# Float are not converted to int
- @test df[:f][1] == 2.3
- @test df[:f][2] == 0.2
- @test df[:f][3] == 5.7
- @test typeof(df[:b]) == DataArray{Compat.UTF8String,1}
- @test df[:b][1] == "T"
- @test df[:b][2] == "FALSE"
+ @test isequal(df[:f][1], Nullable(2.3))
+ @test isequal(df[:f][2], Nullable(0.2))
+ @test isequal(df[:f][3], Nullable(5.7))
+ @test isa(df[:b], NullableArray{Compat.UTF8String,1})
+ @test isequal(df[:b][1], Nullable("T"))
+ @test isequal(df[:b][2], Nullable("FALSE"))
# Readtable name normalization
abnormal = "\u212b"
@@ -324,12 +327,13 @@ module TestIO
io = IOBuffer(abnormal*",%_B*\tC*,end\n1,2,3\n")
@test names(readtable(io, normalizenames=false)) == [Symbol(abnormal),Symbol("%_B*\tC*"),:end]
- # Test writetable with NA and compare to the results
+ # Test writetable with Nullable() and compare to the results
tf = tempname()
isfile(tf) && rm(tf)
- df = DataFrame(A = @data([1,NA]), B = @data(["b", NA]))
+ df = DataFrame(A = NullableArray(Nullable{Int}[1,Nullable()]),
+ B = NullableArray(Nullable{String}["b", Nullable()]))
writetable(tf, df)
- @test readcsv(tf) == ["A" "B"; 1 "b"; "NA" "NA"]
+ @test readcsv(tf) == ["A" "B"; 1 "b"; "NULL" "NULL"]
# Test writetable with nastring set and compare to the results
isfile(tf) && rm(tf)
@@ -338,10 +342,10 @@ module TestIO
rm(tf)
# Test writetable with append
- df1 = DataFrame(a = @data([1, 2, 3]), b = @data([4, 5, 6]))
- df2 = DataFrame(a = @data([1, 2, 3]), b = @data([4, 5, 6]))
- df3 = DataFrame(a = @data([1, 2, 3]), c = @data([4, 5, 6])) # 2nd column mismatch
- df3b = DataFrame(a = @data([1, 2, 3]), b = @data([4, 5, 6]), c = @data([4, 5, 6])) # number of columns mismatch
+ df1 = DataFrame(a = NullableArray([1, 2, 3]), b = NullableArray([4, 5, 6]))
+ df2 = DataFrame(a = NullableArray([1, 2, 3]), b = NullableArray([4, 5, 6]))
+ df3 = DataFrame(a = NullableArray([1, 2, 3]), c = NullableArray([4, 5, 6])) # 2nd column mismatch
+ df3b = DataFrame(a = NullableArray([1, 2, 3]), b = NullableArray([4, 5, 6]), c = NullableArray([4, 5, 6])) # number of columns mismatch
# Would use joinpath(tempdir(), randstring()) to get around tempname
@@ -352,22 +356,22 @@ module TestIO
# Written as normal if file doesn't exist
writetable(tf, df1, append = true)
- @test readtable(tf) == df1
+ @test isequal(readtable(tf), df1)
# Written as normal if file is empty
open(io -> print(io, ""), tf, "w")
writetable(tf, df1, append = true)
- @test readtable(tf) == df1
+ @test isequal(readtable(tf), df1)
# Appends to existing file if append == true
writetable(tf, df1)
writetable(tf, df2, header = false, append = true)
- @test readtable(tf) == vcat(df1, df2)
+ @test isequal(readtable(tf), vcat(df1, df2))
# Overwrites file if append == false
writetable(tf, df1)
writetable(tf, df2)
- @test readtable(tf) == df2
+ @test isequal(readtable(tf), df2)
# Enforces matching column names iff append == true && header == true
writetable(tf, df2)
@@ -387,7 +391,7 @@ module TestIO
# Make sure the ' doesn't get escaped for no reason
writetable(tf, df)
- @test readtable(tf) == df
+ @test isequal(readtable(tf), df)
# Make sure the ' does get escaped when needed
writetable(tf, df, quotemark='\'')
@@ -405,10 +409,10 @@ module TestIO
"""
@test size(df1) == (4, 3)
@test names(df1) == [:name, :age, :squidPerWeek]
- @test df1[1] == ["Alice","Bob","Carol","Eve"]
- @test df1[2] == [36,24,58,49]
- @test df1[3] == [3.14,0,2.71,7.77]
- @test typeof(df1[1]) <: DataArray
+ @test isequal(df1[1], NullableArray(["Alice","Bob","Carol","Eve"]))
+ @test isequal(df1[2], NullableArray([36,24,58,49]))
+ @test isequal(df1[3], NullableArray([3.14,0,2.71,7.77]))
+ @test isa(df1[1], NullableArray{Compat.UTF8String,1})
# Test @wsv_str
df2 = wsv"""
@@ -418,7 +422,7 @@ module TestIO
Carol 58 2.71
Eve 49 7.77
"""
- @test df2 == df1
+ @test isequal(df2, df1)
# Test @tsv_str
df3 = tsv"""
@@ -428,7 +432,7 @@ module TestIO
Carol 58 2.71
Eve 49 7.77
"""
- @test df3 == df1
+ @test isequal(df3, df1)
# csv2 can't be tested until non-'.' decimals are implemented
#df4 = csv2"""
@@ -438,7 +442,7 @@ module TestIO
# Carol; 58; 2,71
# Eve; 49; 7,77
# """
- #@test df4 == df1
+ #@test isequal(df4, df1)
# Test 'f' flag
df5 = csv"""
@@ -448,7 +452,7 @@ module TestIO
Carol, 58, 2.71
Eve, 49, 7.77
"""f
- @test typeof(df5[1]) <: PooledDataArray
+ @test isa(df5[1], NullableCategoricalArray{Compat.UTF8String,1})
# Test 'c' flag
df6 = csv"""
@@ -458,7 +462,7 @@ module TestIO
#Carol, 58, 2.71
Eve, 49, 7.77
"""c
- @test df6 == df1[[1,2,4],:]
+ @test isequal(df6, df1[[1,2,4],:])
# Test 'H' flag
df7 = csv"""
@@ -468,7 +472,8 @@ module TestIO
Eve, 49, 7.77
"""H
@test names(df7) == [:x1,:x2,:x3]
- @test Array(df7) == Array(df1)
+ names!(df7, names(df1))
+ @test isequal(df7, df1)
# Test multiple flags at once
df8 = csv"""
@@ -477,12 +482,49 @@ module TestIO
#Carol, 58, 2.71
Eve, 49, 7.77
"""fcH
- @test typeof(df8[1]) <: PooledDataArray
+ @test isa(df8[1], NullableCategoricalArray{Compat.UTF8String,1})
@test names(df8) == [:x1,:x2,:x3]
- @test Array(df8) == Array(df1[[1,2,4],:])
+ names!(df8, names(df1))
+ @test isequal(df8, df1[[1,2,4],:])
# Test invalid flag
# Need to wrap macro call inside eval to prevent the error from being
# thrown prematurely
@test_throws ArgumentError eval(:(csv"foo,bar"a))
+
+ # Test LaTeX export
+ df = DataFrame(A = 1:4,
+ B = ["\$10.0", "M&F", "A~B", "\\alpha"],
+ C = [L"\alpha", L"\beta", L"\gamma", L"\sum_{i=1}^n \delta_i"],
+ D = [1.0, 2.0, Nullable(), 3.0]
+ )
+ str = """
+ \\begin{tabular}{r|cccc}
+ \t& A & B & C & D\\\\
+ \t\\hline
+ \t1 & 1 & \\\$10.0 & \$\\alpha\$ & 1.0 \\\\
+ \t2 & 2 & M\\&F & \$\\beta\$ & 2.0 \\\\
+ \t3 & 3 & A\\textasciitilde{}B & \$\\gamma\$ & \\\\
+ \t4 & 4 & \\textbackslash{}alpha & \$\\sum_{i=1}^n \\delta_i\$ & 3.0 \\\\
+ \\end{tabular}
+ """
+ @test reprmime(MIME("text/latex"), df) == str
+
+ #Test HTML output for IJulia and similar
+ df = DataFrame(Fish = ["Suzy", "Amir"], Mass = [1.5, Nullable()])
+ io = IOBuffer()
+ show(io, "text/html", df)
+ str = takebuf_string(io)
+ @test str == " | Fish | Mass |
---|
1 | Suzy | 1.5 |
---|
2 | Amir | #NULL |
---|
"
+
+ # test limit attribute of IOContext is used
+ df = DataFrame(a=collect(1:1000))
+ ioc = IOContext(IOBuffer(), displaysize=(10, 10), limit=false)
+ show(ioc, "text/html", df)
+ @test length(takebuf_string(ioc.io)) > 10000
+
+ io = IOBuffer()
+ show(io, "text/html", df)
+ @test length(takebuf_string(io)) < 10000
+
end
diff --git a/test/iteration.jl b/test/iteration.jl
index 5c712298cf..57c17becd4 100644
--- a/test/iteration.jl
+++ b/test/iteration.jl
@@ -1,34 +1,22 @@
module TestIteration
using Base.Test, DataFrames, Compat
- dv = @data([1, 2, NA])
- dm = DataArray([1 2; 3 4])
- dt = DataArray(zeros(2, 2, 2))
+ dv = NullableArray(Nullable{Int}[1, 2, Nullable()])
+ dm = NullableArray([1 2; 3 4])
+ dt = NullableArray(zeros(2, 2, 2))
df = DataFrame(A = 1:2, B = 2:3)
- for el in dv
- @test ndims(el) == 0
- end
-
- for el in dm
- @test ndims(el) == 0
- end
-
- for el in dt
- @test ndims(el) == 0
- end
-
for row in eachrow(df)
@test isa(row, DataFrameRow)
- @test row[:B]-row[:A] == 1
+ @test isequal(row[:B]-row[:A], Nullable(1))
# issue #683 (https://github.com/JuliaStats/DataFrames.jl/pull/683)
@test typeof(collect(row)) == @compat Array{Tuple{Symbol, Any}, 1}
end
for col in eachcol(df)
- @test isa(col, @compat Tuple{Symbol, AbstractDataVector})
+ @test isa(col, @compat Tuple{Symbol, NullableVector})
end
@test isequal(map(x -> minimum(convert(Array, x)), eachrow(df)), Any[1,2])
@@ -37,22 +25,22 @@ module TestIteration
row = DataFrameRow(df, 1)
row[:A] = 100
- @test df[1, :A] == 100
+ @test isequal(df[1, :A], Nullable(100))
row[1] = 101
- @test df[1, :A] == 101
+ @test isequal(df[1, :A], Nullable(101))
df = DataFrame(A = 1:4, B = ["M", "F", "F", "M"])
s1 = sub(df, 1:3)
s1[2,:A] = 4
- @test df[2, :A] == 4
- @test sub(s1, 1:2) == sub(df, 1:2)
+ @test isequal(df[2, :A], Nullable(4))
+ @test isequal(sub(s1, 1:2), sub(df, 1:2))
s2 = sub(df, 1:2:3)
s2[2, :B] = "M"
- @test df[3, :B] == "M"
- @test sub(s2, 1:1:2) == sub(df, [1,3])
+ @test isequal(df[3, :B], Nullable("M"))
+ @test isequal(sub(s2, 1:1:2), sub(df, [1,3]))
# @test_fail for x in df; end # Raises an error
end
diff --git a/test/join.jl b/test/join.jl
index b612eecc02..5be59c7915 100644
--- a/test/join.jl
+++ b/test/join.jl
@@ -14,15 +14,15 @@ module TestJoin
# Test output of various join types
outer = DataFrame(ID = [1, 2, 2, 3, 4],
- Name = @data(["John Doe", "Jane Doe", "Jane Doe", "Joe Blogs", NA]),
- Job = @data(["Lawyer", "Doctor", "Florist", NA, "Farmer"]))
+ Name = NullableArray(Nullable{String}["John Doe", "Jane Doe", "Jane Doe", "Joe Blogs", Nullable()]),
+ Job = NullableArray(Nullable{String}["Lawyer", "Doctor", "Florist", Nullable(), "Farmer"]))
# (Tests use current column ordering but don't promote it)
- right = outer[!isna(outer[:Job]), [:Name, :ID, :Job]]
- left = outer[!isna(outer[:Name]), :]
- inner = left[!isna(left[:Job]), :]
+ right = outer[Bool[!isnull(x) for x in outer[:Job]], [:Name, :ID, :Job]]
+ left = outer[Bool[!isnull(x) for x in outer[:Name]], :]
+ inner = left[Bool[!isnull(x) for x in left[:Job]], :]
semi = unique(inner[:, [:ID, :Name]])
- anti = left[isna(left[:Job]), [:ID, :Name]]
+ anti = left[Bool[isnull(x) for x in left[:Job]], [:ID, :Name]]
@test isequal(join(name, job, on = :ID), inner)
@test isequal(join(name, job, on = :ID, kind = :inner), inner)
@@ -59,7 +59,7 @@ module TestJoin
B = ['a', 'a', 'a', 'b', 'b', 'b'],
C = [3, 4, 5, 3, 4, 5])
- @test join(df1, df2[[:C]], kind = :cross) == cross
+ @test isequal(join(df1, df2[[:C]], kind = :cross), cross)
# Cross joins handle naming collisions
@test size(join(df1, df1, kind = :cross)) == (4, 4)
@@ -67,11 +67,44 @@ module TestJoin
# Cross joins don't take keys
@test_throws ArgumentError join(df1, df2, on = :A, kind = :cross)
+ # test empty inputs
+ simple_df(len::Int, col=:A) = (df = DataFrame(); df[col]=collect(1:len); df)
+ @test isequal(join(simple_df(0), simple_df(0), on = :A, kind = :left), simple_df(0))
+ @test isequal(join(simple_df(2), simple_df(0), on = :A, kind = :left), simple_df(2))
+ @test isequal(join(simple_df(0), simple_df(2), on = :A, kind = :left), simple_df(0))
+ @test isequal(join(simple_df(0), simple_df(0), on = :A, kind = :right), simple_df(0))
+ @test isequal(join(simple_df(0), simple_df(2), on = :A, kind = :right), simple_df(2))
+ @test isequal(join(simple_df(2), simple_df(0), on = :A, kind = :right), simple_df(0))
+ @test isequal(join(simple_df(0), simple_df(0), on = :A, kind = :inner), simple_df(0))
+ @test isequal(join(simple_df(0), simple_df(2), on = :A, kind = :inner), simple_df(0))
+ @test isequal(join(simple_df(2), simple_df(0), on = :A, kind = :inner), simple_df(0))
+ @test isequal(join(simple_df(0), simple_df(0), on = :A, kind = :outer), simple_df(0))
+ @test isequal(join(simple_df(0), simple_df(2), on = :A, kind = :outer), simple_df(2))
+ @test isequal(join(simple_df(2), simple_df(0), on = :A, kind = :outer), simple_df(2))
+ @test isequal(join(simple_df(0), simple_df(0), on = :A, kind = :semi), simple_df(0))
+ @test isequal(join(simple_df(2), simple_df(0), on = :A, kind = :semi), simple_df(0))
+ @test isequal(join(simple_df(0), simple_df(2), on = :A, kind = :semi), simple_df(0))
+ @test isequal(join(simple_df(0), simple_df(0), on = :A, kind = :anti), simple_df(0))
+ @test isequal(join(simple_df(2), simple_df(0), on = :A, kind = :anti), simple_df(2))
+ @test isequal(join(simple_df(0), simple_df(2), on = :A, kind = :anti), simple_df(0))
+ @test isequal(join(simple_df(0), simple_df(0, :B), kind = :cross), DataFrame(A=Int[], B=Int[]))
+ @test isequal(join(simple_df(0), simple_df(2, :B), kind = :cross), DataFrame(A=Int[], B=Int[]))
+ @test isequal(join(simple_df(2), simple_df(0, :B), kind = :cross), DataFrame(A=Int[], B=Int[]))
+
# issue #960
df1 = DataFrame(A = 1:50,
B = 1:50,
C = 1)
- pool!(df1, :A)
- pool!(df1, :B)
+ categorical!(df1, :A)
+ categorical!(df1, :B)
join(df1, df1, on = [:A, :B], kind = :inner)
+
+ # Test that Array{Nullable} works when combined with NullableArray (#1088)
+ df = DataFrame(Name = Nullable{String}["A", "B", "C"],
+ Mass = [1.5, 2.2, 1.1])
+ df2 = DataFrame(Name = ["A", "B", "C", "A"],
+ Quantity = [3, 3, 2, 4])
+ @test join(df2, df, on=:Name, kind=:left) == DataFrame(Name = ["A", "A", "B", "C"],
+ Quantity = [3, 4, 3, 2],
+ Mass = [1.5, 1.5, 2.2, 1.1])
end
diff --git a/test/show.jl b/test/show.jl
index d5afb45840..ef492ba8d3 100644
--- a/test/show.jl
+++ b/test/show.jl
@@ -1,6 +1,7 @@
module TestShow
using DataFrames
using Compat
+ using Base.Test
import Compat.String
df = DataFrame(A = 1:3, B = ["x", "y", "z"])
@@ -35,4 +36,16 @@ module TestShow
show(io, A)
A = DataFrames.RepeatedVector([1, 2, 3], 1, 5)
show(io, A)
+
+ #Test show output for REPL and similar
+ df = DataFrame(Fish = ["Suzy", "Amir"], Mass = [1.5, Nullable()])
+ io = IOBuffer()
+ show(io, df)
+ str = takebuf_string(io)
+ @test str == """
+2×2 DataFrames.DataFrame
+│ Row │ Fish │ Mass │
+├─────┼──────┼───────┤
+│ 1 │ Suzy │ 1.5 │
+│ 2 │ Amir │ #NULL │"""
end
diff --git a/test/sort.jl b/test/sort.jl
index cf48fab744..7760a3dd81 100644
--- a/test/sort.jl
+++ b/test/sort.jl
@@ -2,20 +2,20 @@ module TestSort
using Base.Test
using DataFrames
- dv1 = @data([9, 1, 8, NA, 3, 3, 7, NA])
- dv2 = 1.0 * dv1
- dv3 = DataArray([1:8;])
- pdv1 = convert(PooledDataArray, dv1)
+ dv1 = NullableArray(Nullable{Int}[9, 1, 8, Nullable(), 3, 3, 7, Nullable()])
+ dv2 = NullableArray(Nullable{Int}[9, 1, 8, Nullable(), 3, 3, 7, Nullable()])
+ dv3 = NullableArray(1:8)
+ cv1 = NullableCategoricalArray(dv1, ordered=true)
- d = DataFrame(dv1 = dv1, dv2 = dv2, dv3 = dv3, pdv1 = pdv1)
+ d = DataFrame(dv1 = dv1, dv2 = dv2, dv3 = dv3, cv1 = cv1)
@test sortperm(d) == sortperm(dv1)
@test sortperm(d[[:dv3, :dv1]]) == sortperm(dv3)
- @test sort(d, cols=:dv1)[:dv3] == sortperm(dv1)
- @test sort(d, cols=:dv2)[:dv3] == sortperm(dv1)
- @test sort(d, cols=:pdv1)[:dv3] == sortperm(dv1)
- @test sort(d, cols=[:dv1, :pdv1])[:dv3] == sortperm(dv1)
- @test sort(d, cols=[:dv1, :dv3])[:dv3] == sortperm(dv1)
+ @test isequal(sort(d, cols=:dv1)[:dv3], NullableArray(sortperm(dv1)))
+ @test isequal(sort(d, cols=:dv2)[:dv3], NullableArray(sortperm(dv1)))
+ @test isequal(sort(d, cols=:cv1)[:dv3], NullableArray(sortperm(dv1)))
+ @test isequal(sort(d, cols=[:dv1, :cv1])[:dv3], NullableArray(sortperm(dv1)))
+ @test isequal(sort(d, cols=[:dv1, :dv3])[:dv3], NullableArray(sortperm(dv1)))
df = DataFrame(rank=rand(1:12, 1000),
chrom=rand(1:24, 1000),
@@ -33,13 +33,17 @@ module TestSort
@test issorted(ds2, cols=(order(:rank, rev=true), :chrom, :pos))
@test issorted(ds2, rev=(true, false, false))
- @test ds2 == ds
+ @test isequal(ds2, ds)
sort!(df, cols=(:rank, :chrom, :pos), rev=(true, false, false))
@test issorted(df, cols=(order(:rank, rev=true), :chrom, :pos))
@test issorted(df, rev=(true, false, false))
- @test df == ds
-
+ @test isequal(df, ds)
+ # Check that columns that shares the same underlying array are only permuted once PR#1072
+ df = DataFrame(a=[2,1])
+ df[:b] = df[:a]
+ sort!(df, cols=:a)
+ @test df == DataFrame(a=[1,2],b=[1,2])
end
diff --git a/test/statsmodel.jl b/test/statsmodel.jl
index ca9dcab6f7..9d9e5cd3cc 100644
--- a/test/statsmodel.jl
+++ b/test/statsmodel.jl
@@ -33,12 +33,12 @@ d[:x4] = [17:20;]
f = y ~ x1 * x2
m = fit(DummyMod, f, d)
-@test model_response(m) == d[:y]
+@test model_response(m) == Array(d[:y])
## test prediction method
## vanilla
StatsBase.predict(mod::DummyMod) = mod.x * mod.beta
-@test predict(m) == [ ones(size(d,1)) d[:x1] d[:x2] d[:x1].*d[:x2] ] * collect(1:4)
+@test predict(m) == [ ones(size(d,1)) Array(d[:x1]) Array(d[:x2]) Array(d[:x1]).*Array(d[:x2]) ] * collect(1:4)
## new data from matrix
StatsBase.predict(mod::DummyMod, newX::Matrix) = newX * mod.beta
@@ -46,10 +46,10 @@ mm = ModelMatrix(ModelFrame(f, d))
@test predict(m, mm.m) == mm.m * collect(1:4)
## new data from DataFrame (via ModelMatrix)
-@test predict(m, d) == predict(m, mm.m)
+@test isequal(predict(m, d), NullableArray(predict(m, mm.m)))
d2 = deepcopy(d)
-d2[3, :x1] = NA
+d2[3, :x1] = Nullable()
@test length(predict(m, d2)) == 4
## test copying of names from Terms to CoefTable
@@ -61,23 +61,23 @@ io = IOBuffer()
show(io, m)
## with categorical variables
-d[:x1p] = PooledDataArray(d[:x1])
+d[:x1p] = NullableCategoricalArray(d[:x1])
f2 = y ~ x1p
m2 = fit(DummyMod, f2, d)
@test coeftable(m2).rownms == ["(Intercept)", "x1p: 6", "x1p: 7", "x1p: 8"]
## predict w/ new data missing levels
-@test predict(m2, d[2:4, :]) == predict(m2)[2:4]
+@test isequal(predict(m2, d[2:4, :]), NullableArray(predict(m2)[2:4]))
## predict w/ new data with _extra_ levels (throws an error)
d3 = deepcopy(d)
d3[1, :x1] = 0
-d3[:x1p] = PooledDataArray(d3[:x1])
+d3[:x1p] = NullableCategoricalVector(d3[:x1])
@test_throws ArgumentError predict(m2, d3)
## fit with contrasts specified
-d[:x2p] = PooledDataArray(d[:x2])
+d[:x2p] = NullableCategoricalVector(d[:x2])
f3 = y ~ x1p + x2p
m3 = fit(DummyMod, f3, d)
fit(DummyMod, f3, d, contrasts = Dict(:x1p => EffectsCoding()))
diff --git a/test/utils.jl b/test/utils.jl
index cc4e5bc931..9875fc4eb4 100644
--- a/test/utils.jl
+++ b/test/utils.jl
@@ -38,21 +38,21 @@ module TestUtils
"Expected if Julia was not built from source.")
end
- @test DataFrames.countna([1:3;]) == 0
-
- data = @data rand(20)
- @test DataFrames.countna(data) == 0
- data[sample(1:20, 11, replace=false)] = NA
- @test DataFrames.countna(data) == 11
- data[1:end] = NA
- @test DataFrames.countna(data) == 20
-
- pdata = @data sample(1:5, 20)
- @test DataFrames.countna(pdata) == 0
- pdata[sample(1:20, 11, replace=false)] = NA
- @test DataFrames.countna(pdata) == 11
- pdata[1:end] = NA
- @test DataFrames.countna(pdata) == 20
+ @test DataFrames.countnull([1:3;]) == 0
+
+ data = NullableArray(rand(20))
+ @test DataFrames.countnull(data) == 0
+ data[sample(1:20, 11, replace=false)] = Nullable()
+ @test DataFrames.countnull(data) == 11
+ data[1:end] = Nullable()
+ @test DataFrames.countnull(data) == 20
+
+ pdata = NullableArray(sample(1:5, 20))
+ @test DataFrames.countnull(pdata) == 0
+ pdata[sample(1:20, 11, replace=false)] = Nullable()
+ @test DataFrames.countnull(pdata) == 11
+ pdata[1:end] = Nullable()
+ @test DataFrames.countnull(pdata) == 20
funs = [mean, sum, var, x -> sum(x)]
if string(funs[end]) == "(anonymous function)" # Julia < 0.5