diff --git a/.travis.yml b/.travis.yml index ea0a185852..42e740c899 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,6 @@ language: julia julia: - - 0.4 - 0.5 - nightly os: @@ -15,6 +14,5 @@ script: - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi - julia --check-bounds=yes -e 'Pkg.clone(pwd()); Pkg.build("DataFrames"); Pkg.test("DataFrames"; coverage=true)' after_success: - - julia -e 'cd(Pkg.dir("DataFrames")); Pkg.clone("https://github.com/MichaelHatherly/Documenter.jl"); include(joinpath("docs", "make.jl"))' + - julia -e 'cd(Pkg.dir("DataFrames")); Pkg.add("Documenter"); include(joinpath("docs", "make.jl"))' - julia -e 'cd(Pkg.dir("DataFrames")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())' - diff --git a/REQUIRE b/REQUIRE index b20aea22d9..79622049cf 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,6 +1,7 @@ -julia 0.4 -DataArrays 0.3.4 -StatsBase 0.8.3 +julia 0.5 +NullableArrays 0.0.8 +CategoricalArrays 0.0.6 +StatsBase 0.11.0 GZip SortingAlgorithms Reexport diff --git a/appveyor.yml b/appveyor.yml index cfc1085114..84c37acbda 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,7 +1,5 @@ environment: matrix: - - JULIAVERSION: "julialang/bin/winnt/x86/0.4/julia-0.4-latest-win32.exe" - - JULIAVERSION: "julialang/bin/winnt/x64/0.4/julia-0.4-latest-win64.exe" - JULIAVERSION: "julialang/bin/winnt/x86/0.5/julia-0.5-latest-win32.exe" - JULIAVERSION: "julialang/bin/winnt/x64/0.5/julia-0.5-latest-win64.exe" - JULIAVERSION: "julianightlies/bin/winnt/x86/julia-latest-win32.exe" diff --git a/benchmark/datamatrix.jl b/benchmark/datamatrix.jl deleted file mode 100644 index c856a15020..0000000000 --- a/benchmark/datamatrix.jl +++ /dev/null @@ -1,37 +0,0 @@ -a = eye(100) -b = eye(100) - -dm_a = data(a) -dm_b = data(b) - -dm_a_na = copy(dm_a) -dm_a_na[:, :] = NA -dm_b_na = copy(dm_b) -dm_b_na[:, :] = NA - -f1() = *(a, b) -f2() = *(dm_a, dm_b) -f3() = *(dm_a_na, dm_b_na) - -df1 = benchmark(f1, - "Linear Algebra", - "Matrix 
Multiplication w/ No NA's", - 1_000) -df2 = benchmark(f2, - "Linear Algebra", - "DataMatrix Multiplication w/ No NA's", - 1_000) -df3 = benchmark(f3, - "Linear Algebra", - "DataMatrix Multiplication w/ NA's", - 1_000) - -# TODO: Keep permanent record -printtable(vcat(df1, df2, df3), header=false) - -# Compare with R -# We're 10x as fast! -# a <- diag(100) -# b <- diag(100) -# a %*% b -# s <- Sys.time(); a %*% b; e <- Sys.time(); e - s diff --git a/benchmark/datavector.jl b/benchmark/datavector.jl deleted file mode 100644 index 1139fa9114..0000000000 --- a/benchmark/datavector.jl +++ /dev/null @@ -1,56 +0,0 @@ -srand(1) -N = 1_000_000 -v = randn(N) -dv = DataArray(v) -dvna = deepcopy(dv) -dvna[rand(1:N, 10_000)] = NA -idxv = shuffle([1:N]) -idxdv = DataArray(idxv) - -f1(v) = sum(v) -f2(v) = sum(dropna(v)) -f3(v) = sum(dropna(v)) # Make this an iterator -f4(v) = mean(v) -f5(v) = mean(dropna(v)) -f6(v) = mean(dropna(v)) # Make this an iterator -f7(v1, v2) = v1 + v2 -f8(v1, v2) = v1 .> v2 -f9(v, i) = v[i] - -perf_test = Dict() - -perf_test["sum(v): Vector with no NA's"] = () -> f1(v) -perf_test["sum(dv): DataVector with no NA's"] = () -> f1(dv) -perf_test["sum(dropna(dv)): DataVector with no NA's"] = () -> f2(dv) -perf_test["sum(*dropna(dv)): DataVector with no NA's"] = () -> f3(dv) - -perf_test["sum(dvna): DataVector with NA's"] = () -> f4(dv) -perf_test["sum(dropna(dvna)): DataVector with NA's"] = () -> f5(dv) -perf_test["sum(*dropna(dvna)): DataVector with NA's"] = () -> f6(dv) - -perf_test["mean(v): Vector with no NA's"] = () -> f4(v) -perf_test["mean(dv): DataVector with no NA's"] = () -> f4(dv) -perf_test["mean(dropna(dv)): DataVector with no NA's"] = () -> f5(dv) -perf_test["mean(*dropna(dv)): DataVector with no NA's"] = () -> f6(dv) - -perf_test["mean(dvna): DataVector with NA's"] = () -> f4(dv) -perf_test["mean(dropna(dvna)): DataVector with NA's"] = () -> f5(dv) -perf_test["mean(*dropna(dvna)): DataVector with NA's"] = () -> f6(dv) - -perf_test["v + 1.0 : 
Vector"] = () -> f7(v, 1.0) -perf_test["dv + 1.0 : DataVector with no NA's"] = () -> f7(dv, 1.0) -perf_test["dvna + 1.0 : DataVector with NA's"] = () -> f7(dvna, 1.0) - -perf_test["v .> 1.0 : Vector"] = () -> f8(v, 1.0) -perf_test["dv .> 1.0 : DataVector with no NA's"] = () -> f8(dv, 1.0) -perf_test["dvna .> 1.0 : DataVector with NA's"] = () -> f8(dvna, 1.0) - -perf_test["v[idxv] : Vector"] = () -> f9(v, idxv) -perf_test["dv[idxv] : DataVector and Vector indexing"] = () -> f9(dv, idxv) -perf_test["dv[idxdv] : DataVector and DataVector indexing"] = () -> f9(dv, idxdv) - -for (name, f) in perf_test - res = benchmark(f, "DataArray Operations", name, 10) - # TODO: Keep permanent record - printtable(res, header=false) -end diff --git a/benchmark/results.csv b/benchmark/results.csv index 803a8f1963..2f828b1374 100644 --- a/benchmark/results.csv +++ b/benchmark/results.csv @@ -1,27 +1,4 @@ Category,Benchmark,Iterations,TotalWall,AverageWall,MaxWall,MinWall,Timestamp,JuliaHash,CodeHash,OS,CPUCores -"DataArray Operations","sum(v): Vector with no NA's",10,0.00857686996459961,0.000857686996459961,0.0008759498596191406,0.0008528232574462891,"2013-01-14 10:20:09","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","mean(dv): DataVector with no NA's",10,0.01034688949584961,0.001034688949584961,0.0015439987182617188,0.0009601116180419922,"2013-01-14 10:20:09","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","mean(removeNA(dvna)): DataVector with NA's",10,0.3172330856323242,0.031723308563232425,0.03600907325744629,0.031102895736694336,"2013-01-14 10:20:10","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","dv .> 1.0 : DataVector with no NA's",10,2.0662119388580322,0.20662119388580322,0.2186141014099121,0.19791007041931152,"2013-01-14 10:20:12","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","dv[idxdv] : DataVector and DataVector indexing",10,1.309283971786499,0.1309283971786499,0.1430819034576416,0.12758612632751465,"2013-01-14 
10:20:14","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","mean(v): Vector with no NA's",10,0.008614063262939453,0.0008614063262939453,0.0008738040924072266,0.0008530616760253906,"2013-01-14 10:20:14","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","dv + 1.0 : DataVector with no NA's",10,0.06403565406799316,0.006403565406799316,0.018197059631347656,0.002321958541870117,"2013-01-14 10:20:14","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","v .> 1.0 : Vector",10,0.6535539627075195,0.06535539627075196,0.08094000816345215,0.059654951095581055,"2013-01-14 10:20:15","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","dv[idxv] : DataVector and Vector indexing",10,1.0075325965881348,0.10075325965881347,0.10670304298400879,0.09811711311340332,"2013-01-14 10:20:16","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","sum(removeNA(dv)): DataVector with no NA's",10,0.31229615211486816,0.031229615211486816,0.03513312339782715,0.029542922973632812,"2013-01-14 10:20:16","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","v[idxv] : Vector",10,0.17796993255615234,0.017796993255615234,0.03006291389465332,0.01340794563293457,"2013-01-14 10:20:16","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","dvna .> 1.0 : DataVector with NA's",10,2.047502040863037,0.2047502040863037,0.21759915351867676,0.19756603240966797,"2013-01-14 10:20:19","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","sum(dvna): DataVector with NA's",10,0.009662866592407227,0.0009662866592407227,0.0009889602661132812,0.0009551048278808594,"2013-01-14 10:20:19","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","mean(*removeNA(dv)): DataVector with no NA's",10,0.33266425132751465,0.033266425132751465,0.03953218460083008,0.031510114669799805,"2013-01-14 10:20:19","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","mean(*removeNA(dvna)): DataVector with 
NA's",10,0.3514890670776367,0.035148906707763675,0.04177212715148926,0.030744075775146484,"2013-01-14 10:20:20","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","dvna + 1.0 : DataVector with NA's",10,0.06465601921081543,0.006465601921081543,0.018489837646484375,0.0023229122161865234,"2013-01-14 10:20:20","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","sum(*removeNA(dvna)): DataVector with NA's",10,0.3297770023345947,0.03297770023345947,0.039092063903808594,0.031161069869995117,"2013-01-14 10:20:20","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","sum(dv): DataVector with no NA's",10,0.009810924530029297,0.0009810924530029297,0.0010459423065185547,0.0009570121765136719,"2013-01-14 10:20:20","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","sum(removeNA(dvna)): DataVector with NA's",10,0.3214747905731201,0.03214747905731201,0.03607583045959473,0.03061199188232422,"2013-01-14 10:20:20","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","mean(removeNA(dv)): DataVector with no NA's",10,0.31479310989379883,0.03147931098937988,0.03677701950073242,0.030520200729370117,"2013-01-14 10:20:21","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","v + 1.0 : Vector",10,0.06811809539794922,0.006811809539794922,0.030359983444213867,0.0016188621520996094,"2013-01-14 10:20:21","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","sum(*removeNA(dv)): DataVector with no NA's",10,0.313723087310791,0.0313723087310791,0.036063194274902344,0.030745983123779297,"2013-01-14 10:20:21","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","mean(dvna): DataVector with NA's",10,0.010206937789916992,0.0010206937789916993,0.001260995864868164,0.0009670257568359375,"2013-01-14 10:20:21","9e0ff15b52","61162cd918","Darwin",4 "Linear Algebra","Matrix Multiplication w/ No NA's",1000,0.1434164047241211,0.0001434164047241211,0.03787708282470703,5.888938903808594e-5,"2013-01-14 
10:20:22","9e0ff15b52","61162cd918","Darwin",4 "Linear Algebra","DataMatrix Multiplication w/ No NA's",1000,2.0335049629211426,0.0020335049629211428,0.025124788284301758,0.001116037368774414,"2013-01-14 10:20:24","9e0ff15b52","61162cd918","Darwin",4 "Linear Algebra","DataMatrix Multiplication w/ NA's",1000,4.978086709976196,0.004978086709976196,0.038012027740478516,0.003064870834350586,"2013-01-14 10:20:29","9e0ff15b52","61162cd918","Darwin",4 @@ -40,29 +17,6 @@ Category,Benchmark,Iterations,TotalWall,AverageWall,MaxWall,MinWall,Timestamp,Ju "DataFrame I/O","space_before_delimiter.csv",10,0.007896661758422852,0.0007896661758422851,0.0009069442749023438,0.0005939006805419922,"2013-01-14 10:21:54","9e0ff15b52","61162cd918","Darwin",4 "DataFrame I/O","types.csv",10,0.01001596450805664,0.001001596450805664,0.0011119842529296875,0.0007750988006591797,"2013-01-14 10:21:54","9e0ff15b52","61162cd918","Darwin",4 "DataFrame I/O","utf8.csv",10,0.007441043853759766,0.0007441043853759766,0.0008280277252197266,0.0007090568542480469,"2013-01-14 10:21:54","9e0ff15b52","61162cd918","Darwin",4 -"DataArray Operations","sum(v): Vector with no NA's",10,0.009074211120605469,0.0009074211120605469,0.0009720325469970703,0.0008630752563476562,"2013-01-14 10:44:57","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","mean(dv): DataVector with no NA's",10,0.00978994369506836,0.000978994369506836,0.0010378360748291016,0.0009520053863525391,"2013-01-14 10:44:57","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","mean(removeNA(dvna)): DataVector with NA's",10,0.370542049407959,0.0370542049407959,0.07280802726745605,0.032627105712890625,"2013-01-14 10:44:58","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","dv .> 1.0 : DataVector with no NA's",10,2.0874040126800537,0.20874040126800536,0.22231101989746094,0.2008979320526123,"2013-01-14 10:45:00","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","dv[idxdv] : DataVector and DataVector 
indexing",10,1.3245141506195068,0.13245141506195068,0.13637208938598633,0.12865090370178223,"2013-01-14 10:45:02","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","mean(v): Vector with no NA's",10,0.010338783264160156,0.0010338783264160155,0.001631021499633789,0.0008609294891357422,"2013-01-14 10:45:02","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","dv + 1.0 : DataVector with no NA's",10,0.06729602813720703,0.006729602813720703,0.019953012466430664,0.0023021697998046875,"2013-01-14 10:45:02","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","v .> 1.0 : Vector",10,0.6516821384429932,0.06516821384429931,0.07804393768310547,0.059031009674072266,"2013-01-14 10:45:03","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","dv[idxv] : DataVector and Vector indexing",10,1.0018041133880615,0.10018041133880615,0.10875797271728516,0.09689211845397949,"2013-01-14 10:45:04","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","sum(removeNA(dv)): DataVector with no NA's",10,0.3324899673461914,0.03324899673461914,0.03690004348754883,0.029747962951660156,"2013-01-14 10:45:04","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","v[idxv] : Vector",10,0.21311020851135254,0.021311020851135253,0.03587007522583008,0.01586008071899414,"2013-01-14 10:45:05","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","dvna .> 1.0 : DataVector with NA's",10,2.057245969772339,0.2057245969772339,0.21527600288391113,0.19754600524902344,"2013-01-14 10:45:07","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","sum(dvna): DataVector with NA's",10,0.010241031646728516,0.0010241031646728515,0.0012030601501464844,0.0009589195251464844,"2013-01-14 10:45:07","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","mean(*removeNA(dv)): DataVector with no NA's",10,0.32787513732910156,0.032787513732910153,0.03671598434448242,0.03182506561279297,"2013-01-14 10:45:07","11f365ef08","319eab675d","Darwin",4 -"DataArray 
Operations","mean(*removeNA(dvna)): DataVector with NA's",10,0.3225698471069336,0.03225698471069336,0.03831791877746582,0.03119802474975586,"2013-01-14 10:45:08","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","dvna + 1.0 : DataVector with NA's",10,0.0656731128692627,0.006567311286926269,0.0189821720123291,0.002315044403076172,"2013-01-14 10:45:08","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","sum(*removeNA(dvna)): DataVector with NA's",10,0.33132195472717285,0.033132195472717285,0.04097390174865723,0.031419992446899414,"2013-01-14 10:45:08","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","sum(dv): DataVector with no NA's",10,0.009993314743041992,0.0009993314743041992,0.0010750293731689453,0.0009589195251464844,"2013-01-14 10:45:08","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","sum(removeNA(dvna)): DataVector with NA's",10,0.32796382904052734,0.032796382904052734,0.03624391555786133,0.030983924865722656,"2013-01-14 10:45:09","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","mean(removeNA(dv)): DataVector with no NA's",10,0.3377056121826172,0.03377056121826172,0.03966093063354492,0.03125596046447754,"2013-01-14 10:45:09","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","v + 1.0 : Vector",10,0.06936144828796387,0.0069361448287963865,0.030318021774291992,0.0016639232635498047,"2013-01-14 10:45:09","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","sum(*removeNA(dv)): DataVector with no NA's",10,0.325742244720459,0.0325742244720459,0.035830020904541016,0.031311988830566406,"2013-01-14 10:45:09","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","mean(dvna): DataVector with NA's",10,0.009889841079711914,0.0009889841079711915,0.0011150836944580078,0.0009551048278808594,"2013-01-14 10:45:09","11f365ef08","319eab675d","Darwin",4 "Linear Algebra","Matrix Multiplication w/ No 
NA's",1000,0.16434788703918457,0.00016434788703918458,0.03700113296508789,5.888938903808594e-5,"2013-01-14 10:45:10","11f365ef08","319eab675d","Darwin",4 "Linear Algebra","DataMatrix Multiplication w/ No NA's",1000,2.043034553527832,0.002043034553527832,0.02640819549560547,0.0011169910430908203,"2013-01-14 10:45:12","11f365ef08","319eab675d","Darwin",4 "Linear Algebra","DataMatrix Multiplication w/ NA's",1000,5.303671598434448,0.005303671598434448,0.04494500160217285,0.0030400753021240234,"2013-01-14 10:45:17","11f365ef08","319eab675d","Darwin",4 @@ -81,29 +35,6 @@ Category,Benchmark,Iterations,TotalWall,AverageWall,MaxWall,MinWall,Timestamp,Ju "DataFrame I/O","space_before_delimiter.csv",10,0.005663871765136719,0.0005663871765136719,0.0005869865417480469,0.0005459785461425781,"2013-01-14 10:46:44","11f365ef08","319eab675d","Darwin",4 "DataFrame I/O","types.csv",10,0.006924867630004883,0.0006924867630004883,0.0007150173187255859,0.0006740093231201172,"2013-01-14 10:46:44","11f365ef08","319eab675d","Darwin",4 "DataFrame I/O","utf8.csv",10,0.007310152053833008,0.0007310152053833008,0.0008001327514648438,0.0007109642028808594,"2013-01-14 10:46:44","11f365ef08","319eab675d","Darwin",4 -"DataArray Operations","sum(v): Vector with no NA's",10,0.008659124374389648,0.0008659124374389649,0.0009109973907470703,0.0008530616760253906,"2013-01-14 21:09:54","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","mean(dv): DataVector with no NA's",10,0.010535955429077148,0.001053595542907715,0.0016469955444335938,0.0009620189666748047,"2013-01-14 21:09:55","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","mean(removeNA(dvna)): DataVector with NA's",10,0.31494879722595215,0.031494879722595216,0.036512136459350586,0.0302579402923584,"2013-01-14 21:09:55","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","dv .> 1.0 : DataVector with no NA's",10,2.0206620693206787,0.20206620693206787,0.21630287170410156,0.19572210311889648,"2013-01-14 
21:09:57","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","dv[idxdv] : DataVector and DataVector indexing",10,1.2879621982574463,0.12879621982574463,0.13094496726989746,0.1274411678314209,"2013-01-14 21:09:59","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","mean(v): Vector with no NA's",10,0.010784149169921875,0.0010784149169921875,0.0013751983642578125,0.0008599758148193359,"2013-01-14 21:09:59","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","dv + 1.0 : DataVector with no NA's",10,0.06326985359191895,0.006326985359191894,0.018002986907958984,0.002331972122192383,"2013-01-14 21:09:59","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","v .> 1.0 : Vector",10,0.6304340362548828,0.06304340362548828,0.07402396202087402,0.058135032653808594,"2013-01-14 21:10:00","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","dv[idxv] : DataVector and Vector indexing",10,0.9911110401153564,0.09911110401153564,0.10813021659851074,0.09464001655578613,"2013-01-14 21:10:01","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","sum(removeNA(dv)): DataVector with no NA's",10,0.323793888092041,0.0323793888092041,0.03618288040161133,0.03157186508178711,"2013-01-14 21:10:01","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","v[idxv] : Vector",10,0.1790471076965332,0.01790471076965332,0.030202150344848633,0.013410091400146484,"2013-01-14 21:10:02","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","dvna .> 1.0 : DataVector with NA's",10,2.020591974258423,0.20205919742584227,0.21394085884094238,0.19475698471069336,"2013-01-14 21:10:04","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","sum(dvna): DataVector with NA's",10,0.009839057922363281,0.0009839057922363282,0.0010139942169189453,0.000965118408203125,"2013-01-14 21:10:04","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","mean(*removeNA(dv)): DataVector with no 
NA's",10,0.3176560401916504,0.03176560401916504,0.036063194274902344,0.02964496612548828,"2013-01-14 21:10:04","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","mean(*removeNA(dvna)): DataVector with NA's",10,0.3231468200683594,0.03231468200683594,0.03954195976257324,0.029547929763793945,"2013-01-14 21:10:05","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","dvna + 1.0 : DataVector with NA's",10,0.06484484672546387,0.006484484672546387,0.018603086471557617,0.002290964126586914,"2013-01-14 21:10:05","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","sum(*removeNA(dvna)): DataVector with NA's",10,0.32024693489074707,0.032024693489074704,0.036015987396240234,0.0309140682220459,"2013-01-14 21:10:05","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","sum(dv): DataVector with no NA's",10,0.009638071060180664,0.0009638071060180664,0.0009829998016357422,0.0009531974792480469,"2013-01-14 21:10:05","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","sum(removeNA(dvna)): DataVector with NA's",10,0.3199589252471924,0.03199589252471924,0.03625988960266113,0.030423879623413086,"2013-01-14 21:10:06","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","mean(removeNA(dv)): DataVector with no NA's",10,0.31856274604797363,0.03185627460479736,0.03632402420043945,0.031010150909423828,"2013-01-14 21:10:06","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","v + 1.0 : Vector",10,0.07034850120544434,0.007034850120544433,0.030943870544433594,0.0016410350799560547,"2013-01-14 21:10:06","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","sum(*removeNA(dv)): DataVector with no NA's",10,0.3294239044189453,0.03294239044189453,0.03928399085998535,0.030253887176513672,"2013-01-14 21:10:06","3549f803f9","3b67c77708","Darwin",4 -"DataArray Operations","mean(dvna): DataVector with NA's",10,0.010222911834716797,0.0010222911834716797,0.0012698173522949219,0.0009789466857910156,"2013-01-14 
21:10:06","3549f803f9","3b67c77708","Darwin",4 "Linear Algebra","Matrix Multiplication w/ No NA's",1000,0.1328754425048828,0.00013287544250488282,0.011707067489624023,5.888938903808594e-5,"2013-01-14 21:10:07","3549f803f9","3b67c77708","Darwin",4 "Linear Algebra","DataMatrix Multiplication w/ No NA's",1000,1.9026072025299072,0.0019026072025299073,0.026885986328125,0.0011301040649414062,"2013-01-14 21:10:09","3549f803f9","3b67c77708","Darwin",4 "Linear Algebra","DataMatrix Multiplication w/ NA's",1000,4.693065643310547,0.004693065643310547,0.034111976623535156,0.0031609535217285156,"2013-01-14 21:10:14","3549f803f9","3b67c77708","Darwin",4 diff --git a/benchmark/runbenchmarks.jl b/benchmark/runbenchmarks.jl index fa9859b406..adbcef595e 100644 --- a/benchmark/runbenchmarks.jl +++ b/benchmark/runbenchmarks.jl @@ -5,9 +5,7 @@ using DataFrames using Benchmark -benchmarks = ["datavector.jl", - "datamatrix.jl", - "io.jl"] +benchmarks = [ "io.jl"] # TODO: Print summary to stdout_stream, while printing results # to file with appends. diff --git a/docs/make.jl b/docs/make.jl index cfd5a2df8a..467ceb2f0f 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,4 +1,4 @@ -using Documenter, DataFrames, DataArrays +using Documenter, DataFrames # Build documentation. 
# ==================== @@ -6,18 +6,41 @@ using Documenter, DataFrames, DataArrays makedocs( # options modules = [DataFrames], - doctest = false, - clean = false + doctest = true, + clean = false, + sitename = "DataFrames.jl", + format = Documenter.Formats.HTML, + pages = Any[ + "Introduction" => "index.md", + "User Guide" => Any[ + "Getting Started" => "man/getting_started.md", + "IO" => "man/io.md", + "Joins" => "man/joins.md", + "Split-apply-combine" => "man/split_apply_combine.md", + "Reshaping" => "man/reshaping_and_pivoting.md", + "Sorting" => "man/sorting.md", + "Formulas" => "man/formulas.md", + "Pooling" => "man/pooling.md", + ], + "API" => Any[ + "Main types" => "lib/maintypes.md", + "Utilities" => "lib/utilities.md", + "Data manipulation" => "lib/manipulation.md", + ], + "About" => Any[ + "Release Notes" => "NEWS.md", + "License" => "LICENSE.md", + ] + ] ) # Deploy built documentation from Travis. # ======================================= -# Needs to install an additional dep, mkdocs-material, so provide a custom `deps`. 
-custom_deps() = run(`pip install --user pygments mkdocs mkdocs-material`) - deploydocs( # options - deps = custom_deps, - repo = "github.com/JuliaStats/DataFrames.jl.git" + repo = "github.com/JuliaStats/DataFrames.jl.git", + target = "build", + deps = nothing, + make = nothing, ) diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml deleted file mode 100644 index ea5c0f8695..0000000000 --- a/docs/mkdocs.yml +++ /dev/null @@ -1,43 +0,0 @@ - -site_name: DataFrames.jl -site_description: package for working with tabular data in Julia -repo_url: https://github.com/JuliaStats/DataFrames.jl - -theme: material - -extra: - palette: - primary: 'indigo' - accent: 'blue' - -extra_css: - - assets/Documenter.css - -markdown_extensions: - - codehilite - - extra - - tables - - fenced_code - -extra_javascript: - - https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML - - assets/mathjaxhelper.js - -docs_dir: 'build' - -pages: -- [index.md, Introduction] -- [man/getting_started.md, User guide, Getting Started] -- [man/io.md, User guide, IO] -- [man/joins.md, User guide, Joins] -- [man/split_apply_combine.md, User guide, Split-apply-combine] -- [man/reshaping_and_pivoting.md, User guide, Reshaping] -- [man/sorting.md, User guide, Sorting] -- [man/formulas.md, User guide, Formulas] -- [man/pooling.md, User guide, Pooling] -- [lib/maintypes.md, API, Main types] -- [lib/utilities.md, API, Utilities] -- [lib/manipulation.md, API, Data manipulation] -- [NEWS.md, About, Release notes] -- [LICENSE.md, About, License] - diff --git a/docs/src/index.md b/docs/src/index.md index 9a08a33548..7943a0597d 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -2,17 +2,20 @@ ## Package Manual - {contents} - Pages = ["man/getting_started.md", "man/io.md", "man/joins.md", "man/split_apply_combine.md", "man/reshaping_and_pivoting.md", "man/sorting.md", "man/formulas.md", "man/pooling.md"] - Depth = 2 +```@contents +Pages = ["man/getting_started.md", "man/io.md", "man/joins.md", 
"man/split_apply_combine.md", "man/reshaping_and_pivoting.md", "man/sorting.md", "man/formulas.md", "man/pooling.md"] +Depth = 2 +``` ## API - {contents} - Pages = ["lib/maintypes.md", "lib/manipulation.md", "lib/utilities.md"] - Depth = 2 +```@contents +Pages = ["lib/maintypes.md", "lib/manipulation.md", "lib/utilities.md"] +Depth = 2 +``` ## Documentation Index - {index} - Pages = ["lib/maintypes.md", "lib/manipulation.md", "lib/utilities.md", "man/io.md"] +```@index +Pages = ["lib/maintypes.md", "lib/manipulation.md", "lib/utilities.md", "man/io.md"] +``` diff --git a/docs/src/lib/maintypes.md b/docs/src/lib/maintypes.md index 8b8f300283..ccc62d530c 100644 --- a/docs/src/lib/maintypes.md +++ b/docs/src/lib/maintypes.md @@ -1,16 +1,16 @@ - {meta} - CurrentModule = DataFrames +```@meta +CurrentModule = DataFrames +``` # Main Types - {index} - Pages = ["maintypes.md"] - -... - - {docs} - AbstractDataFrame - DataFrame - SubDataFrame +```@index +Pages = ["maintypes.md"] +``` +```@docs +AbstractDataFrame +DataFrame +SubDataFrame +``` diff --git a/docs/src/lib/manipulation.md b/docs/src/lib/manipulation.md index dae992c48b..1f9f578d25 100644 --- a/docs/src/lib/manipulation.md +++ b/docs/src/lib/manipulation.md @@ -1,22 +1,25 @@ - - {meta} - CurrentModule = DataFrames +```@meta +CurrentModule = DataFrames +``` # Data Manipulation - - {index} - Pages = ["manipulation.md"] + +```@index +Pages = ["manipulation.md"] +``` ## Joins - {docs} - join - +```@docs +join +``` + ## Reshaping - {docs} - melt - stack - unstack - stackdf - meltdf +```@docs +melt +stack +unstack +stackdf +meltdf +``` diff --git a/docs/src/lib/utilities.md b/docs/src/lib/utilities.md index 6b2d8d2564..23c9c76d65 100644 --- a/docs/src/lib/utilities.md +++ b/docs/src/lib/utilities.md @@ -1,27 +1,25 @@ - - {meta} - CurrentModule = DataFrames +```@meta +CurrentModule = DataFrames +``` # Utilities - {index} - Pages = ["utilities.md"] +```@index +Pages = ["utilities.md"] +``` -... 
- - {docs} - eltypes - head - complete_cases - complete_cases! - describe - dump - names! - nonunique - rename - rename! - tail - unique - unique! - - +```@docs +eltypes +head +complete_cases +complete_cases! +describe +dump +names! +nonunique +rename +rename! +tail +unique +unique! +``` diff --git a/docs/src/man/formulas.md b/docs/src/man/formulas.md index e9203439e8..c283a696a8 100644 --- a/docs/src/man/formulas.md +++ b/docs/src/man/formulas.md @@ -33,7 +33,7 @@ If you would like to specify both main effects and an interaction term at once, mm = ModelMatrix(ModelFrame(Z ~ X*Y, df)) ``` -You can control how categorical variables (e.g., `PooledDataArray` columns) are converted to `ModelMatrix` columns by specifying _contrasts_ when you construct a `ModelFrame`: +You can control how categorical variables (e.g., `CategoricalArray` columns) are converted to `ModelMatrix` columns by specifying _contrasts_ when you construct a `ModelFrame`: ```julia mm = ModelMatrix(ModelFrame(Z ~ X*Y, df, contrasts = Dict(:X => HelmertCoding()))) @@ -47,4 +47,3 @@ contrasts!(mf, X = HelmertCoding()) ``` The construction of model matrices makes it easy to formulate complex statistical models. These are used to good effect by the [GLM Package.](https://github.com/JuliaStats/GLM.jl) - diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md index 89a2f7e101..b4fc4e1a17 100644 --- a/docs/src/man/getting_started.md +++ b/docs/src/man/getting_started.md @@ -2,75 +2,75 @@ ## Installation -The DataFrames package is available through the Julia package system. Throughout the rest of this tutorial, we will assume that you have installed the DataFrames package and have already typed `using DataArrays, DataFrames` to bring all of the relevant variables into your current namespace. In addition, we will make use of the `RDatasets` package, which provides access to hundreds of classical data sets. +The DataFrames package is available through the Julia package system. 
Throughout the rest of this tutorial, we will assume that you have installed the DataFrames package and have already typed `using NullableArrays, DataFrames` to bring all of the relevant variables into your current namespace. In addition, we will make use of the `RDatasets` package, which provides access to hundreds of classical data sets. -## The `NA` Value +## The `Nullable` Type -To get started, let's examine the `NA` value. Type the following into the REPL: +To get started, let's examine the `Nullable` type. Objects of this type can either hold a value, or represent a missing value (`null`). For example, this is a `Nullable` holding the integer `1`: ```julia -NA +Nullable(1) ``` -One of the essential properties of `NA` is that it poisons other items. To see this, try to add something like `1` to `NA`: - +And this represents a missing value: ```julia -1 + NA +Nullable() ``` -## The `DataArray` Type - -Now that we see that `NA` is working, let's insert one into a `DataArray`. We'll create one now using the `@data` macro: +`Nullable` objects support all standard operators, which return another `Nullable`. One of the essential properties of `null` values is that they poison other items. To see this, try to add something like `Nullable(1)` to `Nullable()`: ```julia -dv = @data([NA, 3, 2, 5, 4]) +Nullable(1) + Nullable() ``` -To see how `NA` poisons even complex calculations, let's try to take the mean of the five numbers stored in `dv`: +Note that operations mixing `Nullable` and scalars (e.g. `1 + Nullable()`) are not supported. + +## The `NullableArray` Type + +`Nullable` objects can be stored in a standard `Array` just like any value: ```julia -mean(dv) +v = Nullable{Int}[1, 3, 4, 5, 4] ``` -In many cases we're willing to just ignore `NA` values and remove them from our vector. We can do that using the `dropna` function: +But arrays of `Nullable` are inefficient, both in terms of computation costs and of memory use. 
`NullableArrays` provide a more efficient storage, and behave like `Array{Nullable}` objects. ```julia -dropna(dv) -mean(dropna(dv)) +nv = NullableArray(Nullable{Int}[Nullable(), 3, 2, 5, 4]) ``` -Instead of removing `NA` values, you can try to conver the `DataArray` into a normal Julia `Array` using `convert`: +In many cases we're willing to just ignore missing values and remove them from our vector. We can do that using the `dropnull` function: ```julia -convert(Array, dv) +dropnull(nv) +mean(dropnull(nv)) ``` -This fails in the presence of `NA` values, but will succeed if there are no `NA` values: +Instead of removing `null` values, you can try to convert the `NullableArray` into a normal Julia `Array` using `convert`: ```julia -dv[1] = 3 -convert(Array, dv) +convert(Array, nv) ``` -In addition to removing `NA` values and hoping they won't occur, you can also replace any `NA` values using the `convert` function, which takes a replacement value as an argument: +This fails in the presence of `null` values, but will succeed if there are no `null` values: ```julia -dv = @data([NA, 3, 2, 5, 4]) -mean(convert(Array, dv, 11)) +nv[1] = 3 +convert(Array, nv) ``` -Which strategy for dealing with `NA` values is most appropriate will typically depend on the specific details of your data analysis pathway. - -Although the examples above employed only 1D `DataArray` objects, the `DataArray` type defines a completely generic N-dimensional array type. 
Operations on generic `DataArray` objects work in higher dimensions in the same way that they work on Julia's Base `Array` type: +In addition to removing `null` values and hoping they won't occur, you can also replace any `null` values using the `convert` function, which takes a replacement value as an argument: ```julia -dm = @data([NA 0.0; 0.0 1.0]) -dm * dm +nv = NullableArray(Nullable{Int}[Nullable(), 3, 2, 5, 4]) +mean(convert(Array, nv, 0)) ``` +Which strategy for dealing with `null` values is most appropriate will typically depend on the specific details of your data analysis pathway. + ## The `DataFrame` Type -The `DataFrame` type can be used to represent data tables, each column of which is a `DataArray`. You can specify the columns using keyword arguments: +The `DataFrame` type can be used to represent data tables, each column of which is an array (by default, a `NullableArray`). You can specify the columns using keyword arguments: ```julia df = DataFrame(A = 1:4, B = ["M", "F", "F", "M"]) @@ -110,22 +110,22 @@ describe(df) To focus our search, we start looking at just the means and medians of specific columns. In the example below, we use numeric indexing to access the columns of the `DataFrame`: ```julia -mean(df[1]) -median(df[1]) +mean(dropnull(df[1])) +median(dropnull(df[1])) ``` We could also have used column names to access individual columns: ```julia -mean(df[:A]) -median(df[:A]) +mean(dropnull(df[:A])) +median(dropnull(df[:A])) ``` We can also apply a function to each column of a `DataFrame` with the `colwise` function. 
For example: ```julia df = DataFrame(A = 1:4, B = randn(4)) -colwise(cumsum, df) +colwise(c->cumsum(dropnull(c)), df) ``` ## Accessing Classic Data Sets @@ -135,10 +135,15 @@ To see more of the functionality for working with `DataFrame` objects, we need a For example, we can access Fisher's iris data set using the following functions: ```julia -using RDatasets -iris = dataset("datasets", "iris") +iris = readtable(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv")) head(iris) ``` In the next section, we'll discuss generic I/O strategy for reading and writing `DataFrame` objects that you can use to import and export your own data files. +## Querying DataFrames + +While the `DataFrames` package provides basic data manipulation capabilities, users are encouraged to use the following packages for more powerful and complete data querying functionality in the spirit of [dplyr](https://github.com/hadley/dplyr) and [LINQ](https://msdn.microsoft.com/en-us/library/bb397926.aspx): + +- [DataFramesMeta.jl](https://github.com/JuliaStats/DataFramesMeta.jl) provides metaprogramming tools for `DataFrames` and associative objects. These macros improve performance and provide more convenient syntax. +- [Query.jl](https://github.com/davidanthoff/Query.jl) provides a LINQ like interface to a large number of data sources, including `DataFrame` instances. diff --git a/docs/src/man/io.md b/docs/src/man/io.md index fdb0869355..0ebcc94d60 100644 --- a/docs/src/man/io.md +++ b/docs/src/man/io.md @@ -4,8 +4,9 @@ To read data from a CSV-like file, use the `readtable` function: - {docs} - readtable +```@docs +readtable +``` `readtable` requires that you specify the path of the file that you would like to read as a `String`. To read data from a non-file source, you may also supply an `IO` object. It supports many additional keyword arguments: these are documented in the section on advanced I/O operations. 
@@ -13,8 +14,9 @@ To read data from a CSV-like file, use the `readtable` function: To write data to a CSV file, use the `writetable` function: - {docs} - writetable +```@docs +writetable +``` ## Supplying `DataFrame`s inline with non-standard string literals diff --git a/docs/src/man/joins.md b/docs/src/man/joins.md index 558bf317e3..c152ee9fa3 100644 --- a/docs/src/man/joins.md +++ b/docs/src/man/joins.md @@ -15,7 +15,7 @@ full = join(names, jobs, on = :ID) Output: -| Row | ID | Name | Job | +| Row | ID | Name | Job | |-----|----|------------|----------| | 1 | 1 | "John Doe" | "Lawyer" | | 2 | 1 | "Jane Doe" | "Doctor" | diff --git a/docs/src/man/pooling.md b/docs/src/man/pooling.md index 17757d8526..fcffaaba29 100644 --- a/docs/src/man/pooling.md +++ b/docs/src/man/pooling.md @@ -1,44 +1,49 @@ -# Pooling Data (Representing Factors) +# Categorical Data Often, we have to deal with factors that take on a small number of levels: ```julia -dv = @data(["Group A", "Group A", "Group A", - "Group B", "Group B", "Group B"]) +v = ["Group A", "Group A", "Group A", + "Group B", "Group B", "Group B"] ``` -The naive encoding used in a `DataArray` represents every entry of this vector as a full string. In contrast, we can represent the data more efficiently by replacing the strings with indices into a small pool of levels. This is what the `PooledDataArray` does: +The naive encoding used in an `Array` or in a `NullableArray` represents every entry of this vector as a full string. In contrast, we can represent the data more efficiently by replacing the strings with indices into a small pool of levels. 
This is what the `CategoricalArray` type does: ```julia -pdv = @pdata(["Group A", "Group A", "Group A", - "Group B", "Group B", "Group B"]) +cv = CategoricalArray(["Group A", "Group A", "Group A", + "Group B", "Group B", "Group B"]) ``` -In addition to representing repeated data efficiently, the `PooledDataArray` allows us to determine the levels of the factor at any time using the `levels` function: +A companion type, `NullableCategoricalArray`, allows storing missing values in the array: is to `CategoricalArray` what `NullableArray` is to the standard `Array` type. + +In addition to representing repeated data efficiently, the `CategoricalArray` type allows us to determine efficiently the allowed levels of the variable at any time using the `levels` function (note that levels may or may not be actually used in the data): ```julia -levels(pdv) +levels(cv) ``` -By default, a `PooledDataArray` is able to represent 232differents levels. You can use less memory by calling the `compact` function: +The `levels!` function also allows changing the order of appearance of the levels, which can be useful for display purposes or when working with ordered variables. + +By default, a `CategoricalArray` is able to represent 232differents levels. You can use less memory by calling the `compact` function: ```julia -pdv = compact(pdv) +cv = compact(cv) ``` -Often, you will have factors encoded inside a DataFrame with `DataArray` columns instead of `PooledDataArray` columns. You can do conversion of a single column using the `pool` function: +Often, you will have factors encoded inside a DataFrame with `Array` or `NullableArray` columns instead of `CategoricalArray` or `NullableCategoricalArray` columns. 
You can do conversion of a single column using the `categorical` function: ```julia -pdv = pool(dv) +cv = categorical(v) ``` -Or you can edit the columns of a `DataFrame` in-place using the `pool!` function: +Or you can edit the columns of a `DataFrame` in-place using the `categorical!` function: ```julia df = DataFrame(A = [1, 1, 1, 2, 2, 2], B = ["X", "X", "X", "Y", "Y", "Y"]) -pool!(df, [:A, :B]) +categorical!(df, [:A, :B]) ``` -Pooling columns is important for working with the [GLM package](https://github.com/JuliaStats/GLM.jl) When fitting regression models, `PooledDataArray` columns in the input are translated into 0/1 indicator columns in the `ModelMatrix` with one column for each of the levels of the `PooledDataArray`. This allows one to analyze categorical data efficiently. +Using categorical arrays is important for working with the [GLM package](https://github.com/JuliaStats/GLM.jl). When fitting regression models, `CategoricalArray` and `NullableCategoricalArray` columns in the input are translated into 0/1 indicator columns in the `ModelMatrix` with one column for each of the levels of the `CategoricalArray`/`NullableCategoricalArray`. This allows one to analyze categorical data efficiently. +See the [CategoricalArrays package](https://github.com/nalimilan/CategoricalArrays.jl) for more information regarding categorical arrays. 
diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md index 959dd00bfb..dcd02c70d9 100644 --- a/docs/src/man/reshaping_and_pivoting.md +++ b/docs/src/man/reshaping_and_pivoting.md @@ -3,10 +3,10 @@ Reshape data from wide to long format using the `stack` function: ```julia -using DataFrames, RDatasets -iris = dataset("datasets", "iris") +using DataFrames +iris = readtable(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv")) iris[:id] = 1:size(iris, 1) # this makes it easier to unstack -d = stack(iris, [1:4]) +d = stack(iris, 1:4) ``` The second optional argument to `stack` indicates the columns to be stacked. These are normally referred to as the measured variables. Column names can also be given: @@ -79,6 +79,6 @@ None of these reshaping functions perform any aggregation. To do aggregation, us ```julia d = stack(iris) -x = by(d, [:variable, :Species], df -> DataFrame(vsum = mean(df[:value]))) +x = by(d, [:variable, :Species], df -> DataFrame(vsum = mean(dropnull(df[:value])))) unstack(x, :Species, :vsum) ``` diff --git a/docs/src/man/sorting.md b/docs/src/man/sorting.md index 4d2140a258..14224a8b55 100644 --- a/docs/src/man/sorting.md +++ b/docs/src/man/sorting.md @@ -3,9 +3,8 @@ Sorting is a fundamental component of data analysis. 
Basic sorting is trivial: just calling `sort!` will sort all columns, in place: ```julia -using DataFrames, RDatasets - -iris = dataset("datasets", "iris") +using DataFrames +iris = readtable(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv")) sort!(iris) ``` diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md index 912279f3ac..8caa9b1b1e 100644 --- a/docs/src/man/split_apply_combine.md +++ b/docs/src/man/split_apply_combine.md @@ -7,12 +7,11 @@ The DataFrames package supports the Split-Apply-Combine strategy through the `by We show several examples of the `by` function applied to the `iris` dataset below: ```julia -using DataFrames, RDatasets - -iris = dataset("datasets", "iris") +using DataFrames +iris = readtable(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv")) by(iris, :Species, size) -by(iris, :Species, df -> mean(df[:PetalLength])) +by(iris, :Species, df -> mean(dropnull(df[:PetalLength]))) by(iris, :Species, df -> DataFrame(N = size(df, 1))) ``` @@ -20,7 +19,7 @@ The `by` function also support the `do` block form: ```julia by(iris, :Species) do df - DataFrame(m = mean(df[:PetalLength]), s² = var(df[:PetalLength])) + DataFrame(m = mean(dropnull(df[:PetalLength])), s² = var(dropnull(df[:PetalLength]))) end ``` @@ -30,7 +29,7 @@ We show several examples of the `aggregate` function applied to the `iris` datas ```julia aggregate(iris, :Species, sum) -aggregate(iris, :Species, [sum, mean]) +aggregate(iris, :Species, [sum, x->mean(dropnull(x))]) ``` If you only want to split the data set into subsets, use the `groupby` function: diff --git a/docs/src/man/subsets.md b/docs/src/man/subsets.md index 2049d5cd1a..a1a899a848 100644 --- a/docs/src/man/subsets.md +++ b/docs/src/man/subsets.md @@ -1,59 +1,12 @@ # Subsets -## DataArrays - -The `DataArray` type is meant to behave like a standard Julia `Array` and tries to implement identical indexing rules: - -One dimensional `DataArray`: - -```julia -julia> using DataArrays - 
-julia> dv = data([1, 2, 3]) -3-element DataArray{Int64,1}: - 1 - 2 - 3 - -julia> dv[1] -1 - -julia> dv[2] = NA -NA - -julia> dv[2] -NA -``` - -Two dimensional `DataArray`: - -```julia -julia> using DataArrays - -julia> dm = data([1 2; 3 4]) -2×2 DataArray{Int64,2}: - 1 2 - 3 4 - -julia> dm[1, 1] -1 - -julia> dm[2, 1] = NA -NA - -julia> dm[2, 1] -NA -``` - -DataFrames - -In contrast, a `DataFrame` offers substantially more forms of indexing because columns can be referred to by name: +A `DataFrame` supports many forms of indexing. ```julia julia> using DataFrames julia> df = DataFrame(A = 1:10, B = 2:2:20) -10×2 DataFrame +10×2 DataFrames.DataFrame │ Row │ A │ B │ ├─────┼────┼────┤ │ 1 │ 1 │ 2 │ @@ -68,11 +21,11 @@ julia> df = DataFrame(A = 1:10, B = 2:2:20) │ 10 │ 10 │ 20 │ ``` -Refering to the first column by index or name: +Referring to the first column by index or name: ```julia julia> df[1] -10-element DataArray{Int64,1}: +10-element NullableArrays.NullableArray{Int64,1}: 1 2 3 @@ -85,7 +38,7 @@ julia> df[1] 10 julia> df[:A] -10-element DataArray{Int64,1}: +10-element NullableArrays.NullableArray{Int64,1}: 1 2 3 @@ -102,17 +55,17 @@ Refering to the first element of the first column: ```julia julia> df[1, 1] -1 +Nullable{Int64}(1) julia> df[1, :A] -1 +Nullable{Int64}(1) ``` Selecting a subset of rows by index and an (ordered) subset of columns by name: ```julia julia> df[1:3, [:A, :B]] -3×2 DataFrame +3×2 DataFrames.DataFrame │ Row │ A │ B │ ├─────┼───┼───┤ │ 1 │ 1 │ 2 │ @@ -120,39 +73,10 @@ julia> df[1:3, [:A, :B]] │ 3 │ 3 │ 6 │ julia> df[1:3, [:B, :A]] -3×2 DataFrame +3×2 DataFrames.DataFrame │ Row │ B │ A │ ├─────┼───┼───┤ │ 1 │ 2 │ 1 │ │ 2 │ 4 │ 2 │ │ 3 │ 6 │ 3 │ ``` - -Selecting a subset of rows by using a condition: - -```julia -julia> df[df[:A] % 2 .== 0, :] -5×2 DataFrame -│ Row │ A │ B │ -├─────┼────┼────┤ -│ 1 │ 2 │ 4 │ -│ 2 │ 4 │ 8 │ -│ 3 │ 6 │ 12 │ -│ 4 │ 8 │ 16 │ -│ 5 │ 10 │ 20 │ - -julia> df[df[:B] % 2 .== 0, :] -10×2 DataFrame -│ Row │ A │ B │ 
-├─────┼────┼────┤ -│ 1 │ 1 │ 2 │ -│ 2 │ 2 │ 4 │ -│ 3 │ 3 │ 6 │ -│ 4 │ 4 │ 8 │ -│ 5 │ 5 │ 10 │ -│ 6 │ 6 │ 12 │ -│ 7 │ 7 │ 14 │ -│ 8 │ 8 │ 16 │ -│ 9 │ 9 │ 18 │ -│ 10 │ 10 │ 20 │ -``` diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 3c2a53003f..e19750286c 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -12,7 +12,8 @@ using Compat import Compat.String using Reexport @reexport using StatsBase -@reexport using DataArrays +@reexport using NullableArrays +@reexport using CategoricalArrays using GZip using SortingAlgorithms @@ -50,6 +51,7 @@ export @~, aggregate, by, + categorical!, coefnames, colwise, combine, @@ -70,8 +72,6 @@ export @~, nrow, nullable!, order, - pool, - pool!, printtable, readtable, rename!, @@ -82,9 +82,14 @@ export @~, unique!, unstack, writetable, + head, + tail, # Remove after deprecation period - read_rda + read_rda, + pool, + pool! + ############################################################################## ## diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 3df74edcf7..6e7f9adedf 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -11,24 +11,24 @@ type in that it allows indexing by a key (the columns). 
The following are normally implemented for AbstractDataFrames: -* [`describe`]({ref}) : summarize columns -* [`dump`]({ref}) : show structure +* [`describe`](@ref) : summarize columns +* [`dump`](@ref) : show structure * `hcat` : horizontal concatenation * `vcat` : vertical concatenation * `names` : columns names -* [`names!`]({ref}) : set columns names -* [`rename!`]({ref}) : rename columns names based on keyword arguments -* [`eltypes`]({ref}) : `eltype` of each column +* [`names!`](@ref) : set columns names +* [`rename!`](@ref) : rename columns names based on keyword arguments +* [`eltypes`](@ref) : `eltype` of each column * `length` : number of columns * `size` : (nrows, ncols) -* [`head`]({ref}) : first `n` rows -* [`tail`]({ref}) : last `n` rows +* [`head`](@ref) : first `n` rows +* [`tail`](@ref) : last `n` rows * `convert` : convert to an array -* `DataArray` : convert to a DataArray -* [`complete_cases`]({ref}) : indexes of complete cases (rows with no NA's) -* [`complete_cases!`]({ref}) : remove rows with NA's -* [`nonunique`]({ref}) : indexes of duplicate rows -* [`unique!`]({ref}) : remove duplicate rows +* `NullableArray` : convert to a NullableArray +* [`complete_cases`](@ref) : indexes of complete cases (rows with no NA's) +* [`complete_cases!`](@ref) : remove rows with NA's +* [`nonunique`](@ref) : indexes of duplicate rows +* [`unique!`](@ref) : remove duplicate rows * `similar` : a DataFrame with similar columns as `d` **Indexing** @@ -79,13 +79,17 @@ abstract AbstractDataFrame ## ############################################################################## -immutable Cols{T <: AbstractDataFrame} +immutable Cols{T <: AbstractDataFrame} <: AbstractVector{Any} df::T end Base.start(::Cols) = 1 Base.done(itr::Cols, st) = st > length(itr.df) Base.next(itr::Cols, st) = (itr.df[st], st + 1) Base.length(itr::Cols) = length(itr.df) +Base.size(itr::Cols, ix) = ix==1 ? 
length(itr) : throw(ArgumentError("Incorrect dimension")) +Base.size(itr::Cols) = (length(itr.df),) +Base.linearindexing{T}(::Type{Cols{T}}) = Base.LinearFast() +Base.getindex(itr::Cols, inds...) = getindex(itr.df, inds...) # N.B. where stored as a vector, 'columns(x) = x.vector' is a bit cheaper columns{T <: AbstractDataFrame}(df::T) = Cols{T}(df) @@ -175,7 +179,7 @@ rename!(df, @compat(Dict(:i=>:A, :x=>:X))) (rename!, rename) """ -Column elemental types +Return element types of columns ```julia eltypes(df::AbstractDataFrame) @@ -187,7 +191,7 @@ eltypes(df::AbstractDataFrame) **Result** -* `::Vector{Type}` : the elemental type of each column +* `::Vector{Type}` : the element type of each column **Examples** @@ -197,14 +201,7 @@ eltypes(df) ``` """ -function eltypes(df::AbstractDataFrame) - ncols = size(df, 2) - res = Array(Type, ncols) - for j in 1:ncols - res[j] = eltype(df[j]) - end - return res -end +eltypes(df::AbstractDataFrame) = map!(eltype, Vector{Type}(size(df,2)), columns(df)) Base.size(df::AbstractDataFrame) = (nrow(df), ncol(df)) function Base.size(df::AbstractDataFrame, i::Integer) @@ -213,7 +210,7 @@ function Base.size(df::AbstractDataFrame, i::Integer) elseif i == 2 ncol(df) else - throw(ArgumentError("DataFrames have only two dimensions")) + throw(ArgumentError("DataFrames only have two dimensions")) end end @@ -231,21 +228,15 @@ Base.ndims(::AbstractDataFrame) = 2 Base.similar(df::AbstractDataFrame, dims::Int) = DataFrame(Any[similar(x, dims) for x in columns(df)], copy(index(df))) -nas{T}(dv::AbstractArray{T}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) = # TODO move to datavector.jl? 
- DataArray(Array(T, dims), trues(dims)) - -nas{T,R}(dv::PooledDataArray{T,R}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) = - PooledDataArray(DataArrays.RefArray(zeros(R, dims)), dv.pool) - -nas(df::AbstractDataFrame, dims::Int) = - DataFrame(Any[nas(x, dims) for x in columns(df)], copy(index(df))) - ############################################################################## ## ## Equality ## ############################################################################## +# Imported in DataFrames.jl for compatibility across Julia 0.4 and 0.5 +@compat(Base.:(==))(df1::AbstractDataFrame, df2::AbstractDataFrame) = isequal(df1, df2) + function Base.isequal(df1::AbstractDataFrame, df2::AbstractDataFrame) size(df1, 2) == size(df2, 2) || return false isequal(index(df1), index(df2)) || return false @@ -255,20 +246,6 @@ function Base.isequal(df1::AbstractDataFrame, df2::AbstractDataFrame) return true end -# Imported in DataFrames.jl for compatibility across Julia 0.4 and 0.5 -function (==)(df1::AbstractDataFrame, df2::AbstractDataFrame) - size(df1, 2) == size(df2, 2) || return false - isequal(index(df1), index(df2)) || return false - eq = true - for idx in 1:size(df1, 2) - coleq = df1[idx] == df2[idx] - # coleq could be NA - !isequal(coleq, false) || return false - eq &= coleq - end - return eq -end - ############################################################################## ## ## Associative methods @@ -285,10 +262,10 @@ Base.isempty(df::AbstractDataFrame) = ncol(df) == 0 ## ############################################################################## -DataArrays.head(df::AbstractDataFrame, r::Int) = df[1:min(r,nrow(df)), :] -DataArrays.head(df::AbstractDataFrame) = head(df, 6) -DataArrays.tail(df::AbstractDataFrame, r::Int) = df[max(1,nrow(df)-r+1):nrow(df), :] -DataArrays.tail(df::AbstractDataFrame) = tail(df, 6) +head(df::AbstractDataFrame, r::Int) = df[1:min(r,nrow(df)), :] +head(df::AbstractDataFrame) = head(df, 6) +tail(df::AbstractDataFrame, r::Int) = 
df[max(1,nrow(df)-r+1):nrow(df), :] +tail(df::AbstractDataFrame) = tail(df, 6) """ Show the first or last part of an AbstractDataFrame @@ -341,7 +318,7 @@ dump(io::IO, df::AbstractDataFrame, n::Int = 5) ```julia df = DataFrame(i = 1:10, x = rand(10), y = rand(["a", "b", "c"], 10)) -str(df) +dump(df) ``` """ @@ -355,10 +332,6 @@ function Base.dump(io::IO, df::AbstractDataFrame, n::Int, indent) end end -function Base.dump(io::IO, dv::AbstractDataVector, n::Int, indent) - println(io, typeof(dv), "(", length(dv), ") ", dv[1:min(4, end)]) -end - # summarize the columns of a DF # TODO: clever layout in rows """ @@ -404,32 +377,33 @@ function StatsBase.describe(io, df::AbstractDataFrame) println(io, ) end end -StatsBase.describe(dv::AbstractArray) = describe(STDOUT, dv) -function StatsBase.describe{T<:Number}(io, dv::AbstractArray{T}) - if all(isna(dv)) +StatsBase.describe(nv::AbstractArray) = describe(STDOUT, nv) +function StatsBase.describe{T<:Number}(io, nv::AbstractArray{T}) + if all(_isnull, nv) println(io, " * All NA * ") return end - filtered = float(dropna(dv)) + filtered = float(dropnull(nv)) qs = quantile(filtered, [0, .25, .5, .75, 1]) statNames = ["Min", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max"] statVals = [qs[1:3]; mean(filtered); qs[4:5]] for i = 1:6 - println(io, string(rpad(statNames[i], 8, " "), " ", string(statVals[i]))) + println(io, string(rpad(statNames[i], 10, " "), " ", string(statVals[i]))) end - nas = sum(isna(dv)) - println(io, "NAs $nas") - println(io, "NA% $(round(nas*100/length(dv), 2))%") + nulls = countnull(nv) + println(io, "NULLs $(nulls)") + println(io, "NULL % $(round(nulls*100/length(nv), 2))%") return end -function StatsBase.describe{T}(io, dv::AbstractArray{T}) - ispooled = isa(dv, PooledDataVector) ? "Pooled " : "" +function StatsBase.describe{T}(io, nv::AbstractArray{T}) + ispooled = isa(nv, CategoricalVector) ? 
"Pooled " : "" + nulls = countnull(nv) # if nothing else, just give the length and element type and NA count - println(io, "Length $(length(dv))") - println(io, "Type $(ispooled)$(string(eltype(dv)))") - println(io, "NAs $(sum(isna(dv)))") - println(io, "NA% $(round(sum(isna(dv))*100/length(dv), 2))%") - println(io, "Unique $(length(unique(dv)))") + println(io, "Length $(length(nv))") + println(io, "Type $(ispooled)$(string(eltype(nv)))") + println(io, "NULLs $(nulls)") + println(io, "NULL % $(round(nulls*100/length(nv), 2))%") + println(io, "Unique $(length(unique(nv)))") return end @@ -439,8 +413,27 @@ end ## ############################################################################## +function _nonnull!(res, col) + for (i, el) in enumerate(col) + res[i] &= !_isnull(el) + end +end + +function _nonnull!(res, col::NullableArray) + for (i, el) in enumerate(col.isnull) + res[i] &= !el + end +end + +function _nonnull!(res, col::NullableCategoricalArray) + for (i, el) in enumerate(col.refs) + res[i] &= el > 0 + end +end + + """ -Indexes of complete cases (rows without NA's) +Indexes of complete cases (rows without null values) ```julia complete_cases(df::AbstractDataFrame) @@ -454,29 +447,28 @@ complete_cases(df::AbstractDataFrame) * `::Vector{Bool}` : indexes of complete cases -See also [`complete_cases!`]({ref}). +See also [`complete_cases!`](@ref). **Examples** ```julia df = DataFrame(i = 1:10, x = rand(10), y = rand(["a", "b", "c"], 10)) -df[[1,4,5], :x] = NA -df[[9,10], :y] = NA +df[[1,4,5], :x] = Nullable() +df[[9,10], :y] = Nullable() complete_cases(df) ``` """ function complete_cases(df::AbstractDataFrame) - ## Returns a Vector{Bool} of indexes of complete cases (rows with no NA's). - res = !isna(df[1]) - for i in 2:ncol(df) - res &= !isna(df[i]) + res = fill(true, size(df, 1)) + for i in 1:size(df, 2) + _nonnull!(res, df[i]) end res end """ -Delete rows with NA's. +Delete rows with null values. 
```julia complete_cases!(df::AbstractDataFrame) @@ -490,14 +482,14 @@ complete_cases!(df::AbstractDataFrame) * `::AbstractDataFrame` : the updated version -See also [`complete_cases`]({ref}). +See also [`complete_cases`](@ref). **Examples** ```julia df = DataFrame(i = 1:10, x = rand(10), y = rand(["a", "b", "c"], 10)) -df[[1,4,5], :x] = NA -df[[9,10], :y] = NA +df[[1,4,5], :x] = Nullable() +df[[9,10], :y] = Nullable() complete_cases!(df) ``` @@ -508,7 +500,8 @@ function Base.convert(::Type{Array}, df::AbstractDataFrame) convert(Matrix, df) end function Base.convert(::Type{Matrix}, df::AbstractDataFrame) - T = reduce(typejoin, eltypes(df)) + T = reduce(promote_type, eltypes(df)) + T <: Nullable && (T = eltype(T)) convert(Matrix{T}, df) end function Base.convert{T}(::Type{Array{T}}, df::AbstractDataFrame) @@ -518,27 +511,28 @@ function Base.convert{T}(::Type{Matrix{T}}, df::AbstractDataFrame) n, p = size(df) res = Array(T, n, p) idx = 1 - for col in columns(df) - anyna(col) && error("DataFrame contains NAs") - copy!(res, idx, data(col)) + for (name, col) in zip(names(df), columns(df)) + anynull(col) && error("cannot convert a DataFrame containing null values to array (found for column $name)") + copy!(res, idx, convert(Vector{T}, col)) idx += n end return res end -function Base.convert(::Type{DataArray}, df::AbstractDataFrame) - convert(DataMatrix, df) +function Base.convert(::Type{NullableArray}, df::AbstractDataFrame) + convert(NullableMatrix, df) end -function Base.convert(::Type{DataMatrix}, df::AbstractDataFrame) - T = reduce(typejoin, eltypes(df)) - convert(DataMatrix{T}, df) +function Base.convert(::Type{NullableMatrix}, df::AbstractDataFrame) + T = reduce(promote_type, eltypes(df)) + T <: Nullable && (T = eltype(T)) + convert(NullableMatrix{T}, df) end -function Base.convert{T}(::Type{DataArray{T}}, df::AbstractDataFrame) - convert(DataMatrix{T}, df) +function Base.convert{T}(::Type{NullableArray{T}}, df::AbstractDataFrame) + convert(NullableMatrix{T}, df) 
end -function Base.convert{T}(::Type{DataMatrix{T}}, df::AbstractDataFrame) +function Base.convert{T}(::Type{NullableMatrix{T}}, df::AbstractDataFrame) n, p = size(df) - res = DataArray(T, n, p) + res = NullableArray(T, n, p) idx = 1 for col in columns(df) copy!(res, idx, col) @@ -548,7 +542,7 @@ function Base.convert{T}(::Type{DataMatrix{T}}, df::AbstractDataFrame) end """ -Indexes of complete cases (rows without NA's) +Indexes of duplicate rows (a row that is a duplicate of a prior row) ```julia nonunique(df::AbstractDataFrame) @@ -565,7 +559,7 @@ nonunique(df::AbstractDataFrame, cols) * `::Vector{Bool}` : indicates whether the row is a duplicate of some prior row -See also [`unique`]({ref}) and [`unique!`]({ref}). +See also [`unique`](@ref) and [`unique!`](@ref). **Examples** @@ -623,7 +617,7 @@ specifying the column(s) to compare. When `cols` is specified, the return DataFrame contains complete rows, retaining in each case the first instance for which `df[cols]` is unique. -See also [`nonunique`]({ref}). +See also [`nonunique`](@ref). **Examples** @@ -641,7 +635,7 @@ unique!(df) # modifies df function nonuniquekey(df::AbstractDataFrame) # Here's another (probably a lot faster) way to do `nonunique` # by grouping on all columns. It will fail if columns cannot be - # made into PooledDataVector's. + # made into CategoricalVector's. gd = groupby(df, _names(df)) idx = [1:length(gd.idx)][gd.idx][gd.starts] res = fill(true, nrow(df)) @@ -654,7 +648,7 @@ function colmissing(df::AbstractDataFrame) # -> Vector{Int} nrows, ncols = size(df) missing = zeros(Int, ncols) for j in 1:ncols - missing[j] = countna(df[j]) + missing[j] = countnull(df[j]) end return missing end @@ -673,7 +667,7 @@ without(df::AbstractDataFrame, c::Any) = without(df, index(df)[c]) ############################################################################## # hcat's first argument must be an AbstractDataFrame -# Trailing arguments (currently) may also be DataVectors, Vectors, or scalars. 
+# Trailing arguments (currently) may also be NullableVectors, Vectors, or scalars. # hcat! is defined in dataframes/dataframes.jl # Its first argument (currently) must be a DataFrame. @@ -684,84 +678,63 @@ Base.hcat(df::AbstractDataFrame, x) = hcat!(df[:, :], x) Base.hcat(df::AbstractDataFrame, x, y...) = hcat!(hcat(df, x), y...) # vcat only accepts DataFrames. Finds union of columns, maintaining order -# of first df. Missing data becomes NAs. +# of first df. Missing data become null values. Base.vcat(df::AbstractDataFrame) = df Base.vcat(dfs::AbstractDataFrame...) = vcat(AbstractDataFrame[dfs...]) Base.vcat(dfs::Vector{Void}) = dfs + +_isnullable{A<:AbstractArray}(::Type{A}) = eltype(A) <: Nullable + function Base.vcat{T<:AbstractDataFrame}(dfs::Vector{T}) isempty(dfs) && return DataFrame() - coltyps, colnams, similars = _colinfo(dfs) - res = DataFrame() - Nrow = sum(nrow, dfs) - for j in 1:length(colnams) - colnam = colnams[j] - col = similar(similars[j], coltyps[j], Nrow) - - i = 1 - for df in dfs - if haskey(df, colnam) && eltype(df[colnam]) != NAtype - copy!(col, i, df[colnam]) - end - i += size(df, 1) + nrows = sum(nrow, dfs) + for colnam in unique(Base.flatten(names.(dfs))) + k = Bool[haskey(df, colnam) for df in dfs] + if all(k) + res[colnam] = vcat((dfs[i][colnam] for i in 1:length(dfs))...) + continue end - res[colnam] = col - end - res -end + c = ((typeof(dfs[i][colnam]) for i in 1:length(dfs) if k[i])...) 
+ C = Base.return_types(vcat, c) + + if length(C)==1 && isleaftype(C[1]) + if _isnullable(C[1]) + NC = C[1] + else + NC = NullableArray{eltype(C[1])} + end -_isnullable(::AbstractArray) = false -_isnullable(::AbstractDataArray) = true -const EMPTY_DATA = DataArray(Void, 0) - -function _colinfo{T<:AbstractDataFrame}(dfs::Vector{T}) - df1 = dfs[1] - colindex = copy(index(df1)) - coltyps = eltypes(df1) - similars = collect(columns(df1)) - nonnull_ct = Int[_isnullable(c) for c in columns(df1)] - - for i in 2:length(dfs) - df = dfs[i] - for j in 1:size(df, 2) - col = df[j] - cn, ct = _names(df)[j], eltype(col) - if haskey(colindex, cn) - idx = colindex[cn] - - oldtyp = coltyps[idx] - if !(ct <: oldtyp) - coltyps[idx] = promote_type(oldtyp, ct) + col = NC(nrows) + j = 1 + for i in 1:length(dfs) + if k[i] + copy!(col, j, dfs[i][colnam]) end - nonnull_ct[idx] += !_isnullable(col) - else # new column - push!(colindex, cn) - push!(coltyps, ct) - push!(similars, col) - push!(nonnull_ct, !_isnullable(col)) + j += nrow(dfs[i]) end - end - end + else + # warn("Unstable return types: ", C, " from vcat of ", [typeof(dfs[i][colnam]) for i in 1:length(dfs) if k[i]]) - for j in 1:length(colindex) - if nonnull_ct[j] < length(dfs) && !_isnullable(similars[j]) - similars[j] = EMPTY_DATA + E = Base.promote_eltype(c...) + TN = NullableArray{E <: Nullable ? eltype(E) : E} + col = vcat((k[i] ? dfs[i][colnam] : TN(nrow(dfs[i])) for i in 1:length(dfs))...) end - end - colnams = _names(colindex) - coltyps, colnams, similars + res[colnam] = col + end + res end ############################################################################## ## ## Hashing ## -## Make sure this agrees with is_equals() +## Make sure this agrees with isequals() ## ############################################################################## @@ -792,7 +765,7 @@ ncol(df::AbstractDataFrame) * `::AbstractDataFrame` : the updated version -See also [`size`]({ref}). +See also [`size`](@ref). 
NOTE: these functions may be depreciated for `size`. diff --git a/src/abstractdataframe/io.jl b/src/abstractdataframe/io.jl index debd979db3..fc1f9ca418 100644 --- a/src/abstractdataframe/io.jl +++ b/src/abstractdataframe/io.jl @@ -23,7 +23,7 @@ function printtable(io::IO, header::Bool = true, separator::Char = ',', quotemark::Char = '"', - nastring::AbstractString = "NA") + nastring::AbstractString = "NULL") n, p = size(df) etypes = eltypes(df) if header @@ -42,10 +42,10 @@ function printtable(io::IO, quotestr = string(quotemark) for i in 1:n for j in 1:p - if ! (isna(df[j],i)) + if !isnull(df[j],i) if ! (etypes[j] <: Real) print(io, quotemark) - escapedprint(io, df[i, j], quotestr) + escapedprint(io, get(df[i, j]), quotestr) print(io, quotemark) else print(io, df[i, j]) @@ -67,7 +67,7 @@ function printtable(df::AbstractDataFrame; header::Bool = true, separator::Char = ',', quotemark::Char = '"', - nastring::AbstractString = "NA") + nastring::AbstractString = "NULL") printtable(STDOUT, df, header = header, @@ -94,7 +94,7 @@ writetable(filename, df, [keyword options]) * `separator::Char` -- The separator character that you would like to use. Defaults to the output of `getseparator(filename)`, which uses commas for files that end in `.csv`, tabs for files that end in `.tsv` and a single space for files that end in `.wsv`. * `quotemark::Char` -- The character used to delimit string fields. Defaults to `'"'`. * `header::Bool` -- Should the file contain a header that specifies the column names from `df`. Defaults to `true`. -* `nastring::AbstractString` -- What to write in place of missing data. Defaults to `"NA"`. +* `nastring::AbstractString` -- What to write in place of missing data. Defaults to `"NULL"`. 
### Result @@ -115,7 +115,7 @@ function writetable(filename::AbstractString, header::Bool = true, separator::Char = getseparator(filename), quotemark::Char = '"', - nastring::AbstractString = "NA", + nastring::AbstractString = "NULL", append::Bool = false) if endswith(filename, ".bz") || endswith(filename, ".bz2") @@ -169,7 +169,6 @@ function html_escape(cell::AbstractString) end @compat function Base.show(io::IO, ::MIME"text/html", df::AbstractDataFrame) - n = size(df, 1) cnames = _names(df) write(io, "") write(io, "") @@ -178,13 +177,19 @@ end write(io, "") end write(io, "") - tty_rows, tty_cols = _displaysize(io) - mxrow = min(n,tty_rows) + haslimit = get(io, :limit, true) + n = size(df, 1) + if haslimit + tty_rows, tty_cols = _displaysize(io) + mxrow = min(n,tty_rows) + else + mxrow = n + end for row in 1:mxrow write(io, "") write(io, "") for column_name in cnames - cell = string(df[row, column_name]) + cell = sprint(ourshowcompact, df[row, column_name]) write(io, "") end write(io, "") @@ -200,6 +205,60 @@ end write(io, "
$column_name
$row$(html_escape(cell))
") end +############################################################################## +# +# LaTeX output +# +############################################################################## + +function latex_char_escape(char::AbstractString) + if char == "\\" + return "\\textbackslash{}" + elseif char == "~" + return "\\textasciitilde{}" + else + return string("\\", char) + end +end + +function latex_escape(cell::AbstractString) + cell = replace(cell, ['\\','~','#','$','%','&','_','^','{','}'], latex_char_escape) + return cell +end + +function Base.show(io::IO, ::MIME"text/latex", df::AbstractDataFrame) + nrows = size(df, 1) + ncols = size(df, 2) + cnames = _names(df) + alignment = repeat("c", ncols) + write(io, "\\begin{tabular}{r|") + write(io, alignment) + write(io, "}\n") + write(io, "\t& ") + header = join(map(c -> latex_escape(string(c)), cnames), " & ") + write(io, header) + write(io, "\\\\\n") + write(io, "\t\\hline\n") + for row in 1:nrows + write(io, "\t") + write(io, @sprintf("%d", row)) + for col in 1:ncols + write(io, " & ") + cell = df[row,col] + if !isnull(cell) + content = get(cell) + if mimewritable(MIME("text/latex"), content) + show(io, MIME("text/latex"), content) + else + print(io, latex_escape(string(content))) + end + end + end + write(io, " \\\\\n") + end + write(io, "\\end{tabular}\n") +end + ############################################################################## # # MIME diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index c8f7e8c7d9..c57f8e344b 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -2,13 +2,26 @@ ## Join / merge ## +# Like similar, but returns a nullable array +similar_nullable{T}(dv::AbstractArray{T}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) = + NullableArray(T, dims) + +similar_nullable{T<:Nullable}(dv::AbstractArray{T}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) = + NullableArray(eltype(T), dims) + +similar_nullable{T,R}(dv::CategoricalArray{T,R}, 
dims::@compat(Union{Int, Tuple{Vararg{Int}}})) = + NullableCategoricalArray(T, dims) + +similar_nullable(df::AbstractDataFrame, dims::Int) = + DataFrame(Any[similar_nullable(x, dims) for x in columns(df)], copy(index(df))) + function join_idx(left, right, max_groups) ## adapted from Wes McKinney's full_outer_join in pandas (file: src/join.pyx). - # NA group in location 0 + # NULL group in location 0 - left_sorter, where, left_count = DataArrays.groupsort_indexer(left, max_groups) - right_sorter, where, right_count = DataArrays.groupsort_indexer(right, max_groups) + left_sorter, where, left_count = groupsort_indexer(left, max_groups) + right_sorter, where, right_count = groupsort_indexer(right, max_groups) # First pass, determine size of result set tcount = 0 @@ -27,7 +40,7 @@ function join_idx(left, right, max_groups) end end - # group 0 is the NA group + # group 0 is the NULL group tposition = 0 lposition = 0 rposition = 0 @@ -72,66 +85,134 @@ function join_idx(left, right, max_groups) right_sorter[right_indexer], right_sorter[rightonly_indexer]) end -function DataArrays.PooledDataVecs(df1::AbstractDataFrame, - df2::AbstractDataFrame) +function sharepools{S,N}(v1::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}}, + v2::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}}, + index::Vector{S}, + R) + tidx1 = convert(Vector{R}, indexin(CategoricalArrays.index(v1.pool), index)) + tidx2 = convert(Vector{R}, indexin(CategoricalArrays.index(v2.pool), index)) + refs1 = zeros(R, length(v1)) + refs2 = zeros(R, length(v2)) + for i in 1:length(refs1) + if v1.refs[i] != 0 + refs1[i] = tidx1[v1.refs[i]] + end + end + for i in 1:length(refs2) + if v2.refs[i] != 0 + refs2[i] = tidx2[v2.refs[i]] + end + end + pool = CategoricalPool{S, R}(index) + return (CategoricalArray(refs1, pool), + CategoricalArray(refs2, pool)) +end + +function sharepools{S,N}(v1::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}}, + v2::Union{CategoricalArray{S,N}, 
NullableCategoricalArray{S,N}}) + index = sort(unique([levels(v1); levels(v2)])) + sz = length(index) + + R = sz <= typemax(UInt8) ? UInt8 : + sz <= typemax(UInt16) ? UInt16 : + sz <= typemax(UInt32) ? UInt32 : + UInt64 + + # To ensure type stability during actual work + sharepools(v1, v2, index, R) +end + +sharepools{S,N}(v1::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}}, + v2::AbstractArray{S,N}) = + sharepools(v1, oftype(v1, v2)) + +sharepools{S,N}(v1::AbstractArray{S,N}, + v2::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}}) = + sharepools(oftype(v2, v1), v2) + +# TODO: write an optimized version for (Nullable)CategoricalArray +function sharepools(v1::AbstractArray, + v2::AbstractArray) + ## Return two categorical arrays that share the same pool. + + ## TODO: allow specification of R + R = CategoricalArrays.DefaultRefType + refs1 = Array(R, size(v1)) + refs2 = Array(R, size(v2)) + poolref = Dict{promote_type(eltype(v1), eltype(v2)), R}() + maxref = 0 + + # loop through once to fill the poolref dict + for i = 1:length(v1) + if !_isnull(v1[i]) + poolref[v1[i]] = 0 + end + end + for i = 1:length(v2) + if !_isnull(v2[i]) + poolref[v2[i]] = 0 + end + end + + # fill positions in poolref + pool = sort(collect(keys(poolref))) + i = 1 + for p in pool + poolref[p] = i + i += 1 + end + + # fill in newrefs + zeroval = zero(R) + for i = 1:length(v1) + if _isnull(v1[i]) + refs1[i] = zeroval + else + refs1[i] = poolref[v1[i]] + end + end + for i = 1:length(v2) + if _isnull(v2[i]) + refs2[i] = zeroval + else + refs2[i] = poolref[v2[i]] + end + end + + pool = CategoricalPool(pool) + return (NullableCategoricalArray(refs1, pool), + NullableCategoricalArray(refs2, pool)) +end + +function sharepools(df1::AbstractDataFrame, df2::AbstractDataFrame) # This method exists to allow merge to work with multiple columns. 
- # It takes the columns of each DataFrame and returns a DataArray + # It takes the columns of each DataFrame and returns a categorical array # with a merged pool that "keys" the combination of column values. # The pools of the result don't really mean anything. - dv1, dv2 = PooledDataVecs(df1[1], df2[1]) - # use UInt32 instead of the minimum integer size chosen by PooledDataVecs + dv1, dv2 = sharepools(df1[1], df2[1]) + # use UInt32 instead of the minimum integer size chosen by sharepools # since the number of levels can be high refs1 = Vector{UInt32}(dv1.refs) refs2 = Vector{UInt32}(dv2.refs) - # the + 1 handles NA's + # the + 1 handles nulls refs1[:] += 1 refs2[:] += 1 - ngroups = length(dv1.pool) + 1 + ngroups = length(levels(dv1)) + 1 for j = 2:ncol(df1) - dv1, dv2 = PooledDataVecs(df1[j], df2[j]) + dv1, dv2 = sharepools(df1[j], df2[j]) for i = 1:length(refs1) refs1[i] += (dv1.refs[i]) * ngroups end for i = 1:length(refs2) refs2[i] += (dv2.refs[i]) * ngroups end - ngroups *= (length(dv1.pool) + 1) + ngroups *= length(levels(dv1)) + 1 end # recode refs1 and refs2 to drop the unused column combinations and # limit the pool size - PooledDataVecs( refs1, refs2 ) + sharepools(refs1, refs2) end -function DataArrays.PooledDataArray{R}(df::AbstractDataFrame, ::Type{R}) - # This method exists to allow another way for merge to work with - # multiple columns. It takes the columns of the DataFrame and - # returns a DataArray with a merged pool that "keys" the - # combination of column values. - # Notes: - # - I skipped the sort to make it faster. - # - Converting each individual one-row DataFrame to a Tuple - # might be faster. 
- refs = zeros(R, nrow(df)) - poolref = Dict{AbstractDataFrame, Int}() - pool = Array(UInt64, 0) - j = 1 - for i = 1:nrow(df) - val = df[i,:] - if haskey(poolref, val) - refs[i] = poolref[val] - else - push!(pool, hash(val)) - refs[i] = j - poolref[val] = j - j += 1 - end - end - return PooledDataArray(DataArrays.RefArray(refs), pool) -end - -DataArrays.PooledDataArray(df::AbstractDataFrame) = PooledDataArray(df, DEFAULT_POOLED_REF_TYPE) - - """ Join two DataFrames @@ -164,11 +245,11 @@ join(df1::AbstractDataFrame, - `:cross` : a full Cartesian product of the key combinations; every row of `df1` is matched with every row of `df2` -`NA`s are filled in where needed to complete joins. +Null values are filled in where needed to complete joins. ### Result -* `::DataFrame` : the joined DataFrame +* `::DataFrame` : the joined DataFrame ### Examples @@ -199,7 +280,7 @@ function Base.join(df1::AbstractDataFrame, throw(ArgumentError("Missing join argument 'on'.")) end - dv1, dv2 = PooledDataVecs(df1[on], df2[on]) + dv1, dv2 = sharepools(df1[on], df2[on]) left_idx, leftonly_idx, right_idx, rightonly_idx = join_idx(dv1.refs, dv2.refs, length(dv1.pool)) @@ -216,14 +297,14 @@ function Base.join(df1::AbstractDataFrame, left = df1[[left_idx; leftonly_idx], :] right = vcat(df2w[right_idx, :], - nas(df2w, length(leftonly_idx))) + similar_nullable(df2w, length(leftonly_idx))) return hcat!(left, right) elseif kind == :right df1w = without(df1, on) left = vcat(df1w[left_idx, :], - nas(df1w, length(rightonly_idx))) + similar_nullable(df1w, length(rightonly_idx))) right = df2[[right_idx; rightonly_idx], :] return hcat!(left, right) @@ -232,8 +313,8 @@ function Base.join(df1::AbstractDataFrame, mixed = hcat!(df1[left_idx, :], df2w[right_idx, :]) leftonly = hcat!(df1[leftonly_idx, :], - nas(df2w, length(leftonly_idx))) - rightonly = hcat!(nas(df1w, length(rightonly_idx)), + similar_nullable(df2w, length(leftonly_idx))) + rightonly = hcat!(similar_nullable(df1w, length(rightonly_idx)), 
df2[rightonly_idx, :]) return vcat(mixed, leftonly, rightonly) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 47f2e33b90..75c27bfaa0 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -78,25 +78,24 @@ function stack(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Vector [Compat.repeat(df[c], outer=N) for c in id_vars]...], # id_var columns cnames) end -function stack(df::AbstractDataFrame, measure_vars::Int, id_vars::Int) - stack(df, [measure_vars], [id_vars]) +function stack(df::AbstractDataFrame, measure_var::Int, id_var::Int) + stack(df, [measure_var], [id_var]) end -function stack(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Int) - stack(df, measure_vars, [id_vars]) +function stack(df::AbstractDataFrame, measure_vars::Vector{Int}, id_var::Int) + stack(df, measure_vars, [id_var]) end -function stack(df::AbstractDataFrame, measure_vars::Int, id_vars::Vector{Int}) - stackdf(df, [measure_vars], id_vars) +function stack(df::AbstractDataFrame, measure_var::Int, id_vars::Vector{Int}) + stackdf(df, [measure_var], id_vars) end stack(df::AbstractDataFrame, measure_vars, id_vars) = stack(df, index(df)[measure_vars], index(df)[id_vars]) -function stack(df::AbstractDataFrame, measure_vars) +# no vars specified, by default select only numeric columns +numeric_vars(df::AbstractDataFrame) = [T <: AbstractFloat || (T <: Nullable && eltype(T) <: AbstractFloat) + for T in eltypes(df)] +function stack(df::AbstractDataFrame, measure_vars = numeric_vars(df)) mv_inds = index(df)[measure_vars] stack(df, mv_inds, _setdiff(1:ncol(df), mv_inds)) end -function stack(df::AbstractDataFrame) - idx = [1:length(df);][[t <: AbstractFloat for t in eltypes(df)]] - stack(df, idx) -end """ Stacks a DataFrame; convert from a wide to long format; see @@ -163,27 +162,30 @@ function unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, value::Int) # `rowkey` integer indicating which column to place 
along rows # `colkey` integer indicating which column to place along column headers # `value` integer indicating which column has values - refkeycol = PooledDataArray(df[rowkey]) + refkeycol = NullableCategoricalArray(df[rowkey]) valuecol = df[value] - # TODO make a version with a default refkeycol - keycol = PooledDataArray(df[colkey]) + keycol = NullableCategoricalArray(df[colkey]) Nrow = length(refkeycol.pool) Ncol = length(keycol.pool) - # TODO make fillNA(type, length) - payload = DataFrame(Any[DataArray(eltype(valuecol), Nrow) for i in 1:Ncol], map(Symbol, keycol.pool)) + T = eltype(valuecol) + if T <: Nullable + T = eltype(T) + end + payload = DataFrame(Any[NullableArray(T, Nrow) for i in 1:Ncol], + map(Symbol, levels(keycol))) nowarning = true for k in 1:nrow(df) - j = @compat Int(keycol.refs[k]) - i = @compat Int(refkeycol.refs[k]) + j = Int(CategoricalArrays.order(keycol.pool)[keycol.refs[k]]) + i = Int(CategoricalArrays.order(refkeycol.pool)[refkeycol.refs[k]]) if i > 0 && j > 0 - if nowarning && !isna(payload[j][i]) + if nowarning && !isnull(payload[j][i]) warn("Duplicate entries in unstack.") nowarning = false end payload[j][i] = valuecol[k] end end - insert!(payload, 1, refkeycol.pool, _names(df)[rowkey]) + insert!(payload, 1, NullableArray(levels(refkeycol)), _names(df)[rowkey]) end unstack(df::AbstractDataFrame, rowkey, colkey, value) = unstack(df, index(df)[rowkey], index(df)[colkey], index(df)[value]) @@ -196,24 +198,28 @@ function unstack(df::AbstractDataFrame, colkey::Int, value::Int) # group on anything not a key or value: g = groupby(df, setdiff(_names(df), _names(df)[[colkey, value]])) groupidxs = [g.idx[g.starts[i]:g.ends[i]] for i in 1:length(g.starts)] - rowkey = PooledDataArray(zeros(Int, size(df, 1)), [1:length(groupidxs);]) + rowkey = zeros(Int, size(df, 1)) for i in 1:length(groupidxs) rowkey[groupidxs[i]] = i end - keycol = PooledDataArray(df[colkey]) + keycol = NullableCategoricalArray(df[colkey]) valuecol = df[value] df1 = 
df[g.idx[g.starts], g.cols] - keys = unique(keycol) Nrow = length(g) - Ncol = length(keycol.pool) - df2 = DataFrame(Any[DataArray(fill(valuecol[1], Nrow), fill(true, Nrow)) for i in 1:Ncol], map(@compat(Symbol), keycol.pool)) + Ncol = length(levels(keycol)) + T = eltype(valuecol) + if T <: Nullable + T = eltype(T) + end + df2 = DataFrame(Any[NullableArray(T, Nrow) for i in 1:Ncol], + map(@compat(Symbol), levels(keycol))) nowarning = true for k in 1:nrow(df) - j = @compat Int(keycol.refs[k]) + j = Int(CategoricalArrays.order(keycol.pool)[keycol.refs[k]]) i = rowkey[k] if i > 0 && j > 0 - if nowarning && !isna(df2[j][i]) - warn("Duplicate entries in unstack.") + if nowarning && !isnull(df2[j][i]) + warn("Duplicate entries in unstack at row $k.") nowarning = false end df2[j][i] = valuecol[k] @@ -243,7 +249,7 @@ NOTE: Not exported. ### Constructor ```julia -RepeatedVector(d::AbstractVector...) +StackedVector(d::AbstractVector...) ``` ### Arguments @@ -289,7 +295,7 @@ Base.ndims(v::StackedVector) = 1 Base.eltype(v::StackedVector) = promote_type(map(eltype, v.components)...) 
Base.similar(v::StackedVector, T, dims::Dims) = similar(v.components[1], T, dims) -DataArrays.PooledDataArray(v::StackedVector) = PooledDataArray(v[:]) # could be more efficient +CategoricalArrays.CategoricalArray(v::StackedVector) = CategoricalArray(v[:]) # could be more efficient """ @@ -349,8 +355,8 @@ Base.reverse(v::RepeatedVector) = RepeatedVector(reverse(v.parent), v.inner, v.o Base.similar(v::RepeatedVector, T, dims::Dims) = similar(v.parent, T, dims) Base.unique(v::RepeatedVector) = unique(v.parent) -function DataArrays.PooledDataArray(v::RepeatedVector) - res = DataArrays.PooledDataArray(v.parent) +function CategoricalArrays.CategoricalArray(v::RepeatedVector) + res = CategoricalArrays.CategoricalArray(v.parent) res.refs = repeat(res.refs, inner = [v.inner], outer = [v.outer]) res end @@ -424,26 +430,22 @@ function stackdf(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Vect [RepeatedVector(df[:,c], 1, N) for c in id_vars]...], # id_var columns cnames) end -function stackdf(df::AbstractDataFrame, measure_vars::Int, id_vars::Int) - stackdf(df, [measure_vars], [id_vars]) +function stackdf(df::AbstractDataFrame, measure_var::Int, id_var::Int) + stackdf(df, [measure_var], [id_var]) end -function stackdf(df::AbstractDataFrame, measure_vars, id_vars::Int) - stackdf(df, measure_vars, [id_vars]) +function stackdf(df::AbstractDataFrame, measure_vars, id_var::Int) + stackdf(df, measure_vars, [id_var]) end -function stackdf(df::AbstractDataFrame, measure_vars::Int, id_vars) - stackdf(df, [measure_vars], id_vars) +function stackdf(df::AbstractDataFrame, measure_var::Int, id_vars) + stackdf(df, [measure_var], id_vars) end function stackdf(df::AbstractDataFrame, measure_vars, id_vars) stackdf(df, index(df)[measure_vars], index(df)[id_vars]) end -function stackdf(df::AbstractDataFrame, measure_vars) +function stackdf(df::AbstractDataFrame, measure_vars = numeric_vars(df)) m_inds = index(df)[measure_vars] stackdf(df, m_inds, _setdiff(1:ncol(df), m_inds)) end 
-function stackdf(df::AbstractDataFrame) - idx = [1:length(df);][[t <: AbstractFloat for t in eltypes(df)]] - stackdf(df, idx) -end """ A stacked view of a DataFrame (long format); see `stackdf` diff --git a/src/abstractdataframe/show.jl b/src/abstractdataframe/show.jl index 4effdf1965..b981623b85 100644 --- a/src/abstractdataframe/show.jl +++ b/src/abstractdataframe/show.jl @@ -62,8 +62,9 @@ end #' ourshowcompact(STDOUT, "abc") #' ourshowcompact(STDOUT, 10000) ourshowcompact(io::IO, x::Any) = showcompact(io, x) # -> Void -ourshowcompact(io::IO, x::AbstractString) = showcompact(io, x) # -> Void +ourshowcompact(io::IO, x::AbstractString) = print(io, x) # -> Void ourshowcompact(io::IO, x::Symbol) = print(io, x) # -> Void +ourshowcompact(io::IO, x::Nullable{String}) = isnull(x) ? showcompact(io, x) : print(io, get(x)) # -> Void #' @description #' @@ -100,8 +101,6 @@ function getmaxwidths(df::AbstractDataFrame, rowlabel::Symbol) # -> Vector{Int} maxwidths = Array(Int, size(df, 2) + 1) - # TODO: Move this definition somewhere else - NAstrwidth = 2 undefstrwidth = ourstrwidth(Base.undef_ref_str) j = 1 @@ -110,17 +109,11 @@ function getmaxwidths(df::AbstractDataFrame, maxwidth = ourstrwidth(name) # (2) Consider length of longest entry in that column - for indices in (rowindices1, rowindices2) - for i in indices - if isna(col, i) - maxwidth = max(maxwidth, NAstrwidth) - else - try - maxwidth = max(maxwidth, ourstrwidth(col[i])) - catch - maxwidth = max(maxwidth, undefstrwidth) - end - end + for indices in (rowindices1, rowindices2), i in indices + try + maxwidth = max(maxwidth, ourstrwidth(col[i])) + catch + maxwidth = max(maxwidth, undefstrwidth) end end maxwidths[j] = maxwidth diff --git a/src/abstractdataframe/sort.jl b/src/abstractdataframe/sort.jl index c92021f2e8..1d05e2c9da 100644 --- a/src/abstractdataframe/sort.jl +++ b/src/abstractdataframe/sort.jl @@ -308,7 +308,3 @@ end Base.sort(df::AbstractDataFrame, a::Algorithm, o::Ordering) = df[sortperm(df, a, o),:] 
Base.sortperm(df::AbstractDataFrame, a::Algorithm, o::@compat(Union{Perm,DFPerm})) = sort!([1:size(df, 1);], a, o) Base.sortperm(df::AbstractDataFrame, a::Algorithm, o::Ordering) = sortperm(df, a, DFPerm(o,df)) - -# Extras to speed up sorting -Base.sortperm{V}(df::AbstractDataFrame, a::Algorithm, o::FastPerm{Sort.ForwardOrdering,V}) = sortperm(o.vec) -Base.sortperm{V}(df::AbstractDataFrame, a::Algorithm, o::FastPerm{Sort.ReverseOrdering,V}) = reverse(sortperm(o.vec)) diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 3c33fc1505..4ce59b324f 100644 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -2,7 +2,7 @@ An AbstractDataFrame that stores a set of named columns The columns are normally AbstractVectors stored in memory, -particularly a Vector, DataVector, or PooledDataVector. +particularly a Vector, NullableVector, or CategoricalVector. **Constructors** @@ -30,9 +30,9 @@ Each column in `columns` should be the same length. **Notes** -Most of the default constructors convert columns to `DataArrays`. The +Most of the default constructors convert columns to `NullableArray`. The base constructor, `DataFrame(columns::Vector{Any}, -names::Vector{Symbol})` does not convert to `DataArrays`. +names::Vector{Symbol})` does not convert to `NullableArray`. A `DataFrame` is a lightweight object. As long as columns are not manipulated, creation of a DataFrame from existing AbstractVectors is @@ -48,12 +48,12 @@ loops. 
```julia df = DataFrame() v = ["x","y","z"][rand(1:3, 10)] -df1 = DataFrame(Any[[1:10], v, rand(10)], [:A, :B, :C]) # columns are Arrays -df2 = DataFrame(A = 1:10, B = v, C = rand(10)) # columns are DataArrays +df1 = DataFrame(Any[collect(1:10), v, rand(10)], [:A, :B, :C]) # columns are Arrays +df2 = DataFrame(A = 1:10, B = v, C = rand(10)) # columns are NullableArrays dump(df1) dump(df2) describe(df2) -head(df1) +DataFrames.head(df1) df1[:A] + df2[:C] df1[1:4, 1:2] df1[[:A,:C]] @@ -102,9 +102,9 @@ function DataFrame(; kwargs...) return result end -function DataFrame(columns::Vector{Any}, - cnames::Vector{Symbol} = gennames(length(columns))) - return DataFrame(columns, Index(cnames)) +function DataFrame(columns::AbstractVector, + cnames::AbstractVector{Symbol} = gennames(length(columns))) + return DataFrame(convert(Vector{Any}, columns), Index(convert(Vector{Symbol}, cnames))) end @@ -112,7 +112,7 @@ end function DataFrame(t::Type, nrows::Integer, ncols::Integer) columns = Array(Any, ncols) for i in 1:ncols - columns[i] = DataArray(t, nrows) + columns[i] = NullableArray(t, nrows) end cnames = gennames(ncols) return DataFrame(columns, Index(cnames)) @@ -123,19 +123,21 @@ function DataFrame(column_eltypes::Vector, cnames::Vector, nrows::Integer) p = length(column_eltypes) columns = Array(Any, p) for j in 1:p - columns[j] = DataArray(column_eltypes[j], nrows) + columns[j] = NullableArray(column_eltypes[j], nrows) end return DataFrame(columns, Index(cnames)) end -# Initialize an empty DataFrame with specific eltypes and names and whether is pooled data array -function DataFrame(column_eltypes::Vector{DataType}, cnames::Vector{Symbol}, ispda::Vector{Bool}, nrows::Integer) +# Initialize an empty DataFrame with specific eltypes and names +# and whether a nominal array should be created +function DataFrame(column_eltypes::Vector{DataType}, cnames::Vector{Symbol}, + nominal::Vector{Bool}, nrows::Integer) p = length(column_eltypes) columns = Array(Any, p) for j in 1:p - if 
ispda[j] - columns[j] = PooledDataArray(column_eltypes[j], nrows) + if nominal[j] + columns[j] = NullableCategoricalArray(column_eltypes[j], nrows) else - columns[j] = DataArray(column_eltypes[j], nrows) + columns[j] = NullableArray(column_eltypes[j], nrows) end end return DataFrame(columns, Index(cnames)) @@ -147,7 +149,7 @@ function DataFrame(column_eltypes::Vector, nrows::Integer) columns = Array(Any, p) cnames = gennames(p) for j in 1:p - columns[j] = DataArray(column_eltypes[j], nrows) + columns[j] = NullableArray(column_eltypes[j], nrows) end return DataFrame(columns, Index(cnames)) end @@ -167,8 +169,7 @@ function DataFrame{D <: Associative}(ds::Vector{D}, ks::Vector) col_eltypes = Type[@compat(Union{}) for _ = 1:length(ks)] for d in ds for (i,k) in enumerate(ks) - # TODO: check for user-defined "NA" values, ala pandas - if haskey(d, k) && !isna(d[k]) + if haskey(d, k) && !_isnull(d[k]) col_eltypes[i] = promote_type(col_eltypes[i], typeof(d[k])) end end @@ -179,7 +180,7 @@ function DataFrame{D <: Associative}(ds::Vector{D}, ks::Vector) df = DataFrame(col_eltypes, ks, length(ds)) for (i,d) in enumerate(ds) for (j,k) in enumerate(ks) - df[i,j] = get(d, k, NA) + df[i,j] = get(d, k, Nullable()) end end @@ -230,7 +231,9 @@ function Base.getindex(df::DataFrame, col_ind::ColumnIndex) end # df[MultiColumnIndex] => (Sub)?DataFrame -function Base.getindex{T <: ColumnIndex}(df::DataFrame, col_inds::AbstractVector{T}) +function Base.getindex{T <: ColumnIndex}(df::DataFrame, + col_inds::Union{AbstractVector{T}, + AbstractVector{Nullable{T}}}) selected_columns = index(df)[col_inds] new_columns = df.columns[selected_columns] return DataFrame(new_columns, Index(_names(df)[selected_columns])) @@ -246,20 +249,29 @@ function Base.getindex(df::DataFrame, row_ind::Real, col_ind::ColumnIndex) end # df[SingleRowIndex, MultiColumnIndex] => (Sub)?DataFrame -function Base.getindex{T <: ColumnIndex}(df::DataFrame, row_ind::Real, col_inds::AbstractVector{T}) +function Base.getindex{T 
<: ColumnIndex}(df::DataFrame, + row_ind::Real, + col_inds::Union{AbstractVector{T}, + AbstractVector{Nullable{T}}}) selected_columns = index(df)[col_inds] new_columns = Any[dv[[row_ind]] for dv in df.columns[selected_columns]] return DataFrame(new_columns, Index(_names(df)[selected_columns])) end # df[MultiRowIndex, SingleColumnIndex] => (Sub)?AbstractDataVector -function Base.getindex{T <: Real}(df::DataFrame, row_inds::AbstractVector{T}, col_ind::ColumnIndex) +function Base.getindex{T <: Real}(df::DataFrame, + row_inds::Union{AbstractVector{T}, AbstractVector{Nullable{T}}}, + col_ind::ColumnIndex) selected_column = index(df)[col_ind] return df.columns[selected_column][row_inds] end # df[MultiRowIndex, MultiColumnIndex] => (Sub)?DataFrame -function Base.getindex{R <: Real, T <: ColumnIndex}(df::DataFrame, row_inds::AbstractVector{R}, col_inds::AbstractVector{T}) +function Base.getindex{R <: Real, T <: ColumnIndex}(df::DataFrame, + row_inds::Union{AbstractVector{R}, + AbstractVector{Nullable{R}}}, + col_inds::Union{AbstractVector{T}, + AbstractVector{Nullable{T}}}) selected_columns = index(df)[col_inds] new_columns = Any[dv[row_inds] for dv in df.columns[selected_columns]] return DataFrame(new_columns, Index(_names(df)[selected_columns])) @@ -267,13 +279,20 @@ end # df[:, SingleColumnIndex] => (Sub)?AbstractVector # df[:, MultiColumnIndex] => (Sub)?DataFrame -Base.getindex{T<:ColumnIndex}(df::DataFrame, row_inds::Colon, col_inds::@compat(Union{T, AbstractVector{T}})) = df[col_inds] +Base.getindex{T<:ColumnIndex}(df::DataFrame, + row_inds::Colon, + col_inds::Union{T, AbstractVector{T}, + AbstractVector{Nullable{T}}}) = + df[col_inds] # df[SingleRowIndex, :] => (Sub)?DataFrame Base.getindex(df::DataFrame, row_ind::Real, col_inds::Colon) = df[[row_ind], col_inds] # df[MultiRowIndex, :] => (Sub)?DataFrame -function Base.getindex{R<:Real}(df::DataFrame, row_inds::AbstractVector{R}, col_inds::Colon) +function Base.getindex{R<:Real}(df::DataFrame, + 
row_inds::Union{AbstractVector{R}, + AbstractVector{Nullable{R}}}, + col_inds::Colon) new_columns = Any[dv[row_inds] for dv in df.columns] return DataFrame(new_columns, copy(index(df))) end @@ -344,17 +363,17 @@ function insert_multiple_entries!{T <: Real}(df::DataFrame, end end -upgrade_vector(v::Vector) = DataArray(v, falses(length(v))) -upgrade_vector(v::Range) = DataArray([v;], falses(length(v))) -upgrade_vector(v::BitVector) = DataArray(convert(Array{Bool}, v), falses(length(v))) -upgrade_vector(adv::AbstractDataArray) = adv +upgrade_vector{T<:Nullable}(v::AbstractArray{T}) = v +upgrade_vector(v::CategoricalArray) = NullableCategoricalArray(v) +upgrade_vector(v::AbstractArray) = NullableArray(v) + function upgrade_scalar(df::DataFrame, v::AbstractArray) msg = "setindex!(::DataFrame, ...) only broadcasts scalars, not arrays" throw(ArgumentError(msg)) end function upgrade_scalar(df::DataFrame, v::Any) n = (ncol(df) == 0) ? 1 : nrow(df) - DataArray(fill(v, n), falses(n)) + NullableArray(fill(v, n)) end # df[SingleColumnIndex] = AbstractVector @@ -365,10 +384,13 @@ function Base.setindex!(df::DataFrame, end # df[SingleColumnIndex] = Single Item (EXPANDS TO NROW(DF) if NCOL(DF) > 0) -function Base.setindex!(df::DataFrame, - v::Any, - col_ind::ColumnIndex) - insert_single_column!(df, upgrade_scalar(df, v), col_ind) +function Base.setindex!(df::DataFrame, v, col_ind::ColumnIndex) + if haskey(index(df), col_ind) + fill!(df[col_ind], v) + else + insert_single_column!(df, upgrade_scalar(df, v), col_ind) + end + return df end # df[MultiColumnIndex] = DataFrame @@ -397,7 +419,7 @@ function Base.setindex!{T <: ColumnIndex}(df::DataFrame, col_inds::AbstractVector{T}) dv = upgrade_vector(v) for col_ind in col_inds - insert_single_column!(df, dv, col_ind) + df[col_ind] = dv end return df end @@ -411,9 +433,8 @@ end function Base.setindex!{T <: ColumnIndex}(df::DataFrame, val::Any, col_inds::AbstractVector{T}) - dv = upgrade_scalar(df, val) for col_ind in col_inds - 
insert_single_column!(df, dv, col_ind) + df[col_ind] = val end return df end @@ -621,8 +642,20 @@ function Base.insert!(df::DataFrame, col_ind::Int, item::AbstractVector, name::S insert!(df.columns, col_ind, item) df end -Base.insert!(df::DataFrame, col_ind::Int, item, name::Symbol) = + +# FIXME: Needed to work around a crash: JuliaLang/julia#18299 +function Base.insert!(df::DataFrame, col_ind::Int, item::NullableArray, name::Symbol) + 0 < col_ind <= ncol(df) + 1 || throw(BoundsError()) + size(df, 1) == length(item) || size(df, 1) == 0 || error("number of rows does not match") + + insert!(index(df), col_ind, name) + insert!(df.columns, col_ind, item) + df +end + +function Base.insert!(df::DataFrame, col_ind::Int, item, name::Symbol) insert!(df, col_ind, upgrade_scalar(df, item), name) +end function Base.merge!(df::DataFrame, others::AbstractDataFrame...) for other in others @@ -721,9 +754,11 @@ function hcat!(df1::DataFrame, df2::AbstractDataFrame) return df1 end -hcat!{T}(df::DataFrame, x::DataVector{T}) = hcat!(df, DataFrame(Any[x])) -hcat!{T}(df::DataFrame, x::Vector{T}) = hcat!(df, DataFrame(Any[DataArray(x)])) -hcat!{T}(df::DataFrame, x::T) = hcat!(df, DataFrame(Any[DataArray([x])])) +hcat!(df::DataFrame, x::CategoricalArray) = hcat!(df, DataFrame(Any[x])) +hcat!(df::DataFrame, x::NullableCategoricalArray) = hcat!(df, DataFrame(Any[x])) +hcat!(df::DataFrame, x::NullableVector) = hcat!(df, DataFrame(Any[x])) +hcat!(df::DataFrame, x::Vector) = hcat!(df, DataFrame(Any[NullableArray(x)])) +hcat!(df::DataFrame, x) = hcat!(df, DataFrame(Any[NullableArray([x])])) # hcat! 
for 1-n arguments hcat!(df::DataFrame) = df @@ -739,7 +774,7 @@ Base.hcat(df::DataFrame, x) = hcat!(copy(df), x) ############################################################################## function nullable!(df::DataFrame, col::ColumnIndex) - df[col] = DataArray(df[col]) + df[col] = NullableArray(df[col]) df end function nullable!{T <: ColumnIndex}(df::DataFrame, cols::Vector{T}) @@ -755,25 +790,23 @@ end ## ############################################################################## -pool(a::AbstractVector) = compact(PooledDataArray(a)) - -function pool!(df::DataFrame, cname::@compat(Union{Integer, Symbol})) - df[cname] = pool(df[cname]) +function categorical!(df::DataFrame, cname::@compat(Union{Integer, Symbol}), compact::Bool=true) + df[cname] = categorical(df[cname], compact) return end -function pool!{T <: @compat(Union{Integer, Symbol})}(df::DataFrame, cnames::Vector{T}) +function categorical!{T <: @compat(Union{Integer, Symbol})}(df::DataFrame, cnames::Vector{T}, + compact::Bool=true) for cname in cnames - df[cname] = pool(df[cname]) + df[cname] = categorical(df[cname], compact) end return end -# TODO: Deprecate or change for being too inconsistent with other pool methods -function pool!(df::DataFrame) +function categorical!(df::DataFrame, compact::Bool=true) for i in 1:size(df, 2) if eltype(df[i]) <: AbstractString - df[i] = pool(df[i]) + df[i] = categorical(df[i], compact) end end return @@ -811,7 +844,7 @@ function _dataframe_from_associative(dnames, d::Associative) if length(col) != n throw(ArgumentError("All columns in Dict must have the same length")) end - columns[j] = DataArray(col) + columns[j] = NullableArray(col) colnames[j] = Symbol(name) end return DataFrame(columns, Index(colnames)) diff --git a/src/dataframe/io.jl b/src/dataframe/io.jl index c7f76baae6..6a86c2466d 100644 --- a/src/dataframe/io.jl +++ b/src/dataframe/io.jl @@ -517,7 +517,7 @@ function builddf(rows::Integer, values = Array(o.eltypes[j], rows) end - missing = falses(rows) + 
missing = fill(false, rows) is_int = true is_float = true is_bool = true @@ -640,9 +640,9 @@ function builddf(rows::Integer, end if o.makefactors && !(is_int || is_float || is_bool) - columns[j] = PooledDataArray(values, missing) + columns[j] = NullableCategoricalArray(values, missing) else - columns[j] = DataArray(values, missing) + columns[j] = NullableArray(values, missing) end end @@ -801,7 +801,7 @@ function readtable(io::IO, separator::Char = ',', quotemark::Vector{Char} = ['"'], decimal::Char = '.', - nastrings::Vector = ["", "NA"], + nastrings::Vector = ["", "NULL", "NA"], truestrings::Vector = ["T", "t", "TRUE", "true"], falsestrings::Vector = ["F", "f", "FALSE", "false"], makefactors::Bool = false, @@ -874,10 +874,10 @@ readtable(filename, [keyword options]) * `separator::Char` -- Assume that fields are split by the `separator` character. If not specified, it will be guessed from the filename: `.csv` defaults to `','`, `.tsv` defaults to `'\t'`, `.wsv` defaults to `' '`. * `quotemark::Vector{Char}` -- Assume that fields contained inside of two `quotemark` characters are quoted, which disables processing of separators and linebreaks. Set to `Char[]` to disable this feature and slightly improve performance. Defaults to `['"']`. * `decimal::Char` -- Assume that the decimal place in numbers is written using the `decimal` character. Defaults to `'.'`. -* `nastrings::Vector{String}` -- Translate any of the strings into this vector into an `NA`. Defaults to `["", "NA"]`. +* `nastrings::Vector{String}` -- Translate any of the strings into this vector into a NULL value. Defaults to `["", "NULL", "NA"]`. * `truestrings::Vector{String}` -- Translate any of the strings into this vector into a Boolean `true`. Defaults to `["T", "t", "TRUE", "true"]`. * `falsestrings::Vector{String}` -- Translate any of the strings into this vector into a Boolean `false`. Defaults to `["F", "f", "FALSE", "false"]`. 
-* `makefactors::Bool` -- Convert string columns into `PooledDataVector`'s for use as factors. Defaults to `false`. +* `makefactors::Bool` -- Convert string columns into `CategoricalVector`'s for use as factors. Defaults to `false`. * `nrows::Int` -- Read only `nrows` from the file. Defaults to `-1`, which indicates that the entire file should be read. * `names::Vector{Symbol}` -- Use the values in this array as the names for all columns instead of or in lieu of the names in the file's header. Defaults to `[]`, which indicates that the header should be used if present or that numeric names should be invented if there is no header. * `eltypes::Vector` -- Specify the types of all columns. Defaults to `[]`. @@ -909,7 +909,7 @@ function readtable(pathname::AbstractString; separator::Char = getseparator(pathname), quotemark::Vector{Char} = ['"'], decimal::Char = '.', - nastrings::Vector = String["", "NA"], + nastrings::Vector = String["", "NULL", "NA"], truestrings::Vector = String["T", "t", "TRUE", "true"], falsestrings::Vector = String["F", "f", "FALSE", "false"], makefactors::Bool = false, @@ -975,7 +975,7 @@ literals. Parses the string `s` containing delimiter-separated tabular data argument contains a list of flag characters, which, if present, are equivalent to supplying named arguments to `readtable` as follows: -- `f`: `makefactors=true`, convert string columns to `PooledData` columns +- `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns - `c`: `allowcomments=true`, ignore lines beginning with `#` - `H`: `header=false`, do not interpret the first line as column names """ @@ -1004,7 +1004,7 @@ separated values (CSV) using `readtable`, just as if it were being loaded from an external file. The suffix flags `f`, `c`, and `H` are optional. 
If present, they are equivalent to supplying named arguments to `readtable` as follows: -* `f`: `makefactors=true`, convert string columns to `PooledDataArray` columns +* `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns * `c`: `allowcomments=true`, ignore lines beginning with `#` * `H`: `header=false`, do not interpret the first line as column names @@ -1038,7 +1038,7 @@ character, just as if it were being loaded from an external file. The suffix flags `f`, `c`, and `H` are optional. If present, they are equivalent to supplying named arguments to `readtable` as follows: -* `f`: `makefactors=true`, convert string columns to `PooledDataArray` columns +* `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns * `c`: `allowcomments=true`, ignore lines beginning with `#` * `H`: `header=false`, do not interpret the first line as column names @@ -1074,7 +1074,7 @@ loaded from an external file. The suffix flags `f`, `c`, and `H` are optional. If present, they are equivalent to supplying named arguments to `readtable` as follows: -* `f`: `makefactors=true`, convert string columns to `PooledDataArray` columns +* `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns * `c`: `allowcomments=true`, ignore lines beginning with `#` * `H`: `header=false`, do not interpret the first line as column names @@ -1107,7 +1107,7 @@ separated values (TSV) using `readtable`, just as if it were being loaded from an external file. The suffix flags `f`, `c`, and `H` are optional. 
If present, they are equivalent to supplying named arguments to `readtable` as follows: -* `f`: `makefactors=true`, convert string columns to `PooledDataArray` columns +* `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns * `c`: `allowcomments=true`, ignore lines beginning with `#` * `H`: `header=false`, do not interpret the first line as column names diff --git a/src/dataframe/sort.jl b/src/dataframe/sort.jl index 9255557469..e6cd01a88d 100644 --- a/src/dataframe/sort.jl +++ b/src/dataframe/sort.jl @@ -12,7 +12,14 @@ end function Base.sort!(df::DataFrame, a::Base.Sort.Algorithm, o::Base.Sort.Ordering) p = sortperm(df, a, o) pp = similar(p) - for col in columns(df) + c = columns(df) + + for (i,col) in enumerate(c) + # Check if this column has been sorted already + if any(j -> c[j]===col, 1:i-1) + continue + end + copy!(pp,p) Base.permute!!(col, pp) end diff --git a/src/dataframerow/dataframerow.jl b/src/dataframerow/dataframerow.jl index daf7db86d7..1614e317b6 100644 --- a/src/dataframerow/dataframerow.jl +++ b/src/dataframerow/dataframerow.jl @@ -41,7 +41,7 @@ Base.convert(::Type{Array}, r::DataFrameRow) = convert(Array, r.df[r.row,:]) # so that duplicate rows would have the same hash function Base.hash(r::DataFrameRow, h::UInt) for col in columns(r.df) - if isna(col, r.row) + if _isnull(col[r.row]) h = hash(false, h) else h = hash(true, hash(col[r.row], h)) @@ -50,34 +50,16 @@ function Base.hash(r::DataFrameRow, h::UInt) return h end -# compare two elements in the array -_isequalelms(a::Array, i::Int, j::Int) = isequal(a[i], a[j]) - -# compare the two elements in the data array -function _isequalelms(a::DataArray, i::Int, j::Int) - if isna(a, i) - return isna(a, j) - else - return !isna(a, j) && isequal(a.data[i], a.data[j]) - end -end - -# compare two elements in the pooled array -# NOTE assume there are no duplicated elements in the pool -_isequalelms(a::PooledDataArray, i::Int, j::Int) = isequal(a.refs[i], a.refs[j]) - # comparison 
of DataFrame rows # only the rows of the same DataFrame could be compared # rows are equal if they have the same values (while the row indices could differ) +@compat(Base.:(==))(r1::DataFrameRow, r2::DataFrameRow) = isequal(r1, r2) + function Base.isequal(r1::DataFrameRow, r2::DataFrameRow) - if r1.df !== r2.df - throw(ArgumentError("Comparing rows from different frames not supported")) - end - if r1.row == r2.row - return true - end + r1.df == r2.df || throw(ArgumentError("Comparing rows from different frames not supported")) + r1.row == r2.row && return true for col in columns(r1.df) - if !_isequalelms(col, r1.row, r2.row) + if !isequal(col[r1.row], col[r2.row]) return false end end diff --git a/src/deprecated.jl b/src/deprecated.jl index 0c7e43c31d..286aa61b93 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -4,17 +4,13 @@ import Base: @deprecate @deprecate by(d::AbstractDataFrame, cols, s::Symbol) aggregate(d, cols, eval(s)) @deprecate nullable!(colnames::Array{Symbol,1}, df::AbstractDataFrame) nullable!(df, colnames) @deprecate nullable!(colnums::Array{Int,1}, df::AbstractDataFrame) nullable!(df, colnums) + import Base: keys, values, insert! @deprecate keys(df::AbstractDataFrame) names(df) @deprecate values(df::AbstractDataFrame) DataFrames.columns(df) @deprecate insert!(df::DataFrame, df2::AbstractDataFrame) merge!(df, df2) -import DataArrays: array, DataArray -@deprecate array(df::AbstractDataFrame) convert(Array, df) -@deprecate array(r::DataFrameRow) convert(Array, r) -if VERSION < v"0.4.0-" - @deprecate DataArray(df::AbstractDataFrame) convert(DataArray, df) -end -@deprecate DataArray(df::AbstractDataFrame, T::DataType) convert(DataArray{T}, df) - @deprecate read_rda(args...) FileIO.load(args...) + +@deprecate pool categorical +@deprecate pool! categorical! 
diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl index 50dbc288fc..9caca98f24 100644 --- a/src/groupeddataframe/grouping.jl +++ b/src/groupeddataframe/grouping.jl @@ -25,6 +25,41 @@ end # # Split # + +function groupsort_indexer(x::AbstractVector, ngroups::Integer, null_last::Bool=false) + # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx). + + # count group sizes, location 0 for NULL + n = length(x) + # counts = x.pool + counts = fill(0, ngroups + 1) + for i = 1:n + counts[x[i] + 1] += 1 + end + + # mark the start of each contiguous group of like-indexed data + where = fill(1, ngroups + 1) + if null_last + for i = 3:ngroups+1 + where[i] = where[i - 1] + counts[i - 1] + end + where[1] = where[end] + counts[end] + else + for i = 2:ngroups+1 + where[i] = where[i - 1] + counts[i - 1] + end + end + + # this is our indexer + result = fill(0, n) + for i = 1:n + label = x[i] + 1 + result[where[label]] = i + where[label] += 1 + end + result, where, counts +end + """ A view of an AbstractDataFrame split into row groups @@ -35,14 +70,13 @@ groupby(cols) ### Arguments -* `d` : an AbstractDataFrame -* `cols` : an - -If `d` is not provided, a curried version of groupby is given. +* `d` : an AbstractDataFrame to split (optional, see [Returns](#returns)) +* `cols` : data frame columns to group by ### Returns * `::GroupedDataFrame` : a grouped view into `d` +* `::Function`: a function `x -> groupby(x, cols)` (if `d` is not specified) ### Details @@ -76,8 +110,8 @@ vcat([g[:b] for g in gd]...) 
for g in gd println(g) end -map(d -> mean(d[:c]), gd) # returns a GroupApplied object -combine(map(d -> mean(d[:c]), gd)) +map(d -> mean(dropnull(d[:c])), gd) # returns a GroupApplied object +combine(map(d -> mean(dropnull(d[:c])), gd)) df |> groupby(:a) |> [sum, length] df |> groupby([:a, :b]) |> [sum, length] ``` @@ -88,25 +122,34 @@ function groupby{T}(d::AbstractDataFrame, cols::Vector{T}) ## http://wesmckinney.com/blog/?p=489 ncols = length(cols) - # use the pool trick to get a set of integer references for each unique item - dv = PooledDataArray(d[cols[ncols]]) - # if there are NAs, add 1 to the refs to avoid underflows in x later - dv_has_nas = (findfirst(dv.refs, 0) > 0 ? 1 : 0) - # use UInt32 instead of the PDA's integer size since the number of levels can be high - x = copy!(similar(dv.refs, UInt32), dv.refs) .+ dv_has_nas + # use CategoricalArray to get a set of integer references for each unique item + nv = NullableCategoricalArray(d[cols[ncols]]) + # if there are NULLs, add 1 to the refs to avoid underflows in x later + anynulls = (findfirst(nv.refs, 0) > 0 ? 1 : 0) + # use UInt32 instead of the original array's integer size since the number of levels can be high + x = similar(nv.refs, UInt32) + for i = 1:nrow(d) + if nv.refs[i] == 0 + x[i] = 1 + else + x[i] = CategoricalArrays.order(nv.pool)[nv.refs[i]] + anynulls + end + end # also compute the number of groups, which is the product of the set lengths - ngroups = length(dv.pool) + dv_has_nas + ngroups = length(levels(nv)) + anynulls # if there's more than 1 column, do roughly the same thing repeatedly for j = (ncols - 1):-1:1 - dv = PooledDataArray(d[cols[j]]) - dv_has_nas = (findfirst(dv.refs, 0) > 0 ? 1 : 0) + nv = NullableCategoricalArray(d[cols[j]]) + anynulls = (findfirst(nv.refs, 0) > 0 ? 
1 : 0) for i = 1:nrow(d) - x[i] += (dv.refs[i] + dv_has_nas- 1) * ngroups + if nv.refs[i] != 0 + x[i] += (CategoricalArrays.order(nv.pool)[nv.refs[i]] + anynulls - 1) * ngroups + end end - ngroups = ngroups * (length(dv.pool) + dv_has_nas) + ngroups = ngroups * (length(levels(nv)) + anynulls) # TODO if ngroups is really big, shrink it end - (idx, starts) = DataArrays.groupsort_indexer(x, ngroups) + (idx, starts) = groupsort_indexer(x, ngroups) # Remove zero-length groupings starts = _uniqueofsorted(starts) ends = starts[2:end] - 1 @@ -159,15 +202,14 @@ Not meant to be constructed directly, see `groupby` abnd provided for a GroupApplied object. """ -type GroupApplied +immutable GroupApplied{T<:AbstractDataFrame} gd::GroupedDataFrame - vals::Vector + vals::Vector{T} - function GroupApplied(gd, vals) - if length(gd) != length(vals) - error("GroupApplied requires keys and vals be of equal length.") - end - new(gd, vals) + @compat function (::Type{GroupApplied})(gd::GroupedDataFrame, vals::Vector) + length(gd) == length(vals) || + throw(DimensionMismatch("GroupApplied requires keys and vals be of equal length (got $(length(gd)) and $(length(vals))).")) + new{eltype(vals)}(gd, vals) end end @@ -178,10 +220,10 @@ end # map() sweeps along groups function Base.map(f::Function, gd::GroupedDataFrame) - GroupApplied(gd, AbstractDataFrame[wrap(f(d)) for d in gd]) + GroupApplied(gd, [wrap(f(df)) for df in gd]) end function Base.map(f::Function, ga::GroupApplied) - GroupApplied(ga.gd, AbstractDataFrame[wrap(f(d)) for d in ga.vals]) + GroupApplied(ga.gd, [wrap(f(df)) for df in ga.vals]) end wrap(df::AbstractDataFrame) = df @@ -209,23 +251,21 @@ combine(ga::GroupApplied) df = DataFrame(a = repeat([1, 2, 3, 4], outer=[2]), b = repeat([2, 1], outer=[4]), c = randn(8)) -combine(map(d -> mean(d[:c]), gd)) +combine(map(d -> mean(dropnull(d[:c])), gd)) ``` """ function combine(ga::GroupApplied) gd, vals = ga.gd, ga.vals - # Could be made shorter with a rep(x, lengths) function - # See 
JuliaLang/julia#16443 - idx = Vector{Int}(sum(Int[size(val, 1) for val in vals])) + valscat = vcat(vals) + idx = Vector{Int}(size(valscat, 1)) j = 0 - for i in 1:length(vals) - n = size(vals[i], 1) - @inbounds idx[j + (1:n)] = gd.idx[gd.starts[i]] + @inbounds for (start, val) in zip(gd.starts, vals) + n = size(val, 1) + idx[j + (1:n)] = gd.idx[start] j += n end - ret = gd.parent[idx, gd.cols] - hcat!(ret, vcat(vals)) + hcat!(gd.parent[idx, gd.cols], valscat) end @@ -260,12 +300,14 @@ colwise(sum, groupby(df, :a)) ``` """ -colwise(f::Function, d::AbstractDataFrame) = Any[[f(d[idx])] for idx in 1:size(d, 2)] +colwise(f::Function, d::AbstractDataFrame) = Any[vcat(f(d[idx])) for idx in 1:size(d, 2)] colwise(f::Function, gd::GroupedDataFrame) = map(colwise(f), gd) colwise(f::Function) = x -> colwise(f, x) colwise(f) = x -> colwise(f, x) # apply several functions to each column in a DataFrame -colwise{T<:Function}(fns::Vector{T}, d::AbstractDataFrame) = Any[[f(d[idx])] for f in fns, idx in 1:size(d, 2)][:] +colwise{T<:Function}(fns::Vector{T}, d::AbstractDataFrame) = + reshape(Any[vcat(f(d[idx])) for f in fns, idx in 1:size(d, 2)], + length(fns)*size(d, 2)) colwise{T<:Function}(fns::Vector{T}, gd::GroupedDataFrame) = map(colwise(fns), gd) colwise{T<:Function}(fns::Vector{T}) = x -> colwise(fns, x) @@ -299,7 +341,7 @@ notation can be used. 
### Returns -* `::DataFrame` +* `::DataFrame` ### Examples @@ -308,11 +350,11 @@ df = DataFrame(a = repeat([1, 2, 3, 4], outer=[2]), b = repeat([2, 1], outer=[4]), c = randn(8)) by(df, :a, d -> sum(d[:c])) -by(df, :a, d -> 2 * d[:c]) -by(df, :a, d -> DataFrame(c_sum = sum(d[:c]), c_mean = mean(d[:c]))) -by(df, :a, d -> DataFrame(c = d[:c], c_mean = mean(d[:c]))) +by(df, :a, d -> 2 * dropnull(d[:c])) +by(df, :a, d -> DataFrame(c_sum = sum(d[:c]), c_mean = mean(dropnull(d[:c])))) +by(df, :a, d -> DataFrame(c = d[:c], c_mean = mean(dropnull(d[:c])))) by(df, [:a, :b]) do d - DataFrame(m = mean(d[:c]), v = var(d[:c])) + DataFrame(m = mean(dropnull(d[:c])), v = var(dropnull(d[:c]))) end ``` @@ -347,7 +389,7 @@ same length. ### Returns -* `::DataFrame` +* `::DataFrame` ### Examples @@ -356,9 +398,9 @@ df = DataFrame(a = repeat([1, 2, 3, 4], outer=[2]), b = repeat([2, 1], outer=[4]), c = randn(8)) aggregate(df, :a, sum) -aggregate(df, :a, [sum, mean]) -aggregate(groupby(df, :a), [sum, mean]) -df |> groupby(:a) |> [sum, mean] # equivalent +aggregate(df, :a, [sum, x->mean(dropnull(x))]) +aggregate(groupby(df, :a), [sum, x->mean(dropnull(x))]) +df |> groupby(:a) |> [sum, x->mean(dropnull(x))] # equivalent ``` """ @@ -369,7 +411,7 @@ function aggregate{T<:Function}(d::AbstractDataFrame, fs::Vector{T}) end # Applies aggregate to non-key cols of each SubDataFrame of a GroupedDataFrame -aggregate(gd::GroupedDataFrame, fs::Function) = aggregate(gd, [fs]) +aggregate(gd::GroupedDataFrame, f::Function) = aggregate(gd, [f]) function aggregate{T<:Function}(gd::GroupedDataFrame, fs::Vector{T}) headers = _makeheaders(fs, _setdiff(_names(gd), gd.cols)) combine(map(x -> _aggregate(without(x, gd.cols), fs, headers), gd)) @@ -386,8 +428,8 @@ end function _makeheaders{T<:Function}(fs::Vector{T}, cn::Vector{Symbol}) fnames = _fnames(fs) # see other/utils.jl - scn = [string(x) for x in cn] - [Symbol("$(colname)_$(fname)") for fname in fnames, colname in scn][:] + 
reshape([Symbol(colname,'_',fname) for fname in fnames, colname in cn], + length(fnames)*length(cn)) end function _aggregate{T<:Function}(d::AbstractDataFrame, fs::Vector{T}, headers::Vector{Symbol}) diff --git a/src/other/index.jl b/src/other/index.jl index 4ba61fd94d..ef50b3b710 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -113,8 +113,10 @@ end Base.getindex(x::Index, idx::Symbol) = x.lookup[idx] Base.getindex(x::AbstractIndex, idx::Real) = @compat Int(idx) -Base.getindex(x::AbstractIndex, idx::AbstractDataVector{Bool}) = getindex(x, convert(Array, idx, false)) -Base.getindex{T}(x::AbstractIndex, idx::AbstractDataVector{T}) = getindex(x, dropna(idx)) +Base.getindex(x::AbstractIndex, idx::AbstractVector{Nullable{Bool}}) = + getindex(x, convert(Vector{Bool}, idx, false)) +Base.getindex{T<:Nullable}(x::AbstractIndex, idx::AbstractVector{T}) = + getindex(x, dropnull(idx)) Base.getindex(x::AbstractIndex, idx::AbstractVector{Bool}) = find(idx) Base.getindex(x::AbstractIndex, idx::Range) = [idx;] Base.getindex{T <: Real}(x::AbstractIndex, idx::AbstractVector{T}) = convert(Vector{Int}, idx) diff --git a/src/other/utils.jl b/src/other/utils.jl index 7227f7027d..a0ceef879f 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -54,7 +54,7 @@ function make_unique(names::Vector{Symbol}; allow_duplicates=true) name = names[i] in(name, seen) ? push!(dups, i) : push!(seen, name) end - + if !allow_duplicates && length(dups) > 0 d = unique(names[dups]) msg = """Duplicate variable names: $d. @@ -99,50 +99,55 @@ function gennames(n::Integer) return res end + #' @description #' -#' Count the number of missing values in an Array. +#' Count the number of null values in an array. #' -#' NOTE: This function always returns 0. +#' @field a::AbstractArray The array whose missing values are to be counted. #' -#' @field a::Array The Array whose missing values are to be counted. -#' -#' @returns count::Int The number of missing values in `a`. 
+#' @returns count::Int The number of null values in `a`. #' #' @examples #' -#' DataFrames.countna([1, 2, 3]) -countna(a::Array) = 0 +#' DataFrames.countnull([1, 2, 3]) +function countnull(a::AbstractArray) + res = 0 + for x in a + res += _isnull(x) + end + return res +end #' @description #' -#' Count the number of missing values in a DataArray. +#' Count the number of missing values in a NullableArray. #' -#' @field da::DataArray The DataArray whose missing values are to be counted. +#' @field a::NullableArray The NullableArray whose missing values are to be counted. #' -#' @returns count::Int The number of missing values in `a`. +#' @returns count::Int The number of null values in `a`. #' #' @examples #' -#' DataFrames.countna(@data([1, 2, 3])) -countna(da::DataArray) = sum(da.na) +#' DataFrames.countnull(NullableArray([1, 2, 3])) +countnull(a::NullableArray) = sum(a.isnull) #' @description #' -#' Count the number of missing values in a PooledDataArray. +#' Count the number of missing values in a NullableCategoricalArray. #' -#' @field pda::PooledDataArray The PooledDataArray whose missing values +#' @field na::CategoricalArray The CategoricalArray whose missing values #' are to be counted. #' -#' @returns count::Int The number of missing values in `a`. +#' @returns count::Int The number of null values in `a`. 
#' #' @examples #' -#' DataFrames.countna(@pdata([1, 2, 3])) -function countna(da::PooledDataArray) +#' DataFrames.countnull(CategoricalArray([1, 2, 3])) +function countnull(a::CategoricalArray) res = 0 - for i in 1:length(da) - res += da.refs[i] == 0 + for x in a.refs + res += x == 0 end return res end @@ -193,3 +198,6 @@ function _fnames{T<:Function}(fs::Vector{T}) end names end + +_isnull(x::Any) = false +_isnull(x::Nullable) = isnull(x) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 095ea5da0d..47e7b97434 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -140,19 +140,21 @@ end # Methods for constructing ContrastsMatrix from data. These are called in # ModelFrame constructor and setcontrasts!. -# TODO: add methods for new categorical types - -ContrastsMatrix(C::AbstractContrasts, v::PooledDataArray) = +ContrastsMatrix(C::AbstractContrasts, + v::Union{CategoricalArray, NullableCategoricalArray}) = ContrastsMatrix(C, levels(v)) -ContrastsMatrix{C <: AbstractContrasts}(c::Type{C}, col::PooledDataArray) = +ContrastsMatrix{C <: AbstractContrasts}(c::Type{C}, + col::Union{CategoricalArray, NullableCategoricalArray}) = throw(ArgumentError("contrast types must be instantiated (use $c() instead of $c)")) + # given an existing ContrastsMatrix, check that all of the levels present in the # data are present in the contrasts. Note that this behavior is different from the # ContrastsMatrix constructor, which requires that the levels be exactly the same. # This method exists to support things like `predict` that can operate on new data # which may contain only a subset of the original data's levels. Checking here # (instead of in `modelmat_cols`) allows an informative error message. 
-function ContrastsMatrix(c::ContrastsMatrix, col::PooledDataArray) +function ContrastsMatrix(c::ContrastsMatrix, + col::Union{CategoricalArray, NullableCategoricalArray}) if !isempty(setdiff(levels(col), c.levels)) throw(ArgumentError("there are levels in data that are not in ContrastsMatrix: " * "$(setdiff(levels(col), c.levels))" * @@ -171,7 +173,8 @@ nullify(x::Nullable) = x nullify(x) = Nullable(x) # Making a contrast type T only requires that there be a method for -# contrasts_matrix(T, v::PooledDataArray). The rest is boilerplate. +# contrasts_matrix(T, v::Union{CategoricalArray, NullableCategoricalArray}). +# The rest is boilerplate. for contrastType in [:DummyCoding, :EffectsCoding, :HelmertCoding] @eval begin type $contrastType <: AbstractContrasts diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 40f1cdb949..79777a5cab 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -40,6 +40,8 @@ type Terms intercept::Bool # is there an intercept column in the model matrix? end +Base.:(==)(t1::Terms, t2::Terms) = all(getfield(t1, f)==getfield(t2, f) for f in fieldnames(t1)) + type ModelFrame df::AbstractDataFrame terms::Terms @@ -85,19 +87,26 @@ function dospecials(ex::Expr) if !(a1 in specials) return ex end excp = copy(ex) excp.args = vcat(a1,map(dospecials, ex.args[2:end])) - if a1 != :* return excp end - aa = excp.args - a2 = aa[2] - a3 = aa[3] - if length(aa) > 3 - excp.args = vcat(a1, aa[3:end]) - a3 = dospecials(excp) + if a1 == :- + a2, a3 = excp.args[2:3] + a3 == 1 || error("invalid expression $ex; subtraction only supported for -1") + return :($a2 + -1) + elseif a1 == :* + aa = excp.args + a2 = aa[2] + a3 = aa[3] + if length(aa) > 3 + excp.args = vcat(a1, aa[3:end]) + a3 = dospecials(excp) + end + ## this order of expansion gives the R-style ordering of interaction + ## terms (after sorting in increasing interaction order) for higher- + ## order interaction terms (e.g. 
x1 * x2 * x3 should expand to x1 + + ## x2 + x3 + x1&x2 + x1&x3 + x2&x3 + x1&x2&x3) + :($a2 + $a2 & $a3 + $a3) + else + excp end - ## this order of expansion gives the R-style ordering of interaction - ## terms (after sorting in increasing interaction order) for higher- - ## order interaction terms (e.g. x1 * x2 * x3 should expand to x1 + - ## x2 + x3 + x1&x2 + x1&x3 + x2&x3 + x1&x2&x3) - :($a2 + $a2 & $a3 + $a3) end dospecials(a::Any) = a @@ -216,27 +225,16 @@ function Terms(f::Formula) Terms(tt, ev, facs, non_redundants, oo, haslhs, !any(noint)) end -## Default NA handler. Others can be added as keyword arguments -function na_omit(df::DataFrame) +## Default NULL handler. Others can be added as keyword arguments +function null_omit(df::DataFrame) cc = complete_cases(df) df[cc,:], cc end -## Trim the pool field of da to only those levels that occur in the refs -function dropunusedlevels!(da::PooledDataArray) - rr = da.refs - uu = unique(rr) - length(uu) == length(da.pool) && return da - T = eltype(rr) - su = sort!(uu) - dict = Dict(zip(su, one(T):convert(T, length(uu)))) - da.refs = map(x -> dict[x], rr) - da.pool = da.pool[uu] - da -end -dropunusedlevels!(x) = x +_droplevels!(x::Any) = x +_droplevels!(x::Union{CategoricalArray, NullableCategoricalArray}) = droplevels!(x) -is_categorical(::PooledDataArray) = true +is_categorical(::Union{CategoricalArray, NullableCategoricalArray}) = true is_categorical(::Any) = false ## Check for non-redundancy of columns. 
For instance, if x is a factor with two @@ -285,16 +283,11 @@ end const DEFAULT_CONTRASTS = DummyCoding -function ModelFrame(trms::Terms, d::AbstractDataFrame; - contrasts::Dict = Dict()) - df, msng = na_omit(DataFrame(map(x -> d[x], trms.eterms))) - names!(df, convert(Vector{Symbol}, map(string, trms.eterms))) - for c in eachcol(df) dropunusedlevels!(c[2]) end - - ## Set up contrasts: - ## Combine actual DF columns and contrast types if necessary to compute the - ## actual contrasts matrices, levels, and term names (using DummyCoding - ## as the default) +## Set up contrasts: +## Combine actual DF columns and contrast types if necessary to compute the +## actual contrasts matrices, levels, and term names (using DummyCoding +## as the default) +function evalcontrasts(df::AbstractDataFrame, contrasts::Dict = Dict()) evaledContrasts = Dict() for (term, col) in eachcol(df) is_categorical(col) || continue @@ -303,6 +296,16 @@ function ModelFrame(trms::Terms, d::AbstractDataFrame; DEFAULT_CONTRASTS(), col) end + return evaledContrasts +end + +function ModelFrame(trms::Terms, d::AbstractDataFrame; + contrasts::Dict = Dict()) + df, msng = null_omit(DataFrame(map(x -> d[x], trms.eterms))) + names!(df, convert(Vector{Symbol}, map(string, trms.eterms))) + for c in eachcol(df) _droplevels!(c[2]) end + + evaledContrasts = evalcontrasts(df, contrasts) ## Check for non-redundant terms, modifying terms in place check_non_redundancy!(trms, df) @@ -310,6 +313,7 @@ function ModelFrame(trms::Terms, d::AbstractDataFrame; ModelFrame(df, trms, msng, evaledContrasts) end +ModelFrame(df::AbstractDataFrame, term::Terms, msng::BitArray) = ModelFrame(df, term, msng, evalcontrasts(df)) ModelFrame(f::Formula, d::AbstractDataFrame; kwargs...) = ModelFrame(Terms(f), d; kwargs...) ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), d; kwargs...) 
@@ -348,8 +352,11 @@ function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, name::Symbol, mf::Mode end end -modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::DataVector) = convert(T, reshape(v.data, length(v), 1)) -modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::Vector) = convert(T, reshape(v, length(v), 1)) +modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::AbstractVector) = + convert(T, reshape(v, length(v), 1)) +# FIXME: this inefficient method should not be needed, cf. JuliaLang/julia#18264 +modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::NullableVector) = + convert(T, Matrix(reshape(v, length(v), 1))) """ modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix) @@ -357,16 +364,21 @@ modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::Vector) = convert(T, reshape Construct `ModelMatrix` columns of type `T` based on specified contrasts, ensuring that levels align properly. """ -function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix) +function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, + v::Union{CategoricalVector, NullableCategoricalVector}, + contrast::ContrastsMatrix) ## make sure the levels of the contrast matrix and the categorical data ## are the same by constructing a re-indexing vector. 
Indexing into ## reindex with v.refs will give the corresponding row number of the ## contrast matrix reindex = [findfirst(contrast.levels, l) for l in levels(v)] contrastmatrix = convert(T, contrast.matrix) - return contrastmatrix[reindex[v.refs], :] + return indexrows(contrastmatrix, reindex[v.refs]) end +indexrows(m::SparseMatrixCSC, ind::Vector{Int}) = m'[:, ind]' +indexrows(m::AbstractMatrix, ind::Vector{Int}) = m[ind, :] + """ expandcols{T<:AbstractFloatMatrix}(trm::Vector{T}) Create pairwise products of columns from a vector of matrices @@ -421,7 +433,6 @@ function dropresponse!(trms::Terms) end end - """ ModelMatrix{T<:AbstractFloatMatrix}(mf::ModelFrame) Create a `ModelMatrix` of type `T` (default `Matrix{Float64}`) from the @@ -503,7 +514,8 @@ ModelMatrix(mf::ModelFrame) = ModelMatrix{Matrix{Float64}}(mf) termnames(term::Symbol, col) Returns a vector of strings with the names of the coefficients associated with a term. If the column corresponding to the term -is not a `PooledDataArray` a one-element vector is returned. +is not a `CategoricalArray` or `NullableCategoricalArray`, +a one-element vector is returned. """ termnames(term::Symbol, col) = [string(term)] function termnames(term::Symbol, mf::ModelFrame; non_redundant::Bool = false) diff --git a/src/statsmodels/statsmodel.jl b/src/statsmodels/statsmodel.jl index 3424dc2c97..3dd5768d35 100644 --- a/src/statsmodels/statsmodel.jl +++ b/src/statsmodels/statsmodel.jl @@ -62,7 +62,7 @@ typealias DataFrameModels @compat(Union{DataFrameStatisticalModel, DataFrameRegr @delegate DataFrameModels.model [StatsBase.coef, StatsBase.confint, StatsBase.deviance, StatsBase.nulldeviance, StatsBase.loglikelihood, StatsBase.nullloglikelihood, - StatsBase.df, StatsBase.df_residual, StatsBase.nobs, + StatsBase.dof, StatsBase.dof_residual, StatsBase.nobs, StatsBase.stderr, StatsBase.vcov] @delegate DataFrameRegressionModel.model [StatsBase.residuals, StatsBase.model_response, StatsBase.predict, StatsBase.predict!] 
@@ -81,7 +81,7 @@ function StatsBase.predict(mm::DataFrameRegressionModel, df::AbstractDataFrame; mf = ModelFrame(newTerms, df; contrasts = mm.mf.contrasts) newX = ModelMatrix(mf).m yp = predict(mm, newX; kwargs...) - out = DataArray(eltype(yp), size(df, 1)) + out = NullableArray(eltype(yp), size(df, 1)) out[mf.msng] = yp return(out) end diff --git a/test/REQUIRE b/test/REQUIRE index 22fc9ebb18..84bc366b4d 100644 --- a/test/REQUIRE +++ b/test/REQUIRE @@ -2,3 +2,4 @@ Compat 0.9.0 DataStructures RDatasets # can be removed when deprecated.jl doesn't test read_rda anymore RData +LaTeXStrings diff --git a/test/cat.jl b/test/cat.jl index af45afd77b..1ec8cc2b4a 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -6,13 +6,13 @@ module TestCat # hcat # - dvint = @data([1, 2, NA, 4]) - dvstr = @data(["one", "two", NA, "four"]) + nvint = NullableArray(Nullable{Int}[1, 2, Nullable(), 4]) + nvstr = NullableArray(Nullable{String}["one", "two", Nullable(), "four"]) - df2 = DataFrame(Any[dvint, dvstr]) - df3 = DataFrame(Any[dvint]) + df2 = DataFrame(Any[nvint, nvstr]) + df3 = DataFrame(Any[nvint]) df4 = convert(DataFrame, [1:4 1:4]) - df5 = DataFrame(Any[@data([1,2,3,4]), dvstr]) + df5 = DataFrame(Any[NullableArray([1,2,3,4]), nvstr]) dfh = hcat(df3, df4) @test size(dfh, 2) == 3 @@ -38,18 +38,40 @@ module TestCat # Assignment of rows df[1, :] = df[1, :] df[1:2, :] = df[1:2, :] + df[[true,false,false,true], :] = df[2:3, :] - # Broadcasting assignment of rows + # Scalar broadcasting assignment of rows df[1, :] = 1 + df[1:2, :] = 1 + df[[true,false,false,true], :] = 3 + + # Vector broadcasting assignment of rows + df[1:2, :] = [2,3] + df[[true,false,false,true], :] = [2,3] # Assignment of columns df[1] = zeros(4) + df[:, 2] = ones(4) # Broadcasting assignment of columns df[:, 1] = 1 df[1] = 3 df[:x3] = 2 + # assignment of subframes + df[1, 1:2] = df[2, 2:3] + df[1:2, 1:2] = df[2:3, 2:3] + df[[true,false,false,true], 2:3] = df[1:2,1:2] + + # scalar broadcasting assignment of subframes + 
df[1, 1:2] = 3 + df[1:2, 1:2] = 3 + df[[true,false,false,true], 2:3] = 3 + + # vector broadcasting assignment of subframes + df[1:2, 1:2] = [3,2] + df[[true,false,false,true], 2:3] = [2,3] + vcat([]) vcat(null_df) vcat(null_df, null_df) @@ -78,37 +100,56 @@ module TestCat dfr = vcat(df2, df3) @test size(dfr) == (8,2) @test names(df2) == names(dfr) - @test isna(dfr[8,:x2]) + @test isnull(dfr[8,:x2]) # Eltype promotion - @test eltypes(vcat(DataFrame(a = [1]), DataFrame(a = [2.1]))) == [Float64] - @test eltypes(vcat(DataFrame(a = [NA]), DataFrame(a = [2.1]))) == [Float64] + # Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T} + if VERSION >= v"0.5.0-dev" + @test eltypes(vcat(DataFrame(a = [1]), DataFrame(a = [2.1]))) == [Nullable{Float64}] + @test eltypes(vcat(DataFrame(a = NullableArray(Int, 1)), DataFrame(a = [2.1]))) == [Nullable{Float64}] + else + @test eltypes(vcat(DataFrame(a = [1]), DataFrame(a = [2.1]))) == [Nullable{Any}] + @test eltypes(vcat(DataFrame(a = NullableArray(Int, 1)), DataFrame(a = [2.1]))) == [Nullable{Any}] + end # Minimal container type promotion - dfa = DataFrame(a = @pdata([1, 2, 2])) - dfb = DataFrame(a = @pdata([2, 3, 4])) - dfc = DataFrame(a = @data([2, 3, 4])) + dfa = DataFrame(a = CategoricalArray([1, 2, 2])) + dfb = DataFrame(a = CategoricalArray([2, 3, 4])) + dfc = DataFrame(a = NullableArray([2, 3, 4])) dfd = DataFrame(Any[2:4], [:a]) - @test vcat(dfa, dfb)[:a] == @pdata([1, 2, 2, 2, 3, 4]) - @test vcat(dfa, dfc)[:a] == @pdata([1, 2, 2, 2, 3, 4]) + dfe = DataFrame(b = CategoricalArray([2, 3, 4])) + dfab = vcat(dfa, dfb) + dfac = vcat(dfa, dfc) + dfabcd = vcat(dfa, dfc, dfe) + @test isequal(dfab[:a], Nullable{Int}[1, 2, 2, 2, 3, 4]) + @test isequal(dfac[:a], Nullable{Int}[1, 2, 2, 2, 3, 4]) + @test isa(dfab[:a], NullableCategoricalVector{Int}) + @test isa(dfabcd[:a], NullableCategoricalVector{Int}) + @test isa(dfabcd[:b], NullableCategoricalVector{Int}) + # Fails on Julia 0.4 since 
promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T} + if VERSION >= v"0.5.0-dev" + @test isa(dfac[:a], NullableCategoricalVector{Int}) + else + @test isa(dfac[:a], NullableCategoricalVector{Any}) + end # ^^ container may flip if container promotion happens in Base/DataArrays dc = vcat(dfd, dfc) - @test vcat(dfc, dfd) == dc + @test isequal(vcat(dfc, dfd), dc) # Zero-row DataFrames dfc0 = similar(dfc, 0) - @test vcat(dfd, dfc0, dfc) == dc + @test isequal(vcat(dfd, dfc0, dfc), dc) @test eltypes(vcat(dfd, dfc0)) == eltypes(dc) # Missing columns rename!(dfd, :a, :b) - dfda = DataFrame(b = @data([2, 3, 4, NA, NA, NA]), - a = @pdata([NA, NA, NA, 1, 2, 2])) + dfda = DataFrame(b = NullableArray(Nullable{Int}[2, 3, 4, Nullable(), Nullable(), Nullable()]), + a = NullableCategoricalVector(Nullable{Int}[Nullable(), Nullable(), Nullable(), 1, 2, 2])) @test isequal(vcat(dfd, dfa), dfda) # Alignment @test isequal(vcat(dfda, dfd, dfa), vcat(dfda, dfda)) # vcat should be able to concatenate different implementations of AbstractDataFrame (PR #944) - @test vcat(sub(DataFrame(A=1:3),2),DataFrame(A=4:5)) == DataFrame(A=[2,4,5]) + @test isequal(vcat(sub(DataFrame(A=1:3),2),DataFrame(A=4:5)), DataFrame(A=[2,4,5])) end diff --git a/test/constructors.jl b/test/constructors.jl index cba2e4eeed..600e067e25 100644 --- a/test/constructors.jl +++ b/test/constructors.jl @@ -10,27 +10,23 @@ module TestConstructors @test isequal(df.columns, Any[]) @test isequal(df.colindex, Index()) - df = DataFrame(Any[data(zeros(3)), data(ones(3))], + df = DataFrame(Any[NullableCategoricalVector(zeros(3)), + NullableCategoricalVector(ones(3))], Index([:x1, :x2])) @test size(df, 1) == 3 @test size(df, 2) == 2 - @test isequal(df, - DataFrame(Any[data(zeros(3)), data(ones(3))])) - @test isequal(df, - DataFrame(x1 = [0.0, 0.0, 0.0], - x2 = [1.0, 1.0, 1.0])) + @test isequal(df, DataFrame(Any[NullableCategoricalVector(zeros(3)), + NullableCategoricalVector(ones(3))])) + @test isequal(df, DataFrame(x1 = 
[0.0, 0.0, 0.0], + x2 = [1.0, 1.0, 1.0])) df2 = convert(DataFrame, [0.0 1.0; 0.0 1.0; 0.0 1.0]) names!(df2, [:x1, :x2]) - @test isequal(df, df2) - - @test isequal(df, - convert(DataFrame, [0.0 1.0; - 0.0 1.0; - 0.0 1.0])) + @test isequal(df[:x1], NullableArray(df2[:x1])) + @test isequal(df[:x2], NullableArray(df2[:x2])) @test isequal(df, DataFrame(x1 = [0.0, 0.0, 0.0], x2 = [1.0, 1.0, 1.0])) @@ -40,15 +36,12 @@ module TestConstructors df = DataFrame(Int, 2, 2) @test size(df) == (2, 2) - @test all(eltypes(df) .== [Int, Int]) + @test eltypes(df) == [Nullable{Int}, Nullable{Int}] df = DataFrame([Int, Float64], [:x1, :x2], 2) @test size(df) == (2, 2) - @test all(eltypes(df) .== Any[Int, Float64]) + @test eltypes(df) == [Nullable{Int}, Nullable{Float64}] @test isequal(df, DataFrame([Int, Float64], 2)) - - - end diff --git a/test/contrasts.jl b/test/contrasts.jl index 1ff2fed934..0a6b76671b 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -4,7 +4,7 @@ using Base.Test using DataFrames -d = DataFrame(x = @pdata( [:a, :b, :c, :a, :a, :b] )) +d = DataFrame(x = CategoricalVector([:a, :b, :c, :a, :a, :b])) mf = ModelFrame(Formula(nothing, :x), d) @@ -75,7 +75,7 @@ setcontrasts!(mf, x = HelmertCoding()) @test_throws ArgumentError setcontrasts!(mf, x = EffectsCoding(levels = ["a", "b", "c"])) # Missing data is handled gracefully, dropping columns when a level is lost -d[3, :x] = NA +d[3, :x] = Nullable() mf_missing = ModelFrame(Formula(nothing, :x), d, contrasts = Dict(:x => EffectsCoding())) @test ModelMatrix(mf_missing).m == [1 -1 1 1 diff --git a/test/conversions.jl b/test/conversions.jl index adf1067c69..1a607b2cac 100644 --- a/test/conversions.jl +++ b/test/conversions.jl @@ -7,14 +7,17 @@ module TestConversions df[:A] = 1:5 df[:B] = [:A, :B, :C, :D, :E] @test isa(convert(Array, df), Matrix{Any}) - @test convert(Array, df) == convert(Array, convert(DataArray, df)) + @test convert(Array, df) == convert(Array, convert(NullableArray, df)) @test isa(convert(Array{Any}, 
df), Matrix{Any}) df = DataFrame() df[:A] = 1:5 df[:B] = 1.0:5.0 - @test isa(convert(Array, df), Matrix{Real}) - @test convert(Array, df) == convert(Array, convert(DataArray, df)) + # Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T} + if VERSION >= v"0.5.0-dev" + @test isa(convert(Array, df), Matrix{Float64}) + end + @test convert(Array, df) == convert(Array, convert(NullableArray, df)) @test isa(convert(Array{Any}, df), Matrix{Any}) @test isa(convert(Array{Float64}, df), Matrix{Float64}) @@ -25,24 +28,24 @@ module TestConversions aa = convert(Array{Any}, df) ai = convert(Array{Int}, df) @test isa(a, Matrix{Float64}) - @test a == convert(Array, convert(DataArray, df)) + @test a == convert(Array, convert(NullableArray, df)) @test a == convert(Matrix, df) @test isa(aa, Matrix{Any}) @test aa == convert(Matrix{Any}, df) @test isa(ai, Matrix{Int}) @test ai == convert(Matrix{Int}, df) - df[1,1] = NA + df[1,1] = Nullable() @test_throws ErrorException convert(Array, df) - da = convert(DataArray, df) - daa = convert(DataArray{Any}, df) - dai = convert(DataArray{Int}, df) - @test isa(da, DataMatrix{Float64}) - @test isequal(da, convert(DataMatrix, df)) - @test isa(daa, DataMatrix{Any}) - @test isequal(daa, convert(DataMatrix{Any}, df)) - @test isa(dai, DataMatrix{Int}) - @test isequal(dai, convert(DataMatrix{Int}, df)) + na = convert(NullableArray, df) + naa = convert(NullableArray{Any}, df) + nai = convert(NullableArray{Int}, df) + @test isa(na, NullableMatrix{Float64}) + @test isequal(na, convert(NullableMatrix, df)) + @test isa(naa, NullableMatrix{Any}) + @test isequal(naa, convert(NullableMatrix{Any}, df)) + @test isa(nai, NullableMatrix{Int}) + @test isequal(nai, convert(NullableMatrix{Int}, df)) a = [1.0,2.0] b = [-0.1,3] @@ -52,25 +55,25 @@ module TestConversions df = convert(DataFrame,di) @test isa(df,DataFrame) @test names(df) == Symbol[x for x in sort(collect(keys(di)))] - @test df[:a] == a - @test df[:b] == b - @test df[:c] 
== c + @test isequal(df[:a], NullableArray(a)) + @test isequal(df[:b], NullableArray(b)) + @test isequal(df[:c], NullableArray(c)) od = OrderedDict("c"=>c, "a"=>a, "b"=>b) df = convert(DataFrame,od) @test isa(df, DataFrame) @test names(df) == Symbol[x for x in keys(od)] - @test df[:a] == a - @test df[:b] == b - @test df[:c] == c + @test isequal(df[:a], NullableArray(a)) + @test isequal(df[:b], NullableArray(b)) + @test isequal(df[:c], NullableArray(c)) sd = SortedDict("c"=>c, "a"=>a, "b"=>b) df = convert(DataFrame,sd) @test isa(df, DataFrame) @test names(df) == Symbol[x for x in keys(sd)] - @test df[:a] == a - @test df[:b] == b - @test df[:c] == c + @test isequal(df[:a], NullableArray(a)) + @test isequal(df[:b], NullableArray(b)) + @test isequal(df[:c], NullableArray(c)) a = [1.0] di = Dict("a"=>a, "b"=>b, "c"=>c) diff --git a/test/data.jl b/test/data.jl index d0fca036fe..1fc3f217cb 100644 --- a/test/data.jl +++ b/test/data.jl @@ -4,24 +4,24 @@ module TestData using DataFrames using Compat - #test_group("DataVector creation") - dvint = @data([1, 2, NA, 4]) - dvint2 = data([5:8;]) - dvint3 = data(5:8) - dvflt = @data([1.0, 2, NA, 4]) - dvstr = @data(["one", "two", NA, "four"]) - dvdict = DataArray(Dict, 4) # for issue #199 + #test_group("NullableArray creation") + nvint = NullableArray(Nullable{Int}[1, 2, Nullable(), 4]) + nvint2 = NullableArray(5:8) + nvint3 = NullableArray(5:8) + nvflt = NullableArray(Nullable{Float64}[1.0, 2.0, Nullable(), 4.0]) + nvstr = NullableArray(Nullable{Compat.ASCIIString}["one", "two", Nullable(), "four"]) + dvdict = NullableArray(Dict, 4) # for issue #199 #test_group("constructors") - df1 = DataFrame(Any[dvint, dvstr], [:Ints, :Strs]) - df2 = DataFrame(Any[dvint, dvstr]) - df3 = DataFrame(Any[dvint]) + df1 = DataFrame(Any[nvint, nvstr], [:Ints, :Strs]) + df2 = DataFrame(Any[nvint, nvstr]) + df3 = DataFrame(Any[nvint]) df4 = convert(DataFrame, [1:4 1:4]) - df5 = DataFrame(Any[@data([1,2,3,4]), dvstr]) - df6 = DataFrame(Any[dvint, dvint, 
dvstr], [:A, :B, :C]) - df7 = DataFrame(x = dvint, y = dvstr) + df5 = DataFrame(Any[NullableArray([1,2,3,4]), nvstr]) + df6 = DataFrame(Any[nvint, nvint, nvstr], [:A, :B, :C]) + df7 = DataFrame(x = nvint, y = nvstr) @test size(df7) == (4, 2) - @test isequal(df7[:x], dvint) + @test isequal(df7[:x], nvint) #test_group("description functions") @test size(df6, 1) == 4 @@ -31,10 +31,10 @@ module TestData @test names(df7) == [:x, :y] #test_group("ref") - @test df6[2, 3] == "two" - @test isna(df6[3, 3]) - @test df6[2, :C] == "two" - @test isequal(df6[:B], dvint) + @test isequal(df6[2, 3], Nullable("two")) + @test isnull(df6[3, 3]) + @test isequal(df6[2, :C], Nullable("two")) + @test isequal(df6[:B], nvint) @test size(df6[[2,3]], 2) == 2 @test size(df6[2,:], 1) == 1 @test size(df6[[1, 3], [1, 3]]) == (2, 2) @@ -43,17 +43,17 @@ module TestData # lots more to do #test_group("assign") - df6[3] = @data(["un", "deux", "troix", "quatre"]) - @test df6[1, 3] == "un" + df6[3] = NullableArray(["un", "deux", "troix", "quatre"]) + @test isequal(df6[1, 3], Nullable("un")) df6[:B] = [4, 3, 2, 1] - @test df6[1,2] == 4 + @test isequal(df6[1,2], Nullable(4)) df6[:D] = [true, false, true, false] - @test df6[1,4] == true + @test isequal(df6[1,4], Nullable(true)) delete!(df6, :D) @test names(df6) == [:A, :B, :C] @test size(df6, 2) == 3 - #test_group("NA handling") + #test_group("null handling") @test nrow(df5[complete_cases(df5), :]) == 3 #test_context("SubDataFrames") @@ -68,7 +68,7 @@ module TestData @test size(sdf6d) == (2,1) #test_group("ref") - @test sdf6a[1,2] == 4 + @test isequal(sdf6a[1,2], Nullable(4)) #test_context("Within") #test_group("Associative") @@ -77,35 +77,37 @@ module TestData srand(1) N = 20 #Cast to Int64 as rand() behavior differs between Int32/64 - d1 = pdata(rand(@compat(map(Int64, 1:2)), N)) - d2 = (@pdata ["A", "B", NA])[rand(@compat(map(Int64, 1:3)), N)] - d3 = data(randn(N)) - d4 = data(randn(N)) + d1 = NullableArray(rand(map(Int64, 1:2), N)) + d2 = 
NullableCategoricalArray(Nullable{String}["A", "B", Nullable()])[rand(map(Int64, 1:3), N)] + d3 = NullableArray(randn(N)) + d4 = NullableArray(randn(N)) df7 = DataFrame(Any[d1, d2, d3], [:d1, :d2, :d3]) #test_group("groupby") gd = groupby(df7, :d1) @test length(gd) == 2 - # @test isequal(gd[2]["d2"], PooledDataVector["A", "B", NA, "A", NA, NA, NA, NA]) - @test sum(gd[2][:d3]) == sum(df7[:d3][dropna(df7[:d1] .== 2)]) + # @test isequal(gd[2]["d2"], CategoricalVector["A", "B", Nullable(), "A", Nullable(), Nullable(), Nullable(), Nullable()]) + @test isequal(sum(gd[2][:d3]), sum(df7[:d3][Vector(df7[:d1]) .== 2])) g1 = groupby(df7, [:d1, :d2]) g2 = groupby(df7, [:d2, :d1]) - @test sum(g1[1][:d3]) == sum(g2[1][:d3]) + @test isequal(sum(g1[1][:d3]), sum(g2[1][:d3])) - res = 0.0 + res = Nullable(0.0) for x in g1 res += sum(x[:d1]) end - @test res == sum(df7[:d1]) + @test isequal(res, sum(df7[:d1])) + + @test aggregate(DataFrame(a=1), identity) == DataFrame(a_identity=1) df8 = aggregate(df7[[1, 3]], sum) - @test df8[1, :d1_sum] == sum(df7[:d1]) + @test isequal(df8[1, :d1_sum], sum(df7[:d1])) df8 = aggregate(df7, :d2, [sum, length]) @test size(df8, 1) == 3 @test size(df8, 2) == 5 - @test df8[2, :d1_length] == 4 + @test isequal(df8[2, :d1_length], Nullable(4)) @test isequal(df8, aggregate(groupby(df7, :d2), [sum, length])) df9 = df7 |> groupby([:d2]) |> [sum, length] @@ -189,11 +191,17 @@ module TestData v2 = randn(5)) m1 = join(df1, df2, on = :a) - @test isequal(m1[:a], @data([1, 2, 3, 4, 5])) + @test isequal(m1[:a], NullableArray([1, 2, 3, 4, 5])) # TODO: Re-enable - # m2 = join(df1, df2, on = :a, kind = :outer) - # @test isequal(m2[:b2], DataVector["A", "B", "B", "B", "B", NA, NA, NA, NA, NA]) - # @test isequal(m2[:b2], DataVector["B", "B", "B", "C", "B", NA, NA, NA, NA, NA]) + m2 = join(df1, df2, on = :a, kind = :outer) + # @test isequal(m2[:b2], + # NullableArray(Nullable{String}["A", "B", "B", "B", "B", + # Nullable(), Nullable(), + # Nullable(), Nullable(), 
Nullable()])) + # @test isequal(m2[:b2], + # NullableArray(Nullable{String}["B", "B", "B", "C", "B", + # Nullable(), Nullable(), + # Nullable(), Nullable(), Nullable()])) df1 = DataFrame(a = [1, 2, 3], b = ["America", "Europe", "Africa"]) @@ -201,33 +209,33 @@ module TestData c = ["New World", "Old World", "New World"]) m1 = join(df1, df2, on = :a, kind = :inner) - @test isequal(m1[:a], @data([1, 2])) + @test isequal(m1[:a], NullableArray([1, 2])) m2 = join(df1, df2, on = :a, kind = :left) - @test isequal(m2[:a], @data([1, 2, 3])) + @test isequal(m2[:a], NullableArray([1, 2, 3])) m3 = join(df1, df2, on = :a, kind = :right) - @test isequal(m3[:a], @data([1, 2, 4])) + @test isequal(m3[:a], NullableArray([1, 2, 4])) m4 = join(df1, df2, on = :a, kind = :outer) - @test isequal(m4[:a], @data([1, 2, 3, 4])) + @test isequal(m4[:a], NullableArray([1, 2, 3, 4])) - # test with NAs (issue #185) + # test with nulls (issue #185) df1 = DataFrame() - df1[:A] = @data(["a", "b", "a", NA]) - df1[:B] = @data([1, 2, 1, 3]) + df1[:A] = NullableArray(Nullable{Compat.ASCIIString}["a", "b", "a", Nullable()]) + df1[:B] = NullableArray([1, 2, 1, 3]) df2 = DataFrame() - df2[:A] = @data(["a", NA, "c"]) - df2[:C] = @data([1, 2, 4]) + df2[:A] = NullableArray(Nullable{Compat.ASCIIString}["a", Nullable(), "c"]) + df2[:C] = NullableArray([1, 2, 4]) m1 = join(df1, df2, on = :A) @test size(m1) == (3,3) - @test isequal(m1[:A], @data([NA,"a","a"])) + @test isequal(m1[:A], NullableArray(Nullable{Compat.ASCIIString}[Nullable(),"a","a"])) m2 = join(df1, df2, on = :A, kind = :outer) @test size(m2) == (5,3) - @test isequal(m2[:A], @data([NA,"a","a","b","c"])) + @test isequal(m2[:A], NullableArray(Nullable{Compat.ASCIIString}[Nullable(),"a","a","b","c"])) srand(1) df1 = DataFrame( @@ -241,20 +249,23 @@ module TestData b = [:A,:B,:C][[1,1,1,2,3]], v2 = randn(5) ) - df2[1,:a] = NA + df2[1,:a] = Nullable() # # TODO: Restore this functionality # m1 = join(df1, df2, on = [:a,:b]) - # @test isequal(m1[:a], 
DataArray(["x", "x", "y", "y", fill("x", 5)])) + # @test isequal(m1[:a], NullableArray(["x", "x", "y", "y", fill("x", 5)])) # m2 = join(df1, df2, on = ["a","b"], kind = :outer) - # @test isequal(m2[10,:v2], NA) - # @test isequal(m2[:a], DataVector["x", "x", "y", "y", "x", "x", "x", "x", "x", "y", NA, "y"]) + # @test isequal(m2[10,:v2], Nullable()) + # @test isequal(m2[:a], + # NullableArray(Nullable{String}["x", "x", "y", "y", + # "x", "x", "x", "x", "x", "y", + # Nullable(), "y"]) srand(1) function spltdf(d) - d[:x1] = map(x -> x[1], d[:a]) - d[:x2] = map(x -> x[2], d[:a]) - d[:x3] = map(x -> x[3], d[:a]) + d[:x1] = map(x -> get(x)[1], d[:a]) + d[:x2] = map(x -> get(x)[2], d[:a]) + d[:x3] = map(x -> get(x)[3], d[:a]) d end df1 = DataFrame( @@ -272,39 +283,6 @@ module TestData # m2 = join(df1, df2, on = [:x1, :x2, :x3]) # @test isequal(sort(m1[:a]), sort(m2[:a])) - #test_group("New DataVector constructors") - dv = DataArray(Int, 5) - @test all(isna(dv)) - dv = DataArray(Float64, 5) - @test all(isna(dv)) - dv = @data(zeros(5)) - @test all(dv .== 0.0) - dv = @data(ones(5)) - @test all(dv .== 1.0) - - # No more NA corruption - dv = @data(ones(10_000)) - @test !any(isna(dv)) - - PooledDataArray(falses(2), falses(2)) - PooledDataArray(falses(2), trues(2)) - - # Test vectorized comparisons work for DataVector's and PooledDataVector's - @data([1, 2, NA]) .== 1 - @pdata([1, 2, NA]) .== 1 - @data(["1", "2", NA]) .== "1" - @pdata(["1", "2", NA]) .== "1" - - # Test unique() - #test_group("unique()") - # TODO: Restore this - # dv = DataArray(1:4) - # dv[4] = NA - # @test (1 in unique(dv)) - # @test (2 in unique(dv)) - # @test (3 in unique(dv)) - # @test (NA in unique(dv)) - # test nonunique() with extra argument df1 = DataFrame(a = ["a", "b", "a", "b", "a", "b"], b = 1:6, c = [1:3;1:3]) df = vcat(df1, df1) @@ -317,29 +295,16 @@ module TestData @test find(nonunique(df, 1)) == collect(3:12) # Test unique() with extra argument - @test unique(df) == df1 - @test unique(df, :) == 
df1 - @test unique(df, Colon()) == df1 - @test unique(df, 2:3) == df1 - @test unique(df, 3) == df1[1:3,:] - @test unique(df, [1, 3]) == df1 - @test unique(df, [:a, :c]) == df1 - @test unique(df, :a) == df1[1:2,:] + @test isequal(unique(df), df1) + @test isequal(unique(df, :), df1) + @test isequal(unique(df, Colon()), df1) + @test isequal(unique(df, 2:3), df1) + @test isequal(unique(df, 3), df1[1:3,:]) + @test isequal(unique(df, [1, 3]), df1) + @test isequal(unique(df, [:a, :c]), df1) + @test isequal(unique(df, :a), df1[1:2,:]) #test unique!() with extra argument unique!(df, [1, 3]) - @test df == df1 - - #test_group("find()") - dv = DataArray([true, false, true]) - @test isequal(find(dv), [1, 3]) - - pdv = PooledDataArray([true, false, true]) - @test isequal(find(pdv), [1, 3]) - - dv[1] = NA - @test isequal(find(dv), [3]) - - pdv[1] = NA - @test isequal(find(pdv), [3]) + @test isequal(df, df1) end diff --git a/test/data/iris.csv b/test/data/iris.csv new file mode 100644 index 0000000000..603349e022 --- /dev/null +++ b/test/data/iris.csv @@ -0,0 +1,151 @@ +"SepalLength","SepalWidth","PetalLength","PetalWidth","Species" +"5.1","3.5","1.4","0.2","setosa" +"4.9","3.0","1.4","0.2","setosa" +"4.7","3.2","1.3","0.2","setosa" +"4.6","3.1","1.5","0.2","setosa" +"5.0","3.6","1.4","0.2","setosa" +"5.4","3.9","1.7","0.4","setosa" +"4.6","3.4","1.4","0.3","setosa" +"5.0","3.4","1.5","0.2","setosa" +"4.4","2.9","1.4","0.2","setosa" +"4.9","3.1","1.5","0.1","setosa" +"5.4","3.7","1.5","0.2","setosa" +"4.8","3.4","1.6","0.2","setosa" +"4.8","3.0","1.4","0.1","setosa" +"4.3","3.0","1.1","0.1","setosa" +"5.8","4.0","1.2","0.2","setosa" +"5.7","4.4","1.5","0.4","setosa" +"5.4","3.9","1.3","0.4","setosa" +"5.1","3.5","1.4","0.3","setosa" +"5.7","3.8","1.7","0.3","setosa" +"5.1","3.8","1.5","0.3","setosa" +"5.4","3.4","1.7","0.2","setosa" +"5.1","3.7","1.5","0.4","setosa" +"4.6","3.6","1.0","0.2","setosa" +"5.1","3.3","1.7","0.5","setosa" +"4.8","3.4","1.9","0.2","setosa" 
+"5.0","3.0","1.6","0.2","setosa" +"5.0","3.4","1.6","0.4","setosa" +"5.2","3.5","1.5","0.2","setosa" +"5.2","3.4","1.4","0.2","setosa" +"4.7","3.2","1.6","0.2","setosa" +"4.8","3.1","1.6","0.2","setosa" +"5.4","3.4","1.5","0.4","setosa" +"5.2","4.1","1.5","0.1","setosa" +"5.5","4.2","1.4","0.2","setosa" +"4.9","3.1","1.5","0.2","setosa" +"5.0","3.2","1.2","0.2","setosa" +"5.5","3.5","1.3","0.2","setosa" +"4.9","3.6","1.4","0.1","setosa" +"4.4","3.0","1.3","0.2","setosa" +"5.1","3.4","1.5","0.2","setosa" +"5.0","3.5","1.3","0.3","setosa" +"4.5","2.3","1.3","0.3","setosa" +"4.4","3.2","1.3","0.2","setosa" +"5.0","3.5","1.6","0.6","setosa" +"5.1","3.8","1.9","0.4","setosa" +"4.8","3.0","1.4","0.3","setosa" +"5.1","3.8","1.6","0.2","setosa" +"4.6","3.2","1.4","0.2","setosa" +"5.3","3.7","1.5","0.2","setosa" +"5.0","3.3","1.4","0.2","setosa" +"7.0","3.2","4.7","1.4","versicolor" +"6.4","3.2","4.5","1.5","versicolor" +"6.9","3.1","4.9","1.5","versicolor" +"5.5","2.3","4.0","1.3","versicolor" +"6.5","2.8","4.6","1.5","versicolor" +"5.7","2.8","4.5","1.3","versicolor" +"6.3","3.3","4.7","1.6","versicolor" +"4.9","2.4","3.3","1.0","versicolor" +"6.6","2.9","4.6","1.3","versicolor" +"5.2","2.7","3.9","1.4","versicolor" +"5.0","2.0","3.5","1.0","versicolor" +"5.9","3.0","4.2","1.5","versicolor" +"6.0","2.2","4.0","1.0","versicolor" +"6.1","2.9","4.7","1.4","versicolor" +"5.6","2.9","3.6","1.3","versicolor" +"6.7","3.1","4.4","1.4","versicolor" +"5.6","3.0","4.5","1.5","versicolor" +"5.8","2.7","4.1","1.0","versicolor" +"6.2","2.2","4.5","1.5","versicolor" +"5.6","2.5","3.9","1.1","versicolor" +"5.9","3.2","4.8","1.8","versicolor" +"6.1","2.8","4.0","1.3","versicolor" +"6.3","2.5","4.9","1.5","versicolor" +"6.1","2.8","4.7","1.2","versicolor" +"6.4","2.9","4.3","1.3","versicolor" +"6.6","3.0","4.4","1.4","versicolor" +"6.8","2.8","4.8","1.4","versicolor" +"6.7","3.0","5.0","1.7","versicolor" +"6.0","2.9","4.5","1.5","versicolor" +"5.7","2.6","3.5","1.0","versicolor" 
+"5.5","2.4","3.8","1.1","versicolor" +"5.5","2.4","3.7","1.0","versicolor" +"5.8","2.7","3.9","1.2","versicolor" +"6.0","2.7","5.1","1.6","versicolor" +"5.4","3.0","4.5","1.5","versicolor" +"6.0","3.4","4.5","1.6","versicolor" +"6.7","3.1","4.7","1.5","versicolor" +"6.3","2.3","4.4","1.3","versicolor" +"5.6","3.0","4.1","1.3","versicolor" +"5.5","2.5","4.0","1.3","versicolor" +"5.5","2.6","4.4","1.2","versicolor" +"6.1","3.0","4.6","1.4","versicolor" +"5.8","2.6","4.0","1.2","versicolor" +"5.0","2.3","3.3","1.0","versicolor" +"5.6","2.7","4.2","1.3","versicolor" +"5.7","3.0","4.2","1.2","versicolor" +"5.7","2.9","4.2","1.3","versicolor" +"6.2","2.9","4.3","1.3","versicolor" +"5.1","2.5","3.0","1.1","versicolor" +"5.7","2.8","4.1","1.3","versicolor" +"6.3","3.3","6.0","2.5","virginica" +"5.8","2.7","5.1","1.9","virginica" +"7.1","3.0","5.9","2.1","virginica" +"6.3","2.9","5.6","1.8","virginica" +"6.5","3.0","5.8","2.2","virginica" +"7.6","3.0","6.6","2.1","virginica" +"4.9","2.5","4.5","1.7","virginica" +"7.3","2.9","6.3","1.8","virginica" +"6.7","2.5","5.8","1.8","virginica" +"7.2","3.6","6.1","2.5","virginica" +"6.5","3.2","5.1","2.0","virginica" +"6.4","2.7","5.3","1.9","virginica" +"6.8","3.0","5.5","2.1","virginica" +"5.7","2.5","5.0","2.0","virginica" +"5.8","2.8","5.1","2.4","virginica" +"6.4","3.2","5.3","2.3","virginica" +"6.5","3.0","5.5","1.8","virginica" +"7.7","3.8","6.7","2.2","virginica" +"7.7","2.6","6.9","2.3","virginica" +"6.0","2.2","5.0","1.5","virginica" +"6.9","3.2","5.7","2.3","virginica" +"5.6","2.8","4.9","2.0","virginica" +"7.7","2.8","6.7","2.0","virginica" +"6.3","2.7","4.9","1.8","virginica" +"6.7","3.3","5.7","2.1","virginica" +"7.2","3.2","6.0","1.8","virginica" +"6.2","2.8","4.8","1.8","virginica" +"6.1","3.0","4.9","1.8","virginica" +"6.4","2.8","5.6","2.1","virginica" +"7.2","3.0","5.8","1.6","virginica" +"7.4","2.8","6.1","1.9","virginica" +"7.9","3.8","6.4","2.0","virginica" +"6.4","2.8","5.6","2.2","virginica" 
+"6.3","2.8","5.1","1.5","virginica" +"6.1","2.6","5.6","1.4","virginica" +"7.7","3.0","6.1","2.3","virginica" +"6.3","3.4","5.6","2.4","virginica" +"6.4","3.1","5.5","1.8","virginica" +"6.0","3.0","4.8","1.8","virginica" +"6.9","3.1","5.4","2.1","virginica" +"6.7","3.1","5.6","2.4","virginica" +"6.9","3.1","5.1","2.3","virginica" +"5.8","2.7","5.1","1.9","virginica" +"6.8","3.2","5.9","2.3","virginica" +"6.7","3.3","5.7","2.5","virginica" +"6.7","3.0","5.2","2.3","virginica" +"6.3","2.5","5.0","1.9","virginica" +"6.5","3.0","5.2","2.0","virginica" +"6.2","3.4","5.4","2.3","virginica" +"5.9","3.0","5.1","1.8","virginica" diff --git a/test/dataframe.jl b/test/dataframe.jl index 6b36e801fb..2814d45765 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -7,23 +7,28 @@ module TestDataFrame # Equality # - @test isequal(DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])), DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6]))) - @test !isequal(DataFrame(a=@data([1, 2]), b=@data([4, 5])), DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6]))) - @test !isequal(DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])), DataFrame(a=@data([1, 2, 3]))) - @test !isequal(DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])), DataFrame(a=@data([1, 2, 3]), c=@data([4, 5, 6]))) - @test !isequal(DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])), DataFrame(b=@data([4, 5, 6]), a=@data([1, 2, 3]))) - @test !isequal(DataFrame(a=@data([1, 2, 2]), b=@data([4, 5, 6])), DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6]))) - @test isequal(DataFrame(a=@data([1, 2, NA]), b=@data([4, 5, 6])), DataFrame(a=@data([1, 2, NA]), b=@data([4, 5, 6]))) - - @test DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])) == DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])) - @test DataFrame(a=@data([1, 2]), b=@data([4, 5])) != DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])) - @test DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])) != DataFrame(a=@data([1, 2, 3])) - @test DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])) != 
DataFrame(a=@data([1, 2, 3]), c=@data([4, 5, 6])) - @test DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])) != DataFrame(b=@data([4, 5, 6]), a=@data([1, 2, 3])) - @test DataFrame(a=@data([1, 2, 2]), b=@data([4, 5, 6])) != DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6])) - @test DataFrame(a=@data([1, 3, NA]), b=@data([4, 5, 6])) != DataFrame(a=@data([1, 2, NA]), b=@data([4, 5, 6])) - @test isna(DataFrame(a=@data([1, 2, NA]), b=@data([4, 5, 6])) == DataFrame(a=@data([1, 2, NA]), b=@data([4, 5, 6]))) - @test isna(DataFrame(a=@data([1, 2, NA]), b=@data([4, 5, 6])) == DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6]))) + @test isequal(DataFrame(a=[1, 2, 3], b=[4, 5, 6]), DataFrame(a=[1, 2, 3], b=[4, 5, 6])) + @test !isequal(DataFrame(a=[1, 2], b=[4, 5]), DataFrame(a=[1, 2, 3], b=[4, 5, 6])) + @test !isequal(DataFrame(a=[1, 2, 3], b=[4, 5, 6]), DataFrame(a=[1, 2, 3])) + @test !isequal(DataFrame(a=[1, 2, 3], b=[4, 5, 6]), DataFrame(a=[1, 2, 3], c=[4, 5, 6])) + @test !isequal(DataFrame(a=[1, 2, 3], b=[4, 5, 6]), DataFrame(b=[4, 5, 6], a=[1, 2, 3])) + @test !isequal(DataFrame(a=[1, 2, 2], b=[4, 5, 6]), DataFrame(a=[1, 2, 3], b=[4, 5, 6])) + @test isequal(DataFrame(a=Nullable{Int}[1, 2, Nullable()], b=[4, 5, 6]), + DataFrame(a=Nullable{Int}[1, 2, Nullable()], b=[4, 5, 6])) + + # FIXME: equality operators won't work until JuliaStats/NullableArrays#84 is merged + #@test get(DataFrame(a=[1, 2, 3], b=[4, 5, 6]) == DataFrame(a=[1, 2, 3], b=[4, 5, 6])) + #@test get(DataFrame(a=[1, 2], b=[4, 5]) != DataFrame(a=[1, 2, 3], b=[4, 5, 6])) + #@test get(DataFrame(a=[1, 2, 3], b=[4, 5, 6]) != DataFrame(a=[1, 2, 3])) + #@test get(DataFrame(a=[1, 2, 3], b=[4, 5, 6]) != DataFrame(a=[1, 2, 3], c=[4, 5, 6])) + #@test get(DataFrame(a=[1, 2, 3], b=[4, 5, 6]) != DataFrame(b=[4, 5, 6], a=[1, 2, 3])) + #@test get(DataFrame(a=[1, 2, 2], b=[4, 5, 6]) != DataFrame(a=[1, 2, 3], b=[4, 5, 6])) + #@test get(DataFrame(a=Nullable{Int}[1, 3, Nullable()], b=[4, 5, 6]) != + # DataFrame(a=Nullable{Int}[1, 2, 
Nullable()], b=[4, 5, 6])) + #@test isnull(DataFrame(a=Nullable{Int}[1, 2, Nullable()], b=[4, 5, 6]) == + # DataFrame(a=Nullable{Int}[1, 2, Nullable()], b=[4, 5, 6])) + #@test isnull(DataFrame(a=Nullable{Int}[1, 2, Nullable()], b=[4, 5, 6]) == + # DataFrame(a=Nullable{Int}[1, 2, 3], b=[4, 5, 6])) # # Copying @@ -34,17 +39,17 @@ module TestDataFrame dfdc = deepcopy(df) df[1, :a] = 4 - df[1, :b][:e] = 5 + get(df[1, :b])[:e] = 5 names!(df, [:f, :g]) @test names(dfc) == [:a, :b] @test names(dfdc) == [:a, :b] - @test dfc[1, :a] == 4 - @test dfdc[1, :a] == 2 + @test get(dfc[1, :a]) === 4 + @test get(dfdc[1, :a]) === 2 - @test names(dfc[1, :b]) == [:c, :e] - @test names(dfdc[1, :b]) == [:c] + @test names(get(dfc[1, :b])) == [:c, :e] + @test names(get(dfdc[1, :b])) == [:c] # @@ -64,18 +69,18 @@ module TestDataFrame # Insert single value x[:d] = 3 - @test x[:d] == [3, 3, 3] + @test isequal(x[:d], NullableArray([3, 3, 3])) x0[:d] = 3 @test x0[:d] == Int[] - # similar / nas - df = DataFrame(a = 1, b = "b", c = @pdata([3.3])) - nadf = DataFrame(a = @data(Int[NA, NA]), - b = DataArray(Array(String, 2), trues(2)), - c = @pdata(Float64[NA, NA])) - @test isequal(nadf, similar(df, 2)) - @test isequal(nadf, DataFrames.nas(df, 2)) + # similar / nulls + df = DataFrame(a = 1, b = "b", c = CategoricalArray([3.3])) + nulldf = DataFrame(a = NullableArray(Int, 2), + b = NullableArray(String, 2), + c = NullableCategoricalArray(Float64, 2)) + @test isequal(nulldf, similar(df, 2)) + @test isequal(nulldf, DataFrames.similar_nullable(df, 2)) # Associative methods @@ -93,58 +98,58 @@ module TestDataFrame df = DataFrame(a=[1, 2], b=[3., 4.]) @test_throws BoundsError insert!(df, 5, ["a", "b"], :newcol) @test_throws ErrorException insert!(df, 1, ["a"], :newcol) - @test insert!(df, 1, ["a", "b"], :newcol) == df - @test isequal(df, DataFrame(newcol=["a", "b"], a=[1, 2], b=[3., 4.])) - df = DataFrame(a=[1, 2], b=[3., 4.]) - @test insert!(df, 3, ["a", "b"], :newcol) == df - @test isequal(df, 
DataFrame(a=[1, 2], b=[3., 4.], newcol=["a", "b"])) + @test isequal(insert!(df, 1, ["a", "b"], :newcol), df) + @test names(df) == [:newcol, :a, :b] + @test isequal(df[:a], NullableArray([1, 2])) + @test isequal(df[:b], NullableArray([3., 4.])) + @test isequal(df[:newcol], ["a", "b"]) df = DataFrame(a=[1, 2], b=[3., 4.]) df2 = DataFrame(b=["a", "b"], c=[:c, :d]) - @test merge!(df, df2) == df + @test isequal(merge!(df, df2), df) @test isequal(df, DataFrame(a=[1, 2], b=["a", "b"], c=[:c, :d])) #test_group("Empty DataFrame constructors") df = DataFrame(Int, 10, 3) @test size(df, 1) == 10 @test size(df, 2) == 3 - @test typeof(df[:, 1]) == DataVector{Int} - @test typeof(df[:, 2]) == DataVector{Int} - @test typeof(df[:, 3]) == DataVector{Int} - @test allna(df[:, 1]) - @test allna(df[:, 2]) - @test allna(df[:, 3]) + @test typeof(df[:, 1]) == NullableVector{Int} + @test typeof(df[:, 2]) == NullableVector{Int} + @test typeof(df[:, 3]) == NullableVector{Int} + @test allnull(df[:, 1]) + @test allnull(df[:, 2]) + @test allnull(df[:, 3]) df = DataFrame(Any[Int, Float64, String], 100) @test size(df, 1) == 100 @test size(df, 2) == 3 - @test typeof(df[:, 1]) == DataVector{Int} - @test typeof(df[:, 2]) == DataVector{Float64} - @test typeof(df[:, 3]) == DataVector{String} - @test allna(df[:, 1]) - @test allna(df[:, 2]) - @test allna(df[:, 3]) + @test typeof(df[:, 1]) == NullableVector{Int} + @test typeof(df[:, 2]) == NullableVector{Float64} + @test typeof(df[:, 3]) == NullableVector{String} + @test allnull(df[:, 1]) + @test allnull(df[:, 2]) + @test allnull(df[:, 3]) df = DataFrame(Any[Int, Float64, String], [:A, :B, :C], 100) @test size(df, 1) == 100 @test size(df, 2) == 3 - @test typeof(df[:, 1]) == DataVector{Int} - @test typeof(df[:, 2]) == DataVector{Float64} - @test typeof(df[:, 3]) == DataVector{String} - @test allna(df[:, 1]) - @test allna(df[:, 2]) - @test allna(df[:, 3]) + @test typeof(df[:, 1]) == NullableVector{Int} + @test typeof(df[:, 2]) == NullableVector{Float64} + 
@test typeof(df[:, 3]) == NullableVector{String} + @test allnull(df[:, 1]) + @test allnull(df[:, 2]) + @test allnull(df[:, 3]) df = DataFrame(DataType[Int, Float64, Compat.UTF8String],[:A, :B, :C], [false,false,true],100) @test size(df, 1) == 100 @test size(df, 2) == 3 - @test typeof(df[:, 1]) == DataVector{Int} - @test typeof(df[:, 2]) == DataVector{Float64} - @test typeof(df[:, 3]) == PooledDataVector{Compat.UTF8String,UInt32} - @test allna(df[:, 1]) - @test allna(df[:, 2]) - @test allna(df[:, 3]) + @test typeof(df[:, 1]) == NullableVector{Int} + @test typeof(df[:, 2]) == NullableVector{Float64} + @test typeof(df[:, 3]) == NullableCategoricalVector{Compat.UTF8String,UInt32} + @test allnull(df[:, 1]) + @test allnull(df[:, 2]) + @test allnull(df[:, 3]) df = convert(DataFrame, zeros(10, 5)) @@ -168,8 +173,8 @@ module TestDataFrame @compat(Dict{Any,Any}(:a=>5))]) @test size(df, 1) == 3 @test size(df, 2) == 2 - @test typeof(df[:,:a]) == DataVector{Int} - @test typeof(df[:,:b]) == DataVector{Char} + @test typeof(df[:,:a]) == NullableVector{Int} + @test typeof(df[:,:b]) == NullableVector{Char} df = DataFrame([@compat(Dict{Any,Any}(:a=>1, :b=>'c')), @compat(Dict{Any,Any}(:a=>3, :b=>'d')), @@ -177,9 +182,10 @@ module TestDataFrame [:a, :b]) @test size(df, 1) == 3 @test size(df, 2) == 2 - @test typeof(df[:,:a]) == DataVector{Int} - @test typeof(df[:,:b]) == DataVector{Char} + @test typeof(df[:,:a]) == NullableVector{Int} + @test typeof(df[:,:b]) == NullableVector{Char} + @test DataFrame(NullableArray[[1,2,3],[2.5,4.5,6.5]], [:A, :B]) == DataFrame(A = [1,2,3], B = [2.5,4.5,6.5]) # This assignment was missing before df = DataFrame(Column = [:A]) @@ -201,11 +207,11 @@ module TestDataFrame dfb= DataFrame( first=[1,2], second=["apple","orange"] ) push!(dfb, Any[3,"pear"]) - @test df==dfb + @test isequal(df, dfb) dfb= DataFrame( first=[1,2], second=["apple","orange"] ) push!(dfb, (3,"pear")) - @test df==dfb + @test isequal(df, dfb) dfb= DataFrame( first=[1,2], 
second=["apple","orange"] ) @test_throws ArgumentError push!(dfb, (33.33,"pear")) @@ -215,22 +221,22 @@ module TestDataFrame dfb= DataFrame( first=[1,2], second=["apple","orange"] ) push!(dfb, @compat(Dict(:first=>3, :second=>"pear"))) - @test df==dfb + @test isequal(df, dfb) df=DataFrame( first=[1,2,3], second=["apple","orange","banana"] ) dfb= DataFrame( first=[1,2], second=["apple","orange"] ) push!(dfb, @compat(Dict("first"=>3, "second"=>"banana"))) - @test df==dfb + @test isequal(df, dfb) df0= DataFrame( first=[1,2], second=["apple","orange"] ) dfb= DataFrame( first=[1,2], second=["apple","orange"] ) @test_throws ArgumentError push!(dfb, @compat(Dict(:first=>true, :second=>false))) - @test df0==dfb + @test isequal(df0, dfb) df0= DataFrame( first=[1,2], second=["apple","orange"] ) dfb= DataFrame( first=[1,2], second=["apple","orange"] ) @test_throws ArgumentError push!(dfb, @compat(Dict("first"=>"chicken", "second"=>"stuff"))) - @test df0==dfb + @test isequal(df0, dfb) # delete! df = DataFrame(a=1, b=2, c=3, d=4, e=5) @@ -267,54 +273,61 @@ module TestDataFrame @test deleterows!(df, [2, 3]) === df @test isequal(df, DataFrame(a=[1], b=[3.])) - df = DataFrame(a=@data([1, 2]), b=@data([3., 4.])) + df = DataFrame(a=NullableArray([1, 2]), b=NullableArray([3., 4.])) @test deleterows!(df, 1) === df - @test isequal(df, DataFrame(a=@data([2]), b=@data([4.]))) + @test isequal(df, DataFrame(a=NullableArray([2]), b=NullableArray([4.]))) - df = DataFrame(a=@data([1, 2]), b=@data([3., 4.])) + df = DataFrame(a=NullableArray([1, 2]), b=NullableArray([3., 4.])) @test deleterows!(df, 2) === df - @test isequal(df, DataFrame(a=@data([1]), b=@data([3.]))) + @test isequal(df, DataFrame(a=NullableArray([1]), b=NullableArray([3.]))) - df = DataFrame(a=@data([1, 2, 3]), b=@data([3., 4., 5.])) + df = DataFrame(a=NullableArray([1, 2, 3]), b=NullableArray([3., 4., 5.])) @test deleterows!(df, 2:3) === df - @test isequal(df, DataFrame(a=@data([1]), b=@data([3.]))) + @test isequal(df, 
DataFrame(a=NullableArray([1]), b=NullableArray([3.]))) - df = DataFrame(a=@data([1, 2, 3]), b=@data([3., 4., 5.])) + df = DataFrame(a=NullableArray([1, 2, 3]), b=NullableArray([3., 4., 5.])) @test deleterows!(df, [2, 3]) === df - @test isequal(df, DataFrame(a=@data([1]), b=@data([3.]))) + @test isequal(df, DataFrame(a=NullableArray([1]), b=NullableArray([3.]))) # describe #suppress output and test that describe() does not throw devnull = is_unix() ? "/dev/null" : "nul" open(devnull, "w") do f - @test nothing == describe(f, DataFrame(a=[1, 2], b=Any["3", NA])) - @test nothing == describe(f, DataFrame(a=@data([1, 2]), b=@data(["3", NA]))) - @test nothing == describe(f, DataFrame(a=@pdata([1, 2]), b=@pdata(["3", NA]))) + @test nothing == describe(f, DataFrame(a=[1, 2], b=Any["3", Nullable()])) + @test nothing == + describe(f, DataFrame(a=NullableArray([1, 2]), + b=NullableArray(Nullable{String}["3", Nullable()]))) + @test nothing == + describe(f, DataFrame(a=CategoricalArray([1, 2]), + b=NullableCategoricalArray(Nullable{String}["3", Nullable()]))) @test nothing == describe(f, [1, 2, 3]) - @test nothing == describe(f, @data([1, 2, 3])) - @test nothing == describe(f, @pdata([1, 2, 3])) - @test nothing == describe(f, Any["1", "2", NA]) - @test nothing == describe(f, @data(["1", "2", NA])) - @test nothing == describe(f, @pdata(["1", "2", NA])) + @test nothing == describe(f, NullableArray([1, 2, 3])) + @test nothing == describe(f, CategoricalArray([1, 2, 3])) + @test nothing == describe(f, Any["1", "2", Nullable()]) + @test nothing == describe(f, NullableArray(Nullable{String}["1", "2", Nullable()])) + @test nothing == describe(f, NullableCategoricalArray(Nullable{String}["1", "2", Nullable()])) end #Check the output of unstack - df = DataFrame(Fish = ["Bob", "Bob", "Batman", "Batman"], - Key = ["Mass", "Color", "Mass", "Color"], - Value = ["12 g", "Red", "18 g", "Grey"]) + df = DataFrame(Fish = CategoricalArray(["Bob", "Bob", "Batman", "Batman"]), + Key = ["Mass", 
"Color", "Mass", "Color"], + Value = ["12 g", "Red", "18 g", "Grey"]) + # Check that reordering levels does not confuse unstack + levels!(df[1], ["XXX", "Bob", "Batman"]) #Unstack specifying a row column df2 = unstack(df,:Fish, :Key, :Value) #Unstack without specifying a row column df3 = unstack(df,:Key, :Value) #The expected output - df4 = DataFrame(Fish = ["Batman", "Bob"], Color = ["Grey", "Red"], Mass = ["18 g", "12 g"]) - @test df2 == df4 - @test df3 == df4 - #Make sure unstack works with NAs at the start of the value column - df[1,:Value] = NA + df4 = DataFrame(Fish = ["XXX", "Bob", "Batman"], + Color = Nullable{String}[Nullable(), "Red", "Grey"], + Mass = Nullable{String}[Nullable(), "12 g", "18 g"]) + @test isequal(df2, df4) + @test isequal(df3, df4[2:3, :]) + #Make sure unstack works with NULLs at the start of the value column + df[1,:Value] = Nullable() df2 = unstack(df,:Fish, :Key, :Value) #This changes the expected result - df4[2,:Mass] = NA + df4[2,:Mass] = Nullable() @test isequal(df2, df4) - end diff --git a/test/dataframerow.jl b/test/dataframerow.jl index 4fcc2c66e0..0b69555db6 100644 --- a/test/dataframerow.jl +++ b/test/dataframerow.jl @@ -2,12 +2,14 @@ module TestDataFrameRow using Base.Test using DataFrames, Compat - df = DataFrame(a=@data([1, 2, 3, 1, 2, 2 ]), - b=@data([2.0, NA, 1.2, 2.0, NA, NA]), - c=@data(["A", "B", "C", "A", "B", NA]), - d=PooledDataArray( - @data([:A, NA, :C, :A, NA, :C]))) - df2 = DataFrame(a = @data([1, 2, 3])) + df = DataFrame(a=NullableArray([1, 2, 3, 1, 2, 2 ]), + b=NullableArray(Nullable{Float64}[2.0, Nullable(), + 1.2, 2.0, + Nullable(), Nullable()]), + c=NullableArray(Nullable{String}["A", "B", "C", "A", "B", Nullable()]), + d=NullableCategoricalArray(Nullable{Symbol}[:A, Nullable(), :C, :A, + Nullable(), :C])) + df2 = DataFrame(a = NullableArray([1, 2, 3])) # # Equality diff --git a/test/duplicates.jl b/test/duplicates.jl index 848dded61c..5656cbbb51 100644 --- a/test/duplicates.jl +++ b/test/duplicates.jl @@ 
-9,10 +9,12 @@ module TestDuplicates unique!(df) @test isequal(df, udf) - pdf = DataFrame( a = PooledDataArray( @data ["a", "a", NA, NA, "b", NA, "a", NA] ), - b = PooledDataArray( @data ["a", "b", NA, NA, "b", "a", "a", "a"] ) ) - updf = DataFrame( a = PooledDataArray( @data ["a", "a", NA, "b", NA] ), - b = PooledDataArray( @data ["a", "b", NA, "b", "a"] ) ) + pdf = DataFrame(a = NullableCategoricalArray(Nullable{String}["a", "a", Nullable(), + Nullable(), "b", Nullable(), "a", Nullable()]), + b = NullableCategoricalArray(Nullable{String}["a", "b", Nullable(), + Nullable(), "b", "a", "a", "a"])) + updf = DataFrame(a = NullableCategoricalArray(Nullable{String}["a", "a", Nullable(), "b", Nullable()]), + b = NullableCategoricalArray(Nullable{String}["a", "b", Nullable(), "b", "a"])) @test isequal(nonunique(pdf), [false, false, false, true, false, false, true, true]) @test isequal(nonunique(updf), falses(5) ) @test isequal(updf, unique(pdf)) diff --git a/test/formula.jl b/test/formula.jl index 11c093158b..db7d5e00b9 100644 --- a/test/formula.jl +++ b/test/formula.jl @@ -53,9 +53,10 @@ module TestFormula @test t.intercept == false @test t.terms == [:x1, :x2] - t = Terms(y ~ -1 + x1 + x2) - @test t.intercept == false - @test t.terms == [:x1, :x2] + @test t == Terms(y ~ -1 + x1 + x2) == Terms(y ~ x1 - 1 + x2) == Terms(y ~ x1 + x2 -1) + + ## can't subtract terms other than 1 + @test_throws ErrorException Terms(y ~ x1 - x2) t = Terms(y ~ x1 & x2) @test t.terms == [:(x1 & x2)] @@ -133,11 +134,10 @@ module TestFormula @test isa(mm.m, Matrix{Float64}) @test isa(smm.m, sparsetype) - @test isa(ModelMatrix{DataMatrix{Float64}}(mf).m, DataMatrix{Float64}) - #test_group("expanding a PooledVec into a design matrix of indicators for each dummy variable") + #test_group("expanding a nominal array into a design matrix of indicators for each dummy variable") - d[:x1p] = PooledDataArray(d[:x1]) + d[:x1p] = NullableCategoricalArray(d[:x1]) mf = ModelFrame(y ~ x1p, d) mm = ModelMatrix(mf) 
@@ -182,24 +182,24 @@ module TestFormula ## @test r[:,1] == DataVector(df["x1"]) ## @test r[:,2] == DataVector(df["x2"]) - ## df["x1"] = PooledDataArray(x1) + ## df["x1"] = CategoricalArray(x1) ## r = expand(:x1, df) ## @test isa(r, DataFrame) ## @test ncol(r) == 3 - ## @test r == expand(PooledDataArray(x1), "x1", DataFrame()) + ## @test r == expand(CategoricalArray(x1), "x1", DataFrame()) ## r = expand(:(x1 + x2), df) ## @test isa(r, DataFrame) ## @test ncol(r) == 4 - ## @test r[:,1:3] == expand(PooledDataArray(x1), "x1", DataFrame()) + ## @test r[:,1:3] == expand(CategoricalArray(x1), "x1", DataFrame()) ## @test r[:,4] == DataVector(df["x2"]) - ## df["x2"] = PooledDataArray(x2) + ## df["x2"] = CategoricalArray(x2) ## r = expand(:(x1 + x2), df) ## @test isa(r, DataFrame) ## @test ncol(r) == 6 - ## @test r[:,1:3] == expand(PooledDataArray(x1), "x1", DataFrame()) - ## @test r[:,4:6] == expand(PooledDataArray(x2), "x2", DataFrame()) + ## @test r[:,1:3] == expand(CategoricalArray(x1), "x1", DataFrame()) + ## @test r[:,4:6] == expand(CategoricalArray(x2), "x2", DataFrame()) #test_group("Creating a model matrix using full formulas: y ~ x1 + x2, etc") @@ -216,7 +216,7 @@ module TestFormula @test mm.m == [ones(4) x1 x2 x1.*x2] @test mm.m == ModelMatrix{sparsetype}(mf).m - df[:x1] = PooledDataArray(x1) + df[:x1] = CategoricalArray(x1) x1e = [[0, 1, 0, 0] [0, 0, 1, 0] [0, 0, 0, 1]] f = y ~ x1 * x2 mf = ModelFrame(f, df) @@ -235,7 +235,7 @@ module TestFormula ## @test mm.m == [ones(4) x1 log(x2)] ## df = deepcopy(d) - ## df["x1"] = PooledDataArray([5:8]) + ## df["x1"] = CategoricalArray([5:8]) ## f = Formula(:(y ~ x1 * (log(x2) + x3))) ## mf = ModelFrame(f, df) ## mm = ModelMatrix(mf) @@ -277,7 +277,7 @@ module TestFormula ## @test model_response(mf) == y'' # fails: Int64 vs. 
Float64 df = deepcopy(d) - df[:x1] = PooledDataArray(df[:x1]) + df[:x1] = NullableCategoricalArray(df[:x1]) f = y ~ x2 + x3 + x3*x2 mm = ModelMatrix(ModelFrame(f, df)) @@ -334,9 +334,9 @@ module TestFormula ## FAILS: behavior is wrong when no lower-order terms (1+x1+x2+x1&x2...) ## ## df = DataFrame(y=1:27, - ## x1 = PooledDataArray(vec([x for x in 1:3, y in 4:6, z in 7:9])), - ## x2 = PooledDataArray(vec([y for x in 1:3, y in 4:6, z in 7:9])), - ## x3 = PooledDataArray(vec([z for x in 1:3, y in 4:6, z in 7:9]))) + ## x1 = CategoricalArray(vec([x for x in 1:3, y in 4:6, z in 7:9])), + ## x2 = CategoricalArray(vec([y for x in 1:3, y in 4:6, z in 7:9])), + ## x3 = CategoricalArray(vec([z for x in 1:3, y in 4:6, z in 7:9]))) ## f = y ~ x1 & x2 & x3 ## mf = ModelFrame(f, df) ## @test coefnames(mf)[2:end] == @@ -380,10 +380,10 @@ module TestFormula @test size(mm_sub) == (3,3) ## Missing data - d[:x1m] = @data [5, 6, NA, 7] + d[:x1m] = NullableArray(Nullable{Int}[5, 6, Nullable(), 7]) mf = ModelFrame(y ~ x1m, d) mm = ModelMatrix(mf) - @test mm.m[:, 2] == d[complete_cases(d), :x1m] + @test isequal(NullableArray(mm.m[:, 2]), d[complete_cases(d), :x1m]) @test mm.m == ModelMatrix{sparsetype}(mf).m ## Same variable on left and right side @@ -396,7 +396,7 @@ module TestFormula d = DataFrame(x = Compat.repeat([:a, :b], outer = 4), y = Compat.repeat([:c, :d], inner = 2, outer = 2), z = Compat.repeat([:e, :f], inner = 4)) -[pool!(d, name) for name in names(d)] +[categorical!(d, name) for name in names(d)] cs = Dict([Pair(name, EffectsCoding()) for name in names(d)]) d[:n] = 1.:8 @@ -545,5 +545,6 @@ df = DataFrame(x = [1.0,2.0,3.0], y = [4.0,5.0,6.0]) mf = ModelFrame(y ~ 0 + x, df) X = ModelMatrix(mf).m X[1] = 0.0 -@test mf.df[1, :x] == 1.0 +@test mf.df[1, :x] === Nullable(1.0) + end diff --git a/test/grouping.jl b/test/grouping.jl index c7dbb78dd1..c138584cec 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -5,8 +5,8 @@ module TestGrouping df = DataFrame(a = repeat([1, 2, 
3, 4], outer=[2]), b = repeat([2, 1], outer=[4]), c = randn(8)) - #df[6, :a] = NA - #df[7, :b] = NA + #df[6, :a] = Nullable() + #df[7, :b] = Nullable() cols = [:a, :b] @@ -15,7 +15,7 @@ module TestGrouping sdf = sort(df, cols=cols) bdf = by(df, cols, f) - @test bdf[cols] == unique(sdf[cols]) + @test isequal(bdf[cols], unique(sdf[cols])) byf = by(df, :a, df -> DataFrame(bsum = sum(df[:b]))) @@ -25,19 +25,68 @@ module TestGrouping gd = groupby(df, cols) ga = map(f, gd) - @test bdf == combine(ga) + @test isequal(bdf, combine(ga)) - g(df) = DataFrame(cmax1 = df[:cmax] + 1) + g(df) = DataFrame(cmax1 = Vector(df[:cmax]) + 1) h(df) = g(f(df)) - @test combine(map(h, gd)) == combine(map(g, ga)) + @test isequal(combine(map(h, gd)), combine(map(g, ga))) + + # testing pool overflow + df2 = DataFrame(v1 = categorical(collect(1:1000)), v2 = categorical(fill(1, 1000))) + @test groupby(df2, [:v1, :v2]).starts == collect(1:1000) + @test groupby(df2, [:v2, :v1]).starts == collect(1:1000) + + # grouping empty frame + @test groupby(DataFrame(A=Int[]), :A).starts == Int[] + # grouping single row + @test groupby(DataFrame(A=Int[1]), :A).starts == Int[1] # issue #960 - x = pool(collect(1:20)) + x = CategoricalArray(collect(1:20)) df = DataFrame(v1=x, v2=x) groupby(df, [:v1, :v2]) df2 = by(e->1, DataFrame(x=Int64[]), :x) @test size(df2) == (0,1) - @test sum(df2[:x]) == 0 + @test isequal(sum(df2[:x]), Nullable(0)) + + # Check that reordering levels does not confuse groupby + df = DataFrame(Key1 = CategoricalArray(["A", "A", "B", "B"]), + Key2 = CategoricalArray(["A", "B", "A", "B"]), + Value = 1:4) + gd = groupby(df, :Key1) + @test isequal(gd[1], DataFrame(Key1=["A", "A"], Key2=["A", "B"], Value=1:2)) + @test isequal(gd[2], DataFrame(Key1=["B", "B"], Key2=["A", "B"], Value=3:4)) + gd = groupby(df, [:Key1, :Key2]) + @test isequal(gd[1], DataFrame(Key1="A", Key2="A", Value=1)) + @test isequal(gd[2], DataFrame(Key1="A", Key2="B", Value=2)) + @test isequal(gd[3], DataFrame(Key1="B", Key2="A", 
Value=3)) + @test isequal(gd[4], DataFrame(Key1="B", Key2="B", Value=4)) + # Reorder levels, add unused level + levels!(df[:Key1], ["Z", "B", "A"]) + levels!(df[:Key2], ["Z", "B", "A"]) + gd = groupby(df, :Key1) + @test isequal(gd[1], DataFrame(Key1=["B", "B"], Key2=["A", "B"], Value=3:4)) + @test isequal(gd[2], DataFrame(Key1=["A", "A"], Key2=["A", "B"], Value=1:2)) + gd = groupby(df, [:Key1, :Key2]) + @test isequal(gd[1], DataFrame(Key1="B", Key2="B", Value=4)) + @test isequal(gd[2], DataFrame(Key1="B", Key2="A", Value=3)) + @test isequal(gd[3], DataFrame(Key1="A", Key2="B", Value=2)) + @test isequal(gd[4], DataFrame(Key1="A", Key2="A", Value=1)) + + a = DataFrame(x=categorical(1:200)) + b = DataFrame(x=categorical(100:300)) + a[:x] = compact(a[:x]) + b[:x] = compact(b[:x]) + r = vcat(a, b) + @test isequal(r, DataFrame(x=[categorical(1:200); categorical(100:300)])) + + a = DataFrame(x=categorical(1:200)) + b = DataFrame(y=categorical(100:300)) + a[:x] = compact(a[:x]) + b[:y] = compact(b[:y]) + r = vcat(a, b) + @test isequal(r, DataFrame(x=NullableCategoricalArray(1:401, [fill(false, 200); fill(true, 201)]), + y=NullableCategoricalArray(-100:300, [fill(true, 200); fill(false, 201)]))) end diff --git a/test/index.jl b/test/index.jl index 9160da4249..41fc3f1495 100644 --- a/test/index.jl +++ b/test/index.jl @@ -16,16 +16,11 @@ inds = Any[1, 1:1, 1.0:1.0, [:A], - @data([true]), - @data([1]), - @data([1.0]), - @data([:A]), - DataArray([:A]), - PooledDataArray([true]), - @pdata([1]), - @pdata([1.0]), - @pdata([:A]), - PooledDataArray([:A])] + NullableArray([true]), + NullableArray([1]), + NullableArray([1.0]), + NullableArray([:A]), + NullableArray([:A])] for ind in inds if isequal(ind, :A) || ndims(ind) == 0 @@ -56,4 +51,12 @@ for name in names(i) i2[name] # Issue #715 end +#= Aliasing & Mutation =# + +# columns should not alias if scalar broadcasted +df = DataFrame(A=[0],B=[0]) +df[1:end] = 0.0 +df[1,:A] = 1.0 +@test df[1,:B] === Nullable(0) + end diff --git 
a/test/io.jl b/test/io.jl index c587a57b59..715de39e21 100644 --- a/test/io.jl +++ b/test/io.jl @@ -1,6 +1,7 @@ module TestIO using Base.Test using DataFrames, Compat + using LaTeXStrings #test_group("We can read various file types.") @@ -38,57 +39,57 @@ module TestIO @test size(df) == (58788, 25) - @test df[1, 1] === 1 - @test df[1, 2] == "\$" - @test df[1, 3] === 1971 - @test df[1, 4] === 121 - @test df[1, 5] === NA - @test df[1, 6] === 6.4 - @test df[1, 7] === 348 - @test df[1, 8] === 4.5 - @test df[1, 9] === 4.5 - @test df[1, 10] === 4.5 - @test df[1, 11] === 4.5 - @test df[1, 12] === 14.5 - @test df[1, 13] === 24.5 - @test df[1, 14] === 24.5 - @test df[1, 15] === 14.5 - @test df[1, 16] === 4.5 - @test df[1, 17] === 4.5 - @test df[1, 18] == "" - @test df[1, 19] === 0 - @test df[1, 20] === 0 - @test df[1, 21] === 1 - @test df[1, 22] === 1 - @test df[1, 23] === 0 - @test df[1, 24] === 0 - @test df[1, 25] === 0 - - @test df[end, 1] === 58788 - @test df[end, 2] == "xXx: State of the Union" - @test df[end, 3] === 2005 - @test df[end, 4] === 101 - @test df[end, 5] === 87000000 - @test df[end, 6] === 3.9 - @test df[end, 7] === 1584 - @test df[end, 8] === 24.5 - @test df[end, 9] === 4.5 - @test df[end, 10] === 4.5 - @test df[end, 11] === 4.5 - @test df[end, 12] === 4.5 - @test df[end, 13] === 14.5 - @test df[end, 14] === 4.5 - @test df[end, 15] === 4.5 - @test df[end, 16] === 4.5 - @test df[end, 17] === 14.5 - @test df[end, 18] == "PG-13" - @test df[end, 19] === 1 - @test df[end, 20] === 0 - @test df[end, 21] === 0 - @test df[end, 22] === 0 - @test df[end, 23] === 0 - @test df[end, 24] === 0 - @test df[end, 25] === 0 + @test df[1, 1] === Nullable(1) + @test isequal(df[1, 2], Nullable("\$")) + @test df[1, 3] === Nullable(1971) + @test df[1, 4] === Nullable(121) + @test isnull(df[1, 5]) + @test df[1, 6] === Nullable(6.4) + @test df[1, 7] === Nullable(348) + @test df[1, 8] === Nullable(4.5) + @test df[1, 9] === Nullable(4.5) + @test df[1, 10] === Nullable(4.5) + @test 
df[1, 11] === Nullable(4.5) + @test df[1, 12] === Nullable(14.5) + @test df[1, 13] === Nullable(24.5) + @test df[1, 14] === Nullable(24.5) + @test df[1, 15] === Nullable(14.5) + @test df[1, 16] === Nullable(4.5) + @test df[1, 17] === Nullable(4.5) + @test isequal(df[1, 18], Nullable("")) + @test df[1, 19] === Nullable(0) + @test df[1, 20] === Nullable(0) + @test df[1, 21] === Nullable(1) + @test df[1, 22] === Nullable(1) + @test df[1, 23] === Nullable(0) + @test df[1, 24] === Nullable(0) + @test df[1, 25] === Nullable(0) + + @test df[end, 1] === Nullable(58788) + @test isequal(df[end, 2], Nullable("xXx: State of the Union")) + @test df[end, 3] === Nullable(2005) + @test df[end, 4] === Nullable(101) + @test df[end, 5] === Nullable(87000000) + @test df[end, 6] === Nullable(3.9) + @test df[end, 7] === Nullable(1584) + @test df[end, 8] === Nullable(24.5) + @test df[end, 9] === Nullable(4.5) + @test df[end, 10] === Nullable(4.5) + @test df[end, 11] === Nullable(4.5) + @test df[end, 12] === Nullable(4.5) + @test df[end, 13] === Nullable(14.5) + @test df[end, 14] === Nullable(4.5) + @test df[end, 15] === Nullable(4.5) + @test df[end, 16] === Nullable(4.5) + @test df[end, 17] === Nullable(14.5) + @test isequal(df[end, 18], Nullable("PG-13")) + @test df[end, 19] === Nullable(1) + @test df[end, 20] === Nullable(0) + @test df[end, 21] === Nullable(0) + @test df[end, 22] === Nullable(0) + @test df[end, 23] === Nullable(0) + @test df[end, 24] === Nullable(0) + @test df[end, 25] === Nullable(0) #test_group("readtable handles common separators and infers them from extensions.") @@ -97,9 +98,9 @@ module TestIO df3 = readtable("$data/separators/sample_data.wsv") df4 = readtable("$data/separators/sample_data_white.txt", separator = ' ') - @test df1 == df2 - @test df2 == df3 - @test df3 == df4 + @test isequal(df1, df2) + @test isequal(df2, df3) + @test isequal(df3, df4) readtable("$data/quoting/quotedwhitespace.txt", separator = ' ') @@ -129,23 +130,25 @@ module TestIO # df10 = 
readtable("$data/skiplines/skipfront.csv", skipstart = 3, header = false, skiprows = [4, 6]) # names!(df10, names(df1)) - @test df2 == df1 - @test df3 == df1 - @test df4 == df1 + @test isequal(df2, df1) + @test isequal(df3, df1) + @test isequal(df4, df1) # Windows EOLS - @test df5 == df1 - @test df6 == df1 - @test df7 == df1 - @test df8 == df1 + @test isequal(df5, df1) + @test isequal(df6, df1) + @test isequal(df7, df1) + @test isequal(df8, df1) - # @test df9 == df1[3:end] - # @test df10 == df1[[1, 3:end]] + # @test isequal(df9, df1[3:end]) + # @test isequal(df10, df1[[1, 3:end]]) function normalize_eol!(df) for (name, col) in eachcol(df) if eltype(col) <: AbstractString df[name] = map(s -> replace(s, "\r\n", "\n"), col) + elseif eltype(col) <: Nullable && eltype(eltype(col)) <: AbstractString + df[name] = map(s -> replace(get(s), "\r\n", "\n"), col) end end df @@ -163,16 +166,16 @@ module TestIO # df2w = readtable(winpath; opts2...) # Normalize line endings in both and test equality - @test normalize_eol!(df1w) == normalize_eol!(df1) + @test isequal(normalize_eol!(df1w), normalize_eol!(df1)) # @test normalize_eol!(df2w) == df1 opts1[:nrows] = 3 opts2[:nrows] = 3 - @test normalize_eol!(readtable(osxpath; opts1...)) == df1[1:3, :] - # @test readtable(osxpath; opts2...) == df1[1:3, :] - @test normalize_eol!(readtable(winpath; opts1...)) == df1[1:3, :] - # @test readtable(winpath; opts2...) 
== df1[1:3, :] + @test isequal(normalize_eol!(readtable(osxpath; opts1...)), df1[1:3, :]) + # @test isequalreadtable(osxpath; opts2...), df1[1:3, :] + @test isequal(normalize_eol!(readtable(winpath; opts1...)), df1[1:3, :]) + # @test isequalreadtable(winpath; opts2...), df1[1:3, :]) #test_group("readtable handles custom delimiters.") @@ -181,55 +184,55 @@ module TestIO readtable("$data/separators/sample_data.csv", quotemark = Char[]) @test_throws ErrorException readtable("$data/newlines/embedded_osx.csv", quotemark = Char[]) df = readtable("$data/quoting/single.csv", quotemark = ['\'']) - @test df == readtable("$data/quoting/mixed.csv", quotemark = ['\'', '"']) + @test isequal(df, readtable("$data/quoting/mixed.csv", quotemark = ['\'', '"'])) # df = readtable("$data/decimal/period.csv") - # @test df[2, :A] == 0.3 - # @test df[2, :B] == 4.0 + # @test isequaldf[2, :A], 0.3) + # @test isequaldf[2, :B], 4.0) - # @test df == readtable("$data/decimal/comma.tsv", decimal = ',') + # @test isequal(df, readtable("$data/decimal/comma.tsv", decimal = ',')) #test_group("readtable column names.") ns = [:Var1, :Var2, :Var3, :Var4, :Var5] df = readtable("$data/typeinference/mixedtypes.csv") names!(df, ns) - @test df == readtable("$data/typeinference/mixedtypes.csv", names = ns) + @test isequal(df, readtable("$data/typeinference/mixedtypes.csv", names = ns)) df = readtable("$data/separators/sample_data.csv", header = false, names = ns[1:3]) - @test df[1, :Var1] == 0 + @test isequal(df[1, :Var1], Nullable(0)) df = readtable("$data/separators/sample_data.csv", names = ns[1:3]) - @test df[1, :Var1] == 1 + @test isequal(df[1, :Var1], Nullable(1)) #test_group("Properties of data frames returned by readtable method.") # Readtable ignorepadding io = IOBuffer("A , \tB , C\n1 , \t2, 3\n") - @test readtable(io, ignorepadding = true) == DataFrame(A = 1, B = 2, C = 3) + @test isequal(readtable(io, ignorepadding = true), DataFrame(A = 1, B = 2, C = 3)) # Readtable c-style escape options df = 
readtable("$data/escapes/escapes.csv", allowescapes = true) - @test df[1, :V] == "\t\r\n" - @test df[2, :V] == "\\\\t" - @test df[3, :V] == "\\\"" + @test isequal(df[1, :V], Nullable("\t\r\n")) + @test isequal(df[2, :V], Nullable("\\\\t")) + @test isequal(df[3, :V], Nullable("\\\"")) df = readtable("$data/escapes/escapes.csv") - @test df[1, :V] == "\\t\\r\\n" - @test df[2, :V] == "\\\\t" - @test df[3, :V] == "\\\"" + @test isequal(df[1, :V], Nullable("\\t\\r\\n")) + @test isequal(df[2, :V], Nullable("\\\\t")) + @test isequal(df[3, :V], Nullable("\\\"")) # df = readtable("$data/escapes/escapes.csv", escapechars = ['"'], nrows = 2) - # @test df[1, :V] == "\\t\\r\\n" - # @test df[2, :V] == "\\\\\\\\t" + # @test isequal(df[1, :V], "\\t\\r\\n") + # @test isequal(df[2, :V], "\\\\\\\\t") # Readtable with makefactors active should only make factors from columns # of strings. filename = "$data/factors/mixedvartypes.csv" df = readtable(filename, makefactors = true) - @test typeof(df[:factorvar]) == PooledDataArray{Compat.UTF8String,UInt32,1} - @test typeof(df[:floatvar]) == DataArray{Float64,1} + @test isa(df[:factorvar], NullableCategoricalArray{Compat.UTF8String,1}) + @test isa(df[:floatvar], NullableArray{Float64,1}) # Readtable shouldn't silently drop data when reading highly compressed gz. 
df = readtable("$data/compressed/1000x2.csv.gz") @@ -238,79 +241,79 @@ module TestIO # Readtable type inference filename = "$data/typeinference/bool.csv" df = readtable(filename) - @test typeof(df[:Name]) == DataArray{Compat.UTF8String,1} - @test typeof(df[:IsMale]) == DataArray{Bool,1} - @test df[:IsMale][1] == true - @test df[:IsMale][4] == false + @test isa(df[:Name], NullableArray{Compat.UTF8String,1}) + @test isa(df[:IsMale], NullableArray{Bool,1}) + @test get(df[:IsMale][1]) + @test !get(df[:IsMale][4]) filename = "$data/typeinference/standardtypes.csv" df = readtable(filename) - @test typeof(df[:IntColumn]) == DataArray{Int,1} - @test typeof(df[:IntlikeColumn]) == DataArray{Float64,1} - @test typeof(df[:FloatColumn]) == DataArray{Float64,1} - @test typeof(df[:BoolColumn]) == DataArray{Bool,1} - @test typeof(df[:StringColumn]) == DataArray{Compat.UTF8String,1} + @test isa(df[:IntColumn], NullableArray{Int,1}) + @test isa(df[:IntlikeColumn], NullableArray{Float64,1}) + @test isa(df[:FloatColumn], NullableArray{Float64,1}) + @test isa(df[:BoolColumn], NullableArray{Bool,1}) + @test isa(df[:StringColumn], NullableArray{Compat.UTF8String,1}) filename = "$data/typeinference/mixedtypes.csv" df = readtable(filename) - @test typeof(df[:c1]) == DataArray{Compat.UTF8String,1} - @test df[:c1][1] == "1" - @test df[:c1][2] == "2.0" - @test df[:c1][3] == "true" - @test typeof(df[:c2]) == DataArray{Float64,1} - @test df[:c2][1] == 1.0 - @test df[:c2][2] == 3.0 - @test df[:c2][3] == 4.5 - @test typeof(df[:c3]) == DataArray{Compat.UTF8String,1} - @test df[:c3][1] == "0" - @test df[:c3][2] == "1" - @test df[:c3][3] == "f" - @test typeof(df[:c4]) == DataArray{Bool,1} - @test df[:c4][1] == true - @test df[:c4][2] == false - @test df[:c4][3] == true - @test typeof(df[:c5]) == DataArray{Compat.UTF8String,1} - @test df[:c5][1] == "False" - @test df[:c5][2] == "true" - @test df[:c5][3] == "true" + @test isa(df[:c1], NullableArray{Compat.UTF8String,1}) + @test isequal(df[:c1][1], 
Nullable("1")) + @test isequal(df[:c1][2], Nullable("2.0")) + @test isequal(df[:c1][3], Nullable("true")) + @test isa(df[:c2], NullableArray{Float64,1}) + @test isequal(df[:c2][1], Nullable(1.0)) + @test isequal(df[:c2][2], Nullable(3.0)) + @test isequal(df[:c2][3], Nullable(4.5)) + @test isa(df[:c3], NullableArray{Compat.UTF8String,1}) + @test isequal(df[:c3][1], Nullable("0")) + @test isequal(df[:c3][2], Nullable("1")) + @test isequal(df[:c3][3], Nullable("f")) + @test isa(df[:c4], NullableArray{Bool,1}) + @test isequal(df[:c4][1], Nullable(true)) + @test isequal(df[:c4][2], Nullable(false)) + @test isequal(df[:c4][3], Nullable(true)) + @test isa(df[:c5], NullableArray{Compat.UTF8String,1}) + @test isequal(df[:c5][1], Nullable("False")) + @test isequal(df[:c5][2], Nullable("true")) + @test isequal(df[:c5][3], Nullable("true")) # Readtable defining column types filename = "$data/definedtypes/mixedvartypes.csv" df = readtable(filename) - @test typeof(df[:n]) == DataArray{Int,1} - @test df[:n][1] == 1 - @test typeof(df[:s]) == DataArray{Compat.UTF8String,1} - @test df[:s][1] == "text" - @test typeof(df[:f]) == DataArray{Float64,1} - @test df[:f][1] == 2.3 - @test typeof(df[:b]) == DataArray{Bool,1} - @test df[:b][1] == true + @test isa(df[:n], NullableArray{Int,1}) + @test isequal(df[:n][1], Nullable(1)) + @test isa(df[:s], NullableArray{Compat.UTF8String,1}) + @test isequal(df[:s][1], Nullable("text")) + @test isa(df[:f], NullableArray{Float64,1}) + @test isequal(df[:f][1], Nullable(2.3)) + @test isa(df[:b], NullableArray{Bool,1}) + @test isequal(df[:b][1], Nullable(true)) df = readtable(filename, eltypes = [Int64, Compat.UTF8String, Float64, Bool]) - @test typeof(df[:n]) == DataArray{Int64,1} - @test df[:n][1] == 1 - @test typeof(df[:s]) == DataArray{Compat.UTF8String,1} - @test df[:s][1] == "text" - @test df[:s][4] == "text ole" - @test typeof(df[:f]) == DataArray{Float64,1} - @test df[:f][1] == 2.3 - @test typeof(df[:b]) == DataArray{Bool,1} - @test df[:b][1] == 
true - @test df[:b][2] == false + @test isa(df[:n], NullableArray{Int64,1}) + @test isequal(df[:n][1], Nullable(1)) + @test isa(df[:s], NullableArray{Compat.UTF8String,1}) + @test isequal(df[:s][1], Nullable("text")) + @test isequal(df[:s][4], Nullable("text ole")) + @test isa(df[:f], NullableArray{Float64,1}) + @test isequal(df[:f][1], Nullable(2.3)) + @test isa(df[:b], NullableArray{Bool,1}) + @test isequal(df[:b][1], Nullable(true)) + @test isequal(df[:b][2], Nullable(false)) df = readtable(filename, eltypes = [Int64, Compat.UTF8String, Float64, Compat.UTF8String]) - @test typeof(df[:n]) == DataArray{Int64,1} - @test df[:n][1] == 1.0 - @test isna(df[:s][3]) - @test typeof(df[:f]) == DataArray{Float64,1} + @test isa(df[:n], NullableArray{Int64,1}) + @test isequal(df[:n][1], Nullable(1.0)) + @test isnull(df[:s][3]) + @test isa(df[:f], NullableArray{Float64,1}) # Float are not converted to int - @test df[:f][1] == 2.3 - @test df[:f][2] == 0.2 - @test df[:f][3] == 5.7 - @test typeof(df[:b]) == DataArray{Compat.UTF8String,1} - @test df[:b][1] == "T" - @test df[:b][2] == "FALSE" + @test isequal(df[:f][1], Nullable(2.3)) + @test isequal(df[:f][2], Nullable(0.2)) + @test isequal(df[:f][3], Nullable(5.7)) + @test isa(df[:b], NullableArray{Compat.UTF8String,1}) + @test isequal(df[:b][1], Nullable("T")) + @test isequal(df[:b][2], Nullable("FALSE")) # Readtable name normalization abnormal = "\u212b" @@ -324,12 +327,13 @@ module TestIO io = IOBuffer(abnormal*",%_B*\tC*,end\n1,2,3\n") @test names(readtable(io, normalizenames=false)) == [Symbol(abnormal),Symbol("%_B*\tC*"),:end] - # Test writetable with NA and compare to the results + # Test writetable with Nullable() and compare to the results tf = tempname() isfile(tf) && rm(tf) - df = DataFrame(A = @data([1,NA]), B = @data(["b", NA])) + df = DataFrame(A = NullableArray(Nullable{Int}[1,Nullable()]), + B = NullableArray(Nullable{String}["b", Nullable()])) writetable(tf, df) - @test readcsv(tf) == ["A" "B"; 1 "b"; "NA" "NA"] + 
@test readcsv(tf) == ["A" "B"; 1 "b"; "NULL" "NULL"] # Test writetable with nastring set and compare to the results isfile(tf) && rm(tf) @@ -338,10 +342,10 @@ module TestIO rm(tf) # Test writetable with append - df1 = DataFrame(a = @data([1, 2, 3]), b = @data([4, 5, 6])) - df2 = DataFrame(a = @data([1, 2, 3]), b = @data([4, 5, 6])) - df3 = DataFrame(a = @data([1, 2, 3]), c = @data([4, 5, 6])) # 2nd column mismatch - df3b = DataFrame(a = @data([1, 2, 3]), b = @data([4, 5, 6]), c = @data([4, 5, 6])) # number of columns mismatch + df1 = DataFrame(a = NullableArray([1, 2, 3]), b = NullableArray([4, 5, 6])) + df2 = DataFrame(a = NullableArray([1, 2, 3]), b = NullableArray([4, 5, 6])) + df3 = DataFrame(a = NullableArray([1, 2, 3]), c = NullableArray([4, 5, 6])) # 2nd column mismatch + df3b = DataFrame(a = NullableArray([1, 2, 3]), b = NullableArray([4, 5, 6]), c = NullableArray([4, 5, 6])) # number of columns mismatch # Would use joinpath(tempdir(), randstring()) to get around tempname @@ -352,22 +356,22 @@ module TestIO # Written as normal if file doesn't exist writetable(tf, df1, append = true) - @test readtable(tf) == df1 + @test isequal(readtable(tf), df1) # Written as normal if file is empty open(io -> print(io, ""), tf, "w") writetable(tf, df1, append = true) - @test readtable(tf) == df1 + @test isequal(readtable(tf), df1) # Appends to existing file if append == true writetable(tf, df1) writetable(tf, df2, header = false, append = true) - @test readtable(tf) == vcat(df1, df2) + @test isequal(readtable(tf), vcat(df1, df2)) # Overwrites file if append == false writetable(tf, df1) writetable(tf, df2) - @test readtable(tf) == df2 + @test isequal(readtable(tf), df2) # Enforces matching column names iff append == true && header == true writetable(tf, df2) @@ -387,7 +391,7 @@ module TestIO # Make sure the ' doesn't get escaped for no reason writetable(tf, df) - @test readtable(tf) == df + @test isequal(readtable(tf), df) # Make sure the ' does get escaped when needed 
writetable(tf, df, quotemark='\'') @@ -405,10 +409,10 @@ module TestIO """ @test size(df1) == (4, 3) @test names(df1) == [:name, :age, :squidPerWeek] - @test df1[1] == ["Alice","Bob","Carol","Eve"] - @test df1[2] == [36,24,58,49] - @test df1[3] == [3.14,0,2.71,7.77] - @test typeof(df1[1]) <: DataArray + @test isequal(df1[1], NullableArray(["Alice","Bob","Carol","Eve"])) + @test isequal(df1[2], NullableArray([36,24,58,49])) + @test isequal(df1[3], NullableArray([3.14,0,2.71,7.77])) + @test isa(df1[1], NullableArray{Compat.UTF8String,1}) # Test @wsv_str df2 = wsv""" @@ -418,7 +422,7 @@ module TestIO Carol 58 2.71 Eve 49 7.77 """ - @test df2 == df1 + @test isequal(df2, df1) # Test @tsv_str df3 = tsv""" @@ -428,7 +432,7 @@ module TestIO Carol 58 2.71 Eve 49 7.77 """ - @test df3 == df1 + @test isequal(df3, df1) # csv2 can't be tested until non-'.' decimals are implemented #df4 = csv2""" @@ -438,7 +442,7 @@ module TestIO # Carol; 58; 2,71 # Eve; 49; 7,77 # """ - #@test df4 == df1 + #@test isequal(df4, df1) # Test 'f' flag df5 = csv""" @@ -448,7 +452,7 @@ module TestIO Carol, 58, 2.71 Eve, 49, 7.77 """f - @test typeof(df5[1]) <: PooledDataArray + @test isa(df5[1], NullableCategoricalArray{Compat.UTF8String,1}) # Test 'c' flag df6 = csv""" @@ -458,7 +462,7 @@ module TestIO #Carol, 58, 2.71 Eve, 49, 7.77 """c - @test df6 == df1[[1,2,4],:] + @test isequal(df6, df1[[1,2,4],:]) # Test 'H' flag df7 = csv""" @@ -468,7 +472,8 @@ module TestIO Eve, 49, 7.77 """H @test names(df7) == [:x1,:x2,:x3] - @test Array(df7) == Array(df1) + names!(df7, names(df1)) + @test isequal(df7, df1) # Test multiple flags at once df8 = csv""" @@ -477,12 +482,49 @@ module TestIO #Carol, 58, 2.71 Eve, 49, 7.77 """fcH - @test typeof(df8[1]) <: PooledDataArray + @test isa(df8[1], NullableCategoricalArray{Compat.UTF8String,1}) @test names(df8) == [:x1,:x2,:x3] - @test Array(df8) == Array(df1[[1,2,4],:]) + names!(df8, names(df1)) + @test isequal(df8, df1[[1,2,4],:]) # Test invalid flag # Need to wrap macro 
call inside eval to prevent the error from being # thrown prematurely @test_throws ArgumentError eval(:(csv"foo,bar"a)) + + # Test LaTeX export + df = DataFrame(A = 1:4, + B = ["\$10.0", "M&F", "A~B", "\\alpha"], + C = [L"\alpha", L"\beta", L"\gamma", L"\sum_{i=1}^n \delta_i"], + D = [1.0, 2.0, Nullable(), 3.0] + ) + str = """ + \\begin{tabular}{r|cccc} + \t& A & B & C & D\\\\ + \t\\hline + \t1 & 1 & \\\$10.0 & \$\\alpha\$ & 1.0 \\\\ + \t2 & 2 & M\\&F & \$\\beta\$ & 2.0 \\\\ + \t3 & 3 & A\\textasciitilde{}B & \$\\gamma\$ & \\\\ + \t4 & 4 & \\textbackslash{}alpha & \$\\sum_{i=1}^n \\delta_i\$ & 3.0 \\\\ + \\end{tabular} + """ + @test reprmime(MIME("text/latex"), df) == str + + #Test HTML output for IJulia and similar + df = DataFrame(Fish = ["Suzy", "Amir"], Mass = [1.5, Nullable()]) + io = IOBuffer() + show(io, "text/html", df) + str = takebuf_string(io) + @test str == "
<table class=\"data-frame\"><tr><th></th><th>Fish</th><th>Mass</th></tr><tr><th>1</th><td>Suzy</td><td>1.5</td></tr><tr><th>2</th><td>Amir</td><td>#NULL</td></tr></table>
" + + # test limit attribute of IOContext is used + df = DataFrame(a=collect(1:1000)) + ioc = IOContext(IOBuffer(), displaysize=(10, 10), limit=false) + show(ioc, "text/html", df) + @test length(takebuf_string(ioc.io)) > 10000 + + io = IOBuffer() + show(io, "text/html", df) + @test length(takebuf_string(io)) < 10000 + end diff --git a/test/iteration.jl b/test/iteration.jl index 5c712298cf..57c17becd4 100644 --- a/test/iteration.jl +++ b/test/iteration.jl @@ -1,34 +1,22 @@ module TestIteration using Base.Test, DataFrames, Compat - dv = @data([1, 2, NA]) - dm = DataArray([1 2; 3 4]) - dt = DataArray(zeros(2, 2, 2)) + dv = NullableArray(Nullable{Int}[1, 2, Nullable()]) + dm = NullableArray([1 2; 3 4]) + dt = NullableArray(zeros(2, 2, 2)) df = DataFrame(A = 1:2, B = 2:3) - for el in dv - @test ndims(el) == 0 - end - - for el in dm - @test ndims(el) == 0 - end - - for el in dt - @test ndims(el) == 0 - end - for row in eachrow(df) @test isa(row, DataFrameRow) - @test row[:B]-row[:A] == 1 + @test isequal(row[:B]-row[:A], Nullable(1)) # issue #683 (https://github.com/JuliaStats/DataFrames.jl/pull/683) @test typeof(collect(row)) == @compat Array{Tuple{Symbol, Any}, 1} end for col in eachcol(df) - @test isa(col, @compat Tuple{Symbol, AbstractDataVector}) + @test isa(col, @compat Tuple{Symbol, NullableVector}) end @test isequal(map(x -> minimum(convert(Array, x)), eachrow(df)), Any[1,2]) @@ -37,22 +25,22 @@ module TestIteration row = DataFrameRow(df, 1) row[:A] = 100 - @test df[1, :A] == 100 + @test isequal(df[1, :A], Nullable(100)) row[1] = 101 - @test df[1, :A] == 101 + @test isequal(df[1, :A], Nullable(101)) df = DataFrame(A = 1:4, B = ["M", "F", "F", "M"]) s1 = sub(df, 1:3) s1[2,:A] = 4 - @test df[2, :A] == 4 - @test sub(s1, 1:2) == sub(df, 1:2) + @test isequal(df[2, :A], Nullable(4)) + @test isequal(sub(s1, 1:2), sub(df, 1:2)) s2 = sub(df, 1:2:3) s2[2, :B] = "M" - @test df[3, :B] == "M" - @test sub(s2, 1:1:2) == sub(df, [1,3]) + @test isequal(df[3, :B], Nullable("M")) + 
@test isequal(sub(s2, 1:1:2), sub(df, [1,3])) # @test_fail for x in df; end # Raises an error end diff --git a/test/join.jl b/test/join.jl index b612eecc02..5be59c7915 100644 --- a/test/join.jl +++ b/test/join.jl @@ -14,15 +14,15 @@ module TestJoin # Test output of various join types outer = DataFrame(ID = [1, 2, 2, 3, 4], - Name = @data(["John Doe", "Jane Doe", "Jane Doe", "Joe Blogs", NA]), - Job = @data(["Lawyer", "Doctor", "Florist", NA, "Farmer"])) + Name = NullableArray(Nullable{String}["John Doe", "Jane Doe", "Jane Doe", "Joe Blogs", Nullable()]), + Job = NullableArray(Nullable{String}["Lawyer", "Doctor", "Florist", Nullable(), "Farmer"])) # (Tests use current column ordering but don't promote it) - right = outer[!isna(outer[:Job]), [:Name, :ID, :Job]] - left = outer[!isna(outer[:Name]), :] - inner = left[!isna(left[:Job]), :] + right = outer[Bool[!isnull(x) for x in outer[:Job]], [:Name, :ID, :Job]] + left = outer[Bool[!isnull(x) for x in outer[:Name]], :] + inner = left[Bool[!isnull(x) for x in left[:Job]], :] semi = unique(inner[:, [:ID, :Name]]) - anti = left[isna(left[:Job]), [:ID, :Name]] + anti = left[Bool[isnull(x) for x in left[:Job]], [:ID, :Name]] @test isequal(join(name, job, on = :ID), inner) @test isequal(join(name, job, on = :ID, kind = :inner), inner) @@ -59,7 +59,7 @@ module TestJoin B = ['a', 'a', 'a', 'b', 'b', 'b'], C = [3, 4, 5, 3, 4, 5]) - @test join(df1, df2[[:C]], kind = :cross) == cross + @test isequal(join(df1, df2[[:C]], kind = :cross), cross) # Cross joins handle naming collisions @test size(join(df1, df1, kind = :cross)) == (4, 4) @@ -67,11 +67,44 @@ module TestJoin # Cross joins don't take keys @test_throws ArgumentError join(df1, df2, on = :A, kind = :cross) + # test empty inputs + simple_df(len::Int, col=:A) = (df = DataFrame(); df[col]=collect(1:len); df) + @test isequal(join(simple_df(0), simple_df(0), on = :A, kind = :left), simple_df(0)) + @test isequal(join(simple_df(2), simple_df(0), on = :A, kind = :left), simple_df(2)) 
+ @test isequal(join(simple_df(0), simple_df(2), on = :A, kind = :left), simple_df(0)) + @test isequal(join(simple_df(0), simple_df(0), on = :A, kind = :right), simple_df(0)) + @test isequal(join(simple_df(0), simple_df(2), on = :A, kind = :right), simple_df(2)) + @test isequal(join(simple_df(2), simple_df(0), on = :A, kind = :right), simple_df(0)) + @test isequal(join(simple_df(0), simple_df(0), on = :A, kind = :inner), simple_df(0)) + @test isequal(join(simple_df(0), simple_df(2), on = :A, kind = :inner), simple_df(0)) + @test isequal(join(simple_df(2), simple_df(0), on = :A, kind = :inner), simple_df(0)) + @test isequal(join(simple_df(0), simple_df(0), on = :A, kind = :outer), simple_df(0)) + @test isequal(join(simple_df(0), simple_df(2), on = :A, kind = :outer), simple_df(2)) + @test isequal(join(simple_df(2), simple_df(0), on = :A, kind = :outer), simple_df(2)) + @test isequal(join(simple_df(0), simple_df(0), on = :A, kind = :semi), simple_df(0)) + @test isequal(join(simple_df(2), simple_df(0), on = :A, kind = :semi), simple_df(0)) + @test isequal(join(simple_df(0), simple_df(2), on = :A, kind = :semi), simple_df(0)) + @test isequal(join(simple_df(0), simple_df(0), on = :A, kind = :anti), simple_df(0)) + @test isequal(join(simple_df(2), simple_df(0), on = :A, kind = :anti), simple_df(2)) + @test isequal(join(simple_df(0), simple_df(2), on = :A, kind = :anti), simple_df(0)) + @test isequal(join(simple_df(0), simple_df(0, :B), kind = :cross), DataFrame(A=Int[], B=Int[])) + @test isequal(join(simple_df(0), simple_df(2, :B), kind = :cross), DataFrame(A=Int[], B=Int[])) + @test isequal(join(simple_df(2), simple_df(0, :B), kind = :cross), DataFrame(A=Int[], B=Int[])) + # issue #960 df1 = DataFrame(A = 1:50, B = 1:50, C = 1) - pool!(df1, :A) - pool!(df1, :B) + categorical!(df1, :A) + categorical!(df1, :B) join(df1, df1, on = [:A, :B], kind = :inner) + + # Test that Array{Nullable} works when combined with NullableArray (#1088) + df = DataFrame(Name = 
Nullable{String}["A", "B", "C"], + Mass = [1.5, 2.2, 1.1]) + df2 = DataFrame(Name = ["A", "B", "C", "A"], + Quantity = [3, 3, 2, 4]) + @test join(df2, df, on=:Name, kind=:left) == DataFrame(Name = ["A", "A", "B", "C"], + Quantity = [3, 4, 3, 2], + Mass = [1.5, 1.5, 2.2, 1.1]) end diff --git a/test/show.jl b/test/show.jl index d5afb45840..ef492ba8d3 100644 --- a/test/show.jl +++ b/test/show.jl @@ -1,6 +1,7 @@ module TestShow using DataFrames using Compat + using Base.Test import Compat.String df = DataFrame(A = 1:3, B = ["x", "y", "z"]) @@ -35,4 +36,16 @@ module TestShow show(io, A) A = DataFrames.RepeatedVector([1, 2, 3], 1, 5) show(io, A) + + #Test show output for REPL and similar + df = DataFrame(Fish = ["Suzy", "Amir"], Mass = [1.5, Nullable()]) + io = IOBuffer() + show(io, df) + str = takebuf_string(io) + @test str == """ +2×2 DataFrames.DataFrame +│ Row │ Fish │ Mass │ +├─────┼──────┼───────┤ +│ 1 │ Suzy │ 1.5 │ +│ 2 │ Amir │ #NULL │""" end diff --git a/test/sort.jl b/test/sort.jl index cf48fab744..7760a3dd81 100644 --- a/test/sort.jl +++ b/test/sort.jl @@ -2,20 +2,20 @@ module TestSort using Base.Test using DataFrames - dv1 = @data([9, 1, 8, NA, 3, 3, 7, NA]) - dv2 = 1.0 * dv1 - dv3 = DataArray([1:8;]) - pdv1 = convert(PooledDataArray, dv1) + dv1 = NullableArray(Nullable{Int}[9, 1, 8, Nullable(), 3, 3, 7, Nullable()]) + dv2 = NullableArray(Nullable{Int}[9, 1, 8, Nullable(), 3, 3, 7, Nullable()]) + dv3 = NullableArray(1:8) + cv1 = NullableCategoricalArray(dv1, ordered=true) - d = DataFrame(dv1 = dv1, dv2 = dv2, dv3 = dv3, pdv1 = pdv1) + d = DataFrame(dv1 = dv1, dv2 = dv2, dv3 = dv3, cv1 = cv1) @test sortperm(d) == sortperm(dv1) @test sortperm(d[[:dv3, :dv1]]) == sortperm(dv3) - @test sort(d, cols=:dv1)[:dv3] == sortperm(dv1) - @test sort(d, cols=:dv2)[:dv3] == sortperm(dv1) - @test sort(d, cols=:pdv1)[:dv3] == sortperm(dv1) - @test sort(d, cols=[:dv1, :pdv1])[:dv3] == sortperm(dv1) - @test sort(d, cols=[:dv1, :dv3])[:dv3] == sortperm(dv1) + @test 
isequal(sort(d, cols=:dv1)[:dv3], NullableArray(sortperm(dv1))) + @test isequal(sort(d, cols=:dv2)[:dv3], NullableArray(sortperm(dv1))) + @test isequal(sort(d, cols=:cv1)[:dv3], NullableArray(sortperm(dv1))) + @test isequal(sort(d, cols=[:dv1, :cv1])[:dv3], NullableArray(sortperm(dv1))) + @test isequal(sort(d, cols=[:dv1, :dv3])[:dv3], NullableArray(sortperm(dv1))) df = DataFrame(rank=rand(1:12, 1000), chrom=rand(1:24, 1000), @@ -33,13 +33,17 @@ module TestSort @test issorted(ds2, cols=(order(:rank, rev=true), :chrom, :pos)) @test issorted(ds2, rev=(true, false, false)) - @test ds2 == ds + @test isequal(ds2, ds) sort!(df, cols=(:rank, :chrom, :pos), rev=(true, false, false)) @test issorted(df, cols=(order(:rank, rev=true), :chrom, :pos)) @test issorted(df, rev=(true, false, false)) - @test df == ds - + @test isequal(df, ds) + # Check that columns that shares the same underlying array are only permuted once PR#1072 + df = DataFrame(a=[2,1]) + df[:b] = df[:a] + sort!(df, cols=:a) + @test df == DataFrame(a=[1,2],b=[1,2]) end diff --git a/test/statsmodel.jl b/test/statsmodel.jl index ca9dcab6f7..9d9e5cd3cc 100644 --- a/test/statsmodel.jl +++ b/test/statsmodel.jl @@ -33,12 +33,12 @@ d[:x4] = [17:20;] f = y ~ x1 * x2 m = fit(DummyMod, f, d) -@test model_response(m) == d[:y] +@test model_response(m) == Array(d[:y]) ## test prediction method ## vanilla StatsBase.predict(mod::DummyMod) = mod.x * mod.beta -@test predict(m) == [ ones(size(d,1)) d[:x1] d[:x2] d[:x1].*d[:x2] ] * collect(1:4) +@test predict(m) == [ ones(size(d,1)) Array(d[:x1]) Array(d[:x2]) Array(d[:x1]).*Array(d[:x2]) ] * collect(1:4) ## new data from matrix StatsBase.predict(mod::DummyMod, newX::Matrix) = newX * mod.beta @@ -46,10 +46,10 @@ mm = ModelMatrix(ModelFrame(f, d)) @test predict(m, mm.m) == mm.m * collect(1:4) ## new data from DataFrame (via ModelMatrix) -@test predict(m, d) == predict(m, mm.m) +@test isequal(predict(m, d), NullableArray(predict(m, mm.m))) d2 = deepcopy(d) -d2[3, :x1] = NA +d2[3, 
:x1] = Nullable() @test length(predict(m, d2)) == 4 ## test copying of names from Terms to CoefTable @@ -61,23 +61,23 @@ io = IOBuffer() show(io, m) ## with categorical variables -d[:x1p] = PooledDataArray(d[:x1]) +d[:x1p] = NullableCategoricalArray(d[:x1]) f2 = y ~ x1p m2 = fit(DummyMod, f2, d) @test coeftable(m2).rownms == ["(Intercept)", "x1p: 6", "x1p: 7", "x1p: 8"] ## predict w/ new data missing levels -@test predict(m2, d[2:4, :]) == predict(m2)[2:4] +@test isequal(predict(m2, d[2:4, :]), NullableArray(predict(m2)[2:4])) ## predict w/ new data with _extra_ levels (throws an error) d3 = deepcopy(d) d3[1, :x1] = 0 -d3[:x1p] = PooledDataArray(d3[:x1]) +d3[:x1p] = NullableCategoricalVector(d3[:x1]) @test_throws ArgumentError predict(m2, d3) ## fit with contrasts specified -d[:x2p] = PooledDataArray(d[:x2]) +d[:x2p] = NullableCategoricalVector(d[:x2]) f3 = y ~ x1p + x2p m3 = fit(DummyMod, f3, d) fit(DummyMod, f3, d, contrasts = Dict(:x1p => EffectsCoding())) diff --git a/test/utils.jl b/test/utils.jl index cc4e5bc931..9875fc4eb4 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -38,21 +38,21 @@ module TestUtils "Expected if Julia was not built from source.") end - @test DataFrames.countna([1:3;]) == 0 - - data = @data rand(20) - @test DataFrames.countna(data) == 0 - data[sample(1:20, 11, replace=false)] = NA - @test DataFrames.countna(data) == 11 - data[1:end] = NA - @test DataFrames.countna(data) == 20 - - pdata = @data sample(1:5, 20) - @test DataFrames.countna(pdata) == 0 - pdata[sample(1:20, 11, replace=false)] = NA - @test DataFrames.countna(pdata) == 11 - pdata[1:end] = NA - @test DataFrames.countna(pdata) == 20 + @test DataFrames.countnull([1:3;]) == 0 + + data = NullableArray(rand(20)) + @test DataFrames.countnull(data) == 0 + data[sample(1:20, 11, replace=false)] = Nullable() + @test DataFrames.countnull(data) == 11 + data[1:end] = Nullable() + @test DataFrames.countnull(data) == 20 + + pdata = NullableArray(sample(1:5, 20)) + @test 
DataFrames.countnull(pdata) == 0 + pdata[sample(1:20, 11, replace=false)] = Nullable() + @test DataFrames.countnull(pdata) == 11 + pdata[1:end] = Nullable() + @test DataFrames.countnull(pdata) == 20 funs = [mean, sum, var, x -> sum(x)] if string(funs[end]) == "(anonymous function)" # Julia < 0.5