diff --git a/src/abstractdataarray.jl b/src/abstractdataarray.jl index b2ce25c..b09dc92 100644 --- a/src/abstractdataarray.jl +++ b/src/abstractdataarray.jl @@ -4,7 +4,7 @@ An `N`-dimensional `AbstractArray` whose entries can take on values of type `T` or the value `NA`. """ -abstract type AbstractDataArray{T, N} <: AbstractArray{T, N} end +abstract type AbstractDataArray{T, N} <: AbstractArray{Data{T}, N} end """ AbstractDataVector{T} @@ -20,7 +20,7 @@ A 2-dimensional [`AbstractDataArray`](@ref) with element type `T`. """ const AbstractDataMatrix{T} = AbstractDataArray{T, 2} -Base.eltype(d::AbstractDataArray{T, N}) where {T, N} = T +Base.eltype(d::AbstractDataArray{T, N}) where {T, N} = Union{T,NAtype} # Generic iteration over AbstractDataArray's diff --git a/src/broadcast.jl b/src/broadcast.jl index 3dfe817..d38a369 100644 --- a/src/broadcast.jl +++ b/src/broadcast.jl @@ -189,9 +189,30 @@ Base.Broadcast._containertype(::Type{T}) where T<:DataArray = DataArra Base.Broadcast._containertype(::Type{T}) where T<:PooledDataArray = PooledDataArray Base.Broadcast.broadcast_indices(::Type{T}, A) where T<:AbstractDataArray = indices(A) +@inline function broadcast_t(f, ::Type{T}, shape, A, Bs...) where {T} + dest = Base.Broadcast.containertype(A, Bs...)(extractT(T), Base.index_lengths(shape...)) + return broadcast!(f, dest, A, Bs...) +end + +# This is mainly to handle isna.(x) since isna is probably the only +# function that can guarantee that NAs will never propagate +@inline function broadcast_t(f, ::Type{Bool}, shape, A, Bs...) + dest = similar(BitArray, shape) + return broadcast!(f, dest, A, Bs...) +end + +# This one is almost identical to the version in Base and can hopefully be +# removed at some point. The main issue in Base is that it tests for +# isleaftype(T) which is false for Union{T,NAtype}. 
If the test in Base +# can be modified to cover simple unions of leaftypes then this method +# can probably be deleted and the two _t methods adjusted to match the Base +# invocation from Base.Broadcast.broadcast_c @inline function Base.Broadcast.broadcast_c{S<:AbstractDataArray}(f, ::Type{S}, A, Bs...) T = Base.Broadcast._broadcast_eltype(f, A, Bs...) shape = Base.Broadcast.broadcast_indices(A, Bs...) - dest = S(T, Base.index_lengths(shape...)) - return broadcast!(f, dest, A, Bs...) + return broadcast_t(f, T, shape, A, Bs...) end + +# This one is much faster than normal broadcasting but the method won't get called +# in fusing operations like (!).(isna.(x)) +Base.broadcast(::typeof(isna), da::DataArray) = copy(da.na) diff --git a/src/dataarray.jl b/src/dataarray.jl index 83451c1..2e4617a 100644 --- a/src/dataarray.jl +++ b/src/dataarray.jl @@ -162,7 +162,7 @@ function Base.resize!(da::DataArray{T,1}, n::Int) where T end function Base.similar(da::DataArray, T::Type, dims::Dims) #-> DataArray{T} - return DataArray(Array{T}(dims), trues(dims)) + return DataArray(Array{extractT(T)}(dims), trues(dims)) end Base.size(d::DataArray) = size(d.data) # -> (Int...) 
@@ -244,8 +244,6 @@ end dropna(dv::DataVector) = dv.data[.!dv.na] # -> Vector -Base.broadcast(::typeof(isna), da::DataArray) = copy(da.na) - Base.any(::typeof(isna), da::DataArray) = any(da.na) # -> Bool Base.all(::typeof(isna), da::DataArray) = all(da.na) # -> Bool diff --git a/src/extras.jl b/src/extras.jl index 5ed5175..b35670c 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -19,12 +19,13 @@ function StatsBase.addcounts!(cm::Dict{U,W}, x::AbstractDataArray{T}, wv::Weight return cm end -function StatsBase.countmap(x::AbstractDataArray{T}) where T - addcounts!(Dict{Union{T, NAtype}, Int}(), x) + +function StatsBase.countmap(x::AbstractDataArray{T}) where {T} + addcounts!(Dict{Data{T}, Int}(), x) end function StatsBase.countmap(x::AbstractDataArray{T}, wv::Weights{W}) where {T,W} - addcounts!(Dict{Union{T, NAtype}, W}(), x, wv) + addcounts!(Dict{Data{T}, W}(), x, wv) end """ diff --git a/src/natype.jl b/src/natype.jl index 6f99ac5..ab55f06 100644 --- a/src/natype.jl +++ b/src/natype.jl @@ -29,6 +29,8 @@ A value denoting missingness within the domain of any type. """ const NA = NAtype() +const Data{T} = Union{T,NAtype} + Base.show(io::IO, x::NAtype) = print(io, "NA") struct NAException <: Exception @@ -36,12 +38,38 @@ struct NAException <: Exception end NAException() = NAException("NA found") +# Restrict to Number to avoid infinite recursion +# Might be possible to get rid of these restrictions if the promotion in base gets changed. 
+## Numbers +Base.promote_rule(::Type{Data{T}}, ::Type{Data{S}}) where {T<:Number,S<:Number} = + Union{promote_type(T, S),NAtype} +Base.promote_rule(::Type{Data{T}}, ::Type{S}) where {T<:Number,S<:Number} = + Union{promote_type(T, S),NAtype} +## Dates +Base.promote_rule(::Type{Data{T}}, ::Type{Data{S}}) where {T<:Dates.AbstractTime,S<:Dates.AbstractTime} = + Union{promote_type(T, S),NAtype} +Base.promote_rule(::Type{Data{T}}, ::Type{S}) where {T<:Dates.AbstractTime,S<:Dates.AbstractTime} = + Union{promote_type(T, S),NAtype} + +Base.promote_rule(::Type{NAtype}, ::Type{T}) where {T} = Union{T,NAtype} + +# Restrict to Number to avoid matching everything +Base.convert(::Type{Data{T}}, x::Number) where {T<:Number} = convert(T, x) +Base.convert(::Type{Data{T}}, x::Dates.AbstractTime) where {T<:Dates.AbstractTime} = convert(T, x) + Base.length(x::NAtype) = 1 Base.size(x::NAtype) = () Base.size(x::NAtype, i::Integer) = i < 1 ? throw(BoundsError()) : 1 Base.ndims(x::NAtype) = 0 Base.getindex(x::NAtype, i) = i == 1 ? NA : throw(BoundsError()) +# extractT(::Type{Data{T}}) where {T} = T +extractT(::Type{Union{T,NAtype}}) where {T} = T +extractT(::Type{T}) where {T} = T +extractT(::Type{NAtype}) = NAtype + +Base.zero(::Type{Data{T}}) where {T} = zero(T) + """ isna(x) -> Bool @@ -60,7 +88,4 @@ true isna(x::NAtype) = true isna(x::Any) = false -# TODO: Rethink this rule -Base.promote_rule{T}(::Type{T}, ::Type{NAtype} ) = T - Base.isnan(::NAtype) = NA diff --git a/src/operators.jl b/src/operators.jl index 2d602f6..1c5ad8b 100644 --- a/src/operators.jl +++ b/src/operators.jl @@ -205,6 +205,7 @@ for f in [:+,:-,:*,:/] end # Unary operators, DataArrays. 
+@dataarray_unary(+, Any, T) @dataarray_unary(-, Bool, Int) @dataarray_unary(-, Any, T) @dataarray_unary(!, Bool, T) @@ -531,31 +532,31 @@ function (-)(J::UniformScaling{TJ},A::DataArray{TA,2}) where {TA,TJ<:Number} end (+)(A::DataArray{Bool,2},J::UniformScaling{Bool}) = - invoke(+, Tuple{AbstractArray{Bool,2},UniformScaling{Bool}}, A, J) + invoke(+, Tuple{AbstractArray{Data{Bool},2},UniformScaling{Bool}}, A, J) (+)(J::UniformScaling{Bool},A::DataArray{Bool,2}) = - invoke(+, Tuple{UniformScaling{Bool},AbstractArray{Bool,2}}, J, A) + invoke(+, Tuple{UniformScaling{Bool},AbstractArray{Data{Bool},2}}, J, A) (-)(A::DataArray{Bool,2},J::UniformScaling{Bool}) = - invoke(-, Tuple{AbstractArray{Bool,2},UniformScaling{Bool}}, A, J) + invoke(-, Tuple{AbstractArray{Data{Bool},2},UniformScaling{Bool}}, A, J) (-)(J::UniformScaling{Bool},A::DataArray{Bool,2}) = - invoke(-, Tuple{UniformScaling{Bool},AbstractArray{Bool,2}}, J, A) + invoke(-, Tuple{UniformScaling{Bool},AbstractArray{Data{Bool},2}}, J, A) (+)(A::AbstractDataArray{TA,2},J::UniformScaling{TJ}) where {TA,TJ} = - invoke(+, Tuple{AbstractArray{TA,2},UniformScaling{TJ}}, A, J) + invoke(+, Tuple{AbstractArray{Data{TA},2},UniformScaling{TJ}}, A, J) (+)(J::UniformScaling,A::AbstractDataArray{TA,2}) where {TA} = - invoke(+, Tuple{UniformScaling,AbstractArray{TA,2}}, J, A) + invoke(+, Tuple{UniformScaling,AbstractArray{Data{TA},2}}, J, A) (-)(A::AbstractDataArray{TA,2},J::UniformScaling{TJ}) where {TA,TJ<:Number} = - invoke(-, Tuple{AbstractArray{TA,2},UniformScaling{TJ}}, A, J) + invoke(-, Tuple{AbstractArray{Data{TA},2},UniformScaling{TJ}}, A, J) (-)(J::UniformScaling{TJ},A::AbstractDataArray{TA,2}) where {TA,TJ<:Number} = - invoke(-, Tuple{UniformScaling{TJ},AbstractArray{TA,2}}, J, A) + invoke(-, Tuple{UniformScaling{TJ},AbstractArray{Data{TA},2}}, J, A) (+)(A::AbstractDataArray{Bool,2},J::UniformScaling{Bool}) = - invoke(+, Tuple{AbstractArray{Bool,2},UniformScaling{Bool}}, A, J) + invoke(+, 
Tuple{AbstractArray{Data{Bool},2},UniformScaling{Bool}}, A, J) (+)(J::UniformScaling{Bool},A::AbstractDataArray{Bool,2}) = - invoke(+, Tuple{UniformScaling{Bool},AbstractArray{Bool,2}}, J, A) + invoke(+, Tuple{UniformScaling{Bool},AbstractArray{Data{Bool},2}}, J, A) (-)(A::AbstractDataArray{Bool,2},J::UniformScaling{Bool}) = - invoke(-, Tuple{AbstractArray{Bool,2},UniformScaling{Bool}}, A, J) + invoke(-, Tuple{AbstractArray{Data{Bool},2},UniformScaling{Bool}}, A, J) (-)(J::UniformScaling{Bool},A::AbstractDataArray{Bool,2}) = - invoke(-, Tuple{UniformScaling{Bool},AbstractArray{Bool,2}}, J, A) + invoke(-, Tuple{UniformScaling{Bool},AbstractArray{Data{Bool},2}}, J, A) end # if isdefined(Base, :UniformScaling) diff --git a/src/pooleddataarray.jl b/src/pooleddataarray.jl index 068d739..fe8b13a 100644 --- a/src/pooleddataarray.jl +++ b/src/pooleddataarray.jl @@ -106,9 +106,9 @@ end PooledDataArray(d::PooledDataArray) = d # Constructor from array, w/ pool, missingness, and ref type -function PooledDataArray(d::AbstractArray{T, N}, +function PooledDataArray(d::AbstractArray{<:Data{T}, N}, pool::Vector{T}, - m::AbstractArray{Bool, N}, + m::AbstractArray{<:Data{Bool}, N}, r::Type{R} = DEFAULT_POOLED_REF_TYPE) where {T,R<:Integer,N} if length(pool) > typemax(R) throw(ArgumentError("Cannot construct a PooledDataVector with type $R with a pool of size $(length(pool))")) @@ -466,7 +466,7 @@ julia> p # has been modified "B" ``` """ -function setlevels!(x::PooledDataArray{T,R}, newpool::AbstractVector{T}) where {T,R} +function setlevels!(x::PooledDataArray{T,R}, newpool::AbstractVector) where {T,R} if newpool == myunique(newpool) # no NAs or duplicates x.pool = newpool return x @@ -483,9 +483,6 @@ function setlevels!(x::PooledDataArray{T,R}, newpool::AbstractVector{T}) where { end end -setlevels!(x::PooledDataArray{T, R}, - newpool::AbstractVector) where {T, R} = setlevels!(x, convert(Array{T}, newpool)) - function setlevels(x::PooledDataArray, d::Dict) newpool = 
copy(DataArray(x.pool)) # An NA in `v` is put in the pool; that will cause it to become NA @@ -552,8 +549,9 @@ end ## ############################################################################## + function Base.similar(pda::PooledDataArray{T,R}, S::Type, dims::Dims) where {T,R} - PooledDataArray(RefArray(zeros(R, dims)), S[]) + PooledDataArray(RefArray(zeros(R, dims)), extractT(S)[]) end ############################################################################## diff --git a/src/reduce.jl b/src/reduce.jl index 8e0f462..79dd322 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -162,7 +162,7 @@ Base.varm(A::DataArray{T}, m::NAtype; corrected::Bool=true, skipna::Bool=false) function Base.var(A::DataArray; corrected::Bool=true, mean=nothing, skipna::Bool=false) mean == 0 ? Base.varm(A, 0; corrected=corrected, skipna=skipna) : mean == nothing ? varm(A, Base.mean(A; skipna=skipna); corrected=corrected, skipna=skipna) : - isa(mean, Union{Number, NAtype}) ? + isa(mean, Data{Number}) ? varm(A, mean; corrected=corrected, skipna=skipna) : throw(ErrorException("Invalid value of mean.")) end diff --git a/src/reducedim.jl b/src/reducedim.jl index 7de9071..f7bdcd6 100644 --- a/src/reducedim.jl +++ b/src/reducedim.jl @@ -302,8 +302,8 @@ end ## mean function Base.mean!(R::AbstractArray{T}, A::DataArray; skipna::Bool=false, - init::Bool=true) where T - init && fill!(R, zero(eltype(R))) + init::Bool=true) where {T} + init && fill!(R, 0) if skipna C = Array{Int}(size(R)) _mapreducedim_skipna_impl!(identity, +, R, C, A) diff --git a/src/statistics.jl b/src/statistics.jl index 988c1c0..7236f69 100644 --- a/src/statistics.jl +++ b/src/statistics.jl @@ -39,13 +39,13 @@ gl(n::Integer, k::Integer) = gl(n, k, n*k) StatsBase.describe(X::DataVector) = StatsBase.describe(STDOUT, X) function StatsBase.describe(io::IO, X::AbstractDataVector{T}) where T<:Real - nacount = sum(isna.(X)) + nacount = sum(isna, X) pna = 100nacount/length(X) if pna != 100 # describe will fail if dropna returns an 
empty vector describe(io, dropna(X)) else println(io, "Summary Stats:") - println(io, "Type: $(eltype(X))") + println(io, "Type: $(T)") end println(io, "Number Missing: $(nacount)") @printf(io, "%% Missing: %.6f\n", pna) @@ -53,11 +53,11 @@ function StatsBase.describe(io::IO, X::AbstractDataVector{T}) where T<:Real end function StatsBase.describe(io::IO, X::AbstractDataVector) - nacount = sum(isna.(X)) + nacount = sum(isna, X) pna = 100nacount/length(X) println(io, "Summary Stats:") println(io, "Length: $(length(X))") - println(io, "Type: $(eltype(X))") + println(io, "Type: $(extractT(eltype(X)))") println(io, "Number Unique: $(length(unique(X)))") println(io, "Number Missing: $(nacount)") @printf(io, "%% Missing: %.6f\n", pna) diff --git a/test/broadcast.jl b/test/broadcast.jl index 26e7850..61e89ff 100644 --- a/test/broadcast.jl +++ b/test/broadcast.jl @@ -128,4 +128,12 @@ @test map!(abs, x, x) == @data([1, 2]) @test isequal(map!(+, DataArray(Float64, 3), @data([1, NA, 3]), @data([NA, 2, 3])), @data([NA, NA, 6])) @test map!(isequal, DataArray(Float64, 3), @data([1, NA, NA]), @data([1, NA, 3])) == @data([true, true, false]) + + # isna doesn't propagate NAs so it should return BitArrays + x = isna.(@data [NA, 1, 2]) + @test x isa BitArray + @test x == [true, false, false] + x = (!).(isna.(@data [NA, 1, 2])) + @test x isa BitArray + @test x == [false, true, true] end diff --git a/test/constructors.jl b/test/constructors.jl index cb02688..6b7ee99 100644 --- a/test/constructors.jl +++ b/test/constructors.jl @@ -25,7 +25,7 @@ @test isequal(dv, convert(DataArray, 1:3)) dv = DataArray(Int, 3) - @test isequal(eltype(dv), Int) + @test isequal(eltype(dv), Data{Int}) @test isequal(dv.na, trues(3)) dv = convert(DataArray, zeros(3)) @@ -67,7 +67,7 @@ @test isequal(pdv, convert(PooledDataArray, PooledDataArray([1, 2, 3]))) pdv = PooledDataArray(Int, 3) - @test isequal(eltype(pdv), Int) + @test isequal(eltype(pdv), Data{Int}) @test all(isna.(pdv) .== trues(3)) pdv = 
convert(PooledDataArray, zeros(3)) @@ -106,7 +106,7 @@ @test isequal(dm, convert(DataArray, trues(2, 2))) dm = DataArray(Int, 2, 2) - @test isequal(eltype(dm), Int) + @test isequal(eltype(dm), Data{Int}) @test isequal(dm.na, trues(2, 2)) @test_nowarn convert(DataArray, zeros(2, 2)) diff --git a/test/data.jl b/test/data.jl index e8f7798..6796c30 100644 --- a/test/data.jl +++ b/test/data.jl @@ -71,13 +71,13 @@ @test size(dvint) == (4,) @test length(dvint) == 4 @test sum(isna.(dvint)) == 1 - @test eltype(dvint) == Int + @test eltype(dvint) == Data{Int} #test_group("PooledDataVector methods") @test size(pdvstr) == (7,) @test length(pdvstr) == 7 @test sum(isna.(pdvstr)) == 1 - @test eltype(pdvstr) == String + @test eltype(pdvstr) == Data{String} #test_group("DataVector operations") @test isequal(dvint .+ 1, DataArray([2, 3, 4, 5], [false, false, true, false])) @@ -99,7 +99,7 @@ @test all(convert(Vector{Int}, dvint2) .== [5:8;]) @test all([i + 1 for i in dvint2] .== [6:9;]) @test all([length(x)::Int for x in dvstr] == [3, 3, 1, 4]) - @test repr(dvint) == "[1, 2, NA, 4]" + @test repr(dvint) == "Union{DataArrays.NAtype, $Int}[1, 2, NA, 4]" #test_group("PooledDataVector to something else") @test all(dropna(pdvstr) .== ["one", "one", "two", "two", "one", "one"]) diff --git a/test/dataarray.jl b/test/dataarray.jl index 7f483a5..e8597e8 100644 --- a/test/dataarray.jl +++ b/test/dataarray.jl @@ -99,7 +99,7 @@ end # Inferrability of map (#276) - @test eltype(map(x -> x > 1, @data [1, 2])) == Bool + @test eltype(map(x -> x > 1, @data [1, 2])) == Data{Bool} @testset "Issue #278" begin x = @data ones(4) diff --git a/test/extras.jl b/test/extras.jl index 61f2327..1ecddb8 100644 --- a/test/extras.jl +++ b/test/extras.jl @@ -1,12 +1,14 @@ -@testset "Extras" begin +# @testset "Extras" begin ########## ## countmap ########## d = @data [NA,3,3] w = weights([1.1,2.2,3.3]) - cm = Dict{Union{Int, NAtype}, Int}([(NA, 1), (3, 2)]) - cmw = Dict{Union{Int, NAtype}, Real}([(NA, 1.1), (3, 5.5)]) 
+ # cm = Dict{DataArrays.Data{Int}, Int}([(NA, 1), (3, 2)]) + # cmw = Dict{DataArrays.Data{Int}, Real}([(NA, 1.1), (3, 5.5)]) + cm = Dict{Union{NAtype,Int}, Int}([(NA, 1), (3, 2)]) + cmw = Dict{Union{NAtype,Int}, Real}([(NA, 1.1), (3, 5.5)]) @test isequal(countmap(d), cm) @test isequal(countmap(d, w), cmw) @@ -44,4 +46,4 @@ @test isequal(repeat(@pdata [:a :b NA]; inner = [2,1], outer = [1,3]), @pdata [:a :b NA :a :b NA :a :b NA; :a :b NA :a :b NA :a :b NA]) -end +# end diff --git a/test/nas.jl b/test/nas.jl index 1152938..f80150c 100644 --- a/test/nas.jl +++ b/test/nas.jl @@ -62,4 +62,14 @@ @test_throws NAException for v in each_failna(dv); end @test collect(each_dropna(dv)) == a @test collect(each_replacena(dv, 4)) == [4, 4, a..., 4] + + @testset "promotion" for (T1, T2) in ((Int, Float64), + (Dates.Minute, Dates.Second)) + @eval begin + @test promote_type($T1, Data{$T2}) == Data{$T2} + @test promote_type(Data{$T1}, $T2) == Data{$T2} + @test promote_type(Data{$T1}, Data{$T2}) == Data{$T2} + end + end + end diff --git a/test/runtests.jl b/test/runtests.jl index d39730e..d9f3ef3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,6 +4,7 @@ using Base.Test using DataArrays +using DataArrays: Data my_tests = ["abstractarray.jl", "booleans.jl",