Skip to content
This repository has been archived by the owner on May 4, 2019. It is now read-only.

Commit

Permalink
RFC: Stop lying about eltype (#280)
Browse files Browse the repository at this point in the history
* Stop lying about eltype

* Define Data{T} = Union{T,NAtype}

* Cleanup after rebase

* Fix promotion on master
  • Loading branch information
andreasnoack authored Aug 20, 2017
1 parent 607b453 commit 8b9e896
Show file tree
Hide file tree
Showing 17 changed files with 115 additions and 50 deletions.
4 changes: 2 additions & 2 deletions src/abstractdataarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
An `N`-dimensional `AbstractArray` whose entries can take on values of type
`T` or the value `NA`.
"""
abstract type AbstractDataArray{T, N} <: AbstractArray{T, N} end
abstract type AbstractDataArray{T, N} <: AbstractArray{Data{T}, N} end

"""
AbstractDataVector{T}
Expand All @@ -20,7 +20,7 @@ A 2-dimensional [`AbstractDataArray`](@ref) with element type `T`.
"""
const AbstractDataMatrix{T} = AbstractDataArray{T, 2}

Base.eltype(d::AbstractDataArray{T, N}) where {T, N} = T
Base.eltype(d::AbstractDataArray{T, N}) where {T, N} = Union{T,NAtype}

# Generic iteration over AbstractDataArray's

Expand Down
25 changes: 23 additions & 2 deletions src/broadcast.jl
Original file line number Diff line number Diff line change
Expand Up @@ -189,9 +189,30 @@ Base.Broadcast._containertype(::Type{T}) where T<:DataArray = DataArra
Base.Broadcast._containertype(::Type{T}) where T<:PooledDataArray = PooledDataArray
Base.Broadcast.broadcast_indices(::Type{T}, A) where T<:AbstractDataArray = indices(A)

# Generic destination builder for broadcasting: pick the container type from
# the arguments, allocate it with the element type `extractT(T)` and the
# lengths implied by `shape`, then broadcast `f` into it in place.
@inline function broadcast_t(f, ::Type{T}, shape, A, Bs...) where {T}
    container = Base.Broadcast.containertype(A, Bs...)
    lens = Base.index_lengths(shape...)
    return broadcast!(f, container(extractT(T), lens), A, Bs...)
end

# This is mainly to handle isna.(x) since isna is probably the only
# function that can guarantee that NAs will never propagate
# When the result element type is exactly `Bool`, write the broadcast result
# into a freshly allocated `BitArray` of the requested shape.
@inline function broadcast_t(f, ::Type{Bool}, shape, A, Bs...)
    return broadcast!(f, similar(BitArray, shape), A, Bs...)
end

# This one is almost identical to the version in Base and can hopefully be
# removed at some point. The main issue in Base is that it tests for
# isleaftype(T) which is false for Union{T,NAtype}. If the test in Base
# can be modified to cover simple unions of leaftypes then this method
# can probably be deleted and the two _t methods adjusted to match the Base
# invocation from Base.Broadcast.broadcast_c
@inline function Base.Broadcast.broadcast_c{S<:AbstractDataArray}(f, ::Type{S}, A, Bs...)
T = Base.Broadcast._broadcast_eltype(f, A, Bs...)
shape = Base.Broadcast.broadcast_indices(A, Bs...)
dest = S(T, Base.index_lengths(shape...))
return broadcast!(f, dest, A, Bs...)
return broadcast_t(f, T, shape, A, Bs...)
end

# This one is much faster than normal broadcasting but the method won't get called
# in fusing operations like (!).(isna.(x))
Base.broadcast(::typeof(isna), da::DataArray) = copy(da.na)
4 changes: 1 addition & 3 deletions src/dataarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ function Base.resize!(da::DataArray{T,1}, n::Int) where T
end

function Base.similar(da::DataArray, T::Type, dims::Dims) #-> DataArray{T}
return DataArray(Array{T}(dims), trues(dims))
return DataArray(Array{extractT(T)}(dims), trues(dims))
end

Base.size(d::DataArray) = size(d.data) # -> (Int...)
Expand Down Expand Up @@ -244,8 +244,6 @@ end

dropna(dv::DataVector) = dv.data[.!dv.na] # -> Vector

Base.broadcast(::typeof(isna), da::DataArray) = copy(da.na)

Base.any(::typeof(isna), da::DataArray) = any(da.na) # -> Bool
Base.all(::typeof(isna), da::DataArray) = all(da.na) # -> Bool

Expand Down
7 changes: 4 additions & 3 deletions src/extras.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,13 @@ function StatsBase.addcounts!(cm::Dict{U,W}, x::AbstractDataArray{T}, wv::Weight
return cm
end

function StatsBase.countmap(x::AbstractDataArray{T}) where T
addcounts!(Dict{Union{T, NAtype}, Int}(), x)

function StatsBase.countmap(x::AbstractDataArray{T}) where {T}
addcounts!(Dict{Data{T}, Int}(), x)
end

function StatsBase.countmap(x::AbstractDataArray{T}, wv::Weights{W}) where {T,W}
addcounts!(Dict{Union{T, NAtype}, W}(), x, wv)
addcounts!(Dict{Data{T}, W}(), x, wv)
end

"""
Expand Down
31 changes: 28 additions & 3 deletions src/natype.jl
Original file line number Diff line number Diff line change
Expand Up @@ -29,19 +29,47 @@ A value denoting missingness within the domain of any type.
"""
const NA = NAtype()

const Data{T} = Union{T,NAtype}

Base.show(io::IO, x::NAtype) = print(io, "NA")

# Exception type carrying a message string in `msg`.
struct NAException <: Exception
    msg::String
end

# Zero-argument constructor: uses the default message "NA found".
function NAException()
    return NAException("NA found")
end

# Restrict to Number to avoid infinite recursion
# Might be possible to get rid of these restrictions if the promotion in base gets changed.
## Numbers
# Data{T} with Data{S}, both numeric: promote the wrapped types and keep NA.
function Base.promote_rule(::Type{Data{T}}, ::Type{Data{S}}) where {T<:Number,S<:Number}
    return Union{promote_type(T, S),NAtype}
end

# Data{T} with a plain numeric S: promote the wrapped type with S and keep NA.
function Base.promote_rule(::Type{Data{T}}, ::Type{S}) where {T<:Number,S<:Number}
    return Union{promote_type(T, S),NAtype}
end

## Dates
# Same two rules as above, for subtypes of Dates.AbstractTime.
function Base.promote_rule(
    ::Type{Data{T}}, ::Type{Data{S}}
) where {T<:Dates.AbstractTime,S<:Dates.AbstractTime}
    return Union{promote_type(T, S),NAtype}
end

function Base.promote_rule(
    ::Type{Data{T}}, ::Type{S}
) where {T<:Dates.AbstractTime,S<:Dates.AbstractTime}
    return Union{promote_type(T, S),NAtype}
end

# NAtype with any type T promotes to the union of T and NAtype.
function Base.promote_rule(::Type{NAtype}, ::Type{T}) where {T}
    return Union{T,NAtype}
end

# Restrict to Number to avoid matching everything
# Converting a number to Data{T} (T numeric) converts it to the wrapped type T.
function Base.convert(::Type{Data{T}}, x::Number) where {T<:Number}
    return convert(T, x)
end

# Converting a Dates.AbstractTime value to Data{T} converts it to the wrapped
# time type T.
function Base.convert(
    ::Type{Data{T}}, x::Dates.AbstractTime
) where {T<:Dates.AbstractTime}
    return convert(T, x)
end

# Size and indexing queries on an NA value: length 1, empty size tuple, zero
# dimensions.
Base.length(::NAtype) = 1
Base.size(::NAtype) = ()

# Any dimension index >= 1 reports extent 1; smaller indices are out of bounds.
function Base.size(::NAtype, i::Integer)
    i < 1 && throw(BoundsError())
    return 1
end

Base.ndims(::NAtype) = 0

# Only index 1 is valid, and it yields NA itself.
function Base.getindex(::NAtype, i)
    i == 1 || throw(BoundsError())
    return NA
end

# extractT(::Type{Data{T}}) where {T} = T
# Strip the NA alternative from an element type:
#   Union{T,NAtype} -> T
#   T               -> T
#   NAtype          -> NAtype
function extractT(::Type{Union{T,NAtype}}) where {T}
    return T
end

function extractT(::Type{T}) where {T}
    return T
end

function extractT(::Type{NAtype})
    return NAtype
end

# The zero of Data{T} is the zero of the wrapped type T.
function Base.zero(::Type{Data{T}}) where {T}
    return zero(T)
end

"""
isna(x) -> Bool
Expand All @@ -60,7 +88,4 @@ true
isna(x::NAtype) = true
isna(x::Any) = false

# TODO: Rethink this rule
Base.promote_rule{T}(::Type{T}, ::Type{NAtype} ) = T

Base.isnan(::NAtype) = NA
25 changes: 13 additions & 12 deletions src/operators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ for f in [:+,:-,:*,:/]
end

# Unary operators, DataArrays.
@dataarray_unary(+, Any, T)
@dataarray_unary(-, Bool, Int)
@dataarray_unary(-, Any, T)
@dataarray_unary(!, Bool, T)
Expand Down Expand Up @@ -531,31 +532,31 @@ function (-)(J::UniformScaling{TJ},A::DataArray{TA,2}) where {TA,TJ<:Number}
end

(+)(A::DataArray{Bool,2},J::UniformScaling{Bool}) =
invoke(+, Tuple{AbstractArray{Bool,2},UniformScaling{Bool}}, A, J)
invoke(+, Tuple{AbstractArray{Data{Bool},2},UniformScaling{Bool}}, A, J)
(+)(J::UniformScaling{Bool},A::DataArray{Bool,2}) =
invoke(+, Tuple{UniformScaling{Bool},AbstractArray{Bool,2}}, J, A)
invoke(+, Tuple{UniformScaling{Bool},AbstractArray{Data{Bool},2}}, J, A)
(-)(A::DataArray{Bool,2},J::UniformScaling{Bool}) =
invoke(-, Tuple{AbstractArray{Bool,2},UniformScaling{Bool}}, A, J)
invoke(-, Tuple{AbstractArray{Data{Bool},2},UniformScaling{Bool}}, A, J)
(-)(J::UniformScaling{Bool},A::DataArray{Bool,2}) =
invoke(-, Tuple{UniformScaling{Bool},AbstractArray{Bool,2}}, J, A)
invoke(-, Tuple{UniformScaling{Bool},AbstractArray{Data{Bool},2}}, J, A)

(+)(A::AbstractDataArray{TA,2},J::UniformScaling{TJ}) where {TA,TJ} =
invoke(+, Tuple{AbstractArray{TA,2},UniformScaling{TJ}}, A, J)
invoke(+, Tuple{AbstractArray{Data{TA},2},UniformScaling{TJ}}, A, J)
(+)(J::UniformScaling,A::AbstractDataArray{TA,2}) where {TA} =
invoke(+, Tuple{UniformScaling,AbstractArray{TA,2}}, J, A)
invoke(+, Tuple{UniformScaling,AbstractArray{Data{TA},2}}, J, A)
(-)(A::AbstractDataArray{TA,2},J::UniformScaling{TJ}) where {TA,TJ<:Number} =
invoke(-, Tuple{AbstractArray{TA,2},UniformScaling{TJ}}, A, J)
invoke(-, Tuple{AbstractArray{Data{TA},2},UniformScaling{TJ}}, A, J)
(-)(J::UniformScaling{TJ},A::AbstractDataArray{TA,2}) where {TA,TJ<:Number} =
invoke(-, Tuple{UniformScaling{TJ},AbstractArray{TA,2}}, J, A)
invoke(-, Tuple{UniformScaling{TJ},AbstractArray{Data{TA},2}}, J, A)

(+)(A::AbstractDataArray{Bool,2},J::UniformScaling{Bool}) =
invoke(+, Tuple{AbstractArray{Bool,2},UniformScaling{Bool}}, A, J)
invoke(+, Tuple{AbstractArray{Data{Bool},2},UniformScaling{Bool}}, A, J)
(+)(J::UniformScaling{Bool},A::AbstractDataArray{Bool,2}) =
invoke(+, Tuple{UniformScaling{Bool},AbstractArray{Bool,2}}, J, A)
invoke(+, Tuple{UniformScaling{Bool},AbstractArray{Data{Bool},2}}, J, A)
(-)(A::AbstractDataArray{Bool,2},J::UniformScaling{Bool}) =
invoke(-, Tuple{AbstractArray{Bool,2},UniformScaling{Bool}}, A, J)
invoke(-, Tuple{AbstractArray{Data{Bool},2},UniformScaling{Bool}}, A, J)
(-)(J::UniformScaling{Bool},A::AbstractDataArray{Bool,2}) =
invoke(-, Tuple{UniformScaling{Bool},AbstractArray{Bool,2}}, J, A)
invoke(-, Tuple{UniformScaling{Bool},AbstractArray{Data{Bool},2}}, J, A)

end # if isdefined(Base, :UniformScaling)

Expand Down
12 changes: 5 additions & 7 deletions src/pooleddataarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,9 @@ end
PooledDataArray(d::PooledDataArray) = d

# Constructor from array, w/ pool, missingness, and ref type
function PooledDataArray(d::AbstractArray{T, N},
function PooledDataArray(d::AbstractArray{<:Data{T}, N},
pool::Vector{T},
m::AbstractArray{Bool, N},
m::AbstractArray{<:Data{Bool}, N},
r::Type{R} = DEFAULT_POOLED_REF_TYPE) where {T,R<:Integer,N}
if length(pool) > typemax(R)
throw(ArgumentError("Cannot construct a PooledDataVector with type $R with a pool of size $(length(pool))"))
Expand Down Expand Up @@ -466,7 +466,7 @@ julia> p # has been modified
"B"
```
"""
function setlevels!(x::PooledDataArray{T,R}, newpool::AbstractVector{T}) where {T,R}
function setlevels!(x::PooledDataArray{T,R}, newpool::AbstractVector) where {T,R}
if newpool == myunique(newpool) # no NAs or duplicates
x.pool = newpool
return x
Expand All @@ -483,9 +483,6 @@ function setlevels!(x::PooledDataArray{T,R}, newpool::AbstractVector{T}) where {
end
end

setlevels!(x::PooledDataArray{T, R},
newpool::AbstractVector) where {T, R} = setlevels!(x, convert(Array{T}, newpool))

function setlevels(x::PooledDataArray, d::Dict)
newpool = copy(DataArray(x.pool))
# An NA in `v` is put in the pool; that will cause it to become NA
Expand Down Expand Up @@ -552,8 +549,9 @@ end
##
##############################################################################


function Base.similar(pda::PooledDataArray{T,R}, S::Type, dims::Dims) where {T,R}
PooledDataArray(RefArray(zeros(R, dims)), S[])
PooledDataArray(RefArray(zeros(R, dims)), extractT(S)[])
end

##############################################################################
Expand Down
2 changes: 1 addition & 1 deletion src/reduce.jl
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ Base.varm(A::DataArray{T}, m::NAtype; corrected::Bool=true, skipna::Bool=false)
function Base.var(A::DataArray; corrected::Bool=true, mean=nothing, skipna::Bool=false)
mean == 0 ? Base.varm(A, 0; corrected=corrected, skipna=skipna) :
mean == nothing ? varm(A, Base.mean(A; skipna=skipna); corrected=corrected, skipna=skipna) :
isa(mean, Union{Number, NAtype}) ?
isa(mean, Data{Number}) ?
varm(A, mean; corrected=corrected, skipna=skipna) :
throw(ErrorException("Invalid value of mean."))
end
Expand Down
4 changes: 2 additions & 2 deletions src/reducedim.jl
Original file line number Diff line number Diff line change
Expand Up @@ -302,8 +302,8 @@ end
## mean

function Base.mean!(R::AbstractArray{T}, A::DataArray; skipna::Bool=false,
init::Bool=true) where T
init && fill!(R, zero(eltype(R)))
init::Bool=true) where {T}
init && fill!(R, 0)
if skipna
C = Array{Int}(size(R))
_mapreducedim_skipna_impl!(identity, +, R, C, A)
Expand Down
8 changes: 4 additions & 4 deletions src/statistics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,25 +39,25 @@ gl(n::Integer, k::Integer) = gl(n, k, n*k)
StatsBase.describe(X::DataVector) = StatsBase.describe(STDOUT, X)

function StatsBase.describe(io::IO, X::AbstractDataVector{T}) where T<:Real
nacount = sum(isna.(X))
nacount = sum(isna, X)
pna = 100nacount/length(X)
if pna != 100 # describe will fail if dropna returns an empty vector
describe(io, dropna(X))
else
println(io, "Summary Stats:")
println(io, "Type: $(eltype(X))")
println(io, "Type: $(T)")
end
println(io, "Number Missing: $(nacount)")
@printf(io, "%% Missing: %.6f\n", pna)
return
end

function StatsBase.describe(io::IO, X::AbstractDataVector)
nacount = sum(isna.(X))
nacount = sum(isna, X)
pna = 100nacount/length(X)
println(io, "Summary Stats:")
println(io, "Length: $(length(X))")
println(io, "Type: $(eltype(X))")
println(io, "Type: $(extractT(eltype(X)))")
println(io, "Number Unique: $(length(unique(X)))")
println(io, "Number Missing: $(nacount)")
@printf(io, "%% Missing: %.6f\n", pna)
Expand Down
8 changes: 8 additions & 0 deletions test/broadcast.jl
Original file line number Diff line number Diff line change
Expand Up @@ -128,4 +128,12 @@
@test map!(abs, x, x) == @data([1, 2])
@test isequal(map!(+, DataArray(Float64, 3), @data([1, NA, 3]), @data([NA, 2, 3])), @data([NA, NA, 6]))
@test map!(isequal, DataArray(Float64, 3), @data([1, NA, NA]), @data([1, NA, 3])) == @data([true, true, false])

# isna doesn't propagate NAs so it should return BitArrays
x = isna.(@data [NA, 1, 2])
@test x isa BitArray
@test x == [true, false, false]
x = (!).(isna.(@data [NA, 1, 2]))
@test x isa BitArray
@test x == [false, true, true]
end
6 changes: 3 additions & 3 deletions test/constructors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
@test isequal(dv, convert(DataArray, 1:3))

dv = DataArray(Int, 3)
@test isequal(eltype(dv), Int)
@test isequal(eltype(dv), Data{Int})
@test isequal(dv.na, trues(3))

dv = convert(DataArray, zeros(3))
Expand Down Expand Up @@ -67,7 +67,7 @@
@test isequal(pdv, convert(PooledDataArray, PooledDataArray([1, 2, 3])))

pdv = PooledDataArray(Int, 3)
@test isequal(eltype(pdv), Int)
@test isequal(eltype(pdv), Data{Int})
@test all(isna.(pdv) .== trues(3))

pdv = convert(PooledDataArray, zeros(3))
Expand Down Expand Up @@ -106,7 +106,7 @@
@test isequal(dm, convert(DataArray, trues(2, 2)))

dm = DataArray(Int, 2, 2)
@test isequal(eltype(dm), Int)
@test isequal(eltype(dm), Data{Int})
@test isequal(dm.na, trues(2, 2))

@test_nowarn convert(DataArray, zeros(2, 2))
Expand Down
6 changes: 3 additions & 3 deletions test/data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,13 @@
@test size(dvint) == (4,)
@test length(dvint) == 4
@test sum(isna.(dvint)) == 1
@test eltype(dvint) == Int
@test eltype(dvint) == Data{Int}

#test_group("PooledDataVector methods")
@test size(pdvstr) == (7,)
@test length(pdvstr) == 7
@test sum(isna.(pdvstr)) == 1
@test eltype(pdvstr) == String
@test eltype(pdvstr) == Data{String}

#test_group("DataVector operations")
@test isequal(dvint .+ 1, DataArray([2, 3, 4, 5], [false, false, true, false]))
Expand All @@ -99,7 +99,7 @@
@test all(convert(Vector{Int}, dvint2) .== [5:8;])
@test all([i + 1 for i in dvint2] .== [6:9;])
@test all([length(x)::Int for x in dvstr] == [3, 3, 1, 4])
@test repr(dvint) == "[1, 2, NA, 4]"
@test repr(dvint) == "Union{DataArrays.NAtype, $Int}[1, 2, NA, 4]"

#test_group("PooledDataVector to something else")
@test all(dropna(pdvstr) .== ["one", "one", "two", "two", "one", "one"])
Expand Down
2 changes: 1 addition & 1 deletion test/dataarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@
end

# Inferrability of map (#276)
@test eltype(map(x -> x > 1, @data [1, 2])) == Bool
@test eltype(map(x -> x > 1, @data [1, 2])) == Data{Bool}

@testset "Issue #278" begin
x = @data ones(4)
Expand Down
10 changes: 6 additions & 4 deletions test/extras.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
@testset "Extras" begin
# @testset "Extras" begin
##########
## countmap
##########

d = @data [NA,3,3]
w = weights([1.1,2.2,3.3])
cm = Dict{Union{Int, NAtype}, Int}([(NA, 1), (3, 2)])
cmw = Dict{Union{Int, NAtype}, Real}([(NA, 1.1), (3, 5.5)])
# cm = Dict{DataArrays.Data{Int}, Int}([(NA, 1), (3, 2)])
# cmw = Dict{DataArrays.Data{Int}, Real}([(NA, 1.1), (3, 5.5)])
cm = Dict{Union{NAtype,Int}, Int}([(NA, 1), (3, 2)])
cmw = Dict{Union{NAtype,Int}, Real}([(NA, 1.1), (3, 5.5)])
@test isequal(countmap(d), cm)
@test isequal(countmap(d, w), cmw)

Expand Down Expand Up @@ -44,4 +46,4 @@
@test isequal(repeat(@pdata [:a :b NA]; inner = [2,1], outer = [1,3]),
@pdata [:a :b NA :a :b NA :a :b NA;
:a :b NA :a :b NA :a :b NA])
end
# end
10 changes: 10 additions & 0 deletions test/nas.jl
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,14 @@
@test_throws NAException for v in each_failna(dv); end
@test collect(each_dropna(dv)) == a
@test collect(each_replacena(dv, 4)) == [4, 4, a..., 4]

@testset "promotion" for (T1, T2) in ((Int, Float64),
(Dates.Minute, Dates.Second))
@eval begin
@test promote_type($T1, Data{$T2}) == Data{$T2}
@test promote_type(Data{$T1}, $T2) == Data{$T2}
@test promote_type(Data{$T1}, Data{$T2}) == Data{$T2}
end
end

end
Loading

0 comments on commit 8b9e896

Please sign in to comment.