Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make arrays and ranges hash and compare equal #16401

Merged
merged 11 commits into from
Dec 21, 2017
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,13 @@ This section lists changes that do not have deprecation warnings.
* `eachindex(A, B...)` now requires that all inputs have the same number of elements.
When the chosen indexing is Cartesian, they must have the same axes.

* `AbstractRange` objects are now considered as equal to other `AbstractArray` objects
by `==` and `isequal` if all of their elements are equal ([#16401]).
This has required changing the hashing algorithm: ranges now use an O(N) fallback
instead of a O(1) specialized method unless they define the `Base.TypeRangeStep`
trait; see its documentation for details. Types which support subtraction (operator
`-`) must now implement `widen` for hashing to work inside heterogeneous arrays.

Library improvements
--------------------

Expand Down
96 changes: 83 additions & 13 deletions base/abstractarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1547,9 +1547,6 @@ function isequal(A::AbstractArray, B::AbstractArray)
if axes(A) != axes(B)
return false
end
if isa(A,AbstractRange) != isa(B,AbstractRange)
return false
end
for (a, b) in zip(A, B)
if !isequal(a, b)
return false
Expand All @@ -1570,9 +1567,6 @@ function (==)(A::AbstractArray, B::AbstractArray)
if axes(A) != axes(B)
return false
end
if isa(A,AbstractRange) != isa(B,AbstractRange)
return false
end
anymissing = false
for (a, b) in zip(A, B)
eq = (a == b)
Expand Down Expand Up @@ -1944,19 +1938,92 @@ unshift!(A, a, b, c...) = unshift!(unshift!(A, c...), a, b)

const hashaa_seed = UInt === UInt64 ? 0x7f53e68ceb575e76 : 0xeb575e76
const hashrle_seed = UInt === UInt64 ? 0x2aab8909bfea414c : 0xbfea414c
function hash(a::AbstractArray, h::UInt)
const hashr_seed = UInt === UInt64 ? 0x80707b6821b70087 : 0x21b70087

# Efficient O(1) method equivalent to the O(N) AbstractArray fallback,
# which works only for ranges with regular step (RangeStepRegular)
function hash_range(r::AbstractRange, h::UInt)
h += hashaa_seed
h += hash(size(r))

length(r) == 0 && return h
h = hash(first(r), h)
length(r) == 1 && return h
length(r) == 2 && return hash(last(r), h)

h += hashr_seed
h = hash(length(r), h)
h = hash(last(r), h)
end

function hash(a::AbstractArray{T}, h::UInt) where T
# O(1) hashing for types with regular step
if isa(a, AbstractRange) && isa(TypeRangeStep(a), RangeStepRegular)
return hash_range(a, h)
end

h += hashaa_seed
h += hash(size(a))

state = start(a)
done(a, state) && return h
x1, state = next(a, state)
done(a, state) && return hash(x1, h)
x2, state = next(a, state)
done(a, state) && return hash(x2, h)
done(a, state) && return hash(x2, hash(x1, h))

# Check whether the array is equal to a range, and hash the elements
# at the beginning of the array as such as long as they match this assumption
# This needs to be done even for non-RangeStepRegular types since they may still be equal
# to RangeStepRegular values (e.g. 1.0:3.0 == 1:3)
if isa(a, AbstractVector) && applicable(-, x2, x1)
n = 1
local step, laststep, laststate
while true
# If overflow happens with entries of the same type, a cannot be equal
# to a range with more than two elements because more extreme values
# cannot be represented. We must still hash the two first values as a
# range since they can always be considered as such (in a wider type)
if isconcrete(T)
try
step = x2 - x1
catch err
isa(err, OverflowError) || rethrow(err)
break
end
# If true, wraparound overflow happened
sign(step) == cmp(x2, x1) || break
else
# widen() is here to ensure no overflow can happen
step = widen(x2) - widen(x1)
end
n > 1 && !isequal(step, laststep) && break
n += 1
x1 = x2
laststep = step
laststate = state
done(a, state) && break
x2, state = next(a, state)
end

x1 = x2
while !done(a, state)
x1 = x2
x2, state = next(a, state)
h = hash(first(a), h)
h += hashr_seed
# Always hash at least the two first elements as a range (even in case of overflow)
if n < 2
h = hash(2, h)
h = hash(x2, h)
done(a, state) && return h
x1 = x2
x2, state = next(a, state)
else
h = hash(n, h)
h = hash(x1, h)
done(a, laststate) && return h
end
end

# Hash elements which do not correspond to a range (if any)
while true
if isequal(x2, x1)
# For repeated elements, use run length encoding
# This allows efficient hashing of sparse arrays
Expand All @@ -1970,7 +2037,10 @@ function hash(a::AbstractArray, h::UInt)
h = hash(runlength, h)
end
h = hash(x1, h)
done(a, state) && break
x1 = x2
x2, state = next(a, state)
end
!isequal(x2, x1) && (h = hash(x2, h))
return h
end
end
1 change: 1 addition & 0 deletions base/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ in(x::Char, y::Char) = x == y
isless(x::Char, y::Char) = reinterpret(UInt32, x) < reinterpret(UInt32, y)
hash(x::Char, h::UInt) =
hash_uint64(((reinterpret(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))
widen(::Type{Char}) = Char

-(x::Char, y::Char) = Int(x) - Int(y)
-(x::Char, y::Integer) = Char(Int32(x) - Int32(y))
Expand Down
13 changes: 3 additions & 10 deletions base/hashing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ optional second argument `h` is a hash code to be mixed with the result.
New types should implement the 2-argument form, typically by calling the 2-argument `hash`
method recursively in order to mix hashes of the contents with each other (and with `h`).
Typically, any type that implements `hash` should also implement its own `==` (hence
`isequal`) to guarantee the property mentioned above.
`isequal`) to guarantee the property mentioned above. Types supporting subtraction
(operator `-`) should also implement [`widen`](@ref), which is required to hash
values inside heterogeneous arrays.
"""
hash(x::Any) = hash(x, zero(UInt))
hash(w::WeakRef, h::UInt) = hash(w.value, h)
Expand Down Expand Up @@ -73,12 +75,3 @@ else
end

hash(x::QuoteNode, h::UInt) = hash(x.value, hash(QuoteNode, h))

# hashing ranges by component at worst leads to collisions for very similar ranges
const hashr_seed = UInt === UInt64 ? 0x80707b6821b70087 : 0x21b70087
function hash(r::AbstractRange, h::UInt)
h += hashr_seed
h = hash(first(r), h)
h = hash(step(r), h)
h = hash(last(r), h)
end
2 changes: 1 addition & 1 deletion base/hashing2.jl
Original file line number Diff line number Diff line change
Expand Up @@ -178,4 +178,4 @@ function hash(s::Union{String,SubString{String}}, h::UInt)
# note: use pointer(s) here (see #6058).
ccall(memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), pointer(s), sizeof(s), h % UInt32) + h
end
hash(s::AbstractString, h::UInt) = hash(String(s), h)
hash(s::AbstractString, h::UInt) = hash(String(s), h)
2 changes: 2 additions & 0 deletions base/irrationals.jl
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ isone(::AbstractIrrational) = false

hash(x::Irrational, h::UInt) = 3*object_id(x) - h

widen(::Type{T}) where {T<:Irrational} = T

-(x::AbstractIrrational) = -Float64(x)
for op in Symbol[:+, :-, :*, :/, :^]
@eval $op(x::AbstractIrrational, y::AbstractIrrational) = $op(Float64(x),Float64(y))
Expand Down
10 changes: 6 additions & 4 deletions base/operators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -771,9 +771,11 @@ conj(x) = x
"""
widen(x)

If `x` is a type, return a "larger" type (for numeric types, this will be
a type with at least as much range and precision as the argument, and usually more).
Otherwise `x` is converted to `widen(typeof(x))`.
If `x` is a type, return a "larger" type, defined so that arithmetic operations
`+` and `-` are guaranteed not to overflow nor lose precision for any combination
of values that type `x` can hold.

If `x` is a value, it is converted to `widen(typeof(x))`.

# Examples
```jldoctest
Expand All @@ -784,7 +786,7 @@ julia> widen(1.5f0)
1.5
```
"""
widen(x::T) where {T<:Number} = convert(widen(T), x)
widen(x::T) where {T} = convert(widen(T), x)

# function pipelining

Expand Down
3 changes: 3 additions & 0 deletions base/range.jl
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ range(a::AbstractFloat, st::Real, len::Integer) = range(a, float(st), len)

abstract type AbstractRange{T} <: AbstractArray{T,1} end

TypeRangeStep(::Type{<:AbstractRange}) = RangeStepIrregular()
TypeRangeStep(::Type{<:AbstractRange{<:Integer}}) = RangeStepRegular()

## ordinal ranges

abstract type OrdinalRange{T,S} <: AbstractRange{T} end
Expand Down
37 changes: 37 additions & 0 deletions base/traits.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,40 @@ TypeArithmetic(instance) = TypeArithmetic(typeof(instance))
TypeArithmetic(::Type{<:AbstractFloat}) = ArithmeticRounds()
TypeArithmetic(::Type{<:Integer}) = ArithmeticOverflows()
TypeArithmetic(::Type{<:Any}) = ArithmeticUnknown()

# trait for objects that support ranges with regular step
"""
TypeRangeStep(instance)
TypeRangeStep(T::Type)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these aren't exported, but should they go in the stdlib? are the other traits in this file there?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question. The other traits are not exported, but I'm not sure what their use outside of Base is. @timholy?

The traits could also be mentioned somewhere in the manual, but I'm not sure where. They could go to the Interfaces section, but they aren't related to any existing interface (except maybe Real). Maybe we could have a sub-section listing all traits that a custom type may want to implement: this trait will generally be useful when you write your own type.


Indicate whether an instance or a type supports constructing a range with
a perfectly regular step or not. A regular step means that
[`step`](@ref) will always be exactly equal to the difference between two
subsequent elements in a range, i.e. for a range `r::AbstractRange{T}`:
```julia
all(diff(r) .== step(r))
```

When a type `T` always leads to ranges with regular steps, it should
define the following method:
```julia
Base.TypeRangeStep(::Type{<:AbstractRange{<:T}}) = Base.RangeStepRegular()
```
This will allow [`hash`](@ref) to use an O(1) algorithm for `AbstractRange{T}`
objects instead of the default O(N) algorithm (with N the length of the range).

In some cases, whether the step will be regular depends not only on the
element type `T`, but also on the type of the step `S`. In that case, more
specific methods should be defined:
```julia
Base.TypeRangeStep(::Type{<:OrdinalRange{<:T, <:S}}) = Base.RangeStepRegular()
```

By default, all range types are assumed to be `RangeStepIrregular`, except
ranges with an element type which is a subtype of `Integer`.
"""
abstract type TypeRangeStep end
struct RangeStepRegular <: TypeRangeStep end # range with regular step
struct RangeStepIrregular <: TypeRangeStep end # range with rounding error

TypeRangeStep(instance) = TypeRangeStep(typeof(instance))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we put in an error for TypeRangeStep(::Type{DataType}) to avoid stack overflows?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that's necessary since that method is only called by hash on Range objects. And it's not really useful outside of Base AFAICT.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what about package authors defining custom range types, or element types that go into ranges?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only situation in which package authors would need this would be if they define a new range type for which neither the O(N) AbstractRange fallback nor the O(1) hash_range method works, and yet the custom type's method would need to check whether a given type uses a regular step. I can't find an example where this would apply, especially given that a custom hash implementation for a range should be consistent with other AbstractArray implementations.

1 change: 1 addition & 0 deletions stdlib/Dates/src/periods.jl
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ end
# intfuncs
Base.gcdx(a::T, b::T) where {T<:Period} = ((g, x, y) = gcdx(value(a), value(b)); return T(g), x, y)
Base.abs(a::T) where {T<:Period} = T(abs(value(a)))
Base.sign(x::Period) = sign(value(x))

periodisless(::Period,::Year) = true
periodisless(::Period,::Month) = true
Expand Down
4 changes: 4 additions & 0 deletions stdlib/Dates/src/ranges.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,7 @@ Base.done(r::StepRange{<:TimeType,<:Period}, i::Integer) = length(r) <= i
+(x::Period, r::AbstractRange{<:TimeType}) = (x + first(r)):step(r):(x + last(r))
+(r::AbstractRange{<:TimeType}, x::Period) = x + r
-(r::AbstractRange{<:TimeType}, x::Period) = (first(r)-x):step(r):(last(r)-x)

# Combinations of types and periods for which the range step is regular
Base.TypeRangeStep(::Type{<:OrdinalRange{<:TimeType, <:FixedPeriod}}) =
Base.RangeStepRegular()
3 changes: 3 additions & 0 deletions stdlib/Dates/test/periods.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ using Test
@test mod.([t, t, t, t, t], Dates.Year(2)) == ([t, t, t, t, t])
@test [t, t, t] / t2 == [0.5, 0.5, 0.5]
@test abs(-t) == t
@test sign(t) == sign(t2) == 1
@test sign(-t) == sign(-t2) == -1
@test sign(Dates.Year(0)) == 0
end
@testset "div/mod/gcd/lcm/rem" begin
@test Dates.Year(10) % Dates.Year(4) == Dates.Year(2)
Expand Down
16 changes: 16 additions & 0 deletions stdlib/Dates/test/ranges.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ let
@test !(l1 in dr)
@test !(f1 - pos_step in dr)
@test !(l1 + pos_step in dr)
@test dr == []
@test hash(dr) == hash([])

for (f, l) in ((f2, l2), (f3, l3), (f4, l4))
dr = f:pos_step:l
Expand All @@ -58,6 +60,8 @@ let
@test length(dr1) == len
@test findin(dr, dr) == [1:len;]
@test length([dr;]) == len
@test dr == dr1
@test hash(dr) == hash(dr1)
end
@test !isempty(reverse(dr))
@test length(reverse(dr)) == len
Expand Down Expand Up @@ -92,6 +96,8 @@ let
@test !(l1 in dr)
@test !(l1 - neg_step in dr)
@test !(l1 + neg_step in dr)
@test dr == []
@test hash(dr) == hash([])

for (f, l) in ((f2, l2), (f3, l3), (f4, l4))
dr = l:neg_step:f
Expand All @@ -112,6 +118,8 @@ let
@test length(dr1) == len
@test findin(dr, dr) == [1:len;]
@test length([dr;]) == len
@test dr == dr1
@test hash(dr) == hash(dr1)
end
@test !isempty(reverse(dr))
@test length(reverse(dr)) == len
Expand Down Expand Up @@ -148,6 +156,8 @@ let
@test !(l1 in dr)
@test !(f1 - pos_step in dr)
@test !(l1 + pos_step in dr)
@test dr == []
@test hash(dr) == hash([])

for (f, l) in ((f2, l2), (f3, l3), (f4, l4))
dr = f:pos_step:l
Expand All @@ -168,6 +178,8 @@ let
@test length(dr1) == len
@test findin(dr, dr) == [1:len;]
@test length([dr;]) == len
@test dr == dr1
@test hash(dr) == hash(dr1)
end
@test !isempty(reverse(dr))
@test length(reverse(dr)) == len
Expand Down Expand Up @@ -202,6 +214,8 @@ let
@test !(l1 in dr)
@test !(l1 - neg_step in dr)
@test !(l1 + neg_step in dr)
@test dr == []
@test hash(dr) == hash([])

for (f, l) in ((f2, l2), (f3, l3), (f4, l4))
dr = l:neg_step:f
Expand All @@ -222,6 +236,8 @@ let
@test length(dr1) == len
@test findin(dr, dr) == [1:len;]
@test length([dr;]) == len
@test dr == dr1
@test hash(dr) == hash(dr1)
end
@test !isempty(reverse(dr))
@test length(reverse(dr)) == len
Expand Down
4 changes: 2 additions & 2 deletions test/abstractarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -388,8 +388,8 @@ function test_vector_indexing(::Type{T}, shape, ::Type{TestAbstractArray}) where
idxs = rand(1:N, 3, 3, 3)
@test B[idxs] == A[idxs] == idxs
@test B[vec(idxs)] == A[vec(idxs)] == vec(idxs)
@test B[:] == A[:] == collect(1:N)
@test B[1:end] == A[1:end] == collect(1:N)
@test B[:] == A[:] == 1:N
@test B[1:end] == A[1:end] == 1:N
@test B[:,:,trailing2] == A[:,:,trailing2] == B[:,:,1,trailing3] == A[:,:,1,trailing3]
B[1:end,1:end,trailing2] == A[1:end,1:end,trailing2] == B[1:end,1:end,1,trailing3] == A[1:end,1:end,1,trailing3]

Expand Down
6 changes: 3 additions & 3 deletions test/arrayops.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1125,7 +1125,7 @@ end
# base case w/ Vector
a = collect(1:10)
filter!(x -> x > 5, a)
@test a == collect(6:10)
@test a == 6:10

# different subtype of AbstractVector
ba = rand(10) .> 0.5
Expand Down Expand Up @@ -1761,8 +1761,8 @@ end
for A in (reshape(collect(1:20), 4, 5),
reshape(1:20, 4, 5))
local A
@test slicedim(A, 1, 2) == collect(2:4:20)
@test slicedim(A, 2, 2) == collect(5:8)
@test slicedim(A, 1, 2) == 2:4:20
@test slicedim(A, 2, 2) == 5:8
@test_throws ArgumentError slicedim(A,0,1)
@test slicedim(A, 3, 1) == A
@test_throws BoundsError slicedim(A, 3, 2)
Expand Down
Loading