Skip to content

Commit

Permalink
Use RLE on step hash to make arrays and ranges equal
Browse files Browse the repository at this point in the history
Instead of hashing the values themselves, hash the first value and
differences between subsequent elements using run-length encoding.
This allows for O(1) hashing of ranges consistent with AbstractArrays,
which means they can now compare equal.

Elements for which the - operator is not defined are hashed directly.
This assumes that types which can be used for ranges and sparse matrices
implement -.
  • Loading branch information
nalimilan committed May 17, 2016
1 parent 807ec46 commit 0e6e2ce
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 54 deletions.
41 changes: 0 additions & 41 deletions base/abstractarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -912,9 +912,6 @@ function isequal(A::AbstractArray, B::AbstractArray)
if size(A) != size(B)
return false
end
if isa(A,Range) != isa(B,Range)
return false
end
for (a, b) in zip(A, B)
if !isequal(a, b)
return false
Expand All @@ -935,9 +932,6 @@ function (==)(A::AbstractArray, B::AbstractArray)
if size(A) != size(B)
return false
end
if isa(A,Range) != isa(B,Range)
return false
end
for (a, b) in zip(A, B)
if !(a == b)
return false
Expand Down Expand Up @@ -1151,38 +1145,3 @@ push!(A, a, b) = push!(push!(A, a), b)
# Append three or more items: the first two go through the two-item method, and the
# collection that call returns receives the remaining items.
function push!(A, a, b, c...)
    grown = push!(A, a, b)
    return push!(grown, c...)
end

# Prepend two items: `b` is inserted first and `a` afterwards, so with single-item
# `unshift!` inserting at the front, `a` ends up ahead of `b`.
function unshift!(A, a, b)
    withb = unshift!(A, b)
    return unshift!(withb, a)
end

# Prepend three or more items: the trailing items are inserted first, then `a` and `b`
# are inserted ahead of them through the two-item method.
function unshift!(A, a, b, c...)
    tail = unshift!(A, c...)
    return unshift!(tail, a, b)
end

## hashing collections ##

# Seed added to the running hash of every AbstractArray before its size and contents.
# The 64-bit literal is used when UInt is UInt64, the 32-bit literal otherwise.
const hashaa_seed = UInt === UInt64 ? 0x7f53e68ceb575e76 : 0xeb575e76
# Seed added to the running hash right before a run length is hashed, marking a
# run-length-encoded stretch of repeated values.
const hashrle_seed = UInt === UInt64 ? 0x2aab8909bfea414c : 0xbfea414c
# Hash an array by mixing hashaa_seed and the hash of its size into h, then hashing
# the elements in iteration order. A run of two or more isequal elements is encoded
# as hashrle_seed + run length + one copy of the element.
#   a - array to hash, traversed with start/next/done
#   h - running hash value to mix into
# Returns the updated hash.
function hash(a::AbstractArray, h::UInt)
h += hashaa_seed
h += hash(size(a))

state = start(a)
# Empty array: only the seed and the size contribute
done(a, state) && return h
x2, state = next(a, state)
# Single element: hash it directly
done(a, state) && return hash(x2, h)

# x1 is read again after the loop, so it is assigned here first
x1 = x2
while !done(a, state)
# x1 is the element about to be hashed, x2 the one following it
x1 = x2
x2, state = next(a, state)
if isequal(x2, x1)
# For repeated elements, use run length encoding
# This allows efficient hashing of sparse arrays
runlength = 2
while !done(a, state)
x2, state = next(a, state)
# On break, x2 holds the first element after the run and is not yet hashed
isequal(x1, x2) || break
runlength += 1
end
h += hashrle_seed
h = hash(runlength, h)
end
# Hash x1 once, whether it stood alone or headed a run
h = hash(x1, h)
end
# If the last element read differs from x1 it has not been hashed yet
!isequal(x2, x1) && (h = hash(x2, h))
return h
end
78 changes: 73 additions & 5 deletions base/hashing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,79 @@ end

hash(x::QuoteNode, h::UInt) = hash(x.value, hash(QuoteNode, h))

# hashing ranges by component at worst leads to collisions for very similar ranges
const hashr_seed = UInt === UInt64 ? 0x80707b6821b70087 : 0x21b70087
## hashing collections ##

# Seed added to the running hash of every AbstractArray (and Range) before its size
# and contents. The 64-bit literal is used when UInt is UInt64, the 32-bit one otherwise.
const hashaa_seed = UInt === UInt64 ? 0x7f53e68ceb575e76 : 0xeb575e76
# Seed added to the running hash right before a run length is hashed, marking a
# run-length-encoded stretch of repeated differences.
const hashrle_seed = UInt === UInt64 ? 0x2aab8909bfea414c : 0xbfea414c
# Hash an array through `_hash`, choosing how the value for each pair of consecutive
# elements is computed:
#   * non-leaf element type: decided per pair, `cur - prev` when `-` is applicable to
#     the two elements and `cur` itself otherwise;
#   * leaf element type with a `-(T, T)` method: always `cur - prev`;
#   * leaf element type without such a method: always `cur`.
function hash{T}(a::AbstractArray{T}, h::UInt)
    stepof = if !isleaftype(T)
        (prev, cur) -> applicable(-, cur, prev) ? cur - prev : cur
    elseif method_exists(-, (T, T))
        (prev, cur) -> cur - prev
    else
        (prev, cur) -> cur
    end

    return _hash(a, h, stepof)
end

# Core array hashing routine. Mixes hashaa_seed and the hash of size(a) into h, hashes
# the first element, then hashes the sequence of values val(previous, current) computed
# for each later element. A run of two or more isequal values is encoded as
# hashrle_seed + run length + one copy of the value.
#   a   - array to hash, traversed with start/next/done
#   h   - running hash value to mix into
#   val - two-argument function giving the value hashed for a pair of consecutive elements
# Returns the updated hash.
function _hash{T}(a::AbstractArray{T}, h::UInt, val::Function)
h += hashaa_seed
h += hash(size(a))

state = start(a)
# Empty array: only the seed and the size contribute
done(a, state) && return h
x1, state = next(a, state)
# Always hash the first element
h = hash(x1, h)
done(a, state) && return h

# Then hash the difference between two subsequent elements when - is supported,
# or the elements themselves when not
x2, state = next(a, state)
v2 = val(x1, x2)
# Two elements: a single value, hashed directly without run encoding
done(a, state) && return hash(v2, h)

# v1 is read again after the loop, so it is assigned here first
v1 = v2
while !done(a, state)
x1 = x2
x2, state = next(a, state)
# v1 is the value about to be hashed, v2 the one following it
v1 = v2
v2 = val(x1, x2)
if isequal(v2, v1)
# For repeated elements, use run length encoding
# This allows efficient hashing of sparse arrays
runlength = 2
while !done(a, state)
x1 = x2
x2, state = next(a, state)
v2 = val(x1, x2)
# On break, v2 holds the first value after the run and is not yet hashed
isequal(v1, v2) || break
runlength += 1
end
h += hashrle_seed
h = hash(runlength, h)
end
# Hash v1 once, whether it stood alone or headed a run
h = hash(v1, h)
end
# If the last value computed differs from v1 it has not been hashed yet
!isequal(v2, v1) && (h = hash(v2, h))
return h
end

# hashaa_seed and hashrle_seed are defined above in this file (hashing.jl)
# Hash a range in O(1), producing the same value `_hash` gives for an array holding
# the same elements: seed and size first, then the first element, then the constant
# difference between consecutive elements encoded as a single run.
#   r - range to hash
#   h - running hash value to mix into
# Returns the updated hash.
# NOTE(review): relies on `step(r)` being `isequal` to the difference of every pair of
# consecutive elements; confirm this for floating-point ranges, where rounding could
# make the differences of the collected elements vary.
function hash(r::Range, h::UInt)
    h += hashaa_seed
    h += hash(size(r))

    n = length(r)
    # Empty range: only the seed and the size contribute
    n == 0 && return h

    h = hash(first(r), h)
    n == 1 && return h
    # Two elements: a single difference, hashed directly without a run marker
    n == 2 && return hash(step(r), h)

    # Three or more elements: the n-1 identical differences form one run
    h += hashrle_seed
    h = hash(n - 1, h)
    return hash(step(r), h)
end
28 changes: 21 additions & 7 deletions base/sparse/sparsematrix.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3301,6 +3301,14 @@ end
hash(val, h)
end

# Hash a stretch of `runlength` zero entries that follows the entry `val`: first the
# step from `val` down to zero, then the remaining `runlength - 1` entries as a run of
# zero values via `hashrun`. A stretch of length 0 leaves `h` untouched.
@inline function hashzeros(val, runlength::Int, h::UInt)
    if runlength == 0
        return h
    end
    z = zero(val)
    return hashrun(z, runlength - 1, hash(z - val, h))
end

# hashaa_seed and hashrle_seed are defined in hashing.jl
function hash{T}(A::SparseMatrixCSC{T}, h::UInt)
h += Base.hashaa_seed
sz = size(A)
Expand All @@ -3311,26 +3319,32 @@ function hash{T}(A::SparseMatrixCSC{T}, h::UInt)
nzval = A.nzval
lastidx = 0
runlength = 0
lastrunlength = 0
last = zero(T)
lastnz = zero(T)
lastdiff = zero(T)
@inbounds for col = 1:size(A, 2)
for j = colptr[col]:colptr[col+1]-1
nz = nzval[j]
isequal(nz, zero(T)) && continue
idx = sub2ind(sz, rowval[j], col)
if idx != lastidx+1 || !isequal(nz, lastnz) # Run is over
h = hashrun(lastnz, runlength, h) # Hash previous run
h = hashrun(0, idx-lastidx-1, h) # Hash intervening zeros

idx != lastidx+1 && (last = 0) # There have been zeros since the previous value
diff = nz - last
if idx != lastidx+1 || !isequal(diff, lastdiff) # Run is over
h = hashrun(lastdiff, runlength, h) # Hash previous run
h = hashzeros(lastnz, idx-lastidx-1, h) # Hash intervening zeros
runlength = 1
lastnz = nz
else
runlength += 1
end
lastidx = idx
last = nz
lastnz = nz
lastdiff = diff
end
end
h = hashrun(lastnz, runlength, h) # Hash previous run
hashrun(0, length(A)-lastidx, h) # Hash zeros at end
h = hashrun(lastdiff, runlength, h) # Hash previous run
hashzeros(last, length(A)-lastidx, h) # Hash zeros at end
end

## Statistics
Expand Down
38 changes: 37 additions & 1 deletion test/hashing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,49 @@ vals = Any[
[], [1], [2], [1, 1], [1, 2], [1, 3], [2, 2], [1, 2, 2], [1, 3, 3],
zeros(2, 2), spzeros(2, 2), eye(2, 2), speye(2, 2),
sparse(ones(2, 2)), ones(2, 2), sparse([0 0; 1 0]), [0 0; 1 0],
[-0. 0; -0. 0.], SparseMatrixCSC(2, 2, [1, 3, 3], [1, 2], [-0., -0.])
# [-0. 0; -0. 0.], SparseMatrixCSC(2, 2, [1, 3, 3], [1, 2], [-0., -0.]),
# issue #16364
1:4, 1:1:4, 1:-1:0, 1.0:4.0, 1.0:1.0:4.0, linspace(1, 4, 4),
'a':'e', ['a', 'b', 'c', 'd', 'e'],
# check that hash is still consistent with heterogeneous arrays for which - is defined
# for some pairs and not others (no element must be ignored)
["a", "b", 1, 2], ["a", 1, 2], ["a", "b", 2, 2], ["a", "a", 1, 2], ["a", "b", 2, 3]
]

# Hash equality must coincide exactly with isequal for every ordered pair in vals:
# isequal values must share a hash, and values that are not isequal must not collide.
for a in vals, b in vals
@test isequal(a,b) == (hash(a)==hash(b))
end

# Arrays of integers, characters and strings, including empty ones, plus a variety of
# zero/nonzero layouts in one and two dimensions.
vals = Any[
Int[], Char[], String[],
[0], [1], ['a'], ["a"],
[0, 1], ['a', 'b'], ["a", "b"],
[0, 1, 2], ['a', 'b', 'c'], ["a", "b", "c"],
# test various sparsity patterns
[0, 0], [0, 0, 0], [0, 1], [1, 0],
[0, 0, 1], [0, 1, 0], [1, 0, 0],
[0 0; 0 0], [1 0; 0 0], [0 1; 0 0], [0 0; 1 0], [0 0; 0 1],
[5 1; 0 0], [1 0; 0 1], [0 2; 3 0], [0 4; 1 2], [4 0; 0 1],
[0 0 0; 0 0 0], [1 0 0; 0 0 1], [0 0 2; 3 0 0], [0 0 7; 6 1 2], [4 0 0; 3 0 1]
]

for a in vals
# check that element type does not affect hash
@test hash(convert(Array{Any}, a)) == hash(a)
@test hash(convert(Array{supertype(eltype(a))}, a)) == hash(a)
# check that the sparse representation hashes like the original array
@test hash(sparse(a)) == hash(a)
end

# Ranges over integers, floats and characters, with positive and negative steps,
# including empty and one-element ranges.
vals = Any[
1:0, 1:1, 1:2, 1:3, 1.0:0.0, 1.0:1.0:1.0, 1.0:0.5:3.0,
0:-1:1, 0.0:-1.0:1.0, -4:10, 'a':'e', 'b':'a',
linspace(1, 1, 1), linspace(1, 10, 3)
]

# A range must hash the same as the array obtained by collecting it
for a in vals
@test hash(collect(a)) == hash(a)
end

@test hash(SubString("--hello--",3,7)) == hash("hello")
@test hash(:(X.x)) == hash(:(X.x))
@test hash(:(X.x)) != hash(:(X.y))
Expand Down

0 comments on commit 0e6e2ce

Please sign in to comment.