Skip to content

Commit

Permalink
Use RLE on step hash to make arrays and ranges equal
Browse files Browse the repository at this point in the history
Instead of hashing the values themselves, hash the first value and
differences between subsequent elements using run-length encoding.
This allows for O(1) hashing of ranges consistent with AbstractArrays,
which means they can now compare equal.

Elements for which the - operator is not defined are hashed directly.
This assumes that types which can be used for ranges and sparse matrices
implement -.
  • Loading branch information
nalimilan committed May 17, 2016
1 parent 807ec46 commit 73c056e
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 58 deletions.
41 changes: 0 additions & 41 deletions base/abstractarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -912,9 +912,6 @@ function isequal(A::AbstractArray, B::AbstractArray)
if size(A) != size(B)
return false
end
if isa(A,Range) != isa(B,Range)
return false
end
for (a, b) in zip(A, B)
if !isequal(a, b)
return false
Expand All @@ -935,9 +932,6 @@ function (==)(A::AbstractArray, B::AbstractArray)
if size(A) != size(B)
return false
end
if isa(A,Range) != isa(B,Range)
return false
end
for (a, b) in zip(A, B)
if !(a == b)
return false
Expand Down Expand Up @@ -1151,38 +1145,3 @@ push!(A, a, b) = push!(push!(A, a), b)
# Append several items at once: `a` and `b` go in through the two-item method,
# then the remaining items `c...` are pushed onto the result.
function push!(A, a, b, c...)
    return push!(push!(A, a, b), c...)
end

# Prepend two items. `b` is inserted first so that `a` ends up in front of it.
function unshift!(A, a, b)
    return unshift!(unshift!(A, b), a)
end

# Prepend several items: the trailing items `c...` are inserted first, then
# `a, b` are placed in front of them through the two-item method.
function unshift!(A, a, b, c...)
    return unshift!(unshift!(A, c...), a, b)
end

## hashing collections ##

# Seeds mixed into array hashes; the literal is chosen to match the width of UInt.
const hashaa_seed = UInt === UInt64 ? 0x7f53e68ceb575e76 : 0xeb575e76
const hashrle_seed = UInt == UInt64 ? 0x2aab8909bfea414c : 0xbfea414c

# Hash an array from its size and its elements in iteration order.
# A run of consecutive `isequal` elements is folded in as
# (hashrle_seed, run length, element) instead of once per element, so that
# long runs of identical values contribute a constant number of hash calls.
function hash(a::AbstractArray, h::UInt)
    h += hashaa_seed
    h += hash(size(a))

    st = start(a)
    if done(a, st)
        # Empty array: only the seed and the size contribute.
        return h
    end
    cur, st = next(a, st)
    if done(a, st)
        # Single element.
        return hash(cur, h)
    end

    prev = cur
    while !done(a, st)
        prev = cur
        cur, st = next(a, st)
        if isequal(cur, prev)
            # Start of a run: keep consuming while elements stay equal to `prev`.
            count = 2
            while !done(a, st)
                cur, st = next(a, st)
                if !isequal(prev, cur)
                    break
                end
                count += 1
            end
            h += hashrle_seed
            h = hash(count, h)
        end
        h = hash(prev, h)
    end
    # The last element read has not been hashed yet unless it closed a run.
    if !isequal(cur, prev)
        h = hash(cur, h)
    end
    return h
end
78 changes: 73 additions & 5 deletions base/hashing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,79 @@ end

hash(x::QuoteNode, h::UInt) = hash(x.value, hash(QuoteNode, h))

# hashing ranges by component at worst leads to collisions for very similar ranges
const hashr_seed = UInt === UInt64 ? 0x80707b6821b70087 : 0x21b70087
## hashing collections ##

# Seeds mixed into array hashes; the literal is chosen to match the width of UInt.
const hashaa_seed = UInt === UInt64 ? 0x7f53e68ceb575e76 : 0xeb575e76
const hashrle_seed = UInt == UInt64 ? 0x2aab8909bfea414c : 0xbfea414c

# Hash an array by choosing how two subsequent elements are turned into the
# value that gets hashed, then delegating to `_hash`:
#   * non-leaf element type: decide per pair, using the difference when `-`
#     applies to that pair and the second element otherwise;
#   * leaf element type with a `-` method: always use the difference;
#   * leaf element type without one: always use the second element.
function hash{T}(a::AbstractArray{T}, h::UInt)
    delta = if !isleaftype(T)
        (prev, cur) -> applicable(-, cur, prev) ? cur - prev : cur
    elseif method_exists(-, (T, T))
        (prev, cur) -> cur - prev
    else
        (prev, cur) -> cur
    end
    return _hash(a, h, delta)
end

# Hash the array `a` into `h`.
#
# The size and the first element are hashed directly. Every later element is
# represented by `val(previous, current)`; the caller passes a `val` that
# returns the difference `current - previous` when `-` is supported and
# `current` itself otherwise. Runs of `isequal` consecutive values of `val` are
# folded in as (hashrle_seed, run length, value), so a constant-step sequence
# needs only a fixed number of hash calls.
#
# Arguments:
#   a   - array to hash
#   h   - incoming hash state
#   val - two-argument function `(x1, x2)` giving the value to hash for `x2`
# Returns the updated hash as a UInt.
function _hash{T}(a::AbstractArray{T}, h::UInt, val::Function)
    h += hashaa_seed
    h += hash(size(a))

    state = start(a)
    done(a, state) && return h
    x1, state = next(a, state)
    # Always hash the first element
    h = hash(x1, h)
    done(a, state) && return h

    # Then hash the difference between two subsequent elements when - is supported,
    # or the elements themselves when not
    x2, state = next(a, state)
    v2 = val(x1, x2)
    done(a, state) && return hash(v2, h)

    # `v1` must exist before the loop so that it is still visible after it.
    v1 = v2
    while !done(a, state)
        x1 = x2
        x2, state = next(a, state)
        v1 = v2
        # Use the caller-supplied `val` here as well, like the other two call
        # sites, rather than re-testing `applicable` on every element.
        v2 = val(x1, x2)
        if isequal(v2, v1)
            # For repeated elements, use run length encoding
            # This allows efficient hashing of sparse arrays
            runlength = 2
            while !done(a, state)
                x1 = x2
                x2, state = next(a, state)
                v2 = val(x1, x2)
                isequal(v1, v2) || break
                runlength += 1
            end
            h += hashrle_seed
            h = hash(runlength, h)
        end
        h = hash(v1, h)
    end
    # The last value computed is still pending unless it closed a run.
    !isequal(v2, v1) && (h = hash(v2, h))
    return h
end

# hashaa_seed and hashrle_seed are defined above, with the AbstractArray hash
# Hash a range without iterating over it, giving the same result as the
# AbstractArray method gives for an array with the same elements, so that
# equal ranges and arrays hash equally.
#
# The sequence mirrors the array hash: seed and size, then the first element,
# then the step. For three or more elements the step is encoded as a single
# run of `length(r)-1` equal differences.
function hash(r::Range, h::UInt)
    h += hashaa_seed
    h += hash(size(r))

    length(r) == 0 && return h

    h = hash(first(r), h)
    length(r) == 1 && return h
    # Two elements: a single difference, hashed without run-length encoding.
    length(r) == 2 && return hash(step(r), h)

    h += hashrle_seed
    h = hash(length(r)-1, h)
    hash(step(r), h)
end
40 changes: 29 additions & 11 deletions base/sparse/sparsematrix.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3292,6 +3292,7 @@ end

# End the run and return the current hash
@inline function hashrun(val, runlength::Int, h::UInt)
# @show val, runlength
if runlength == 0
return h
elseif runlength > 1
Expand All @@ -3301,7 +3302,16 @@ end
hash(val, h)
end

function hash{T}(A::SparseMatrixCSC{T}, h::UInt)
# Fold a stretch of `runlength` zero entries that follows the value `val` into `h`.
# The step from `val` down to zero is hashed first; the remaining
# `runlength - 1` entries are handed to `hashrun` as a run of zeros.
# Nothing is hashed when there are no zeros.
@inline function hashzeros(val, runlength::Int, h::UInt)
    if runlength == 0
        return h
    end
    z = zero(val)
    return hashrun(z, runlength - 1, hash(z - val, h))
end

# hashaa_seed and hashrle_seed are defined in hashing.jl
function hashsp{T}(A::SparseMatrixCSC{T}, h::UInt)
h += Base.hashaa_seed
sz = size(A)
h += hash(sz)
Expand All @@ -3311,26 +3321,34 @@ function hash{T}(A::SparseMatrixCSC{T}, h::UInt)
nzval = A.nzval
lastidx = 0
runlength = 0
last = zero(T)
lastnz = zero(T)
lastdiff = zero(T)
@inbounds for col = 1:size(A, 2)
for j = colptr[col]:colptr[col+1]-1
nz = nzval[j]
isequal(nz, zero(T)) && continue
idx = sub2ind(sz, rowval[j], col)
if idx != lastidx+1 || !isequal(nz, lastnz) # Run is over
h = hashrun(lastnz, runlength, h) # Hash previous run
h = hashrun(0, idx-lastidx-1, h) # Hash intervening zeros

runlength = 1
lastnz = nz
else
runlength += 1
diff = nz - last
# @show col, j, nz, last, diff, lastdiff, idx, lastidx
if idx != lastidx+1 # There are zeros since the previous value
h = hashzeros(lastnz, idx-lastidx-1, h) # Hash intervening zeros
last = 0
runlength = 0
end
if !isequal(diff, lastdiff) # Run is over
h = hashrun(lastdiff, runlength, h) # Hash previous run
runlength = 0
end
runlength += 1
lastidx = idx
last = nz
lastnz = nz
lastdiff = diff
end
end
h = hashrun(lastnz, runlength, h) # Hash previous run
hashrun(0, length(A)-lastidx, h) # Hash zeros at end
h = hashrun(lastdiff, runlength, h) # Hash previous run
hashzeros(last, length(A)-lastidx, h) # Hash zeros at end
end

## Statistics
Expand Down
38 changes: 37 additions & 1 deletion test/hashing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,49 @@ vals = Any[
[], [1], [2], [1, 1], [1, 2], [1, 3], [2, 2], [1, 2, 2], [1, 3, 3],
zeros(2, 2), spzeros(2, 2), eye(2, 2), speye(2, 2),
sparse(ones(2, 2)), ones(2, 2), sparse([0 0; 1 0]), [0 0; 1 0],
[-0. 0; -0. 0.], SparseMatrixCSC(2, 2, [1, 3, 3], [1, 2], [-0., -0.])
[-0. 0; -0. 0.], SparseMatrixCSC(2, 2, [1, 3, 3], [1, 2], [-0., -0.]),
# issue #16364
1:4, 1:1:4, 1:-1:0, 1.0:4.0, 1.0:1.0:4.0, linspace(1, 4, 4),
'a':'e', ['a', 'b', 'c', 'd', 'e'],
# check that hash is still consistent with heterogeneous arrays for which - is defined
# for some pairs and not others (no element must be ignored)
["a", "b", 1, 2], ["a", 1, 2], ["a", "b", 2, 2], ["a", "a", 1, 2], ["a", "b", 2, 3]
]

for a in vals, b in vals
@test isequal(a,b) == (hash(a)==hash(b))
end

# Fixtures for the container-independence checks below: empty, one-, two- and
# three-element vectors of Int, Char and String elements, followed by integer
# vectors and matrices with zeros in varying positions.
vals = Any[
Int[], Char[], String[],
[0], [1], ['a'], ["a"],
[0, 1], ['a', 'b'], ["a", "b"],
[0, 1, 2], ['a', 'b', 'c'], ["a", "b", "c"],
# test various sparsity patterns
[0, 0], [0, 0, 0], [0, 1], [1, 0],
[0, 0, 1], [0, 1, 0], [1, 0, 0],
[0 0; 0 0], [1 0; 0 0], [0 1; 0 0], [0 0; 1 0], [0 0; 0 1],
[5 1; 0 0], [1 0; 0 1], [0 2; 3 0], [0 4; 1 2], [4 0; 0 1],
[0 0 0; 0 0 0], [1 0 0; 0 0 1], [0 0 2; 3 0 0], [0 0 7; 6 1 2], [4 0 0; 3 0 1]
]

# Each fixture must hash the same after conversion to an Array{Any}, to an array
# of the element type's supertype, and to a sparse array.
for a in vals
# check that element type does not affect hash
@test hash(convert(Array{Any}, a)) == hash(a)
@test hash(convert(Array{supertype(eltype(a))}, a)) == hash(a)
@test hash(sparse(a)) == hash(a)
end

# Range fixtures: integer, floating-point and Char ranges, including empty ones
# (e.g. 1:0), negative steps, and ranges built with linspace.
vals = Any[
1:0, 1:1, 1:2, 1:3, 1.0:0.0, 1.0:1.0:1.0, 1.0:0.5:3.0,
0:-1:1, 0.0:-1.0:1.0, -4:10, 'a':'e', 'b':'a',
linspace(1, 1, 1), linspace(1, 10, 3)
]

# A range must hash the same as the array obtained by collecting it.
for a in vals
@test hash(collect(a)) == hash(a)
end

@test hash(SubString("--hello--",3,7)) == hash("hello")
@test hash(:(X.x)) == hash(:(X.x))
@test hash(:(X.x)) != hash(:(X.y))
Expand Down

0 comments on commit 73c056e

Please sign in to comment.