Skip to content

Commit

Permalink
Use RLE on step hash to make arrays and ranges equal
Browse files Browse the repository at this point in the history
Instead of hashing the values themselves, hash the first value and
differences between subsequent elements using run-length encoding.
This allows for O(1) hashing of ranges consistent with AbstractArrays,
which means they can now compare equal.

Elements for which the - operator is not defined are hashed directly.
This assumes that types which can be used for ranges implement -.

[ci skip]
  • Loading branch information
nalimilan committed May 17, 2016
1 parent 807ec46 commit 97c3bb3
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 92 deletions.
41 changes: 0 additions & 41 deletions base/abstractarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -912,9 +912,6 @@ function isequal(A::AbstractArray, B::AbstractArray)
if size(A) != size(B)
return false
end
if isa(A,Range) != isa(B,Range)
return false
end
for (a, b) in zip(A, B)
if !isequal(a, b)
return false
Expand All @@ -935,9 +932,6 @@ function (==)(A::AbstractArray, B::AbstractArray)
if size(A) != size(B)
return false
end
if isa(A,Range) != isa(B,Range)
return false
end
for (a, b) in zip(A, B)
if !(a == b)
return false
Expand Down Expand Up @@ -1151,38 +1145,3 @@ push!(A, a, b) = push!(push!(A, a), b)
# Varargs push!: push `a` and `b` through the three-argument method, then recurse on
# the remaining items `c...`.
push!(A, a, b, c...) = push!(push!(A, a, b), c...)
# Two-item unshift!: the inner call handles `b` first, the outer call then handles `a`
# (presumably so that `a` ends up in front of `b` — confirm against unshift! semantics).
unshift!(A, a, b) = unshift!(unshift!(A, b), a)
# Varargs unshift!: the trailing items `c...` are handled first, then `a` and `b`.
unshift!(A, a, b, c...) = unshift!(unshift!(A, c...), a, b)

## hashing collections ##

# Seed added to `h` at the start of array hashing. A 64-bit literal is selected when
# UInt is UInt64, a 32-bit literal otherwise.
const hashaa_seed = UInt === UInt64 ? 0x7f53e68ceb575e76 : 0xeb575e76
# Seed added to `h` right before a run length is hashed. The type test uses `===`
# (type identity), matching the test used for `hashaa_seed`.
const hashrle_seed = UInt === UInt64 ? 0x2aab8909bfea414c : 0xbfea414c
# Hash an AbstractArray by folding its size and its elements into `h`.
# Runs of consecutive `isequal` elements are collapsed: the run-length seed is added,
# then the run length and the repeated value are hashed once.
function hash(a::AbstractArray, h::UInt)
# Mix in the array seed and the hash of the dimensions
h += hashaa_seed
h += hash(size(a))

state = start(a)
# Empty array: only the seed and the size contribute
done(a, state) && return h
x2, state = next(a, state)
# Single element: hash it directly
done(a, state) && return hash(x2, h)

x1 = x2
while !done(a, state)
# x1 is the element hashed by this iteration, x2 is the lookahead element
x1 = x2
x2, state = next(a, state)
if isequal(x2, x1)
# For repeated elements, use run length encoding
# This allows efficient hashing of sparse arrays
runlength = 2
while !done(a, state)
x2, state = next(a, state)
# Leaving via break keeps the differing element in x2 as the new lookahead
isequal(x1, x2) || break
runlength += 1
end
# Mark the run with the RLE seed, then fold in its length
h += hashrle_seed
h = hash(runlength, h)
end
h = hash(x1, h)
end
# The last lookahead element has not been hashed yet if it differs from x1
!isequal(x2, x1) && (h = hash(x2, h))
return h
end
64 changes: 59 additions & 5 deletions base/hashing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,65 @@ end

# Hash a QuoteNode through its wrapped `value`, seeded with the hash of the QuoteNode
# type combined with `h`.
hash(x::QuoteNode, h::UInt) = hash(x.value, hash(QuoteNode, h))

# hashing ranges by component at worst leads to collisions for very similar ranges
const hashr_seed = UInt === UInt64 ? 0x80707b6821b70087 : 0x21b70087
## hashing collections ##

# Seed added to `h` at the start of array hashing. A 64-bit literal is selected when
# UInt is UInt64, a 32-bit literal otherwise.
const hashaa_seed = UInt === UInt64 ? 0x7f53e68ceb575e76 : 0xeb575e76
# Seed added to `h` right before a run length is hashed. The type test uses `===`
# (type identity), matching the test used for `hashaa_seed`.
const hashrle_seed = UInt === UInt64 ? 0x2aab8909bfea414c : 0xbfea414c
# Hash an AbstractArray by folding into `h` its size, its first element, and then the
# run-length-encoded sequence of "steps" between subsequent elements. A step is
# `x2 - x1` when `-` is applicable to the pair, and the element `x2` itself otherwise.
function hash(a::AbstractArray, h::UInt)
# Mix in the array seed and the hash of the dimensions
h += hashaa_seed
h += hash(size(a))

state = start(a)
# Empty array: only the seed and the size contribute
done(a, state) && return h
x1, state = next(a, state)
# Always hash the first element
h = hash(x1, h)
done(a, state) && return h

# Then hash the difference between two subsequent elements when - is supported,
# or the elements themselves when not
# NOTE(review): `applicable` only checks that a method exists; `x2 - x1` could still
# throw for some pairs (e.g. mismatched shapes) — confirm whether that matters here.
x2, state = next(a, state)
v2 = applicable(-, x2, x1) ? x2 - x1 : x2
# Two elements: a single step, hashed without run-length encoding
done(a, state) && return hash(v2, h)

v1 = v2
while !done(a, state)
# v1 is the step hashed by this iteration, v2 is the lookahead step
x1 = x2
x2, state = next(a, state)
v1 = v2
v2 = applicable(-, x2, x1) ? x2 - x1 : x2
if isequal(v2, v1)
# For repeated steps, use run length encoding
# This allows efficient hashing of sparse arrays
runlength = 2
while !done(a, state)
x1 = x2
x2, state = next(a, state)
v2 = applicable(-, x2, x1) ? x2 - x1 : x2
# Leaving via break keeps the differing step in v2 as the new lookahead
isequal(v1, v2) || break
runlength += 1
end
# Mark the run with the RLE seed, then fold in its length
h += hashrle_seed
h = hash(runlength, h)
end
h = hash(v1, h)
end
# The last lookahead step has not been hashed yet if it differs from v1
!isequal(v2, v1) && (h = hash(v2, h))
return h
end

# Hash a Range without iterating over its elements, using the same scheme as
# hash(::AbstractArray, ::UInt) so that a range and an array holding the same values
# produce the same hash:
#   * the array seed and the hash of the size are added to `h`;
#   * the first element is hashed;
#   * the differences between subsequent elements are hashed with run-length encoding.
# All differences are equal to `step(r)`, so they form a single run of
# `length(r) - 1` repetitions.
function hash(r::Range, h::UInt)
    h += hashaa_seed
    h += hash(size(r))

    n = length(r)
    # Empty range: only the seed and the size contribute
    n == 0 && return h

    h = hash(first(r), h)
    # One element: there is no difference to hash
    n == 1 && return h

    # Two elements: a single difference, hashed without a run-length marker
    n == 2 && return hash(step(r), h)

    # Three or more elements: one run of n - 1 equal differences
    h += hashrle_seed
    h = hash(n - 1, h)
    return hash(step(r), h)
end
45 changes: 0 additions & 45 deletions base/sparse/sparsematrix.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3288,51 +3288,6 @@ function rotl90(A::SparseMatrixCSC)
return sparse(J, I, V, n, m)
end

## hashing

# Close a run of `runlength` copies of `val` and fold it into the hash `h`.
# An empty run leaves `h` untouched; a run longer than one element is prefixed with
# the run-length seed and its length before the value itself is hashed.
@inline function hashrun(val, runlength::Int, h::UInt)
    runlength == 0 && return h
    if runlength > 1
        h += Base.hashrle_seed
        h = hash(runlength, h)
    end
    return hash(val, h)
end

# Hash a SparseMatrixCSC by walking its stored entries column by column and
# run-length encoding, via `hashrun`, both the runs of equal adjacent stored values
# and the gaps between stored entries (hashed as runs of 0).
function hash{T}(A::SparseMatrixCSC{T}, h::UInt)
# Mix in the array seed and the hash of the dimensions
h += Base.hashaa_seed
sz = size(A)
h += hash(sz)

colptr = A.colptr
rowval = A.rowval
nzval = A.nzval
# lastidx: linear index of the last entry taken into account (0 before the first one)
lastidx = 0
# runlength / lastnz: length and value of the run currently being accumulated
runlength = 0
lastnz = zero(T)
@inbounds for col = 1:size(A, 2)
for j = colptr[col]:colptr[col+1]-1
nz = nzval[j]
# Stored entries equal to zero(T) are skipped, so they end up counted in a gap
isequal(nz, zero(T)) && continue
idx = sub2ind(sz, rowval[j], col)
# A run continues only if this entry is adjacent to the previous one and equal to it
if idx != lastidx+1 || !isequal(nz, lastnz) # Run is over
h = hashrun(lastnz, runlength, h) # Hash previous run
h = hashrun(0, idx-lastidx-1, h) # Hash intervening zeros

runlength = 1
lastnz = nz
else
runlength += 1
end
lastidx = idx
end
end
h = hashrun(lastnz, runlength, h) # Hash previous run
hashrun(0, length(A)-lastidx, h) # Hash zeros at end
end

## Statistics

# This is the function that does the reduction underlying var/std
Expand Down
9 changes: 8 additions & 1 deletion test/hashing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,20 @@ vals = Any[
Dict(42 => 101, 77 => 93), Dict{Any,Any}(42 => 101, 77 => 93),
(1,2,3,4), (1.0,2.0,3.0,4.0), (1,3,2,4),
("a","b"), (SubString("a",1,1), SubString("b",1,1)),
['a', 'b', 'c', 'd', 'e'],
# issue #6900
[x => x for x in 1:10],
Dict(7=>7,9=>9,4=>4,10=>10,2=>2,3=>3,8=>8,5=>5,6=>6,1=>1),
[], [1], [2], [1, 1], [1, 2], [1, 3], [2, 2], [1, 2, 2], [1, 3, 3],
zeros(2, 2), spzeros(2, 2), eye(2, 2), speye(2, 2),
sparse(ones(2, 2)), ones(2, 2), sparse([0 0; 1 0]), [0 0; 1 0],
[-0. 0; -0. 0.], SparseMatrixCSC(2, 2, [1, 3, 3], [1, 2], [-0., -0.])
[-0. 0; -0. 0.], SparseMatrixCSC(2, 2, [1, 3, 3], [1, 2], [-0., -0.]),
# issue #16364
1:4, 1:1:4, 1:-1:0, 1.0:4.0, 1.0:1.0:4.0, 'a':'e',
linspace(1, 3, 10), collect(linspace(1, 3, 10)),
# check that hash is still consistent with heterogeneous arrays for which - is defined
# for some pairs and not others (no element must be ignored)
["a", "b", 1, 2], ["a", 1, 2], ["a", "b", 2, 2], ["a", "a", 1, 2], ["a", "b", 2, 3]
]

for a in vals, b in vals
Expand Down

0 comments on commit 97c3bb3

Please sign in to comment.