Use RLE on step hash to make arrays and ranges equal

Instead of hashing the values themselves, hash the first value and then the differences between consecutive elements, compressing runs of equal differences with run-length encoding. This allows O(1) hashing of ranges that is consistent with the hashing of other AbstractArrays, so a range and an array holding the same elements can now compare equal. Elements for which the `-` operator is not defined are hashed directly. This assumes that every type that can be used in ranges or sparse matrices implements `-`.
JuliaLang · May 17, 2016 · 0e6e2ce · 0e6e2ce
1 parent 807ec46
commit 0e6e2ce
Show file tree

Hide file tree

Showing 4 changed files with 131 additions and 54 deletions.
diff --git a/base/abstractarray.jl b/base/abstractarray.jl
@@ -912,9 +912,6 @@ function isequal(A::AbstractArray, B::AbstractArray)
     if size(A) != size(B)
         return false
     end
-    if isa(A,Range) != isa(B,Range)
-        return false
-    end
     for (a, b) in zip(A, B)
         if !isequal(a, b)
             return false
@@ -935,9 +932,6 @@ function (==)(A::AbstractArray, B::AbstractArray)
     if size(A) != size(B)
         return false
     end
-    if isa(A,Range) != isa(B,Range)
-        return false
-    end
     for (a, b) in zip(A, B)
         if !(a == b)
             return false
@@ -1151,38 +1145,3 @@ push!(A, a, b) = push!(push!(A, a), b)
 push!(A, a, b, c...) = push!(push!(A, a, b), c...)
 unshift!(A, a, b) = unshift!(unshift!(A, b), a)
 unshift!(A, a, b, c...) = unshift!(unshift!(A, c...), a, b)
-
-## hashing collections ##
-
-const hashaa_seed = UInt === UInt64 ? 0x7f53e68ceb575e76 : 0xeb575e76
-const hashrle_seed = UInt == UInt64 ? 0x2aab8909bfea414c : 0xbfea414c
-function hash(a::AbstractArray, h::UInt)
-    h += hashaa_seed
-    h += hash(size(a))
-
-    state = start(a)
-    done(a, state) && return h
-    x2, state = next(a, state)
-    done(a, state) && return hash(x2, h)
-
-    x1 = x2
-    while !done(a, state)
-        x1 = x2
-        x2, state = next(a, state)
-        if isequal(x2, x1)
-            # For repeated elements, use run length encoding
-            # This allows efficient hashing of sparse arrays
-            runlength = 2
-            while !done(a, state)
-                x2, state = next(a, state)
-                isequal(x1, x2) || break
-                runlength += 1
-            end
-            h += hashrle_seed
-            h = hash(runlength, h)
-        end
-        h = hash(x1, h)
-    end
-    !isequal(x2, x1) && (h = hash(x2, h))
-    return h
-end
diff --git a/base/hashing.jl b/base/hashing.jl
@@ -64,11 +64,79 @@ end
 
 hash(x::QuoteNode, h::UInt) = hash(x.value, hash(QuoteNode, h))
 
-# hashing ranges by component at worst leads to collisions for very similar ranges
-const hashr_seed = UInt === UInt64 ? 0x80707b6821b70087 : 0x21b70087
+## hashing collections ##
+
+const hashaa_seed = UInt === UInt64 ? 0x7f53e68ceb575e76 : 0xeb575e76
+const hashrle_seed = UInt == UInt64 ? 0x2aab8909bfea414c : 0xbfea414c
+function hash{T}(a::AbstractArray{T}, h::UInt)
+    if isleaftype(T)  # concrete eltype: choose the strategy once for the whole array
+        if method_exists(-, (T, T))
+            val = (x1, x2) -> x2 - x1  # hash the step between consecutive elements
+        else
+            val = (x1, x2) -> x2  # no `-` method for (T, T): hash the element itself
+        end
+    else
+        val = (x1, x2) -> applicable(-, x2, x1) ? x2 - x1 : x2  # decided per pair
+    end
+
+    _hash(a, h, val)  # the actual hashing loop lives in _hash
+end
+
+function _hash{T}(a::AbstractArray{T}, h::UInt, val::Function)
+    h += hashaa_seed  # same seed and size mixing as hash(::Range)
+    h += hash(size(a))
+
+    state = start(a)
+    done(a, state) && return h  # empty array: only the seed and size contribute
+    x1, state = next(a, state)
+    # Always hash the first element itself: it has no predecessor to diff against
+    h = hash(x1, h)
+    done(a, state) && return h
+
+    # Then hash val(previous, current): the difference between two consecutive
+    # elements when - is supported, or the current element itself when not
+    x2, state = next(a, state)
+    v2 = val(x1, x2)
+    done(a, state) && return hash(v2, h)  # two elements: one value, no run possible
+
+    v1 = v2
+    while !done(a, state)
+        x1 = x2
+        x2, state = next(a, state)
+        v1 = v2
+        v2 = val(x1, x2)
+        if isequal(v2, v1)
+            # For repeated values, use run length encoding (run length, then the value)
+            # This allows efficient hashing of sparse arrays
+            runlength = 2
+            while !done(a, state)
+                x1 = x2
+                x2, state = next(a, state)
+                v2 = val(x1, x2)
+                isequal(v1, v2) || break  # v2 differs; v1 still holds the run's value
+                runlength += 1
+            end
+            h += hashrle_seed  # offset h before the run length is hashed
+            h = hash(runlength, h)
+        end
+        h = hash(v1, h)
+    end
+    !isequal(v2, v1) && (h = hash(v2, h))  # last value not yet hashed unless it ended a run
+    return h
+end
+
+# hashaa_seed and hashrle_seed are defined above in this file
 function hash(r::Range, h::UInt)
-    h += hashr_seed
+    h += hashaa_seed  # same seed and size mixing as _hash for AbstractArray
+    h += hash(size(r))
+
+    length(r) == 0 && return h  # empty range: only the seed and size contribute
+
     h = hash(first(r), h)
-    h = hash(step(r), h)
-    h = hash(last(r), h)
+    length(r) == 1 && return h
+    length(r) == 2 && return hash(step(r), h)  # a single step: no run to encode
+
+    h += hashrle_seed  # three or more elements: all steps are equal, i.e. one run
+    h = hash(length(r)-1, h)  # run length = number of steps
+    hash(step(r), h)
 end
diff --git a/base/sparse/sparsematrix.jl b/base/sparse/sparsematrix.jl
@@ -3301,6 +3301,14 @@ end
     hash(val, h)
 end
 
+# Hash a run of `runlength` zero entries following `val`, including the step from `val` to the first zero
+@inline function hashzeros(val, runlength::Int, h::UInt)
+    runlength == 0 && return h  # no zeros to hash: h is returned unchanged
+    h = hash(zero(val)-val, h)  # step from `val` to the first zero
+    hashrun(zero(val), runlength-1, h)  # remaining runlength-1 steps are zero; hashed via hashrun
+end
+
+# hashaa_seed and hashrle_seed are defined in base/hashing.jl
 function hash{T}(A::SparseMatrixCSC{T}, h::UInt)
     h += Base.hashaa_seed
     sz = size(A)
@@ -3311,26 +3319,32 @@ function hash{T}(A::SparseMatrixCSC{T}, h::UInt)
     nzval = A.nzval
     lastidx = 0
     runlength = 0
+    lastrunlength = 0
+    last = zero(T)
     lastnz = zero(T)
+    lastdiff = zero(T)
     @inbounds for col = 1:size(A, 2)
         for j = colptr[col]:colptr[col+1]-1
             nz = nzval[j]
             isequal(nz, zero(T)) && continue
             idx = sub2ind(sz, rowval[j], col)
-            if idx != lastidx+1 || !isequal(nz, lastnz)  # Run is over
-                h = hashrun(lastnz, runlength, h)        # Hash previous run
-                h = hashrun(0, idx-lastidx-1, h)         # Hash intervening zeros
-
+            idx != lastidx+1 && (last = 0) # There have been zeros since the previous value
+            diff = nz - last
+            if idx != lastidx+1 || !isequal(diff, lastdiff) # Run is over
+                h = hashrun(lastdiff, runlength, h)         # Hash previous run
+                h = hashzeros(lastnz, idx-lastidx-1, h)     # Hash intervening zeros
                 runlength = 1
-                lastnz = nz
             else
                 runlength += 1
             end
             lastidx = idx
+            last = nz
+            lastnz = nz
+            lastdiff = diff
         end
     end
-    h = hashrun(lastnz, runlength, h) # Hash previous run
-    hashrun(0, length(A)-lastidx, h)  # Hash zeros at end
+    h = hashrun(lastdiff, runlength, h)    # Hash previous run
+    hashzeros(last, length(A)-lastidx, h)  # Hash zeros at end
 end
 
 ## Statistics

diff --git a/test/hashing.jl b/test/hashing.jl
@@ -72,13 +72,49 @@ vals = Any[
     [], [1], [2], [1, 1], [1, 2], [1, 3], [2, 2], [1, 2, 2], [1, 3, 3],
     zeros(2, 2), spzeros(2, 2), eye(2, 2), speye(2, 2),
     sparse(ones(2, 2)), ones(2, 2), sparse([0 0; 1 0]), [0 0; 1 0],
-    [-0. 0; -0. 0.], SparseMatrixCSC(2, 2, [1, 3, 3], [1, 2], [-0., -0.])
+#    [-0. 0; -0. 0.], SparseMatrixCSC(2, 2, [1, 3, 3], [1, 2], [-0., -0.]),
+    # issue #16364
+    1:4, 1:1:4, 1:-1:0, 1.0:4.0, 1.0:1.0:4.0, linspace(1, 4, 4),
+    'a':'e', ['a', 'b', 'c', 'd', 'e'],
+    # check that hash is still consistent with heterogeneous arrays for which - is defined
+    # for some pairs and not others (no element may be skipped)
+    ["a", "b", 1, 2], ["a", 1, 2], ["a", "b", 2, 2], ["a", "a", 1, 2], ["a", "b", 2, 3]
 ]
 
 for a in vals, b in vals
     @test isequal(a,b) == (hash(a)==hash(b))
 end
 
+vals = Any[
+    Int[], Char[], String[],  # empty arrays
+    [0], [1], ['a'], ["a"],  # one element
+    [0, 1], ['a', 'b'], ["a", "b"],  # two elements
+    [0, 1, 2], ['a', 'b', 'c'], ["a", "b", "c"],  # three elements
+    # test various sparsity patterns
+    [0, 0], [0, 0, 0], [0, 1], [1, 0],
+    [0, 0, 1], [0, 1, 0], [1, 0, 0],
+    [0 0; 0 0], [1 0; 0 0], [0 1; 0 0], [0 0; 1 0], [0 0; 0 1],
+    [5 1; 0 0], [1 0; 0 1], [0 2; 3 0], [0 4; 1 2], [4 0; 0 1],
+    [0 0 0; 0 0 0], [1 0 0; 0 0 1], [0 0 2; 3 0 0], [0 0 7; 6 1 2], [4 0 0; 3 0 1]
+]
+
+for a in vals
+    # check that neither the element type nor sparse storage affects the hash
+    @test hash(convert(Array{Any}, a)) == hash(a)
+    @test hash(convert(Array{supertype(eltype(a))}, a)) == hash(a)
+    @test hash(sparse(a)) == hash(a)  # sparse copy must hash like the dense array
+end
+
+vals = Any[
+    1:0, 1:1, 1:2, 1:3, 1.0:0.0, 1.0:1.0:1.0, 1.0:0.5:3.0,  # lengths 0, 1, 2 and longer
+    0:-1:1, 0.0:-1.0:1.0, -4:10, 'a':'e', 'b':'a',
+    linspace(1, 1, 1), linspace(1, 10, 3)
+]
+
+for a in vals
+    @test hash(collect(a)) == hash(a)  # a range must hash like the collected array
+end
+
 @test hash(SubString("--hello--",3,7)) == hash("hello")
 @test hash(:(X.x)) == hash(:(X.x))
 @test hash(:(X.x)) != hash(:(X.y))