From 3b250c73dc686cb3857067582f47be336ac788ef Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 19 Dec 2023 07:55:04 -0500 Subject: [PATCH] fix isequal_normalized for combining-char reordering (#52447) Fixes #52408. (Note that this function was added in Julia 1.8, in #42493.) In the future it would be good to further optimize this function by adding a fast path for the common case of strings that are mostly ASCII characters. Perhaps simply skip ahead to the first byte that doesn't match before we begin doing decomposition etcetera. --- stdlib/Unicode/src/Unicode.jl | 81 ++++++++++++++++++++++++++++----- stdlib/Unicode/test/runtests.jl | 67 +++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 11 deletions(-) diff --git a/stdlib/Unicode/src/Unicode.jl b/stdlib/Unicode/src/Unicode.jl index bd54037ec9e6f..9d86f6f9cbe7b 100644 --- a/stdlib/Unicode/src/Unicode.jl +++ b/stdlib/Unicode/src/Unicode.jl @@ -208,12 +208,19 @@ end using Base.Unicode: utf8proc_error, UTF8PROC_DECOMPOSE, UTF8PROC_CASEFOLD, UTF8PROC_STRIPMARK -function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, options::Integer) - ret = @ccall utf8proc_decompose_char(codepoint::UInt32, dest::Ptr{UInt32}, length(dest)::Int, options::Cint, C_NULL::Ptr{Cint})::Int +function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, offset::Integer, options::Integer) + ret = GC.@preserve dest @ccall utf8proc_decompose_char(codepoint::UInt32, pointer(dest, 1+offset)::Ptr{UInt32}, (length(dest)-offset)::Int, options::Cint, C_NULL::Ptr{Cint})::Int ret < 0 && utf8proc_error(ret) return ret end +# would be good to have higher-level accessor functions in utf8proc. alternatively, +# we could mirror the whole utf8proc_property_t struct in Julia, but that is annoying +# because of the bitfields. +combining_class(uc::Integer) = + 0x000301 ≤ uc ≤ 0x10ffff ? unsafe_load(ccall(:utf8proc_get_property, Ptr{UInt16}, (UInt32,), uc), 2) : 0x0000 +combining_class(c::AbstractChar) = ismalformed(c) ? 0x0000 : combining_class(UInt32(c)) + """ isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity) @@ -225,6 +232,9 @@ As with [`Unicode.normalize`](@ref), you can also pass an arbitrary function via the `chartransform` keyword (mapping `Integer` codepoints to codepoints) to perform custom normalizations, such as [`Unicode.julia_chartransform`](@ref). +!!! compat "Julia 1.8" + The `isequal_normalized` function was added in Julia 1.8. + # Examples For example, the string `"noël"` can be constructed in two canonically equivalent ways @@ -251,29 +261,78 @@ julia> isequal_normalized(s1, "NOËL", casefold=true) true ``` """ -function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity) - function decompose_next_char!(c, state, d, options, s) - n = _decompose_char!(c, d, options) - if n > length(d) # may be possible in future Unicode versions? - n = _decompose_char!(c, resize!(d, n), options) +isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity) = + _isequal_normalized!(s1, s2, Vector{UInt32}(undef, 4), Vector{UInt32}(undef, 4), chartransform; casefold, stripmark) + +# like isequal_normalized, but takes pre-allocated codepoint buffers as arguments, and chartransform is a positional argument +function _isequal_normalized!(s1::AbstractString, s2::AbstractString, + d1::Vector{UInt32}, d2::Vector{UInt32}, chartransform::F=identity; + casefold::Bool=false, stripmark::Bool=false) where {F} + function decompose_next_chars!(state, d, options, s) + local n + offset = 0 + @inbounds while true + # read a char and decompose it to d + c = chartransform(UInt32(state[1])) + state = iterate(s, state[2]) + if c < 0x80 # fast path for common ASCII case + n = 1 + offset + n > length(d) && resize!(d, 2n) + d[n] = casefold ? (0x41 ≤ c ≤ 0x5A ? c+0x20 : c) : c + break # ASCII characters are all zero combining class + else + while true + n = _decompose_char!(c, d, offset, options) + offset + if n > length(d) + resize!(d, 2n) + continue + end + break + end + end + + # decomposed chars must be sorted in ascending order of combining class, + # which means we need to keep fetching chars until we get to non-combining + (iszero(combining_class(d[n])) || isnothing(state)) && break # non-combining + offset = n end - return 1, n, iterate(s, state) + + # sort by combining class + if n < 32 # almost always true + for j1 = 2:n # insertion sort + cc = combining_class(d[j1]) + iszero(cc) && continue # don't re-order non-combiners + for j2 = j1:-1:2 + combining_class(d[j2-1]) ≤ cc && break + d[j2-1], d[j2] = d[j2], d[j2-1] + end + end + else # avoid n^2 complexity in crazy large-n case + j = 1 + @views while j < n + j₀ = j + something(findnext(iszero ∘ combining_class, d[j+1:n], 1), n+1-j) + sort!(d[j:j₀-1], by=combining_class) + j = j₀ + end + end + + # split return statement to help type inference: + return state === nothing ? (1, n, nothing) : (1, n, state) end options = UTF8PROC_DECOMPOSE casefold && (options |= UTF8PROC_CASEFOLD) stripmark && (options |= UTF8PROC_STRIPMARK) i1,i2 = iterate(s1),iterate(s2) - d1,d2 = Vector{UInt32}(undef, 4), Vector{UInt32}(undef, 4) # codepoint buffers n1 = n2 = 0 # lengths of codepoint buffers j1 = j2 = 1 # indices in d1, d2 while true if j1 > n1 i1 === nothing && return i2 === nothing && j2 > n2 - j1, n1, i1 = decompose_next_char!(chartransform(UInt32(i1[1])), i1[2], d1, options, s1) + j1, n1, i1 = decompose_next_chars!(i1, d1, options, s1) end if j2 > n2 i2 === nothing && return false - j2, n2, i2 = decompose_next_char!(chartransform(UInt32(i2[1])), i2[2], d2, options, s2) + j2, n2, i2 = decompose_next_chars!(i2, d2, options, s2) end d1[j1] == d2[j2] || return false j1 += 1; j2 += 1 diff --git a/stdlib/Unicode/test/runtests.jl b/stdlib/Unicode/test/runtests.jl index 573d5da6167ba..5248bd1e1fd27 100644 --- a/stdlib/Unicode/test/runtests.jl +++ b/stdlib/Unicode/test/runtests.jl @@ -3,6 +3,9 @@ using Test using Unicode using Unicode: normalize, isassigned, julia_chartransform +import Random + +Random.seed!(12345) @testset "string normalization" begin # normalize (Unicode normalization etc.): @@ -455,6 +458,9 @@ end @test !Base.Unicode.isvalid(Char, overlong_char) end +# the obvious, but suboptimal, algorithm: +isequal_normalized_naive(s1, s2; kws...) = normalize(s1; kws...) == normalize(s2; kws...) + @testset "Unicode equivalence" begin @test isequal_normalized("no\u00EBl", "noe\u0308l") @test !isequal_normalized("no\u00EBl", "noe\u0308l ") @@ -466,4 +472,65 @@ end @test isequal_normalized("no\u00EBl", "noel", stripmark=true) @test isequal_normalized("no\u00EBl", "NOEL", stripmark=true, casefold=true) @test isequal_normalized("\u00B5\u0302m", "\u03BC\u0302m", chartransform=julia_chartransform) + + # issue #52408 + @testset "Sorting combining characters" begin + for str in ("\u5bc\u5b0", "j\u5ae\u5bf\u5b2\u5b4") # julia#52408 examples + @test isequal_normalized(str, normalize(str)) + end + + # first codepoint in every possible Unicode combining class + let cc_chars = UInt32[0x00000334, 0x00016ff0, 0x0000093c, 0x00003099, 0x0000094d, 0x000005b0, 0x000005b1, 0x000005b2, 0x000005b3, 0x000005b4, 0x000005b5, 0x000005b6, 0x000005b7, 0x000005b8, 0x000005b9, 0x000005bb, 0x000005bc, 0x000005bd, 0x000005bf, 0x000005c1, 0x000005c2, 0x0000fb1e, 0x0000064b, 0x0000064c, 0x0000064d, 0x00000618, 0x00000619, 0x0000061a, 0x00000651, 0x00000652, 0x00000670, 0x00000711, 0x00000c55, 0x00000c56, 0x00000e38, 0x00000e48, 0x00000eb8, 0x00000ec8, 0x00000f71, 0x00000f72, 0x00000f74, 0x00000321, 0x00001dce, 0x0000031b, 0x00001dfa, 0x00000316, 0x0000059a, 0x0000302e, 0x0001d16d, 0x000005ae, 0x00000301, 0x00000315, 0x0000035c, 0x0000035d, 0x00000345], + vowels = ['a', 'e', 'i', 'o', 'u', 'å', 'é', 'î', 'ö', 'ü'], Vowels = [vowels; uppercase.(vowels)] + function randcc(n, n_cc) # random string with lots of combining chars + buf = IOBuffer() + for _ = 1:n + print.(buf, rand(Vowels, rand(1:5))) + print.(buf, Char.(rand(cc_chars, rand(0:n_cc)))) + end + return String(take!(buf)) + end + for _ = 1:100 + s = randcc(10,10) + ns = normalize(s) + cs = normalize(s, casefold=true) + @test isequal_normalized(s, s) + if !isequal_normalized(s, ns) + @show s + end + @test isequal_normalized(s, ns) + @test isequal_normalized(cs, ns) == isequal_normalized_naive(cs, ns) + @test isequal_normalized(cs, ns, casefold=true) == + isequal_normalized_naive(cs, ns, casefold=true) + end + for _ = 1:3 + s = randcc(5,1000) # exercise sort!-based fallback + @test isequal_normalized(s, normalize(s)) + end + function randcc2(n, n_cc) # 2 strings with equivalent reordered combiners + buf1 = IOBuffer() + buf2 = IOBuffer() + p = n_cc / length(cc_chars) + for _ = 1:n + a = join(rand(Vowels, rand(1:5))) + print(buf1, a) + print(buf2, a) + + # chars from distinct combining classes + # are canonically equivalent when re-ordered + c = Random.randsubseq(cc_chars, p) + print.(buf1, Char.(Random.shuffle!(c))) + print.(buf2, Char.(Random.shuffle!(c))) + end + return String(take!(buf1)), String(take!(buf2)) + end + for _ = 1:100 + s1, s2 = randcc2(10,10) + @test isequal_normalized(s1, s2) + end + end + + # combining characters in the same class are inequivalent if re-ordered: + @test !isequal_normalized("x\u0334\u0335", "x\u0335\u0334") + end end