diff --git a/base/char.jl b/base/char.jl
index 749d561762b2c9..ddec23bb3c018b 100644
--- a/base/char.jl
+++ b/base/char.jl
@@ -50,6 +50,20 @@ Char
 (::Type{T})(x::AbstractChar) where {T<:Union{Number,AbstractChar}} = T(codepoint(x))
 (::Type{T})(x::T) where {T<:AbstractChar} = x
 
+"""
+    ncodeunits(c::Char) -> Int
+
+Count the code units (bytes) that `c` occupies when encoded as UTF-8.
+
+The result is the number of bytes emitted when `c` is written to an output
+stream; it equals `ncodeunits(string(c))` but is computed efficiently.
+"""
+function ncodeunits(c::Char)
+    # `write` reports the number of bytes written and `devnull` discards them;
+    # this is surprisingly efficient.
+    return write(devnull, c)
+end
+
 """
     codepoint(c::AbstractChar) -> Integer
 
diff --git a/test/char.jl b/test/char.jl
index 0793ba583b94f4..64f9a80a1f0478 100644
--- a/test/char.jl
+++ b/test/char.jl
@@ -256,3 +256,26 @@ Base.codepoint(c::ASCIIChar) = reinterpret(UInt8, c)
     @test_throws MethodError write(IOBuffer(), ASCIIChar('x'))
     @test_throws MethodError read(IOBuffer('x'), ASCIIChar)
 end
+
+@testset "ncodeunits(::Char)" begin
+    # valid encodings, paired with the expected number of code units
+    wellformed = [
+        '\0' => 1, '\x1' => 1, '\x7f' => 1,
+        '\u80' => 2, '\uff' => 2, '\u7ff' => 2,
+        '\u800' => 3, '\uffff' => 3,
+        '\U10000' => 4, '\U10ffff' => 4,
+    ]
+    for (ch, expected) in wellformed
+        @test ncodeunits(ch) == expected
+    end
+    # invalid encodings: raw 32-bit patterns reinterpreted as `Char`
+    malformed = [
+        0x80_00_00_00 => 1, 0x81_00_00_00 => 1,
+        0x80_80_00_00 => 2, 0x80_01_00_00 => 2,
+        0x80_00_80_00 => 3, 0x80_00_01_00 => 3,
+        0x80_00_00_80 => 4, 0x80_00_00_01 => 4,
+    ]
+    for (bits, expected) in malformed
+        @test ncodeunits(reinterpret(Char, bits)) == expected
+    end
+end