From 486af2f58c2daa5f90a6c68bd4ec740e9c90fb8b Mon Sep 17 00:00:00 2001 From: Nicolas Stucki Date: Mon, 27 Nov 2023 10:05:51 +0100 Subject: [PATCH] Add UTF8 abstraction in the TASTy format We add a `Utf8` encoding to the grammar. This should not be confused with the `UTF8` name tag. This mistake was made in the `Comment` format. We also add the corresponding `writeUtf8` method to `TastyBuffer` and `readUtf8` method to `TastyReader`. --- .../tools/dotc/core/tasty/CommentPickler.scala | 5 +---- .../tools/dotc/core/tasty/CommentUnpickler.scala | 10 +++------- .../tools/dotc/core/tasty/TastyPickler.scala | 15 ++++----------- tasty/src/dotty/tools/tasty/TastyBuffer.scala | 11 +++++++++++ tasty/src/dotty/tools/tasty/TastyFormat.scala | 7 ++++--- tasty/src/dotty/tools/tasty/TastyReader.scala | 10 ++++++++++ 6 files changed, 33 insertions(+), 25 deletions(-) diff --git a/compiler/src/dotty/tools/dotc/core/tasty/CommentPickler.scala b/compiler/src/dotty/tools/dotc/core/tasty/CommentPickler.scala index fde6c669045d..10df2a437af6 100644 --- a/compiler/src/dotty/tools/dotc/core/tasty/CommentPickler.scala +++ b/compiler/src/dotty/tools/dotc/core/tasty/CommentPickler.scala @@ -22,11 +22,8 @@ object CommentPickler: def pickleComment(addr: Addr, comment: Comment): Unit = if addr != NoAddr then - val bytes = comment.raw.getBytes(StandardCharsets.UTF_8).nn - val length = bytes.length buf.writeAddr(addr) - buf.writeNat(length) - buf.writeBytes(bytes, length) + buf.writeUtf8(comment.raw) buf.writeLongInt(comment.span.coords) def traverse(x: Any): Unit = x match diff --git a/compiler/src/dotty/tools/dotc/core/tasty/CommentUnpickler.scala b/compiler/src/dotty/tools/dotc/core/tasty/CommentUnpickler.scala index 1bbea6447bf3..d3b5c647b9c5 100644 --- a/compiler/src/dotty/tools/dotc/core/tasty/CommentUnpickler.scala +++ b/compiler/src/dotty/tools/dotc/core/tasty/CommentUnpickler.scala @@ -19,13 +19,9 @@ class CommentUnpickler(reader: TastyReader) { val comments = new HashMap[Addr, Comment] while (!isAtEnd) { val 
addr = readAddr() - val length = readNat() - if (length > 0) { - val bytes = readBytes(length) - val position = new Span(readLongInt()) - val rawComment = new String(bytes, StandardCharsets.UTF_8) - comments(addr) = Comment(position, rawComment) - } + val rawComment = readUtf8() + val position = new Span(readLongInt()) + comments(addr) = Comment(position, rawComment) } comments } diff --git a/compiler/src/dotty/tools/dotc/core/tasty/TastyPickler.scala b/compiler/src/dotty/tools/dotc/core/tasty/TastyPickler.scala index 556265c66ce9..214f7a5f6702 100644 --- a/compiler/src/dotty/tools/dotc/core/tasty/TastyPickler.scala +++ b/compiler/src/dotty/tools/dotc/core/tasty/TastyPickler.scala @@ -13,14 +13,8 @@ import collection.mutable import core.Symbols.ClassSymbol import Decorators.* -object TastyPickler { - - private val versionStringBytes = { - val compilerString = s"Scala ${config.Properties.simpleVersionString}" - compilerString.getBytes(java.nio.charset.StandardCharsets.UTF_8) - } - -} +object TastyPickler: + private val versionString = s"Scala ${config.Properties.simpleVersionString}" class TastyPickler(val rootCls: ClassSymbol) { @@ -48,13 +42,12 @@ class TastyPickler(val rootCls: ClassSymbol) { val uuidHi: Long = otherSectionHashes.fold(0L)(_ ^ _) val headerBuffer = { - val buf = new TastyBuffer(header.length + TastyPickler.versionStringBytes.length + 32) + val buf = new TastyBuffer(header.length + TastyPickler.versionString.length + 32) for (ch <- header) buf.writeByte(ch.toByte) buf.writeNat(MajorVersion) buf.writeNat(MinorVersion) buf.writeNat(ExperimentalVersion) - buf.writeNat(TastyPickler.versionStringBytes.length) - buf.writeBytes(TastyPickler.versionStringBytes, TastyPickler.versionStringBytes.length) + buf.writeUtf8(TastyPickler.versionString) buf.writeUncompressedLong(uuidLow) buf.writeUncompressedLong(uuidHi) buf diff --git a/tasty/src/dotty/tools/tasty/TastyBuffer.scala b/tasty/src/dotty/tools/tasty/TastyBuffer.scala index f9266cf23617..b27a5b8878ab 
100644 --- a/tasty/src/dotty/tools/tasty/TastyBuffer.scala +++ b/tasty/src/dotty/tools/tasty/TastyBuffer.scala @@ -1,6 +1,7 @@ package dotty.tools.tasty import util.Util.dble +import java.nio.charset.StandardCharsets object TastyBuffer { @@ -115,6 +116,16 @@ class TastyBuffer(initialSize: Int) { writeBytes(bytes, 8) } + /** Write a UTF8 string encoded as `Nat UTF8-CodePoint*`, + * where the `Nat` is the length of the code-points bytes. + */ + def writeUtf8(x: String): Unit = { + val bytes = x.getBytes(StandardCharsets.UTF_8) + val length = bytes.length + writeNat(length) + writeBytes(bytes, length) + } + // -- Address handling -------------------------------------------- /** Write natural number `x` right-adjusted in a field of `width` bytes diff --git a/tasty/src/dotty/tools/tasty/TastyFormat.scala b/tasty/src/dotty/tools/tasty/TastyFormat.scala index 7e412a5e67a7..a77ad5d400d5 100644 --- a/tasty/src/dotty/tools/tasty/TastyFormat.scala +++ b/tasty/src/dotty/tools/tasty/TastyFormat.scala @@ -16,6 +16,7 @@ Micro-syntax: Nat = LongInt -- non-negative value, fits in an Int without overflow Digit = 0 | ... | 127 StopDigit = 128 | ... 
| 255 -- value = digit - 128 + Utf8 = Nat UTF8-CodePoint* ``` Macro-format: @@ -24,12 +25,12 @@ Macro-format: nameTable_Length Name* Section* Header = 0x5CA1AB1F UUID = Byte*16 -- random UUID - VersionString = Length UTF8-CodePoint* -- string that represents the compiler that produced the TASTy + VersionString = Utf8 -- string that represents the compiler that produced the TASTy Section = NameRef Length Bytes Length = Nat -- length of rest of entry in bytes - Name = UTF8 Length UTF8-CodePoint* + Name = UTF8 Utf8 QUALIFIED Length qualified_NameRef selector_NameRef -- A.B EXPANDED Length qualified_NameRef selector_NameRef -- A$$B, semantically a NameKinds.ExpandedName EXPANDPREFIX Length qualified_NameRef selector_NameRef -- A$B, prefix of expanded name, see NamedKinds.ExpandPrefixName @@ -265,7 +266,7 @@ All elements of a position section are serialized as Ints Standard Section: "Comments" Comment* ```none - Comment = UTF8 LongInt // Raw comment's bytes encoded as UTF-8, followed by the comment's coordinates. + Comment = Utf8 LongInt // Raw comment's bytes encoded as UTF-8, followed by the comment's coordinates. ``` Standard Section: "Attributes" Attribute* diff --git a/tasty/src/dotty/tools/tasty/TastyReader.scala b/tasty/src/dotty/tools/tasty/TastyReader.scala index 31407f7a4ab8..b5aa29f16954 100644 --- a/tasty/src/dotty/tools/tasty/TastyReader.scala +++ b/tasty/src/dotty/tools/tasty/TastyReader.scala @@ -3,6 +3,7 @@ package dotty.tools.tasty import collection.mutable import TastyBuffer._ +import java.nio.charset.StandardCharsets /** A byte array buffer that can be filled with bytes or natural numbers in TASTY format, * and that supports reading and patching addresses represented as natural numbers. @@ -104,6 +105,15 @@ class TastyReader(val bytes: Array[Byte], start: Int, end: Int, val base: Int = x } + /** Read a UTF8 string encoded as `Nat UTF8-CodePoint*`, + * where the `Nat` is the length of the code-points bytes. 
+ */ + def readUtf8(): String = { + val length = readNat() + if (length == 0) "" + else new String(readBytes(length), StandardCharsets.UTF_8) + } + /** Read a natural number and return as a NameRef */ def readNameRef(): NameRef = NameRef(readNat())