-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add UTF8 abstraction in the TASTy format #19090
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,14 +13,8 @@ import collection.mutable | |
import core.Symbols.ClassSymbol | ||
import Decorators.* | ||
|
||
object TastyPickler { | ||
|
||
private val versionStringBytes = { | ||
val compilerString = s"Scala ${config.Properties.simpleVersionString}" | ||
compilerString.getBytes(java.nio.charset.StandardCharsets.UTF_8) | ||
} | ||
|
||
} | ||
object TastyPickler: | ||
private val versionString = s"Scala ${config.Properties.simpleVersionString}" | ||
|
||
class TastyPickler(val rootCls: ClassSymbol) { | ||
|
||
|
@@ -48,13 +42,12 @@ class TastyPickler(val rootCls: ClassSymbol) { | |
val uuidHi: Long = otherSectionHashes.fold(0L)(_ ^ _) | ||
|
||
val headerBuffer = { | ||
val buf = new TastyBuffer(header.length + TastyPickler.versionStringBytes.length + 32) | ||
val buf = new TastyBuffer(header.length + TastyPickler.versionString.length + 32) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This seems wrong - string length != UTF-8 byte length, e.g. scala> val sc = "Scala 3.3.1➽"
val sc: String = Scala 3.3.1➽
scala> sc.length
val res0: Int = 12
scala> val scBytes = sc.getBytes(java.nio.charset.StandardCharsets.UTF_8)
val scBytes: Array[Byte] = Array(83, 99, 97, 108, 97, 32, 51, 46, 51, 46, 49, -30, -98, -67)
scala> scBytes.length
val res1: Int = 14 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The […] We also do not have an exact formula to know how much space the […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess in practice we shouldn't have these non-ASCII strings but :/ There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That was my assumption. |
||
for (ch <- header) buf.writeByte(ch.toByte) | ||
buf.writeNat(MajorVersion) | ||
buf.writeNat(MinorVersion) | ||
buf.writeNat(ExperimentalVersion) | ||
buf.writeNat(TastyPickler.versionStringBytes.length) | ||
buf.writeBytes(TastyPickler.versionStringBytes, TastyPickler.versionStringBytes.length) | ||
buf.writeUtf8(TastyPickler.versionString) | ||
buf.writeUncompressedLong(uuidLow) | ||
buf.writeUncompressedLong(uuidHi) | ||
buf | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
package dotty.tools.tasty | ||
|
||
import util.Util.dble | ||
import java.nio.charset.StandardCharsets | ||
|
||
object TastyBuffer { | ||
|
||
|
@@ -115,6 +116,16 @@ class TastyBuffer(initialSize: Int) { | |
writeBytes(bytes, 8) | ||
} | ||
|
||
/** Write a UTF8 string encoded as `Nat UTF8-CodePoint*`, | ||
* where the `Nat` is the length of the code-points bytes. | ||
*/ | ||
def writeUtf8(x: String): Unit = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe you can have an overload for |
||
val bytes = x.getBytes(StandardCharsets.UTF_8) | ||
val length = bytes.length | ||
writeNat(length) | ||
writeBytes(bytes, length) | ||
} | ||
|
||
// -- Address handling -------------------------------------------- | ||
|
||
/** Write natural number `x` right-adjusted in a field of `width` bytes | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This seems like a bug. If the comment is empty we should still read this long int; otherwise the next comment will start by reading this value instead of the length of the comment.
I assume it was fine because we never pickled empty documentation. I wonder what should be the behaviour of
/***/
. In that case we still have some coordinates we should pickle. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All pickled comments contain the
/**
and */
in the comments section. Therefore they can never be empty. I wonder if we could optimize that away at some point. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree this long int should always be read