Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Accept supplementary characters #13136

Merged
merged 1 commit into from
Mar 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
215 changes: 135 additions & 80 deletions compiler/src/dotty/tools/dotc/parsing/Scanners.scala
Original file line number Diff line number Diff line change
Expand Up @@ -696,6 +696,45 @@ object Scanners {
recur(lastOffset, false)
}

import Character.{isHighSurrogate, isLowSurrogate, isUnicodeIdentifierPart, isUnicodeIdentifierStart, isValidCodePoint, toCodePoint}

/** Renders `c` as a Scala unicode escape: a backslash, `u`, then the char's
 *  code unit as exactly four lowercase hex digits (zero-padded on the left).
 */
private def toUnicode(c: Char): String = "\\u%04x".format(c.toInt)

/** Tries to read a supplementary character that starts with the high surrogate `high`
 *  (callers in this file pass the current char `ch`).
 *
 *  When `high` is not a high surrogate nothing is consumed and the result is false.
 *  Otherwise the scanner advances past it and looks at the following char:
 *   - if it is a low surrogate it is consumed as well; both halves are put to the
 *     buffer when the combined code point is valid and satisfies `test`, otherwise
 *     an "illegal character" error is reported;
 *   - if it is not a low surrogate it is left as the current char; the lone `high`
 *     is put to the buffer when `strict` is false, and reported as an error when
 *     `strict` is true.
 *
 *  @param high   candidate high surrogate
 *  @param test   predicate the combined code point must satisfy
 *  @param strict whether a following low surrogate is required
 *  @return true exactly when chars were put to the buffer
 */
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
  if !isHighSurrogate(high) then false
  else
    nextChar()
    val low = ch
    if isLowSurrogate(low) then
      nextChar()
      val codepoint = toCodePoint(high, low)
      val accepted = isValidCodePoint(codepoint) && test(codepoint)
      if accepted then
        putChar(high)
        putChar(low)
      else
        error(s"illegal character '${toUnicode(high)}${toUnicode(low)}'")
      accepted
    else if strict then
      error(s"illegal character '${toUnicode(high)}' missing low surrogate")
      false
    else
      // lenient mode: keep the orphan high surrogate as-is
      putChar(high)
      true
/** Tests whether `ch` and the char returned by `lookaheadChar()` form a surrogate
 *  pair whose code point is valid and satisfies `f`. Unlike `isSupplementary`,
 *  this method itself neither calls `nextChar()` nor puts anything to the buffer.
 *
 *  @param ch candidate high surrogate
 *  @param f  predicate applied to the combined code point
 */
private def atSupplementary(ch: Char, f: Int => Boolean): Boolean =
  if !isHighSurrogate(ch) then false
  else
    val following = lookaheadChar()
    if !isLowSurrogate(following) then false
    else
      val codepoint = toCodePoint(ch, following)
      isValidCodePoint(codepoint) && f(codepoint)

/** read next token, filling TokenData fields of Scanner.
*/
protected final def fetchToken(): Unit = {
Expand Down Expand Up @@ -822,11 +861,12 @@ object Scanners {
else ch match {
case '{' | '[' | ' ' | '\t' if lookaheadChar() != '\'' =>
token = QUOTE
case _ if !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape) =>
case _ if !isAtEnd && ch != SU && ch != CR && ch != LF =>
val isEmptyCharLit = (ch == '\'')
getLitChar()
if ch == '\'' then
if isEmptyCharLit then error("empty character literal (use '\\'' for single quote)")
else if litBuf.length != 1 then error("illegal codepoint in Char constant: " + litBuf.toString.map(toUnicode).mkString("'", "", "'"))
else finishCharLit()
else if isEmptyCharLit then error("empty character literal")
else error("unclosed character literal")
Expand Down Expand Up @@ -869,9 +909,11 @@ object Scanners {
def fetchOther() =
if (ch == '\u21D2') {
nextChar(); token = ARROW
report.deprecationWarning("The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
}
else if (ch == '\u2190') {
nextChar(); token = LARROW
report.deprecationWarning("The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
}
else if (Character.isUnicodeIdentifierStart(ch)) {
putChar(ch)
Expand All @@ -883,9 +925,10 @@ object Scanners {
nextChar()
getOperatorRest()
}
else if isSupplementary(ch, isUnicodeIdentifierStart) then
getIdentRest()
else {
// FIXME: Dotty deviation: f"" interpolator is not supported (#1814)
error("illegal character '\\u%04x'".format(ch: Int))
error(s"illegal character '${toUnicode(ch)}'")
nextChar()
}
fetchOther()
Expand Down Expand Up @@ -1024,11 +1067,12 @@ object Scanners {
case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
finishNamed()
case _ =>
if (Character.isUnicodeIdentifierPart(ch)) {
if isUnicodeIdentifierPart(ch) then
putChar(ch)
nextChar()
getIdentRest()
}
else if isSupplementary(ch, isUnicodeIdentifierPart) then
getIdentRest()
else
finishNamed()
}
Expand Down Expand Up @@ -1111,7 +1155,7 @@ object Scanners {
}

// for interpolated strings
@annotation.tailrec private def getStringPart(multiLine: Boolean): Unit =
@tailrec private def getStringPart(multiLine: Boolean): Unit =
if (ch == '"')
if (multiLine) {
nextRawChar()
Expand All @@ -1136,6 +1180,28 @@ object Scanners {
getStringPart(multiLine)
}
else if (ch == '$') {
// Finishes the string part read so far as a STRINGPART token, then reads the
// identifier that follows `$` into `next` as an IDENTIFIER token.
// `hasSupplement` says that the identifier's first character is a surrogate
// pair, i.e. occupies two chars.
def getInterpolatedIdentRest(hasSupplement: Boolean): Unit =
  // put the current char to the buffer and advance by one raw char
  def takeRaw(): Unit =
    putChar(ch)
    nextRawChar()
  @tailrec def loopRest(): Unit =
    if ch != SU && isUnicodeIdentifierPart(ch) then
      takeRaw()
      loopRest()
    else if atSupplementary(ch, isUnicodeIdentifierPart) then
      // a surrogate pair: take both halves
      takeRaw()
      takeRaw()
      loopRest()
    else
      finishNamedToken(IDENTIFIER, target = next)
  end loopRest
  setStrVal()
  token = STRINGPART
  val identStart = charOffset - 1
  next.lastOffset = identStart
  next.offset = identStart
  // first char of the identifier (two chars when it is a surrogate pair)
  takeRaw()
  if hasSupplement then takeRaw()
  loopRest()
end getInterpolatedIdentRest

nextRawChar()
if (ch == '$' || ch == '"') {
putChar(ch)
Expand All @@ -1146,18 +1212,10 @@ object Scanners {
setStrVal()
token = STRINGPART
}
else if (Character.isUnicodeIdentifierStart(ch) || ch == '_') {
setStrVal()
token = STRINGPART
next.lastOffset = charOffset - 1
next.offset = charOffset - 1
while
putChar(ch)
nextRawChar()
ch != SU && Character.isUnicodeIdentifierPart(ch)
do ()
finishNamedToken(IDENTIFIER, target = next)
}
else if isUnicodeIdentifierStart(ch) || ch == '_' then
getInterpolatedIdentRest(hasSupplement = false)
else if atSupplementary(ch, isUnicodeIdentifierStart) then
getInterpolatedIdentRest(hasSupplement = true)
else
error("invalid string interpolation: `$$`, `$\"`, `$`ident or `$`BlockExpr expected", off = charOffset - 2)
putChar('$')
Expand Down Expand Up @@ -1205,76 +1263,73 @@ object Scanners {
false
}

/** copy current character into litBuf, interpreting any escape sequences,
* and advance to next character.
/** Copy current character into litBuf, interpreting any escape sequences,
* and advance to next character. Surrogate pairs are consumed (see check
* at fetchSingleQuote), but an orphan surrogate is allowed.
*/
protected def getLitChar(): Unit =
def invalidUnicodeEscape() = {
error("invalid character in unicode escape sequence", charOffset - 1)
putChar(ch)
}
def putUnicode(): Unit = {
while ch == 'u' || ch == 'U' do nextChar()
var i = 0
var cp = 0
while (i < 4) {
val shift = (3 - i) * 4
val d = digit2int(ch, 16)
if(d < 0) {
return invalidUnicodeEscape()
}
cp += (d << shift)
nextChar()
i += 1
}
putChar(cp.asInstanceOf[Char])
}
if (ch == '\\') {
if ch == '\\' then
nextChar()
if ('0' <= ch && ch <= '7') {
val start = charOffset - 2
val leadch: Char = ch
var oct: Int = digit2int(ch, 8)
nextChar()
if ('0' <= ch && ch <= '7') {
oct = oct * 8 + digit2int(ch, 8)
nextChar()
if (leadch <= '3' && '0' <= ch && ch <= '7') {
oct = oct * 8 + digit2int(ch, 8)
nextChar()
}
}
val alt = if oct == LF then raw"\n" else f"${"\\"}u$oct%04x"
error(s"octal escape literals are unsupported: use $alt instead", start)
putChar(oct.toChar)
}
else if (ch == 'u' || ch == 'U') {
putUnicode()
}
else {
ch match {
case 'b' => putChar('\b')
case 't' => putChar('\t')
case 'n' => putChar('\n')
case 'f' => putChar('\f')
case 'r' => putChar('\r')
case '\"' => putChar('\"')
case '\'' => putChar('\'')
case '\\' => putChar('\\')
case _ => invalidEscape()
}
nextChar()
}
}
else {
charEscape()
else if !isSupplementary(ch, _ => true, strict = false) then
putChar(ch)
nextChar()
}

protected def invalidEscape(): Unit = {
/** Handles the char after a backslash (the caller has already advanced past
 *  the backslash itself). Single-char escapes put the denoted char and advance
 *  once; `u`/`U` and octal digits are delegated to `uEscape` / `octalEscape`,
 *  which do their own advancing; anything else goes through `invalidEscape()`
 *  and is then skipped.
 */
private def charEscape(): Unit =
  // put the char an escape stands for, then step past the escape letter
  def simple(denoted: Char): Unit =
    putChar(denoted)
    nextChar()
  ch match
    case 'b' => simple('\b')
    case 't' => simple('\t')
    case 'n' => simple('\n')
    case 'f' => simple('\f')
    case 'r' => simple('\r')
    case '\"' => simple('\"')
    case '\'' => simple('\'')
    case '\\' => simple('\\')
    case 'u' | 'U' => uEscape()
    case digit if digit >= '0' && digit <= '7' => octalEscape()
    case _ =>
      invalidEscape()
      nextChar()
end charEscape

/** Reads a unicode escape, starting at its `u`/`U` marker: skips any run of
 *  `u`/`U` chars, then reads four hex digits (most significant first) and puts
 *  the resulting char. If a char is rejected by `digit2int(ch, 16)` (negative
 *  result), an error is reported, that char is put to the buffer instead, and
 *  reading stops without advancing past it.
 */
private def uEscape(): Unit =
  while ch == 'u' || ch == 'U' do nextChar()
  // `remaining` digits are still to be read; `acc` holds the value so far
  @tailrec def readHex(remaining: Int, acc: Int): Unit =
    if remaining == 0 then putChar(acc.toChar)
    else
      val digit = digit2int(ch, 16)
      if digit < 0 then
        error("invalid character in unicode escape sequence", charOffset - 1)
        putChar(ch)
      else
        nextChar()
        readHex(remaining - 1, acc + (digit << ((remaining - 1) * 4)))
  readHex(remaining = 4, acc = 0)
end uEscape

/** Reads an octal escape starting at its first digit: one digit, optionally a
 *  second, and a third only when the first digit is at most '3'. Octal escapes
 *  are always reported as an error (suggesting `\n` or the equivalent unicode
 *  escape), but the decoded char is still put to the buffer.
 */
private def octalEscape(): Unit =
  def isOctalDigit(c: Char): Boolean = c >= '0' && c <= '7'
  // error position; presumably the offset of the backslash — confirm against charOffset semantics
  val start = charOffset - 2
  val first: Char = ch
  var value: Int = digit2int(first, 8)
  // fold the current digit into `value` and advance
  def absorbDigit(): Unit =
    value = value * 8 + digit2int(ch, 8)
    nextChar()
  nextChar()
  if isOctalDigit(ch) then
    absorbDigit()
    if first <= '3' && isOctalDigit(ch) then absorbDigit()
  val decoded = value.toChar
  val alt = if value == LF then raw"\n" else toUnicode(decoded)
  error(s"octal escape literals are unsupported: use $alt instead", start)
  putChar(decoded)
end octalEscape

protected def invalidEscape(): Unit =
error("invalid escape character", charOffset - 1)
putChar(ch)
}

private def getLitChars(delimiter: Char) =
while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape))
Expand Down
7 changes: 5 additions & 2 deletions compiler/src/dotty/tools/dotc/transform/Pickler.scala
Original file line number Diff line number Diff line change
Expand Up @@ -140,11 +140,14 @@ class Pickler extends Phase {
}

private def testSame(unpickled: String, previous: String, cls: ClassSymbol)(using Context) =
if (previous != unpickled) {
import java.nio.charset.StandardCharsets.UTF_8
def normal(s: String) = new String(s.getBytes(UTF_8), UTF_8)
val unequal = unpickled.length() != previous.length() || normal(unpickled) != normal(previous)
if unequal then
som-snytt marked this conversation as resolved.
Show resolved Hide resolved
output("before-pickling.txt", previous)
output("after-pickling.txt", unpickled)
report.error(s"""pickling difference for $cls in ${cls.source}, for details:
|
| diff before-pickling.txt after-pickling.txt""".stripMargin)
}
end testSame
}
2 changes: 1 addition & 1 deletion scaladoc/src/dotty/tools/scaladoc/util/JSON.scala
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def jsonString(s: String): JSON =

sb.append('"')
firstToBeEncoded() match
case -1 sb.append(s)
case -1 => sb.append(s)
case first =>
// sb.append(s, 0, first) for "abc", 0, 2 produce "(abc,0,2)" rather than "ab" as in Java
sb.append(s.substring(0, first))
Expand Down
4 changes: 4 additions & 0 deletions tests/neg-custom-args/deprecation/old-syntax.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

// Function literal written with the Unicode arrow `⇒` instead of `=>`;
// the trailing `// error` marks the line where a diagnostic is expected.
val f = (x: Int) ⇒ x + 1 // error

// Generator written with the Unicode arrow `←` instead of `<-`;
// again a diagnostic is expected on this line.
val list = for (n ← List(42)) yield n + 1 // error
4 changes: 4 additions & 0 deletions tests/neg/surrogates.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

// Negative test: a character literal holding a supplementary character.
class C {
// '𐐀' needs two UTF-16 code units, so it does not fit a single Char;
// a diagnostic is expected on this line.
def `too wide for Char` = '𐐀' // error
}
14 changes: 7 additions & 7 deletions tests/patmat/t11620.scala
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,20 @@ object B {
}

def foo[T](b: B[T]) = b match {
case B(A1(t)) t
case B(A2(t, _)) t
case B(A1(t)) => t
case B(A2(t, _)) => t
}

def foo2[_A[+U] <: A[U], T](b: B.Aux[_A, T]) = b match {
case B.Aux(a @ A1(_ )) a.t
case B.Aux(a @ A2(_, _)) a.t1 // 👎 (false-positive): unreachable code
case B.Aux(a @ A1(_ )) => a.t
case B.Aux(a @ A2(_, _)) => a.t1 // 👎 (false-positive): unreachable code
}

def foo3[_A[+U] <: A[U], T](b: B.Aux[_A, T]) = b match {
case B.Aux(a: A1[T]) a.t
case B.Aux(a: A2[T]) a.t1 // 👎 (false-positive): unreachable code
case B.Aux(a: A1[T]) => a.t
case B.Aux(a: A2[T]) => a.t1 // 👎 (false-positive): unreachable code
}

def foo4[T](b: B[T]) = b match {
case B(A1(t)) t // 👎 (false-negative): incomplete match
case B(A1(t)) => t // 👎 (false-negative): incomplete match
}
28 changes: 28 additions & 0 deletions tests/pos/surrogates.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@

// allow supplementary chars in identifiers

// A class whose name is a single supplementary character.
class 𐐀 {
// a member named with the same supplementary character
def 𐐀 = 42

// regression check: anything goes in strings
// plain string literal containing the character
def x = "𐐀"
// interpolated string: `$` immediately followed by the identifier 𐐀
def y = s"$𐐀"
// interpolated string: the character appears as literal text, not after `$`
def w = s" 𐐀"
}

// An identifier made of two consecutive supplementary characters,
// used as both a case class name and a member name.
case class 𐐀𐐀(n: Int) {
// a supplementary character in identifier-rest position as well as at the start
def 𐐀𐐀 = n
// backquoted identifier: supplementary characters followed by a digit
def `𐐀𐐀1` = n + n
}
// uncontroversially, orphan surrogates may be introduced
// via unicode escape.
class Construction {
// Char literal holding a lone high surrogate, written as a unicode escape
def hi = '\ud801'
// Char literal holding a lone low surrogate, written as a unicode escape
def lo = '\udc00'
// string ending in an unpaired high surrogate
def endhi = "abc\ud801"
// string starting with an unpaired low surrogate
def startlo = "\udc00xyz"
// low surrogate followed by high surrogate: wrong order, so they do not form a pair
def reversed = "xyz\udc00\ud801abc"
}

// was: error: illegal character '\ud801', '\udc00'
Loading