Skip to content

Commit

Permalink
Factor Ident escaping from python code generation (#1251)
Browse files Browse the repository at this point in the history
  • Loading branch information
johnynek authored Nov 11, 2024
1 parent 426b058 commit be453d5
Show file tree
Hide file tree
Showing 4 changed files with 171 additions and 139 deletions.
116 changes: 116 additions & 0 deletions core/src/main/scala/org/bykn/bosatsu/codegen/Idents.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
package org.bykn.bosatsu.codegen

/** Reversible escaping of arbitrary strings into strings that only contain
  * ASCII letters, digits and underscores (after a caller supplied prefix).
  *
  * Encoding rules, applied per UTF-16 code unit:
  *   - `0-9`, `A-Z`, `a-z` are copied unchanged
  *   - `_` becomes `__`
  *   - any other character becomes `_<base62 of the char code>_`, where the
  *     base62 digits are `0-9` (0..9), `A-Z` (10..35), `a-z` (36..61)
  */
object Idents {

  private[this] val offset0: Int = '0'.toInt
  private[this] val offsetA: Int = 'A'.toInt - 10
  private[this] val offseta: Int = 'a'.toInt - 36

  /** The numeric value (0..61) of a base62 digit, or -1 when `c` is not one of
    * `0-9`, `A-Z`, `a-z`.
    */
  private def digitValue(c: Char): Int =
    if ('0' <= c && c <= '9') c.toInt - offset0
    else if ('A' <= c && c <= 'Z') c.toInt - offsetA
    else if ('a' <= c && c <= 'z') c.toInt - offseta
    else -1

  /** Append the escaped form of a single character to `bldr` and return
    * `bldr`.
    */
  private def toBase62(
      c: Char,
      bldr: java.lang.StringBuilder
  ): java.lang.StringBuilder =
    if (digitValue(c) >= 0) bldr.append(c)
    else if (c == '_') bldr.append("__")
    else {
      // map a value in 0..61 onto its base62 digit
      def toChar(i0: Int): Char = {
        val base =
          if (i0 < 10) offset0
          else if (i0 < 36) offsetA
          else offseta
        (i0 + base).toChar
      }

      // write the digits most significant first
      def appendDigits(i: Int): Unit =
        if (i < 62) {
          val _ = bldr.append(toChar(i))
        } else {
          // this isn't tail recursion, but it's okay
          // because the int can't be that big so we can
          // only divide by 62 a few times
          appendDigits(i / 62)
          val _ = bldr.append(toChar(i % 62))
        }

      bldr.append('_')
      appendDigits(c.toInt)
      bldr.append('_')
    }

  /** Escape `str` and prepend `prefix`.
    *
    * The prefix is copied verbatim (it is not escaped); every character of
    * `str` is encoded with the rules described on [[Idents]].
    *
    * @param prefix
    *   text placed unchanged at the start of the result
    * @param str
    *   the characters to escape
    * @return
    *   `prefix` followed by the escaped form of `str`
    */
  def escape(prefix: String, str: CharSequence): String = {
    val bldr = new java.lang.StringBuilder
    var idx = 0
    val len = str.length
    bldr.append(prefix)
    while (idx < len) {
      toBase62(str.charAt(idx), bldr)
      idx += 1
    }
    bldr.toString()
  }

  /** Decode one escape sequence. `offset` is the index just after the `_` that
    * opened the sequence.
    *
    * On success the decoded character is appended to `bldr` and the number of
    * characters consumed starting at `offset` (always >= 1) is returned. When
    * the sequence is malformed nothing is appended and -1 is returned. A
    * sequence is malformed when it is unterminated, contains a character that
    * is not a base62 digit, or encodes a value that does not fit in a `Char`.
    */
  private def unBase62(
      str: String,
      offset: Int,
      bldr: java.lang.StringBuilder
  ): Int = {
    val len = str.length
    if (offset >= len) -1
    else if (str.charAt(offset) == '_') {
      // "__" is a literal _
      bldr.append('_')
      1
    } else {
      var idx = offset
      var num = 0
      var valid = true
      var closed = false
      while (valid && !closed && idx < len) {
        val c = str.charAt(idx)
        idx += 1
        if (c == '_') {
          // this is the trailing _
          closed = true
        } else {
          val d = digitValue(c)
          if (d < 0) valid = false
          else {
            num = num * 62 + d
            // escape only ever writes char codes, so anything larger is
            // malformed; stopping here also keeps num from overflowing
            if (num > Char.MaxValue.toInt) valid = false
          }
        }
      }

      if (valid && closed) {
        bldr.append(num.toChar)
        idx - offset
      } else -1
    }
  }

  /** Invert [[escape]].
    *
    * Characters outside an escape sequence are copied through unchanged, so
    * this accepts every string produced by `escape(prefix, _)`.
    *
    * @param prefix
    *   the prefix that was passed to [[escape]]
    * @param str
    *   the escaped string
    * @return
    *   the decoded content, or `None` when `str` does not start with `prefix`
    *   or contains a malformed escape sequence
    */
  def unescape(prefix: String, str: String): Option[String] =
    if (str.startsWith(prefix)) {
      val bldr = new java.lang.StringBuilder()
      val len = str.length
      var idx = prefix.length
      var ok = true
      while (ok && idx < len) {
        val c = str.charAt(idx)
        idx += 1
        if (c == '_') {
          val res = unBase62(str, idx, bldr)
          if (res < 1) ok = false
          else {
            // this tells us how many characters we read
            idx += res
          }
        } else {
          // this character is literally encoded
          bldr.append(c)
        }
      }

      if (ok) Some(bldr.toString()) else None
    } else {
      None
    }
}
110 changes: 5 additions & 105 deletions core/src/main/scala/org/bykn/bosatsu/codegen/python/PythonGen.scala
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import org.bykn.bosatsu.{
Parser,
RecursionKind
}
import org.bykn.bosatsu.codegen.Idents
import org.bykn.bosatsu.rankn.Type
import org.typelevel.paiges.Doc

Expand Down Expand Up @@ -69,7 +70,7 @@ object PythonGen {

def bind(b: Bindable): (EnvState, Code.Ident) =
bindInc(b, 1) { c =>
Code.Ident(escapeRaw("___b", b.asString + c.toString))
Code.Ident(Idents.escape("___b", b.asString + c.toString))
}

// in loops we need to substitute
Expand Down Expand Up @@ -114,7 +115,7 @@ object PythonGen {
case None =>
val impNumber = imports.size
val alias = Code.Ident(
escapeRaw("___i", mod.last.name + impNumber.toString)
Idents.escape("___i", mod.last.name + impNumber.toString)
)
(copy(imports = imports.updated(mod, alias)), alias)
}
Expand Down Expand Up @@ -464,75 +465,6 @@ object PythonGen {

}

// The characters that toBase62 copies through unchanged: ASCII digits and
// ASCII upper/lower case letters.
private[this] val base62Items =
(('0' to '9') ++ ('A' to 'Z') ++ ('a' to 'z')).toSet

// Escape a single character:
//  - a character in base62Items is returned as a one character string
//  - '_' is returned as "__"
//  - anything else is returned as '_' + the char code written in base62
//    (digits 0-9, then A-Z, then a-z, most significant first) + '_'
private def toBase62(c: Char): String =
if (base62Items(c)) c.toString
else if (c == '_') "__"
else {
// map a value in 0..61 onto its base62 digit; any other value is an error
def toChar(i0: Int): Char =
if (i0 < 0) {
// $COVERAGE-OFF$
sys.error(s"invalid in: $i0")
// $COVERAGE-ON$
} else if (i0 < 10) (i0 + '0'.toInt).toChar
else if (i0 < 36) (i0 - 10 + 'A'.toInt).toChar
else if (i0 < 62) (i0 - 36 + 'a'.toInt).toChar
else {
// $COVERAGE-OFF$
sys.error(s"invalid int: $i0")
// $COVERAGE-ON$
}

// render i in base62 by recursing on the higher order digits first
def toString(i: Int): String =
if (i < 62) toChar(i).toString
else {
val i0 = i % 62
val i1 = i / 62
toString(i1) + toChar(i0)
}

"_" + toString(c.toInt) + "_"
}

// Escape every character of str with toBase62 and concatenate the pieces
// after prefix (the prefix itself is not escaped).
private def escapeRaw(prefix: String, str: String): String =
str.map(toBase62).mkString(prefix, "", "")

// Decode one escape sequence of str starting at offset (the index just after
// the '_' that opened it), appending the decoded character to bldr.
// Returns the number of characters consumed from offset, or -1 when the end
// of str is reached before a closing '_'.
// NOTE(review): characters between the underscores are not checked to be
// base62 digits and num is not range checked before num.toChar.
private def unBase62(
str: String,
offset: Int,
bldr: java.lang.StringBuilder
): Int = {
var idx = offset
var num = 0

while (idx < str.length) {
val c = str.charAt(idx)
idx += 1
if (c == '_') {
// idx == offset + 1 means the '_' was the very first character read
if (idx != offset + 1) {
// done
val numC = num.toChar
bldr.append(numC)
return (idx - offset)
} else {
// "__" decodes to "_"
bldr.append('_')
return (idx - offset)
}
} else {
// pick the offset that maps this digit onto its value:
// '0'-'9' => 0..9, 'A'-'Z' => 10..35, otherwise treated as 'a'-'z' => 36..61
val base =
if (c <= '9') '0'.toInt
else if (c <= 'Z') ('A'.toInt - 10)
else ('a'.toInt - 36)

num = num * 62 + c.toInt - base
}
}
return -1
}

// we escape by prefixing by three underscores, ___ and n (for name)
// we use other ___x escapes for different name spaces, e.g. tmps, and anons
// then we escape _ by __ and any character outside the allowed
Expand All @@ -551,7 +483,7 @@ object PythonGen {
) Code.Ident(str)
else {
// we need to escape
Code.Ident(escapeRaw("___n", str))
Code.Ident(Idents.escape("___n", str))
}
}

Expand All @@ -562,41 +494,9 @@ object PythonGen {
) Code.Ident(str)
else {
// we need to escape
Code.Ident(escapeRaw("___m", str))
}

def unescape(ident: Code.Ident): Option[Bindable] = {
val str = ident.name
val res = if (str.startsWith("___n")) {
val bldr = new java.lang.StringBuilder()
var idx = 4
while (idx < str.length) {
val c = str.charAt(idx)
idx += 1
if (c == '_') {
val res = unBase62(str, idx, bldr)
if (res < 1) return None
else {
idx += res
}
} else {
bldr.append(c)
}
}

bldr.toString()
} else {
str
Code.Ident(Idents.escape("___m", str))
}

if (str.isEmpty) None
else {
Identifier
.optionParse(Identifier.bindableParser, res)
.orElse(Some(Identifier.Backticked(res)))
}
}

/** Remap is used to handle remapping external values
*/
private def apply(packName: PackageName, name: Bindable, me: Expr)(
Expand Down
50 changes: 50 additions & 0 deletions core/src/test/scala/org/bykn/bosatsu/codegen/IdentsTest.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package org.bykn.bosatsu.codegen
import org.scalacheck.Prop.forAll

/** Property checks for [[Idents.escape]] and [[Idents.unescape]]. */
class IdentsTest extends munit.ScalaCheckSuite {
  // the only characters allowed after the prefix of an escaped string
  val validIdentChars: Set[Char] =
    Set('_') ++ ('0' to '9') ++ ('A' to 'Z') ++ ('a' to 'z')

  // unescape must recover exactly what was given to escape
  property("Idents.escape/unescape") {
    forAll { (prefix: String, content: String) =>
      val escaped = Idents.escape(prefix, content)
      val codes = content.toList.map(_.toInt)
      val decoded = Idents
        .unescape(prefix, escaped)
        .getOrElse(fail(s"expected to unescape: $escaped, stringNums = $codes"))

      assertEquals(decoded, content, s"escaped = $escaped, stringNums = $codes")
    }
  }

  // the prefix is copied through verbatim
  property("escape starts with prefix") {
    forAll { (prefix: String, content: String) =>
      val escaped = Idents.escape(prefix, content)
      assert(escaped.startsWith(prefix))
    }
  }

  // everything after the prefix is a letter, a digit or an underscore
  property("escape creates validIdentChars") {
    forAll { (prefix: String, content: String) =>
      val body = Idents.escape(prefix, content).substring(prefix.length)
      assert(body.forall(validIdentChars.contains))
    }
  }

  // already valid content only has its underscores doubled, anything else
  // has to grow
  property("valid strings are escaped with identity") {
    forAll { (prefix: String, content: String) =>
      val escaped = Idents.escape(prefix, content)
      val plainLength = prefix.length + content.length

      if (content.forall(validIdentChars.contains))
        assertEquals(escaped, prefix + content.replace("_", "__"))
      else
        assert(escaped.length > plainLength)
    }
  }

  test("some examples") {
    val escaped = Idents.escape("foo", "bar_baz")
    assertEquals(escaped, "foobar__baz")
  }
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.bykn.bosatsu.codegen.python

import org.bykn.bosatsu.Identifier.{Bindable, unsafeBindable}
import org.bykn.bosatsu.Generators.bindIdentGen
import org.scalatestplus.scalacheck.ScalaCheckPropertyChecks.{
forAll,
Expand All @@ -14,25 +13,6 @@ class PythonGenTest extends AnyFunSuite {
PropertyCheckConfiguration(minSuccessful = 5000)
// PropertyCheckConfiguration(minSuccessful = 500)

// Escaping a Bindable and then unescaping the resulting identifier must give
// back a Bindable with the same string form.
test("PythonGen.escape round trips") {

// the round trip law for a single Bindable
def law(b: Bindable) = {
val ident = PythonGen.escape(b)
PythonGen.unescape(ident) match {
case Some(b1) => assert(b1.asString == b.asString)
case None => assert(false, s"$b => $ident could not round trip")
}
}

// check the law on generated identifiers
forAll(bindIdentGen)(law(_))

// and on a couple of hand picked backticked names
val examples: List[Bindable] =
List("`12 =_=`", "`N`").map(unsafeBindable)

examples.foreach(law(_))

}

val PythonName = "[_A-Za-z][_A-Za-z0-9]*".r.pattern

test("all escapes are valid python identifiers") {
Expand All @@ -46,20 +26,6 @@ class PythonGenTest extends AnyFunSuite {
}
}

// For arbitrary strings that match Code.python2Name: whenever unescape
// succeeds, escaping the result must reproduce the original identifier.
test("if unescape works, escape would round trip") {
forAll { (s: String) =>
// strings that do not match the pattern are skipped
if (Code.python2Name.matcher(s).matches) {
val ident = Code.Ident(s)
PythonGen.unescape(ident) match {
case Some(b) =>
assert(PythonGen.escape(b) == ident)
case None =>
// nothing to check when the identifier cannot be unescaped
()
}
}
}
}

test("we can parse an example externals file") {
val extstr = """
{ IO: { foo: bar.baz, quux: quux.quux_impl }, Bop: { foo: collections.queue } }
Expand Down

0 comments on commit be453d5

Please sign in to comment.