Skip to content

Commit

Permalink
Improve testing and maybe efficiency of string matching (#1054)
Browse files Browse the repository at this point in the history
* Improve testing and maybe efficiency of string matching

* fix python string match generation

* remove unreachable code
  • Loading branch information
johnynek authored Oct 10, 2023
1 parent 8bf939b commit 334e1ed
Show file tree
Hide file tree
Showing 4 changed files with 176 additions and 99 deletions.
49 changes: 49 additions & 0 deletions core/src/main/scala/org/bykn/bosatsu/Matchless.scala
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,55 @@ object Matchless {
case object WildChar extends CharPart(false)
case object IndexChar extends CharPart(true)
case class LitStr(asString: String) extends StrPart

sealed abstract class MatchSize(val isExact: Boolean) {
def charCount: Int
def canMatch(cp: Int): Boolean
// we know chars/2 <= cpCount <= chars for utf16
def canMatchUtf16Count(chars: Int): Boolean
}
object MatchSize {
case class Exactly(charCount: Int) extends MatchSize(true) {
def canMatch(cp: Int): Boolean = cp == charCount
def canMatchUtf16Count(chars: Int): Boolean = {
val cpmin = chars / 2
val cpmax = chars
(cpmin <= charCount) && (charCount <= cpmax)
}
}
case class AtLeast(charCount: Int) extends MatchSize(false) {
def canMatch(cp: Int): Boolean = charCount <= cp
def canMatchUtf16Count(chars: Int): Boolean = {
val cpmax = chars
// we have any cp in [cpmin, cpmax]
// but we require charCount <= cp
(charCount <= cpmax)
}
}

private val atLeast0 = AtLeast(0)
private val exactly0 = Exactly(0)
private val exactly1 = Exactly(1)

def from(sp: StrPart): MatchSize =
sp match {
case _: Glob => atLeast0
case _: CharPart => exactly1
case LitStr(str) =>
Exactly(str.codePointCount(0, str.length))
}

def apply[F[_]: cats.Foldable](f: F[StrPart]): MatchSize =
cats.Foldable[F].foldMap(f)(from)

implicit val monoidMatchSize: Monoid[MatchSize] =
new Monoid[MatchSize] {
def empty: MatchSize = exactly0
def combine(l: MatchSize, r: MatchSize) =
if (l.isExact && r.isExact) Exactly(l.charCount + r.charCount)
else AtLeast(l.charCount + r.charCount)
}
}
}

// we should probably allocate static slots for each bindable,
Expand Down
195 changes: 103 additions & 92 deletions core/src/main/scala/org/bykn/bosatsu/MatchlessToValue.scala
Original file line number Diff line number Diff line change
Expand Up @@ -528,108 +528,119 @@ object MatchlessToValue {
Static(c)
}

private[this] val emptyStringArray: Array[String] = new Array[String](0)

def matchString(str: String, pat: List[Matchless.StrPart], binds: Int): Array[String] = {
import Matchless.StrPart._

val strLen = str.length()
val results = if (binds > 0) new Array[String](binds) else emptyStringArray

def loop(offset: Int, pat: List[Matchless.StrPart], next: Int): Boolean =
pat match {
case Nil => offset == strLen
case LitStr(expect) :: tail =>
val len = expect.length
str.regionMatches(offset, expect, 0, len) && loop(offset + len, tail, next)
case (c: CharPart) :: tail =>
try {
val nextOffset = str.offsetByCodePoints(offset, 1)
val n =
if (c.capture) {
results(next) = str.substring(offset, nextOffset)
next + 1
}
else next
}

private[this] val emptyStringArray: Array[String] = new Array[String](0)
def matchString(
str: String,
pat: List[StrPart],
binds: Int): Array[String] = {
import Matchless.StrPart._

val strLen = str.length()
val results = if (binds > 0) new Array[String](binds) else emptyStringArray

def loop(offset: Int, pat: List[StrPart], next: Int): Boolean =
pat match {
case Nil => offset == strLen
case LitStr(expect) :: tail =>
val len = expect.length
str.regionMatches(offset, expect, 0, len) && loop(offset + len, tail, next)
case (c: CharPart) :: tail =>
try {
val nextOffset = str.offsetByCodePoints(offset, 1)
val n =
if (c.capture) {
results(next) = str.substring(offset, nextOffset)
next + 1
}
else next

loop(nextOffset, tail, n)
}
catch {
case _: IndexOutOfBoundsException => false
}
case (h: Glob) :: tail =>
tail match {
case Nil =>
// we capture all the rest
if (h.capture) {
results(next) = str.substring(offset)
}
true
case rest @ ((_: CharPart) :: _) =>
// (.*)(.)tail2
// this is a naive algorithm that just
// checks at all possible later offsets
// a smarter algorithm could see if there
// are Lit parts that can match or not
var matched = false
var off1 = offset
val n1 = if (h.capture) (next + 1) else next
while (!matched && (off1 < strLen)) {
matched = loop(off1, rest, n1)
if (!matched) {
off1 = off1 + 1
}
loop(nextOffset, tail, n)
}
catch {
case _: IndexOutOfBoundsException => false
}
case (h: Glob) :: tail =>
tail match {
case Nil =>
// we capture all the rest
if (h.capture) {
results(next) = str.substring(offset)
}
true
case rest @ ((_: CharPart) :: _) =>
val matchableSizes = MatchSize[List](rest)

def canMatch(off: Int): Boolean =
matchableSizes.canMatch(str.codePointCount(off, strLen))

// (.*)(.)tail2
// this is a naive algorithm that just
// checks at all possible later offsets
// a smarter algorithm could see if there
// are Lit parts that can match or not
var matched = false
var off1 = offset
val n1 = if (h.capture) (next + 1) else next
while (!matched && (off1 < strLen)) {
matched = canMatch(off1) && loop(off1, rest, n1)
if (!matched) {
off1 = off1 + Character.charCount(str.codePointAt(off1))
}
}

matched && {
if (h.capture) {
results(next) = str.substring(offset, off1)
}
true
matched && {
if (h.capture) {
results(next) = str.substring(offset, off1)
}
case LitStr(expect) :: tail2 =>
val next1 = if (h.capture) next + 1 else next

var start = offset
var result = false
while (start >= 0) {
val candidate = str.indexOf(expect, start)
if (candidate >= 0) {
// we have to skip the current expect string
val check1 = loop(candidate + expect.length, tail2, next1)
if (check1) {
// this was a match, write into next if needed
if (h.capture) {
results(next) = str.substring(offset, candidate)
}
result = true
start = -1
}
else {
// we couldn't match here, try just after candidate
start = candidate + 1
true
}
case LitStr(expect) :: tail2 =>
val next1 = if (h.capture) next + 1 else next

val matchableSizes = MatchSize(tail2)

def canMatch(off: Int): Boolean =
matchableSizes.canMatchUtf16Count(strLen - off)

var start = offset
var result = false
while (start >= 0) {
val candidate = str.indexOf(expect, start)
if (candidate >= 0) {
// we have to skip the current expect string
val nextOff = candidate + expect.length
val check1 = canMatch(nextOff) && loop(nextOff, tail2, next1)
if (check1) {
// this was a match, write into next if needed
if (h.capture) {
results(next) = str.substring(offset, candidate)
}
result = true
start = -1
}
else {
// no more candidates
start = -1
// we couldn't match here, try just after candidate
start = candidate + Character.charCount(str.codePointAt(candidate))
}
}
result
case (_: Glob) :: _ =>
val n1 = if (h.capture) {
// this index gets the empty string
results(next) = ""
// we match the right side first, so this wild gets nothing
next + 1
} else next

loop(offset, tail, n1)
}
}
else {
// no more candidates
start = -1
}
}
result
// $COVERAGE-OFF$
case (_: Glob) :: _ =>
// this should be an error at compile time since it
// is never meaningful to have two adjacent globs
sys.error(s"invariant violation, adjacent globs: $pat")
// $COVERAGE-ON$
}
}

if (loop(0, pat, 0)) results else null
}
if (loop(0, pat, 0)) results else null
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1149,7 +1149,7 @@ object PythonGen {
val regionMatches = strEx.dot(Code.Ident("startswith"))(expect, offsetIdent)
val rest =
(
offsetIdent := offsetIdent + expect.length
offsetIdent := offsetIdent + expect.codePointCount(0, expect.length)
).withValue(loopRes)

Env.andCode(regionMatches, rest)
Expand Down Expand Up @@ -1251,7 +1251,7 @@ object PythonGen {
candidate :> -1,
// update candidate and search
Code.block(
candOffset := candidate + expect.length,
candOffset := candidate + expect.codePointCount(0, expect.length),
onS)
,
// else no more candidates
Expand Down Expand Up @@ -1291,13 +1291,14 @@ object PythonGen {
)
).withValue(matched)

capture = Code.block(
bindArray(next) := Code.SelectRange(strEx, Some(offsetIdent), Some(off1))
).withValue(true)

fullMatch <-
if (!h.capture) Monad[Env].pure(matchStmt)
else Env.andCode(matchStmt, capture)
else {
val capture = Code.block(
bindArray(next) := Code.SelectRange(strEx, Some(offsetIdent), Some(off1))
).withValue(true)
Env.andCode(matchStmt, capture)
}

} yield fullMatch
// $COVERAGE-OFF$
Expand Down
16 changes: 16 additions & 0 deletions test_workspace/Char.bosatsu
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ str_to_char_tests = TestSuite("string_to_Char",
Assertion(string_to_Char("s") matches Some(.'s'), "s"),
Assertion(string_to_Char("") matches None, "empty"),
Assertion(string_to_Char("foo") matches None, "foo"),
# test a 2 word character
Assertion(string_to_Char("👋") matches Some(.'👋'), "foo"),
]
)

Expand All @@ -26,6 +28,7 @@ len_test = TestSuite("len tests",
Assertion(length_String("") matches 0, "empty"),
Assertion(length_String("x") matches 1, "x"),
Assertion(length_String("hello") matches 5, "hello"),
Assertion(length_String("👋") matches 1, "👋"),
]
)

Expand All @@ -40,13 +43,26 @@ last_tests = TestSuite(
Assertion(last_String("") matches None, "empty"),
Assertion(last_String("x") matches Some(.'x'), "x"),
Assertion(last_String("1234") matches Some(.'4'), "1234"),
Assertion(last_String("👋") matches Some(.'👋'), "👋"),
]
)

match_tests = TestSuite("match tests",
[
Assertion("👋👋👋" matches "👋$.{_}👋", "test matching 1"),
Assertion("abc👋👋👋" matches "${_}👋$.{_}👋", "test matching 2"),
Assertion("abc👋👋👋" matches "abc$.{_}👋${_}", "test matching 2.5"),
Assertion("abc👋👋👋" matches "abc👋${_}", "test matching 3"),
Assertion("abc👋👋👋" matches "${_}c👋${_}", "test matching 4"),
Assertion("abc👋👋👋" matches "${_}c👋${_}$.{_}", "test matching 5"),
Assertion("abc👋👋👋" matches "${_}👋", "test matching 6"),
])

tests = TestSuite("Char tests",
[
str_to_char_tests,
len_test,
last_tests,
match_tests,
]
)

0 comments on commit 334e1ed

Please sign in to comment.