From 334e1edf285da83446bdc7a98992a76cbb54056b Mon Sep 17 00:00:00 2001 From: "P. Oscar Boykin" Date: Mon, 9 Oct 2023 14:29:22 -1000 Subject: [PATCH] Improve testing and maybe efficiency of string matching (#1054) * Improve testing and maybe efficiency of string matching * fix python string match generation * remove unreachable code --- .../scala/org/bykn/bosatsu/Matchless.scala | 49 +++++ .../org/bykn/bosatsu/MatchlessToValue.scala | 195 +++++++++--------- .../bosatsu/codegen/python/PythonGen.scala | 15 +- test_workspace/Char.bosatsu | 16 ++ 4 files changed, 176 insertions(+), 99 deletions(-) diff --git a/core/src/main/scala/org/bykn/bosatsu/Matchless.scala b/core/src/main/scala/org/bykn/bosatsu/Matchless.scala index f4a2f94e9..dd0f37202 100644 --- a/core/src/main/scala/org/bykn/bosatsu/Matchless.scala +++ b/core/src/main/scala/org/bykn/bosatsu/Matchless.scala @@ -24,6 +24,55 @@ object Matchless { case object WildChar extends CharPart(false) case object IndexChar extends CharPart(true) case class LitStr(asString: String) extends StrPart + + sealed abstract class MatchSize(val isExact: Boolean) { + def charCount: Int + def canMatch(cp: Int): Boolean + // we know chars/2 <= cpCount <= chars for utf16 + def canMatchUtf16Count(chars: Int): Boolean + } + object MatchSize { + case class Exactly(charCount: Int) extends MatchSize(true) { + def canMatch(cp: Int): Boolean = cp == charCount + def canMatchUtf16Count(chars: Int): Boolean = { + val cpmin = chars / 2 + val cpmax = chars + (cpmin <= charCount) && (charCount <= cpmax) + } + } + case class AtLeast(charCount: Int) extends MatchSize(false) { + def canMatch(cp: Int): Boolean = charCount <= cp + def canMatchUtf16Count(chars: Int): Boolean = { + val cpmax = chars + // we have any cp in [cpmin, cpmax] + // but we require charCount <= cp + (charCount <= cpmax) + } + } + + private val atLeast0 = AtLeast(0) + private val exactly0 = Exactly(0) + private val exactly1 = Exactly(1) + + def from(sp: StrPart): MatchSize = + sp match { + case _: Glob => atLeast0 + case _: CharPart => exactly1 + case LitStr(str) => + Exactly(str.codePointCount(0, str.length)) + } + + def apply[F[_]: cats.Foldable](f: F[StrPart]): MatchSize = + cats.Foldable[F].foldMap(f)(from) + + implicit val monoidMatchSize: Monoid[MatchSize] = + new Monoid[MatchSize] { + def empty: MatchSize = exactly0 + def combine(l: MatchSize, r: MatchSize) = + if (l.isExact && r.isExact) Exactly(l.charCount + r.charCount) + else AtLeast(l.charCount + r.charCount) + } + } } // we should probably allocate static slots for each bindable, diff --git a/core/src/main/scala/org/bykn/bosatsu/MatchlessToValue.scala b/core/src/main/scala/org/bykn/bosatsu/MatchlessToValue.scala index 30abbbc1e..b6707aa4b 100644 --- a/core/src/main/scala/org/bykn/bosatsu/MatchlessToValue.scala +++ b/core/src/main/scala/org/bykn/bosatsu/MatchlessToValue.scala @@ -528,108 +528,119 @@ object MatchlessToValue { Static(c) } - private[this] val emptyStringArray: Array[String] = new Array[String](0) - - def matchString(str: String, pat: List[Matchless.StrPart], binds: Int): Array[String] = { - import Matchless.StrPart._ - - val strLen = str.length() - val results = if (binds > 0) new Array[String](binds) else emptyStringArray - - def loop(offset: Int, pat: List[Matchless.StrPart], next: Int): Boolean = - pat match { - case Nil => offset == strLen - case LitStr(expect) :: tail => - val len = expect.length - str.regionMatches(offset, expect, 0, len) && loop(offset + len, tail, next) - case (c: CharPart) :: tail => - try { - val nextOffset = str.offsetByCodePoints(offset, 1) - val n = - if (c.capture) { - results(next) = str.substring(offset, nextOffset) - next + 1 - } - else next + } + + private[this] val emptyStringArray: Array[String] = new Array[String](0) + def matchString( + str: String, + pat: List[StrPart], + binds: Int): Array[String] = { + import Matchless.StrPart._ + + val strLen = str.length() + val results = if (binds > 0) new Array[String](binds) else emptyStringArray + + def loop(offset: Int, pat: List[StrPart], next: Int): Boolean = + pat match { + case Nil => offset == strLen + case LitStr(expect) :: tail => + val len = expect.length + str.regionMatches(offset, expect, 0, len) && loop(offset + len, tail, next) + case (c: CharPart) :: tail => + try { + val nextOffset = str.offsetByCodePoints(offset, 1) + val n = + if (c.capture) { + results(next) = str.substring(offset, nextOffset) + next + 1 + } + else next - loop(nextOffset, tail, n) - } - catch { - case _: IndexOutOfBoundsException => false - } - case (h: Glob) :: tail => - tail match { - case Nil => - // we capture all the rest - if (h.capture) { - results(next) = str.substring(offset) - } - true - case rest @ ((_: CharPart) :: _) => - // (.*)(.)tail2 - // this is a naive algorithm that just - // checks at all possible later offsets - // a smarter algorithm could see if there - // are Lit parts that can match or not - var matched = false - var off1 = offset - val n1 = if (h.capture) (next + 1) else next - while (!matched && (off1 < strLen)) { - matched = loop(off1, rest, n1) - if (!matched) { - off1 = off1 + 1 - } + loop(nextOffset, tail, n) + } + catch { + case _: IndexOutOfBoundsException => false + } + case (h: Glob) :: tail => + tail match { + case Nil => + // we capture all the rest + if (h.capture) { + results(next) = str.substring(offset) + } + true + case rest @ ((_: CharPart) :: _) => + val matchableSizes = MatchSize[List](rest) + + def canMatch(off: Int): Boolean = + matchableSizes.canMatch(str.codePointCount(off, strLen)) + + // (.*)(.)tail2 + // this is a naive algorithm that just + // checks at all possible later offsets + // a smarter algorithm could see if there + // are Lit parts that can match or not + var matched = false + var off1 = offset + val n1 = if (h.capture) (next + 1) else next + while (!matched && (off1 < strLen)) { + matched = canMatch(off1) && loop(off1, rest, n1) + if (!matched) { + off1 = off1 + Character.charCount(str.codePointAt(off1)) } + } - matched && { - if (h.capture) { - results(next) = str.substring(offset, off1) - } - true + matched && { + if (h.capture) { + results(next) = str.substring(offset, off1) } - case LitStr(expect) :: tail2 => - val next1 = if (h.capture) next + 1 else next - - var start = offset - var result = false - while (start >= 0) { - val candidate = str.indexOf(expect, start) - if (candidate >= 0) { - // we have to skip the current expect string - val check1 = loop(candidate + expect.length, tail2, next1) - if (check1) { - // this was a match, write into next if needed - if (h.capture) { - results(next) = str.substring(offset, candidate) - } - result = true - start = -1 - } - else { - // we couldn't match here, try just after candidate - start = candidate + 1 + true + } + case LitStr(expect) :: tail2 => + val next1 = if (h.capture) next + 1 else next + + val matchableSizes = MatchSize(tail2) + + def canMatch(off: Int): Boolean = + matchableSizes.canMatchUtf16Count(strLen - off) + + var start = offset + var result = false + while (start >= 0) { + val candidate = str.indexOf(expect, start) + if (candidate >= 0) { + // we have to skip the current expect string + val nextOff = candidate + expect.length + val check1 = canMatch(nextOff) && loop(nextOff, tail2, next1) + if (check1) { + // this was a match, write into next if needed + if (h.capture) { + results(next) = str.substring(offset, candidate) } + result = true + start = -1 } else { - // no more candidates - start = -1 + // we couldn't match here, try just after candidate + start = candidate + Character.charCount(str.codePointAt(candidate)) } } - result - case (_: Glob) :: _ => - val n1 = if (h.capture) { - // this index gets the empty string - results(next) = "" - // we match the right side first, so this wild gets nothing - next + 1 - } else next - - loop(offset, tail, n1) - } - } + else { + // no more candidates + start = -1 + } + } + result + // $COVERAGE-OFF$ + case (_: Glob) :: _ => + // this should be an error at compile time since it + // is never meaningful to have two adjacent globs + sys.error(s"invariant violation, adjacent globs: $pat") + // $COVERAGE-ON$ + } + } - if (loop(0, pat, 0)) results else null - } + if (loop(0, pat, 0)) results else null } } } diff --git a/core/src/main/scala/org/bykn/bosatsu/codegen/python/PythonGen.scala b/core/src/main/scala/org/bykn/bosatsu/codegen/python/PythonGen.scala index 0f3a9390a..4ba25427d 100644 --- a/core/src/main/scala/org/bykn/bosatsu/codegen/python/PythonGen.scala +++ b/core/src/main/scala/org/bykn/bosatsu/codegen/python/PythonGen.scala @@ -1149,7 +1149,7 @@ object PythonGen { val regionMatches = strEx.dot(Code.Ident("startswith"))(expect, offsetIdent) val rest = ( - offsetIdent := offsetIdent + expect.length + offsetIdent := offsetIdent + expect.codePointCount(0, expect.length) ).withValue(loopRes) Env.andCode(regionMatches, rest) @@ -1251,7 +1251,7 @@ object PythonGen { candidate :> -1, // update candidate and search Code.block( - candOffset := candidate + expect.length, + candOffset := candidate + expect.codePointCount(0, expect.length), onS) , // else no more candidates @@ -1291,13 +1291,14 @@ object PythonGen { ) ).withValue(matched) - capture = Code.block( - bindArray(next) := Code.SelectRange(strEx, Some(offsetIdent), Some(off1)) - ).withValue(true) - fullMatch <- if (!h.capture) Monad[Env].pure(matchStmt) - else Env.andCode(matchStmt, capture) + else { + val capture = Code.block( + bindArray(next) := Code.SelectRange(strEx, Some(offsetIdent), Some(off1)) + ).withValue(true) + Env.andCode(matchStmt, capture) + } } yield fullMatch // $COVERAGE-OFF$ diff --git a/test_workspace/Char.bosatsu b/test_workspace/Char.bosatsu index 6a1be0b31..a690e6d4c 100644 --- a/test_workspace/Char.bosatsu +++ b/test_workspace/Char.bosatsu @@ -10,6 +10,8 @@ str_to_char_tests = TestSuite("string_to_Char", Assertion(string_to_Char("s") matches Some(.'s'), "s"), Assertion(string_to_Char("") matches None, "empty"), Assertion(string_to_Char("foo") matches None, "foo"), + # test a 2 word character + Assertion(string_to_Char("👋") matches Some(.'👋'), "foo"), ] ) @@ -26,6 +28,7 @@ len_test = TestSuite("len tests", Assertion(length_String("") matches 0, "empty"), Assertion(length_String("x") matches 1, "x"), Assertion(length_String("hello") matches 5, "hello"), + Assertion(length_String("👋") matches 1, "👋"), ] ) @@ -40,13 +43,26 @@ last_tests = TestSuite( Assertion(last_String("") matches None, "empty"), Assertion(last_String("x") matches Some(.'x'), "x"), Assertion(last_String("1234") matches Some(.'4'), "1234"), + Assertion(last_String("👋") matches Some(.'👋'), "👋"), ] ) +match_tests = TestSuite("match tests", + [ + Assertion("👋👋👋" matches "👋$.{_}👋", "test matching 1"), + Assertion("abc👋👋👋" matches "${_}👋$.{_}👋", "test matching 2"), + Assertion("abc👋👋👋" matches "abc$.{_}👋${_}", "test matching 2.5"), + Assertion("abc👋👋👋" matches "abc👋${_}", "test matching 3"), + Assertion("abc👋👋👋" matches "${_}c👋${_}", "test matching 4"), + Assertion("abc👋👋👋" matches "${_}c👋${_}$.{_}", "test matching 5"), + Assertion("abc👋👋👋" matches "${_}👋", "test matching 6"), + ]) + tests = TestSuite("Char tests", [ str_to_char_tests, len_test, last_tests, + match_tests, ] ) \ No newline at end of file