Skip to content

Commit

Permalink
Implement Unicode Case Folding
Browse files Browse the repository at this point in the history
This commit adds CaseFoldedString as a partner to CIString. A CaseFoldedString is case folded according to the Unicode rules for Caseless Matching. In contrast to CIString, it does _not_ keep a reference to the input `String`.

This commit changes CIString to be based on CaseFoldedString.
  • Loading branch information
isomarcte committed Feb 6, 2022
1 parent 3f8199b commit b376e37
Show file tree
Hide file tree
Showing 9 changed files with 2,128 additions and 43 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package org.typelevel.ci
package bench

import org.scalacheck._
import org.typelevel.ci.testing.arbitraries._
import cats._
import org.openjdk.jmh.annotations._
import java.util.concurrent.TimeUnit

/** JMH benchmarks comparing `CaseFoldedString` against plain `String` for
  * hashing and for monoidal folding over a `List`.
  *
  * Inputs are produced by ScalaCheck generators inside the benchmark methods
  * themselves, so every measurement includes the cost of generating its input.
  */
@State(Scope.Thread)
@BenchmarkMode(Array(Mode.Throughput, Mode.AverageTime))
@OutputTimeUnit(TimeUnit.MILLISECONDS)
class CaseFoldedStringBench {

  /** Seed that the next generator run will use. */
  var currentSeed: Long = Long.MinValue

  /** Returns the current seed and advances the counter by one. */
  def nextSeed: Long = {
    val issued = currentSeed
    currentSeed = issued + 1L
    issued
  }

  /** Runs `gen` once with the default parameters and a fresh seed, failing
    * with an `AssertionError` when the generator yields no value.
    */
  private def generate[A](gen: Gen[A]): A =
    gen
      .apply(Gen.Parameters.default, rng.Seed(nextSeed))
      .getOrElse(throw new AssertionError("Failed to generate String."))

  /** A freshly generated `String`; each call consumes one seed. */
  def nextString: String =
    generate(Arbitrary.arbitrary[String])

  /** A freshly generated `List[String]`; each call consumes one seed. */
  def nextListOfString: List[String] =
    generate(Gen.listOf(Arbitrary.arbitrary[String]))

  @Benchmark
  def caseFoldedStringHash: Int = {
    val folded = CaseFoldedString(nextString)
    folded.hashCode
  }

  @Benchmark
  def caseFoldedStringFoldMap: CaseFoldedString = {
    val inputs = nextListOfString
    Foldable[List].foldMap(inputs)(s => CaseFoldedString(s))
  }

  @Benchmark
  def stringHash: Int = {
    val raw = nextString
    raw.hashCode
  }

  @Benchmark
  def stringFoldMap: String = {
    val inputs = nextListOfString
    Foldable[List].foldMap(inputs)(identity)
  }
}
7 changes: 5 additions & 2 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,12 @@ lazy val bench = project
.enablePlugins(NoPublishPlugin)
.enablePlugins(JmhPlugin)
.settings(
name := "case-insensitive-bench"
name := "case-insensitive-bench",
libraryDependencies ++= List(
"org.scalacheck" %% "scalacheck" % scalacheckV
)
)
.dependsOn(core.jvm)
.dependsOn(core.jvm, testing.jvm)

lazy val docs = project
.in(file("site"))
Expand Down
66 changes: 34 additions & 32 deletions core/src/main/scala/org/typelevel/ci/CIString.scala
Original file line number Diff line number Diff line change
Expand Up @@ -24,52 +24,46 @@ import scala.math.Ordered

/** A case-insensitive String.
*
* Two CI strings are equal if and only if they are the same length, and each corresponding
* character is equal after calling either `toUpper` or `toLower`.
* Comparisons are based on the case folded representation of the `String`
* as defined by the Unicode standard. See [[CaseFoldedString]] for a full
* discussion on those rules.
*
* Ordering is based on a string comparison after folding each character to uppercase and then back
* to lowercase.
*
* All comparisons are insensitive to locales.
* @note This class differs from [[CaseFoldedString]] in that it keeps a
* reference to the original input `String` in whatever form it was
* given. This makes [[CIString]] useful if you wish to perform case
* insensitive operations on a `String`, but then recover the original,
* unaltered form. If you do not care about the original input form,
* and just want a single case insensitive `String` value, then
* [[CaseFoldedString]] is more efficient and you should consider using
* that directly.
*
* @param toString
* The original value the CI String was constructed with.
*/
final class CIString private (override val toString: String)
final class CIString private (override val toString: String, val asCaseFoldedString: CaseFoldedString)
extends Ordered[CIString]
with Serializable {

@deprecated(message = "Please provide a CaseFoldedString directly.", since = "1.3.0")
private def this(toString: String) = {
this(toString, CaseFoldedString(toString))
}

override def equals(that: Any): Boolean =
that match {
case that: CIString =>
this.toString.equalsIgnoreCase(that.toString)
// Note java.lang.String.equalsIgnoreCase _does not_ handle all title
// case unicode characters, so we can't use it here. See the tests for
// an example.
this.asCaseFoldedString == that.asCaseFoldedString
case _ => false
}

@transient private[this] var hash = 0
override def hashCode(): Int = {
if (hash == 0)
hash = calculateHash
hash
}

private[this] def calculateHash: Int = {
var h = 17
var i = 0
val len = toString.length
while (i < len) {
// Strings are equal ignoring case if either their uppercase or lowercase
// forms are equal. Equality of one does not imply the other, so we need
// to go in both directions. A character is not guaranteed to make this
// round trip, but it doesn't matter as long as all equal characters
// hash the same.
h = h * 31 + toString.charAt(i).toUpper.toLower
i += 1
}
h
}
override def hashCode(): Int =
asCaseFoldedString.hashCode

override def compare(that: CIString): Int =
this.toString.compareToIgnoreCase(that.toString)
asCaseFoldedString.compare(that.asCaseFoldedString)

def transform(f: String => String): CIString = CIString(f(toString))

Expand All @@ -87,7 +81,15 @@ final class CIString private (override val toString: String)

@suppressUnusedImportWarningForCompat
object CIString {
def apply(value: String): CIString = new CIString(value)

def apply(value: String, useTurkicFolding: Boolean): CIString =
new CIString(value, CaseFoldedString(value, useTurkicFolding))

def apply(value: String): CIString =
apply(value, false)

def fromCaseFoldedString(value: CaseFoldedString): CIString =
new CIString(value.toString, value)

val empty = CIString("")

Expand Down
161 changes: 161 additions & 0 deletions core/src/main/scala/org/typelevel/ci/CaseFoldedString.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
package org.typelevel.ci

import cats._
import cats.kernel.LowerBounded
import org.typelevel.ci.compat._
import scala.annotation.tailrec

/** A case folded `String`. This is a `String` which has been converted into a
* state which is suitable for case insensitive matching under the Unicode
* standard.
*
* This type differs from [[CIString]] in that it does ''not'' retain the
* original input `String` value. That is, this is a destructive
* transformation. You should use [[CaseFoldedString]] instead of
* [[CIString]] when you only want the case insensitive `String` and you
* never want to return the `String` back into the input value. In such cases
* [[CaseFoldedString]] will be more efficient than [[CIString]] as it only
* has to keep around a single `String` in memory.
*
* Case insensitive `String` values under Unicode are not always intuitive,
* especially on the JVM. There are three character cases to consider, lower
* case, upper case, and title case, and not all Unicode codePoints have all
* 3, some only have 2, some only 1. For some codePoints, the JRE standard
* operations don't always work as you'd expect.
*
* {{{
* scala> val codePoint: Int = 8093
* val codePoint: Int = 8093
*
* scala> new String(Character.toChars(codePoint))
* val res0: String = ᾝ
*
* scala> res0.toUpperCase
* val res1: String = ἭΙ
*
* scala> res0.toUpperCase.toLowerCase == res0.toLowerCase
* val res2: Boolean = false
*
* scala> Character.getName(res0.head)
* val res3: String = GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
*
* scala> res0.toUpperCase.toLowerCase.equalsIgnoreCase(res0.toLowerCase)
* val res4: Boolean = false
* }}}
*
* In this example, given the Unicode character \u1f9d, converting it to
* upper case, then to lower case, is not equal under normal String
* equality. `String.equalsIgnoreCase` also does not work correctly by the
* Unicode standard.
*
* Making matters more complicated, for certain Turkic languages, the case
* folding rules change. See the Unicode standard for a full discussion of
* the topic.
*
* @note For most `String` values the `toString` form of this is lower case
* (when the given character has more than one case), but this is not
* always the case. Certain Unicode scripts have exceptions to this and
* will be case folded into upper case. If you want/need an only lower
* case `String`, you should call `.toString.toLowerCase`.
*
* @see [[https://www.unicode.org/versions/Unicode14.0.0/ch05.pdf#G21790]]
*/
final case class CaseFoldedString private (override val toString: String) extends AnyVal {

  /** `true` when the case folded text contains no characters. */
  def isEmpty: Boolean = toString.isEmpty

  /** `true` when the case folded text contains at least one character. */
  def nonEmpty: Boolean = !isEmpty

  /** Number of `Char` values in the case folded text. */
  def length: Int = toString.length

  /** Alias for `length`. */
  def size: Int = length

  /** Compare the case folded text of this value with that of `that`.
    *
    * Callers such as `CIString.compare` and the `Order` instance in the
    * companion invoke `compare` on a [[CaseFoldedString]], so it is provided
    * here as a real member.
    *
    * @param that the value to compare against.
    * @return the result of `String.compareTo` on the two case folded texts: a
    *         negative number, zero, or a positive number when this value
    *         sorts before, equal to, or after `that`.
    */
  def compare(that: CaseFoldedString): Int =
    toString.compareTo(that.toString)

  /** This value with leading and trailing whitespace removed, as defined by
    * `String.trim`.
    */
  def trim: CaseFoldedString =
    // The text held here is already case folded and `String.trim` only drops
    // characters from either end, so the result is wrapped directly. Going
    // through the companion `apply` would run the whole fold again, and would
    // do so with the default (non-Turkic) rules regardless of how this value
    // was originally created.
    new CaseFoldedString(toString.trim)

  // Declared so that no public `copy` is synthesized for this case class: the
  // only way to obtain an instance from an arbitrary `String` is the
  // companion's `apply`, which performs the case fold.
  private final def copy(toString: String): CaseFoldedString =
    CaseFoldedString(toString)
}

object CaseFoldedString {

  /** Create a [[CaseFoldedString]] from a `String`.
    *
    * The input is read one Unicode code point at a time (a surrogate pair is
    * read as a single code point) and each code point is replaced by the code
    * points returned for it by the selected `CaseFolds` lookup.
    *
    * @param value the `String` to case fold.
    * @param turkicFoldingRules if `true`, use the case folding rules
    *                           applicable to some Turkic languages.
    */
  def apply(value: String, turkicFoldingRules: Boolean): CaseFoldedString = {
    // Room for three output chars per input char; presumably sized for the
    // largest expansion a single fold can produce -- TODO confirm against
    // the CaseFolds tables.
    val builder: java.lang.StringBuilder = new java.lang.StringBuilder(value.length * 3)
    val foldCodePoint: Int => Array[Int] =
      if (turkicFoldingRules) {
        CaseFolds.turkicFullCaseFoldedCodePoints
      } else {
        CaseFolds.fullCaseFoldedCodePoints
      }

    @tailrec
    def loop(index: Int): String =
      if (index >= value.length) {
        builder.toString
      } else {
        val codePoint: Int = value.codePointAt(index)
        foldCodePoint(codePoint).foreach(c => builder.appendCodePoint(c))
        // Advance by the number of chars the code point occupies in the
        // input: 2 for a supplementary code point, 1 otherwise.
        loop(index + Character.charCount(codePoint))
      }

    new CaseFoldedString(loop(0))
  }

  /** Create a [[CaseFoldedString]] from a `String`.
    *
    * @note This factory method does ''not'' use the Turkic case folding
    *       rules. For the majority of languages this is the correct method of
    *       case folding. If you know your `String` is specific to one of the
    *       Turkic languages which use special case folding rules, you can use
    *       the secondary factory method to enable case folding under those
    *       rules.
    *
    * @param value the `String` to case fold.
    */
  def apply(value: String): CaseFoldedString =
    apply(value, false)

  /** The [[CaseFoldedString]] built from the empty `String`. */
  val empty: CaseFoldedString =
    CaseFoldedString("")

  /** Hashing and total ordering, both defined on the case folded text. */
  implicit val hashAndOrderForCaseFoldedString: Hash[CaseFoldedString] with Order[CaseFoldedString] =
    new Hash[CaseFoldedString] with Order[CaseFoldedString] {
      override def hash(x: CaseFoldedString): Int =
        x.hashCode

      // Compare the underlying strings directly. CaseFoldedString itself
      // declares no `compare` member, so `x.compare(y)` could only resolve
      // through ordering syntax, which would be backed by this very instance.
      override def compare(x: CaseFoldedString, y: CaseFoldedString): Int =
        x.toString.compareTo(y.toString)
    }

  /** Standard library `Ordering` derived from the `Order` instance above. */
  implicit val orderingForCaseFoldedString: Ordering[CaseFoldedString] =
    hashAndOrderForCaseFoldedString.toOrdering

  /** `Show` that renders the case folded text via `toString`. */
  implicit val showForCaseFoldedString: Show[CaseFoldedString] =
    Show.fromToString

  /** Lower bound instance whose minimum is [[empty]]. */
  implicit val lowerBoundForCaseFoldedString: LowerBounded[CaseFoldedString] =
    new LowerBounded[CaseFoldedString] {
      override val partialOrder: PartialOrder[CaseFoldedString] =
        hashAndOrderForCaseFoldedString

      override val minBound: CaseFoldedString =
        empty
    }

  /** Monoid that concatenates the case folded text, with [[empty]] as the
    * identity.
    */
  implicit val monoidForCaseFoldedString: Monoid[CaseFoldedString] =
    new Monoid[CaseFoldedString] {
      override val empty: CaseFoldedString = CaseFoldedString.empty

      // NOTE(review): the two folded texts are joined without folding again.
      // If `x` ends with an unpaired high surrogate and `y` starts with an
      // unpaired low surrogate, the join forms a code point that was never
      // passed through the fold -- confirm whether that case needs handling.
      override def combine(x: CaseFoldedString, y: CaseFoldedString): CaseFoldedString =
        new CaseFoldedString(x.toString + y.toString)

      // Accumulates into a single builder instead of repeated pairwise
      // concatenation.
      override def combineAll(xs: IterableOnce[CaseFoldedString]): CaseFoldedString = {
        val sb: StringBuilder = new StringBuilder
        xs.iterator.foreach(cfs => sb.append(cfs.toString))
        new CaseFoldedString(sb.toString)
      }
    }
}
Loading

0 comments on commit b376e37

Please sign in to comment.