Skip to content

Commit

Permalink
#107: Simplify UnicodeDecoder.decode()
Browse files: browse the repository at this point in the history
  • Loading branch information
pathikrit committed Feb 14, 2017
1 parent 4307856 commit 14276ac
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 25 deletions.
27 changes: 9 additions & 18 deletions core/src/main/scala/better/files/UnicodeDecoder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -14,42 +14,33 @@ import java.nio.{ByteBuffer, CharBuffer}
class UnicodeDecoder(defaultCharset: Charset) extends CharsetDecoder(null, 1, 1) {
import UnicodeDecoder._

private var inferredCharset: Option[Charset] = None
private[this] var inferredCharset: Option[Charset] = None

// Overrides CharsetDecoder.decodeLoop: hands `in` and `out` to decode(), starting with every charset
// listed in bomTable as a BOM candidate.
// NOTE(review): this span is a flattened diff. The multi-line call passing `header` appears to be the
// removed version and the single-line call the added one — confirm against the repository.
override def decodeLoop(in: ByteBuffer, out: CharBuffer) =
decode(
in = in,
out = out,
candidates = bomTable.keys.toList,
header = ByteBuffer.allocate(4).mark().asInstanceOf[ByteBuffer]
)
decode(in = in, out = out, candidates = bomTable.keys.toList)

// Recursively sniffs a byte-order mark (BOM) from `in`, then decodes `in` into `out`.
// `candidates` holds the charsets whose BOM still matches every byte consumed so far.
// NOTE(review): this span is a flattened diff, so two versions of the method are interleaved. Lines that
// mention `header` appear to belong to the removed 4-argument version — confirm against the repository.
@annotation.tailrec
private def decode(in: ByteBuffer, out: CharBuffer, candidates: List[Charset], header: ByteBuffer): CoderResult = {
private[this] def decode(in: ByteBuffer, out: CharBuffer, candidates: List[Charset]): CoderResult = {
if (isCharsetDetected) {
// A charset has been settled on: decode using a fresh decoder obtained from it.
// NOTE(review): isCharsetDetected / detectedCharset() are defined outside this view; presumably they
// reflect inferredCharset — verify.
val decoder = detectedCharset().newDecoder()
val hasMore = in.remaining() > 0
if (header.remaining() > 0) decoder.decode(header, out, !hasMore)
decoder.decode(in, out, true)
detectedCharset().newDecoder().decode(in, out, true)
} else if (candidates.isEmpty || in.remaining() <= 0) {
// No candidate is left, or the input ran out before a BOM was completed: fall back to defaultCharset,
// rewind what was consumed while sniffing, and recurse.
inferredCharset = Some(defaultCharset)
header.reset()
decode(in, out, candidates, header)
// NOTE(review): position(0) rewinds to absolute index 0, and `idx` below is taken from the absolute
// buffer position; both seem to assume the input starts at position 0 — confirm with callers.
in.position(0)
decode(in, out, Nil)
} else {
// Consume one byte and narrow the candidates to those whose BOM has that byte at index idx.
val idx = in.position()
val byte = in.get()
val idx = header.position()
header.put(byte)
val newCandidates = candidates filter {charset =>
val bom = bomTable(charset)
bom.isDefinedAt(idx) && bom(idx) == byte
}
newCandidates match {
// Exactly one candidate remains and idx is the last index of its BOM: the whole BOM matched,
// so record that charset as the inferred one.
case charset :: Nil if bomTable(charset).length == idx + 1 =>
inferredCharset = Some(charset)
header.limit(idx)
in.position(idx + 1)
// Otherwise nothing is decided yet; keep scanning with the narrowed list.
case _ =>
}
decode(in, out, newCandidates, header)
decode(in, out, newCandidates)
}
}

Expand Down
14 changes: 7 additions & 7 deletions core/src/test/scala/better/files/FileSpec.scala
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,13 @@ class FileSpec extends FlatSpec with BeforeAndAfterEach with Matchers {
.lines.toSeq should contain theSameElementsInOrderAs Seq("", "My name is", "Inigo Montoya", "x", "1")
}

// Reads the resource file_with_bom.txt two ways: without an explicit charset the content must differ
// from expectedContent, and with UnicodeDecoder("UTF-8") it must equal expectedContent.
it should "handle BOM" in {
val file = File.resource("file_with_bom.txt")
val expectedContent = "I contain an offending UTF-8 BOM\n"
// Presumably differs because the leading BOM survives in the string — TODO confirm.
file.contentAsString should not equal expectedContent
file.contentAsString(charset = UnicodeDecoder("UTF-8")) shouldEqual expectedContent
}

it should "glob" in {
a1.glob("**/*.txt").map(_.name).toSeq.sorted shouldEqual Seq("t1.txt", "t2.txt")
//a1.glob("*.txt").map(_.name).toSeq shouldEqual Seq("t1.txt", "t2.txt")
Expand Down Expand Up @@ -495,13 +502,6 @@ class FileSpec extends FlatSpec with BeforeAndAfterEach with Matchers {
Seq.fill(2)(scanner.next[Animal]) should contain theSameElementsInOrderAs Seq(Cat("Garfield"), Dog("Woofer"))
}

// Reads the resource file_with_bom.txt two ways: without an explicit charset the content must differ
// from expectedContent, and with UnicodeDecoder("UTF-8") it must equal expectedContent.
it should "handle BOM" in {
val file = File.resource("file_with_bom.txt")
val expectedContent = "I contain an offending UTF-8 BOM\n"
// Presumably differs because the leading BOM survives in the string — TODO confirm.
file.contentAsString should not equal expectedContent
file.contentAsString(charset = UnicodeDecoder("UTF-8")) shouldEqual expectedContent
}

"file watcher" should "watch single files" in {
assume(isCI)
val file = File.newTemporaryFile(suffix = ".txt").writeText("Hello world")
Expand Down

0 comments on commit 14276ac

Please sign in to comment.