Skip to content

Commit

Permalink
Merge pull request #108 from /issues/107
Browse files Browse the repository at this point in the history
Fix #107: Support encoding/decoding of Unicode byte-order markers
  • Loading branch information
pathikrit authored Feb 14, 2017
2 parents 555984a + 376bd65 commit e00a817
Show file tree
Hide file tree
Showing 9 changed files with 207 additions and 33 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
## v3.0.0

* [Issue #107](https://github.com/pathikrit/better-files/issues/107): Handle Byte-order markers
* [PR #113](https://github.com/pathikrit/better-files/pull/113): File anchor util
* [Issue #105](https://github.com/pathikrit/better-files/issues/105): Remove dependency on scala.io
* [File.usingTemp](https://github.com/pathikrit/better-files/commit/d3522e8da63b55c7d3fa14cc9b0b76acd57c60ca)
Expand Down
26 changes: 20 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
## Tutorial [![Scaladoc][scaladocImg]][scaladocLink]
0. [Instantiation](#instantiation)
0. [Simple I/O](#file-readwrite)
0. [Streams and encodings](#streams-and-encodings)
0. [Streams](#streams)
0. [Encodings](#encodings)
0. [Java compatibility](#java-interoperability)
0. [Globbing](#globbing)
0. [File system operations](#file-system-operations)
Expand Down Expand Up @@ -166,7 +167,7 @@ val bytes: Array[Byte] = file.loadBytes
.lines
```

### Streams and encodings
### Streams
Various ways to slurp a file without loading the contents into memory:
```scala
val bytes : Iterator[Byte] = file.bytes
Expand All @@ -182,17 +183,30 @@ file.writeBytes(bytes)
file.printLines(lines)
```

You can supply your own encoding too for anything that does a read/write (it assumes `java.nio.charset.Charset.defaultCharset()` if you don't provide one):
### Encodings
You can supply your own charset too for anything that does a read/write (it assumes `java.nio.charset.Charset.defaultCharset()` if you don't provide one):
```scala
val content: String = file.contentAsString // default charset

// custom charset:
import java.nio.charset.Charset
file.content(charset = Charset.forName("US-ASCII"))
file.contentAsString(charset = Charset.forName("US-ASCII"))

//or simply using File.charset util
file.write("hello world")(charset = File.charset("US-ASCII"))
//or simply using implicit conversion from Strings
file.write("hello world")(charset = "US-ASCII")
```

Note: By default, `better-files` [correctly handles BOMs while decoding](core/src/main/scala/better/files/UnicodeCharset.scala).
If you wish to have the [incorrect JDK behaviour](http://bugs.java.com/bugdatabase/view_bug.do?bug_id=4508058),
you would need to supply Java's UTF-8 charset e.g.:
```scala
file.contentAsString(charset = Charset.forName("UTF-8")) // Default incorrect JDK behaviour for UTF-8 (see: JDK-4508058)
```

If you also wish to write BOMs while encoding, you would need to supply it as:
```scala
file.write("hello world")(charset = UnicodeCharset("UTF-8", writeByteOrderMarkers = true))
```

### Java interoperability
You can always access the Java I/O classes:
Expand Down
8 changes: 8 additions & 0 deletions benchmarks/src/test/scala/better/files/Benchmark.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package better.files

/** Base class for micro-benchmarks; provides simple wall-clock profiling. */
abstract class Benchmark extends App {

  /** Evaluates `f` once and returns its result paired with the elapsed time in whole milliseconds. */
  def profile[A](f: => A): (A, Long) = {
    val start = System.nanoTime()
    val result = f
    val elapsedMillis = ((System.nanoTime() - start) / 1e6).toLong
    (result, elapsedMillis)
  }
}
37 changes: 37 additions & 0 deletions benchmarks/src/test/scala/better/files/EncodingBenchmark.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package better.files

import java.nio.charset.Charset

import scala.util.Random

object EncodingBenchmark extends Benchmark {

  /** Writes 1000 random 10000-char lines to `file` with `charset`; returns (unit, elapsed ms). */
  def testWrite(file: File, charset: Charset) = profile {
    for {
      writer <- file.bufferedWriter(charset)
      content <- Iterator.continually(Random.nextString(10000)).take(1000)
    } writer.write(content + "\n")
  }

  /** Reads `file` line by line with `charset`; returns (unit, elapsed ms). */
  def testRead(file: File, charset: Charset) = profile {
    for {
      // Fix: the charset parameter was previously ignored (bufferedReader used the default),
      // so the read benchmark never actually measured the charset under test
      reader <- file.bufferedReader(charset)
      line <- reader.lines().autoClosed
    } line
  }

  /** Runs the write then read benchmark against a temporary file and prints the timings. */
  def test(charset: Charset) = {
    File.usingTemporaryFile() { file =>
      val (_, w) = testWrite(file, charset)
      println(s"Charset=$charset, write=$w ms")

      val (_, r) = testRead(file, charset)
      println(s"Charset=$charset, read=$r ms")
    }
  }

  val utf8 = Charset.forName("UTF-8")
  test(charset = utf8)
  println("-------------")
  test(charset = UnicodeCharset(utf8))
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package better.files

import java.io.{BufferedReader, StringReader}

object ScannerBenchmark extends App {
object ScannerBenchmark extends Benchmark {
val file = File.newTemporaryFile()
val n = 1000
repeat(n) {
Expand Down Expand Up @@ -35,11 +35,6 @@ object ScannerBenchmark extends App {
(line, ints, words)
}

def profile[A](f: => A): (A, Long) = {
val t = System.nanoTime()
(f, ((System.nanoTime() - t) / 1e6).toLong)
}

println("Warming up ...")
scanners foreach { scannerBuilder =>
val canaryData =
Expand Down
12 changes: 9 additions & 3 deletions core/src/main/scala/better/files/File.scala
Original file line number Diff line number Diff line change
Expand Up @@ -860,10 +860,16 @@ class File private(val path: Path) {
}

object File {
implicit val defaultCharset: Charset = Charset.defaultCharset()
/**
* The default charset used by better-files
* Note: It uses java.net.charset.Charset.defaultCharset() in general but if the default supports byte-order markers,
* it uses a more compliant version than the JDK one (see: https://github.com/pathikrit/better-files/issues/107)
*/
implicit val defaultCharset: Charset =
UnicodeCharset(Charset.defaultCharset())

def charset(name: String): Charset =
Charset.forName(name)
def resource(name: String): File =
File(Thread.currentThread().getContextClassLoader.getResource(name))

def newTemporaryDirectory(prefix: String = "", parent: Option[File] = None)(implicit attributes: Attributes = Attributes.default): File = {
parent match {
Expand Down
3 changes: 3 additions & 0 deletions core/src/main/scala/better/files/Implicits.scala
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,9 @@ trait Implicits {
// Allows passing an algorithm name (e.g. "MD5") wherever a MessageDigest is expected
implicit def stringToMessageDigest(algorithmName: String): MessageDigest =
MessageDigest.getInstance(algorithmName)

// Allows passing a charset name (e.g. "UTF-8") wherever a java.nio.charset.Charset is expected
implicit def stringToCharset(charsetName: String): Charset =
Charset.forName(charsetName)

// Adapts a java.util.StringTokenizer to a Scala Iterator of its tokens
implicit def tokenizerToIterator(s: StringTokenizer): Iterator[String] =
produce(s.nextToken()).till(s.hasMoreTokens)

Expand Down
100 changes: 100 additions & 0 deletions core/src/main/scala/better/files/UnicodeCharset.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
package better.files

import java.nio.charset._
import java.nio.{BufferOverflowException, ByteBuffer, CharBuffer}

import scala.collection.JavaConverters._

/**
 * A Unicode charset that is aware of byte-order markers (BOMs).
 *
 * @param underlyingCharset     charset used for encoding, and for decoding whenever no known BOM is detected
 * @param writeByteOrderMarkers when set, the encoder emits this charset's BOM before the content
 */
class UnicodeCharset(underlyingCharset: Charset, writeByteOrderMarkers: Boolean)
  extends Charset(underlyingCharset.name(), underlyingCharset.aliases().asScala.toArray) {

  // Decoding always goes through the BOM-sniffing decoder
  override def newDecoder() = new UnicodeDecoder(underlyingCharset)

  // Encoding emits a BOM only when explicitly requested; otherwise delegate untouched
  override def newEncoder() =
    if (writeByteOrderMarkers) new BomEncoder(underlyingCharset)
    else underlyingCharset.newEncoder()

  override def contains(cs: Charset) = underlyingCharset.contains(cs)
}

/**
 * A Unicode decoder that uses the Unicode byte-order marker (BOM) to auto-detect the encoding
 * (if none detected, falls back on the defaultCharset). This also gets around a bug in the JDK
 * (http://bugs.java.com/bugdatabase/view_bug.do?bug_id=4508058) where BOM is not consumed for UTF-8.
 * See: https://github.com/pathikrit/better-files/issues/107
 *
 * @param defaultCharset Use this charset if no known byte-order marker is detected
 */
class UnicodeDecoder(defaultCharset: Charset) extends CharsetDecoder(defaultCharset, 1, 1) {
import UnicodeCharset.bomTable

// The charset inferred from the BOM (or the fallback); None until detection completes
private[this] var inferredCharset: Option[Charset] = None

// Narrows the candidate charsets one byte at a time until exactly one candidate's BOM
// is fully matched, or no candidate remains (in which case we fall back to defaultCharset).
// NOTE(review): indexing is based on in.position(), which assumes the whole BOM arrives in
// the first buffer passed to decodeLoop — confirm behaviour for chunked input.
// NOTE(review): an input such as 0xFF 0xFE followed by a non-zero byte (a valid UTF-16LE BOM
// plus text) eliminates all candidates and falls back to defaultCharset — verify intended.
@annotation.tailrec
private[this] def decode(in: ByteBuffer, out: CharBuffer, candidates: Set[Charset] = Set.empty): CoderResult = {
if (isCharsetDetected) {
// Charset already known: delegate the remaining input to that charset's own decoder
detectedCharset().newDecoder().decode(in, out, true)
} else if (candidates.isEmpty || !in.hasRemaining) {
// No BOM matched (or input exhausted): fall back and re-decode from the start of the buffer
inferredCharset = Some(defaultCharset)
in.rewind()
decode(in, out)
} else if (candidates.forall(c => bomTable(c).length == in.position())) {
// All bytes of exactly one BOM have been consumed: that charset wins (BOM bytes stay consumed)
inferredCharset = candidates.headOption.ensuring(candidates.size == 1, "Ambiguous BOMs found")
decode(in, out)
} else {
// Consume one more byte and keep only charsets whose BOM still matches the prefix read so far
val idx = in.position()
val byte = in.get()
def isPossible(charset: Charset) = bomTable(charset).lift(idx).contains(byte)
decode(in, out, candidates.filter(isPossible))
}
}

override def decodeLoop(in: ByteBuffer, out: CharBuffer) = decode(in = in, out = out, candidates = bomTable.keySet)

override def isCharsetDetected = inferredCharset.isDefined

override def isAutoDetecting = true

override def implReset() = inferredCharset = None

override def detectedCharset() = inferredCharset.getOrElse(throw new IllegalStateException("Insufficient bytes read to determine charset"))
}

/**
 * Encoder that writes this charset's byte-order marker (BOM) before the encoded content.
 *
 * @param charset the underlying Unicode charset; must have a known BOM (else IllegalArgumentException)
 */
class BomEncoder(charset: Charset) extends CharsetEncoder(charset, 1, 1) {
  private[this] val bom = UnicodeCharset.bomTable.getOrElse(charset, throw new IllegalArgumentException(s"$charset does not support BOMs")).toArray
  // Becomes true only once the BOM bytes have actually been emitted
  private[this] var isBomWritten = false

  override def encodeLoop(in: CharBuffer, out: ByteBuffer): CoderResult = {
    if (!isBomWritten && out.remaining() < bom.length) {
      // Not enough room for the BOM yet: report overflow WITHOUT flipping the flag.
      // (The previous version set isBomWritten in a finally-block even on
      // BufferOverflowException — a bulk put that overflows writes nothing — so the
      // BOM was silently dropped when encodeLoop was retried with a fresh buffer.)
      CoderResult.OVERFLOW
    } else {
      if (!isBomWritten) {
        out.put(bom)
        isBomWritten = true
      }
      // NOTE(review): a fresh encoder is created per call (as in the original), which
      // resets encoder state between chunks — confirm acceptable for split surrogate pairs
      charset.newEncoder().encode(in, out, true)
    }
  }

  override def implReset() = isBomWritten = false
}

object UnicodeCharset {
  /** Known Unicode byte-order markers, keyed by charset; restricted to charsets this JVM supports. */
  private[files] val bomTable: Map[Charset, IndexedSeq[Byte]] = {
    val boms = Seq(
      "UTF-8"    -> IndexedSeq(0xEF, 0xBB, 0xBF),
      "UTF-16BE" -> IndexedSeq(0xFE, 0xFF),
      "UTF-16LE" -> IndexedSeq(0xFF, 0xFE),
      "UTF-32BE" -> IndexedSeq(0x00, 0x00, 0xFE, 0xFF),
      "UTF-32LE" -> IndexedSeq(0xFF, 0xFE, 0x00, 0x00)
    )
    val table = for {
      (name, bytes) <- boms
      if Charset.isSupported(name)
    } yield Charset.forName(name) -> bytes.map(_.toByte)
    table.toMap.ensuring(_.nonEmpty, "No unicode charset detected")
  }

  /**
   * Wraps the given charset in a BOM-aware [[UnicodeCharset]] when it has a known BOM;
   * returns the charset unchanged otherwise.
   */
  def apply(charset: Charset, writeByteOrderMarkers: Boolean = false): Charset =
    if (bomTable.contains(charset)) new UnicodeCharset(charset, writeByteOrderMarkers)
    else charset
}
46 changes: 28 additions & 18 deletions core/src/test/scala/better/files/FileSpec.scala
Original file line number Diff line number Diff line change
Expand Up @@ -158,16 +158,26 @@ class FileSpec extends FlatSpec with BeforeAndAfterEach with Matchers {
.lines.toSeq should contain theSameElementsInOrderAs Seq("", "My name is", "Inigo Montoya", "x", "1")
}

it should "handle BOM" in {
val lines = Seq("Line 1", "Line 2")
val expectedContent = lines.mkString(start = "", sep = "\n", end = "\n")
File.usingTemporaryFile() {file =>
// Write with BOMs: reading back via the plain JDK UTF-8 charset leaves the BOM in the
// content (hence the mismatch), while the default BOM-aware charset strips it
file.appendLines(lines: _*)(charset = UnicodeCharset("UTF-8", writeByteOrderMarkers = true))
file.contentAsString(charset = "UTF-8") should not equal expectedContent
file.contentAsString shouldEqual expectedContent
}
}

it should "glob" in {
a1.glob("**/*.txt").map(_.name).toSeq.sorted shouldEqual Seq("t1.txt", "t2.txt")
//a1.glob("*.txt").map(_.name).toSeq shouldEqual Seq("t1.txt", "t2.txt")
testRoot.glob("**/*.txt").map(_.name).toSeq.sorted shouldEqual Seq("t1.txt", "t2.txt")
val path = testRoot.path.toString.ensuring(testRoot.path.isAbsolute)
File(path).glob("**/*.{txt}").map(_.name).toSeq.sorted shouldEqual Seq("t1.txt", "t2.txt")
("benchmarks"/"src").glob("**/*.{scala,java}").map(_.name).toSeq.sorted shouldEqual Seq("ArrayBufferScanner.java", "ScannerBenchmark.scala", "Scanners.scala")
("benchmarks"/"src").glob("**/*.{scala}").map(_.name).toSeq.sorted shouldEqual Seq("ScannerBenchmark.scala", "Scanners.scala")
("benchmarks"/"src").glob("**/*.scala").map(_.name).toSeq.sorted shouldEqual Seq("ScannerBenchmark.scala", "Scanners.scala")
("benchmarks"/"src").listRecursively.filter(_.extension.contains(".scala")).map(_.name).toSeq.sorted shouldEqual Seq("ScannerBenchmark.scala", "Scanners.scala")
("benchmarks"/"src").glob("**/*.{scala,java}").map(_.name).toSeq.sorted shouldEqual Seq("ArrayBufferScanner.java", "Benchmark.scala", "EncodingBenchmark.scala", "ScannerBenchmark.scala", "Scanners.scala")
("benchmarks"/"src").glob("**/*.{scala}").map(_.name).toSeq.sorted shouldEqual Seq( "Benchmark.scala", "EncodingBenchmark.scala", "ScannerBenchmark.scala", "Scanners.scala")
("benchmarks"/"src").glob("**/*.scala").map(_.name).toSeq.sorted shouldEqual Seq("Benchmark.scala", "EncodingBenchmark.scala", "ScannerBenchmark.scala", "Scanners.scala")
("benchmarks"/"src").listRecursively.filter(_.extension.contains(".scala")).map(_.name).toSeq.sorted shouldEqual Seq( "Benchmark.scala", "EncodingBenchmark.scala", "ScannerBenchmark.scala", "Scanners.scala")
ls("core"/"src"/"test") should have length 1
("core"/"src"/"test").walk(maxDepth = 1) should have length 2
("core"/"src"/"test").walk(maxDepth = 0) should have length 1
Expand Down Expand Up @@ -222,8 +232,8 @@ class FileSpec extends FlatSpec with BeforeAndAfterEach with Matchers {
it should "support sorting" in {
testRoot.list.toSeq.sorted(File.Order.byName) should not be empty
testRoot.list.toSeq.max(File.Order.bySize).isEmpty shouldBe false
List(fa, fb).contains(testRoot.list.toSeq.min(File.Order.byDepth)) shouldBe true
Thread.sleep(1000)
Seq(fa, fb).contains(testRoot.list.toSeq.min(File.Order.byDepth)) shouldBe true
sleep()
t2.appendLine("modified!")
a1.list.toSeq.min(File.Order.byModificationTime) shouldBe t1
testRoot.list.toSeq.sorted(File.Order.byDirectoriesFirst) should not be empty
Expand Down Expand Up @@ -328,11 +338,11 @@ class FileSpec extends FlatSpec with BeforeAndAfterEach with Matchers {
(a1 / "t3.scala.txt").contentAsString shouldEqual magicWord
}

it should "support custom codec" in {
it should "support custom charset" in {
import java.nio.charset.Charset
t1.writeText("你好世界")(charset = File.charset("UTF8"))
t1.contentAsString(File.charset("ISO-8859-1")) should not equal "你好世界"
t1.contentAsString(File.charset("UTF8")) shouldEqual "你好世界"
t1.writeText("你好世界")(charset = "UTF8")
t1.contentAsString(charset = "ISO-8859-1") should not equal "你好世界"
t1.contentAsString(charset = "UTF8") shouldEqual "你好世界"
val c1 = md5(t1)
val c2 = t1.overwrite("你好世界")(File.OpenOptions.default, Charset.forName("ISO-8859-1")).md5
c1 should not equal c2
Expand All @@ -342,19 +352,19 @@ class FileSpec extends FlatSpec with BeforeAndAfterEach with Matchers {
it should "support hashing algos" in {
implicit val charset = java.nio.charset.StandardCharsets.UTF_8
t1.writeText("")
assert(md5(t1) == "D41D8CD98F00B204E9800998ECF8427E")
assert(sha1(t1) == "DA39A3EE5E6B4B0D3255BFEF95601890AFD80709")
assert(sha256(t1) == "E3B0C44298FC1C149AFBF4C8996FB92427AE41E4649B934CA495991B7852B855")
assert(sha512(t1) == "CF83E1357EEFB8BDF1542850D66D8007D620E4050B5715DC83F4A921D36CE9CE47D0D13C5D85F2B0FF8318D2877EEC2F63B931BD47417A81A538327AF927DA3E")
md5(t1) shouldEqual "D41D8CD98F00B204E9800998ECF8427E"
sha1(t1) shouldEqual "DA39A3EE5E6B4B0D3255BFEF95601890AFD80709"
sha256(t1) shouldEqual "E3B0C44298FC1C149AFBF4C8996FB92427AE41E4649B934CA495991B7852B855"
sha512(t1) shouldEqual "CF83E1357EEFB8BDF1542850D66D8007D620E4050B5715DC83F4A921D36CE9CE47D0D13C5D85F2B0FF8318D2877EEC2F63B931BD47417A81A538327AF927DA3E"
}

it should "compute correct checksum for non-zero length string" in {
implicit val charset = java.nio.charset.StandardCharsets.UTF_8
t1.writeText("test")
assert(md5(t1) == "098F6BCD4621D373CADE4E832627B4F6")
assert(sha1(t1) == "A94A8FE5CCB19BA61C4C0873D391E987982FBBD3")
assert(sha256(t1) == "9F86D081884C7D659A2FEAA0C55AD015A3BF4F1B2B0B822CD15D6C15B0F00A08")
assert(sha512(t1) == "EE26B0DD4AF7E749AA1A8EE3C10AE9923F618980772E473F8819A5D4940E0DB27AC185F8A0E1D5F84F88BC887FD67B143732C304CC5FA9AD8E6F57F50028A8FF")
md5(t1) shouldEqual "098F6BCD4621D373CADE4E832627B4F6"
sha1(t1) shouldEqual "A94A8FE5CCB19BA61C4C0873D391E987982FBBD3"
sha256(t1) shouldEqual "9F86D081884C7D659A2FEAA0C55AD015A3BF4F1B2B0B822CD15D6C15B0F00A08"
sha512(t1) shouldEqual "EE26B0DD4AF7E749AA1A8EE3C10AE9923F618980772E473F8819A5D4940E0DB27AC185F8A0E1D5F84F88BC887FD67B143732C304CC5FA9AD8E6F57F50028A8FF"
}

it should "copy" in {
Expand Down

0 comments on commit e00a817

Please sign in to comment.