Skip to content

Commit

Permalink
Merge pull request #108 from /issues/107
Browse files Browse the repository at this point in the history
Fix #107: Support encoding/decoding of Unicode byte-order markers
  • Loading branch information
pathikrit authored Feb 14, 2017
2 parents 555984a + 376bd65 commit e00a817
Show file tree
Hide file tree
Showing 9 changed files with 207 additions and 33 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
## v3.0.0

* [Issue #107](https://github.com/pathikrit/better-files/issues/107): Handle Byte-order markers
* [PR #113](https://github.com/pathikrit/better-files/pull/113): File anchor util
* [Issue #105](https://github.com/pathikrit/better-files/issues/105): Remove dependency on scala.io
* [File.usingTemp](https://github.com/pathikrit/better-files/commit/d3522e8da63b55c7d3fa14cc9b0b76acd57c60ca)
Expand Down
26 changes: 20 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
## Tutorial [![Scaladoc][scaladocImg]][scaladocLink]
0. [Instantiation](#instantiation)
0. [Simple I/O](#file-readwrite)
0. [Streams and encodings](#streams-and-encodings)
0. [Streams](#streams)
0. [Encodings](#encodings)
0. [Java compatibility](#java-interoperability)
0. [Globbing](#globbing)
0. [File system operations](#file-system-operations)
Expand Down Expand Up @@ -166,7 +167,7 @@ val bytes: Array[Byte] = file.loadBytes
.lines
```

### Streams and encodings
### Streams
Various ways to slurp a file without loading the contents into memory:
```scala
val bytes : Iterator[Byte] = file.bytes
Expand All @@ -182,17 +183,30 @@ file.writeBytes(bytes)
file.printLines(lines)
```

You can supply your own encoding too for anything that does a read/write (it assumes `java.nio.charset.Charset.defaultCharset()` if you don't provide one):
### Encodings
You can supply your own charset too for anything that does a read/write (it assumes `java.nio.charset.Charset.defaultCharset()` if you don't provide one):
```scala
val content: String = file.contentAsString // default charset

// custom charset:
import java.nio.charset.Charset
file.content(charset = Charset.forName("US-ASCII"))
file.contentAsString(charset = Charset.forName("US-ASCII"))

//or simply using File.charset util
file.write("hello world")(charset = File.charset("US-ASCII"))
//or simply using implicit conversion from Strings
file.write("hello world")(charset = "US-ASCII")
```

Note: By default, `better-files` [correctly handles BOMs while decoding](core/src/main/scala/better/files/UnicodeCharset.scala).
If you wish to have the [incorrect JDK behaviour](http://bugs.java.com/bugdatabase/view_bug.do?bug_id=4508058),
you would need to supply Java's UTF-8 charset e.g.:
```scala
file.contentAsString(charset = Charset.forName("UTF-8")) // Default incorrect JDK behaviour for UTF-8 (see: JDK-4508058)
```

If you also wish to write BOMs while encoding, you would need to supply it as:
```scala
file.write("hello world")(charset = UnicodeCharset("UTF-8", writeByteOrderMarkers = true))
```

### Java interoperability
You can always access the Java I/O classes:
Expand Down
8 changes: 8 additions & 0 deletions benchmarks/src/test/scala/better/files/Benchmark.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package better.files

/** Base class for micro-benchmarks; provides simple wall-clock profiling. */
abstract class Benchmark extends App {

  /** Evaluates `f` once and returns its result paired with the elapsed time in whole milliseconds. */
  def profile[A](f: => A): (A, Long) = {
    val start = System.nanoTime()
    val result = f
    val elapsedMillis = ((System.nanoTime() - start) / 1e6).toLong
    (result, elapsedMillis)
  }
}
37 changes: 37 additions & 0 deletions benchmarks/src/test/scala/better/files/EncodingBenchmark.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package better.files

import java.nio.charset.Charset

import scala.util.Random

object EncodingBenchmark extends Benchmark {

  /** Writes 1000 random 10000-char lines to `file` with `charset`; returns (unit, elapsed ms). */
  def testWrite(file: File, charset: Charset) = profile {
    for {
      writer <- file.bufferedWriter(charset)
      content <- Iterator.continually(Random.nextString(10000)).take(1000)
    } writer.write(content + "\n")
  }

  /** Reads `file` line by line with `charset`; returns (unit, elapsed ms). */
  def testRead(file: File, charset: Charset) = profile {
    for {
      // Fix: the charset parameter was previously ignored (bufferedReader used the default),
      // so the read benchmark never actually measured the charset under test
      reader <- file.bufferedReader(charset)
      line <- reader.lines().autoClosed
    } line
  }

  /** Runs the write then read benchmark against a temporary file and prints the timings. */
  def test(charset: Charset) = {
    File.usingTemporaryFile() { file =>
      val (_, w) = testWrite(file, charset)
      println(s"Charset=$charset, write=$w ms")

      val (_, r) = testRead(file, charset)
      println(s"Charset=$charset, read=$r ms")
    }
  }

  val utf8 = Charset.forName("UTF-8")
  test(charset = utf8)
  println("-------------")
  test(charset = UnicodeCharset(utf8))
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package better.files

import java.io.{BufferedReader, StringReader}

object ScannerBenchmark extends App {
object ScannerBenchmark extends Benchmark {
val file = File.newTemporaryFile()
val n = 1000
repeat(n) {
Expand Down Expand Up @@ -35,11 +35,6 @@ object ScannerBenchmark extends App {
(line, ints, words)
}

def profile[A](f: => A): (A, Long) = {
val t = System.nanoTime()
(f, ((System.nanoTime() - t) / 1e6).toLong)
}

println("Warming up ...")
scanners foreach { scannerBuilder =>
val canaryData =
Expand Down
12 changes: 9 additions & 3 deletions core/src/main/scala/better/files/File.scala
Original file line number Diff line number Diff line change
Expand Up @@ -860,10 +860,16 @@ class File private(val path: Path) {
}

object File {
implicit val defaultCharset: Charset = Charset.defaultCharset()
/**
* The default charset used by better-files
* Note: It uses java.net.charset.Charset.defaultCharset() in general but if the default supports byte-order markers,
* it uses a more compliant version than the JDK one (see: https://github.com/pathikrit/better-files/issues/107)
*/
implicit val defaultCharset: Charset =
UnicodeCharset(Charset.defaultCharset())

def charset(name: String): Charset =
Charset.forName(name)
def resource(name: String): File =
File(Thread.currentThread().getContextClassLoader.getResource(name))

def newTemporaryDirectory(prefix: String = "", parent: Option[File] = None)(implicit attributes: Attributes = Attributes.default): File = {
parent match {
Expand Down
3 changes: 3 additions & 0 deletions core/src/main/scala/better/files/Implicits.scala
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,9 @@ trait Implicits {
// Allows passing an algorithm name (e.g. "MD5") wherever a MessageDigest is expected
implicit def stringToMessageDigest(algorithmName: String): MessageDigest =
MessageDigest.getInstance(algorithmName)

// Allows passing a charset name (e.g. "UTF-8") wherever a java.nio.charset.Charset is expected
implicit def stringToCharset(charsetName: String): Charset =
Charset.forName(charsetName)

// Adapts a java.util.StringTokenizer to a Scala Iterator of its tokens
implicit def tokenizerToIterator(s: StringTokenizer): Iterator[String] =
produce(s.nextToken()).till(s.hasMoreTokens)

Expand Down
100 changes: 100 additions & 0 deletions core/src/main/scala/better/files/UnicodeCharset.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
package better.files

import java.nio.charset._
import java.nio.{BufferOverflowException, ByteBuffer, CharBuffer}

import scala.collection.JavaConverters._

/**
 * A Unicode charset that is aware of byte-order markers (BOMs).
 *
 * @param underlyingCharset     charset used for encoding, and for decoding whenever no known BOM is detected
 * @param writeByteOrderMarkers when set, the encoder emits this charset's BOM before the content
 */
class UnicodeCharset(underlyingCharset: Charset, writeByteOrderMarkers: Boolean)
  extends Charset(underlyingCharset.name(), underlyingCharset.aliases().asScala.toArray) {

  // Decoding always goes through the BOM-sniffing decoder
  override def newDecoder() = new UnicodeDecoder(underlyingCharset)

  // Encoding emits a BOM only when explicitly requested; otherwise delegate untouched
  override def newEncoder() =
    if (writeByteOrderMarkers) new BomEncoder(underlyingCharset)
    else underlyingCharset.newEncoder()

  override def contains(cs: Charset) = underlyingCharset.contains(cs)
}

/**
 * A Unicode decoder that uses the Unicode byte-order marker (BOM) to auto-detect the encoding
 * (if none detected, falls back on the defaultCharset). This also gets around a bug in the JDK
 * (http://bugs.java.com/bugdatabase/view_bug.do?bug_id=4508058) where BOM is not consumed for UTF-8.
 * See: https://github.com/pathikrit/better-files/issues/107
 *
 * @param defaultCharset Use this charset if no known byte-order marker is detected
 */
class UnicodeDecoder(defaultCharset: Charset) extends CharsetDecoder(defaultCharset, 1, 1) {
import UnicodeCharset.bomTable

// The charset inferred from the BOM (or the fallback); None until detection completes
private[this] var inferredCharset: Option[Charset] = None

// Narrows the candidate charsets one byte at a time until exactly one candidate's BOM
// is fully matched, or no candidate remains (in which case we fall back to defaultCharset).
// NOTE(review): indexing is based on in.position(), which assumes the whole BOM arrives in
// the first buffer passed to decodeLoop — confirm behaviour for chunked input.
// NOTE(review): an input such as 0xFF 0xFE followed by a non-zero byte (a valid UTF-16LE BOM
// plus text) eliminates all candidates and falls back to defaultCharset — verify intended.
@annotation.tailrec
private[this] def decode(in: ByteBuffer, out: CharBuffer, candidates: Set[Charset] = Set.empty): CoderResult = {
if (isCharsetDetected) {
// Charset already known: delegate the remaining input to that charset's own decoder
detectedCharset().newDecoder().decode(in, out, true)
} else if (candidates.isEmpty || !in.hasRemaining) {
// No BOM matched (or input exhausted): fall back and re-decode from the start of the buffer
inferredCharset = Some(defaultCharset)
in.rewind()
decode(in, out)
} else if (candidates.forall(c => bomTable(c).length == in.position())) {
// All bytes of exactly one BOM have been consumed: that charset wins (BOM bytes stay consumed)
inferredCharset = candidates.headOption.ensuring(candidates.size == 1, "Ambiguous BOMs found")
decode(in, out)
} else {
// Consume one more byte and keep only charsets whose BOM still matches the prefix read so far
val idx = in.position()
val byte = in.get()
def isPossible(charset: Charset) = bomTable(charset).lift(idx).contains(byte)
decode(in, out, candidates.filter(isPossible))
}
}

override def decodeLoop(in: ByteBuffer, out: CharBuffer) = decode(in = in, out = out, candidates = bomTable.keySet)

override def isCharsetDetected = inferredCharset.isDefined

override def isAutoDetecting = true

override def implReset() = inferredCharset = None

override def detectedCharset() = inferredCharset.getOrElse(throw new IllegalStateException("Insufficient bytes read to determine charset"))
}

/**
 * Encoder that writes this charset's byte-order marker (BOM) before the encoded content.
 *
 * @param charset the underlying Unicode charset; must have a known BOM (else IllegalArgumentException)
 */
class BomEncoder(charset: Charset) extends CharsetEncoder(charset, 1, 1) {
  private[this] val bom = UnicodeCharset.bomTable.getOrElse(charset, throw new IllegalArgumentException(s"$charset does not support BOMs")).toArray
  // Becomes true only once the BOM bytes have actually been emitted
  private[this] var isBomWritten = false

  override def encodeLoop(in: CharBuffer, out: ByteBuffer): CoderResult = {
    if (!isBomWritten && out.remaining() < bom.length) {
      // Not enough room for the BOM yet: report overflow WITHOUT flipping the flag.
      // (The previous version set isBomWritten in a finally-block even on
      // BufferOverflowException — a bulk put that overflows writes nothing — so the
      // BOM was silently dropped when encodeLoop was retried with a fresh buffer.)
      CoderResult.OVERFLOW
    } else {
      if (!isBomWritten) {
        out.put(bom)
        isBomWritten = true
      }
      // NOTE(review): a fresh encoder is created per call (as in the original), which
      // resets encoder state between chunks — confirm acceptable for split surrogate pairs
      charset.newEncoder().encode(in, out, true)
    }
  }

  override def implReset() = isBomWritten = false
}

object UnicodeCharset {
  /** Known Unicode byte-order markers, keyed by charset; restricted to charsets this JVM supports. */
  private[files] val bomTable: Map[Charset, IndexedSeq[Byte]] = {
    val boms = Seq(
      "UTF-8"    -> IndexedSeq(0xEF, 0xBB, 0xBF),
      "UTF-16BE" -> IndexedSeq(0xFE, 0xFF),
      "UTF-16LE" -> IndexedSeq(0xFF, 0xFE),
      "UTF-32BE" -> IndexedSeq(0x00, 0x00, 0xFE, 0xFF),
      "UTF-32LE" -> IndexedSeq(0xFF, 0xFE, 0x00, 0x00)
    )
    val table = for {
      (name, bytes) <- boms
      if Charset.isSupported(name)
    } yield Charset.forName(name) -> bytes.map(_.toByte)
    table.toMap.ensuring(_.nonEmpty, "No unicode charset detected")
  }

  /**
   * Wraps the given charset in a BOM-aware [[UnicodeCharset]] when it has a known BOM;
   * returns the charset unchanged otherwise.
   */
  def apply(charset: Charset, writeByteOrderMarkers: Boolean = false): Charset =
    if (bomTable.contains(charset)) new UnicodeCharset(charset, writeByteOrderMarkers)
    else charset
}
46 changes: 28 additions & 18 deletions core/src/test/scala/better/files/FileSpec.scala
Original file line number Diff line number Diff line change
Expand Up @@ -158,16 +158,26 @@ class FileSpec extends FlatSpec with BeforeAndAfterEach with Matchers {
.lines.toSeq should contain theSameElementsInOrderAs Seq("", "My name is", "Inigo Montoya", "x", "1")
}

it should "handle BOM" in {
val lines = Seq("Line 1", "Line 2")
val expectedContent = lines.mkString(start = "", sep = "\n", end = "\n")
File.usingTemporaryFile() {file =>
// Write with BOMs: reading back via the plain JDK UTF-8 charset leaves the BOM in the
// content (hence the mismatch), while the default BOM-aware charset strips it
file.appendLines(lines: _*)(charset = UnicodeCharset("UTF-8", writeByteOrderMarkers = true))
file.contentAsString(charset = "UTF-8") should not equal expectedContent
file.contentAsString shouldEqual expectedContent
}
}

it should "glob" in {
a1.glob("**/*.txt").map(_.name).toSeq.sorted shouldEqual Seq("t1.txt", "t2.txt")
//a1.glob("*.txt").map(_.name).toSeq shouldEqual Seq("t1.txt", "t2.txt")
testRoot.glob("**/*.txt").map(_.name).toSeq.sorted shouldEqual Seq("t1.txt", "t2.txt")
val path = testRoot.path.toString.ensuring(testRoot.path.isAbsolute)
File(path).glob("**/*.{txt}").map(_.name).toSeq.sorted shouldEqual Seq("t1.txt", "t2.txt")
("benchmarks"/"src").glob("**/*.{scala,java}").map(_.name).toSeq.sorted shouldEqual Seq("ArrayBufferScanner.java", "ScannerBenchmark.scala", "Scanners.scala")
("benchmarks"/"src").glob("**/*.{scala}").map(_.name).toSeq.sorted shouldEqual Seq("ScannerBenchmark.scala", "Scanners.scala")
("benchmarks"/"src").glob("**/*.scala").map(_.name).toSeq.sorted shouldEqual Seq("ScannerBenchmark.scala", "Scanners.scala")
("benchmarks"/"src").listRecursively.filter(_.extension.contains(".scala")).map(_.name).toSeq.sorted shouldEqual Seq("ScannerBenchmark.scala", "Scanners.scala")
("benchmarks"/"src").glob("**/*.{scala,java}").map(_.name).toSeq.sorted shouldEqual Seq("ArrayBufferScanner.java", "Benchmark.scala", "EncodingBenchmark.scala", "ScannerBenchmark.scala", "Scanners.scala")
("benchmarks"/"src").glob("**/*.{scala}").map(_.name).toSeq.sorted shouldEqual Seq( "Benchmark.scala", "EncodingBenchmark.scala", "ScannerBenchmark.scala", "Scanners.scala")
("benchmarks"/"src").glob("**/*.scala").map(_.name).toSeq.sorted shouldEqual Seq("Benchmark.scala", "EncodingBenchmark.scala", "ScannerBenchmark.scala", "Scanners.scala")
("benchmarks"/"src").listRecursively.filter(_.extension.contains(".scala")).map(_.name).toSeq.sorted shouldEqual Seq( "Benchmark.scala", "EncodingBenchmark.scala", "ScannerBenchmark.scala", "Scanners.scala")
ls("core"/"src"/"test") should have length 1
("core"/"src"/"test").walk(maxDepth = 1) should have length 2
("core"/"src"/"test").walk(maxDepth = 0) should have length 1
Expand Down Expand Up @@ -222,8 +232,8 @@ class FileSpec extends FlatSpec with BeforeAndAfterEach with Matchers {
it should "support sorting" in {
testRoot.list.toSeq.sorted(File.Order.byName) should not be empty
testRoot.list.toSeq.max(File.Order.bySize).isEmpty shouldBe false
List(fa, fb).contains(testRoot.list.toSeq.min(File.Order.byDepth)) shouldBe true
Thread.sleep(1000)
Seq(fa, fb).contains(testRoot.list.toSeq.min(File.Order.byDepth)) shouldBe true
sleep()
t2.appendLine("modified!")
a1.list.toSeq.min(File.Order.byModificationTime) shouldBe t1
testRoot.list.toSeq.sorted(File.Order.byDirectoriesFirst) should not be empty
Expand Down Expand Up @@ -328,11 +338,11 @@ class FileSpec extends FlatSpec with BeforeAndAfterEach with Matchers {
(a1 / "t3.scala.txt").contentAsString shouldEqual magicWord
}

it should "support custom codec" in {
it should "support custom charset" in {
import java.nio.charset.Charset
t1.writeText("你好世界")(charset = File.charset("UTF8"))
t1.contentAsString(File.charset("ISO-8859-1")) should not equal "你好世界"
t1.contentAsString(File.charset("UTF8")) shouldEqual "你好世界"
t1.writeText("你好世界")(charset = "UTF8")
t1.contentAsString(charset = "ISO-8859-1") should not equal "你好世界"
t1.contentAsString(charset = "UTF8") shouldEqual "你好世界"
val c1 = md5(t1)
val c2 = t1.overwrite("你好世界")(File.OpenOptions.default, Charset.forName("ISO-8859-1")).md5
c1 should not equal c2
Expand All @@ -342,19 +352,19 @@ class FileSpec extends FlatSpec with BeforeAndAfterEach with Matchers {
it should "support hashing algos" in {
implicit val charset = java.nio.charset.StandardCharsets.UTF_8
t1.writeText("")
assert(md5(t1) == "D41D8CD98F00B204E9800998ECF8427E")
assert(sha1(t1) == "DA39A3EE5E6B4B0D3255BFEF95601890AFD80709")
assert(sha256(t1) == "E3B0C44298FC1C149AFBF4C8996FB92427AE41E4649B934CA495991B7852B855")
assert(sha512(t1) == "CF83E1357EEFB8BDF1542850D66D8007D620E4050B5715DC83F4A921D36CE9CE47D0D13C5D85F2B0FF8318D2877EEC2F63B931BD47417A81A538327AF927DA3E")
md5(t1) shouldEqual "D41D8CD98F00B204E9800998ECF8427E"
sha1(t1) shouldEqual "DA39A3EE5E6B4B0D3255BFEF95601890AFD80709"
sha256(t1) shouldEqual "E3B0C44298FC1C149AFBF4C8996FB92427AE41E4649B934CA495991B7852B855"
sha512(t1) shouldEqual "CF83E1357EEFB8BDF1542850D66D8007D620E4050B5715DC83F4A921D36CE9CE47D0D13C5D85F2B0FF8318D2877EEC2F63B931BD47417A81A538327AF927DA3E"
}

it should "compute correct checksum for non-zero length string" in {
implicit val charset = java.nio.charset.StandardCharsets.UTF_8
t1.writeText("test")
assert(md5(t1) == "098F6BCD4621D373CADE4E832627B4F6")
assert(sha1(t1) == "A94A8FE5CCB19BA61C4C0873D391E987982FBBD3")
assert(sha256(t1) == "9F86D081884C7D659A2FEAA0C55AD015A3BF4F1B2B0B822CD15D6C15B0F00A08")
assert(sha512(t1) == "EE26B0DD4AF7E749AA1A8EE3C10AE9923F618980772E473F8819A5D4940E0DB27AC185F8A0E1D5F84F88BC887FD67B143732C304CC5FA9AD8E6F57F50028A8FF")
md5(t1) shouldEqual "098F6BCD4621D373CADE4E832627B4F6"
sha1(t1) shouldEqual "A94A8FE5CCB19BA61C4C0873D391E987982FBBD3"
sha256(t1) shouldEqual "9F86D081884C7D659A2FEAA0C55AD015A3BF4F1B2B0B822CD15D6C15B0F00A08"
sha512(t1) shouldEqual "EE26B0DD4AF7E749AA1A8EE3C10AE9923F618980772E473F8819A5D4940E0DB27AC185F8A0E1D5F84F88BC887FD67B143732C304CC5FA9AD8E6F57F50028A8FF"
}

it should "copy" in {
Expand Down

0 comments on commit e00a817

Please sign in to comment.