-
Notifications
You must be signed in to change notification settings - Fork 145
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #108 from /issues/107
Fix #107: Support encoding/decoding of Unicode byte-order markers
- Loading branch information
Showing
9 changed files
with
207 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
package better.files | ||
|
||
/**
 * Base class for the benchmarks in this module. It extends `App`, so the body of a
 * concrete subclass is what runs when that benchmark is launched.
 */
abstract class Benchmark extends App {

  /**
   * Evaluates `f` exactly once and measures how long that took.
   *
   * @param f the by-name computation to time
   * @return the value produced by `f` together with the elapsed time in whole milliseconds,
   *         derived from two `System.nanoTime()` readings
   */
  def profile[A](f: => A): (A, Long) = {
    val startNanos = System.nanoTime()
    val result = f
    val elapsedMillis = ((System.nanoTime() - startNanos) / 1e6).toLong
    (result, elapsedMillis)
  }
}
37 changes: 37 additions & 0 deletions
37
benchmarks/src/test/scala/better/files/EncodingBenchmark.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
package better.files | ||
|
||
import java.nio.charset.Charset | ||
|
||
import scala.util.Random | ||
|
||
/**
 * Ad-hoc benchmark that times writing to and reading from a temporary file, first with plain
 * UTF-8 and then with the same charset wrapped by `UnicodeCharset`. The measurements are
 * printed to stdout when this object runs as an `App`.
 */
object EncodingBenchmark extends Benchmark {

  /**
   * Times writing 1000 chunks of 10000 random characters (each followed by a newline)
   * to `file` through a buffered writer created with `charset`.
   */
  def testWrite(file: File, charset: Charset) = profile {
    for {
      writer <- file.bufferedWriter(charset)
      content <- Iterator.continually(Random.nextString(10000)).take(1000)
    } writer.write(content + "\n")
  }

  /**
   * Times opening `file` through a buffered reader created with `charset` and obtaining its lines.
   *
   * The reader is opened with the charset under test, mirroring `testWrite`; previously the
   * `charset` argument was ignored here.
   */
  def testRead(file: File, charset: Charset) = profile {
    for {
      reader <- file.bufferedReader(charset)
      // NOTE(review): `line` may be bound to the whole lazily evaluated stream rather than to
      // each line, in which case nothing forces the content to be read - confirm and consume it.
      line <- reader.lines().autoClosed
    } line
  }

  /** Runs the write benchmark and then the read benchmark on a fresh temporary file, printing both timings. */
  def test(charset: Charset) = {
    File.usingTemporaryFile() {file =>
      val (_, w) = testWrite(file, charset)
      println(s"Charset=$charset, write=$w ms")

      val (_, r) = testRead(file, charset)
      println(s"Charset=$charset, read=$r ms")
    }
  }

  val utf8 = Charset.forName("UTF-8")
  test(charset = utf8)
  println("-------------")
  test(charset = UnicodeCharset(utf8))
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
package better.files | ||
|
||
import java.nio.charset._ | ||
import java.nio.{BufferOverflowException, ByteBuffer, CharBuffer} | ||
|
||
import scala.collection.JavaConverters._ | ||
|
||
/**
 * A charset wrapper that is aware of Unicode byte-order markers (BOMs). It is registered
 * under the same canonical name and aliases as the charset it wraps.
 *
 * @param underlyingCharset Fallback handed to the decoder when no known byte-order marker is detected;
 *                          also the charset used for all encoding
 * @param writeByteOrderMarkers If set, encoders created by this charset write a BOM while encoding
 */
class UnicodeCharset(underlyingCharset: Charset, writeByteOrderMarkers: Boolean)
  extends Charset(underlyingCharset.name(), underlyingCharset.aliases().asScala.toArray) {

  /** Decoders always sniff for a BOM and fall back on the underlying charset. */
  override def newDecoder(): UnicodeDecoder = new UnicodeDecoder(underlyingCharset)

  /** A BOM-writing encoder when requested, otherwise the underlying charset's own encoder. */
  override def newEncoder(): CharsetEncoder =
    if (!writeByteOrderMarkers) underlyingCharset.newEncoder()
    else new BomEncoder(underlyingCharset)

  /** Containment is delegated to the underlying charset. */
  override def contains(cs: Charset): Boolean = underlyingCharset.contains(cs)
}
|
||
/**
 * A Unicode decoder that uses the Unicode byte-order marker (BOM) to auto-detect the encoding
 * (if none is detected, it falls back on the defaultCharset). This also gets around a bug in the JDK
 * (http://bugs.java.com/bugdatabase/view_bug.do?bug_id=4508058) where BOM is not consumed for UTF-8.
 * See: https://github.com/pathikrit/better-files/issues/107
 *
 * Detection runs once, on the first non-empty input buffer, and only looks at the bytes available
 * in that buffer. The longest BOM that prefixes the input wins, so `FF FE 00 00` is read as
 * UTF-32LE while `FF FE` followed by anything else is read as UTF-16LE. The matched BOM is
 * consumed; when nothing matches, no byte is consumed and the defaultCharset is used.
 *
 * After detection a single delegate decoder is kept until the next reset, and it is never told
 * that the current buffer is the last one. A multi-byte sequence split across two input buffers
 * is therefore left unconsumed for the next call instead of being reported as malformed.
 *
 * @param defaultCharset Use this charset if no known byte-order marker is detected
 */
class UnicodeDecoder(defaultCharset: Charset) extends CharsetDecoder(defaultCharset, 1, 1) {
  import UnicodeCharset.bomTable

  private[this] var inferredCharset: Option[Charset] = None

  // Decoder of the inferred charset; kept across decodeLoop calls so that its state survives buffer refills
  private[this] var delegate: Option[CharsetDecoder] = None

  /**
   * Picks the charset for the bytes at the current position of `in`, consuming the BOM if one is found.
   * Works relative to the current position, so it does not assume the buffer starts at index 0.
   */
  private[this] def detect(in: ByteBuffer): Charset = {
    val start = in.position()
    val available = in.remaining()

    // Absolute reads: the buffer position is only moved once a BOM has been accepted
    def startsWith(bom: IndexedSeq[Byte]): Boolean =
      bom.length <= available && bom.indices.forall(i => in.get(start + i) == bom(i))

    val matching = bomTable.filter { case (_, bom) => startsWith(bom) }
    if (matching.isEmpty) {
      defaultCharset
    } else {
      // The UTF-16LE BOM is a prefix of the UTF-32LE BOM, hence prefer the longest match
      val (charset, bom) = matching.maxBy { case (_, bytes) => bytes.length }
      in.position(start + bom.length)
      charset
    }
  }

  override def decodeLoop(in: ByteBuffer, out: CharBuffer): CoderResult =
    delegate match {
      case Some(decoder) =>
        decoder.decode(in, out, false)
      case None if !in.hasRemaining =>
        // Nothing to inspect yet; postpone detection until some bytes arrive
        CoderResult.UNDERFLOW
      case None =>
        val charset = detect(in)
        val decoder = charset.newDecoder()
        inferredCharset = Some(charset)
        delegate = Some(decoder)
        decoder.decode(in, out, false)
    }

  /** Tells the delegate (if any) that the input has ended and lets it flush its remaining output. */
  override def implFlush(out: CharBuffer): CoderResult =
    delegate match {
      case Some(decoder) =>
        val finished = decoder.decode(ByteBuffer.allocate(0), out, true)
        if (finished.isUnderflow) decoder.flush(out) else finished
      case None =>
        CoderResult.UNDERFLOW
    }

  override def isCharsetDetected: Boolean = inferredCharset.isDefined

  override def isAutoDetecting: Boolean = true

  /** Forgets the inferred charset and its delegate so that the next input is sniffed again. */
  override def implReset(): Unit = {
    inferredCharset = None
    delegate = None
  }

  override def detectedCharset(): Charset = inferredCharset.getOrElse(throw new IllegalStateException("Insufficient bytes read to determine charset"))
}
|
||
/**
 * Encoder that writes the BOM for this charset ahead of the encoded text.
 *
 * The actual encoding is done by one encoder of `charset` that is kept for the lifetime of this
 * object (and reset together with it), so a surrogate pair split across two input buffers is
 * carried over instead of being reported as malformed. The sizing hints and the replacement
 * bytes are taken from that encoder, because a fixed one byte per character and a `'?'`
 * replacement are not valid for every Unicode charset (e.g. the UTF-16 and UTF-32 families).
 * `maxBytesPerChar` is widened by the BOM length so that worst-case buffer sizing stays correct.
 *
 * The BOM is written as one unit: if the output buffer cannot hold it, OVERFLOW is returned and
 * the BOM is written on a later call instead of being dropped.
 *
 * @param charset  Charset to encode with; an IllegalArgumentException is thrown if it has no known BOM
 * @param delegate Encoder of `charset` that performs the encoding
 */
class BomEncoder private (charset: Charset, delegate: CharsetEncoder)
  extends CharsetEncoder(
    charset,
    delegate.averageBytesPerChar(),
    delegate.maxBytesPerChar() + UnicodeCharset.bomTable.get(charset).fold(0)(_.length),
    delegate.replacement()
  ) {

  /** Creates a BOM-writing encoder for `charset`. */
  def this(charset: Charset) = this(charset, charset.newEncoder())

  private[this] val bom = UnicodeCharset.bomTable.getOrElse(charset, throw new IllegalArgumentException(s"$charset does not support BOMs")).toArray
  private[this] var isBomWritten = false

  override def encodeLoop(in: CharBuffer, out: ByteBuffer): CoderResult =
    if (isBomWritten) {
      delegate.encode(in, out, false)
    } else if (out.remaining() < bom.length) {
      // Ask for a bigger buffer; the BOM stays pending because the flag is left untouched
      CoderResult.OVERFLOW
    } else {
      out.put(bom)
      isBomWritten = true
      delegate.encode(in, out, false)
    }

  /** Tells the delegate that the input has ended and lets it flush its remaining output. */
  override def implFlush(out: ByteBuffer): CoderResult = {
    val finished = delegate.encode(CharBuffer.allocate(0), out, true)
    if (finished.isUnderflow) delegate.flush(out) else finished
  }

  /** Resets the delegate and schedules the BOM to be written again. */
  override def implReset(): Unit = {
    delegate.reset()
    isBomWritten = false
  }
}
|
||
object UnicodeCharset {

  /**
   * Byte-order markers keyed by charset. Charsets that the running JVM does not support are
   * left out; the table is required to end up non-empty.
   */
  private[files] val bomTable: Map[Charset, IndexedSeq[Byte]] = {
    val bomsByName = Seq(
      "UTF-8" -> IndexedSeq(0xEF, 0xBB, 0xBF),
      "UTF-16BE" -> IndexedSeq(0xFE, 0xFF),
      "UTF-16LE" -> IndexedSeq(0xFF, 0xFE),
      "UTF-32BE" -> IndexedSeq(0x00, 0x00, 0xFE, 0xFF),
      "UTF-32LE" -> IndexedSeq(0xFF, 0xFE, 0x00, 0x00)
    )
    val supported = for {
      (name, bom) <- bomsByName
      if Charset.isSupported(name)
    } yield Charset.forName(name) -> bom.map(_.toByte)
    supported.toMap.ensuring(_.nonEmpty, "No unicode charset detected")
  }

  /**
   * Wraps `charset` in a BOM-aware [[UnicodeCharset]] when it has an entry in the BOM table;
   * any other charset is returned unchanged.
   *
   * @param charset the charset to wrap
   * @param writeByteOrderMarkers whether encoders of the wrapped charset should write a BOM
   */
  def apply(charset: Charset, writeByteOrderMarkers: Boolean = false): Charset =
    bomTable.get(charset).fold[Charset](charset)(_ => new UnicodeCharset(charset, writeByteOrderMarkers))
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters