From 3f3277819613924a26a58c6e4bd33b21eb4bb984 Mon Sep 17 00:00:00 2001
From: Andreas Bolka <a@bolka.at>
Date: Tue, 18 Nov 2014 00:18:43 +0100
Subject: [PATCH] Fix encoding detection for READ/string

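Files with a BOM are now properly handled: the BOM is skipped and the
content is decoded using the encoding it indicates. Files without a BOM
are again decoded as UTF-8. Trying to READ/string a file with an
encoding for which support is not yet implemented (UCS4 LE and BE, i.e.
UTF-32) now causes an error instead of returning the undecoded bytes.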

This fixes zsx/r3#13, regressions introduced by 8336a45 and 70c7c3a, and
CureCode issue cc#2186.
---
 src/core/p-file.c    |  2 +-
 src/core/s-unicode.c | 15 +++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/core/p-file.c b/src/core/p-file.c
index 1abb65fe9b..cba02e5fa5 100644
--- a/src/core/p-file.c
+++ b/src/core/p-file.c
@@ -206,7 +206,7 @@ REBINT Mode_Syms[] = {
 	if (args & (AM_READ_STRING | AM_READ_LINES)) {
 		REBSER *nser = Decode_UTF_String(BIN_HEAD(ser), file->actual, -1);
 		if (nser == NULL) {
-			nser = ser;
+			Trap0(RE_BAD_DECODE);
 		}
 		Set_String(ds, nser);
 		if (args & AM_READ_LINES) Set_Block(ds, Split_Lines(ds));
diff --git a/src/core/s-unicode.c b/src/core/s-unicode.c
index 128fe8d998..792635d45a 100644
--- a/src/core/s-unicode.c
+++ b/src/core/s-unicode.c
@@ -922,7 +922,9 @@ ConversionResult ConvertUTF8toUTF32 (
 **		Do all the details to decode a string.
 **		Input is a byte series. Len is len of input.
 **		The utf is 0, 8, +/-16, +/-32.
-**		A special -1 means use the BOM.
+**		A special -1 means use the BOM, if present, or UTF-8 otherwise.
+**
+**		Returns the decoded string or NULL for unsupported encodings.
 **
 ***********************************************************************/
 {
@@ -931,17 +933,17 @@ ConversionResult ConvertUTF8toUTF32 (
 	REBINT size;
 
 	if (utf == -1) {
+		// Try to detect UTF encoding from a BOM. Returns 0 if no BOM present.
 		utf = What_UTF(bp, len);
-		if (utf) {
+		if (utf != 0) {
 			if (utf == 8) bp += 3, len -= 3;
 			else if (utf == -16 || utf == 16) bp += 2, len -= 2;
 			//else if (utf == -32 || utf == 32) bp += 4, len -= 4;
 			else return NULL;
-		} else {
-			return NULL;
 		}
 	}
-	else if (utf == 0 || utf == 8) {
+
+	if (utf == 0 || utf == 8) {
 		size = Decode_UTF8((REBUNI*)Reset_Buffer(ser, len), bp, len, TRUE);
 	} 
 	else if (utf == -16 || utf == 16) {
@@ -951,7 +953,8 @@ ConversionResult ConvertUTF8toUTF32 (
 //		size = Decode_UTF32((REBUNI*)Reset_Buffer(ser, len/4 + 1), bp, len, utf < 0, TRUE);
 //	}
 	else {
-		return NULL; /* should never be here */
+		// Encoding is unsupported or not yet implemented.
+		return NULL;
 	}
 
 	if (size < 0) {