Encode into / decode from Uint16Array

mathiasbynens · Jul 30, 2021 · a347c16 · a347c16
1 parent 5f41fc8
commit a347c16
Show file tree

Hide file tree

Showing 8 changed files with 293 additions and 116 deletions.
diff --git a/README.md b/README.md
@@ -28,25 +28,25 @@ An array of strings, each representing a [label](https://encoding.spec.whatwg.or
 
 ### `iso885914.encode(input, options)`
 
-This function takes a plain text string (the `input` parameter) and encodes it according to iso-8859-14. The return value is a ‘byte string’, i.e. a string of which each item represents an octet as per iso-8859-14.
+This function takes a plain text string (the `input` parameter) and encodes it according to iso-8859-14. The return value is an environment-agnostic `Uint16Array` of which each element represents an octet as per iso-8859-14.
 
 ```js
 const encodedData = iso885914.encode(text);
 ```
 
-The optional `options` object and its `mode` property can be used to set the [error mode](https://encoding.spec.whatwg.org/#error-mode). For encoding, the error mode can be `'fatal'` (the default) or `'html'`.
+The optional `options` object and its `mode` property can be used to set the error mode. The two available error modes are `'fatal'` (the default) and `'replacement'`. (Note: This differs from [the spec](https://encoding.spec.whatwg.org/#error-mode), which recognizes `'fatal'` and `'html'` modes for encoders. The reason behind this difference is that the spec algorithm is aimed at producing HTML, whereas this library encodes into an environment-agnostic `Uint16Array` of bytes.)
 
 ```js
 const encodedData = iso885914.encode(text, {
-  mode: 'html'
+  mode: 'replacement'
 });
 // If `text` contains a symbol that cannot be represented in iso-8859-14,
-// instead of throwing an error, it will return an HTML entity for the symbol.
+// instead of throwing an error, that symbol is encoded as 0xFFFD.
 ```
 
 ### `iso885914.decode(input, options)`
 
-This function takes a byte string (the `input` parameter) and decodes it according to iso-8859-14.
+This function decodes `input` according to iso-8859-14. The `input` parameter can either be a `Uint16Array` of which each element represents an octet as per iso-8859-14, or a ‘byte string’ (i.e. a string of which each item represents an octet as per iso-8859-14).
 
 ```js
 const text = iso885914.decode(encodedData);
@@ -62,8 +62,6 @@ const text = iso885914.decode(encodedData, {
 // instead of replacing it with U+FFFD in the output, an error is thrown.
 ```
 
-For decoding a buffer (e.g. from `fs.readFile`) use `buffer.toString('binary')` to get the byte string which `decode` takes.
-
 ## Notes
 
 [Similar modules for other single-byte legacy encodings are available.](https://www.npmjs.com/browse/keyword/legacy-encoding)

diff --git a/data/encoded.json b/data/encoded.json
@@ -1 +1,130 @@
-"\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0\u00A1\u00A2\u00A3\u00A4\u00A5\u00A6\u00A7\u00A8\u00A9\u00AA\u00AB\u00AC\u00AD\u00AE\u00AF\u00B0\u00B1\u00B2\u00B3\u00B4\u00B5\u00B6\u00B7\u00B8\u00B9\u00BA\u00BB\u00BC\u00BD\u00BE\u00BF\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5\u00C6\u00C7\u00C8\u00C9\u00CA\u00CB\u00CC\u00CD\u00CE\u00CF\u00D0\u00D1\u00D2\u00D3\u00D4\u00D5\u00D6\u00D7\u00D8\u00D9\u00DA\u00DB\u00DC\u00DD\u00DE\u00DF\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5\u00E6\u00E7\u00E8\u00E9\u00EA\u00EB\u00EC\u00ED\u00EE\u00EF\u00F0\u00F1\u00F2\u00F3\u00F4\u00F5\u00F6\u00F7\u00F8\u00F9\u00FA\u00FB\u00FC\u00FD\u00FE\u00FF"
+[
+	128,
+	129,
+	130,
+	131,
+	132,
+	133,
+	134,
+	135,
+	136,
+	137,
+	138,
+	139,
+	140,
+	141,
+	142,
+	143,
+	144,
+	145,
+	146,
+	147,
+	148,
+	149,
+	150,
+	151,
+	152,
+	153,
+	154,
+	155,
+	156,
+	157,
+	158,
+	159,
+	160,
+	161,
+	162,
+	163,
+	164,
+	165,
+	166,
+	167,
+	168,
+	169,
+	170,
+	171,
+	172,
+	173,
+	174,
+	175,
+	176,
+	177,
+	178,
+	179,
+	180,
+	181,
+	182,
+	183,
+	184,
+	185,
+	186,
+	187,
+	188,
+	189,
+	190,
+	191,
+	192,
+	193,
+	194,
+	195,
+	196,
+	197,
+	198,
+	199,
+	200,
+	201,
+	202,
+	203,
+	204,
+	205,
+	206,
+	207,
+	208,
+	209,
+	210,
+	211,
+	212,
+	213,
+	214,
+	215,
+	216,
+	217,
+	218,
+	219,
+	220,
+	221,
+	222,
+	223,
+	224,
+	225,
+	226,
+	227,
+	228,
+	229,
+	230,
+	231,
+	232,
+	233,
+	234,
+	235,
+	236,
+	237,
+	238,
+	239,
+	240,
+	241,
+	242,
+	243,
+	244,
+	245,
+	246,
+	247,
+	248,
+	249,
+	250,
+	251,
+	252,
+	253,
+	254,
+	255
+]
diff --git a/iso-8859-14.mjs b/iso-8859-14.mjs
@@ -264,12 +264,17 @@ const INDEX_BY_POINTER = new Map([
 ]);
 
 // https://encoding.spec.whatwg.org/#error-mode
-const error = (codePoint, mode) => {
-	if (mode == 'replacement') {
+const decodingError = (mode) => {
+	if (mode === 'replacement') {
 		return '\uFFFD';
 	}
-	if (codePoint !== null && mode === 'html') {
-		return '&#' + codePoint + ';';
+	// Else, `mode == 'fatal'`.
+	throw new Error();
+};
+
+const encodingError = (mode) => {
+	if (mode === 'replacement') {
+		return 0xFFFD;
 	}
 	// Else, `mode == 'fatal'`.
 	throw new Error();
@@ -286,24 +291,36 @@ export const decode = (input, options) => {
 	if (mode !== 'replacement' && mode !== 'fatal') {
 		mode = 'replacement';
 	}
+
+	const length = input.length;
+
+	// Support byte strings as input.
+	if (typeof input === 'string') {
+		const bytes = new Uint16Array(length);
+		for (let index = 0; index < length; index++) {
+			bytes[index] = input.charCodeAt(index);
+		}
+		input = bytes;
+	}
+
 	const buffer = [];
-	for (let index = 0; index < input.length; index++) {
-		const byteValue = input.charCodeAt(index);
-		// “If `byte` is in the range `0x00` to `0x7F`, return a code point whose
-		// value is `byte`.”
+	for (let index = 0; index < length; index++) {
+		const byteValue = input[index];
+		// “If `byte` is an ASCII byte, return a code point whose value is
+		// `byte`.”
 		if (0x00 <= byteValue && byteValue <= 0x7F) {
 			buffer.push(stringFromCharCode(byteValue));
 			continue;
 		}
 		// “Let `code point` be the index code point for `byte − 0x80` in index
-		// `single-byte`.”
+		// single-byte.”
 		const pointer = byteValue - 0x80;
 		if (INDEX_BY_POINTER.has(pointer)) {
 			// “Return a code point whose value is `code point`.”
 			buffer.push(INDEX_BY_POINTER.get(pointer));
 		} else {
 			// “If `code point` is `null`, return `error`.”
-			buffer.push(error(null, mode));
+			buffer.push(decodingError(mode));
 		}
 	}
 	const result = buffer.join('');
@@ -321,27 +338,27 @@ export const encode = (input, options) => {
 	if (mode !== 'fatal' && mode !== 'html') {
 		mode = 'fatal';
 	}
-	const buffer = [];
-	for (let index = 0; index < input.length; index++) {
+	const length = input.length;
+	const result = new Uint16Array(length);
+	for (let index = 0; index < length; index++) {
 		const codePoint = input.charCodeAt(index);
-		// “If `code point` is in the range U+0000 to U+007F, return a byte whose
+		// “If `code point` is an ASCII code point, return a byte whose
 		// value is `code point`.”
 		if (0x00 <= codePoint && codePoint <= 0x7F) {
-			buffer.push(stringFromCharCode(codePoint));
+			result[index] = codePoint;
 			continue;
 		}
 		// “Let `pointer` be the index pointer for `code point` in index
-		// `single-byte`.”
+		// single-byte.”
 		if (INDEX_BY_CODE_POINT.has(codePoint)) {
 			const pointer = INDEX_BY_CODE_POINT.get(codePoint);
 			// “Return a byte whose value is `pointer + 0x80`.”
-			buffer.push(stringFromCharCode(pointer + 0x80));
+			result[index] = pointer + 0x80;
 		} else {
 			// “If `pointer` is `null`, return `error` with `code point`.”
-			buffer.push(error(codePoint, mode));
+			result[index] = encodingError(mode);
 		}
 	}
-	const result = buffer.join('');
 	return result;
 };
 

diff --git a/scripts/export-data.js b/scripts/export-data.js
@@ -15,7 +15,7 @@ function objectToMap(object) {
 
 module.exports = {
 	labels: jsesc(readJSON('data/labels.json'), { compact: false }),
-	encoded: jsesc(readJSON('data/encoded.json'), { wrap: true }),
+	encoded: jsesc(readJSON('data/encoded.json'), { wrap: true, numbers: 'hexadecimal' }),
 	decoded: jsesc(readJSON('data/decoded.json'), { wrap: true }),
 	indexByCodePoint: jsesc(objectToMap(readJSON('data/index-by-code-point.json')), { compact: false }),
 	indexByPointer: jsesc(objectToMap(readJSON('data/index-by-pointer.json')), { compact: false }),

diff --git a/scripts/transform-data.js b/scripts/transform-data.js
@@ -13,7 +13,7 @@ function parse(source) {
 	const indexByCodePoint = {};
 	const indexByPointer = {};
 	let decoded = '';
-	let encoded = '';
+	const encoded = [];
 	var lines = source.split('\n');
 	for (const line of lines) {
 		const data = line.trim().split('\t');
@@ -24,7 +24,7 @@ function parse(source) {
 		const codePoint = Number(data[1]);
 		const symbol = String.fromCodePoint(codePoint);
 		decoded += symbol;
-		encoded += String.fromCodePoint(pointer + 0x80);
+		encoded.push(pointer + 0x80);
 		indexByCodePoint[codePoint] = pointer;
 		indexByPointer[pointer] = symbol;
 	}