Skip to content

Commit

Permalink
Encode into / decode from Uint16Array
Browse files Browse the repository at this point in the history
  • Loading branch information
mathiasbynens committed Jul 30, 2021
1 parent 5f41fc8 commit a347c16
Show file tree
Hide file tree
Showing 8 changed files with 293 additions and 116 deletions.
12 changes: 5 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,25 +28,25 @@ An array of strings, each representing a [label](https://encoding.spec.whatwg.or

### `iso885914.encode(input, options)`

This function takes a plain text string (the `input` parameter) and encodes it according to iso-8859-14. The return value is a ‘byte string’, i.e. a string of which each item represents an octet as per iso-8859-14.
This function takes a plain text string (the `input` parameter) and encodes it according to iso-8859-14. The return value is an environment-agnostic `Uint16Array` of which each element represents an octet as per iso-8859-14.

```js
const encodedData = iso885914.encode(text);
```

The optional `options` object and its `mode` property can be used to set the [error mode](https://encoding.spec.whatwg.org/#error-mode). For encoding, the error mode can be `'fatal'` (the default) or `'html'`.
The optional `options` object and its `mode` property can be used to set the error mode. The two available error modes are `'fatal'` (the default) and `'replacement'`. (Note: This differs from [the spec](https://encoding.spec.whatwg.org/#error-mode), which recognizes `'fatal'` and `'html'` modes for encoders. The reason behind this difference is that the spec algorithm is aimed at producing HTML, whereas this library encodes into an environment-agnostic `Uint16Array` of bytes.)

```js
const encodedData = iso885914.encode(text, {
mode: 'html'
mode: 'replacement'
});
// If `text` contains a symbol that cannot be represented in iso-8859-14,
// instead of throwing an error, it will return an HTML entity for the symbol.
// instead of throwing an error, that symbol is encoded as 0xFFFD.
```

### `iso885914.decode(input, options)`

This function takes a byte string (the `input` parameter) and decodes it according to iso-8859-14.
This function decodes `input` according to iso-8859-14. The `input` parameter can either be a `Uint16Array` of which each element represents an octet as per iso-8859-14, or a ‘byte string’ (i.e. a string of which each item represents an octet as per iso-8859-14).

```js
const text = iso885914.decode(encodedData);
Expand All @@ -62,8 +62,6 @@ const text = iso885914.decode(encodedData, {
// instead of replacing it with U+FFFD in the output, an error is thrown.
```

For decoding a buffer (e.g. from `fs.readFile`) use `buffer.toString('binary')` to get the byte string which `decode` takes.

## Notes

[Similar modules for other single-byte legacy encodings are available.](https://www.npmjs.com/browse/keyword/legacy-encoding)
Expand Down
131 changes: 130 additions & 1 deletion data/encoded.json
Original file line number Diff line number Diff line change
@@ -1 +1,130 @@
"\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0\u00A1\u00A2\u00A3\u00A4\u00A5\u00A6\u00A7\u00A8\u00A9\u00AA\u00AB\u00AC\u00AD\u00AE\u00AF\u00B0\u00B1\u00B2\u00B3\u00B4\u00B5\u00B6\u00B7\u00B8\u00B9\u00BA\u00BB\u00BC\u00BD\u00BE\u00BF\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5\u00C6\u00C7\u00C8\u00C9\u00CA\u00CB\u00CC\u00CD\u00CE\u00CF\u00D0\u00D1\u00D2\u00D3\u00D4\u00D5\u00D6\u00D7\u00D8\u00D9\u00DA\u00DB\u00DC\u00DD\u00DE\u00DF\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5\u00E6\u00E7\u00E8\u00E9\u00EA\u00EB\u00EC\u00ED\u00EE\u00EF\u00F0\u00F1\u00F2\u00F3\u00F4\u00F5\u00F6\u00F7\u00F8\u00F9\u00FA\u00FB\u00FC\u00FD\u00FE\u00FF"
[
128,
129,
130,
131,
132,
133,
134,
135,
136,
137,
138,
139,
140,
141,
142,
143,
144,
145,
146,
147,
148,
149,
150,
151,
152,
153,
154,
155,
156,
157,
158,
159,
160,
161,
162,
163,
164,
165,
166,
167,
168,
169,
170,
171,
172,
173,
174,
175,
176,
177,
178,
179,
180,
181,
182,
183,
184,
185,
186,
187,
188,
189,
190,
191,
192,
193,
194,
195,
196,
197,
198,
199,
200,
201,
202,
203,
204,
205,
206,
207,
208,
209,
210,
211,
212,
213,
214,
215,
216,
217,
218,
219,
220,
221,
222,
223,
224,
225,
226,
227,
228,
229,
230,
231,
232,
233,
234,
235,
236,
237,
238,
239,
240,
241,
242,
243,
244,
245,
246,
247,
248,
249,
250,
251,
252,
253,
254,
255
]
53 changes: 35 additions & 18 deletions iso-8859-14.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -264,12 +264,17 @@ const INDEX_BY_POINTER = new Map([
]);

// https://encoding.spec.whatwg.org/#error-mode
const error = (codePoint, mode) => {
if (mode == 'replacement') {
const decodingError = (mode) => {
if (mode === 'replacement') {
return '\uFFFD';
}
if (codePoint !== null && mode === 'html') {
return '&#' + codePoint + ';';
// Else, `mode == 'fatal'`.
throw new Error();
};

const encodingError = (mode) => {
if (mode === 'replacement') {
return 0xFFFD;
}
// Else, `mode == 'fatal'`.
throw new Error();
Expand All @@ -286,24 +291,36 @@ export const decode = (input, options) => {
if (mode !== 'replacement' && mode !== 'fatal') {
mode = 'replacement';
}

const length = input.length;

// Support byte strings as input.
if (typeof input === 'string') {
const bytes = new Uint16Array(length);
for (let index = 0; index < length; index++) {
bytes[index] = input.charCodeAt(index);
}
input = bytes;
}

const buffer = [];
for (let index = 0; index < input.length; index++) {
const byteValue = input.charCodeAt(index);
// “If `byte` is in the range `0x00` to `0x7F`, return a code point whose
// value is `byte`.”
for (let index = 0; index < length; index++) {
const byteValue = input[index];
// “If `byte` is an ASCII byte, return a code point whose value is
// `byte`.”
if (0x00 <= byteValue && byteValue <= 0x7F) {
buffer.push(stringFromCharCode(byteValue));
continue;
}
// “Let `code point` be the index code point for `byte − 0x80` in index
// `single-byte`.”
// single-byte.”
const pointer = byteValue - 0x80;
if (INDEX_BY_POINTER.has(pointer)) {
// “Return a code point whose value is `code point`.”
buffer.push(INDEX_BY_POINTER.get(pointer));
} else {
// “If `code point` is `null`, return `error`.”
buffer.push(error(null, mode));
buffer.push(decodingError(mode));
}
}
const result = buffer.join('');
Expand All @@ -321,27 +338,27 @@ export const encode = (input, options) => {
if (mode !== 'fatal' && mode !== 'html') {
mode = 'fatal';
}
const buffer = [];
for (let index = 0; index < input.length; index++) {
const length = input.length;
const result = new Uint16Array(length);
for (let index = 0; index < length; index++) {
const codePoint = input.charCodeAt(index);
// “If `code point` is in the range U+0000 to U+007F, return a byte whose
// “If `code point` is an ASCII code point, return a byte whose
// value is `code point`.”
if (0x00 <= codePoint && codePoint <= 0x7F) {
buffer.push(stringFromCharCode(codePoint));
result[index] = codePoint;
continue;
}
// “Let `pointer` be the index pointer for `code point` in index
// `single-byte`.”
// single-byte.”
if (INDEX_BY_CODE_POINT.has(codePoint)) {
const pointer = INDEX_BY_CODE_POINT.get(codePoint);
// “Return a byte whose value is `pointer + 0x80`.”
buffer.push(stringFromCharCode(pointer + 0x80));
result[index] = pointer + 0x80;
} else {
// “If `pointer` is `null`, return `error` with `code point`.”
buffer.push(error(codePoint, mode));
result[index] = encodingError(mode);
}
}
const result = buffer.join('');
return result;
};

Expand Down
2 changes: 1 addition & 1 deletion scripts/export-data.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ function objectToMap(object) {

module.exports = {
labels: jsesc(readJSON('data/labels.json'), { compact: false }),
encoded: jsesc(readJSON('data/encoded.json'), { wrap: true }),
encoded: jsesc(readJSON('data/encoded.json'), { wrap: true, numbers: 'hexadecimal' }),
decoded: jsesc(readJSON('data/decoded.json'), { wrap: true }),
indexByCodePoint: jsesc(objectToMap(readJSON('data/index-by-code-point.json')), { compact: false }),
indexByPointer: jsesc(objectToMap(readJSON('data/index-by-pointer.json')), { compact: false }),
Expand Down
4 changes: 2 additions & 2 deletions scripts/transform-data.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ function parse(source) {
const indexByCodePoint = {};
const indexByPointer = {};
let decoded = '';
let encoded = '';
const encoded = [];
var lines = source.split('\n');
for (const line of lines) {
const data = line.trim().split('\t');
Expand All @@ -24,7 +24,7 @@ function parse(source) {
const codePoint = Number(data[1]);
const symbol = String.fromCodePoint(codePoint);
decoded += symbol;
encoded += String.fromCodePoint(pointer + 0x80);
encoded.push(pointer + 0x80);
indexByCodePoint[codePoint] = pointer;
indexByPointer[pointer] = symbol;
}
Expand Down
Loading

0 comments on commit a347c16

Please sign in to comment.