From 0f70ad91ff6bf007eb8c1296cc88004f0768d9c3 Mon Sep 17 00:00:00 2001 From: Hyeseong Kim Date: Fri, 30 Aug 2024 04:21:25 +0900 Subject: [PATCH] Migrate Unicode codegen script into JavaScript Resolves #40 --- package.json | 2 +- scripts/unicode.js | 1001 +++++++++++++++++++++++++++++++++++++ src/_emoji_table.js | 3 + src/_general_table.js | 5 +- src/_grapheme_table.js | 16 +- src/_incb_table.js | 3 + test/_unicode_testdata.js | 5 +- tsconfig.json | 1 + yarn.lock | 18 +- 9 files changed, 1028 insertions(+), 26 deletions(-) create mode 100755 scripts/unicode.js diff --git a/package.json b/package.json index db19924..985ad07 100644 --- a/package.json +++ b/package.json @@ -127,7 +127,7 @@ "pretty-bytes": "^6.1.1", "rimraf": "^5.0.5", "tinybench": "^2.6.0", - "typescript": "^5.4.5", + "typescript": "^5.5.4", "unicode-segmentation-wasm": "portal:benchmark/unicode-segmentation-wasm", "vite": "^5.2.11", "xregexp": "5.1.1", diff --git a/scripts/unicode.js b/scripts/unicode.js new file mode 100755 index 0000000..116c677 --- /dev/null +++ b/scripts/unicode.js @@ -0,0 +1,1001 @@ +#!/usr/bin/env node + +// Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. 
+// +// This script has been modified from +// [https://github.com/unicode-rs/unicode-segmentation/blob/b4c9ce15/scripts/unicode.py] +// +// Which is licensed under the +// [MIT license](../licenses/unicode-segmentation_MIT.txt) +// +// This script uses the following Unicode tables: +// - DerivedCoreProperties.txt +// - auxiliary/GraphemeBreakProperty.txt +// - auxiliary/GraphemeBreakTest.txt +// - auxiliary/WordBreakProperty.txt +// - auxiliary/WordBreakTest.txt +// - ReadMe.txt +// - UnicodeData.txt + +// @ts-check + +/** + * @import { WriteStream } from 'node:fs'; + * @import { UnicodeRange, CategorizedUnicodeRange } from '../src/core.js'; + * + * @typedef {number[]} UnicodeValues + */ + +import * as assert from 'node:assert/strict'; +import * as path from 'node:path'; +import { existsSync, createWriteStream } from 'node:fs'; +import * as fs from 'node:fs/promises'; +import { fileURLToPath } from 'node:url'; + +let __dirname = path.dirname(fileURLToPath(import.meta.url)); +let srcPath = path.resolve(__dirname, '../src'); +let testPath = path.resolve(__dirname, '../test'); +let dataPath = path.resolve(__dirname, 'unicode_data'); + +let preamble = ` +// The following code was generated by "scripts/unicode.js", +// DO NOT EDIT DIRECTLY. 
+// +// @ts-check +`.trimStart(); + +/** @type {[major: number, minor: number, patch: number]} */ +const UNICODE_VERSION = [15, 1, 0]; +const UNICODE_VERSION_STRING = UNICODE_VERSION.join('.'); + +// these are the surrogate codepoints, which are not valid Unicode scalar values +/** @type {UnicodeRange} */ +let surrogateCodepoints = [0xd800, 0xdfff]; + +/** @type {Record<string, string[]>} */ +let expandedCategories = { + 'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'], + 'Lm': ['L'], 'Lo': ['L'], + 'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'], + 'Nd': ['N'], 'Nl': ['N'], 'No': ['N'], + 'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'], + 'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'], + 'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'], + 'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'], + 'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'], +}; + +/** + * @param {string} f + * @return {Promise<string>} + */ +let fetchData = async f => { + let url = new URL(`https://www.unicode.org/Public/${UNICODE_VERSION_STRING}/ucd/${f}`); + let filePath = path.join(dataPath, f); + + let content = null; + if (existsSync(filePath)) { + content = await fs.readFile(filePath, 'utf-8'); + } else { + let res = await fetch(url); + if (!res.ok) { + throw new Error(`Failed to fetch ${url}`); + } + content = await res.text(); + await fs.mkdir(path.dirname(filePath), { recursive: true }); + await fs.writeFile(filePath, content, 'utf-8'); + } + + return content; +}; + +/** + * @template T + * @param {T[]} a + * @param {T[]} b + */ +let arraysEqual = (a, b) => { + return a.length === b.length && a.every((v, i) => v === b[i]); +}; + +/** + * @param {number} from + * @param {number} to + * @return {Iterable<number>} + */ +let range = (from, to) => { + return Array.from({ length: to - from }, (_, i) => i + from); +}; + +/** + * @param {string} str + * @return {string} + */ +let capitalize = str => { + return str[0].toUpperCase() + str.slice(1); +}; + +/** + * @param {number} c + * @return {string} + */ +let escapeUnicode = c => { + return 
`\\u{${c.toString(16).padStart(4, '0')}}`; +}; + +/** + * @param {number} n + * @return {boolean} + */ +let isSurrogate = n => { + return surrogateCodepoints[0] <= n && n <= surrogateCodepoints[1]; +}; + +/** + * @template T + * @param {Set<T>} a + * @param {Set<T>} b + * @return {Set<T>} + */ +let difference = (a, b) => { + const result = new Set(a); + if (a.size <= b.size) { + for (const elem of a) { + if (b.has(elem)) { + result.delete(elem); + } + } + } else { + for (const elem of b.keys()) { + if (result.has(elem)) { + result.delete(elem); + } + } + } + return result; +}; + +/** + * @param {UnicodeRange} range + * @param {boolean} [compressed=true] + * @return {string} + */ +let formatRange = (range, compressed = true) => { + return compressed + ? `[${range[0]},${range[1]}]` + : `[${range[0]}, ${range[1]}]` +}; + +/** + * @param {UnicodeRange[]} ranges + * @return {UnicodeValues} + */ +let ungroupCat = ranges => { + /** @type {UnicodeValues} */ + let catOut = []; + for (let [lo, hi] of ranges) { + while (lo <= hi) { + catOut.push(lo); + lo += 1; + } + } + return catOut; +}; + +/** + * @param {UnicodeValues} values + * @return {UnicodeRange[]} + */ +let groupCat = values => { + /** @type {UnicodeRange[]} */ + let catOut = []; + let letters = [...new Set(values)].toSorted((a, b) => a - b); + + let curStart = letters.shift(); + assert.ok(curStart != null); + + let curEnd = curStart; + for (let letter of letters) { + assert.ok(letter > curEnd, `curEnd: ${curEnd}, letter: ${letter}`); + + if (letter === curEnd + 1) { + curEnd = letter; + } else { + catOut.push([curStart, curEnd]); + curStart = curEnd = letter; + } + } + catOut.push([curStart, curEnd]); + return catOut; +}; + +/** + * @param {Record<string, UnicodeValues>} cats + * @return {Record<string, UnicodeRange[]>} + */ +let groupCats = cats => { + /** @type {Record<string, UnicodeRange[]>} */ + let catsOut = {}; + for (let [cat, codes] of Object.entries(cats)) { + catsOut[cat] = groupCat(codes); + } + return catsOut; +}; + +/** + * @param {string} data + * @return {Record<string, UnicodeRange[]>} + */ +let 
parseGencats = (data) => { + /** @type {Record<string, UnicodeValues>} */ + let gencats = {}; + + /** @type {Record<number, string[]>} */ + let udict = {}; + let rangeStart = -1; + + for (let line of data.split('\n')) { + let data = line.split(';'); + if (data.length !== 15) { + continue; + } + let cp = Number.parseInt(data[0], 16); + if (isSurrogate(cp)) { + continue; + } + if (rangeStart >= 0) { + for (let i of range(rangeStart, cp)) { + udict[i] = data; + } + rangeStart = -1; + } + if (data[1].endsWith(', First>')) { + rangeStart = cp; + continue; + } + udict[cp] = data; + } + + for (let [code, data] of Object.entries(udict)) { + let [codeOrg, name, gencat, combine, bidi, + decomp, deci, digit, num, mirror, + old, iso, upcase, lowcase, titlecase] = data; + + // place letter in categories as appropriate + for (let cat of [gencat, "Assigned"].concat(expandedCategories[gencat] || [])) { + gencats[cat] ||= []; + gencats[cat].push(Number.parseInt(code)); + } + } + + return groupCats(gencats); +}; + +/** + * @param {string} data + * @param {string[]} [interestingProps] + * @return {Record<string, UnicodeRange[]>} + */ +let parseProperties = (data, interestingProps) => { + let pattern = /^ *([0-9A-F]+)(?:\.\.([0-9A-F]+))? *; *(\w+)(?:; *(\w+))?/; + + /** @type {Record<string, UnicodeRange[]>} */ + let props = {}; + + for (let line of data.split('\n')) { + let match = line.match(pattern); + if (!match) { + continue; + } + + let d_lo = match[1]; + let d_hi = match[2] || d_lo; + let prop = match[3]; + let value = match[4]; + + let propKey = value ? 
`${prop}=${value}` : prop; + let propValue = value || prop; + + if (interestingProps && !interestingProps.some(p => propKey === p || prop === p)) { + continue; + } + + let lo = Number.parseInt(d_lo, 16); + let hi = Number.parseInt(d_hi, 16); + + props[propValue] ||= []; + props[propValue].push([lo, hi]); + } + + for (let [key, ranges] of Object.entries(props)) { + props[key] = groupCat(ungroupCat(ranges)); + } + + return props; +}; + +/** + * @param {string} data + * @param {string[]} [optsplit=[]] + */ +let parseTestData = (data, optsplit = []) => { + /** + * @param {string} str + * @param {UnicodeValues[]} chars + * @param {string[]} o + * @return {[UnicodeValues[], string[]]} + */ + let processSplitInfo = (str, chars, o) => { + /** @type {UnicodeValues[]} */ + let outcs = []; + /** @type {string[]} */ + let outis = []; + let workcs = chars.shift(); + + let s = str; + + // are we on a × or a ÷? + let isX = false; + if (s.startsWith('×')) { + isX = true; + } + + // find each instance of '(÷|×) [x.y] ' + while (s) { + // find the currently considered rule number + let sInd = s.indexOf('[') + 1; + let eInd = s.indexOf(']'); + + if (!isX || o.includes(s.substring(sInd, eInd))) { + outis.push(s.substring(sInd, eInd)); + outcs.push(workcs || []); + workcs = chars.shift(); + } else { + workcs = workcs?.concat(chars.shift() || []); + } + + let idx = 1; + while (idx < s.length) { + if (s.substring(idx).startsWith('×')) { + isX = true; + break; + } + if (s.substring(idx).startsWith('÷')) { + isX = false; + break; + } + idx += 1; + } + s = s.substring(idx); + } + + outcs.push(workcs || []); + return [outcs, outis]; + }; + + /** + * @param {string} str + * @return {UnicodeValues[]} + */ + let processSplitString = (str) => { + /** @type {UnicodeValues[]} */ + let outls = []; + + /** @type {UnicodeValues} */ + let workls = []; + + let inls = str.split(' '); + for (let i of inls) { + if (i === '÷' || i === '×') { + outls.push(workls); + workls = []; + continue; + } + + let ival 
= Number.parseInt(`0x${i}`, 16); + if (isSurrogate(ival)) { + return []; + } + + workls.push(ival); + } + + if (workls.length) { + outls.push(workls); + } + + return outls; + }; + + let pattern = /^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$/; + + /** + * @type {Array<[UnicodeValues[], string[]]>} + */ + let testdata = []; + for (let line of data.split('\n')) { + // lines that include a test start with the ÷ character + if (line.length < 2 || !line.startsWith('÷')) { + continue; + } + + let match = line.match(pattern); + if (!match) { + console.error(`error: no match on line where test was expected: ${line}`); + continue; + } + + // process the characters in this test case + let chars = processSplitString(match[1]); + // skip test case if it contains invalid characters (viz., surrogates) + if (!chars.length) { + continue; + } + + let [proceedChars, info] = processSplitInfo(match[2], chars, optsplit); + assert.equal(proceedChars.length - 1, info.length); + + testdata.push([proceedChars, info]); + } + + return testdata; +}; + +/** + * @template T + * @param {WriteStream} f + * @param {string} name + * @param {T[]} table + * @param {(row: T) => string} format + */ +let printTableCompressed = (f, name, table, format) => { + f.write(`export const ${name} = JSON.parse('[`); + let first = true; + for (let row of table) { + if (first) { + f.write(format(row)); + } else { + f.write(',' + format(row)); + } + first = false; + } + f.write(`]');`); +}; + +/** + * @template T + * @param {WriteStream} f + * @param {string} name + * @param {T[]} table + * @param {(row: T) => string} format + */ +let printTableRaw = (f, name, table, format) => { + f.write(`export const ${name} = [\n`); + for (let row of table) { + f.write(` ${format(row)},\n`); + } + f.write('];'); +}; + +/** + * @param {WriteStream} f + * @param {CategorizedUnicodeRange[]} breakTable + * @param {string[]} breakCats + * @param {string} name + * @returns + */ +let printBreakModule = (f, 
breakTable, breakCats, name) => { + let cats = [...breakCats, 'Any'].toSorted(); + + let capitalName = capitalize(name); + let typeName = `${capitalName}Category`; + let keyTypeName = `${typeName}Key`; + let numTypeName = `${typeName}Num`; + + // We don't want the lookup table to be too large so choose a reasonable + // cutoff. 0x20000 is selected because most of the range table entries are + // within the interval of [0x0, 0x20000] + let lookupValueCutoff = 0x20000; + + // Length of lookup table. It has to be a divisor of `lookupValueCutoff`. + let lookupTableLen = 0x400; + + let lookupInterval = Math.round(lookupValueCutoff / lookupTableLen); + + let lookupTable = Array.from({ length: lookupTableLen }, _ => 0); + let j = 0; + for (let i of range(0, lookupTableLen)) { + let lookupFrom = i * lookupInterval; + while (j < breakTable.length) { + let [_, entryTo] = breakTable[j]; + if (entryTo >= lookupFrom) { + break; + } + j += 1; + } + lookupTable[i] = j; + } + + f.write(preamble); + f.write(` +import { bsearchUnicodeRange } from './core.js'; + +/** +`, + ); + + /** @type {Record<string, number>} */ + let inversed = {}; + cats.forEach((cat, idx) => { + inversed[cat] = idx; + f.write(` * @typedef {${idx}} ${typeName[0]}C_${cat}\n`); + }); + + f.write(' * @typedef {(\n'); + for (let cat of cats) { + f.write(` * | ${typeName[0]}C_${cat}\n`); + } + f.write(` * )} ${numTypeName}\n`); + f.write(' */\n\n'); + + f.write(` +/** + * @typedef {import('./core.js').CategorizedUnicodeRange<${numTypeName}>} ${typeName}Range + * + * NOTE: It might be garbage \`from\` and \`to\` values when the \`category\` is {@link ${typeName[0]}C_Any}. 
+ */ + +`.trimStart(), + ); + + f.write(` +/** + * @typedef {( +`.trimStart(), + ); + for (let cat of cats) { + f.write(` * | '${cat}'\n`); + } + f.write(` * )} ${keyTypeName}\n`); + f.write(' */\n\n'); + + f.write(` +/** + * Grapheme category enum + * + * Note: The enum object is not actually \`Object.freeze\` + * because it increases 800 bytes of Brotli compression... Not sure why :P + * + * @type {Readonly>} + */ +export const ${typeName} = { +`.trimStart(), + ); + for (let cat of cats) { + f.write(` ${cat}: ${inversed[cat]},\n`); + } + f.write('};\n\n'); + + printTableCompressed( + f, + `${name}_cat_lookup`, + lookupTable, + x => x.toString(), + ); + f.write('\n\n'); + + f.write(` +/** + * @type {${typeName}Range[]} + */ +`.trimStart(), + ); + + printTableCompressed( + f, + `${name}_cat_table`, + breakTable, + x => `[${x[0]},${x[1]},${inversed[x[2]]}]`, + ); + f.write('\n\n'); + + f.write(` +/** + * @param {number} cp + * @return An exact {@link ${typeName}Range} if found, or garbage \`start\` and \`from\` values with {@link ${typeName[0]}C_Any} category. + */ +export function search${typeName}(cp) { + // Perform a quick O(1) lookup in a precomputed table to determine + // the slice of the range table to search in. + let lookup_table = ${name}_cat_lookup; + let lookup_interval = 0x${lookupInterval.toString(16)}; + + let idx = cp / lookup_interval | 0; + // If the \`idx\` is outside of the precomputed table - use the slice + // starting from the last covered index in the precomputed table and + // ending with the length of the range table. + let sliceFrom = ${j}, sliceTo = ${breakTable.length}; + if (idx + 1 < lookup_table.length) { + sliceFrom = lookup_table[idx]; + sliceTo = lookup_table[idx + 1] + 1; + } + + // Compute pessimistic default lower and upper bounds on the category. + // If character doesn't map to any range and there is no adjacent range + // in the table slice - these bounds has to apply. 
+ let lower = idx * lookup_interval; + let upper = lower + lookup_interval - 1; + return bsearchUnicodeRange(cp, ${name}_cat_table, lower, upper, sliceFrom, sliceTo); +} +`.trimStart(), + ); +}; + +/** + * @param {WriteStream} f + */ +let printIncbModule = async f => { + let ucd = await fetchData('DerivedCoreProperties.txt'); + let props = parseProperties(ucd, ['InCB=Consonant']); + + f.write(preamble); + f.write(` +/** + * The Unicode \`Indic_Conjunct_Break=Consonant\` derived property table + * + * @type {import('./core.js').UnicodeRange[]} + */ +`, + ); + printTableCompressed( + f, + 'consonant_table', + props['Consonant'], + formatRange, + ); + f.write('\n'); +}; + +/** + * @param {WriteStream} f + */ +let printGeneralModule = async f => { + let [ + gencatSrc, + derivedSrc, + ] = await Promise.all([ + fetchData('UnicodeData.txt'), + fetchData('DerivedCoreProperties.txt'), + ]); + + let gencats = parseGencats(gencatSrc); + let derived = parseProperties(derivedSrc, ['Alphabetic']); + + f.write(preamble); + f.write(` +/** + * The Unicode \`L\` (Letter) property table + * + * @type {import('./core.js').UnicodeRange[]} + */ +`, + ); + printTableCompressed(f, 'letter_table', gencats['L'], formatRange); + f.write('\n'); + + f.write(` +/** + * The Unicode \`N\` (Numeric) property table + * + * @type {import('./core.js').UnicodeRange[]} + */ +`, + ) + printTableCompressed(f, 'numeric_table', gencats['N'], formatRange); + f.write('\n'); + + f.write(` +/** + * The Unicode \`Alphabetic\` property table + * + * @type {import('./core.js').UnicodeRange[]} + */ +`, + ) + printTableCompressed(f, 'alphabetic_table', derived['Alphabetic'], formatRange); + f.write('\n'); +}; + +/** + * @param {WriteStream} f + */ +let printEmojiModule = async f => { + let emojiData = await fetchData('emoji/emoji-data.txt'); + let emojiProps = parseProperties(emojiData, ['Extended_Pictographic', 'Emoji_Presentation']); + + f.write(preamble); + f.write(` +/** + * The Unicode \`Emoji_Presentation\` 
property table + * + * @type {import('./core.js').UnicodeRange[]} + */ +`, + ); + printTableCompressed( + f, + 'emoji_presentation_table', + emojiProps['Emoji_Presentation'], + formatRange, + ); + f.write('\n'); + + f.write(` +/** + * The Unicode \`Extended_Pictographic\` property table + * + * @type {import('./core.js').UnicodeRange[]} + */ +`, + ); + printTableCompressed( + f, + 'extended_pictographic_table', + emojiProps['Extended_Pictographic'], + formatRange, + ); + f.write('\n'); +}; + +/** + * @param {WriteStream} f + */ +let printTestDataModule = async f => { + f.write(preamble); + f.write(` +/** + * @typedef {[input: string, expected: string[]]} TestCase + */ +`, + ); + + /** + * @typedef {[UnicodeValues, UnicodeValues[]]} TestCaseRow + */ + + /** + * @param {TestCaseRow} row + * @return {string} + */ + let formatTestCase = row => { + let outstr = `['`; + for (let c of row[0]) { + outstr += escapeUnicode(c); + } + outstr += `', [`; + let xfirst = true; + for (let x of [row[1]]) { + if (!xfirst) { + outstr += '], ['; + } + xfirst = false; + let sfirst = true; + for (let sp of x) { + if (!sfirst) { + outstr += ', '; + } + sfirst = false; + outstr += `'`; + for (let c of sp) { + outstr += escapeUnicode(c); + } + outstr += `'`; + } + } + outstr += ']]'; + return outstr; + }; + + let grapehmeTestDataSrc = await fetchData('auxiliary/GraphemeBreakTest.txt'); + // rules 9.1 and 9.2 are for extended graphemes only + let optsplits = ['9.1', '9.2']; + let graphemeTestData = parseTestData(grapehmeTestDataSrc, optsplits); + + /** @type {TestCaseRow[]} */ + let tests = []; + + for (let [c, i] of graphemeTestData) { + let allChars = c.flatMap(s => s); + + /** @type {UnicodeValues[]} */ + let extgraphs = []; + + /** @type {UnicodeValues} */ + let extwork = []; + + extwork = extwork.concat(c[0]); + for (let n of range(0, i.length)) { + if (optsplits.includes(i[n])) { + extwork = extwork.concat(c[n + 1]); + } else { + extgraphs.push(extwork); + extwork = []; + extwork = 
extwork.concat(c[n + 1]); + } + } + + // These are the extended grapheme clusters + // And the JS segmenter only cares about extended grapheme clusters + extgraphs.push(extwork); + + if (arraysEqual(extgraphs, c)) { + tests.push([allChars, c]); + } else { + tests.push([allChars, extgraphs]); + } + } + + f.write(` +/** + * Official Unicode test data for extended grapheme clusters + * + * @see http://www.unicode.org/Public/${UNICODE_VERSION_STRING}/ucd/auxiliary/GraphemeBreakTest.txt + * + * @type {TestCase[]} + */ +`, + ); + printTableRaw(f, 'TESTDATA_GRAPHEME', tests, formatTestCase); + f.write('\n'); +}; + +/** + * @param {string} file + * @param {(f: WriteStream) => Promise<void>} print + */ +let emitSrc = async (file, print) => { + let filePath = path.join(srcPath, file); + let writeStream = createWriteStream(filePath, 'utf-8'); + try { + await print(writeStream); + } finally { + writeStream.end(); + } +}; + +/** + * @param {string} file + * @param {(f: WriteStream) => Promise<void>} print + */ +let emitTest = async (file, print) => { + let filePath = path.join(testPath, file); + let writeStream = createWriteStream(filePath, 'utf-8'); + try { + await print(writeStream); + } finally { + writeStream.end(); + } +}; + + +// Start main procedure + +let [ + graphemeData, + emojiData, + // wordData, + // sentenceData, +] = await Promise.all([ + fetchData('auxiliary/GraphemeBreakProperty.txt'), + fetchData('emoji/emoji-data.txt'), + // fetchData('auxiliary/WordBreakProperty.txt'), + // fetchData('auxiliary/SentenceBreakProperty.txt'), +]); + +let graphemeCats = parseProperties(graphemeData); +// Control +// Note: +// This category also includes Cs (surrogate codepoints). 
+// We have to remove Cs from the Control category +graphemeCats["Control"] = groupCat(Array.from( + difference( + new Set(ungroupCat(graphemeCats["Control"])), + new Set(ungroupCat([surrogateCodepoints])), + ), +)); + +let emojiProps = parseProperties(emojiData, ['Extended_Pictographic']); + +/** @type {CategorizedUnicodeRange[]} */ +let graphemeTable = []; +for (let [cat, ranges] of Object.entries(graphemeCats)) { + for (let [from, to] of ranges) { + graphemeTable.push([from, to, cat]); + } +} +for (let [cat, ranges] of Object.entries(emojiProps)) { + for (let [from, to] of ranges) { + graphemeTable.push([from, to, cat]); + } +} +graphemeTable.sort((a, b) => a[0] - b[0]); + +let last = -1; +for (let chars of graphemeTable) { + if (chars[0] <= last) { + throw new Error('Grapheme tables and Extended_Pictographic values overlap; need to store these separately!'); + } + last = chars[1]; +} + +// let wordCats = parseProperties(wordData); +// /** @type {CategorizedUnicodeRange[]} */ +// let wordTable = []; +// for (let [cat, ranges] of Object.entries(wordCats)) { +// for (let [from, to] of ranges) { +// graphemeTable.push([from, to, cat]); +// } +// } +// wordTable.sort((a, b) => a[0] - b[0]); + +// let sentenceCats = parseProperties(sentenceData); +// /** @type {CategorizedUnicodeRange[]} */ +// let sentenceTable = []; +// for (let [cat, ranges] of Object.entries(sentenceCats)) { +// for (let [from, to] of ranges) { +// graphemeTable.push([from, to, cat]); +// } +// } +// sentenceTable.sort((a, b) => a[0] - b[0]); + +await emitSrc( + '_grapheme_table.js', + async f => printBreakModule( + f, + graphemeTable, + Object.keys(graphemeCats).concat(['Extended_Pictographic']), + 'grapheme', + ), +); + +// emitSrc( +// '_word_table.js', +// async f => printBreakModule( +// f, +// wordTable, +// Object.keys(wordCats), +// 'word', +// ), +// ); + +// emitSrc( +// '_sentence_table.js', +// async f => printBreakModule( +// f, +// sentenceTable, +// Object.keys(sentenceCats), +// 
'sentence', +// ), +// ); + +await emitSrc( + '_incb_table.js', + printIncbModule, +); + +await emitSrc( + '_general_table.js', + printGeneralModule, +); + +await emitSrc( + '_emoji_table.js', + printEmojiModule, +); + +await emitTest( + '_unicode_testdata.js', + printTestDataModule, +); diff --git a/src/_emoji_table.js b/src/_emoji_table.js index 05cb093..94fd9ae 100644 --- a/src/_emoji_table.js +++ b/src/_emoji_table.js @@ -1,3 +1,6 @@ +// The following code was generated by "scripts/unicode.js", +// DO NOT EDIT DIRECTLY. +// // @ts-check /** diff --git a/src/_general_table.js b/src/_general_table.js index fe7b6f2..c6c9a84 100644 --- a/src/_general_table.js +++ b/src/_general_table.js @@ -1,3 +1,6 @@ +// The following code was generated by "scripts/unicode.js", +// DO NOT EDIT DIRECTLY. +// // @ts-check /** @@ -8,7 +11,7 @@ export const letter_table = JSON.parse('[[65,90],[97,122],[170,170],[181,181],[186,186],[192,214],[216,246],[248,705],[710,721],[736,740],[748,748],[750,750],[880,884],[886,887],[890,893],[895,895],[902,902],[904,906],[908,908],[910,929],[931,1013],[1015,1153],[1162,1327],[1329,1366],[1369,1369],[1376,1416],[1488,1514],[1519,1522],[1568,1610],[1646,1647],[1649,1747],[1749,1749],[1765,1766],[1774,1775],[1786,1788],[1791,1791],[1808,1808],[1810,1839],[1869,1957],[1969,1969],[1994,2026],[2036,2037],[2042,2042],[2048,2069],[2074,2074],[2084,2084],[2088,2088],[2112,2136],[2144,2154],[2160,2183],[2185,2190],[2208,2249],[2308,2361],[2365,2365],[2384,2384],[2392,2401],[2417,2432],[2437,2444],[2447,2448],[2451,2472],[2474,2480],[2482,2482],[2486,2489],[2493,2493],[2510,2510],[2524,2525],[2527,2529],[2544,2545],[2556,2556],[2565,2570],[2575,2576],[2579,2600],[2602,2608],[2610,2611],[2613,2614],[2616,2617],[2649,2652],[2654,2654],[2674,2676],[2693,2701],[2703,2705],[2707,2728],[2730,2736],[2738,2739],[2741,2745],[2749,2749],[2768,2768],[2784,2785],[2809,2809],[2821,2828],[2831,2832],[2835,2856],[2858,2864],[2866,2867],[2869,2873],[2877,2877],[2908,2909],[
2911,2913],[2929,2929],[2947,2947],[2949,2954],[2958,2960],[2962,2965],[2969,2970],[2972,2972],[2974,2975],[2979,2980],[2984,2986],[2990,3001],[3024,3024],[3077,3084],[3086,3088],[3090,3112],[3114,3129],[3133,3133],[3160,3162],[3165,3165],[3168,3169],[3200,3200],[3205,3212],[3214,3216],[3218,3240],[3242,3251],[3253,3257],[3261,3261],[3293,3294],[3296,3297],[3313,3314],[3332,3340],[3342,3344],[3346,3386],[3389,3389],[3406,3406],[3412,3414],[3423,3425],[3450,3455],[3461,3478],[3482,3505],[3507,3515],[3517,3517],[3520,3526],[3585,3632],[3634,3635],[3648,3654],[3713,3714],[3716,3716],[3718,3722],[3724,3747],[3749,3749],[3751,3760],[3762,3763],[3773,3773],[3776,3780],[3782,3782],[3804,3807],[3840,3840],[3904,3911],[3913,3948],[3976,3980],[4096,4138],[4159,4159],[4176,4181],[4186,4189],[4193,4193],[4197,4198],[4206,4208],[4213,4225],[4238,4238],[4256,4293],[4295,4295],[4301,4301],[4304,4346],[4348,4680],[4682,4685],[4688,4694],[4696,4696],[4698,4701],[4704,4744],[4746,4749],[4752,4784],[4786,4789],[4792,4798],[4800,4800],[4802,4805],[4808,4822],[4824,4880],[4882,4885],[4888,4954],[4992,5007],[5024,5109],[5112,5117],[5121,5740],[5743,5759],[5761,5786],[5792,5866],[5873,5880],[5888,5905],[5919,5937],[5952,5969],[5984,5996],[5998,6000],[6016,6067],[6103,6103],[6108,6108],[6176,6264],[6272,6276],[6279,6312],[6314,6314],[6320,6389],[6400,6430],[6480,6509],[6512,6516],[6528,6571],[6576,6601],[6656,6678],[6688,6740],[6823,6823],[6917,6963],[6981,6988],[7043,7072],[7086,7087],[7098,7141],[7168,7203],[7245,7247],[7258,7293],[7296,7304],[7312,7354],[7357,7359],[7401,7404],[7406,7411],[7413,7414],[7418,7418],[7424,7615],[7680,7957],[7960,7965],[7968,8005],[8008,8013],[8016,8023],[8025,8025],[8027,8027],[8029,8029],[8031,8061],[8064,8116],[8118,8124],[8126,8126],[8130,8132],[8134,8140],[8144,8147],[8150,8155],[8160,8172],[8178,8180],[8182,8188],[8305,8305],[8319,8319],[8336,8348],[8450,8450],[8455,8455],[8458,8467],[8469,8469],[8473,8477],[8484,8484],[8486,8486],[8488,8488],[8490,849
3],[8495,8505],[8508,8511],[8517,8521],[8526,8526],[8579,8580],[11264,11492],[11499,11502],[11506,11507],[11520,11557],[11559,11559],[11565,11565],[11568,11623],[11631,11631],[11648,11670],[11680,11686],[11688,11694],[11696,11702],[11704,11710],[11712,11718],[11720,11726],[11728,11734],[11736,11742],[11823,11823],[12293,12294],[12337,12341],[12347,12348],[12353,12438],[12445,12447],[12449,12538],[12540,12543],[12549,12591],[12593,12686],[12704,12735],[12784,12799],[13312,19903],[19968,42124],[42192,42237],[42240,42508],[42512,42527],[42538,42539],[42560,42606],[42623,42653],[42656,42725],[42775,42783],[42786,42888],[42891,42954],[42960,42961],[42963,42963],[42965,42969],[42994,43009],[43011,43013],[43015,43018],[43020,43042],[43072,43123],[43138,43187],[43250,43255],[43259,43259],[43261,43262],[43274,43301],[43312,43334],[43360,43388],[43396,43442],[43471,43471],[43488,43492],[43494,43503],[43514,43518],[43520,43560],[43584,43586],[43588,43595],[43616,43638],[43642,43642],[43646,43695],[43697,43697],[43701,43702],[43705,43709],[43712,43712],[43714,43714],[43739,43741],[43744,43754],[43762,43764],[43777,43782],[43785,43790],[43793,43798],[43808,43814],[43816,43822],[43824,43866],[43868,43881],[43888,44002],[44032,55203],[55216,55238],[55243,55291],[63744,64109],[64112,64217],[64256,64262],[64275,64279],[64285,64285],[64287,64296],[64298,64310],[64312,64316],[64318,64318],[64320,64321],[64323,64324],[64326,64433],[64467,64829],[64848,64911],[64914,64967],[65008,65019],[65136,65140],[65142,65276],[65313,65338],[65345,65370],[65382,65470],[65474,65479],[65482,65487],[65490,65495],[65498,65500],[65536,65547],[65549,65574],[65576,65594],[65596,65597],[65599,65613],[65616,65629],[65664,65786],[66176,66204],[66208,66256],[66304,66335],[66349,66368],[66370,66377],[66384,66421],[66432,66461],[66464,66499],[66504,66511],[66560,66717],[66736,66771],[66776,66811],[66816,66855],[66864,66915],[66928,66938],[66940,66954],[66956,66962],[66964,66965],[66967,66977],[66979,66993],[6699
5,67001],[67003,67004],[67072,67382],[67392,67413],[67424,67431],[67456,67461],[67463,67504],[67506,67514],[67584,67589],[67592,67592],[67594,67637],[67639,67640],[67644,67644],[67647,67669],[67680,67702],[67712,67742],[67808,67826],[67828,67829],[67840,67861],[67872,67897],[67968,68023],[68030,68031],[68096,68096],[68112,68115],[68117,68119],[68121,68149],[68192,68220],[68224,68252],[68288,68295],[68297,68324],[68352,68405],[68416,68437],[68448,68466],[68480,68497],[68608,68680],[68736,68786],[68800,68850],[68864,68899],[69248,69289],[69296,69297],[69376,69404],[69415,69415],[69424,69445],[69488,69505],[69552,69572],[69600,69622],[69635,69687],[69745,69746],[69749,69749],[69763,69807],[69840,69864],[69891,69926],[69956,69956],[69959,69959],[69968,70002],[70006,70006],[70019,70066],[70081,70084],[70106,70106],[70108,70108],[70144,70161],[70163,70187],[70207,70208],[70272,70278],[70280,70280],[70282,70285],[70287,70301],[70303,70312],[70320,70366],[70405,70412],[70415,70416],[70419,70440],[70442,70448],[70450,70451],[70453,70457],[70461,70461],[70480,70480],[70493,70497],[70656,70708],[70727,70730],[70751,70753],[70784,70831],[70852,70853],[70855,70855],[71040,71086],[71128,71131],[71168,71215],[71236,71236],[71296,71338],[71352,71352],[71424,71450],[71488,71494],[71680,71723],[71840,71903],[71935,71942],[71945,71945],[71948,71955],[71957,71958],[71960,71983],[71999,71999],[72001,72001],[72096,72103],[72106,72144],[72161,72161],[72163,72163],[72192,72192],[72203,72242],[72250,72250],[72272,72272],[72284,72329],[72349,72349],[72368,72440],[72704,72712],[72714,72750],[72768,72768],[72818,72847],[72960,72966],[72968,72969],[72971,73008],[73030,73030],[73056,73061],[73063,73064],[73066,73097],[73112,73112],[73440,73458],[73474,73474],[73476,73488],[73490,73523],[73648,73648],[73728,74649],[74880,75075],[77712,77808],[77824,78895],[78913,78918],[82944,83526],[92160,92728],[92736,92766],[92784,92862],[92880,92909],[92928,92975],[92992,92995],[93027,93047],[93053,93071],[93
760,93823],[93952,94026],[94032,94032],[94099,94111],[94176,94177],[94179,94179],[94208,100343],[100352,101589],[101632,101640],[110576,110579],[110581,110587],[110589,110590],[110592,110882],[110898,110898],[110928,110930],[110933,110933],[110948,110951],[110960,111355],[113664,113770],[113776,113788],[113792,113800],[113808,113817],[119808,119892],[119894,119964],[119966,119967],[119970,119970],[119973,119974],[119977,119980],[119982,119993],[119995,119995],[119997,120003],[120005,120069],[120071,120074],[120077,120084],[120086,120092],[120094,120121],[120123,120126],[120128,120132],[120134,120134],[120138,120144],[120146,120485],[120488,120512],[120514,120538],[120540,120570],[120572,120596],[120598,120628],[120630,120654],[120656,120686],[120688,120712],[120714,120744],[120746,120770],[120772,120779],[122624,122654],[122661,122666],[122928,122989],[123136,123180],[123191,123197],[123214,123214],[123536,123565],[123584,123627],[124112,124139],[124896,124902],[124904,124907],[124909,124910],[124912,124926],[124928,125124],[125184,125251],[125259,125259],[126464,126467],[126469,126495],[126497,126498],[126500,126500],[126503,126503],[126505,126514],[126516,126519],[126521,126521],[126523,126523],[126530,126530],[126535,126535],[126537,126537],[126539,126539],[126541,126543],[126545,126546],[126548,126548],[126551,126551],[126553,126553],[126555,126555],[126557,126557],[126559,126559],[126561,126562],[126564,126564],[126567,126570],[126572,126578],[126580,126583],[126585,126588],[126590,126590],[126592,126601],[126603,126619],[126625,126627],[126629,126633],[126635,126651],[131072,173791],[173824,177977],[177984,178205],[178208,183969],[183984,191456],[191472,192093],[194560,195101],[196608,201546],[201552,205743]]'); /** - * The Unicode `N` (Number) property table + * The Unicode `N` (Numeric) property table * * @type {import('./core.js').UnicodeRange[]} */ diff --git a/src/_grapheme_table.js b/src/_grapheme_table.js index 68d0553..ec9eb7e 100644 --- 
a/src/_grapheme_table.js +++ b/src/_grapheme_table.js @@ -1,18 +1,6 @@ -// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. +// The following code was generated by "scripts/unicode.js", +// DO NOT EDIT DIRECTLY. // -// Licensed under the MIT license -// . -// -// Modified original Rust library [unicode-segmentation] -// (https://unicode-rs.github.io/unicode-segmentation) -// -// to create JavaScript library, [unicode-segmenter] -// (https://github.com/cometkim/unicode-segmenter) - -// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly - // @ts-check import { bsearchUnicodeRange } from './core.js'; diff --git a/src/_incb_table.js b/src/_incb_table.js index 7153f8f..ea23129 100644 --- a/src/_incb_table.js +++ b/src/_incb_table.js @@ -1,3 +1,6 @@ +// The following code was generated by "scripts/unicode.js", +// DO NOT EDIT DIRECTLY. +// // @ts-check /** diff --git a/test/_unicode_testdata.js b/test/_unicode_testdata.js index abe5faf..9a0e85e 100644 --- a/test/_unicode_testdata.js +++ b/test/_unicode_testdata.js @@ -1,7 +1,10 @@ +// The following code was generated by "scripts/unicode.js", +// DO NOT EDIT DIRECTLY. 
+// // @ts-check /** - * @typedef {[input: string, expected: string[]]} TestCase + * @typedef {[input: string, expected: string[]]} TestCase */ /** diff --git a/tsconfig.json b/tsconfig.json index 65ff31b..bb10b2a 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -1,5 +1,6 @@ { "compilerOptions": { + "target": "ESNext", "module": "NodeNext", "moduleResolution": "NodeNext", "strict": true, diff --git a/yarn.lock b/yarn.lock index 67215ae..1529809 100644 --- a/yarn.lock +++ b/yarn.lock @@ -7251,23 +7251,23 @@ __metadata: languageName: node linkType: hard -"typescript@npm:^5.4.5": - version: 5.4.5 - resolution: "typescript@npm:5.4.5" +"typescript@npm:^5.5.4": + version: 5.5.4 + resolution: "typescript@npm:5.5.4" bin: tsc: bin/tsc tsserver: bin/tsserver - checksum: 10c0/2954022ada340fd3d6a9e2b8e534f65d57c92d5f3989a263754a78aba549f7e6529acc1921913560a4b816c46dce7df4a4d29f9f11a3dc0d4213bb76d043251e + checksum: 10c0/422be60f89e661eab29ac488c974b6cc0a660fb2228003b297c3d10c32c90f3bcffc1009b43876a082515a3c376b1eefcce823d6e78982e6878408b9a923199c languageName: node linkType: hard -"typescript@patch:typescript@npm%3A^5.4.5#optional!builtin": - version: 5.4.5 - resolution: "typescript@patch:typescript@npm%3A5.4.5#optional!builtin::version=5.4.5&hash=5adc0c" +"typescript@patch:typescript@npm%3A^5.5.4#optional!builtin": + version: 5.5.4 + resolution: "typescript@patch:typescript@npm%3A5.5.4#optional!builtin::version=5.5.4&hash=b45daf" bin: tsc: bin/tsc tsserver: bin/tsserver - checksum: 10c0/db2ad2a16ca829f50427eeb1da155e7a45e598eec7b086d8b4e8ba44e5a235f758e606d681c66992230d3fc3b8995865e5fd0b22a2c95486d0b3200f83072ec9 + checksum: 10c0/10dd9881baba22763de859e8050d6cb6e2db854197495c6f1929b08d1eb2b2b00d0b5d9b0bcee8472f1c3f4a7ef6a5d7ebe0cfd703f853aa5ae465b8404bc1ba languageName: node linkType: hard @@ -7349,7 +7349,7 @@ __metadata: pretty-bytes: "npm:^6.1.1" rimraf: "npm:^5.0.5" tinybench: "npm:^2.6.0" - typescript: "npm:^5.4.5" + typescript: "npm:^5.5.4" unicode-segmentation-wasm: 
"portal:benchmark/unicode-segmentation-wasm" vite: "npm:^5.2.11" xregexp: "npm:5.1.1"