diff --git a/build/generate-unicode-data.ts b/build/generate-unicode-data.ts new file mode 100644 index 0000000000..0a9373ad31 --- /dev/null +++ b/build/generate-unicode-data.ts @@ -0,0 +1,57 @@ +import * as fs from 'fs'; + +// Or https://www.unicode.org/Public/draft/UCD/ucd if the next Unicode version is finalized and awaiting publication. +const ucdBaseUrl = 'https://www.unicode.org/Public/UCD/latest/ucd'; + +/** + * Fetches the Unicode character database file for a property and turns every code point or range recorded with the given value into the body of a regular expression character class. + * @param property - Property name with underscores, such as Indic_Syllabic_Category; the underscores are removed to form the file name. + * @param value - Property value that a record must begin with after its semicolon, such as Invisible_Stacker. + * @returns The version and date captured from the file's header comment (null when the header has a different form) along with the characterClass string. + * @throws Error if the response status is not OK or if no record has the requested value. + */ +async function getPropertyData(property: string, value: string): Promise<{[_: string]: string}> { + const propertyUrl = `${ucdBaseUrl}/${property.replaceAll('_', '')}.txt`; + const response = await fetch(propertyUrl); + if (!response.ok) { + throw new Error(`Unable to fetch latest Unicode character database file for ${property}: ${response.status}`); + } + + const table = await response.text(); + const header = table.match(/^# \w+-(\d+\.\d+\.\d+)\.txt\n# Date: (\d\d\d\d-\d\d-\d\d)/); + // Code points are hexadecimal and a range is written with two literal dots, so the dots are escaped. + const tableRegExp = new RegExp(`^([0-9A-F]{4,6}(?:\\.\\.[0-9A-F]{4,6})?)(?= *; ${value})`, 'gm'); + const records = table.match(tableRegExp); + if (!records) { + // match() returns null when nothing matches; report which property and value came up empty. + throw new Error(`No code points found for ${property}=${value}`); + } + const characterClass = records + .map(record => record + .split('..') + .map(codePoint => (codePoint.length > 4) ? `\\u{${codePoint}}` : `\\u${codePoint}`) + .join('-')) + .join(''); + return { + version: header && header[1], + date: header && header[2], + characterClass, + }; +} + +const indicSyllabicCategory = await getPropertyData('Indic_Syllabic_Category', 'Invisible_Stacker'); + +fs.writeFileSync('src/data/unicode_properties.ts', + `// This file is generated. Edit build/generate-unicode-data.ts, then run \`npm run generate-unicode-data\`. + +/** + * Returns whether two grapheme clusters detected by \`Intl.Segmenter\` can be combined to prevent an invisible combining mark from appearing unexpectedly. 
+ */ +export function canCombineGraphemes(former: string, latter: string): boolean { + // Zero-width joiner + // Indic_Syllabic_Category=Invisible_Stacker as of Unicode ${indicSyllabicCategory.version}, published ${indicSyllabicCategory.date}. + // eslint-disable-next-line no-misleading-character-class + const terminalJoinersRegExp = /[\\u200D${indicSyllabicCategory.characterClass}]$/u; + return terminalJoinersRegExp.test(former) || /^\\p{gc=Mc}/u.test(latter); +} +`); diff --git a/package.json b/package.json index c9f5bdcfc8..243f06f943 100644 --- a/package.json +++ b/package.json @@ -140,6 +140,7 @@ }, "scripts": { "generate-dist-package": "node --no-warnings --loader ts-node/esm build/generate-dist-package.js", + "generate-unicode-data": "node --no-warnings --loader ts-node/esm build/generate-unicode-data.ts", "generate-shaders": "node --no-warnings --loader ts-node/esm build/generate-shaders.ts", "generate-struct-arrays": "node --no-warnings --loader ts-node/esm build/generate-struct-arrays.ts", "generate-style-code": "node --no-warnings --loader ts-node/esm build/generate-style-code.ts", diff --git a/src/data/bucket/symbol_bucket.ts b/src/data/bucket/symbol_bucket.ts index 531631622d..d828a71adc 100644 --- a/src/data/bucket/symbol_bucket.ts +++ b/src/data/bucket/symbol_bucket.ts @@ -23,7 +23,7 @@ import {ProgramConfigurationSet} from '../program_configuration'; import {TriangleIndexArray, LineIndexArray} from '../index_array_type'; import {transformText} from '../../symbol/transform_text'; import {mergeLines} from '../../symbol/merge_lines'; -import {allowsVerticalWritingMode, stringContainsRTLText} from '../../util/script_detection'; +import {allowsVerticalWritingMode, splitByGraphemeCluster, stringContainsRTLText} from '../../util/script_detection'; import {WritingMode} from '../../symbol/shaping'; import {loadGeometry} from '../load_geometry'; import {toEvaluationFeature} from '../evaluation_feature'; @@ -419,17 +419,17 @@ export class SymbolBucket 
implements Bucket { private calculateGlyphDependencies( text: string, - stack: {[_: number]: boolean}, + stack: {[_: string]: boolean}, textAlongLine: boolean, allowVerticalPlacement: boolean, doesAllowVerticalWritingMode: boolean) { - for (const char of text) { - stack[char.codePointAt(0)] = true; + for (const {segment} of splitByGraphemeCluster(text)) { + stack[segment] = true; if ((textAlongLine || allowVerticalPlacement) && doesAllowVerticalWritingMode) { - const verticalChar = verticalizedCharacterMap[char]; + const verticalChar = verticalizedCharacterMap[segment]; if (verticalChar) { - stack[verticalChar.codePointAt(0)] = true; + stack[verticalChar] = true; // Register the vertical form itself; the original segment is already registered above. } } } diff --git a/src/data/unicode_properties.ts b/src/data/unicode_properties.ts new file mode 100644 index 0000000000..e25497d4f5 --- /dev/null +++ b/src/data/unicode_properties.ts @@ -0,0 +1,12 @@ +// This file is generated. Edit build/generate-unicode-data.ts, then run `npm run generate-unicode-data`. + +/** + * Returns whether two grapheme clusters detected by `Intl.Segmenter` can be combined to prevent an invisible combining mark from appearing unexpectedly. + */ +export function canCombineGraphemes(former: string, latter: string): boolean { + // Zero-width joiner + // Indic_Syllabic_Category=Invisible_Stacker as of Unicode 16.0.0, published 2024-04-30. 
+ // eslint-disable-next-line no-misleading-character-class + const terminalJoinersRegExp = /[\u200D\u1039\u17D2\u1A60\u1BAB\uAAF6\u{10A3F}\u{11133}\u{113D0}\u{1193E}\u{11A47}\u{11A99}\u{11D45}\u{11D97}\u{11F42}]$/u; + return terminalJoinersRegExp.test(former) || /^\p{gc=Mc}/u.test(latter); +} diff --git a/src/render/glyph_atlas.ts b/src/render/glyph_atlas.ts index 6ce83f7e4c..90899bc1ee 100644 --- a/src/render/glyph_atlas.ts +++ b/src/render/glyph_atlas.ts @@ -30,7 +30,7 @@ export type GlyphPosition = { */ export type GlyphPositions = { [_: string]: { - [_: number]: GlyphPosition; + [_: string]: GlyphPosition; }; }; @@ -46,8 +46,8 @@ export class GlyphAtlas { const glyphs = stacks[stack]; const stackPositions = positions[stack] = {}; - for (const id in glyphs) { - const src = glyphs[+id]; + for (const grapheme in glyphs) { + const src = glyphs[grapheme]; if (!src || src.bitmap.width === 0 || src.bitmap.height === 0) continue; const bin = { @@ -57,7 +57,7 @@ export class GlyphAtlas { h: src.bitmap.height + 2 * padding }; bins.push(bin); - stackPositions[id] = {rect: bin, metrics: src.metrics}; + stackPositions[grapheme] = {rect: bin, metrics: src.metrics}; } } @@ -67,10 +67,10 @@ export class GlyphAtlas { for (const stack in stacks) { const glyphs = stacks[stack]; - for (const id in glyphs) { - const src = glyphs[+id]; + for (const grapheme in glyphs) { + const src = glyphs[grapheme]; if (!src || src.bitmap.width === 0 || src.bitmap.height === 0) continue; - const bin = positions[stack][id].rect; + const bin = positions[stack][grapheme].rect; AlphaImage.copy(src.bitmap, image, {x: 0, y: 0}, {x: bin.x + padding, y: bin.y + padding}, src.bitmap); } } diff --git a/src/render/glyph_manager.test.ts b/src/render/glyph_manager.test.ts index 186e7e8a9c..b042338d6a 100644 --- a/src/render/glyph_manager.test.ts +++ b/src/render/glyph_manager.test.ts @@ -6,7 +6,7 @@ import {RequestManager} from '../util/request_manager'; describe('GlyphManager', () => { const GLYPHS = {}; for 
(const glyph of parseGlyphPbf(fs.readFileSync('./test/unit/assets/0-255.pbf'))) { - GLYPHS[glyph.id] = glyph; + GLYPHS[glyph.grapheme] = glyph; } const identityTransform = ((url) => ({url})) as any as RequestManager; @@ -35,22 +35,22 @@ describe('GlyphManager', () => { createLoadGlyphRangeStub(); const manager = createGlyphManager(); - const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': [55]}); - expect(returnedGlyphs['Arial Unicode MS']['55'].metrics.advance).toBe(12); + const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': ['7']}); + expect(returnedGlyphs['Arial Unicode MS']['7'].metrics.advance).toBe(12); }); test('GlyphManager doesn\'t request twice 0-255 PBF if a glyph is missing', async () => { const stub = createLoadGlyphRangeStub(); const manager = createGlyphManager(); - await manager.getGlyphs({'Arial Unicode MS': [0.5]}); + await manager.getGlyphs({'Arial Unicode MS': ['文']}); expect(manager.entries['Arial Unicode MS'].ranges[0]).toBe(true); expect(stub).toHaveBeenCalledTimes(1); // We remove all requests as in getGlyphs code. 
delete manager.entries['Arial Unicode MS'].requests[0]; - await manager.getGlyphs({'Arial Unicode MS': [0.5]}); + await manager.getGlyphs({'Arial Unicode MS': ['文']}); expect(manager.entries['Arial Unicode MS'].ranges[0]).toBe(true); expect(stub).toHaveBeenCalledTimes(1); }); @@ -62,8 +62,8 @@ describe('GlyphManager', () => { const manager = createGlyphManager(); - const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': [0x5e73]}); - expect(returnedGlyphs['Arial Unicode MS'][0x5e73]).toBeNull(); // The fixture returns a PBF without the glyph we requested + const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': ['平']}); + expect(returnedGlyphs['Arial Unicode MS']['平']).toBeNull(); // The fixture returns a PBF without the glyph we requested }); test('GlyphManager requests remote non-BMP, non-CJK PBF', async () => { @@ -74,8 +74,8 @@ describe('GlyphManager', () => { const manager = createGlyphManager(); // Request Egyptian hieroglyph 𓃰 - const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': [0x1e0f0]}); - expect(returnedGlyphs['Arial Unicode MS'][0x1e0f0]).toBeNull(); // The fixture returns a PBF without the glyph we requested + const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': ['𓃰']}); + expect(returnedGlyphs['Arial Unicode MS']['𓃰']).toBeNull(); // The fixture returns a PBF without the glyph we requested }); test('GlyphManager does not cache CJK chars that should be rendered locally', async () => { @@ -92,11 +92,11 @@ describe('GlyphManager', () => { const manager = createGlyphManager('sans-serif'); //Request char that overlaps Katakana range - let returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': [0x3005]}); - expect(returnedGlyphs['Arial Unicode MS'][0x3005]).not.toBeNull(); + let returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': ['々']}); + expect(returnedGlyphs['Arial Unicode MS']['々']).not.toBeNull(); //Request char from Katakana range (te テ) - returnedGlyphs = await 
manager.getGlyphs({'Arial Unicode MS': [0x30C6]}); - const glyph = returnedGlyphs['Arial Unicode MS'][0x30c6]; + returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': ['テ']}); + const glyph = returnedGlyphs['Arial Unicode MS']['テ']; //Ensure that te is locally generated. expect(glyph.bitmap.height).toBe(12); expect(glyph.bitmap.width).toBe(12); @@ -106,32 +106,32 @@ describe('GlyphManager', () => { const manager = createGlyphManager('sans-serif'); // Chinese character píng 平 - const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': [0x5e73]}); - expect(returnedGlyphs['Arial Unicode MS'][0x5e73].metrics.advance).toBe(0.5); + const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': ['平']}); + expect(returnedGlyphs['Arial Unicode MS']['平'].metrics.advance).toBe(0.5); }); test('GlyphManager generates non-BMP CJK PBF locally', async () => { const manager = createGlyphManager('sans-serif'); // Chinese character biáng 𰻞 - const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': [0x30EDE]}); - expect(returnedGlyphs['Arial Unicode MS'][0x30EDE].metrics.advance).toBe(1); + const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': ['𰻞']}); + expect(returnedGlyphs['Arial Unicode MS']['𰻞'].metrics.advance).toBe(1); }); test('GlyphManager generates Katakana PBF locally', async () => { const manager = createGlyphManager('sans-serif'); // Katakana letter te テ - const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': [0x30c6]}); - expect(returnedGlyphs['Arial Unicode MS'][0x30c6].metrics.advance).toBe(0.5); + const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': ['テ']}); + expect(returnedGlyphs['Arial Unicode MS']['テ'].metrics.advance).toBe(0.5); }); test('GlyphManager generates Hiragana PBF locally', async () => { const manager = createGlyphManager('sans-serif'); //Hiragana letter te て - const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': [0x3066]}); - expect(returnedGlyphs['Arial Unicode 
MS'][0x3066].metrics.advance).toBe(0.5); + const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': ['て']}); + expect(returnedGlyphs['Arial Unicode MS']['て'].metrics.advance).toBe(0.5); }); test('GlyphManager consistently generates CJKV text locally', async () => { @@ -159,9 +159,9 @@ describe('GlyphManager', () => { }); // Katakana letter te - const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': [0x30c6]}); - expect(returnedGlyphs['Arial Unicode MS'][0x30c6].metrics.advance).toBe(24); - await manager.getGlyphs({'Arial Unicode MS': [0x30c6]}); + const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': ['テ']}); + expect(returnedGlyphs['Arial Unicode MS']['テ'].metrics.advance).toBe(24); + await manager.getGlyphs({'Arial Unicode MS': ['テ']}); expect(drawSpy).toHaveBeenCalledTimes(1); }); }); diff --git a/src/render/glyph_manager.ts b/src/render/glyph_manager.ts index 3b987860ae..bbd2e02a36 100644 --- a/src/render/glyph_manager.ts +++ b/src/render/glyph_manager.ts @@ -1,7 +1,6 @@ import {loadGlyphRange} from '../style/load_glyph_range'; import TinySDF from '@mapbox/tiny-sdf'; -import {charAllowsIdeographicBreaking} from '../util/script_detection'; import {AlphaImage} from '../util/image'; import type {StyleGlyph} from '../style/style_glyph'; @@ -11,7 +10,7 @@ import type {GetGlyphsResponse} from '../util/actor_messages'; type Entry = { // null means we've requested the range, but the glyph wasn't included in the result. 
glyphs: { - [id: number]: StyleGlyph | null; + [grapheme: string]: StyleGlyph | null; }; requests: { [range: number]: Promise<{[_: number]: StyleGlyph | null}>; @@ -42,12 +41,12 @@ export class GlyphManager { this.url = url; } - async getGlyphs(glyphs: {[stack: string]: Array}): Promise { - const glyphsPromises: Promise<{stack: string; id: number; glyph: StyleGlyph}>[] = []; + async getGlyphs(glyphs: {[stack: string]: Array}): Promise { + const glyphsPromises: Promise<{stack: string; grapheme: string; glyph: StyleGlyph}>[] = []; for (const stack in glyphs) { - for (const id of glyphs[stack]) { - glyphsPromises.push(this._getAndCacheGlyphsPromise(stack, id)); + for (const grapheme of glyphs[stack]) { + glyphsPromises.push(this._getAndCacheGlyphsPromise(stack, grapheme)); } } @@ -55,13 +54,13 @@ export class GlyphManager { const result: GetGlyphsResponse = {}; - for (const {stack, id, glyph} of updatedGlyphs) { + for (const {stack, grapheme, glyph} of updatedGlyphs) { if (!result[stack]) { result[stack] = {}; } // Clone the glyph so that our own copy of its ArrayBuffer doesn't get transferred. 
- result[stack][id] = glyph && { - id: glyph.id, + result[stack][grapheme] = glyph && { + grapheme: glyph.grapheme, bitmap: glyph.bitmap.clone(), metrics: glyph.metrics }; @@ -70,7 +69,7 @@ export class GlyphManager { return result; } - async _getAndCacheGlyphsPromise(stack: string, id: number): Promise<{stack: string; id: number; glyph: StyleGlyph}> { + async _getAndCacheGlyphsPromise(stack: string, grapheme: string): Promise<{stack: string; grapheme: string; glyph: StyleGlyph}> { let entry = this.entries[stack]; if (!entry) { entry = this.entries[stack] = { @@ -80,20 +79,21 @@ export class GlyphManager { }; } - let glyph = entry.glyphs[id]; + let glyph = entry.glyphs[grapheme]; if (glyph !== undefined) { - return {stack, id, glyph}; + return {stack, grapheme, glyph}; } - glyph = this._tinySDF(entry, stack, id); + glyph = this._tinySDF(entry, stack, grapheme); if (glyph) { - entry.glyphs[id] = glyph; - return {stack, id, glyph}; + entry.glyphs[grapheme] = glyph; + return {stack, grapheme, glyph}; } + const id = grapheme.codePointAt(0); const range = Math.floor(id / 256); if (entry.ranges[range]) { - return {stack, id, glyph}; + return {stack, grapheme, glyph}; } if (!this.url) { @@ -106,13 +106,14 @@ export class GlyphManager { } const response = await entry.requests[range]; - for (const id in response) { + for (const grapheme in response) { + const id = grapheme.codePointAt(0); if (!this._doesCharSupportLocalGlyph(+id)) { - entry.glyphs[+id] = response[+id]; + entry.glyphs[grapheme] = response[grapheme]; } } entry.ranges[range] = true; - return {stack, id, glyph: response[id] || null}; + return {stack, grapheme, glyph: response[grapheme] || null}; } /** @@ -126,18 +127,17 @@ export class GlyphManager { * rendered remotely. For visual consistency within CJKV text, even relatively small CJKV and * other siniform code blocks prefer local rendering. 
*/ - _doesCharSupportLocalGlyph(id: number): boolean { - return !!this.localIdeographFontFamily && - (/\p{Ideo}|\p{sc=Hang}|\p{sc=Hira}|\p{sc=Kana}/u.test(String.fromCodePoint(id)) || - charAllowsIdeographicBreaking(id)); + _doesCharSupportLocalGlyph(_id: number): boolean { + return !!this.localIdeographFontFamily; // Without a local font family nothing is drawn locally, so fetched glyphs must still be cached. } - _tinySDF(entry: Entry, stack: string, id: number): StyleGlyph { + _tinySDF(entry: Entry, stack: string, grapheme: string): StyleGlyph { const fontFamily = this.localIdeographFontFamily; if (!fontFamily) { return; } + const id = grapheme.codePointAt(0); if (!this._doesCharSupportLocalGlyph(id)) { return; } @@ -145,9 +145,16 @@ export class GlyphManager { // Client-generated glyphs are rendered at 2x texture scale, // because CJK glyphs are more detailed than others. const textureScale = 2; + const buffer = 10; let tinySDF = entry.tinySDF; if (!tinySDF) { + let fontStyle = 'normal'; + if (/italic/i.test(stack)) { + fontStyle = 'italic'; + } else if (/oblique/i.test(stack)) { + fontStyle = 'oblique'; + } let fontWeight = '400'; if (/bold/i.test(stack)) { fontWeight = '900'; @@ -158,42 +165,28 @@ export class GlyphManager { } tinySDF = entry.tinySDF = new GlyphManager.TinySDF({ fontSize: 24 * textureScale, - buffer: 3 * textureScale, + buffer: buffer * textureScale, radius: 8 * textureScale, cutoff: 0.25, fontFamily, + fontStyle, fontWeight }); } - const char = tinySDF.draw(String.fromCodePoint(id)); - - /** - * TinySDF's "top" is the distance from the alphabetic baseline to the top of the glyph. 
- * Server-generated fonts specify "top" relative to an origin above the em box (the origin - * comes from FreeType, but I'm unclear on exactly how it's derived) - * ref: https://github.com/mapbox/sdf-glyph-foundry - * - * Server fonts don't yet include baseline information, so we can't line up exactly with them - * (and they don't line up with each other) - * ref: https://github.com/mapbox/node-fontnik/pull/160 - * - * To approximately align TinySDF glyphs with server-provided glyphs, we use this baseline adjustment - * factor calibrated to be in between DIN Pro and Arial Unicode (but closer to Arial Unicode) - */ - const topAdjustment = 27.5; - - const leftAdjustment = 0.5; + const char = tinySDF.draw(grapheme); + + const isControl = /^\p{gc=Cf}+$/u.test(grapheme); return { - id, + grapheme, bitmap: new AlphaImage({width: char.width || 30 * textureScale, height: char.height || 30 * textureScale}, char.data), metrics: { - width: char.glyphWidth / textureScale || 24, + width: isControl ? 0 : (char.glyphWidth / textureScale || 24), height: char.glyphHeight / textureScale || 24, - left: (char.glyphLeft / textureScale + leftAdjustment) || 0, - top: char.glyphTop / textureScale - topAdjustment || -8, - advance: char.glyphAdvance / textureScale || 24, + left: (char.glyphLeft - buffer) / textureScale || 0, + top: char.glyphTop / textureScale || 0, + advance: isControl ? 
0 : (char.glyphAdvance / textureScale || 24), isDoubleResolution: true } }; diff --git a/src/source/worker_tile.ts b/src/source/worker_tile.ts index d0e86bbdb8..a02c483c10 100644 --- a/src/source/worker_tile.ts +++ b/src/source/worker_tile.ts @@ -128,9 +128,9 @@ export class WorkerTile { } } - // options.glyphDependencies looks like: {"SomeFontName":{"10":true,"32":true}} - // this line makes an object like: {"SomeFontName":[10,32]} - const stacks: {[_: string]: Array} = mapObject(options.glyphDependencies, (glyphs) => Object.keys(glyphs).map(Number)); + // options.glyphDependencies looks like: {"SomeFontName":{"A":true,"文":true}} + // this line makes an object like: {"SomeFontName":["A","文"]} + const stacks: {[_: string]: Array} = mapObject(options.glyphDependencies, (glyphs) => Object.keys(glyphs)); this.inFlightDependencies.forEach((request) => request?.abort()); this.inFlightDependencies = []; diff --git a/src/style/load_glyph_range.test.ts b/src/style/load_glyph_range.test.ts index f79334a301..1922c8fb59 100644 --- a/src/style/load_glyph_range.test.ts +++ b/src/style/load_glyph_range.test.ts @@ -25,11 +25,10 @@ test('loadGlyphRange', async () => { expect(transform).toHaveBeenCalledWith('https://localhost/fonts/v1/Arial Unicode MS/0-255.pbf', 'Glyphs'); expect(Object.keys(result)).toHaveLength(223); - for (const key in result) { - const id = Number(key); - const glyph = result[id]; + for (const grapheme in result) { + const glyph = result[grapheme]; - expect(glyph.id).toBe(Number(id)); + expect(glyph.grapheme).toBe(grapheme); expect(glyph.metrics).toBeTruthy(); expect(typeof glyph.metrics.width).toBe('number'); expect(typeof glyph.metrics.height).toBe('number'); diff --git a/src/style/load_glyph_range.ts b/src/style/load_glyph_range.ts index ba63c10a4b..c3ddd7ac7f 100644 --- a/src/style/load_glyph_range.ts +++ b/src/style/load_glyph_range.ts @@ -25,7 +25,7 @@ export async function loadGlyphRange(fontstack: string, const glyphs = {}; for (const glyph of 
parseGlyphPbf(response.data)) { - glyphs[glyph.id] = glyph; + glyphs[glyph.grapheme] = glyph; } return glyphs; diff --git a/src/style/parse_glyph_pbf.ts b/src/style/parse_glyph_pbf.ts index 10fefb2a27..2f68ee9a44 100644 --- a/src/style/parse_glyph_pbf.ts +++ b/src/style/parse_glyph_pbf.ts @@ -15,7 +15,7 @@ function readFontstack(tag: number, glyphs: Array, pbf: Protobuf) { if (tag === 3) { const {id, bitmap, width, height, left, top, advance} = pbf.readMessage(readGlyph, {}); glyphs.push({ - id, + grapheme: String.fromCodePoint(id), bitmap: new AlphaImage({ width: width + 2 * border, height: height + 2 * border diff --git a/src/style/style.ts b/src/style/style.ts index dd500ff05d..2bd4e2e3d5 100644 --- a/src/style/style.ts +++ b/src/style/style.ts @@ -56,9 +56,9 @@ import type {CustomLayerInterface} from './style_layer/custom_style_layer'; import type {Validator} from './validate_style'; import { MessageType, - type GetGlyphsParamerters, + type GetGlyphsParameters, type GetGlyphsResponse, - type GetImagesParamerters, + type GetImagesParameters, type GetImagesResponse } from '../util/actor_messages'; @@ -1668,7 +1668,7 @@ export class Style extends Evented { // Callbacks from web workers - async getImages(mapId: string | number, params: GetImagesParamerters): Promise { + async getImages(mapId: string | number, params: GetImagesParameters): Promise { const images = await this.imageManager.getImages(params.icons); // Apply queued image changes before setting the tile's dependencies so that the tile @@ -1688,15 +1688,15 @@ export class Style extends Evented { return images; } - async getGlyphs(mapId: string | number, params: GetGlyphsParamerters): Promise { - const glypgs = await this.glyphManager.getGlyphs(params.stacks); + async getGlyphs(mapId: string | number, params: GetGlyphsParameters): Promise { + const glyphs = await this.glyphManager.getGlyphs(params.stacks); const sourceCache = this.sourceCaches[params.source]; if (sourceCache) { // we are not setting stacks 
as dependencies since for now // we just need to know which tiles have glyph dependencies sourceCache.setDependencies(params.tileID.key, params.type, ['']); } - return glypgs; + return glyphs; } getGlyphsUrl() { diff --git a/src/style/style_glyph.ts b/src/style/style_glyph.ts index 29dd48ac0a..81b640b3ed 100644 --- a/src/style/style_glyph.ts +++ b/src/style/style_glyph.ts @@ -19,7 +19,7 @@ export type GlyphMetrics = { * A style glyph type */ export type StyleGlyph = { - id: number; + grapheme: string; bitmap: AlphaImage; metrics: GlyphMetrics; }; diff --git a/src/style/style_layer/variable_text_anchor.ts b/src/style/style_layer/variable_text_anchor.ts index cb6474ee65..1f3f641d52 100644 --- a/src/style/style_layer/variable_text_anchor.ts +++ b/src/style/style_layer/variable_text_anchor.ts @@ -23,7 +23,7 @@ export type TextAnchor = keyof typeof TextAnchorEnum; // But in the vertical direction, the glyphs appear to "start" at the baseline // We don't actually load baseline data, but we assume an offset of ONE_EM - 17 // (see "yOffset" in shaping.js) -const baselineOffset = 7; +const baselineOffset = 0; export const INVALID_TEXT_OFFSET = Number.POSITIVE_INFINITY; export function evaluateVariableOffset(anchor: TextAnchor, offset: [number, number]): [number, number] { diff --git a/src/symbol/shaping.test.ts b/src/symbol/shaping.test.ts index b45c61ae4a..13321ec60c 100644 --- a/src/symbol/shaping.test.ts +++ b/src/symbol/shaping.test.ts @@ -2,6 +2,7 @@ import {type PositionedIcon, type Box, type Shaping, SectionOptions, TaggedStrin import {ImagePosition} from '../render/image_atlas'; import type {StyleGlyph} from '../style/style_glyph'; import {StyleImage, TextFit} from '../style/style_image'; +import {AlphaImage} from '../util/image'; describe('TaggedString', () => { describe('length', () => { @@ -75,20 +76,18 @@ describe('determineLineBreaks', () => { top: -8, advance: 22, }; - const rect = { - x: 0, - y: 0, - w: 32, - h: 32, - }; + const bitmap = new AlphaImage({ + 
width: 32, + height: 32, + }); const glyphs = { 'Test': { - '97': {id: 0x61, metrics, rect}, - '98': {id: 0x62, metrics, rect}, - '99': {id: 0x63, metrics, rect}, - '40629': {id: 0x9EB5, metrics, rect}, - '200414': {id: 0x30EDE, metrics, rect}, - } as any as StyleGlyph, + 'a': {grapheme: 'a', bitmap, metrics} as StyleGlyph, + 'b': {grapheme: 'b', bitmap, metrics} as StyleGlyph, + 'c': {grapheme: 'c', bitmap, metrics} as StyleGlyph, + '麵': {grapheme: '麵', bitmap, metrics} as StyleGlyph, + '𰻞': {grapheme: '𰻞', bitmap, metrics} as StyleGlyph, + }, }; test('keeps alphabetic characters together', () => { diff --git a/src/symbol/shaping.ts b/src/symbol/shaping.ts index baeefec516..2c59cfa4f7 100644 --- a/src/symbol/shaping.ts +++ b/src/symbol/shaping.ts @@ -1,7 +1,8 @@ import { charHasUprightVerticalOrientation, - charAllowsIdeographicBreaking, - charInComplexShapingScript + charInComplexShapingScript, + rtlScriptRegExp, + splitByGraphemeCluster } from '../util/script_detection'; import {verticalizePunctuation} from '../util/verticalize_punctuation'; import {rtlWorkerPlugin} from '../source/rtl_text_plugin_worker'; @@ -23,7 +24,7 @@ enum WritingMode { horizontalOnly = 3 } -const SHAPING_DEFAULT_OFFSET = -17; +const SHAPING_DEFAULT_OFFSET = 0; export {shapeText, shapeIcon, applyTextFit, fitIconToText, getAnchorAlignment, WritingMode, SHAPING_DEFAULT_OFFSET}; // The position of a glyph relative to the text's anchor point. @@ -67,6 +68,22 @@ function isEmpty(positionedLines: Array) { return true; } +const rtlCombiningMarkRegExp = new RegExp(`(${rtlScriptRegExp.source})([\\p{gc=Mn}\\p{gc=Mc}])`, 'gu'); +const wordSegmenter = ('Segmenter' in Intl) ? new Intl.Segmenter(undefined, {granularity: 'word'}) : { + // Polyfill for Intl.Segmenter with word granularity for the purpose of line breaking + segment: (text: String) => { + // Prefer breaking on an individual CJKV ideograph instead of keeping the entire run of CJKV together. 
+ const segments = text.split(/\b|(?=\p{Ideo})/u).map((segment, position, pieces) => ({ + index: pieces.slice(0, position).join('').length, // Code unit offset of the segment, which is what containing() compares against. + segment, + })); + return { + containing: (index: number) => segments.find(s => s.index <= index && s.index + s.segment.length > index), + [Symbol.iterator]: () => segments[Symbol.iterator](), + }; + }, +}; + export type SymbolAnchor = 'center' | 'left' | 'right' | 'top' | 'bottom' | 'top-left' | 'top-right' | 'bottom-left' | 'bottom-right'; export type TextJustify = 'left' | 'center' | 'right'; @@ -130,7 +147,7 @@ export class TaggedString { } length(): number { - return [...this.text].length; + return splitByGraphemeCluster(this.text).length; } getSection(index: number): SectionOptions { @@ -157,17 +174,17 @@ export class TaggedString { substring(start: number, end: number): TaggedString { const substring = new TaggedString(); - substring.text = [...this.text].slice(start, end).join(''); + substring.text = splitByGraphemeCluster(this.text).slice(start, end).map(s => s.segment).join(''); substring.sectionIndex = this.sectionIndex.slice(start, end); substring.sections = this.sections; return substring; } /** - * Converts a UTF-16 character index to a UTF-16 code unit (JavaScript character index). + * Converts a grapheme cluster index to a UTF-16 code unit (JavaScript character index). 
*/ toCodeUnitIndex(unicodeIndex: number): number { - return [...this.text].slice(0, unicodeIndex).join('').length; + return splitByGraphemeCluster(this.text).slice(0, unicodeIndex).map(s => s.segment).join('').length; } toString(): string { @@ -182,10 +199,7 @@ export class TaggedString { this.text += section.text; this.sections.push(SectionOptions.forText(section.scale, section.fontStack || defaultFontStack)); const index = this.sections.length - 1; - // eslint-disable-next-line @typescript-eslint/no-unused-vars - for (const char of section.text) { - this.sectionIndex.push(index); - } + this.sectionIndex.push(...Array(splitByGraphemeCluster(section.text).length).fill(index)); } addImageSection(section: FormattedSection) { @@ -235,7 +249,7 @@ function shapeText( text: Formatted, glyphMap: { [_: string]: { - [_: number]: StyleGlyph; + [_: string]: StyleGlyph; }; }, glyphPositions: { @@ -265,49 +279,70 @@ function shapeText( let lines: Array; let lineBreaks = determineLineBreaks(logicalInput, spacing, maxWidth, glyphMap, imagePositions, layoutTextSize); + + /// Prepares a string as input to the RTL plugin. + const stripMarker = '\uF8FF'; + const prepareBidiInput = string => string + // Replace zero-width joiners with temporary strip markers (from the Private Use Area) to prevent ICU from stripping them out. + .replace(/\u200D/g, stripMarker) + // Preemptively swap combining marks with the characters they modify so they remain in logical order. + .replace(rtlCombiningMarkRegExp, '$2$1'); + + /// Prepares a line break array as input to the RTL plugin. + const adjustLineBreaks = () => { + const graphemes = splitByGraphemeCluster(logicalInput.toString()); + // ICU operates on code units. + lineBreaks = lineBreaks + // Get the length of the prefix leading up to each code unit. + .map(index => graphemes.slice(0, index).map(s => s.segment).join('').length); + }; + + /// Converts a line of output from the RTL plugin into a tagged string, except for `sectionIndex`. 
+ const taggedLineFromBidi = (line) => { + const taggedLine = new TaggedString(); + // Restore zero-width joiners from temporary strip markers. + taggedLine.text = line.replaceAll(stripMarker, '\u200D'); + taggedLine.sections = logicalInput.sections; + return taggedLine; + }; + const {processBidirectionalText, processStyledBidirectionalText} = rtlWorkerPlugin; if (processBidirectionalText && logicalInput.sections.length === 1) { // Bidi doesn't have to be style-aware lines = []; - // ICU operates on code units. - lineBreaks = lineBreaks.map(index => logicalInput.toCodeUnitIndex(index)); + const markedInput = prepareBidiInput(logicalInput.toString()); + adjustLineBreaks(); const untaggedLines = - processBidirectionalText(logicalInput.toString(), lineBreaks); + processBidirectionalText(markedInput, lineBreaks); for (const line of untaggedLines) { - const taggedLine = new TaggedString(); - taggedLine.text = line; + const taggedLine = taggedLineFromBidi(line); taggedLine.sections = logicalInput.sections; - // eslint-disable-next-line @typescript-eslint/no-unused-vars - for (const char of line) { - taggedLine.sectionIndex.push(0); - } + taggedLine.sectionIndex.push(...Array(splitByGraphemeCluster(taggedLine.text).length).fill(0)); lines.push(taggedLine); } } else if (processStyledBidirectionalText) { // Need version of mapbox-gl-rtl-text with style support for combining RTL text // with formatting lines = []; - // ICU operates on code units. - lineBreaks = lineBreaks.map(index => logicalInput.toCodeUnitIndex(index)); + const markedInput = prepareBidiInput(logicalInput.toString()); - // Convert character-based section index to be based on code units. + // Convert grapheme cluster–based section index to be based on code units. 
let i = 0; const sectionIndex = []; - for (const char of logicalInput.text) { - sectionIndex.push(...Array(char.length).fill(logicalInput.sectionIndex[i])); + for (const {segment} of splitByGraphemeCluster(markedInput)) { + sectionIndex.push(...Array(segment.length).fill(logicalInput.sectionIndex[i])); i++; } + adjustLineBreaks(); const processedLines = - processStyledBidirectionalText(logicalInput.text, sectionIndex, lineBreaks); + processStyledBidirectionalText(markedInput, sectionIndex, lineBreaks); for (const line of processedLines) { - const taggedLine = new TaggedString(); - taggedLine.text = line[0]; - taggedLine.sections = logicalInput.sections; - let elapsedChars = ''; - for (const char of line[0]) { - taggedLine.sectionIndex.push(line[1][elapsedChars.length]); - elapsedChars += char; + const taggedLine = taggedLineFromBidi(line[0]); + let i = 0; + for (const {segment} of splitByGraphemeCluster(taggedLine.text)) { + taggedLine.sectionIndex.push(line[1][i]); + i += segment.length; } lines.push(taggedLine); } @@ -348,40 +383,12 @@ const whitespace: { [0x20]: true, // space }; -const breakable: { - [_: number]: boolean; -} = { - [0x0a]: true, // newline - [0x20]: true, // space - [0x26]: true, // ampersand - [0x29]: true, // right parenthesis - [0x2b]: true, // plus sign - [0x2d]: true, // hyphen-minus - [0x2f]: true, // solidus - [0xad]: true, // soft hyphen - [0xb7]: true, // middle dot - [0x200b]: true, // zero-width space - [0x2010]: true, // hyphen - [0x2013]: true, // en dash - [0x2027]: true // interpunct - // Many other characters may be reasonable breakpoints - // Consider "neutral orientation" characters at scriptDetection.charHasNeutralVerticalOrientation - // See https://github.com/mapbox/mapbox-gl-js/issues/3658 -}; - -// Allow breaks depending on the following character -const breakableBefore: { - [_: number]: boolean; -} = { - [0x28]: true, // left parenthesis -}; - function getGlyphAdvance( - codePoint: number, + grapheme: string, section: 
SectionOptions, glyphMap: { [_: string]: { - [_: number]: StyleGlyph; + [_: string]: StyleGlyph; }; }, imagePositions: {[_: string]: ImagePosition}, @@ -390,7 +397,7 @@ function getGlyphAdvance( ): number { if (!section.imageName) { const positions = glyphMap[section.fontStack]; - const glyph = positions && positions[codePoint]; + const glyph = positions && positions[grapheme]; if (!glyph) return 0; return glyph.metrics.advance * section.scale + spacing; } else { @@ -405,7 +412,7 @@ function determineAverageLineWidth(logicalInput: TaggedString, maxWidth: number, glyphMap: { [_: string]: { - [_: number]: StyleGlyph; + [_: string]: StyleGlyph; }; }, imagePositions: {[_: string]: ImagePosition}, @@ -413,9 +420,9 @@ function determineAverageLineWidth(logicalInput: TaggedString, let totalWidth = 0; let index = 0; - for (const char of logicalInput.text) { + for (const {segment} of splitByGraphemeCluster(logicalInput.text)) { const section = logicalInput.getSection(index); - totalWidth += getGlyphAdvance(char.codePointAt(0), section, glyphMap, imagePositions, spacing, layoutTextSize); + totalWidth += getGlyphAdvance(segment, section, glyphMap, imagePositions, spacing, layoutTextSize); index++; } @@ -440,17 +447,12 @@ function calculateBadness(lineWidth: number, return raggedness + Math.abs(penalty) * penalty; } -function calculatePenalty(codePoint: number, nextCodePoint: number, penalizableIdeographicBreak: boolean) { +function calculatePenalty(codePoint: number, nextCodePoint: number) { let penalty = 0; // Force break on newline if (codePoint === 0x0a) { penalty -= 10000; } - // Penalize breaks between characters that allow ideographic breaking because - // they are less preferable than breaks at spaces (or zero width spaces). 
- if (penalizableIdeographicBreak) { - penalty += 150; - } // Penalize open parenthesis at end of line if (codePoint === 0x28 || codePoint === 0xff08) { @@ -518,7 +520,7 @@ export function determineLineBreaks( maxWidth: number, glyphMap: { [_: string]: { - [_: number]: StyleGlyph; + [_: string]: StyleGlyph; }; }, imagePositions: {[_: string]: ImagePosition}, @@ -530,47 +532,26 @@ export function determineLineBreaks( const potentialLineBreaks = []; const targetWidth = determineAverageLineWidth(logicalInput, spacing, maxWidth, glyphMap, imagePositions, layoutTextSize); - const hasServerSuggestedBreakpoints = logicalInput.text.indexOf('\u200b') >= 0; - + const graphemes = splitByGraphemeCluster(logicalInput.text); + const words = wordSegmenter.segment(logicalInput.text); let currentX = 0; + for (const [i, grapheme] of graphemes.entries()) { + // Check whether the grapheme cluster immediately follows a word boundary. + const prevWord = words.containing(grapheme.index - 1); + const word = words.containing(grapheme.index); + if (prevWord && prevWord.index !== word.index) { + // Score the line breaking opportunity based on the characters immediately before and after the word boundary. 
+ const prevCodePoint = logicalInput.text.codePointAt(grapheme.index - 1); + const firstCodePoint = grapheme.segment.codePointAt(0); + const penalty = calculatePenalty(prevCodePoint, firstCodePoint); + const lineBreak = evaluateBreak(i, currentX, targetWidth, potentialLineBreaks, penalty, false); + potentialLineBreaks.push(lineBreak); + } - let i = 0; - const chars = logicalInput.text[Symbol.iterator](); - let char = chars.next(); - const nextChars = logicalInput.text[Symbol.iterator](); - nextChars.next(); - let nextChar = nextChars.next(); - const nextNextChars = logicalInput.text[Symbol.iterator](); - nextNextChars.next(); - nextNextChars.next(); - let nextNextChar = nextNextChars.next(); - - while (!char.done) { const section = logicalInput.getSection(i); - const codePoint = char.value.codePointAt(0); - if (!whitespace[codePoint]) currentX += getGlyphAdvance(codePoint, section, glyphMap, imagePositions, spacing, layoutTextSize); - - // Ideographic characters, spaces, and word-breaking punctuation that often appear without - // surrounding spaces. 
- if (!nextChar.done) { - const ideographicBreak = charAllowsIdeographicBreaking(codePoint); - const nextCodePoint = nextChar.value.codePointAt(0); - if (breakable[codePoint] || ideographicBreak || section.imageName || (!nextNextChar.done && breakableBefore[nextCodePoint])) { - - potentialLineBreaks.push( - evaluateBreak( - i + 1, - currentX, - targetWidth, - potentialLineBreaks, - calculatePenalty(codePoint, nextCodePoint, ideographicBreak && hasServerSuggestedBreakpoints), - false)); - } + if (grapheme.segment.trim()) { + currentX += getGlyphAdvance(grapheme.segment, section, glyphMap, imagePositions, spacing, layoutTextSize); } - i++; - char = chars.next(); - nextChar = nextChars.next(); - nextNextChar = nextNextChars.next(); } return leastBadBreaks( @@ -618,7 +599,7 @@ function getAnchorAlignment(anchor: SymbolAnchor) { function shapeLines(shaping: Shaping, glyphMap: { [_: string]: { - [_: number]: StyleGlyph; + [_: string]: StyleGlyph; }; }, glyphPositions: { @@ -664,10 +645,10 @@ function shapeLines(shaping: Shaping, } let i = 0; - for (const char of line.text) { + for (const {segment} of splitByGraphemeCluster(line.text)) { const section = line.getSection(i); const sectionIndex = line.getSectionIndex(i); - const codePoint = char.codePointAt(0); + const codePoint = segment.codePointAt(0); let baselineOffset = 0.0; let metrics = null; let rect = null; @@ -682,13 +663,13 @@ function shapeLines(shaping: Shaping, if (!section.imageName) { const positions = glyphPositions[section.fontStack]; - const glyphPosition = positions && positions[codePoint]; + const glyphPosition = positions && positions[segment]; if (glyphPosition && glyphPosition.rect) { rect = glyphPosition.rect; metrics = glyphPosition.metrics; } else { const glyphs = glyphMap[section.fontStack]; - const glyph = glyphs && glyphs[codePoint]; + const glyph = glyphs && glyphs[segment]; if (!glyph) continue; metrics = glyph.metrics; } @@ -731,12 +712,13 @@ function shapeLines(shaping: Shaping, } if 
(!vertical) { - positionedGlyphs.push({glyph: codePoint, imageName, x, y: y + baselineOffset, vertical, scale: section.scale, fontStack: section.fontStack, sectionIndex, metrics, rect}); + positionedGlyphs.push({glyph: segment, imageName, x, y: y + baselineOffset, vertical, scale: section.scale, fontStack: section.fontStack, sectionIndex, metrics, rect}); x += metrics.advance * section.scale + spacing; } else { shaping.verticalizable = true; - positionedGlyphs.push({glyph: codePoint, imageName, x, y: y + baselineOffset, vertical, scale: section.scale, fontStack: section.fontStack, sectionIndex, metrics, rect}); - x += verticalAdvance * section.scale + spacing; + const advance = verticalAdvance * section.scale + spacing; + positionedGlyphs.push({glyph: segment, imageName, x: x + advance, y: y + baselineOffset, vertical, scale: section.scale, fontStack: section.fontStack, sectionIndex, metrics, rect}); + x += advance; } i++; diff --git a/src/util/actor_messages.ts b/src/util/actor_messages.ts index 6d9b81d19e..09da48c155 100644 --- a/src/util/actor_messages.ts +++ b/src/util/actor_messages.ts @@ -49,7 +49,7 @@ export type UpdateLayersParamaeters = { /** * Parameters needed to get the images */ -export type GetImagesParamerters = { +export type GetImagesParameters = { icons: Array; source: string; tileID: OverscaledTileID; @@ -59,9 +59,9 @@ export type GetImagesParamerters = { /** * Parameters needed to get the glyphs */ -export type GetGlyphsParamerters = { +export type GetGlyphsParameters = { type: string; - stacks: {[_: string]: Array}; + stacks: {[_: string]: Array}; source: string; tileID: OverscaledTileID; } @@ -71,7 +71,7 @@ export type GetGlyphsParamerters = { */ export type GetGlyphsResponse = { [stack: string]: { - [id: number]: StyleGlyph; + [grapheme: string]: StyleGlyph; }; } @@ -121,8 +121,8 @@ export type RequestResponseMessageMap = { [MessageType.getData]: [LoadGeoJSONParameters, GeoJSON.GeoJSON]; [MessageType.loadTile]: [WorkerTileParameters, 
WorkerTileResult]; [MessageType.reloadTile]: [WorkerTileParameters, WorkerTileResult]; - [MessageType.getGlyphs]: [GetGlyphsParamerters, GetGlyphsResponse]; - [MessageType.getImages]: [GetImagesParamerters, GetImagesResponse]; + [MessageType.getGlyphs]: [GetGlyphsParameters, GetGlyphsResponse]; + [MessageType.getImages]: [GetImagesParameters, GetImagesResponse]; [MessageType.setImages]: [string[], void]; [MessageType.setLayers]: [Array, void]; [MessageType.updateLayers]: [UpdateLayersParamaeters, void]; diff --git a/src/util/script_detection.test.ts b/src/util/script_detection.test.ts index 04a7856fde..cbb4eb42ff 100644 --- a/src/util/script_detection.test.ts +++ b/src/util/script_detection.test.ts @@ -1,58 +1,23 @@ -import {charAllowsIdeographicBreaking, charAllowsLetterSpacing, charHasUprightVerticalOrientation, charInComplexShapingScript, charInRTLScript} from './script_detection'; +import {allowsLetterSpacing, charHasUprightVerticalOrientation, charInComplexShapingScript, stringContainsRTLText} from './script_detection'; -describe('charAllowsIdeographicBreaking', () => { - test('disallows ideographic breaking of Latin text', () => { - expect(charAllowsIdeographicBreaking('A'.codePointAt(0))).toBe(false); - expect(charAllowsIdeographicBreaking('3'.codePointAt(0))).toBe(false); - }); - - test('allows ideographic breaking of ideographic punctuation', () => { - expect(charAllowsIdeographicBreaking('〈'.codePointAt(0))).toBe(true); - }); - - test('allows ideographic breaking of Bopomofo text', () => { - expect(charAllowsIdeographicBreaking('ㄎ'.codePointAt(0))).toBe(true); - }); - - test('allows ideographic breaking of Chinese and Vietnamese text', () => { - expect(charAllowsIdeographicBreaking('市'.codePointAt(0))).toBe(true); - expect(charAllowsIdeographicBreaking('𡔖'.codePointAt(0))).toBe(true); - expect(charAllowsIdeographicBreaking('麵'.codePointAt(0))).toBe(true); - expect(charAllowsIdeographicBreaking('𪚥'.codePointAt(0))).toBe(true); - }); - - test('disallows 
ideographic breaking of Korean text', () => { - expect(charAllowsIdeographicBreaking('아'.codePointAt(0))).toBe(false); - }); - - test('allows ideographic breaking of Japanese text', () => { - expect(charAllowsIdeographicBreaking('あ'.codePointAt(0))).toBe(true); - expect(charAllowsIdeographicBreaking('カ'.codePointAt(0))).toBe(true); - }); - - test('allows ideographic breaking of Yi text', () => { - expect(charAllowsIdeographicBreaking('ꉆ'.codePointAt(0))).toBe(true); - }); -}); - -describe('charAllowsLetterSpacing', () => { +describe('allowsLetterSpacing', () => { test('allows letter spacing of Latin text', () => { - expect(charAllowsLetterSpacing('A'.codePointAt(0))).toBe(true); + expect(allowsLetterSpacing('A')).toBe(true); }); test('disallows ideographic breaking of Arabic text', () => { // Arabic - expect(charAllowsLetterSpacing('۳'.codePointAt(0))).toBe(false); + expect(allowsLetterSpacing('۳')).toBe(false); // Arabic Supplement - expect(charAllowsLetterSpacing('ݣ'.codePointAt(0))).toBe(false); + expect(allowsLetterSpacing('ݣ')).toBe(false); // Arabic Extended-A - expect(charAllowsLetterSpacing('ࢳ'.codePointAt(0))).toBe(false); + expect(allowsLetterSpacing('ࢳ')).toBe(false); // Arabic Extended-B - expect(charAllowsLetterSpacing('࢐'.codePointAt(0))).toBe(false); + expect(allowsLetterSpacing('࢐')).toBe(false); // Arabic Presentation Forms-A - expect(charAllowsLetterSpacing('ﰤ'.codePointAt(0))).toBe(false); + expect(allowsLetterSpacing('ﰤ')).toBe(false); // Arabic Presentation Forms-B - expect(charAllowsLetterSpacing('ﺽ'.codePointAt(0))).toBe(false); + expect(allowsLetterSpacing('ﺽ')).toBe(false); }); }); @@ -107,35 +72,35 @@ describe('charInComplexShapingScript', () => { }); }); -describe('charInRTLScript', () => { +describe('stringContainsRTLText', () => { test('does not identify direction-neutral text as right-to-left', () => { - expect(charInRTLScript('3'.codePointAt(0))).toBe(false); + expect(stringContainsRTLText('3')).toBe(false); }); test('identifies 
Arabic text as right-to-left', () => { // Arabic - expect(charInRTLScript('۳'.codePointAt(0))).toBe(true); + expect(stringContainsRTLText('۳')).toBe(true); // Arabic Supplement - expect(charInRTLScript('ݣ'.codePointAt(0))).toBe(true); + expect(stringContainsRTLText('ݣ')).toBe(true); // Arabic Extended-A - expect(charInRTLScript('ࢳ'.codePointAt(0))).toBe(true); + expect(stringContainsRTLText('ࢳ')).toBe(true); // Arabic Extended-B - expect(charInRTLScript('࢐'.codePointAt(0))).toBe(true); + expect(stringContainsRTLText('࢐')).toBe(true); // Arabic Presentation Forms-A - expect(charInRTLScript('ﰤ'.codePointAt(0))).toBe(true); + expect(stringContainsRTLText('ﰤ')).toBe(true); // Arabic Presentation Forms-B - expect(charInRTLScript('ﺽ'.codePointAt(0))).toBe(true); + expect(stringContainsRTLText('ﺽ')).toBe(true); }); test('identifies Hebrew text as right-to-left', () => { // Hebrew - expect(charInRTLScript('ה'.codePointAt(0))).toBe(true); + expect(stringContainsRTLText('ה')).toBe(true); // Alphabetic Presentation Forms - expect(charInRTLScript('ﬡ'.codePointAt(0))).toBe(true); + expect(stringContainsRTLText('ﬡ')).toBe(true); }); test('identifies Thaana text as right-to-left', () => { // Thaana - expect(charInRTLScript('ޘ'.codePointAt(0))).toBe(true); + expect(stringContainsRTLText('ޘ')).toBe(true); }); }); diff --git a/src/util/script_detection.ts b/src/util/script_detection.ts index ba93a5e7a1..89d49ca9b9 100644 --- a/src/util/script_detection.ts +++ b/src/util/script_detection.ts @@ -1,12 +1,42 @@ /* eslint-disable new-cap */ import {unicodeBlockLookup as isChar} from './is_char_in_unicode_block'; +import {canCombineGraphemes} from '../data/unicode_properties'; -export function allowsIdeographicBreaking(chars: string) { - for (const char of chars) { - if (!charAllowsIdeographicBreaking(char.codePointAt(0))) return false; +const segmenter = ('Segmenter' in Intl) ? 
new Intl.Segmenter() : { + segment: (text: string) => { + const segments = [...text].map((char, index) => ({ + index, + segment: char, + })); + return { + containing: (index: number) => segments.find(s => s.index <= index && s.index + s.segment.length > index), + [Symbol.iterator]: () => segments[Symbol.iterator](), + }; + }, +}; + +export function splitByGraphemeCluster(text: string) { + const segments = segmenter.segment(text)[Symbol.iterator](); + let segment = segments.next(); + const nextSegments = segmenter.segment(text)[Symbol.iterator](); + nextSegments.next(); + let nextSegment = nextSegments.next(); + + const baseSegments = []; + while (!segment.done) { + const baseSegment = segment; + while (!nextSegment.done && canCombineGraphemes(baseSegment.value.segment, nextSegment.value.segment)) { + baseSegment.value.segment += nextSegment.value.segment; + segment = segments.next(); + nextSegment = nextSegments.next(); + } + baseSegments.push(baseSegment.value); + segment = segments.next(); + nextSegment = nextSegments.next(); } - return true; + + return baseSegments; } export function allowsVerticalWritingMode(chars: string) { @@ -17,10 +47,7 @@ export function allowsVerticalWritingMode(chars: string) { } export function allowsLetterSpacing(chars: string) { - for (const char of chars) { - if (!charAllowsLetterSpacing(char.codePointAt(0))) return false; - } - return true; + return !cursiveScriptRegExp.test(chars); } /** @@ -35,7 +62,7 @@ function sanitizedRegExpFromScriptCodes(scriptCodes: Array): RegExp { return null; } }).filter(pe => pe); - return new RegExp(supportedPropertyEscapes.join('|'), 'u'); + return new RegExp(`[${supportedPropertyEscapes.join('')}]`, 'u'); } /** @@ -54,10 +81,6 @@ const cursiveScriptCodes = [ const cursiveScriptRegExp = sanitizedRegExpFromScriptCodes(cursiveScriptCodes); -export function charAllowsLetterSpacing(char: number) { - return !cursiveScriptRegExp.test(String.fromCodePoint(char)); -} - /** * ISO 15924 script codes of scripts 
that allow ideographic line breaking beyond * the CJKV scripts that are considered ideographic in Unicode 16.0.0. @@ -75,30 +98,6 @@ const ideographicBreakingScriptCodes = [ const ideographicBreakingRegExp = sanitizedRegExpFromScriptCodes(ideographicBreakingScriptCodes); -export function charAllowsIdeographicBreaking(char: number) { - // Return early for characters outside all ideographic ranges. - if (char < 0x2E80) return false; - - if (isChar['CJK Compatibility'](char)) return true; - if (isChar['CJK Compatibility Forms'](char)) return true; - if (isChar['CJK Radicals Supplement'](char)) return true; - if (isChar['CJK Strokes'](char)) return true; - if (isChar['CJK Symbols and Punctuation'](char)) return true; - if (isChar['Enclosed CJK Letters and Months'](char)) return true; - if (isChar['Enclosed Ideographic Supplement'](char)) return true; - if (isChar['Halfwidth and Fullwidth Forms'](char)) return true; - if (isChar['Ideographic Description Characters'](char)) return true; - if (isChar['Ideographic Symbols and Punctuation'](char)) return true; - if (isChar['Kana Extended-A'](char)) return true; - if (isChar['Kana Extended-B'](char)) return true; - if (isChar['Kana Supplement'](char)) return true; - if (isChar['Kangxi Radicals'](char)) return true; - if (isChar['Katakana Phonetic Extensions'](char)) return true; - if (isChar['Small Kana Extension'](char)) return true; - if (isChar['Vertical Forms'](char)) return true; - return ideographicBreakingRegExp.test(String.fromCodePoint(char)); -} - // The following logic comes from // . 
// Keep it synchronized with @@ -201,9 +200,25 @@ export function charHasUprightVerticalOrientation(char: number) { if (/* Canadian Aboriginal */ /\p{sc=Cans}/u.test(String.fromCodePoint(char))) return true; if (/* Egyptian Hieroglyphs */ /\p{sc=Egyp}/u.test(String.fromCodePoint(char))) return true; if (/* Hangul */ /\p{sc=Hang}/u.test(String.fromCodePoint(char))) return true; - if (charAllowsIdeographicBreaking(char)) return true; - return false; + if (isChar['CJK Compatibility'](char)) return true; + if (isChar['CJK Compatibility Forms'](char)) return true; + if (isChar['CJK Radicals Supplement'](char)) return true; + if (isChar['CJK Strokes'](char)) return true; + if (isChar['CJK Symbols and Punctuation'](char)) return true; + if (isChar['Enclosed CJK Letters and Months'](char)) return true; + if (isChar['Enclosed Ideographic Supplement'](char)) return true; + if (isChar['Halfwidth and Fullwidth Forms'](char)) return true; + if (isChar['Ideographic Description Characters'](char)) return true; + if (isChar['Ideographic Symbols and Punctuation'](char)) return true; + if (isChar['Kana Extended-A'](char)) return true; + if (isChar['Kana Extended-B'](char)) return true; + if (isChar['Kana Supplement'](char)) return true; + if (isChar['Kangxi Radicals'](char)) return true; + if (isChar['Katakana Phonetic Extensions'](char)) return true; + if (isChar['Small Kana Extension'](char)) return true; + if (isChar['Vertical Forms'](char)) return true; + return ideographicBreakingRegExp.test(String.fromCodePoint(char)); } /** @@ -361,11 +376,7 @@ const rtlScriptCodes = [ 'Yezi', // Yezidi ]; -const rtlScriptRegExp = sanitizedRegExpFromScriptCodes(rtlScriptCodes); - -export function charInRTLScript(char: number) { - return rtlScriptRegExp.test(String.fromCodePoint(char)); -} +export const rtlScriptRegExp = sanitizedRegExpFromScriptCodes(rtlScriptCodes); export function charInSupportedScript(char: number, canRenderRTL: boolean) { // This is a rough heuristic: whether we "can 
render" a script @@ -375,7 +386,7 @@ export function charInSupportedScript(char: number, canRenderRTL: boolean) { // Even in Latin script, we "can't render" combinations such as the fi // ligature, but we don't consider that semantically significant. - if (!canRenderRTL && charInRTLScript(char)) { + if (!canRenderRTL && rtlScriptRegExp.test(String.fromCodePoint(char))) { return false; } if ((char >= 0x0900 && char <= 0x0DFF) || @@ -393,12 +404,7 @@ export function charInSupportedScript(char: number, canRenderRTL: boolean) { } export function stringContainsRTLText(chars: string): boolean { - for (const char of chars) { - if (charInRTLScript(char.codePointAt(0))) { - return true; - } - } - return false; + return rtlScriptRegExp.test(chars); } export function isStringInSupportedScript(chars: string, canRenderRTL: boolean) { diff --git a/src/util/verticalize_punctuation.ts b/src/util/verticalize_punctuation.ts index 069f45fd90..b9fff650d2 100644 --- a/src/util/verticalize_punctuation.ts +++ b/src/util/verticalize_punctuation.ts @@ -86,26 +86,28 @@ export const verticalizedCharacterMap = { '」': '﹂' }; +const segmenter = new Intl.Segmenter(); + export function verticalizePunctuation(input: string) { let output = ''; let prevChar = {premature: true, value: undefined}; - const chars = input[Symbol.iterator](); + const chars = segmenter.segment(input)[Symbol.iterator](); let char = chars.next(); - const nextChars = input[Symbol.iterator](); + const nextChars = segmenter.segment(input)[Symbol.iterator](); nextChars.next(); let nextChar = nextChars.next(); while (!char.done) { const canReplacePunctuation = ( - (nextChar.done || !charHasRotatedVerticalOrientation(nextChar.value.codePointAt(0)) || verticalizedCharacterMap[nextChar.value]) && - (prevChar.premature || !charHasRotatedVerticalOrientation(prevChar.value.codePointAt(0)) || verticalizedCharacterMap[prevChar.value]) + (nextChar.done || !charHasRotatedVerticalOrientation(nextChar.value.segment.codePointAt(0)) || 
verticalizedCharacterMap[nextChar.value.segment]) && + (prevChar.premature || !charHasRotatedVerticalOrientation(prevChar.value.segment.codePointAt(0)) || verticalizedCharacterMap[prevChar.value.segment]) ); - if (canReplacePunctuation && verticalizedCharacterMap[char.value]) { - output += verticalizedCharacterMap[char.value]; + if (canReplacePunctuation && verticalizedCharacterMap[char.value.segment]) { + output += verticalizedCharacterMap[char.value.segment]; } else { - output += char.value; + output += char.value.segment; } prevChar = {value: char.value, premature: false}; diff --git a/test/integration/symbol-shaping/shaping.test.ts b/test/integration/symbol-shaping/shaping.test.ts index fca54cd0fd..bd388a1eab 100644 --- a/test/integration/symbol-shaping/shaping.test.ts +++ b/test/integration/symbol-shaping/shaping.test.ts @@ -5,6 +5,7 @@ import {ResolvedImage, Formatted, FormattedSection} from '@maplibre/maplibre-gl- import {ImagePosition} from '../../../src/render/image_atlas'; import type {StyleImage} from '../../../src/style/style_image'; import type {StyleGlyph} from '../../../src/style/style_glyph'; +import {AlphaImage} from '../../../src/util/image'; import glyphsJson from '../assets/glyphs/fontstack-glyphs.json' with {type: 'json'}; import expectedJson from './tests/text-shaping-linebreak.json' with {type: 'json'}; @@ -35,9 +36,29 @@ describe('shaping', () => { const layoutTextSizeThisZoom = 16; const fontStack = 'Test'; const glyphs = { - 'Test': glyphsJson as any as StyleGlyph + 'Test': Object.fromEntries(Object.entries(glyphsJson).map(entry => { + const bitmap = new AlphaImage({ + width: entry[1].rect.w, + height: entry[1].rect.h, + }); + const glyph = { + grapheme: String.fromCodePoint(entry[1].id), + bitmap, + metrics: entry[1].metrics, + } as StyleGlyph; + return [entry[0], glyph]; + })) + }; + const glyphPositions = { + 'Test': Object.fromEntries(Object.entries(glyphsJson).map(entry => { + const position = { + grapheme: 
String.fromCodePoint(entry[1].id), + rect: entry[1].rect, + metrics: entry[1].metrics, + }; + return [entry[0], position]; + })) }; - const glyphPositions = glyphs; const images = { 'square': new ImagePosition({x: 0, y: 0, w: 16, h: 16}, {pixelRatio: 1, version: 1} as StyleImage),