From d794a61d8b5d50695a03d67bbf32fa8b1027dbe7 Mon Sep 17 00:00:00 2001 From: Anthony Fu Date: Thu, 27 Jun 2024 18:06:14 +0200 Subject: [PATCH 1/3] feat: introduce `GrammarState` --- packages/core/src/code-to-tokens-base.ts | 81 +++++++- packages/core/src/grammar-state.ts | 54 +++++ packages/core/src/highlighter.ts | 3 +- packages/core/src/registry.ts | 11 +- packages/core/src/types/highlighter.ts | 10 +- packages/core/src/types/options.ts | 2 +- packages/core/src/types/textmate.ts | 7 +- packages/core/src/types/tokens.ts | 18 ++ packages/shiki/test/grammar-state.test.ts | 232 ++++++++++++++++++++++ 9 files changed, 402 insertions(+), 16 deletions(-) create mode 100644 packages/core/src/grammar-state.ts create mode 100644 packages/shiki/test/grammar-state.test.ts diff --git a/packages/core/src/code-to-tokens-base.ts b/packages/core/src/code-to-tokens-base.ts index aafc4fff6..54163b059 100644 --- a/packages/core/src/code-to-tokens-base.ts +++ b/packages/core/src/code-to-tokens-base.ts @@ -1,12 +1,14 @@ /* --------------------------------------------------------- * Copyright (C) Microsoft Corporation. All rights reserved. *-------------------------------------------------------- */ -import type { IGrammar, IRawThemeSetting } from './textmate' +import type { IGrammar, IRawThemeSetting, StateStack } from './textmate' import { INITIAL } from './textmate' import type { CodeToTokensBaseOptions, FontStyle, ShikiInternal, ThemeRegistrationResolved, ThemedToken, ThemedTokenScopeExplanation, TokenizeWithThemeOptions } from './types' import { StackElementMetadata } from './stack-element-metadata' import { applyColorReplacements, isNoneTheme, isPlainLang, resolveColorReplacements, splitLines } from './utils' import { tokenizeAnsiWithTheme } from './code-to-tokens-ansi' +import { ShikiError } from './error' +import { GrammarState, getGrammarStack } from './grammar-state' /** * Code to tokens, with a simple theme. 
@@ -30,9 +32,45 @@ export function codeToTokensBase( return tokenizeAnsiWithTheme(theme, code, options) const _grammar = internal.getLanguage(lang) + + if (options.grammarState) { + if (options.grammarState.lang !== _grammar.name) { + throw new ShikiError(`Grammar state language "${options.grammarState.lang}" does not match highlight language "${_grammar.name}"`) + } + if (options.grammarState.theme !== themeName) { + throw new ShikiError(`Grammar state theme "${options.grammarState.theme}" does not match highlight theme "${themeName}"`) + } + } + return tokenizeWithTheme(code, _grammar, theme, colorMap, options) } +export function getLastGrammarState( + internal: ShikiInternal, + code: string, + options: CodeToTokensBaseOptions = {}, +): GrammarState { + const { + lang = 'text', + theme: themeName = internal.getLoadedThemes()[0], + } = options + + if (isPlainLang(lang) || isNoneTheme(themeName)) + throw new ShikiError('Plain language does not have grammar state') + if (lang === 'ansi') + throw new ShikiError('ANSI language does not have grammar state') + + const { theme, colorMap } = internal.setTheme(themeName) + + const _grammar = internal.getLanguage(lang) + + return new GrammarState( + _tokenizeWithTheme(code, _grammar, theme, colorMap, options).stateStack, + _grammar.name, + theme.name, + ) +} + /** for explanations */ interface ThemeSettingsSelectors { settings: IRawThemeSetting @@ -46,6 +84,19 @@ export function tokenizeWithTheme( colorMap: string[], options: TokenizeWithThemeOptions, ): ThemedToken[][] { + return _tokenizeWithTheme(code, grammar, theme, colorMap, options).tokens +} + +function _tokenizeWithTheme( + code: string, + grammar: IGrammar, + theme: ThemeRegistrationResolved, + colorMap: string[], + options: TokenizeWithThemeOptions, +): { + tokens: ThemedToken[][] + stateStack: StateStack + } { const colorReplacements = resolveColorReplacements(theme, options) const { @@ -55,7 +106,22 @@ export function tokenizeWithTheme( const lines = 
splitLines(code) - let ruleStack = INITIAL + let stateStack = options.grammarState + ? getGrammarStack(options.grammarState) + : options.grammarContextCode != null + ? _tokenizeWithTheme( + options.grammarContextCode, + grammar, + theme, + colorMap, + { + ...options, + grammarState: undefined, + grammarContextCode: undefined, + }, + ).stateStack + : INITIAL + let actual: ThemedToken[] = [] const final: ThemedToken[][] = [] @@ -106,12 +172,12 @@ export function tokenizeWithTheme( let tokensWithScopesIndex if (options.includeExplanation) { - resultWithScopes = grammar.tokenizeLine(line, ruleStack) + resultWithScopes = grammar.tokenizeLine(line, stateStack) tokensWithScopes = resultWithScopes.tokens tokensWithScopesIndex = 0 } - const result = grammar.tokenizeLine2(line, ruleStack, tokenizeTimeLimit) + const result = grammar.tokenizeLine2(line, stateStack, tokenizeTimeLimit) const tokensLength = result.tokens.length / 2 for (let j = 0; j < tokensLength; j++) { @@ -158,10 +224,13 @@ export function tokenizeWithTheme( } final.push(actual) actual = [] - ruleStack = result.ruleStack + stateStack = result.ruleStack } - return final + return { + tokens: final, + stateStack, + } } function explainThemeScopes( diff --git a/packages/core/src/grammar-state.ts b/packages/core/src/grammar-state.ts new file mode 100644 index 000000000..6c31465b0 --- /dev/null +++ b/packages/core/src/grammar-state.ts @@ -0,0 +1,54 @@ +import type { StateStackImpl } from '../vendor/vscode-textmate/src/grammar' +import { ShikiError } from './error' +import type { StateStack } from './textmate' + +/** + * GrammarState is a special reference object that holds the state of a grammar. + * + * It's used to highlight code snippets that are part of the target language. 
+ */ +export class GrammarState { + constructor( + private _stack: StateStack, + public lang: string, + public theme: string, + ) {} + + get scopes() { + return getScopes(this._stack as StateStackImpl) + } + + toJSON() { + return { + lang: this.lang, + theme: this.theme, + scopes: this.scopes, + } + } +} + +function getScopes(stack: StateStackImpl) { + const scopes: string[] = [] + const visited = new Set() + + function pushScope(stack: StateStackImpl) { + if (visited.has(stack)) + return + visited.add(stack) + const name = stack?.nameScopesList?.scopeName + if (name) + scopes.push(name) + if (stack.parent) + pushScope(stack.parent) + } + + pushScope(stack) + return scopes +} + +export function getGrammarStack(state: GrammarState) { + if (!(state instanceof GrammarState)) + throw new ShikiError('Invalid grammar state') + // @ts-expect-error _stack is private + return state._stack +} diff --git a/packages/core/src/highlighter.ts b/packages/core/src/highlighter.ts index f6d1ebfb7..2d3283f49 100644 --- a/packages/core/src/highlighter.ts +++ b/packages/core/src/highlighter.ts @@ -1,7 +1,7 @@ import { codeToHast } from './code-to-hast' import { codeToHtml } from './code-to-html' import { codeToTokens } from './code-to-tokens' -import { codeToTokensBase } from './code-to-tokens-base' +import { codeToTokensBase, getLastGrammarState } from './code-to-tokens-base' import { codeToTokensWithThemes } from './code-to-tokens-themes' import { createShikiInternal } from './internal' import type { HighlighterCore, HighlighterCoreOptions } from './types' @@ -16,6 +16,7 @@ export async function createHighlighterCore(options: HighlighterCoreOptions = {} const internal = await createShikiInternal(options) return { + getLastGrammarState: (code, options) => getLastGrammarState(internal, code, options), codeToTokensBase: (code, options) => codeToTokensBase(internal, code, options), codeToTokensWithThemes: (code, options) => codeToTokensWithThemes(internal, code, options), codeToTokens: 
(code, options) => codeToTokens(internal, code, options), diff --git a/packages/core/src/registry.ts b/packages/core/src/registry.ts index f764c669b..7e4c31578 100644 --- a/packages/core/src/registry.ts +++ b/packages/core/src/registry.ts @@ -1,13 +1,13 @@ -import type { IGrammar, IGrammarConfiguration, IRawTheme } from './textmate' +import type { IGrammarConfiguration, IRawTheme } from './textmate' import { Registry as TextMateRegistry, Theme as TextMateTheme } from './textmate' -import type { LanguageRegistration, ThemeRegistrationAny, ThemeRegistrationResolved } from './types' +import type { Grammar, LanguageRegistration, ThemeRegistrationAny, ThemeRegistrationResolved } from './types' import type { Resolver } from './resolver' import { normalizeTheme } from './normalize' import { ShikiError } from './error' export class Registry extends TextMateRegistry { private _resolvedThemes: Map = new Map() - private _resolvedGrammars: Map = new Map() + private _resolvedGrammars: Map = new Map() private _langMap: Map = new Map() private _langGraph: Map = new Map() @@ -97,8 +97,9 @@ export class Registry extends TextMateRegistry { // @ts-expect-error Private members, set this to override the previous grammar (that can be a stub) this._syncRegistry._rawGrammars.set(lang.scopeName, lang) - const g = await this.loadGrammarWithConfiguration(lang.scopeName, 1, grammarConfig) - this._resolvedGrammars.set(lang.name, g!) 
+ const g = await this.loadGrammarWithConfiguration(lang.scopeName, 1, grammarConfig) as Grammar + g.name = lang.name + this._resolvedGrammars.set(lang.name, g) if (lang.aliases) { lang.aliases.forEach((alias) => { this._alias[alias] = lang.name diff --git a/packages/core/src/types/highlighter.ts b/packages/core/src/types/highlighter.ts index b6fff5359..1f96f52e4 100644 --- a/packages/core/src/types/highlighter.ts +++ b/packages/core/src/types/highlighter.ts @@ -2,7 +2,7 @@ import type { Root } from 'hast' import type { Grammar } from './textmate' import type { LanguageInput, LanguageRegistration, ResolveBundleKey, SpecialLanguage } from './langs' import type { SpecialTheme, ThemeInput, ThemeRegistrationAny, ThemeRegistrationResolved } from './themes' -import type { CodeToTokensBaseOptions, CodeToTokensOptions, CodeToTokensWithThemesOptions, ThemedToken, ThemedTokenWithVariants, TokensResult } from './tokens' +import type { CodeToTokensBaseOptions, CodeToTokensOptions, CodeToTokensWithThemesOptions, GrammarState, ThemedToken, ThemedTokenWithVariants, TokensResult } from './tokens' import type { CodeToHastOptions } from './options' /** @@ -104,6 +104,14 @@ export interface HighlighterGeneric, ResolveBundleKey> ) => ThemedTokenWithVariants[][] + /** + * Get the last grammar state of a code snippet. + * You can pass the grammar state to `codeToTokens` as `grammarState` to continue tokenizing from an intermediate state. 
+ */ + getLastGrammarState: ( + langId: string, + options: CodeToTokensBaseOptions, ResolveBundleKey> + ) => GrammarState /** * Get internal context object diff --git a/packages/core/src/types/options.ts b/packages/core/src/types/options.ts index 4bf51abef..b722b06e6 100644 --- a/packages/core/src/types/options.ts +++ b/packages/core/src/types/options.ts @@ -123,7 +123,7 @@ export interface CodeToHastOptionsCommon extends TransformerOptions, DecorationOptions, - Pick { + Pick { lang: StringLiteralUnion diff --git a/packages/core/src/types/textmate.ts b/packages/core/src/types/textmate.ts index 70112d525..d851bbc36 100644 --- a/packages/core/src/types/textmate.ts +++ b/packages/core/src/types/textmate.ts @@ -1,13 +1,16 @@ import type { - IGrammar as Grammar, + IGrammar, IRawGrammar as RawGrammar, IRawTheme as RawTheme, IRawThemeSetting as RawThemeSetting, } from '../textmate' export type { - Grammar, RawGrammar, RawTheme, RawThemeSetting, } + +export interface Grammar extends IGrammar { + name: string +} diff --git a/packages/core/src/types/tokens.ts b/packages/core/src/types/tokens.ts index 95e98c786..c72571a6f 100644 --- a/packages/core/src/types/tokens.ts +++ b/packages/core/src/types/tokens.ts @@ -1,7 +1,10 @@ +import type { GrammarState } from '../grammar-state' import type { SpecialLanguage } from './langs' import type { SpecialTheme, ThemeRegistrationAny } from './themes' import type { CodeOptionsThemes } from './options' +export type { GrammarState } + export interface CodeToTokensBaseOptions extends TokenizeWithThemeOptions { lang?: Languages | SpecialLanguage theme?: Themes | ThemeRegistrationAny | SpecialTheme @@ -172,6 +175,21 @@ export interface TokenizeWithThemeOptions { * @default 500 (0.5s) */ tokenizeTimeLimit?: number + + /** + * Represents the state of the grammar, allowing tokenization to continue from an intermediate grammar state. + * + * You can get the grammar state from `getLastGrammarState`. 
+ */ + grammarState?: GrammarState + + /** + * The code context of the grammar. + * Treat it as code prepended to the input: it only participates in grammar inference and is not present in the final output. + * + * This will be ignored if `grammarState` is provided. + */ + grammarContextCode?: string } /** diff --git a/packages/shiki/test/grammar-state.test.ts b/packages/shiki/test/grammar-state.test.ts new file mode 100644 index 000000000..598f4107c --- /dev/null +++ b/packages/shiki/test/grammar-state.test.ts @@ -0,0 +1,232 @@ +import { describe, expect, it } from 'vitest' +import { codeToHtml, createHighlighter } from '../src' + +it('getLastGrammarState', async () => { + const shiki = await createHighlighter({ + themes: ['vitesse-light'], + langs: ['typescript'], + }) + + const state = shiki.getLastGrammarState('let a:', { lang: 'typescript', theme: 'vitesse-light' }) + + expect.soft(state).toMatchInlineSnapshot(` + { + "lang": "typescript", + "scopes": [ + "meta.type.annotation.ts", + "meta.var-single-variable.expr.ts", + "meta.var.expr.ts", + "source.ts", + ], + "theme": "vitesse-light", + } + `) + + const input = 'Omit<{}, string | number>' + + const highlightedNatural = shiki.codeToTokens(input, { + lang: 'typescript', + theme: 'vitesse-light', + }) + + const highlightedContext = shiki.codeToTokens(input, { + lang: 'typescript', + theme: 'vitesse-light', + grammarState: state, + }) + + const highlightedContext2 = shiki.codeToTokens(input, { + lang: 'typescript', + theme: 'vitesse-light', + grammarState: state, + }) + + expect.soft(highlightedNatural) + .not.toEqual(highlightedContext) + + expect.soft(highlightedContext) + .toEqual(highlightedContext2) + + expect.soft(highlightedNatural) + .toMatchInlineSnapshot(` + { + "bg": "#ffffff", + "fg": "#393a34", + "rootStyle": undefined, + "themeName": "vitesse-light", + "tokens": [ + [ + { + "color": "#B07D48", + "content": "Omit", + "fontStyle": 0, + "offset": 0, + }, + { + "color": "#999999", + "content": 
"<{},", + "fontStyle": 0, + "offset": 4, + }, + { + "color": "#393A34", + "content": " ", + "fontStyle": 0, + "offset": 8, + }, + { + "color": "#B07D48", + "content": "string", + "fontStyle": 0, + "offset": 9, + }, + { + "color": "#393A34", + "content": " ", + "fontStyle": 0, + "offset": 15, + }, + { + "color": "#AB5959", + "content": "|", + "fontStyle": 0, + "offset": 16, + }, + { + "color": "#393A34", + "content": " ", + "fontStyle": 0, + "offset": 17, + }, + { + "color": "#B07D48", + "content": "number", + "fontStyle": 0, + "offset": 18, + }, + { + "color": "#999999", + "content": ">", + "fontStyle": 0, + "offset": 24, + }, + ], + ], + } + `) + + expect.soft(highlightedContext) + .toMatchInlineSnapshot(` + { + "bg": "#ffffff", + "fg": "#393a34", + "rootStyle": undefined, + "themeName": "vitesse-light", + "tokens": [ + [ + { + "color": "#2E8F82", + "content": "Omit", + "fontStyle": 0, + "offset": 0, + }, + { + "color": "#999999", + "content": "<{}, ", + "fontStyle": 0, + "offset": 4, + }, + { + "color": "#2E8F82", + "content": "string", + "fontStyle": 0, + "offset": 9, + }, + { + "color": "#999999", + "content": " | ", + "fontStyle": 0, + "offset": 15, + }, + { + "color": "#2E8F82", + "content": "number", + "fontStyle": 0, + "offset": 18, + }, + { + "color": "#999999", + "content": ">", + "fontStyle": 0, + "offset": 24, + }, + ], + ], + } + `) +}) + +it('grammarContextCode', async () => { + const shiki = await createHighlighter({ + themes: ['vitesse-light'], + langs: ['typescript', 'vue', 'html'], + }) + + const input = '
' + + const highlightedHtml = shiki.codeToHtml(input, { + lang: 'html', + theme: 'vitesse-light', + structure: 'inline', + }) + + const highlightedVueTemplate = shiki.codeToHtml(input, { + lang: 'vue', + theme: 'vitesse-light', + structure: 'inline', + grammarContextCode: '