Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: introduce GrammarState #712

Merged
merged 3 commits into from
Jun 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 75 additions & 6 deletions packages/core/src/code-to-tokens-base.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
/* ---------------------------------------------------------
* Copyright (C) Microsoft Corporation. All rights reserved.
*-------------------------------------------------------- */
import type { IGrammar, IRawThemeSetting } from './textmate'
import type { IGrammar, IRawThemeSetting, StateStack } from './textmate'
import { INITIAL } from './textmate'
import type { CodeToTokensBaseOptions, FontStyle, ShikiInternal, ThemeRegistrationResolved, ThemedToken, ThemedTokenScopeExplanation, TokenizeWithThemeOptions } from './types'
import { StackElementMetadata } from './stack-element-metadata'
import { applyColorReplacements, isNoneTheme, isPlainLang, resolveColorReplacements, splitLines } from './utils'
import { tokenizeAnsiWithTheme } from './code-to-tokens-ansi'
import { ShikiError } from './error'
import { GrammarState, getGrammarStack } from './grammar-state'

/**
* Code to tokens, with a simple theme.
Expand All @@ -30,9 +32,45 @@ export function codeToTokensBase(
return tokenizeAnsiWithTheme(theme, code, options)

const _grammar = internal.getLanguage(lang)

if (options.grammarState) {
if (options.grammarState.lang !== _grammar.name) {
throw new ShikiError(`Grammar state language "${options.grammarState.lang}" does not match highlight language "${_grammar.name}"`)
}
if (options.grammarState.theme !== themeName) {
throw new ShikiError(`Grammar state theme "${options.grammarState.theme}" does not match highlight theme "${themeName}"`)
}
}

return tokenizeWithTheme(code, _grammar, theme, colorMap, options)
}

/**
 * Tokenize `code` and return the grammar state reached after its last line.
 *
 * The result wraps the final state stack together with the grammar name and
 * the resolved theme name, so it can later be supplied as `grammarState`
 * to continue tokenizing from that point.
 *
 * @param internal - Shiki internal context used to resolve the theme and grammar.
 * @param code - Source text to tokenize.
 * @param options - Language / theme selection and tokenizer options.
 *   `lang` defaults to `'text'`, `theme` to the first loaded theme.
 * @returns The grammar state after the final line of `code`.
 * @throws ShikiError when the language is plain, the theme is the "none"
 *   theme, or the language is `'ansi'` — none of these run a grammar.
 */
export function getLastGrammarState(
  internal: ShikiInternal,
  code: string,
  options: CodeToTokensBaseOptions = {},
): GrammarState {
  const {
    lang = 'text',
    theme: themeName = internal.getLoadedThemes()[0],
  } = options

  // Report the actual reason: previously a "none" theme was reported as a
  // plain-language problem, which pointed callers at the wrong option.
  if (isPlainLang(lang))
    throw new ShikiError('Plain language does not have grammar state')
  if (isNoneTheme(themeName))
    throw new ShikiError('None theme does not have grammar state')
  if (lang === 'ansi')
    throw new ShikiError('ANSI language does not have grammar state')

  const { theme, colorMap } = internal.setTheme(themeName)

  const _grammar = internal.getLanguage(lang)

  // Only the final state stack is needed here; the produced tokens are discarded.
  const { stateStack } = _tokenizeWithTheme(code, _grammar, theme, colorMap, options)

  return new GrammarState(
    stateStack,
    _grammar.name,
    theme.name,
  )
}

/** for explanations */
interface ThemeSettingsSelectors {
settings: IRawThemeSetting
Expand All @@ -46,6 +84,19 @@ export function tokenizeWithTheme(
colorMap: string[],
options: TokenizeWithThemeOptions,
): ThemedToken[][] {
return _tokenizeWithTheme(code, grammar, theme, colorMap, options).tokens
}

function _tokenizeWithTheme(
code: string,
grammar: IGrammar,
theme: ThemeRegistrationResolved,
colorMap: string[],
options: TokenizeWithThemeOptions,
): {
tokens: ThemedToken[][]
stateStack: StateStack
} {
const colorReplacements = resolveColorReplacements(theme, options)

const {
Expand All @@ -55,7 +106,22 @@ export function tokenizeWithTheme(

const lines = splitLines(code)

let ruleStack = INITIAL
let stateStack = options.grammarState
? getGrammarStack(options.grammarState)
: options.grammarContextCode != null
? _tokenizeWithTheme(
options.grammarContextCode,
grammar,
theme,
colorMap,
{
...options,
grammarState: undefined,
grammarContextCode: undefined,
},
).stateStack
: INITIAL

let actual: ThemedToken[] = []
const final: ThemedToken[][] = []

Expand Down Expand Up @@ -106,12 +172,12 @@ export function tokenizeWithTheme(
let tokensWithScopesIndex

if (options.includeExplanation) {
resultWithScopes = grammar.tokenizeLine(line, ruleStack)
resultWithScopes = grammar.tokenizeLine(line, stateStack)
tokensWithScopes = resultWithScopes.tokens
tokensWithScopesIndex = 0
}

const result = grammar.tokenizeLine2(line, ruleStack, tokenizeTimeLimit)
const result = grammar.tokenizeLine2(line, stateStack, tokenizeTimeLimit)

const tokensLength = result.tokens.length / 2
for (let j = 0; j < tokensLength; j++) {
Expand Down Expand Up @@ -158,10 +224,13 @@ export function tokenizeWithTheme(
}
final.push(actual)
actual = []
ruleStack = result.ruleStack
stateStack = result.ruleStack
}

return final
return {
tokens: final,
stateStack,
}
}

function explainThemeScopes(
Expand Down
54 changes: 54 additions & 0 deletions packages/core/src/grammar-state.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import type { StateStackImpl } from '../vendor/vscode-textmate/src/grammar'
import { ShikiError } from './error'
import type { StateStack } from './textmate'

/**
 * Reference object wrapping a tokenizer state stack, tagged with the
 * language and theme names it was created with.
 *
 * The stack itself is kept private; consumers see only `lang`, `theme`
 * and the derived list of scope names.
 */
export class GrammarState {
  constructor(
    private _stack: StateStack,
    public lang: string,
    public theme: string,
  ) {}

  /** Scope names collected from the wrapped stack by `getScopes`. */
  get scopes() {
    const impl = this._stack as StateStackImpl
    return getScopes(impl)
  }

  /** Serializable view: `lang`, `theme` and `scopes` (the raw stack is left out). */
  toJSON() {
    const { lang, theme, scopes } = this
    return { lang, theme, scopes }
  }
}

/**
 * Collect the scope names found along a state stack, starting at `stack`
 * and following `parent` links.
 *
 * Frames without a `nameScopesList.scopeName` are skipped. A frame is never
 * visited twice, so a cyclic parent chain terminates. A nullish `stack`
 * yields an empty list.
 *
 * @param stack - The innermost stack frame to start from.
 * @returns Scope names in visiting order (starting frame first).
 */
function getScopes(stack: StateStackImpl): string[] {
  const scopes: string[] = []
  const visited = new Set<StateStackImpl>()

  // Iterative walk: avoids call-stack overflow on very long parent chains and
  // treats a missing frame the same way at every step.
  let current: StateStackImpl | null | undefined = stack
  while (current && !visited.has(current)) {
    visited.add(current)
    const name = current.nameScopesList?.scopeName
    if (name)
      scopes.push(name)
    current = current.parent
  }

  return scopes
}

/**
 * Unwrap the state stack held inside a `GrammarState`.
 *
 * @param state - Value expected to be a `GrammarState` instance.
 * @returns The private `_stack` of `state`.
 * @throws ShikiError when `state` is not an instance of `GrammarState`.
 */
export function getGrammarStack(state: GrammarState) {
  if (state instanceof GrammarState) {
    // @ts-expect-error _stack is private
    return state._stack
  }

  throw new ShikiError('Invalid grammar state')
}
3 changes: 2 additions & 1 deletion packages/core/src/highlighter.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { codeToHast } from './code-to-hast'
import { codeToHtml } from './code-to-html'
import { codeToTokens } from './code-to-tokens'
import { codeToTokensBase } from './code-to-tokens-base'
import { codeToTokensBase, getLastGrammarState } from './code-to-tokens-base'
import { codeToTokensWithThemes } from './code-to-tokens-themes'
import { createShikiInternal } from './internal'
import type { HighlighterCore, HighlighterCoreOptions } from './types'
Expand All @@ -16,6 +16,7 @@ export async function createHighlighterCore(options: HighlighterCoreOptions = {}
const internal = await createShikiInternal(options)

return {
getLastGrammarState: (code, options) => getLastGrammarState(internal, code, options),
codeToTokensBase: (code, options) => codeToTokensBase(internal, code, options),
codeToTokensWithThemes: (code, options) => codeToTokensWithThemes(internal, code, options),
codeToTokens: (code, options) => codeToTokens(internal, code, options),
Expand Down
11 changes: 6 additions & 5 deletions packages/core/src/registry.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import type { IGrammar, IGrammarConfiguration, IRawTheme } from './textmate'
import type { IGrammarConfiguration, IRawTheme } from './textmate'
import { Registry as TextMateRegistry, Theme as TextMateTheme } from './textmate'
import type { LanguageRegistration, ThemeRegistrationAny, ThemeRegistrationResolved } from './types'
import type { Grammar, LanguageRegistration, ThemeRegistrationAny, ThemeRegistrationResolved } from './types'
import type { Resolver } from './resolver'
import { normalizeTheme } from './normalize'
import { ShikiError } from './error'

export class Registry extends TextMateRegistry {
private _resolvedThemes: Map<string, ThemeRegistrationResolved> = new Map()
private _resolvedGrammars: Map<string, IGrammar> = new Map()
private _resolvedGrammars: Map<string, Grammar> = new Map()
private _langMap: Map<string, LanguageRegistration> = new Map()
private _langGraph: Map<string, LanguageRegistration> = new Map()

Expand Down Expand Up @@ -97,8 +97,9 @@ export class Registry extends TextMateRegistry {

// @ts-expect-error Private members, set this to override the previous grammar (that can be a stub)
this._syncRegistry._rawGrammars.set(lang.scopeName, lang)
const g = await this.loadGrammarWithConfiguration(lang.scopeName, 1, grammarConfig)
this._resolvedGrammars.set(lang.name, g!)
const g = await this.loadGrammarWithConfiguration(lang.scopeName, 1, grammarConfig) as Grammar
g.name = lang.name
this._resolvedGrammars.set(lang.name, g)
if (lang.aliases) {
lang.aliases.forEach((alias) => {
this._alias[alias] = lang.name
Expand Down
10 changes: 9 additions & 1 deletion packages/core/src/types/highlighter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import type { Root } from 'hast'
import type { Grammar } from './textmate'
import type { LanguageInput, LanguageRegistration, ResolveBundleKey, SpecialLanguage } from './langs'
import type { SpecialTheme, ThemeInput, ThemeRegistrationAny, ThemeRegistrationResolved } from './themes'
import type { CodeToTokensBaseOptions, CodeToTokensOptions, CodeToTokensWithThemesOptions, ThemedToken, ThemedTokenWithVariants, TokensResult } from './tokens'
import type { CodeToTokensBaseOptions, CodeToTokensOptions, CodeToTokensWithThemesOptions, GrammarState, ThemedToken, ThemedTokenWithVariants, TokensResult } from './tokens'
import type { CodeToHastOptions } from './options'

/**
Expand Down Expand Up @@ -104,6 +104,14 @@ export interface HighlighterGeneric<BundledLangKeys extends string, BundledTheme
code: string,
options: CodeToTokensWithThemesOptions<ResolveBundleKey<BundledLangKeys>, ResolveBundleKey<BundledThemeKeys>>
) => ThemedTokenWithVariants[][]
/**
* Get the last grammar state of a code snippet.
* You can pass the grammar state to `codeToTokens` as `grammarState` to continue tokenizing from an intermediate state.
*/
getLastGrammarState: (
code: string,
options: CodeToTokensBaseOptions<ResolveBundleKey<BundledLangKeys>, ResolveBundleKey<BundledThemeKeys>>
) => GrammarState

/**
* Get internal context object
Expand Down
2 changes: 1 addition & 1 deletion packages/core/src/types/options.ts
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ export interface CodeToHastOptionsCommon<Languages extends string = string>
extends
TransformerOptions,
DecorationOptions,
Pick<TokenizeWithThemeOptions, 'colorReplacements' | 'tokenizeMaxLineLength' | 'tokenizeTimeLimit'> {
Pick<TokenizeWithThemeOptions, 'colorReplacements' | 'tokenizeMaxLineLength' | 'tokenizeTimeLimit' | 'grammarState' | 'grammarContextCode'> {

lang: StringLiteralUnion<Languages | SpecialLanguage>

Expand Down
7 changes: 5 additions & 2 deletions packages/core/src/types/textmate.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import type {
IGrammar as Grammar,
IGrammar,
IRawGrammar as RawGrammar,
IRawTheme as RawTheme,
IRawThemeSetting as RawThemeSetting,
} from '../textmate'

export type {
Grammar,
RawGrammar,
RawTheme,
RawThemeSetting,
}

/**
 * A textmate `IGrammar` that additionally carries a `name`.
 */
export interface Grammar extends IGrammar {
/** Name attached to the grammar — presumably the language registration name; confirm against the registry. */
name: string
}
18 changes: 18 additions & 0 deletions packages/core/src/types/tokens.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import type { GrammarState } from '../grammar-state'
import type { SpecialLanguage } from './langs'
import type { SpecialTheme, ThemeRegistrationAny } from './themes'
import type { CodeOptionsThemes } from './options'

export type { GrammarState }

export interface CodeToTokensBaseOptions<Languages extends string = string, Themes extends string = string> extends TokenizeWithThemeOptions {
lang?: Languages | SpecialLanguage
theme?: Themes | ThemeRegistrationAny | SpecialTheme
Expand Down Expand Up @@ -172,6 +175,21 @@ export interface TokenizeWithThemeOptions {
* @default 500 (0.5s)
*/
tokenizeTimeLimit?: number

/**
* Represents the state of the grammar, allowing tokenization to continue from an intermediate grammar state.
*
* You can get the grammar state from `getLastGrammarState`.
*/
grammarState?: GrammarState

/**
* The code context of the grammar.
* Think of it as code prepended to the input code: it only participates in grammar inference and is not present in the final output.
*
* This will be ignored if `grammarState` is provided.
*/
grammarContextCode?: string
}

/**
Expand Down
Loading
Loading